xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision cbc6b2250e380677eada4bbc58a36dc55ca92067)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
30fd2b57fSKarl Rupp 
43d13b8fdSMatthew G. Knepley #include <petscconf.h>
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
87e8381f9SStefano Zampini #include <thrust/advance.h>
9a2cee5feSJed Brown #include <thrust/partition.h>
10a2cee5feSJed Brown #include <thrust/sort.h>
11a2cee5feSJed Brown #include <thrust/unique.h>
124eb5378fSStefano Zampini #include <petscsf.h>
137e8381f9SStefano Zampini 
/* Two-slot tuple functor: assigns the first tuple component to the second.
   Applied with thrust::for_each over a zip iterator to copy (possibly
   permuted) source values into a destination range. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t) { thrust::get<1>(t) = thrust::get<0>(t); }
};
237e8381f9SStefano Zampini 
/* Release all device-side COO preallocation state attached to an
   MPIAIJCUSPARSE matrix: the extended-path index maps, permutations and
   send/receive buffers, and the basic-path permutation arrays coo_p/coo_pw.

   Not collective (makes no MPI calls). Safe to call when no COO state exists
   (NULL spptr, or extended COO never set up).

   Fix: after each cudaFree() the corresponding pointer is reset to NULL.
   The struct outlives this reset; leaving the freed addresses in place would
   leave dangling pointers that a later code path could double-free or
   dereference. cudaFree(NULL) is a documented no-op, so a repeated reset
   remains harmless. */
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    /* extended-path device arrays: free and immediately NULL each pointer */
    cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr); cusparseStruct->Aimap1_d = NULL;
    cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr); cusparseStruct->Ajmap1_d = NULL;
    cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr); cusparseStruct->Aperm1_d = NULL;
    cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr); cusparseStruct->Bimap1_d = NULL;
    cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr); cusparseStruct->Bjmap1_d = NULL;
    cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr); cusparseStruct->Bperm1_d = NULL;
    cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr); cusparseStruct->Aimap2_d = NULL;
    cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr); cusparseStruct->Ajmap2_d = NULL;
    cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr); cusparseStruct->Aperm2_d = NULL;
    cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr); cusparseStruct->Bimap2_d = NULL;
    cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr); cusparseStruct->Bjmap2_d = NULL;
    cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr); cusparseStruct->Bperm2_d = NULL;
    cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr); cusparseStruct->Cperm1_d = NULL;
    cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr); cusparseStruct->sendbuf_d = NULL;
    cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr); cusparseStruct->recvbuf_d = NULL;
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  /* basic-path permutation data; delete of NULL is a no-op */
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}
56*cbc6b225SStefano Zampini 
/* Set values on a matrix preallocated through the basic COO path.

   v[] holds one scalar per original COO entry (host or device memory may be
   passed; detected with isCudaMem); imode is forwarded to the sequential
   blocks. If preallocation built a permutation (cusp->coo_p non-NULL, meaning
   entries were reordered diagonal-block first), v is gathered through coo_p
   into the device work array coo_pw before being split between the diagonal
   block a->A (first coo_nd values) and the off-diagonal block a->B (remaining
   coo_no values). Otherwise v, if given, is assumed already ordered
   [diag values | offdiag values]. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no; /* total COO entries = diag + offdiag */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* values already on device: use in place */
    } else {
      w = new THRUSTARRAY(n); /* temporary device copy of the host-side values */
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* gather: coo_pw[k] = v[coo_p[k]], i.e. reorder values into the partitioned
       (diagonal-first) order established at preallocation time */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* no permutation: first coo_nd values belong to A, the rest (if any) to B */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
947e8381f9SStefano Zampini 
/* Predicate on (row, col, ...) tuples: true when the column (tuple slot 1)
   falls inside the local diagonal-block ownership range [_cstart, _cend). */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    const PetscInt col = thrust::get<1>(t);
    return col >= _cstart && col < _cend; /* De Morgan of !(col < cstart || col >= cend) */
  }
};
1077e8381f9SStefano Zampini 
/* Predicate on a column index: true when the column lies outside the local
   diagonal-block ownership range [_cstart, _cend). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return !(c >= _cstart && c < _cend); /* outside the owned column range */
  }
};
1197e8381f9SStefano Zampini 
/* Unary transform functor shifting a global index into the local numbering
   by subtracting the ownership-range start. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__ inline PetscInt operator() (const PetscInt &c) { return c - _start; }
};
1317e8381f9SStefano Zampini 
/* Basic COO preallocation: caller guarantees every COO entry is owned locally
   (rows inside this rank's row range, no negative indices).

   On the GPU, partitions the (i,j) lists so diagonal-block entries come first
   and off-diagonal entries follow (recording the permutation in cusp->coo_p
   when a reorder was needed), converts indices to local numbering, builds the
   compacted off-diagonal column map b->garray, and (re)creates and
   preallocates the sequential blocks b->A and b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0; /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  /* discard any previous diagonal/off-diagonal blocks before rebuilding them */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry; if diagonal entries appear after it,
     the input is not already partitioned and a permutation must be built */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n); /* permutation from user order to partitioned order */
    cusp->coo_pw = new THRUSTARRAY(n);   /* work array for permuted values in MatSetValuesCOO */
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    /* stable-partition (i, j, perm) triplets: diagonal entries first */
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* compact jj[] in place: global off-diagonal column ids -> positions in garray */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential diagonal (n x n local cols) and off-diagonal (n x noff compacted cols) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  /* propagate the parent's format/handle choices to the new blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2219ae82921SPaul Mullowney 
/* COO preallocation entry point for MPIAIJCUSPARSE.

   Collectively chooses between two paths: the "basic" path, usable only when
   every rank's COO indices are nonnegative and row-owned locally, and the
   extended path, which delegates to the host-side MatSetPreallocationCOO_MPIAIJ
   and then mirrors its index maps, permutations, and send/receive buffers on
   the device for later use by MatSetValuesCOO_MPIAIJCUSPARSE. Any previous COO
   state and the matrix scatter/colmap are destroyed first. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode         ierr;
  Mat_MPIAIJ             *mpiaij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE     *mpidev;
  PetscBool              coo_basic = PETSC_TRUE;
  PetscMemType           mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt               rstart,rend;
  cudaError_t            cerr;

  PetscFunctionBegin;
  /* wipe the off-process column map and scatter; preallocation rebuilds them */
  ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
#endif
  ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
  mat->assembled = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  ierr = MatResetPreallocationCOO_MPIAIJ(mat);CHKERRQ(ierr);
  ierr = MatResetPreallocationCOO_MPIAIJCUSPARSE(mat);CHKERRQ(ierr);
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    /* only host-resident indices are scanned; device-resident indices keep
       coo_basic = PETSC_TRUE on this rank */
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);
  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    ierr = MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);CHKERRQ(ierr);
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* mirror the host-side extended COO arrays on the device:
       *imap* = nonzero indices, *jmap* = CSR-like offsets into *perm*,
       *perm* = permutations into the value array; suffix 1 = locally
       produced entries, suffix 2 = entries received from other ranks */
    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    /* pack permutation and communication buffers for off-process values */
    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}
307219fbbafSJunchao Zhang 
/* Gather kernel packing values headed to remote ranks into a send buffer:
   buf[i] = kv[perm[i]] for i in [0, nnz). Grid-stride loop, so any launch
   configuration covers all entries. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount idx = blockIdx.x*blockDim.x + threadIdx.x; idx < nnz; idx += stride) buf[idx] = kv[perm[idx]];
}
314219fbbafSJunchao Zhang 
/* Accumulation kernel: for each of the nnz target nonzeros i, adds the COO
   values kv[perm[k]] with k in [jmap[i], jmap[i+1]) into a[imap[i]].
   Grid-stride loop; each target nonzero is handled by exactly one thread, so
   the += on a[imap[i]] needs no atomics. The accumulation stays in-place on
   a[] (not a local partial sum) to preserve the original rounding order. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  const PetscCount stride = gridDim.x * blockDim.x;

  for (PetscCount idx = blockIdx.x*blockDim.x + threadIdx.x; idx < nnz; idx += stride) {
    for (PetscCount k = jmap[idx]; k < jmap[idx+1]; k++) a[imap[idx]] += kv[perm[k]];
  }
}
321219fbbafSJunchao Zhang 
/* Insert/add COO values into the matrix on the GPU.

   Extended path (use_extended_coo): values destined to remote ranks are packed
   through Cperm1 into sendbuf and exchanged with the coo_sf star forest; the
   communication is overlapped with accumulation of locally produced values
   (suffix-1 maps) into A and B, after which the received values (suffix-2
   maps) are accumulated the same way. Basic path: delegates to
   MatSetValuesCOO_MPIAIJCUSPARSE_Basic. v may live in host or device memory;
   host values are staged through a temporary device buffer. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode                 ierr;
  cudaError_t                    cerr;
  Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE             *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                            A = mpiaij->A,B = mpiaij->B;
  PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar                    *Aa,*Ba = NULL; /* Ba stays NULL on a single rank; guarded by the Bnnz* counts below */
  PetscScalar                    *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar              *v1 = v;
  const PetscCount               *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount               *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount               *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount               *Cperm1 = mpidev->Cperm1_d;
  PetscMemType                   memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      /* INSERT semantics: zero the existing values, then accumulate as if adding */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (size > 1) {
        ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
        cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      if (size > 1) { ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr); }
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) { ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr); }
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (size > 1) { ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr); }

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      if (size > 1) { ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr); }
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      if (size > 1) { ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr); }
    }
    /* release the temporary device staging buffer, if one was allocated */
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU; /* values were written on the device */
  PetscFunctionReturn(0);
}
405219fbbafSJunchao Zhang 
/* Merge the diagonal and off-diagonal blocks of A into a single sequential
   matrix *A_loc (reusing it when scall says so). When glob is non-NULL, also
   return the global column indices of the merged local columns: the owned
   columns (contiguous from the column ownership start) first, followed by the
   off-diagonal column map. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst,k,nd,no,*gidx;

    ierr = MatGetLocalSize(Ad,NULL,&nd);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&no);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(nd+no,&gidx);CHKERRQ(ierr);
    /* first nd entries: owned columns cst..cst+nd-1; remainder: cmap[] */
    for (k=0; k<nd+no; k++) gidx[k] = (k < nd) ? cst + k : cmap[k-nd];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),nd+no,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
428ed502f03SStefano Zampini 
4299ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
4309ae82921SPaul Mullowney {
431bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
432bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
4339ae82921SPaul Mullowney   PetscErrorCode     ierr;
4349ae82921SPaul Mullowney   PetscInt           i;
4359ae82921SPaul Mullowney 
4369ae82921SPaul Mullowney   PetscFunctionBegin;
4379ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
4389ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
4397e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && d_nnz) {
4409ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
4412c71b3e2SJacob Faibussowitsch       PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
4429ae82921SPaul Mullowney     }
4439ae82921SPaul Mullowney   }
4447e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && o_nnz) {
4459ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
4462c71b3e2SJacob Faibussowitsch       PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
4479ae82921SPaul Mullowney     }
4489ae82921SPaul Mullowney   }
4496a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
4506a29ce69SStefano Zampini   ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
4516a29ce69SStefano Zampini #else
4526a29ce69SStefano Zampini   ierr = PetscFree(b->colmap);CHKERRQ(ierr);
4536a29ce69SStefano Zampini #endif
4546a29ce69SStefano Zampini   ierr = PetscFree(b->garray);CHKERRQ(ierr);
4556a29ce69SStefano Zampini   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
4566a29ce69SStefano Zampini   ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
4576a29ce69SStefano Zampini   /* Because the B will have been resized we simply destroy it and create a new one each time */
4586a29ce69SStefano Zampini   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
4596a29ce69SStefano Zampini   if (!b->A) {
4609ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
4619ae82921SPaul Mullowney     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
4623bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
4636a29ce69SStefano Zampini   }
4646a29ce69SStefano Zampini   if (!b->B) {
4656a29ce69SStefano Zampini     PetscMPIInt size;
46655b25c41SPierre Jolivet     ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
4679ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
4686a29ce69SStefano Zampini     ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
4693bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
4709ae82921SPaul Mullowney   }
471d98d7c49SStefano Zampini   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
472d98d7c49SStefano Zampini   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
4736a29ce69SStefano Zampini   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
4746a29ce69SStefano Zampini   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
4759ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
4769ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
477e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
478e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
479b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
480b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
4819ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
4829ae82921SPaul Mullowney   PetscFunctionReturn(0);
4839ae82921SPaul Mullowney }
484e057df02SPaul Mullowney 
4859ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
4869ae82921SPaul Mullowney {
4879ae82921SPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
4889ae82921SPaul Mullowney   PetscErrorCode ierr;
4899ae82921SPaul Mullowney 
4909ae82921SPaul Mullowney   PetscFunctionBegin;
4919ae82921SPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
4924d55d066SJunchao Zhang   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
4939ae82921SPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
4949ae82921SPaul Mullowney   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
4959ae82921SPaul Mullowney   PetscFunctionReturn(0);
4969ae82921SPaul Mullowney }
497ca45077fSPaul Mullowney 
4983fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
4993fa6b06aSMark Adams {
5003fa6b06aSMark Adams   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
5013fa6b06aSMark Adams   PetscErrorCode ierr;
5023fa6b06aSMark Adams 
5033fa6b06aSMark Adams   PetscFunctionBegin;
5043fa6b06aSMark Adams   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
5053fa6b06aSMark Adams   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
5063fa6b06aSMark Adams   PetscFunctionReturn(0);
5073fa6b06aSMark Adams }
508042217e8SBarry Smith 
509fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
510fdc842d1SBarry Smith {
511fdc842d1SBarry Smith   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
512fdc842d1SBarry Smith   PetscErrorCode ierr;
513fdc842d1SBarry Smith 
514fdc842d1SBarry Smith   PetscFunctionBegin;
515fdc842d1SBarry Smith   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
5164d55d066SJunchao Zhang   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
517fdc842d1SBarry Smith   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
518fdc842d1SBarry Smith   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
519fdc842d1SBarry Smith   PetscFunctionReturn(0);
520fdc842d1SBarry Smith }
521fdc842d1SBarry Smith 
522ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
523ca45077fSPaul Mullowney {
524ca45077fSPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
525ca45077fSPaul Mullowney   PetscErrorCode ierr;
526ca45077fSPaul Mullowney 
527ca45077fSPaul Mullowney   PetscFunctionBegin;
5289b2db222SKarl Rupp   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
529ca45077fSPaul Mullowney   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
5309b2db222SKarl Rupp   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
5319b2db222SKarl Rupp   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
532ca45077fSPaul Mullowney   PetscFunctionReturn(0);
533ca45077fSPaul Mullowney }
5349ae82921SPaul Mullowney 
535e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
536ca45077fSPaul Mullowney {
537ca45077fSPaul Mullowney   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
538bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
539e057df02SPaul Mullowney 
540ca45077fSPaul Mullowney   PetscFunctionBegin;
541e057df02SPaul Mullowney   switch (op) {
542e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
543e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
544045c96e1SPaul Mullowney     break;
545e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
546e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
547045c96e1SPaul Mullowney     break;
548e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
549e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
550e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
551045c96e1SPaul Mullowney     break;
552e057df02SPaul Mullowney   default:
55398921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
554045c96e1SPaul Mullowney   }
555ca45077fSPaul Mullowney   PetscFunctionReturn(0);
556ca45077fSPaul Mullowney }
557e057df02SPaul Mullowney 
5584416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
559e057df02SPaul Mullowney {
560e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
561e057df02SPaul Mullowney   PetscErrorCode           ierr;
562e057df02SPaul Mullowney   PetscBool                flg;
563a183c035SDominic Meiser   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
564a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
5655fd66863SKarl Rupp 
566e057df02SPaul Mullowney   PetscFunctionBegin;
567e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
568e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
569e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
570a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
571e057df02SPaul Mullowney     if (flg) {
572e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
573e057df02SPaul Mullowney     }
574e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
575a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
576e057df02SPaul Mullowney     if (flg) {
577e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
578e057df02SPaul Mullowney     }
579e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
580a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
581e057df02SPaul Mullowney     if (flg) {
582e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
583e057df02SPaul Mullowney     }
584e057df02SPaul Mullowney   }
5850af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
586e057df02SPaul Mullowney   PetscFunctionReturn(0);
587e057df02SPaul Mullowney }
588e057df02SPaul Mullowney 
58934d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
59034d6c7a5SJose E. Roman {
59134d6c7a5SJose E. Roman   PetscErrorCode     ierr;
5923fa6b06aSMark Adams   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
593042217e8SBarry Smith   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
594042217e8SBarry Smith   PetscObjectState   onnz = A->nonzerostate;
595042217e8SBarry Smith 
59634d6c7a5SJose E. Roman   PetscFunctionBegin;
59734d6c7a5SJose E. Roman   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
598042217e8SBarry Smith   if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
599042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
600042217e8SBarry Smith     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
601042217e8SBarry Smith     cudaError_t                cerr;
602042217e8SBarry Smith 
603042217e8SBarry Smith     ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
604042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
605042217e8SBarry Smith     cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
606042217e8SBarry Smith     cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
60799551766SMark Adams     if (h_mat->allocated_indices) {
60899551766SMark Adams       cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
60999551766SMark Adams       cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
61099551766SMark Adams       if (h_mat->offdiag.j) {
61199551766SMark Adams         cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
61299551766SMark Adams         cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
61399551766SMark Adams       }
61499551766SMark Adams     }
615042217e8SBarry Smith     cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
616042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
617042217e8SBarry Smith     cusp->deviceMat = NULL;
6183fa6b06aSMark Adams   }
61934d6c7a5SJose E. Roman   PetscFunctionReturn(0);
62034d6c7a5SJose E. Roman }
62134d6c7a5SJose E. Roman 
622bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
623bbf3fe20SPaul Mullowney {
624bbf3fe20SPaul Mullowney   PetscErrorCode     ierr;
625219fbbafSJunchao Zhang   cudaError_t        cerr;
6263fa6b06aSMark Adams   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
6273fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
628b06137fdSPaul Mullowney   cusparseStatus_t   stat;
629bbf3fe20SPaul Mullowney 
630bbf3fe20SPaul Mullowney   PetscFunctionBegin;
6312c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
6323fa6b06aSMark Adams   if (cusparseStruct->deviceMat) {
633042217e8SBarry Smith     PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
634042217e8SBarry Smith 
6353fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
636042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
637042217e8SBarry Smith     cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
638042217e8SBarry Smith     cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
63999551766SMark Adams     if (h_mat->allocated_indices) {
64099551766SMark Adams       cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
64199551766SMark Adams       cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
64299551766SMark Adams       if (h_mat->offdiag.j) {
64399551766SMark Adams         cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
64499551766SMark Adams         cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
64599551766SMark Adams       }
64699551766SMark Adams     }
647042217e8SBarry Smith     cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
648042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
6493fa6b06aSMark Adams   }
650bbf3fe20SPaul Mullowney   try {
6513fa6b06aSMark Adams     if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
6523fa6b06aSMark Adams     if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
65357d48284SJunchao Zhang     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
654a0e72f99SJunchao Zhang     /* We want cusparseStruct to use PetscDefaultCudaStream
65517403302SKarl Rupp     if (cusparseStruct->stream) {
656042217e8SBarry Smith       cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
65717403302SKarl Rupp     }
658a0e72f99SJunchao Zhang     */
659*cbc6b225SStefano Zampini     /* Free COO */
660*cbc6b225SStefano Zampini     ierr = MatResetPreallocationCOO_MPIAIJCUSPARSE(A);CHKERRQ(ierr);
661bbf3fe20SPaul Mullowney     delete cusparseStruct;
662bbf3fe20SPaul Mullowney   } catch(char *ex) {
66398921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
664bbf3fe20SPaul Mullowney   }
6653338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
666ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
6677e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
6687e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
6693338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
670ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
671bbf3fe20SPaul Mullowney   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
672bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
673bbf3fe20SPaul Mullowney }
674ca45077fSPaul Mullowney 
6753338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
6769ae82921SPaul Mullowney {
6779ae82921SPaul Mullowney   PetscErrorCode     ierr;
678bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
679b06137fdSPaul Mullowney   cusparseStatus_t   stat;
6803338378cSStefano Zampini   Mat                A;
6819ae82921SPaul Mullowney 
6829ae82921SPaul Mullowney   PetscFunctionBegin;
683219fbbafSJunchao Zhang   ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
6843338378cSStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
6853338378cSStefano Zampini     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
6863338378cSStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
6873338378cSStefano Zampini     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
6883338378cSStefano Zampini   }
6893338378cSStefano Zampini   A = *newmat;
6906f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
69134136279SStefano Zampini   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
69234136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
69334136279SStefano Zampini 
694bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ*)A->data;
695d98d7c49SStefano Zampini   if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
696d98d7c49SStefano Zampini   if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
697d98d7c49SStefano Zampini   if (a->lvec) {
698d98d7c49SStefano Zampini     ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
699d98d7c49SStefano Zampini   }
700d98d7c49SStefano Zampini 
7013338378cSStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
702219fbbafSJunchao Zhang     Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
703219fbbafSJunchao Zhang     stat     = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
704219fbbafSJunchao Zhang     a->spptr = cusp;
7053338378cSStefano Zampini   }
7062205254eSKarl Rupp 
70734d6c7a5SJose E. Roman   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
708bbf3fe20SPaul Mullowney   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
709fdc842d1SBarry Smith   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
710bbf3fe20SPaul Mullowney   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
711bbf3fe20SPaul Mullowney   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
712bbf3fe20SPaul Mullowney   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
7133fa6b06aSMark Adams   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
7144e84afc0SStefano Zampini   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
7152205254eSKarl Rupp 
716bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
717ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
7183338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
719bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
7207e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
7217e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
722ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
723ae48a8d0SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
724ae48a8d0SStefano Zampini #endif
7259ae82921SPaul Mullowney   PetscFunctionReturn(0);
7269ae82921SPaul Mullowney }
7279ae82921SPaul Mullowney 
7283338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
7293338378cSStefano Zampini {
7303338378cSStefano Zampini   PetscErrorCode ierr;
7313338378cSStefano Zampini 
7323338378cSStefano Zampini   PetscFunctionBegin;
733a4af0ceeSJacob Faibussowitsch   ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
7343338378cSStefano Zampini   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
7353338378cSStefano Zampini   ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
7363338378cSStefano Zampini   PetscFunctionReturn(0);
7373338378cSStefano Zampini }
7383338378cSStefano Zampini 
7393f9c0db1SPaul Mullowney /*@
7403f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
741e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
7423f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
743e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
744e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
745e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
7469ae82921SPaul Mullowney 
747d083f849SBarry Smith    Collective
748e057df02SPaul Mullowney 
749e057df02SPaul Mullowney    Input Parameters:
750e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
751e057df02SPaul Mullowney .  m - number of rows
752e057df02SPaul Mullowney .  n - number of columns
753e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
754e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
7550298fd71SBarry Smith          (possibly different for each row) or NULL
756e057df02SPaul Mullowney 
757e057df02SPaul Mullowney    Output Parameter:
758e057df02SPaul Mullowney .  A - the matrix
759e057df02SPaul Mullowney 
760e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
761e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
762e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
763e057df02SPaul Mullowney 
764e057df02SPaul Mullowney    Notes:
765e057df02SPaul Mullowney    If nnz is given then nz is ignored
766e057df02SPaul Mullowney 
767e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
768e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
769e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
770e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
771e057df02SPaul Mullowney 
772e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
7730298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
774e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
775e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
776e057df02SPaul Mullowney 
777e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
778e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
779e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
780e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
781e057df02SPaul Mullowney 
782e057df02SPaul Mullowney    Level: intermediate
783e057df02SPaul Mullowney 
784e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
785e057df02SPaul Mullowney @*/
7869ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
7879ae82921SPaul Mullowney {
7889ae82921SPaul Mullowney   PetscErrorCode ierr;
7899ae82921SPaul Mullowney   PetscMPIInt    size;
7909ae82921SPaul Mullowney 
7919ae82921SPaul Mullowney   PetscFunctionBegin;
7929ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
7939ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
794ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
7959ae82921SPaul Mullowney   if (size > 1) {
7969ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
7979ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
7989ae82921SPaul Mullowney   } else {
7999ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
8009ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
8019ae82921SPaul Mullowney   }
8029ae82921SPaul Mullowney   PetscFunctionReturn(0);
8039ae82921SPaul Mullowney }
8049ae82921SPaul Mullowney 
8053ca39a21SBarry Smith /*MC
8066bb69460SJunchao Zhang    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE.
807e057df02SPaul Mullowney 
8082692e278SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
8092692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
8102692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
8119ae82921SPaul Mullowney 
8129ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
8139ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
8149ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
8159ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
8169ae82921SPaul Mullowney    the above preallocation routines for simplicity.
8179ae82921SPaul Mullowney 
8189ae82921SPaul Mullowney    Options Database Keys:
819e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
8208468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8218468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8228468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8239ae82921SPaul Mullowney 
8249ae82921SPaul Mullowney   Level: beginner
8259ae82921SPaul Mullowney 
8266bb69460SJunchao Zhang  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
8276bb69460SJunchao Zhang M*/
8286bb69460SJunchao Zhang 
8296bb69460SJunchao Zhang /*MC
8306bb69460SJunchao Zhang    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE.
8316bb69460SJunchao Zhang 
8326bb69460SJunchao Zhang   Level: beginner
8336bb69460SJunchao Zhang 
8346bb69460SJunchao Zhang  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
8359ae82921SPaul Mullowney M*/
8363fa6b06aSMark Adams 
837042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
838042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
8393fa6b06aSMark Adams {
840042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
841042217e8SBarry Smith   PetscMPIInt                size;
8423fa6b06aSMark Adams   PetscErrorCode             ierr;
843042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
844042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
84599551766SMark Adams   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
846042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
847042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
8483fa6b06aSMark Adams 
8493fa6b06aSMark Adams   PetscFunctionBegin;
8502c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
851042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
8523fa6b06aSMark Adams     *B = NULL;
8533fa6b06aSMark Adams     PetscFunctionReturn(0);
8543fa6b06aSMark Adams   }
855042217e8SBarry Smith   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
85699551766SMark Adams   // get jaca
8573fa6b06aSMark Adams   if (size == 1) {
858042217e8SBarry Smith     PetscBool isseqaij;
859042217e8SBarry Smith 
860042217e8SBarry Smith     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
861042217e8SBarry Smith     if (isseqaij) {
8623fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
8632c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
864042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
865042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
866042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
8673fa6b06aSMark Adams     } else {
8683fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8692c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
870042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8713fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
872042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
873042217e8SBarry Smith       d_mat = spptr->deviceMat;
874042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
8753fa6b06aSMark Adams     }
876042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
877042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
8782c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
879042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
880042217e8SBarry Smith       bi = NULL;
881042217e8SBarry Smith       bj = NULL;
882042217e8SBarry Smith       ba = NULL;
883042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
884042217e8SBarry Smith   } else {
885042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8862c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
887042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
88899551766SMark Adams     jacb = (Mat_SeqAIJ*)aij->B->data;
889042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8903fa6b06aSMark Adams 
8912c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
892042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
893042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
8942c71b3e2SJacob Faibussowitsch     PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
895042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
896042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
897042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
898042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
899042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
9002c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
9012c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
902042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
903042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
9043fa6b06aSMark Adams       if (jacb->compressedrow.use) {
905042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
906042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
907042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
9083fa6b06aSMark Adams         }
909042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
910042217e8SBarry Smith       } else {
911042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
912042217e8SBarry Smith       }
913042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
914042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
915042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
916042217e8SBarry Smith     d_mat = spptr->deviceMat;
917042217e8SBarry Smith   }
9183fa6b06aSMark Adams   if (jaca->compressedrow.use) {
919042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
920042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
921042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
9223fa6b06aSMark Adams     }
923042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
9243fa6b06aSMark Adams   } else {
925042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
9263fa6b06aSMark Adams   }
927042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
928042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
929042217e8SBarry Smith 
930042217e8SBarry Smith   if (!d_mat) {
931042217e8SBarry Smith     cudaError_t                cerr;
932042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
933042217e8SBarry Smith 
934042217e8SBarry Smith     // create and populate strucy on host and copy on device
935042217e8SBarry Smith     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
936042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
937042217e8SBarry Smith     cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
938042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
939042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
94099551766SMark Adams       PetscInt   *colmap;
941042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
942042217e8SBarry Smith 
9432c71b3e2SJacob Faibussowitsch       PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
944042217e8SBarry Smith 
945042217e8SBarry Smith       ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
946042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
947365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
948365b711fSMark Adams       { // have to make a long version of these
94999551766SMark Adams         int        *h_bi32, *h_bj32;
95099551766SMark Adams         PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
951365b711fSMark Adams         ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
95299551766SMark Adams         cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
95399551766SMark Adams         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
95499551766SMark Adams         cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
95599551766SMark Adams         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
95699551766SMark Adams 
95799551766SMark Adams         cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
95899551766SMark Adams         cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
95999551766SMark Adams         cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
96099551766SMark Adams         cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
96199551766SMark Adams 
96299551766SMark Adams         h_mat->offdiag.i = d_bi64;
96399551766SMark Adams         h_mat->offdiag.j = d_bj64;
96499551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
96599551766SMark Adams 
966365b711fSMark Adams         ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
967365b711fSMark Adams       }
968365b711fSMark Adams #else
96999551766SMark Adams       h_mat->offdiag.i = (PetscInt*)bi;
97099551766SMark Adams       h_mat->offdiag.j = (PetscInt*)bj;
97199551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
972365b711fSMark Adams #endif
973042217e8SBarry Smith       h_mat->offdiag.a = ba;
974042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
975042217e8SBarry Smith 
97699551766SMark Adams       cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
97799551766SMark Adams       cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
978042217e8SBarry Smith       ierr = PetscFree(colmap);CHKERRQ(ierr);
979042217e8SBarry Smith     }
980042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
981042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
982042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
983042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
98449b994a9SMark Adams     h_mat->M      = A->cmap->N;
985365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
986365b711fSMark Adams     {
9872c71b3e2SJacob Faibussowitsch       PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt));
98899551766SMark Adams       int        *h_ai32, *h_aj32;
98999551766SMark Adams       PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
990365b711fSMark Adams       ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
99199551766SMark Adams       cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
99299551766SMark Adams       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
99399551766SMark Adams       cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
99499551766SMark Adams       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
99599551766SMark Adams 
99699551766SMark Adams       cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
99799551766SMark Adams       cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
99899551766SMark Adams       cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
99999551766SMark Adams       cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
100099551766SMark Adams 
100199551766SMark Adams       h_mat->diag.i = d_ai64;
100299551766SMark Adams       h_mat->diag.j = d_aj64;
100399551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
100499551766SMark Adams 
1005365b711fSMark Adams       ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
1006365b711fSMark Adams     }
1007365b711fSMark Adams #else
100899551766SMark Adams     h_mat->diag.i = (PetscInt*)ai;
100999551766SMark Adams     h_mat->diag.j = (PetscInt*)aj;
101099551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
1011365b711fSMark Adams #endif
1012042217e8SBarry Smith     h_mat->diag.a = aa;
1013042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
1014042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
1015042217e8SBarry Smith     // copy pointers and metadata to device
1016042217e8SBarry Smith     cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1017042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
1018042217e8SBarry Smith   } else {
1019042217e8SBarry Smith     ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
1020042217e8SBarry Smith   }
1021042217e8SBarry Smith   *B = d_mat;
1022042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
10233fa6b06aSMark Adams   PetscFunctionReturn(0);
10243fa6b06aSMark Adams }
1025