xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 28b400f66ebc7ae0049166a2294dfcd3df27e64b)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
30fd2b57fSKarl Rupp 
43d13b8fdSMatthew G. Knepley #include <petscconf.h>
59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
87e8381f9SStefano Zampini #include <thrust/advance.h>
9a2cee5feSJed Brown #include <thrust/partition.h>
10a2cee5feSJed Brown #include <thrust/sort.h>
11a2cee5feSJed Brown #include <thrust/unique.h>
124eb5378fSStefano Zampini #include <petscsf.h>
137e8381f9SStefano Zampini 
147e8381f9SStefano Zampini struct VecCUDAEquals
157e8381f9SStefano Zampini {
167e8381f9SStefano Zampini   template <typename Tuple>
177e8381f9SStefano Zampini   __host__ __device__
187e8381f9SStefano Zampini   void operator()(Tuple t)
197e8381f9SStefano Zampini   {
207e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
217e8381f9SStefano Zampini   }
227e8381f9SStefano Zampini };
237e8381f9SStefano Zampini 
24cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
25cbc6b225SStefano Zampini {
26cbc6b225SStefano Zampini   Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)mat->data;
27cbc6b225SStefano Zampini   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
28cbc6b225SStefano Zampini 
29cbc6b225SStefano Zampini   PetscFunctionBegin;
30cbc6b225SStefano Zampini   if (!cusparseStruct) PetscFunctionReturn(0);
31cbc6b225SStefano Zampini   if (cusparseStruct->use_extended_coo) {
325f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Aimap1_d));
335f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Ajmap1_d));
345f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Aperm1_d));
355f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Bimap1_d));
365f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Bjmap1_d));
375f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Bperm1_d));
385f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Aimap2_d));
395f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Ajmap2_d));
405f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Aperm2_d));
415f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Bimap2_d));
425f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Bjmap2_d));
435f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Bperm2_d));
445f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->Cperm1_d));
455f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->sendbuf_d));
465f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(cusparseStruct->recvbuf_d));
47cbc6b225SStefano Zampini   }
48cbc6b225SStefano Zampini   cusparseStruct->use_extended_coo = PETSC_FALSE;
49cbc6b225SStefano Zampini   delete cusparseStruct->coo_p;
50cbc6b225SStefano Zampini   delete cusparseStruct->coo_pw;
51cbc6b225SStefano Zampini   cusparseStruct->coo_p  = NULL;
52cbc6b225SStefano Zampini   cusparseStruct->coo_pw = NULL;
53cbc6b225SStefano Zampini   PetscFunctionReturn(0);
54cbc6b225SStefano Zampini }
55cbc6b225SStefano Zampini 
56219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
577e8381f9SStefano Zampini {
587e8381f9SStefano Zampini   Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
597e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
607e8381f9SStefano Zampini   PetscInt           n = cusp->coo_nd + cusp->coo_no;
617e8381f9SStefano Zampini 
627e8381f9SStefano Zampini   PetscFunctionBegin;
63e61fc153SStefano Zampini   if (cusp->coo_p && v) {
6408391a17SStefano Zampini     thrust::device_ptr<const PetscScalar> d_v;
6508391a17SStefano Zampini     THRUSTARRAY                           *w = NULL;
6608391a17SStefano Zampini 
6708391a17SStefano Zampini     if (isCudaMem(v)) {
6808391a17SStefano Zampini       d_v = thrust::device_pointer_cast(v);
6908391a17SStefano Zampini     } else {
70e61fc153SStefano Zampini       w = new THRUSTARRAY(n);
71e61fc153SStefano Zampini       w->assign(v,v+n);
725f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar)));
7308391a17SStefano Zampini       d_v = w->data();
7408391a17SStefano Zampini     }
7508391a17SStefano Zampini 
7608391a17SStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
777e8381f9SStefano Zampini                                                               cusp->coo_pw->begin()));
7808391a17SStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
797e8381f9SStefano Zampini                                                               cusp->coo_pw->end()));
805f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogGpuTimeBegin());
817e8381f9SStefano Zampini     thrust::for_each(zibit,zieit,VecCUDAEquals());
825f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogGpuTimeEnd());
83e61fc153SStefano Zampini     delete w;
845f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode));
855f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode));
867e8381f9SStefano Zampini   } else {
875f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode));
885f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode));
897e8381f9SStefano Zampini   }
907e8381f9SStefano Zampini   PetscFunctionReturn(0);
917e8381f9SStefano Zampini }
927e8381f9SStefano Zampini 
937e8381f9SStefano Zampini template <typename Tuple>
947e8381f9SStefano Zampini struct IsNotOffDiagT
957e8381f9SStefano Zampini {
967e8381f9SStefano Zampini   PetscInt _cstart,_cend;
977e8381f9SStefano Zampini 
987e8381f9SStefano Zampini   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
997e8381f9SStefano Zampini   __host__ __device__
1007e8381f9SStefano Zampini   inline bool operator()(Tuple t)
1017e8381f9SStefano Zampini   {
1027e8381f9SStefano Zampini     return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
1037e8381f9SStefano Zampini   }
1047e8381f9SStefano Zampini };
1057e8381f9SStefano Zampini 
1067e8381f9SStefano Zampini struct IsOffDiag
1077e8381f9SStefano Zampini {
1087e8381f9SStefano Zampini   PetscInt _cstart,_cend;
1097e8381f9SStefano Zampini 
1107e8381f9SStefano Zampini   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
1117e8381f9SStefano Zampini   __host__ __device__
1127e8381f9SStefano Zampini   inline bool operator() (const PetscInt &c)
1137e8381f9SStefano Zampini   {
1147e8381f9SStefano Zampini     return c < _cstart || c >= _cend;
1157e8381f9SStefano Zampini   }
1167e8381f9SStefano Zampini };
1177e8381f9SStefano Zampini 
1187e8381f9SStefano Zampini struct GlobToLoc
1197e8381f9SStefano Zampini {
1207e8381f9SStefano Zampini   PetscInt _start;
1217e8381f9SStefano Zampini 
1227e8381f9SStefano Zampini   GlobToLoc(PetscInt start) : _start(start) {}
1237e8381f9SStefano Zampini   __host__ __device__
1247e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &c)
1257e8381f9SStefano Zampini   {
1267e8381f9SStefano Zampini     return c - _start;
1277e8381f9SStefano Zampini   }
1287e8381f9SStefano Zampini };
1297e8381f9SStefano Zampini 
130219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
1317e8381f9SStefano Zampini {
1327e8381f9SStefano Zampini   Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
1337e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
13482a78a4eSJed Brown   PetscInt               N,*jj;
1357e8381f9SStefano Zampini   size_t                 noff = 0;
136ddea5d60SJunchao Zhang   THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
1377e8381f9SStefano Zampini   THRUSTINTARRAY         d_j(n);
1387e8381f9SStefano Zampini   ISLocalToGlobalMapping l2g;
1397e8381f9SStefano Zampini 
1407e8381f9SStefano Zampini   PetscFunctionBegin;
1415f80ce2aSJacob Faibussowitsch   if (b->A) CHKERRQ(MatCUSPARSEClearHandle(b->A));
1425f80ce2aSJacob Faibussowitsch   if (b->B) CHKERRQ(MatCUSPARSEClearHandle(b->B));
1435f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&b->A));
1445f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&b->B));
1457e8381f9SStefano Zampini 
1465f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt)));
1477e8381f9SStefano Zampini   d_i.assign(coo_i,coo_i+n);
1487e8381f9SStefano Zampini   d_j.assign(coo_j,coo_j+n);
1495f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
1507e8381f9SStefano Zampini   auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1517e8381f9SStefano Zampini   auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1527e8381f9SStefano Zampini   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
1537e8381f9SStefano Zampini     cusp->coo_p = new THRUSTINTARRAY(n);
1547e8381f9SStefano Zampini     cusp->coo_pw = new THRUSTARRAY(n);
1557e8381f9SStefano Zampini     thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
1567e8381f9SStefano Zampini     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
1577e8381f9SStefano Zampini     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
1587e8381f9SStefano Zampini     auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
1597e8381f9SStefano Zampini     firstoffd = mzipp.get_iterator_tuple().get<1>();
1607e8381f9SStefano Zampini   }
1617e8381f9SStefano Zampini   cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
1627e8381f9SStefano Zampini   cusp->coo_no = thrust::distance(firstoffd,d_j.end());
1637e8381f9SStefano Zampini 
1647e8381f9SStefano Zampini   /* from global to local */
1657e8381f9SStefano Zampini   thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
1667e8381f9SStefano Zampini   thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
1675f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
1687e8381f9SStefano Zampini 
1697e8381f9SStefano Zampini   /* copy offdiag column indices to map on the CPU */
1705f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */
1715f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost));
1727e8381f9SStefano Zampini   auto o_j = d_j.begin();
1735f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeBegin());
174ddea5d60SJunchao Zhang   thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
1757e8381f9SStefano Zampini   thrust::sort(thrust::device,o_j,d_j.end());
176ddea5d60SJunchao Zhang   auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
1775f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuTimeEnd());
1787e8381f9SStefano Zampini   noff = thrust::distance(o_j,wit);
179cbc6b225SStefano Zampini   /* allocate the garray, the columns of B will not be mapped in the multiply setup */
1805f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscMalloc1(noff,&b->garray));
1815f80ce2aSJacob Faibussowitsch   CHKERRCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost));
1825f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt)));
1835f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g));
1845f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH));
1855f80ce2aSJacob Faibussowitsch   CHKERRQ(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj));
1862c71b3e2SJacob Faibussowitsch   PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
1875f80ce2aSJacob Faibussowitsch   CHKERRQ(ISLocalToGlobalMappingDestroy(&l2g));
1887e8381f9SStefano Zampini 
1895f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->A));
1905f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
1915f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(b->A,MATSEQAIJCUSPARSE));
1925f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
1935f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->B));
1945f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff));
1955f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(b->B,MATSEQAIJCUSPARSE));
1965f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
1977e8381f9SStefano Zampini 
1987e8381f9SStefano Zampini   /* GPU memory, cusparse specific call handles it internally */
1995f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get()));
2005f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj));
2015f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(jj));
2027e8381f9SStefano Zampini 
2035f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat));
2045f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat));
2055f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetHandle(b->A,cusp->handle));
2065f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetHandle(b->B,cusp->handle));
207a0e72f99SJunchao Zhang   /*
2085f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetStream(b->A,cusp->stream));
2095f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetStream(b->B,cusp->stream));
210a0e72f99SJunchao Zhang   */
2117e8381f9SStefano Zampini 
2125f80ce2aSJacob Faibussowitsch   CHKERRQ(MatBindToCPU(b->A,B->boundtocpu));
2135f80ce2aSJacob Faibussowitsch   CHKERRQ(MatBindToCPU(b->B,B->boundtocpu));
2145f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetUpMultiply_MPIAIJ(B));
2157e8381f9SStefano Zampini   PetscFunctionReturn(0);
2167e8381f9SStefano Zampini }
2179ae82921SPaul Mullowney 
218219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
219219fbbafSJunchao Zhang {
220cbc6b225SStefano Zampini   Mat_MPIAIJ         *mpiaij    = (Mat_MPIAIJ*)mat->data;
221219fbbafSJunchao Zhang   Mat_MPIAIJCUSPARSE *mpidev;
222cbc6b225SStefano Zampini   PetscBool           coo_basic = PETSC_TRUE;
223219fbbafSJunchao Zhang   PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
224219fbbafSJunchao Zhang   PetscInt            rstart,rend;
225219fbbafSJunchao Zhang 
226219fbbafSJunchao Zhang   PetscFunctionBegin;
2275f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(mpiaij->garray));
2285f80ce2aSJacob Faibussowitsch   CHKERRQ(VecDestroy(&mpiaij->lvec));
229cbc6b225SStefano Zampini #if defined(PETSC_USE_CTABLE)
2305f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableDestroy(&mpiaij->colmap));
231cbc6b225SStefano Zampini #else
2325f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(mpiaij->colmap));
233cbc6b225SStefano Zampini #endif
2345f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterDestroy(&mpiaij->Mvctx));
235cbc6b225SStefano Zampini   mat->assembled                                                                      = PETSC_FALSE;
236cbc6b225SStefano Zampini   mat->was_assembled                                                                  = PETSC_FALSE;
2375f80ce2aSJacob Faibussowitsch   CHKERRQ(MatResetPreallocationCOO_MPIAIJ(mat));
2385f80ce2aSJacob Faibussowitsch   CHKERRQ(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat));
239219fbbafSJunchao Zhang   if (coo_i) {
2405f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLayoutGetRange(mat->rmap,&rstart,&rend));
2415f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscGetMemType(coo_i,&mtype));
242219fbbafSJunchao Zhang     if (PetscMemTypeHost(mtype)) {
243219fbbafSJunchao Zhang       for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
244cbc6b225SStefano Zampini         if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
245219fbbafSJunchao Zhang       }
246219fbbafSJunchao Zhang     }
247219fbbafSJunchao Zhang   }
248219fbbafSJunchao Zhang   /* All ranks must agree on the value of coo_basic */
2495f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat)));
250219fbbafSJunchao Zhang   if (coo_basic) {
2515f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j));
252219fbbafSJunchao Zhang   } else {
2535f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j));
254cbc6b225SStefano Zampini     mat->offloadmask = PETSC_OFFLOAD_CPU;
255cbc6b225SStefano Zampini     /* creates the GPU memory */
2565f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
2575f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
258cbc6b225SStefano Zampini     mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
259219fbbafSJunchao Zhang     mpidev->use_extended_coo = PETSC_TRUE;
260219fbbafSJunchao Zhang 
2615f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount)));
2625f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount)));
2635f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount)));
264219fbbafSJunchao Zhang 
2655f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount)));
2665f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount)));
2675f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount)));
268219fbbafSJunchao Zhang 
2695f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount)));
2705f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount)));
2715f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount)));
272219fbbafSJunchao Zhang 
2735f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount)));
2745f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount)));
2755f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount)));
276219fbbafSJunchao Zhang 
2775f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount)));
2785f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar)));
2795f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar)));
280219fbbafSJunchao Zhang 
2815f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
2825f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
2835f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice));
284219fbbafSJunchao Zhang 
2855f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice));
2865f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
2875f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice));
288219fbbafSJunchao Zhang 
2895f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
2905f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
2915f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice));
292219fbbafSJunchao Zhang 
2935f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice));
2945f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice));
2955f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice));
296219fbbafSJunchao Zhang 
2975f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice));
298219fbbafSJunchao Zhang   }
299219fbbafSJunchao Zhang   PetscFunctionReturn(0);
300219fbbafSJunchao Zhang }
301219fbbafSJunchao Zhang 
302219fbbafSJunchao Zhang __global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
303219fbbafSJunchao Zhang {
304219fbbafSJunchao Zhang   PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
305219fbbafSJunchao Zhang   const PetscCount grid_size = gridDim.x * blockDim.x;
306219fbbafSJunchao Zhang   for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
307219fbbafSJunchao Zhang }
308219fbbafSJunchao Zhang 
309219fbbafSJunchao Zhang __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
310219fbbafSJunchao Zhang {
311219fbbafSJunchao Zhang   PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
312219fbbafSJunchao Zhang   const PetscCount grid_size  = gridDim.x * blockDim.x;
313219fbbafSJunchao Zhang   for (; i<nnz; i            += grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
314219fbbafSJunchao Zhang }
315219fbbafSJunchao Zhang 
316219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
317219fbbafSJunchao Zhang {
318219fbbafSJunchao Zhang   Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
319219fbbafSJunchao Zhang   Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
320219fbbafSJunchao Zhang   Mat                 A      = mpiaij->A,B = mpiaij->B;
321219fbbafSJunchao Zhang   PetscCount          Annz1  = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
322cbc6b225SStefano Zampini   PetscScalar        *Aa,*Ba = NULL;
323219fbbafSJunchao Zhang   PetscScalar        *vsend  = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
324219fbbafSJunchao Zhang   const PetscScalar  *v1     = v;
325219fbbafSJunchao Zhang   const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
326219fbbafSJunchao Zhang   const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
327219fbbafSJunchao Zhang   const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
328219fbbafSJunchao Zhang   const PetscCount   *Cperm1 = mpidev->Cperm1_d;
329219fbbafSJunchao Zhang   PetscMemType        memtype;
330219fbbafSJunchao Zhang 
331219fbbafSJunchao Zhang   PetscFunctionBegin;
332219fbbafSJunchao Zhang   if (mpidev->use_extended_coo) {
333cbc6b225SStefano Zampini     PetscMPIInt size;
334cbc6b225SStefano Zampini 
3355f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size));
3365f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscGetMemType(v,&memtype));
337219fbbafSJunchao Zhang     if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
3385f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar)));
3395f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice));
340219fbbafSJunchao Zhang     }
341219fbbafSJunchao Zhang 
342219fbbafSJunchao Zhang     if (imode == INSERT_VALUES) {
3435f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */
3445f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar)));
345cbc6b225SStefano Zampini       if (size > 1) {
3465f80ce2aSJacob Faibussowitsch         CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba));
3475f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar)));
348cbc6b225SStefano Zampini       }
349219fbbafSJunchao Zhang     } else {
3505f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */
3515f80ce2aSJacob Faibussowitsch       if (size > 1) CHKERRQ(MatSeqAIJCUSPARSEGetArray(B,&Ba));
352219fbbafSJunchao Zhang     }
353219fbbafSJunchao Zhang 
354219fbbafSJunchao Zhang     /* Pack entries to be sent to remote */
355cbc6b225SStefano Zampini     if (mpiaij->sendlen) {
356219fbbafSJunchao Zhang       MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
357cbc6b225SStefano Zampini       CHKERRCUDA(cudaPeekAtLastError());
358cbc6b225SStefano Zampini     }
359219fbbafSJunchao Zhang 
360219fbbafSJunchao Zhang     /* Send remote entries to their owner and overlap the communication with local computation */
3615f80ce2aSJacob Faibussowitsch     if (size > 1) CHKERRQ(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE));
362219fbbafSJunchao Zhang     /* Add local entries to A and B */
363cbc6b225SStefano Zampini     if (Annz1) {
364219fbbafSJunchao Zhang       MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
365cbc6b225SStefano Zampini       CHKERRCUDA(cudaPeekAtLastError());
366cbc6b225SStefano Zampini     }
367cbc6b225SStefano Zampini     if (Bnnz1) {
368219fbbafSJunchao Zhang       MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
369cbc6b225SStefano Zampini       CHKERRCUDA(cudaPeekAtLastError());
370cbc6b225SStefano Zampini     }
3715f80ce2aSJacob Faibussowitsch     if (size > 1) CHKERRQ(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE));
372219fbbafSJunchao Zhang 
373219fbbafSJunchao Zhang     /* Add received remote entries to A and B */
374cbc6b225SStefano Zampini     if (Annz2) {
375219fbbafSJunchao Zhang       MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
376cbc6b225SStefano Zampini       CHKERRCUDA(cudaPeekAtLastError());
377cbc6b225SStefano Zampini     }
378cbc6b225SStefano Zampini     if (Bnnz2) {
379219fbbafSJunchao Zhang       MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
380cbc6b225SStefano Zampini       CHKERRCUDA(cudaPeekAtLastError());
381cbc6b225SStefano Zampini     }
382219fbbafSJunchao Zhang 
383219fbbafSJunchao Zhang     if (imode == INSERT_VALUES) {
3845f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa));
3855f80ce2aSJacob Faibussowitsch       if (size > 1) CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba));
386219fbbafSJunchao Zhang     } else {
3875f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSERestoreArray(A,&Aa));
3885f80ce2aSJacob Faibussowitsch       if (size > 1) CHKERRQ(MatSeqAIJCUSPARSERestoreArray(B,&Ba));
389219fbbafSJunchao Zhang     }
3905f80ce2aSJacob Faibussowitsch     if (PetscMemTypeHost(memtype)) CHKERRCUDA(cudaFree((void*)v1));
391219fbbafSJunchao Zhang   } else {
3925f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode));
393219fbbafSJunchao Zhang   }
394cbc6b225SStefano Zampini   mat->offloadmask = PETSC_OFFLOAD_GPU;
395219fbbafSJunchao Zhang   PetscFunctionReturn(0);
396219fbbafSJunchao Zhang }
397219fbbafSJunchao Zhang 
398ed502f03SStefano Zampini static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
399ed502f03SStefano Zampini {
400ed502f03SStefano Zampini   Mat             Ad,Ao;
401ed502f03SStefano Zampini   const PetscInt *cmap;
402ed502f03SStefano Zampini 
403ed502f03SStefano Zampini   PetscFunctionBegin;
4045f80ce2aSJacob Faibussowitsch   CHKERRQ(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap));
4055f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc));
406ed502f03SStefano Zampini   if (glob) {
407ed502f03SStefano Zampini     PetscInt cst, i, dn, on, *gidx;
408ed502f03SStefano Zampini 
4095f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetLocalSize(Ad,NULL,&dn));
4105f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetLocalSize(Ao,NULL,&on));
4115f80ce2aSJacob Faibussowitsch     CHKERRQ(MatGetOwnershipRangeColumn(A,&cst,NULL));
4125f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscMalloc1(dn+on,&gidx));
413ed502f03SStefano Zampini     for (i = 0; i<dn; i++) gidx[i]    = cst + i;
414ed502f03SStefano Zampini     for (i = 0; i<on; i++) gidx[i+dn] = cmap[i];
4155f80ce2aSJacob Faibussowitsch     CHKERRQ(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob));
416ed502f03SStefano Zampini   }
417ed502f03SStefano Zampini   PetscFunctionReturn(0);
418ed502f03SStefano Zampini }
419ed502f03SStefano Zampini 
4209ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
4219ae82921SPaul Mullowney {
422bbf3fe20SPaul Mullowney   Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data;
423bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
4249ae82921SPaul Mullowney   PetscInt           i;
4259ae82921SPaul Mullowney 
4269ae82921SPaul Mullowney   PetscFunctionBegin;
4275f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(B->rmap));
4285f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscLayoutSetUp(B->cmap));
4297e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && d_nnz) {
4309ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
4312c71b3e2SJacob Faibussowitsch       PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
4329ae82921SPaul Mullowney     }
4339ae82921SPaul Mullowney   }
4347e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && o_nnz) {
4359ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
4362c71b3e2SJacob Faibussowitsch       PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
4379ae82921SPaul Mullowney     }
4389ae82921SPaul Mullowney   }
4396a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
4405f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscTableDestroy(&b->colmap));
4416a29ce69SStefano Zampini #else
4425f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(b->colmap));
4436a29ce69SStefano Zampini #endif
4445f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(b->garray));
4455f80ce2aSJacob Faibussowitsch   CHKERRQ(VecDestroy(&b->lvec));
4465f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterDestroy(&b->Mvctx));
4476a29ce69SStefano Zampini   /* Because the B will have been resized we simply destroy it and create a new one each time */
4485f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy(&b->B));
4496a29ce69SStefano Zampini   if (!b->A) {
4505f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->A));
4515f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n));
4525f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A));
4536a29ce69SStefano Zampini   }
4546a29ce69SStefano Zampini   if (!b->B) {
4556a29ce69SStefano Zampini     PetscMPIInt size;
4565f80ce2aSJacob Faibussowitsch     CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size));
4575f80ce2aSJacob Faibussowitsch     CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->B));
4585f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0));
4595f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B));
4609ae82921SPaul Mullowney   }
4615f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(b->A,MATSEQAIJCUSPARSE));
4625f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetType(b->B,MATSEQAIJCUSPARSE));
4635f80ce2aSJacob Faibussowitsch   CHKERRQ(MatBindToCPU(b->A,B->boundtocpu));
4645f80ce2aSJacob Faibussowitsch   CHKERRQ(MatBindToCPU(b->B,B->boundtocpu));
4655f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz));
4665f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz));
4675f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat));
4685f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat));
4695f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetHandle(b->A,cusparseStruct->handle));
4705f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCUSPARSESetHandle(b->B,cusparseStruct->handle));
4719ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
4729ae82921SPaul Mullowney   PetscFunctionReturn(0);
4739ae82921SPaul Mullowney }
474e057df02SPaul Mullowney 
4759ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
4769ae82921SPaul Mullowney {
4779ae82921SPaul Mullowney   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
4789ae82921SPaul Mullowney 
4799ae82921SPaul Mullowney   PetscFunctionBegin;
4805f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
4815f80ce2aSJacob Faibussowitsch   CHKERRQ((*a->A->ops->mult)(a->A,xx,yy));
4825f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
4835f80ce2aSJacob Faibussowitsch   CHKERRQ((*a->B->ops->multadd)(a->B,a->lvec,yy,yy));
4849ae82921SPaul Mullowney   PetscFunctionReturn(0);
4859ae82921SPaul Mullowney }
486ca45077fSPaul Mullowney 
4873fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
4883fa6b06aSMark Adams {
4893fa6b06aSMark Adams   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
4903fa6b06aSMark Adams 
4913fa6b06aSMark Adams   PetscFunctionBegin;
4925f80ce2aSJacob Faibussowitsch   CHKERRQ(MatZeroEntries(l->A));
4935f80ce2aSJacob Faibussowitsch   CHKERRQ(MatZeroEntries(l->B));
4943fa6b06aSMark Adams   PetscFunctionReturn(0);
4953fa6b06aSMark Adams }
496042217e8SBarry Smith 
497fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
498fdc842d1SBarry Smith {
499fdc842d1SBarry Smith   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
500fdc842d1SBarry Smith 
501fdc842d1SBarry Smith   PetscFunctionBegin;
5025f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
5035f80ce2aSJacob Faibussowitsch   CHKERRQ((*a->A->ops->multadd)(a->A,xx,yy,zz));
5045f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD));
5055f80ce2aSJacob Faibussowitsch   CHKERRQ((*a->B->ops->multadd)(a->B,a->lvec,zz,zz));
506fdc842d1SBarry Smith   PetscFunctionReturn(0);
507fdc842d1SBarry Smith }
508fdc842d1SBarry Smith 
509ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
510ca45077fSPaul Mullowney {
511ca45077fSPaul Mullowney   Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data;
512ca45077fSPaul Mullowney 
513ca45077fSPaul Mullowney   PetscFunctionBegin;
5145f80ce2aSJacob Faibussowitsch   CHKERRQ((*a->B->ops->multtranspose)(a->B,xx,a->lvec));
5155f80ce2aSJacob Faibussowitsch   CHKERRQ((*a->A->ops->multtranspose)(a->A,xx,yy));
5165f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
5175f80ce2aSJacob Faibussowitsch   CHKERRQ(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE));
518ca45077fSPaul Mullowney   PetscFunctionReturn(0);
519ca45077fSPaul Mullowney }
5209ae82921SPaul Mullowney 
521e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
522ca45077fSPaul Mullowney {
523ca45077fSPaul Mullowney   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
524bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
525e057df02SPaul Mullowney 
526ca45077fSPaul Mullowney   PetscFunctionBegin;
527e057df02SPaul Mullowney   switch (op) {
528e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
529e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
530045c96e1SPaul Mullowney     break;
531e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
532e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
533045c96e1SPaul Mullowney     break;
534e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
535e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
536e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
537045c96e1SPaul Mullowney     break;
538e057df02SPaul Mullowney   default:
53998921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
540045c96e1SPaul Mullowney   }
541ca45077fSPaul Mullowney   PetscFunctionReturn(0);
542ca45077fSPaul Mullowney }
543e057df02SPaul Mullowney 
5444416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
545e057df02SPaul Mullowney {
546e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
547e057df02SPaul Mullowney   PetscBool                flg;
548a183c035SDominic Meiser   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
549a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
5505fd66863SKarl Rupp 
551e057df02SPaul Mullowney   PetscFunctionBegin;
5525f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options"));
553e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
5545f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
5555f80ce2aSJacob Faibussowitsch                              "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
5565f80ce2aSJacob Faibussowitsch     if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format));
5575f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
5585f80ce2aSJacob Faibussowitsch                              "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg));
5595f80ce2aSJacob Faibussowitsch     if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format));
5605f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
5615f80ce2aSJacob Faibussowitsch                              "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg));
5625f80ce2aSJacob Faibussowitsch     if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format));
563e057df02SPaul Mullowney   }
5645f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscOptionsTail());
565e057df02SPaul Mullowney   PetscFunctionReturn(0);
566e057df02SPaul Mullowney }
567e057df02SPaul Mullowney 
56834d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
56934d6c7a5SJose E. Roman {
5703fa6b06aSMark Adams   Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
571042217e8SBarry Smith   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
572042217e8SBarry Smith   PetscObjectState   onnz = A->nonzerostate;
573042217e8SBarry Smith 
57434d6c7a5SJose E. Roman   PetscFunctionBegin;
5755f80ce2aSJacob Faibussowitsch   CHKERRQ(MatAssemblyEnd_MPIAIJ(A,mode));
5765f80ce2aSJacob Faibussowitsch   if (mpiaij->lvec) CHKERRQ(VecSetType(mpiaij->lvec,VECSEQCUDA));
577042217e8SBarry Smith   if (onnz != A->nonzerostate && cusp->deviceMat) {
578042217e8SBarry Smith     PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
579042217e8SBarry Smith 
5805f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(A,"Destroy device mat since nonzerostate changed\n"));
5815f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscNew(&h_mat));
5825f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
5835f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(h_mat->colmap));
58499551766SMark Adams     if (h_mat->allocated_indices) {
5855f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaFree(h_mat->diag.i));
5865f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaFree(h_mat->diag.j));
58799551766SMark Adams       if (h_mat->offdiag.j) {
5885f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaFree(h_mat->offdiag.i));
5895f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaFree(h_mat->offdiag.j));
59099551766SMark Adams       }
59199551766SMark Adams     }
5925f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(d_mat));
5935f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(h_mat));
594042217e8SBarry Smith     cusp->deviceMat = NULL;
5953fa6b06aSMark Adams   }
59634d6c7a5SJose E. Roman   PetscFunctionReturn(0);
59734d6c7a5SJose E. Roman }
59834d6c7a5SJose E. Roman 
599bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
600bbf3fe20SPaul Mullowney {
6013fa6b06aSMark Adams   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
6023fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
603bbf3fe20SPaul Mullowney 
604bbf3fe20SPaul Mullowney   PetscFunctionBegin;
605*28b400f6SJacob Faibussowitsch   PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
6063fa6b06aSMark Adams   if (cusparseStruct->deviceMat) {
607042217e8SBarry Smith     PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
608042217e8SBarry Smith 
6095f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(A,"Have device matrix\n"));
6105f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscNew(&h_mat));
6115f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost));
6125f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(h_mat->colmap));
61399551766SMark Adams     if (h_mat->allocated_indices) {
6145f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaFree(h_mat->diag.i));
6155f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaFree(h_mat->diag.j));
61699551766SMark Adams       if (h_mat->offdiag.j) {
6175f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaFree(h_mat->offdiag.i));
6185f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaFree(h_mat->offdiag.j));
61999551766SMark Adams       }
62099551766SMark Adams     }
6215f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaFree(d_mat));
6225f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(h_mat));
6233fa6b06aSMark Adams   }
624bbf3fe20SPaul Mullowney   try {
6255f80ce2aSJacob Faibussowitsch     if (aij->A) CHKERRQ(MatCUSPARSEClearHandle(aij->A));
6265f80ce2aSJacob Faibussowitsch     if (aij->B) CHKERRQ(MatCUSPARSEClearHandle(aij->B));
6275f80ce2aSJacob Faibussowitsch     CHKERRCUSPARSE(cusparseDestroy(cusparseStruct->handle));
628a0e72f99SJunchao Zhang     /* We want cusparseStruct to use PetscDefaultCudaStream
62917403302SKarl Rupp     if (cusparseStruct->stream) {
6305f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaStreamDestroy(cusparseStruct->stream));
63117403302SKarl Rupp     }
632a0e72f99SJunchao Zhang     */
633cbc6b225SStefano Zampini     /* Free COO */
6345f80ce2aSJacob Faibussowitsch     CHKERRQ(MatResetPreallocationCOO_MPIAIJCUSPARSE(A));
635bbf3fe20SPaul Mullowney     delete cusparseStruct;
636bbf3fe20SPaul Mullowney   } catch(char *ex) {
63798921bdaSJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
638bbf3fe20SPaul Mullowney   }
6395f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL));
6405f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL));
6415f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL));
6425f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL));
6435f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL));
6445f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL));
6455f80ce2aSJacob Faibussowitsch   CHKERRQ(MatDestroy_MPIAIJ(A));
646bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
647bbf3fe20SPaul Mullowney }
648ca45077fSPaul Mullowney 
6493338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
6509ae82921SPaul Mullowney {
651bbf3fe20SPaul Mullowney   Mat_MPIAIJ *a;
6523338378cSStefano Zampini   Mat         A;
6539ae82921SPaul Mullowney 
6549ae82921SPaul Mullowney   PetscFunctionBegin;
6555f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
6565f80ce2aSJacob Faibussowitsch   if (reuse == MAT_INITIAL_MATRIX) CHKERRQ(MatDuplicate(B,MAT_COPY_VALUES,newmat));
6575f80ce2aSJacob Faibussowitsch   else if (reuse == MAT_REUSE_MATRIX) CHKERRQ(MatCopy(B,*newmat,SAME_NONZERO_PATTERN));
6583338378cSStefano Zampini   A = *newmat;
6596f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
6605f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscFree(A->defaultvectype));
6615f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscStrallocpy(VECCUDA,&A->defaultvectype));
66234136279SStefano Zampini 
663bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ*)A->data;
6645f80ce2aSJacob Faibussowitsch   if (a->A) CHKERRQ(MatSetType(a->A,MATSEQAIJCUSPARSE));
6655f80ce2aSJacob Faibussowitsch   if (a->B) CHKERRQ(MatSetType(a->B,MATSEQAIJCUSPARSE));
6665f80ce2aSJacob Faibussowitsch   if (a->lvec) CHKERRQ(VecSetType(a->lvec,VECSEQCUDA));
667d98d7c49SStefano Zampini 
6683338378cSStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
669219fbbafSJunchao Zhang     Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
6705f80ce2aSJacob Faibussowitsch     CHKERRCUSPARSE(cusparseCreate(&(cusp->handle)));
671219fbbafSJunchao Zhang     a->spptr = cusp;
6723338378cSStefano Zampini   }
6732205254eSKarl Rupp 
67434d6c7a5SJose E. Roman   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
675bbf3fe20SPaul Mullowney   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
676fdc842d1SBarry Smith   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
677bbf3fe20SPaul Mullowney   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
678bbf3fe20SPaul Mullowney   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
679bbf3fe20SPaul Mullowney   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
6803fa6b06aSMark Adams   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
6814e84afc0SStefano Zampini   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
6822205254eSKarl Rupp 
6835f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE));
6845f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
6855f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
6865f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE));
6875f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE));
6885f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE));
689ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
6905f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE));
691ae48a8d0SStefano Zampini #endif
6929ae82921SPaul Mullowney   PetscFunctionReturn(0);
6939ae82921SPaul Mullowney }
6949ae82921SPaul Mullowney 
6953338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
6963338378cSStefano Zampini {
6973338378cSStefano Zampini   PetscFunctionBegin;
6985f80ce2aSJacob Faibussowitsch   CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
6995f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate_MPIAIJ(A));
7005f80ce2aSJacob Faibussowitsch   CHKERRQ(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A));
7013338378cSStefano Zampini   PetscFunctionReturn(0);
7023338378cSStefano Zampini }
7033338378cSStefano Zampini 
7043f9c0db1SPaul Mullowney /*@
7053f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
706e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
7073f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
708e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
709e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
710e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
7119ae82921SPaul Mullowney 
712d083f849SBarry Smith    Collective
713e057df02SPaul Mullowney 
714e057df02SPaul Mullowney    Input Parameters:
715e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
716e057df02SPaul Mullowney .  m - number of rows
717e057df02SPaul Mullowney .  n - number of columns
718e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
719e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
7200298fd71SBarry Smith          (possibly different for each row) or NULL
721e057df02SPaul Mullowney 
722e057df02SPaul Mullowney    Output Parameter:
723e057df02SPaul Mullowney .  A - the matrix
724e057df02SPaul Mullowney 
725e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
726e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
727e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
728e057df02SPaul Mullowney 
729e057df02SPaul Mullowney    Notes:
730e057df02SPaul Mullowney    If nnz is given then nz is ignored
731e057df02SPaul Mullowney 
732e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
733e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
734e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
735e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
736e057df02SPaul Mullowney 
737e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
7380298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
739e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
740e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
741e057df02SPaul Mullowney 
742e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
743e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
744e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
745e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
746e057df02SPaul Mullowney 
747e057df02SPaul Mullowney    Level: intermediate
748e057df02SPaul Mullowney 
749e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
750e057df02SPaul Mullowney @*/
7519ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
7529ae82921SPaul Mullowney {
7539ae82921SPaul Mullowney   PetscMPIInt size;
7549ae82921SPaul Mullowney 
7559ae82921SPaul Mullowney   PetscFunctionBegin;
7565f80ce2aSJacob Faibussowitsch   CHKERRQ(MatCreate(comm,A));
7575f80ce2aSJacob Faibussowitsch   CHKERRQ(MatSetSizes(*A,m,n,M,N));
7585f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(comm,&size));
7599ae82921SPaul Mullowney   if (size > 1) {
7605f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetType(*A,MATMPIAIJCUSPARSE));
7615f80ce2aSJacob Faibussowitsch     CHKERRQ(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz));
7629ae82921SPaul Mullowney   } else {
7635f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSetType(*A,MATSEQAIJCUSPARSE));
7645f80ce2aSJacob Faibussowitsch     CHKERRQ(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz));
7659ae82921SPaul Mullowney   }
7669ae82921SPaul Mullowney   PetscFunctionReturn(0);
7679ae82921SPaul Mullowney }
7689ae82921SPaul Mullowney 
7693ca39a21SBarry Smith /*MC
7706bb69460SJunchao Zhang    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE.
771e057df02SPaul Mullowney 
7722692e278SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
7732692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
7742692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
7759ae82921SPaul Mullowney 
7769ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
7779ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
7789ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
7799ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
7809ae82921SPaul Mullowney    the above preallocation routines for simplicity.
7819ae82921SPaul Mullowney 
7829ae82921SPaul Mullowney    Options Database Keys:
783e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
7848468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
7858468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
7868468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
7879ae82921SPaul Mullowney 
7889ae82921SPaul Mullowney   Level: beginner
7899ae82921SPaul Mullowney 
7906bb69460SJunchao Zhang  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
7916bb69460SJunchao Zhang M*/
7926bb69460SJunchao Zhang 
7936bb69460SJunchao Zhang /*MC
7946bb69460SJunchao Zhang    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE.
7956bb69460SJunchao Zhang 
7966bb69460SJunchao Zhang   Level: beginner
7976bb69460SJunchao Zhang 
7986bb69460SJunchao Zhang  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
7999ae82921SPaul Mullowney M*/
8003fa6b06aSMark Adams 
801042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat.
802042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
8033fa6b06aSMark Adams {
804042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
805042217e8SBarry Smith   PetscMPIInt                size;
806042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
807042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
80899551766SMark Adams   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
809042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
810042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
8113fa6b06aSMark Adams 
8123fa6b06aSMark Adams   PetscFunctionBegin;
813*28b400f6SJacob Faibussowitsch   PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
814042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
8153fa6b06aSMark Adams     *B = NULL;
8163fa6b06aSMark Adams     PetscFunctionReturn(0);
8173fa6b06aSMark Adams   }
8185f80ce2aSJacob Faibussowitsch   CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size));
81999551766SMark Adams   // get jaca
8203fa6b06aSMark Adams   if (size == 1) {
821042217e8SBarry Smith     PetscBool isseqaij;
822042217e8SBarry Smith 
8235f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij));
824042217e8SBarry Smith     if (isseqaij) {
8253fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
826*28b400f6SJacob Faibussowitsch       PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
827042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
828042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
8295f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A));
8303fa6b06aSMark Adams     } else {
8313fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
832*28b400f6SJacob Faibussowitsch       PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
833042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8343fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
835042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
836042217e8SBarry Smith       d_mat = spptr->deviceMat;
8375f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8383fa6b06aSMark Adams     }
839042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
840042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
841*28b400f6SJacob Faibussowitsch       PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
842042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
843042217e8SBarry Smith       bi = NULL;
844042217e8SBarry Smith       bj = NULL;
845042217e8SBarry Smith       ba = NULL;
846042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
847042217e8SBarry Smith   } else {
848042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
849*28b400f6SJacob Faibussowitsch     PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
850042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
85199551766SMark Adams     jacb = (Mat_SeqAIJ*)aij->B->data;
852042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8533fa6b06aSMark Adams 
8542c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
855042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
856042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
8572c71b3e2SJacob Faibussowitsch     PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
858042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
8595f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->A));
8605f80ce2aSJacob Faibussowitsch       CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->B));
861042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
862042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
863*28b400f6SJacob Faibussowitsch       PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
864*28b400f6SJacob Faibussowitsch       PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
865042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
866042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
8673fa6b06aSMark Adams       if (jacb->compressedrow.use) {
868042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
869042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
870042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
8713fa6b06aSMark Adams         }
872042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
873042217e8SBarry Smith       } else {
874042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
875042217e8SBarry Smith       }
876042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
877042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
878042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
879042217e8SBarry Smith     d_mat = spptr->deviceMat;
880042217e8SBarry Smith   }
8813fa6b06aSMark Adams   if (jaca->compressedrow.use) {
882042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
883042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
884042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
8853fa6b06aSMark Adams     }
886042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
8873fa6b06aSMark Adams   } else {
888042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
8893fa6b06aSMark Adams   }
890042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
891042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
892042217e8SBarry Smith 
893042217e8SBarry Smith   if (!d_mat) {
894042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
895042217e8SBarry Smith 
896042217e8SBarry Smith     // create and populate strucy on host and copy on device
8975f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(A,"Create device matrix\n"));
8985f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscNew(&h_mat));
8995f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat)));
900042217e8SBarry Smith     if (size > 1) { /* need the colmap array */
901042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
90299551766SMark Adams       PetscInt   *colmap;
903042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
904042217e8SBarry Smith 
9052c71b3e2SJacob Faibussowitsch       PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
906042217e8SBarry Smith 
9075f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscCalloc1(N+1,&colmap));
908042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
909365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
910365b711fSMark Adams       { // have to make a long version of these
91199551766SMark Adams         int        *h_bi32, *h_bj32;
91299551766SMark Adams         PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
9135f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64));
9145f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost));
91599551766SMark Adams         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
9165f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost));
91799551766SMark Adams         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
91899551766SMark Adams 
9195f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64)));
9205f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice));
9215f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64)));
9225f80ce2aSJacob Faibussowitsch         CHKERRCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice));
92399551766SMark Adams 
92499551766SMark Adams         h_mat->offdiag.i = d_bi64;
92599551766SMark Adams         h_mat->offdiag.j = d_bj64;
92699551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
92799551766SMark Adams 
9285f80ce2aSJacob Faibussowitsch         CHKERRQ(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64));
929365b711fSMark Adams       }
930365b711fSMark Adams #else
93199551766SMark Adams       h_mat->offdiag.i = (PetscInt*)bi;
93299551766SMark Adams       h_mat->offdiag.j = (PetscInt*)bj;
93399551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
934365b711fSMark Adams #endif
935042217e8SBarry Smith       h_mat->offdiag.a = ba;
936042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
937042217e8SBarry Smith 
9385f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap)));
9395f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice));
9405f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscFree(colmap));
941042217e8SBarry Smith     }
942042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
943042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
944042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
945042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
94649b994a9SMark Adams     h_mat->M      = A->cmap->N;
947365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
948365b711fSMark Adams     {
9492c71b3e2SJacob Faibussowitsch       PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt));
95099551766SMark Adams       int        *h_ai32, *h_aj32;
95199551766SMark Adams       PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
9525f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64));
9535f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost));
95499551766SMark Adams       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
9555f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost));
95699551766SMark Adams       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
95799551766SMark Adams 
9585f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64)));
9595f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice));
9605f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64)));
9615f80ce2aSJacob Faibussowitsch       CHKERRCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice));
96299551766SMark Adams 
96399551766SMark Adams       h_mat->diag.i = d_ai64;
96499551766SMark Adams       h_mat->diag.j = d_aj64;
96599551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
96699551766SMark Adams 
9675f80ce2aSJacob Faibussowitsch       CHKERRQ(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64));
968365b711fSMark Adams     }
969365b711fSMark Adams #else
97099551766SMark Adams     h_mat->diag.i = (PetscInt*)ai;
97199551766SMark Adams     h_mat->diag.j = (PetscInt*)aj;
97299551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
973365b711fSMark Adams #endif
974042217e8SBarry Smith     h_mat->diag.a = aa;
975042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
976042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
977042217e8SBarry Smith     // copy pointers and metadata to device
9785f80ce2aSJacob Faibussowitsch     CHKERRCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice));
9795f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscFree(h_mat));
980042217e8SBarry Smith   } else {
9815f80ce2aSJacob Faibussowitsch     CHKERRQ(PetscInfo(A,"Reusing device matrix\n"));
982042217e8SBarry Smith   }
983042217e8SBarry Smith   *B = d_mat;
984042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
9853fa6b06aSMark Adams   PetscFunctionReturn(0);
9863fa6b06aSMark Adams }
987