/* /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 7487cd7ca1dddf3cbc146be559ee2e39856c5efc) */
#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

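/* Functor for thrust::for_each over a zip iterator: copies the first tuple
   element into the second.  Combined with a permutation iterator below, this
   implements a device-side gather. */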
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
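    /* Device-side gather: for each k, coo_pw[k] = d_v[coo_p[k]], i.e. the
       user-provided values are reordered into the diagonal-first,
       off-diagonal-last layout computed at preallocation time. */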
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};
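
/* How the functors above cooperate in the preallocation below (an illustrative
   sketch, not executed here): with this rank owning columns [cstart,cend) and
   rows starting at rstart,

     thrust::partition(thrust::device,first,last,IsNotOffDiagT<...>(cstart,cend)); // diagonal-block entries first
     thrust::transform(thrust::device,i_first,i_last,i_first,GlobToLoc(rstart));   // global -> local row ids

   so that the diagonal block A and the off-diagonal block B can each be
   assembled from a contiguous sub-range of the COO arrays. */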

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* returns the end iterator of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
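  /* b->garray[] now maps the compact local column ids of the off-diagonal block
     to global column ids; the hash-based local-to-global mapping created below
     is applied in reverse to translate the off-diagonal COO column indices in
     jj[] from global ids to compact local ids. */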
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected IS size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask = PETSC_OFFLOAD_CPU;
  B->assembled = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode         ierr;
  Mat                    newmat;
  Mat_MPIAIJ             *mpiaij;
  Mat_MPIAIJCUSPARSE     *mpidev;
  PetscInt               coo_basic = 1;
  PetscMemType           mtype = PETSC_MEMTYPE_DEVICE;
  PetscInt               rstart,rend;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderMerge() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

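    /* Mirror on the device the host-side COO assembly metadata built by
       MatSetPreallocationCOO_MPIAIJ: the *1 arrays describe COO entries
       contributed by this rank, the *2 arrays describe entries received from
       remote ranks, and Cperm1 selects the entries this rank must send away. */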
    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

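/* Grid-stride CUDA kernels used by MatSetValuesCOO_MPIAIJCUSPARSE below: each
   thread starts at its global index and advances by the total number of threads
   in the grid, so a fixed launch of (nnz+255)/256 blocks of 256 threads covers
   any nnz.
   MatPackCOOValues gathers values destined for remote ranks into a send buffer:
   buf[i] = kv[perm[i]].
   MatAddCOOValues accumulates (possibly repeated) COO entries into the matrix
   value array: a[imap[i]] += sum of kv[perm[k]] for k in [jmap[i], jmap[i+1]). */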
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount        i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount  grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode                 ierr;
  cudaError_t                    cerr;
  Mat_MPIAIJ                     *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE             *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                            A = mpiaij->A,B = mpiaij->B;
  PetscCount                     Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar                    *Aa,*Ba;
  PetscScalar                    *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar              *v1 = v;
  const PetscCount               *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount               *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount               *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount               *Cperm1 = mpidev->Cperm1_d;
  PetscMemType                   memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, we need to copy it to the device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote ranks */
    MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owners and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because b->B will have been resized, we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with a pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
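  /* Overlap communication with computation: start scattering the needed
     off-process entries of xx into a->lvec, apply the diagonal block while the
     messages are in flight, then finish the scatter and add the off-diagonal
     block contribution. */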
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat     = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
7293338378cSStefano Zampini 
7303f9c0db1SPaul Mullowney /*@
7313f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
732e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
7333f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
734e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
735e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
736e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
7379ae82921SPaul Mullowney 
738d083f849SBarry Smith    Collective
739e057df02SPaul Mullowney 
740e057df02SPaul Mullowney    Input Parameters:
741e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
742e057df02SPaul Mullowney .  m - number of rows
743e057df02SPaul Mullowney .  n - number of columns
744e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
745e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
7460298fd71SBarry Smith          (possibly different for each row) or NULL
747e057df02SPaul Mullowney 
748e057df02SPaul Mullowney    Output Parameter:
749e057df02SPaul Mullowney .  A - the matrix
750e057df02SPaul Mullowney 
751e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
752e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
753e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
754e057df02SPaul Mullowney 
755e057df02SPaul Mullowney    Notes:
756e057df02SPaul Mullowney    If nnz is given then nz is ignored
757e057df02SPaul Mullowney 
758e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
759e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
760e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
761e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
762e057df02SPaul Mullowney 
763e057df02SPaul Mullowney    Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
7640298fd71SBarry Smith    Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
765e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
766e057df02SPaul Mullowney    will get TERRIBLE performance; see the users' manual chapter on matrices.
767e057df02SPaul Mullowney 
768e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
769e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
770e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
771e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
772e057df02SPaul Mullowney 
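   Example (a minimal sketch; the global size 100 and the preallocation counts
   5 and 2 are illustrative assumptions, not prescribed values):
.vb
      Mat A;
      ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
      /* ... insert entries with MatSetValues(), then assemble ... */
      ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
      ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
      ierr = MatDestroy(&A);CHKERRQ(ierr);
.ve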
773e057df02SPaul Mullowney    Level: intermediate
774e057df02SPaul Mullowney 
775e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
776e057df02SPaul Mullowney @*/
7779ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
7789ae82921SPaul Mullowney {
7799ae82921SPaul Mullowney   PetscErrorCode ierr;
7809ae82921SPaul Mullowney   PetscMPIInt    size;
7819ae82921SPaul Mullowney 
7829ae82921SPaul Mullowney   PetscFunctionBegin;
7839ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
7849ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
785ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
7869ae82921SPaul Mullowney   if (size > 1) {
7879ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
7889ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
7899ae82921SPaul Mullowney   } else {
7909ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
7919ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
7929ae82921SPaul Mullowney   }
7939ae82921SPaul Mullowney   PetscFunctionReturn(0);
7949ae82921SPaul Mullowney }
7959ae82921SPaul Mullowney 
7963ca39a21SBarry Smith /*MC
7976bb69460SJunchao Zhang    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.
798e057df02SPaul Mullowney 
7992692e278SPaul Mullowney    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
8002692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
8012692e278SPaul Mullowney    All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.
8029ae82921SPaul Mullowney 
8039ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
8049ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
8059ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
8069ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
8079ae82921SPaul Mullowney    the above preallocation routines for simplicity.
8089ae82921SPaul Mullowney 
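   For example (a sketch of the call-both pattern; nz, d_nz, and o_nz are
   illustrative preallocation counts):
.vb
      ierr = MatCreate(comm,&A);CHKERRQ(ierr);
      ierr = MatSetSizes(A,m,n,PETSC_DETERMINE,PETSC_DETERMINE);CHKERRQ(ierr);
      ierr = MatSetType(A,MATAIJCUSPARSE);CHKERRQ(ierr);
      ierr = MatSeqAIJSetPreallocation(A,nz,NULL);CHKERRQ(ierr);             /* used when comm has one process */
      ierr = MatMPIAIJSetPreallocation(A,d_nz,NULL,o_nz,NULL);CHKERRQ(ierr); /* used when comm has several */
.ve
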
8099ae82921SPaul Mullowney    Options Database Keys:
810e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
8118468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8128468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
8138468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
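
   For example, to select HYB storage for the off-diagonal block from the command
   line (the executable name here is illustrative):
.vb
      ./my_app -mat_type mpiaijcusparse -mat_cusparse_mult_offdiag_storage_format hyb
.ve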
8149ae82921SPaul Mullowney 
8159ae82921SPaul Mullowney   Level: beginner
8169ae82921SPaul Mullowney 
8176bb69460SJunchao Zhang  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
8186bb69460SJunchao Zhang M*/
8196bb69460SJunchao Zhang 
8206bb69460SJunchao Zhang /*MC
8216bb69460SJunchao Zhang    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
8226bb69460SJunchao Zhang 
8236bb69460SJunchao Zhang   Level: beginner
8246bb69460SJunchao Zhang 
8256bb69460SJunchao Zhang  .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
8269ae82921SPaul Mullowney M*/
8273fa6b06aSMark Adams 
828042217e8SBarry Smith // Get GPU pointers to a stripped-down CSR view of the Mat, for both sequential and MPI Mat
829042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
8303fa6b06aSMark Adams {
831042217e8SBarry Smith   PetscSplitCSRDataStructure d_mat;
832042217e8SBarry Smith   PetscMPIInt                size;
8333fa6b06aSMark Adams   PetscErrorCode             ierr;
834042217e8SBarry Smith   int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
835042217e8SBarry Smith   PetscScalar                *aa = NULL,*ba = NULL;
83699551766SMark Adams   Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
837042217e8SBarry Smith   Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
838042217e8SBarry Smith   CsrMatrix                  *matrixA = NULL,*matrixB = NULL;
8393fa6b06aSMark Adams 
8403fa6b06aSMark Adams   PetscFunctionBegin;
8412c71b3e2SJacob Faibussowitsch   PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
842042217e8SBarry Smith   if (A->factortype != MAT_FACTOR_NONE) {
8433fa6b06aSMark Adams     *B = NULL;
8443fa6b06aSMark Adams     PetscFunctionReturn(0);
8453fa6b06aSMark Adams   }
846042217e8SBarry Smith   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
84799551766SMark Adams   // get jaca, the host-side Mat_SeqAIJ data for the local (diagonal) block
8483fa6b06aSMark Adams   if (size == 1) {
849042217e8SBarry Smith     PetscBool isseqaij;
850042217e8SBarry Smith 
851042217e8SBarry Smith     ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
852042217e8SBarry Smith     if (isseqaij) {
8533fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
8542c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
855042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
856042217e8SBarry Smith       d_mat = cusparsestructA->deviceMat;
857042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
8583fa6b06aSMark Adams     } else {
8593fa6b06aSMark Adams       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8602c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
861042217e8SBarry Smith       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8623fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
863042217e8SBarry Smith       cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
864042217e8SBarry Smith       d_mat = spptr->deviceMat;
865042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
8663fa6b06aSMark Adams     }
867042217e8SBarry Smith     if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
868042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
8692c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
870042217e8SBarry Smith       matrixA = (CsrMatrix*)matstruct->mat;
871042217e8SBarry Smith       bi = NULL;
872042217e8SBarry Smith       bj = NULL;
873042217e8SBarry Smith       ba = NULL;
874042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
875042217e8SBarry Smith   } else {
876042217e8SBarry Smith     Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8772c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
878042217e8SBarry Smith     jaca = (Mat_SeqAIJ*)aij->A->data;
87999551766SMark Adams     jacb = (Mat_SeqAIJ*)aij->B->data;
880042217e8SBarry Smith     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8813fa6b06aSMark Adams 
8822c71b3e2SJacob Faibussowitsch     PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
883042217e8SBarry Smith     cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
884042217e8SBarry Smith     Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
8852c71b3e2SJacob Faibussowitsch     PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
886042217e8SBarry Smith     if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
887042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
888042217e8SBarry Smith       ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
889042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
890042217e8SBarry Smith       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
8912c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
8922c71b3e2SJacob Faibussowitsch       PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
893042217e8SBarry Smith       matrixA = (CsrMatrix*)matstructA->mat;
894042217e8SBarry Smith       matrixB = (CsrMatrix*)matstructB->mat;
8953fa6b06aSMark Adams       if (jacb->compressedrow.use) {
896042217e8SBarry Smith         if (!cusparsestructB->rowoffsets_gpu) {
897042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
898042217e8SBarry Smith           cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
8993fa6b06aSMark Adams         }
900042217e8SBarry Smith         bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
901042217e8SBarry Smith       } else {
902042217e8SBarry Smith         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
903042217e8SBarry Smith       }
904042217e8SBarry Smith       bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
905042217e8SBarry Smith       ba = thrust::raw_pointer_cast(matrixB->values->data());
906042217e8SBarry Smith     } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
907042217e8SBarry Smith     d_mat = spptr->deviceMat;
908042217e8SBarry Smith   }
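  // Get device pointers to A's CSR arrays; with compressed-row storage the full
  // row-offsets array is built on the GPU and cached the first time it is needed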
9093fa6b06aSMark Adams   if (jaca->compressedrow.use) {
910042217e8SBarry Smith     if (!cusparsestructA->rowoffsets_gpu) {
911042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
912042217e8SBarry Smith       cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
9133fa6b06aSMark Adams     }
914042217e8SBarry Smith     ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
9153fa6b06aSMark Adams   } else {
916042217e8SBarry Smith     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
9173fa6b06aSMark Adams   }
918042217e8SBarry Smith   aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
919042217e8SBarry Smith   aa = thrust::raw_pointer_cast(matrixA->values->data());
920042217e8SBarry Smith 
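  // The device-side struct is created lazily on the first call and cached in spptr;
  // subsequent calls fall through to the "Reusing device matrix" branch below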
921042217e8SBarry Smith   if (!d_mat) {
922042217e8SBarry Smith     cudaError_t                cerr;
923042217e8SBarry Smith     PetscSplitCSRDataStructure h_mat;
924042217e8SBarry Smith 
925042217e8SBarry Smith     // create and populate the struct on the host, then copy it to the device
926042217e8SBarry Smith     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
927042217e8SBarry Smith     ierr = PetscNew(&h_mat);CHKERRQ(ierr);
928042217e8SBarry Smith     cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
929042217e8SBarry Smith     if (size > 1) { /* need the colmap array: maps a global column to local off-diagonal column + 1 (0 means column not present) */
930042217e8SBarry Smith       Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
93199551766SMark Adams       PetscInt   *colmap;
932042217e8SBarry Smith       PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;
933042217e8SBarry Smith 
9342c71b3e2SJacob Faibussowitsch       PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
935042217e8SBarry Smith 
936042217e8SBarry Smith       ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
937042217e8SBarry Smith       for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = ii + 1; /* 1-based so 0 marks a column absent from the off-diagonal block */
938365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
939365b711fSMark Adams       { // with 64-bit PetscInt the 32-bit cuSPARSE indices must be widened: copy to host, convert, copy back to device
94099551766SMark Adams         int        *h_bi32, *h_bj32;
94199551766SMark Adams         PetscInt   *h_bi64, *h_bj64, *d_bi64, *d_bj64;
942365b711fSMark Adams         ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
94399551766SMark Adams         cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
94499551766SMark Adams         for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
94599551766SMark Adams         cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
94699551766SMark Adams         for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
94799551766SMark Adams 
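        // allocate device arrays for the widened 64-bit indices and upload them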
94899551766SMark Adams         cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
94999551766SMark Adams         cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
95099551766SMark Adams         cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
95199551766SMark Adams         cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
95299551766SMark Adams 
95399551766SMark Adams         h_mat->offdiag.i = d_bi64;
95499551766SMark Adams         h_mat->offdiag.j = d_bj64;
95599551766SMark Adams         h_mat->allocated_indices = PETSC_TRUE;
95699551766SMark Adams 
957365b711fSMark Adams         ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
958365b711fSMark Adams       }
959365b711fSMark Adams #else
96099551766SMark Adams       h_mat->offdiag.i = (PetscInt*)bi;
96199551766SMark Adams       h_mat->offdiag.j = (PetscInt*)bj;
96299551766SMark Adams       h_mat->allocated_indices = PETSC_FALSE;
963365b711fSMark Adams #endif
964042217e8SBarry Smith       h_mat->offdiag.a = ba;
965042217e8SBarry Smith       h_mat->offdiag.n = A->rmap->n;
966042217e8SBarry Smith 
96799551766SMark Adams       cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
96899551766SMark Adams       cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
969042217e8SBarry Smith       ierr = PetscFree(colmap);CHKERRQ(ierr);
970042217e8SBarry Smith     }
971042217e8SBarry Smith     h_mat->rstart = A->rmap->rstart;
972042217e8SBarry Smith     h_mat->rend   = A->rmap->rend;
973042217e8SBarry Smith     h_mat->cstart = A->cmap->rstart;
974042217e8SBarry Smith     h_mat->cend   = A->cmap->rend;
97549b994a9SMark Adams     h_mat->M      = A->cmap->N;
976365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
977365b711fSMark Adams     {
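      // same widening as for the off-diagonal block above: bring the 32-bit cuSPARSE
      // indices to the host, convert to 64-bit PetscInt, and upload to the device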
9782c71b3e2SJacob Faibussowitsch       PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
97999551766SMark Adams       int        *h_ai32, *h_aj32;
98099551766SMark Adams       PetscInt   *h_ai64, *h_aj64, *d_ai64, *d_aj64;
981365b711fSMark Adams       ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
98299551766SMark Adams       cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
98399551766SMark Adams       for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
98499551766SMark Adams       cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
98599551766SMark Adams       for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
98699551766SMark Adams 
98799551766SMark Adams       cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
98899551766SMark Adams       cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
98999551766SMark Adams       cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
99099551766SMark Adams       cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
99199551766SMark Adams 
99299551766SMark Adams       h_mat->diag.i = d_ai64;
99399551766SMark Adams       h_mat->diag.j = d_aj64;
99499551766SMark Adams       h_mat->allocated_indices = PETSC_TRUE;
99599551766SMark Adams 
996365b711fSMark Adams       ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
997365b711fSMark Adams     }
998365b711fSMark Adams #else
99999551766SMark Adams     h_mat->diag.i = (PetscInt*)ai;
100099551766SMark Adams     h_mat->diag.j = (PetscInt*)aj;
100199551766SMark Adams     h_mat->allocated_indices = PETSC_FALSE;
1002365b711fSMark Adams #endif
1003042217e8SBarry Smith     h_mat->diag.a = aa;
1004042217e8SBarry Smith     h_mat->diag.n = A->rmap->n;
1005042217e8SBarry Smith     h_mat->rank   = PetscGlobalRank;
1006042217e8SBarry Smith     // copy pointers and metadata to device
1007042217e8SBarry Smith     cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1008042217e8SBarry Smith     ierr = PetscFree(h_mat);CHKERRQ(ierr);
1009042217e8SBarry Smith   } else {
1010042217e8SBarry Smith     ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
1011042217e8SBarry Smith   }
1012042217e8SBarry Smith   *B = d_mat;
1013042217e8SBarry Smith   A->offloadmask = PETSC_OFFLOAD_GPU;
10143fa6b06aSMark Adams   PetscFunctionReturn(0);
10153fa6b06aSMark Adams }
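
/*
  A minimal sketch of how the device matrix returned by MatCUSPARSEGetDeviceMatWrite()
  is typically consumed: a CUDA kernel receives the PetscSplitCSRDataStructure and
  inserts entries with MatSetValuesDevice() (declared in petscaijdevice.h), using
  global row and column indices within this rank's ownership range. The kernel name
  and launch configuration below are illustrative assumptions, not part of this file.

    #include <petscaijdevice.h>

    __global__ void AssembleDiagonal(PetscSplitCSRDataStructure d_mat, PetscInt rstart, PetscInt rend)
    {
      PetscInt    i = rstart + blockIdx.x*blockDim.x + threadIdx.x;
      PetscScalar v = 1.0;

      if (i < rend) (void)MatSetValuesDevice(d_mat,1,&i,1,&i,&v,ADD_VALUES); // add 1.0 to the diagonal entry
    }

  After the kernel completes, MatAssemblyBegin/End(A,MAT_FINAL_ASSEMBLY) finalize the
  matrix as usual.
*/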
1016