#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};
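/*
  An illustrative sketch (hypothetical names, not used elsewhere in this file) of how the
  VecCUDAEquals functor is applied: zipped with a permutation iterator it gathers a permuted
  view of a source array into a destination array on the device,

    thrust::device_vector<PetscScalar> src(n), dst(n);
    thrust::device_vector<PetscInt>    perm(n);  // some permutation of 0..n-1
    auto first = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(src.begin(),perm.begin()),dst.begin()));
    auto last  = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(src.begin(),perm.end()),dst.end()));
    thrust::for_each(first,last,VecCUDAEquals());  // dst[k] = src[perm[k]]

  MatSetValuesCOO_MPIAIJCUSPARSE_Basic() below uses exactly this pattern to reorder user-provided
  COO values into the diagonal-first/off-diagonal-last order computed at preallocation time.
*/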
static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat)
{
  cudaError_t         cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;

  PetscFunctionBegin;
  if (!cusparseStruct) PetscFunctionReturn(0);
  if (cusparseStruct->use_extended_coo) {
    cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
    cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
  }
  cusparseStruct->use_extended_coo = PETSC_FALSE;
  delete cusparseStruct->coo_p;
  delete cusparseStruct->coo_pw;
  cusparseStruct->coo_p  = NULL;
  cusparseStruct->coo_pw = NULL;
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt            n    = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode      ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                          *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ            *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE    *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* allocate the garray, the columns of B will not be mapped in the multiply setup */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)mat->data;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscBool           coo_basic = PETSC_TRUE;
  PetscMemType        mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt            rstart,rend;
  cudaError_t         cerr;

  PetscFunctionBegin;
  ierr = PetscFree(mpiaij->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&mpiaij->lvec);CHKERRQ(ierr);
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&mpiaij->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(mpiaij->colmap);CHKERRQ(ierr);
#endif
  ierr = VecScatterDestroy(&mpiaij->Mvctx);CHKERRQ(ierr);
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  ierr = MatResetPreallocationCOO_MPIAIJ(mat);CHKERRQ(ierr);
  ierr = MatResetPreallocationCOO_MPIAIJCUSPARSE(mat);CHKERRQ(ierr);
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);
  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    ierr = MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    mat->offloadmask = PETSC_OFFLOAD_CPU;
    /* creates the GPU memory */
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->A);CHKERRQ(ierr);
    ierr = MatSeqAIJCUSPARSECopyToGPU(mpiaij->B);CHKERRQ(ierr);
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i         = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}
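/*
  Notes on the two kernels above: both use a grid-stride loop, so any launch configuration that
  covers nnz threads in total is valid; the caller below uses <<<(nnz+255)/256,256>>>.
  MatPackCOOValues gathers buf[i] = kv[perm[i]] to build the send buffer of off-process values.
  MatAddCOOValues folds repeated COO entries: thread i owns the destination nonzero a[imap[i]]
  and accumulates kv[perm[k]] for k in [jmap[i], jmap[i+1]).
*/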
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode      ierr;
  cudaError_t         cerr;
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                 A = mpiaij->A,B = mpiaij->B;
  PetscCount          Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba = NULL;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType        memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    PetscMPIInt size;

    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size);CHKERRMPI(ierr);
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      if (size > 1) {
        ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
        cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      }
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      if (size > 1) { ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr); }
    }

    /* Pack entries to be sent to remote */
    if (mpiaij->sendlen) {
      MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    /* Send remote entries to their owner and overlap the communication with local computation */
    if (size > 1) { ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr); }
    /* Add local entries to A and B */
    if (Annz1) {
      MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz1) {
      MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (size > 1) { ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr); }

    /* Add received remote entries to A and B */
    if (Annz2) {
      MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
      CHKERRCUDA(cudaPeekAtLastError());
    }
    if (Bnnz2) {
      MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);
      CHKERRCUDA(cudaPeekAtLastError());
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      if (size > 1) { ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr); }
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      if (size > 1) { ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr); }
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);}
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
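/*
  A minimal user-level sketch of the COO assembly path implemented above (illustrative only;
  error checking is omitted and the sizes, communicator and index/value arrays are placeholders
  chosen by the caller):

    Mat         A;
    PetscInt    coo_i[] = {...}, coo_j[] = {...};  // global row/column indices, entries may repeat
    PetscScalar coo_v[] = {...};
    MatCreate(comm,&A);
    MatSetSizes(A,m,n,M,N);
    MatSetType(A,MATMPIAIJCUSPARSE);
    MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);   // dispatches to MatSetPreallocationCOO_MPIAIJCUSPARSE
    MatSetValuesCOO(A,coo_v,ADD_VALUES);          // dispatches to MatSetValuesCOO_MPIAIJCUSPARSE
    // repeated MatSetValuesCOO() calls with new values reuse the same preallocation

  coo_v may live in host or device memory; MatSetValuesCOO_MPIAIJCUSPARSE checks the memory type
  with PetscGetMemType() and copies host values to the device when needed.
*/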
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat             Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode      ierr;
  PetscInt            i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode  ierr;

  PetscFunctionBegin;
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
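/*
  The three options above map onto MatCUSPARSESetFormat(); an equivalent way to select the
  storage format programmatically (a sketch, assuming A is already of type MATMPIAIJCUSPARSE):

    MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,MAT_CUSPARSE_ELL);     // diagonal block only
    MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,MAT_CUSPARSE_HYB);  // off-diagonal block only
    MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_CSR);           // both blocks

  or, from the command line, e.g. -mat_cusparse_storage_format ell.
*/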
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode      ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState    onnz   = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode      ierr;
  cudaError_t         cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t    stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Free COO */
    ierr = MatResetPreallocationCOO_MPIAIJCUSPARSE(A);CHKERRQ(ierr);
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  Mat_MPIAIJ      *a;
  cusparseStatus_t stat;
  Mat              A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal portion (same for all local rows)
.  d_nnz - array containing the number of nonzeros per row in the diagonal portion
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion (same for all local rows)
-  o_nnz - array containing the number of nonzeros per row in the off-diagonal portion
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=o_nz=PETSC_DEFAULT and d_nnz=o_nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
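/*
  A small usage sketch for the routine above (illustrative; the sizes and nonzero estimates are
  placeholders chosen by the caller):

    Mat A;
    MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,M,N,5,NULL,2,NULL,&A);
    // ... MatSetValues()/MatAssemblyBegin()/MatAssemblyEnd() as for any AIJ matrix ...
    MatDestroy(&A);

  The recommended alternative, as noted in the man page, is
    MatCreate(PETSC_COMM_WORLD,&A); MatSetSizes(A,m,n,M,N); MatSetType(A,MATAIJCUSPARSE);
  followed by MatSeqAIJSetPreallocation()/MatMPIAIJSetPreallocation() or MatSetFromOptions().
*/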
8239ae82921SPaul Mullowney
8249ae82921SPaul Mullowney Level: beginner
8259ae82921SPaul Mullowney
8266bb69460SJunchao Zhang .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
8276bb69460SJunchao Zhang M*/
8286bb69460SJunchao Zhang
8296bb69460SJunchao Zhang /*MC
8306bb69460SJunchao Zhang MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.
8316bb69460SJunchao Zhang
8326bb69460SJunchao Zhang Level: beginner
8336bb69460SJunchao Zhang
8346bb69460SJunchao Zhang .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
8359ae82921SPaul Mullowney M*/
8363fa6b06aSMark Adams
837042217e8SBarry Smith // get GPU pointers to a stripped-down Mat; works for both Seq and MPI Mat
838042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
8393fa6b06aSMark Adams {
840042217e8SBarry Smith PetscSplitCSRDataStructure d_mat;
841042217e8SBarry Smith PetscMPIInt size;
8423fa6b06aSMark Adams PetscErrorCode ierr;
843042217e8SBarry Smith int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
844042217e8SBarry Smith PetscScalar *aa = NULL,*ba = NULL;
84599551766SMark Adams Mat_SeqAIJ *jaca = NULL, *jacb = NULL;
846042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructA = NULL;
847042217e8SBarry Smith CsrMatrix *matrixA = NULL,*matrixB = NULL;
8483fa6b06aSMark Adams
8493fa6b06aSMark Adams PetscFunctionBegin;
8502c71b3e2SJacob Faibussowitsch PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
851042217e8SBarry Smith if (A->factortype != MAT_FACTOR_NONE) {
8523fa6b06aSMark Adams *B = NULL;
8533fa6b06aSMark Adams PetscFunctionReturn(0);
8543fa6b06aSMark Adams }
855042217e8SBarry Smith ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
85699551766SMark Adams // get jaca
8573fa6b06aSMark Adams if (size == 1) {
858042217e8SBarry Smith PetscBool isseqaij;
859042217e8SBarry Smith
860042217e8SBarry Smith ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
861042217e8SBarry Smith if (isseqaij) {
8623fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)A->data;
8632c71b3e2SJacob Faibussowitsch PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
864042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
865042217e8SBarry Smith d_mat = cusparsestructA->deviceMat;
866042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
8673fa6b06aSMark Adams } else {
8683fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8692c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
870042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8713fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)aij->A->data;
872042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
873042217e8SBarry Smith d_mat = spptr->deviceMat;
874042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
8753fa6b06aSMark Adams }
876042217e8SBarry Smith if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
877042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
8782c71b3e2SJacob Faibussowitsch PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
879042217e8SBarry Smith matrixA = (CsrMatrix*)matstruct->mat;
880042217e8SBarry Smith bi = NULL;
881042217e8SBarry Smith bj = NULL;
882042217e8SBarry Smith ba = NULL;
883042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
884042217e8SBarry Smith } else {
885042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
8862c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
887042217e8SBarry Smith jaca = (Mat_SeqAIJ*)aij->A->data;
88899551766SMark Adams jacb = (Mat_SeqAIJ*)aij->B->data;
889042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
8903fa6b06aSMark Adams
8912c71b3e2SJacob Faibussowitsch PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
892042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
893042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
8942c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
895042217e8SBarry Smith if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
896042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
897042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
898042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
899042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
9002c71b3e2SJacob Faibussowitsch PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
9012c71b3e2SJacob Faibussowitsch PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
902042217e8SBarry Smith matrixA = (CsrMatrix*)matstructA->mat;
903042217e8SBarry Smith matrixB = (CsrMatrix*)matstructB->mat;
9043fa6b06aSMark Adams if (jacb->compressedrow.use) {
905042217e8SBarry Smith if (!cusparsestructB->rowoffsets_gpu) {
906042217e8SBarry Smith cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
907042217e8SBarry Smith cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
9083fa6b06aSMark Adams }
909042217e8SBarry Smith bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
910042217e8SBarry Smith } else {
911042217e8SBarry Smith bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
912042217e8SBarry Smith }
913042217e8SBarry Smith bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
914042217e8SBarry Smith ba = thrust::raw_pointer_cast(matrixB->values->data());
915042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
916042217e8SBarry Smith d_mat = spptr->deviceMat;
917042217e8SBarry Smith }
9183fa6b06aSMark Adams if (jaca->compressedrow.use) {
919042217e8SBarry Smith if (!cusparsestructA->rowoffsets_gpu) {
920042217e8SBarry Smith cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
921042217e8SBarry Smith cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
9223fa6b06aSMark Adams }
923042217e8SBarry Smith ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
9243fa6b06aSMark Adams } else {
925042217e8SBarry Smith ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
9263fa6b06aSMark Adams }
927042217e8SBarry Smith aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
928042217e8SBarry Smith aa = thrust::raw_pointer_cast(matrixA->values->data());
929042217e8SBarry Smith
930042217e8SBarry Smith if (!d_mat) {
931042217e8SBarry Smith cudaError_t cerr;
932042217e8SBarry Smith PetscSplitCSRDataStructure h_mat;
933042217e8SBarry Smith
934042217e8SBarry Smith // create and populate the struct on the host, then copy it to the device
935042217e8SBarry Smith ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
936042217e8SBarry Smith ierr = PetscNew(&h_mat);CHKERRQ(ierr);
937042217e8SBarry Smith cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
938042217e8SBarry Smith if (size > 1) { /* need the colmap array */
939042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
94099551766SMark Adams PetscInt *colmap;
941042217e8SBarry Smith PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N;
942042217e8SBarry Smith
9432c71b3e2SJacob Faibussowitsch PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
944042217e8SBarry Smith
945042217e8SBarry Smith ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
946042217e8SBarry Smith for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
947365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
948365b711fSMark Adams { // have to make a 64-bit (PetscInt) copy of these 32-bit device indices
94999551766SMark Adams int *h_bi32, *h_bj32;
95099551766SMark Adams PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
951365b711fSMark Adams ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
95299551766SMark Adams cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
95399551766SMark Adams for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
95499551766SMark Adams cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
95599551766SMark Adams for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];
95699551766SMark Adams
95799551766SMark Adams cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
95899551766SMark Adams cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
95999551766SMark Adams cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
96099551766SMark Adams cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
96199551766SMark Adams
96299551766SMark Adams h_mat->offdiag.i = d_bi64;
96399551766SMark Adams h_mat->offdiag.j = d_bj64;
96499551766SMark Adams h_mat->allocated_indices = PETSC_TRUE;
96599551766SMark Adams
966365b711fSMark Adams ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
967365b711fSMark Adams }
968365b711fSMark Adams #else
96999551766SMark Adams h_mat->offdiag.i = (PetscInt*)bi;
97099551766SMark Adams h_mat->offdiag.j = (PetscInt*)bj;
97199551766SMark Adams h_mat->allocated_indices = PETSC_FALSE;
972365b711fSMark Adams #endif
973042217e8SBarry Smith h_mat->offdiag.a = ba;
974042217e8SBarry Smith h_mat->offdiag.n = A->rmap->n;
975042217e8SBarry Smith
97699551766SMark Adams cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
97799551766SMark Adams cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
978042217e8SBarry Smith ierr = PetscFree(colmap);CHKERRQ(ierr);
979042217e8SBarry Smith }
980042217e8SBarry Smith h_mat->rstart = A->rmap->rstart;
981042217e8SBarry Smith h_mat->rend = A->rmap->rend;
982042217e8SBarry Smith h_mat->cstart = A->cmap->rstart;
983042217e8SBarry Smith h_mat->cend = A->cmap->rend;
98449b994a9SMark Adams h_mat->M = A->cmap->N;
985365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES)
986365b711fSMark Adams {
9872c71b3e2SJacob Faibussowitsch PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
98899551766SMark Adams int *h_ai32, *h_aj32;
98999551766SMark Adams PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
990365b711fSMark Adams ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
99199551766SMark Adams cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
99299551766SMark Adams for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
99399551766SMark Adams cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
99499551766SMark Adams for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];
99599551766SMark Adams
99699551766SMark Adams cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
99799551766SMark Adams cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
99899551766SMark Adams cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
99999551766SMark Adams cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
100099551766SMark Adams
100199551766SMark Adams h_mat->diag.i = d_ai64;
100299551766SMark Adams h_mat->diag.j = d_aj64;
100399551766SMark Adams h_mat->allocated_indices = PETSC_TRUE;
100499551766SMark Adams
1005365b711fSMark Adams ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
1006365b711fSMark Adams }
1007365b711fSMark Adams #else
100899551766SMark Adams h_mat->diag.i = (PetscInt*)ai;
100999551766SMark Adams h_mat->diag.j = (PetscInt*)aj;
101099551766SMark Adams h_mat->allocated_indices = PETSC_FALSE;
1011365b711fSMark Adams #endif
1012042217e8SBarry Smith h_mat->diag.a = aa;
1013042217e8SBarry Smith h_mat->diag.n = A->rmap->n;
1014042217e8SBarry Smith h_mat->rank = PetscGlobalRank;
1015042217e8SBarry Smith // copy pointers and metadata to device
1016042217e8SBarry Smith cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
1017042217e8SBarry Smith ierr = PetscFree(h_mat);CHKERRQ(ierr);
1018042217e8SBarry Smith } else {
1019042217e8SBarry Smith ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
1020042217e8SBarry Smith }
1021042217e8SBarry Smith *B = d_mat;
1022042217e8SBarry Smith A->offloadmask = PETSC_OFFLOAD_GPU;
10233fa6b06aSMark Adams PetscFunctionReturn(0);
10243fa6b06aSMark Adams }
1025
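/*
   An illustrative usage sketch for MatCUSPARSEGetDeviceMatWrite(): obtain the device-side
   split-CSR view of an assembled AIJCUSPARSE matrix and insert values from a CUDA kernel.
   The kernel name, launch configuration, and variable names below are placeholders, and
   MatSetValuesDevice() is assumed to be available from petscaijdevice.h.

     #include <petscaijdevice.h>

     __global__ void AssembleDiagonal(PetscSplitCSRDataStructure d_mat, PetscInt rstart, PetscInt rend)
     {
       PetscInt    row = rstart + blockIdx.x*blockDim.x + threadIdx.x;
       PetscScalar one = 1.0;
       if (row < rend) (void)MatSetValuesDevice(d_mat,1,&row,1,&row,&one,ADD_VALUES);  // add 1.0 to the diagonal entry of this row
     }

     // host code: A is an assembled AIJCUSPARSE matrix with MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) set
     PetscSplitCSRDataStructure d_mat;
     PetscInt                   rstart,rend;
     ierr = MatGetOwnershipRange(A,&rstart,&rend);CHKERRQ(ierr);
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);
     AssembleDiagonal<<<(rend-rstart+255)/256,256>>>(d_mat,rstart,rend);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/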