#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Thrust functor: assign the first tuple element to the second.
   Used below to scatter permuted COO values into the work array cusp->coo_pw. */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Set values of an MPIAIJCUSPARSE matrix from a COO value array v[], using the
   "basic" layout computed by MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic():
   entries are split into a diagonal-block part (first coo_nd values) and an
   off-diagonal part (next coo_no values).  If a permutation cusp->coo_p was
   recorded at preallocation time, v[] is first permuted into cusp->coo_pw on
   the device; otherwise v[] is forwarded to the two sequential blocks as-is.
   v may be a host or device pointer (isCudaMem() distinguishes), or NULL. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total number of COO entries on this rank */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) { /* a device-side permutation of the values is needed */
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else { /* host values: stage them through a temporary device array */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* zip (permuted source, destination) so VecCUDAEquals performs coo_pw[i] = v[coo_p[i]] */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    /* diagonal block gets the first coo_nd permuted values, off-diagonal block the rest */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else { /* values are already in partitioned order (or v is NULL) */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  /* mark the parallel matrix assembled with its up-to-date copy on the GPU */
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

/* Predicate on (row,col,...) tuples: true when the column lies INSIDE [cstart,cend),
   i.e. the entry belongs to the diagonal block.  Used with thrust::partition. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a column index: true when it lies OUTSIDE [cstart,cend). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index down by the local range start, producing a local index. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* "Basic" COO preallocation: assumes every (coo_i,coo_j) entry is locally owned.
   Partitions the entries on the device into diagonal-block and off-diagonal-block
   groups (recording the permutation in cusp->coo_p when one is needed), converts
   indices from global to local numbering, builds the compacted off-diagonal
   column map b->garray, and recreates the two sequential CUSPARSE blocks with
   their own COO preallocation.  Destroys and rebuilds b->A/b->B, so any previous
   assembly of B is discarded. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;          /* number of distinct off-diagonal columns */
  THRUSTINTARRAY         d_i(n);            /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  /* tear down any previous preallocation before rebuilding from scratch */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  /* locate the first off-diagonal entry; if entries are not already grouped
     (some diagonal entry follows an off-diagonal one), partition them and
     remember the permutation in coo_p so values can be reordered later */
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); /* # diagonal-block entries */
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());   /* # off-diagonal entries  */

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  /* garray maps compacted local column ids of the offdiag block back to global ids */
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* compact jj[] in place: translate each global off-diag column to its index in garray */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  /* recreate the sequential diagonal (b->A) and off-diagonal (b->B) blocks */
  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU; /* garray and friends were just built on the host */
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Dispatching COO preallocation: uses the fast "basic" path when every rank's
   indices are non-negative and locally owned (checked only when coo_i[] is a
   host array; device arrays are assumed non-basic), otherwise falls back to the
   extended CPU MPIAIJ COO analysis and mirrors its maps onto the device.
   All ranks collectively agree on which path is taken. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  PetscErrorCode     ierr;
  Mat                newmat;
  Mat_MPIAIJ         *mpiaij;
  Mat_MPIAIJCUSPARSE *mpidev;
  PetscInt           coo_basic = 1;                     /* optimistic: assume the basic path */
  PetscMemType       mtype     = PETSC_MEMTYPE_DEVICE;
  PetscInt           rstart,rend;
  cudaError_t        cerr;

  PetscFunctionBegin;
  if (coo_i) {
    ierr = PetscLayoutGetRange(mat->rmap,&rstart,&rend);CHKERRQ(ierr);
    ierr = PetscGetMemType(coo_i,&mtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(mtype)) {
      for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */
        if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = 0; break;}
      }
    }
  }
  /* All ranks must agree on the value of coo_basic */
  ierr = MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_INT,MPI_LAND,PetscObjectComm((PetscObject)mat));CHKERRMPI(ierr);

  if (coo_basic) {
    ierr = MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
  } else {
    /* build the extended COO maps on the host via a throwaway MPIAIJ matrix,
       then convert it in place to CUSPARSE and merge it into mat */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
    ierr = MatCreate(PetscObjectComm((PetscObject)mat),&newmat);CHKERRQ(ierr);
    ierr = MatSetSizes(newmat,mat->rmap->n,mat->cmap->n,mat->rmap->N,mat->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(newmat,MATMPIAIJ);CHKERRQ(ierr);
    ierr = MatSetOption(newmat,MAT_IGNORE_OFF_PROC_ENTRIES,mpiaij->donotstash);CHKERRQ(ierr); /* Inherit the two options that we respect from mat */
    ierr = MatSetOption(newmat,MAT_NO_OFF_PROC_ENTRIES,mat->nooffprocentries);CHKERRQ(ierr);
    ierr = MatSetPreallocationCOO_MPIAIJ(newmat,coo_n,coo_i,coo_j);CHKERRQ(ierr);
    ierr = MatConvert(newmat,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&newmat);CHKERRQ(ierr);
    ierr = MatHeaderMerge(mat,&newmat);CHKERRQ(ierr);
    ierr = MatZeroEntries(mat);CHKERRQ(ierr); /* Zero matrix on device */
    mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); /* mat->data was changed in MatHeaderReplace() */
    mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
    mpidev->use_extended_coo = PETSC_TRUE;

    /* mirror the host-side COO maps (imap/jmap/perm for local "1" and received
       "2" entries of both blocks, plus the send permutation and SF buffers) on
       the device; naming follows MatSetPreallocationCOO_MPIAIJ */
    cerr = cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount));CHKERRCUDA(cerr);

    cerr = cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar));CHKERRCUDA(cerr);
    cerr = cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar));CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    cerr = cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

    cerr = cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
  }
  PetscFunctionReturn(0);
}

/* Grid-stride kernel: buf[i] = kv[perm[i]] for i in [0,nnz); gathers the COO
   values destined for remote ranks into a contiguous send buffer. */
__global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]];
}

/* Grid-stride kernel: for each of the nnz distinct nonzeros, accumulate its
   repeated COO contributions kv[perm[jmap[i]..jmap[i+1])] into a[imap[i]]. */
__global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[])
{
  PetscCount       i = blockIdx.x*blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i<nnz; i+= grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];}
}

/* Set values from COO input on the device.  With extended COO maps, packs and
   ships remote entries through the PetscSF while overlapping the communication
   with the accumulation of local entries; otherwise defers to the basic path.
   NOTE(review): kernel launch errors are not checked here, and a zero count
   (e.g. sendlen==0) yields a 0-block launch <<<0,256>>> — presumably benign in
   practice but worth confirming with cudaGetLastError(). */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data);
  Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr);
  Mat                A = mpiaij->A,B = mpiaij->B;
  PetscCount         Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2;
  PetscScalar        *Aa,*Ba;
  PetscScalar        *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d;
  const PetscScalar  *v1 = v;                 /* device view of v[] (may be a staged copy) */
  const PetscCount   *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d;
  const PetscCount   *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d;
  const PetscCount   *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d;
  const PetscCount   *Cperm1 = mpidev->Cperm1_d;
  PetscMemType       memtype;

  PetscFunctionBegin;
  if (mpidev->use_extended_coo) {
    ierr = PetscGetMemType(v,&memtype);CHKERRQ(ierr);
    if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
      cerr = cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    }

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa);CHKERRQ(ierr); /* write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba);CHKERRQ(ierr);
      /* zero first so the add-kernels below effectively insert */
      cerr = cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
      cerr = cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar));CHKERRCUDA(cerr);
    } else {
      ierr = MatSeqAIJCUSPARSEGetArray(A,&Aa);CHKERRQ(ierr); /* read & write matrix values */
      ierr = MatSeqAIJCUSPARSEGetArray(B,&Ba);CHKERRQ(ierr);
    }

    /* Pack entries to be sent to remote */
    MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend);

    /* Send remote entries to their owner and overlap the communication with local computation */
    ierr = PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE);CHKERRQ(ierr);
    /* Add local entries to A and B */
    MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa);
    MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba);
    ierr = PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE);CHKERRQ(ierr);

    /* Add received remote entries to A and B */
    MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa);
    MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba);

    if (imode == INSERT_VALUES) {
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba);CHKERRQ(ierr);
    } else {
      ierr = MatSeqAIJCUSPARSERestoreArray(A,&Aa);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSERestoreArray(B,&Ba);CHKERRQ(ierr);
    }
    if (PetscMemTypeHost(memtype)) {cerr = cudaFree((void*)v1);CHKERRCUDA(cerr);} /* free the staged copy */
  } else {
    ierr = MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Merge the diagonal and off-diagonal blocks into a single sequential matrix
   A_loc.  If glob is non-NULL, also return the global column index of each
   local column of the merged matrix (owned columns first, then ghost columns
   from cmap). */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;   /* owned columns are contiguous */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];   /* ghost columns from the off-diag map */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Classic (non-COO) preallocation for MPIAIJCUSPARSE: validates the per-row
   nonzero counts in debug mode, resets the column map/scatter state, (re)creates
   the sequential CUSPARSE blocks, and forwards the preallocation plus format and
   cusparse-handle settings to them. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* on a single rank there is no off-diagonal part, so give it zero columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
/*
  MatMult_MPIAIJCUSPARSE - y = A*x for a parallel AIJCUSPARSE matrix.

  Overlaps communication with computation: the scatter of the off-process part of
  xx into a->lvec is started, the local diagonal-block product is computed while
  the messages are in flight, and the off-diagonal contribution is added once the
  scatter completes.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatZeroEntries_MPIAIJCUSPARSE - Zeros all entries by delegating to the diagonal
  (l->A) and off-diagonal (l->B) sequential blocks.
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatMultAdd_MPIAIJCUSPARSE - z = y + A*x, with the same communication/computation
  overlap as MatMult_MPIAIJCUSPARSE.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatMultTranspose_MPIAIJCUSPARSE - y = A^T*x. The off-diagonal transpose product
  is computed into a->lvec first, then scattered (reversed, additive) into yy on
  top of the local diagonal-block transpose product.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatCUSPARSESetFormat_MPIAIJCUSPARSE - Records the requested cuSPARSE storage
  format for the diagonal and/or off-diagonal block, as selected by op.

  Fix: the error message previously listed MAT_CUSPARSE_MULT_DIAG twice and named
  a nonexistent MAT_CUSPARSE_MULT_ALL; it now matches the enum values actually
  handled by the switch.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*
  MatSetFromOptions_MPIAIJCUSPARSE - Processes -mat_cusparse_* options selecting
  the GPU storage format for the diagonal block, the off-diagonal block, or both.
  Only meaningful for unfactored matrices.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatAssemblyEnd_MPIAIJCUSPARSE - Completes assembly via the MPIAIJ implementation,
  keeps the local work vector on the GPU, and, if the nonzero structure changed,
  destroys the cached stripped-down device matrix (it would otherwise reference
  stale index arrays).
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz    = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* copy the device struct back to the host so its interior device pointers can be freed */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) { /* index arrays were allocated by us (64-bit index build) */
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}
/*
  MatDestroy_MPIAIJCUSPARSE - Releases all GPU-side resources (cached device
  matrix, cuSPARSE handle, extended-COO scratch arrays or thrust COO buffers),
  removes composed functions, then delegates to MatDestroy_MPIAIJ.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  cudaError_t        cerr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the device struct back to the host so its interior device pointers can be freed */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) { /* index arrays were allocated by us (64-bit index build) */
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    /* Extended COO */
    if (cusparseStruct->use_extended_coo) {
      cerr = cudaFree(cusparseStruct->Aimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Ajmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Aperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bimap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bjmap2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Bperm2_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->Cperm1_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->sendbuf_d);CHKERRCUDA(cerr);
      cerr = cudaFree(cusparseStruct->recvbuf_d);CHKERRCUDA(cerr);
    } else {
      delete cusparseStruct->coo_p;
      delete cusparseStruct->coo_pw;
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
  MatConvert_MPIAIJ_MPIAIJCUSPARSE - Converts an MPIAIJ matrix (in place, by
  duplication, or by copy into an existing matrix, per reuse) to MPIAIJCUSPARSE:
  switches the sequential blocks and local vector to their CUDA types, creates
  the cuSPARSE handle when needed, and installs the GPU ops and composed
  functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode   ierr;
  Mat_MPIAIJ       *a;
  cusparseStatus_t stat;
  Mat              A;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE;
    stat = cusparseCreate(&(cusp->handle));CHKERRCUSPARSE(stat);
    a->spptr = cusp;
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

/*
  MatCreate_MPIAIJCUSPARSE - Type constructor: builds an MPIAIJ matrix and
  converts it in place to MPIAIJCUSPARSE.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameter nz (or the array nnz). By setting these parameters accurately,
   performance during matrix assembly can be increased by more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator, set to PETSC_COMM_SELF
.  m - number of rows
.  n - number of columns
.  nz - number of nonzeros per row (same for all rows)
-  nnz - array containing the number of nonzeros in the various rows
         (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If nnz is given then nz is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.  See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation.  For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
8012692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 8029ae82921SPaul Mullowney 8039ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 8049ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 8059ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 8069ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 8079ae82921SPaul Mullowney the above preallocation routines for simplicity. 8089ae82921SPaul Mullowney 8099ae82921SPaul Mullowney Options Database Keys: 810e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 8118468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 8128468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 8138468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 
8149ae82921SPaul Mullowney 8159ae82921SPaul Mullowney Level: beginner 8169ae82921SPaul Mullowney 8176bb69460SJunchao Zhang .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 8186bb69460SJunchao Zhang M*/ 8196bb69460SJunchao Zhang 8206bb69460SJunchao Zhang /*MC 8216bb69460SJunchao Zhang MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 8226bb69460SJunchao Zhang 8236bb69460SJunchao Zhang Level: beginner 8246bb69460SJunchao Zhang 8256bb69460SJunchao Zhang .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 8269ae82921SPaul Mullowney M*/ 8273fa6b06aSMark Adams 828042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 829042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 8303fa6b06aSMark Adams { 831042217e8SBarry Smith PetscSplitCSRDataStructure d_mat; 832042217e8SBarry Smith PetscMPIInt size; 8333fa6b06aSMark Adams PetscErrorCode ierr; 834042217e8SBarry Smith int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 835042217e8SBarry Smith PetscScalar *aa = NULL,*ba = NULL; 83699551766SMark Adams Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 837042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 838042217e8SBarry Smith CsrMatrix *matrixA = NULL,*matrixB = NULL; 8393fa6b06aSMark Adams 8403fa6b06aSMark Adams PetscFunctionBegin; 8412c71b3e2SJacob Faibussowitsch PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 842042217e8SBarry Smith if (A->factortype != MAT_FACTOR_NONE) { 8433fa6b06aSMark Adams *B = NULL; 8443fa6b06aSMark Adams PetscFunctionReturn(0); 8453fa6b06aSMark Adams } 846042217e8SBarry Smith ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr); 84799551766SMark Adams // get jaca 8483fa6b06aSMark Adams if (size == 1) { 849042217e8SBarry 
Smith PetscBool isseqaij; 850042217e8SBarry Smith 851042217e8SBarry Smith ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr); 852042217e8SBarry Smith if (isseqaij) { 8533fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)A->data; 8542c71b3e2SJacob Faibussowitsch PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 855042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr; 856042217e8SBarry Smith d_mat = cusparsestructA->deviceMat; 857042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr); 8583fa6b06aSMark Adams } else { 8593fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 8602c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 861042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 8623fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)aij->A->data; 863042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 864042217e8SBarry Smith d_mat = spptr->deviceMat; 865042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 8663fa6b06aSMark Adams } 867042217e8SBarry Smith if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 868042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 8692c71b3e2SJacob Faibussowitsch PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 870042217e8SBarry Smith matrixA = (CsrMatrix*)matstruct->mat; 871042217e8SBarry Smith bi = NULL; 872042217e8SBarry Smith bj = NULL; 873042217e8SBarry Smith ba = NULL; 874042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 875042217e8SBarry Smith } else { 876042217e8SBarry Smith Mat_MPIAIJ *aij = 
(Mat_MPIAIJ*)A->data; 8772c71b3e2SJacob Faibussowitsch PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 878042217e8SBarry Smith jaca = (Mat_SeqAIJ*)aij->A->data; 87999551766SMark Adams jacb = (Mat_SeqAIJ*)aij->B->data; 880042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 8813fa6b06aSMark Adams 8822c71b3e2SJacob Faibussowitsch PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 883042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 884042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 8852c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 886042217e8SBarry Smith if (cusparsestructB->format==MAT_CUSPARSE_CSR) { 887042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr); 888042217e8SBarry Smith ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr); 889042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 890042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 8912c71b3e2SJacob Faibussowitsch PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 8922c71b3e2SJacob Faibussowitsch PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 893042217e8SBarry Smith matrixA = (CsrMatrix*)matstructA->mat; 894042217e8SBarry Smith matrixB = (CsrMatrix*)matstructB->mat; 8953fa6b06aSMark Adams 
if (jacb->compressedrow.use) { 896042217e8SBarry Smith if (!cusparsestructB->rowoffsets_gpu) { 897042217e8SBarry Smith cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 898042217e8SBarry Smith cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1); 8993fa6b06aSMark Adams } 900042217e8SBarry Smith bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 901042217e8SBarry Smith } else { 902042217e8SBarry Smith bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 903042217e8SBarry Smith } 904042217e8SBarry Smith bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 905042217e8SBarry Smith ba = thrust::raw_pointer_cast(matrixB->values->data()); 906042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 907042217e8SBarry Smith d_mat = spptr->deviceMat; 908042217e8SBarry Smith } 9093fa6b06aSMark Adams if (jaca->compressedrow.use) { 910042217e8SBarry Smith if (!cusparsestructA->rowoffsets_gpu) { 911042217e8SBarry Smith cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 912042217e8SBarry Smith cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1); 9133fa6b06aSMark Adams } 914042217e8SBarry Smith ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 9153fa6b06aSMark Adams } else { 916042217e8SBarry Smith ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 9173fa6b06aSMark Adams } 918042217e8SBarry Smith aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 919042217e8SBarry Smith aa = thrust::raw_pointer_cast(matrixA->values->data()); 920042217e8SBarry Smith 921042217e8SBarry Smith if (!d_mat) { 922042217e8SBarry Smith cudaError_t cerr; 923042217e8SBarry Smith PetscSplitCSRDataStructure h_mat; 924042217e8SBarry Smith 925042217e8SBarry Smith // create and populate strucy on host and copy on device 926042217e8SBarry Smith ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 
927042217e8SBarry Smith ierr = PetscNew(&h_mat);CHKERRQ(ierr); 928042217e8SBarry Smith cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr); 929042217e8SBarry Smith if (size > 1) { /* need the colmap array */ 930042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 93199551766SMark Adams PetscInt *colmap; 932042217e8SBarry Smith PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N; 933042217e8SBarry Smith 9342c71b3e2SJacob Faibussowitsch PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 935042217e8SBarry Smith 936042217e8SBarry Smith ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr); 937042217e8SBarry Smith for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1); 938365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 939365b711fSMark Adams { // have to make a long version of these 94099551766SMark Adams int *h_bi32, *h_bj32; 94199551766SMark Adams PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 942365b711fSMark Adams ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr); 94399551766SMark Adams cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 94499551766SMark Adams for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i]; 94599551766SMark Adams cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 94699551766SMark Adams for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i]; 94799551766SMark Adams 94899551766SMark Adams cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr); 94999551766SMark Adams cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 95099551766SMark Adams cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr); 95199551766SMark Adams cerr = cudaMemcpy(d_bj64, 
h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 95299551766SMark Adams 95399551766SMark Adams h_mat->offdiag.i = d_bi64; 95499551766SMark Adams h_mat->offdiag.j = d_bj64; 95599551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 95699551766SMark Adams 957365b711fSMark Adams ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr); 958365b711fSMark Adams } 959365b711fSMark Adams #else 96099551766SMark Adams h_mat->offdiag.i = (PetscInt*)bi; 96199551766SMark Adams h_mat->offdiag.j = (PetscInt*)bj; 96299551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 963365b711fSMark Adams #endif 964042217e8SBarry Smith h_mat->offdiag.a = ba; 965042217e8SBarry Smith h_mat->offdiag.n = A->rmap->n; 966042217e8SBarry Smith 96799551766SMark Adams cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr); 96899551766SMark Adams cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 969042217e8SBarry Smith ierr = PetscFree(colmap);CHKERRQ(ierr); 970042217e8SBarry Smith } 971042217e8SBarry Smith h_mat->rstart = A->rmap->rstart; 972042217e8SBarry Smith h_mat->rend = A->rmap->rend; 973042217e8SBarry Smith h_mat->cstart = A->cmap->rstart; 974042217e8SBarry Smith h_mat->cend = A->cmap->rend; 97549b994a9SMark Adams h_mat->M = A->cmap->N; 976365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 977365b711fSMark Adams { 9782c71b3e2SJacob Faibussowitsch PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt)); 97999551766SMark Adams int *h_ai32, *h_aj32; 98099551766SMark Adams PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 981365b711fSMark Adams ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr); 98299551766SMark Adams cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 98399551766SMark Adams for (int 
i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i]; 98499551766SMark Adams cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr); 98599551766SMark Adams for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i]; 98699551766SMark Adams 98799551766SMark Adams cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr); 98899551766SMark Adams cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 98999551766SMark Adams cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr); 99099551766SMark Adams cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 99199551766SMark Adams 99299551766SMark Adams h_mat->diag.i = d_ai64; 99399551766SMark Adams h_mat->diag.j = d_aj64; 99499551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 99599551766SMark Adams 996365b711fSMark Adams ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr); 997365b711fSMark Adams } 998365b711fSMark Adams #else 99999551766SMark Adams h_mat->diag.i = (PetscInt*)ai; 100099551766SMark Adams h_mat->diag.j = (PetscInt*)aj; 100199551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 1002365b711fSMark Adams #endif 1003042217e8SBarry Smith h_mat->diag.a = aa; 1004042217e8SBarry Smith h_mat->diag.n = A->rmap->n; 1005042217e8SBarry Smith h_mat->rank = PetscGlobalRank; 1006042217e8SBarry Smith // copy pointers and metadata to device 1007042217e8SBarry Smith cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr); 1008042217e8SBarry Smith ierr = PetscFree(h_mat);CHKERRQ(ierr); 1009042217e8SBarry Smith } else { 1010042217e8SBarry Smith ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr); 1011042217e8SBarry Smith } 1012042217e8SBarry Smith *B = d_mat; 1013042217e8SBarry Smith A->offloadmask = PETSC_OFFLOAD_GPU; 10143fa6b06aSMark Adams PetscFunctionReturn(0); 10153fa6b06aSMark Adams } 1016