1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK 299acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1 30fd2b57fSKarl Rupp 43d13b8fdSMatthew G. Knepley #include <petscconf.h> 59ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/ 657d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h> 73d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h> 87e8381f9SStefano Zampini #include <thrust/advance.h> 9a2cee5feSJed Brown #include <thrust/partition.h> 10a2cee5feSJed Brown #include <thrust/sort.h> 11a2cee5feSJed Brown #include <thrust/unique.h> 124eb5378fSStefano Zampini #include <petscsf.h> 137e8381f9SStefano Zampini 147e8381f9SStefano Zampini struct VecCUDAEquals 157e8381f9SStefano Zampini { 167e8381f9SStefano Zampini template <typename Tuple> 177e8381f9SStefano Zampini __host__ __device__ 187e8381f9SStefano Zampini void operator()(Tuple t) 197e8381f9SStefano Zampini { 207e8381f9SStefano Zampini thrust::get<1>(t) = thrust::get<0>(t); 217e8381f9SStefano Zampini } 227e8381f9SStefano Zampini }; 237e8381f9SStefano Zampini 24cbc6b225SStefano Zampini static PetscErrorCode MatResetPreallocationCOO_MPIAIJCUSPARSE(Mat mat) 25cbc6b225SStefano Zampini { 26cbc6b225SStefano Zampini Mat_MPIAIJ *aij = (Mat_MPIAIJ*)mat->data; 27cbc6b225SStefano Zampini Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 28cbc6b225SStefano Zampini 29cbc6b225SStefano Zampini PetscFunctionBegin; 30cbc6b225SStefano Zampini if (!cusparseStruct) PetscFunctionReturn(0); 31cbc6b225SStefano Zampini if (cusparseStruct->use_extended_coo) { 325f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Aimap1_d)); 335f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Ajmap1_d)); 345f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Aperm1_d)); 355f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Bimap1_d)); 365f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Bjmap1_d)); 375f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Bperm1_d)); 385f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Aimap2_d)); 395f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Ajmap2_d)); 405f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Aperm2_d)); 415f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Bimap2_d)); 425f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Bjmap2_d)); 435f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Bperm2_d)); 445f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->Cperm1_d)); 455f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->sendbuf_d)); 465f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(cusparseStruct->recvbuf_d)); 47cbc6b225SStefano Zampini } 48cbc6b225SStefano Zampini cusparseStruct->use_extended_coo = PETSC_FALSE; 49cbc6b225SStefano Zampini delete cusparseStruct->coo_p; 50cbc6b225SStefano Zampini delete cusparseStruct->coo_pw; 51cbc6b225SStefano Zampini cusparseStruct->coo_p = NULL; 52cbc6b225SStefano Zampini cusparseStruct->coo_pw = NULL; 53cbc6b225SStefano Zampini PetscFunctionReturn(0); 54cbc6b225SStefano Zampini } 55cbc6b225SStefano Zampini 56219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE_Basic(Mat A, const PetscScalar v[], InsertMode imode) 577e8381f9SStefano Zampini { 587e8381f9SStefano Zampini Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 597e8381f9SStefano Zampini Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr; 607e8381f9SStefano Zampini PetscInt n = cusp->coo_nd + cusp->coo_no; 617e8381f9SStefano Zampini 627e8381f9SStefano Zampini PetscFunctionBegin; 63e61fc153SStefano Zampini if (cusp->coo_p && v) { 6408391a17SStefano Zampini thrust::device_ptr<const PetscScalar> d_v; 6508391a17SStefano Zampini THRUSTARRAY *w = NULL; 6608391a17SStefano Zampini 6708391a17SStefano Zampini if (isCudaMem(v)) { 6808391a17SStefano Zampini d_v = thrust::device_pointer_cast(v); 6908391a17SStefano Zampini } else { 70e61fc153SStefano Zampini w = new THRUSTARRAY(n); 71e61fc153SStefano Zampini w->assign(v,v+n); 725f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(n*sizeof(PetscScalar))); 7308391a17SStefano Zampini d_v = w->data(); 7408391a17SStefano Zampini } 7508391a17SStefano Zampini 7608391a17SStefano Zampini auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()), 777e8381f9SStefano Zampini cusp->coo_pw->begin())); 7808391a17SStefano Zampini auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()), 797e8381f9SStefano Zampini cusp->coo_pw->end())); 805f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 817e8381f9SStefano Zampini thrust::for_each(zibit,zieit,VecCUDAEquals()); 825f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 83e61fc153SStefano Zampini delete w; 845f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,cusp->coo_pw->data().get(),imode)); 855f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode)); 867e8381f9SStefano Zampini } else { 875f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->A,v,imode)); 885f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetValuesCOO_SeqAIJCUSPARSE_Basic(a->B,v ? v+cusp->coo_nd : NULL,imode)); 897e8381f9SStefano Zampini } 907e8381f9SStefano Zampini PetscFunctionReturn(0); 917e8381f9SStefano Zampini } 927e8381f9SStefano Zampini 937e8381f9SStefano Zampini template <typename Tuple> 947e8381f9SStefano Zampini struct IsNotOffDiagT 957e8381f9SStefano Zampini { 967e8381f9SStefano Zampini PetscInt _cstart,_cend; 977e8381f9SStefano Zampini 987e8381f9SStefano Zampini IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 997e8381f9SStefano Zampini __host__ __device__ 1007e8381f9SStefano Zampini inline bool operator()(Tuple t) 1017e8381f9SStefano Zampini { 1027e8381f9SStefano Zampini return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend); 1037e8381f9SStefano Zampini } 1047e8381f9SStefano Zampini }; 1057e8381f9SStefano Zampini 1067e8381f9SStefano Zampini struct IsOffDiag 1077e8381f9SStefano Zampini { 1087e8381f9SStefano Zampini PetscInt _cstart,_cend; 1097e8381f9SStefano Zampini 1107e8381f9SStefano Zampini IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {} 1117e8381f9SStefano Zampini __host__ __device__ 1127e8381f9SStefano Zampini inline bool operator() (const PetscInt &c) 1137e8381f9SStefano Zampini { 1147e8381f9SStefano Zampini return c < _cstart || c >= _cend; 1157e8381f9SStefano Zampini } 1167e8381f9SStefano Zampini }; 1177e8381f9SStefano Zampini 1187e8381f9SStefano Zampini struct GlobToLoc 1197e8381f9SStefano Zampini { 1207e8381f9SStefano Zampini PetscInt _start; 1217e8381f9SStefano Zampini 1227e8381f9SStefano Zampini GlobToLoc(PetscInt start) : _start(start) {} 1237e8381f9SStefano Zampini __host__ __device__ 1247e8381f9SStefano Zampini inline PetscInt operator() (const PetscInt &c) 1257e8381f9SStefano Zampini { 1267e8381f9SStefano Zampini return c - _start; 1277e8381f9SStefano Zampini } 1287e8381f9SStefano Zampini }; 1297e8381f9SStefano Zampini 130219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[]) 1317e8381f9SStefano Zampini { 1327e8381f9SStefano Zampini Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 1337e8381f9SStefano Zampini Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr; 13482a78a4eSJed Brown PetscInt N,*jj; 1357e8381f9SStefano Zampini size_t noff = 0; 136ddea5d60SJunchao Zhang THRUSTINTARRAY d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */ 1377e8381f9SStefano Zampini THRUSTINTARRAY d_j(n); 1387e8381f9SStefano Zampini ISLocalToGlobalMapping l2g; 1397e8381f9SStefano Zampini 1407e8381f9SStefano Zampini PetscFunctionBegin; 1415f80ce2aSJacob Faibussowitsch if (b->A) CHKERRQ(MatCUSPARSEClearHandle(b->A)); 1425f80ce2aSJacob Faibussowitsch if (b->B) CHKERRQ(MatCUSPARSEClearHandle(b->B)); 1435f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy(&b->A)); 1445f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy(&b->B)); 1457e8381f9SStefano Zampini 1465f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogCpuToGpu(2.*n*sizeof(PetscInt))); 1477e8381f9SStefano Zampini d_i.assign(coo_i,coo_i+n); 1487e8381f9SStefano Zampini d_j.assign(coo_j,coo_j+n); 1495f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 1507e8381f9SStefano Zampini auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 1517e8381f9SStefano Zampini auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend)); 1527e8381f9SStefano Zampini if (firstoffd != d_j.end() && firstdiag != d_j.end()) { 1537e8381f9SStefano Zampini cusp->coo_p = new THRUSTINTARRAY(n); 1547e8381f9SStefano Zampini cusp->coo_pw = new THRUSTARRAY(n); 1557e8381f9SStefano Zampini thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0); 1567e8381f9SStefano Zampini auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin())); 1577e8381f9SStefano Zampini auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end())); 1587e8381f9SStefano Zampini auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend)); 1597e8381f9SStefano Zampini firstoffd = mzipp.get_iterator_tuple().get<1>(); 1607e8381f9SStefano Zampini } 1617e8381f9SStefano Zampini cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd); 1627e8381f9SStefano Zampini cusp->coo_no = thrust::distance(firstoffd,d_j.end()); 1637e8381f9SStefano Zampini 1647e8381f9SStefano Zampini /* from global to local */ 1657e8381f9SStefano Zampini thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart)); 1667e8381f9SStefano Zampini thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart)); 1675f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 1687e8381f9SStefano Zampini 1697e8381f9SStefano Zampini /* copy offdiag column indices to map on the CPU */ 1705f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(cusp->coo_no,&jj)); /* jj[] will store compacted col ids of the offdiag part */ 1715f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 1727e8381f9SStefano Zampini auto o_j = d_j.begin(); 1735f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeBegin()); 174ddea5d60SJunchao Zhang thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */ 1757e8381f9SStefano Zampini thrust::sort(thrust::device,o_j,d_j.end()); 176ddea5d60SJunchao Zhang auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */ 1775f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuTimeEnd()); 1787e8381f9SStefano Zampini noff = thrust::distance(o_j,wit); 179cbc6b225SStefano Zampini /* allocate the garray, the columns of B will not be mapped in the multiply setup */ 1805f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(noff,&b->garray)); 1815f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost)); 1825f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt))); 1835f80ce2aSJacob Faibussowitsch CHKERRQ(ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g)); 1845f80ce2aSJacob Faibussowitsch CHKERRQ(ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH)); 1855f80ce2aSJacob Faibussowitsch CHKERRQ(ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj)); 1862c71b3e2SJacob Faibussowitsch PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no); 1875f80ce2aSJacob Faibussowitsch CHKERRQ(ISLocalToGlobalMappingDestroy(&l2g)); 1887e8381f9SStefano Zampini 1895f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->A)); 1905f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n)); 1915f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(b->A,MATSEQAIJCUSPARSE)); 1925f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A)); 1935f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->B)); 1945f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff)); 1955f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(b->B,MATSEQAIJCUSPARSE)); 1965f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B)); 1977e8381f9SStefano Zampini 1987e8381f9SStefano Zampini /* GPU memory, cusparse specific call handles it internally */ 1995f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get())); 2005f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetPreallocationCOO_SeqAIJCUSPARSE_Basic(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj)); 2015f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(jj)); 2027e8381f9SStefano Zampini 2035f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat)); 2045f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat)); 2055f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetHandle(b->A,cusp->handle)); 2065f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetHandle(b->B,cusp->handle)); 207a0e72f99SJunchao Zhang /* 2085f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetStream(b->A,cusp->stream)); 2095f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetStream(b->B,cusp->stream)); 210a0e72f99SJunchao Zhang */ 2117e8381f9SStefano Zampini 2125f80ce2aSJacob Faibussowitsch CHKERRQ(MatBindToCPU(b->A,B->boundtocpu)); 2135f80ce2aSJacob Faibussowitsch CHKERRQ(MatBindToCPU(b->B,B->boundtocpu)); 2145f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetUpMultiply_MPIAIJ(B)); 2157e8381f9SStefano Zampini PetscFunctionReturn(0); 2167e8381f9SStefano Zampini } 2179ae82921SPaul Mullowney 218219fbbafSJunchao Zhang static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, const PetscInt coo_i[], const PetscInt coo_j[]) 219219fbbafSJunchao Zhang { 220cbc6b225SStefano Zampini Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)mat->data; 221219fbbafSJunchao Zhang Mat_MPIAIJCUSPARSE *mpidev; 222cbc6b225SStefano Zampini PetscBool coo_basic = PETSC_TRUE; 223219fbbafSJunchao Zhang PetscMemType mtype = PETSC_MEMTYPE_DEVICE; 224219fbbafSJunchao Zhang PetscInt rstart,rend; 225219fbbafSJunchao Zhang 226219fbbafSJunchao Zhang PetscFunctionBegin; 2275f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(mpiaij->garray)); 2285f80ce2aSJacob Faibussowitsch CHKERRQ(VecDestroy(&mpiaij->lvec)); 229cbc6b225SStefano Zampini #if defined(PETSC_USE_CTABLE) 2305f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableDestroy(&mpiaij->colmap)); 231cbc6b225SStefano Zampini #else 2325f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(mpiaij->colmap)); 233cbc6b225SStefano Zampini #endif 2345f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterDestroy(&mpiaij->Mvctx)); 235cbc6b225SStefano Zampini mat->assembled = PETSC_FALSE; 236cbc6b225SStefano Zampini mat->was_assembled = PETSC_FALSE; 2375f80ce2aSJacob Faibussowitsch CHKERRQ(MatResetPreallocationCOO_MPIAIJ(mat)); 2385f80ce2aSJacob Faibussowitsch CHKERRQ(MatResetPreallocationCOO_MPIAIJCUSPARSE(mat)); 239219fbbafSJunchao Zhang if (coo_i) { 2405f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLayoutGetRange(mat->rmap,&rstart,&rend)); 2415f80ce2aSJacob Faibussowitsch CHKERRQ(PetscGetMemType(coo_i,&mtype)); 242219fbbafSJunchao Zhang if (PetscMemTypeHost(mtype)) { 243219fbbafSJunchao Zhang for (PetscCount k=0; k<coo_n; k++) { /* Are there negative indices or remote entries? */ 244cbc6b225SStefano Zampini if (coo_i[k]<0 || coo_i[k]<rstart || coo_i[k]>=rend || coo_j[k]<0) {coo_basic = PETSC_FALSE; break;} 245219fbbafSJunchao Zhang } 246219fbbafSJunchao Zhang } 247219fbbafSJunchao Zhang } 248219fbbafSJunchao Zhang /* All ranks must agree on the value of coo_basic */ 2495f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Allreduce(MPI_IN_PLACE,&coo_basic,1,MPIU_BOOL,MPI_LAND,PetscObjectComm((PetscObject)mat))); 250219fbbafSJunchao Zhang if (coo_basic) { 2515f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetPreallocationCOO_MPIAIJCUSPARSE_Basic(mat,coo_n,coo_i,coo_j)); 252219fbbafSJunchao Zhang } else { 2535f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetPreallocationCOO_MPIAIJ(mat,coo_n,coo_i,coo_j)); 254cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_CPU; 255cbc6b225SStefano Zampini /* creates the GPU memory */ 2565f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A)); 2575f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B)); 258cbc6b225SStefano Zampini mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr); 259219fbbafSJunchao Zhang mpidev->use_extended_coo = PETSC_TRUE; 260219fbbafSJunchao Zhang 2615f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Aimap1_d,mpiaij->Annz1*sizeof(PetscCount))); 2625f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Ajmap1_d,(mpiaij->Annz1+1)*sizeof(PetscCount))); 2635f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Aperm1_d,mpiaij->Atot1*sizeof(PetscCount))); 264219fbbafSJunchao Zhang 2655f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Bimap1_d,mpiaij->Bnnz1*sizeof(PetscCount))); 2665f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Bjmap1_d,(mpiaij->Bnnz1+1)*sizeof(PetscCount))); 2675f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Bperm1_d,mpiaij->Btot1*sizeof(PetscCount))); 268219fbbafSJunchao Zhang 2695f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Aimap2_d,mpiaij->Annz2*sizeof(PetscCount))); 2705f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Ajmap2_d,(mpiaij->Annz2+1)*sizeof(PetscCount))); 2715f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Aperm2_d,mpiaij->Atot2*sizeof(PetscCount))); 272219fbbafSJunchao Zhang 2735f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Bimap2_d,mpiaij->Bnnz2*sizeof(PetscCount))); 2745f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Bjmap2_d,(mpiaij->Bnnz2+1)*sizeof(PetscCount))); 2755f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Bperm2_d,mpiaij->Btot2*sizeof(PetscCount))); 276219fbbafSJunchao Zhang 2775f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->Cperm1_d,mpiaij->sendlen*sizeof(PetscCount))); 2785f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->sendbuf_d,mpiaij->sendlen*sizeof(PetscScalar))); 2795f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&mpidev->recvbuf_d,mpiaij->recvlen*sizeof(PetscScalar))); 280219fbbafSJunchao Zhang 2815f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Aimap1_d,mpiaij->Aimap1,mpiaij->Annz1*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2825f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Ajmap1_d,mpiaij->Ajmap1,(mpiaij->Annz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2835f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Aperm1_d,mpiaij->Aperm1,mpiaij->Atot1*sizeof(PetscCount),cudaMemcpyHostToDevice)); 284219fbbafSJunchao Zhang 2855f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Bimap1_d,mpiaij->Bimap1,mpiaij->Bnnz1*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2865f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Bjmap1_d,mpiaij->Bjmap1,(mpiaij->Bnnz1+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2875f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Bperm1_d,mpiaij->Bperm1,mpiaij->Btot1*sizeof(PetscCount),cudaMemcpyHostToDevice)); 288219fbbafSJunchao Zhang 2895f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Aimap2_d,mpiaij->Aimap2,mpiaij->Annz2*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2905f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Ajmap2_d,mpiaij->Ajmap2,(mpiaij->Annz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2915f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Aperm2_d,mpiaij->Aperm2,mpiaij->Atot2*sizeof(PetscCount),cudaMemcpyHostToDevice)); 292219fbbafSJunchao Zhang 2935f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Bimap2_d,mpiaij->Bimap2,mpiaij->Bnnz2*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2945f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Bjmap2_d,mpiaij->Bjmap2,(mpiaij->Bnnz2+1)*sizeof(PetscCount),cudaMemcpyHostToDevice)); 2955f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Bperm2_d,mpiaij->Bperm2,mpiaij->Btot2*sizeof(PetscCount),cudaMemcpyHostToDevice)); 296219fbbafSJunchao Zhang 2975f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(mpidev->Cperm1_d,mpiaij->Cperm1,mpiaij->sendlen*sizeof(PetscCount),cudaMemcpyHostToDevice)); 298219fbbafSJunchao Zhang } 299219fbbafSJunchao Zhang PetscFunctionReturn(0); 300219fbbafSJunchao Zhang } 301219fbbafSJunchao Zhang 302219fbbafSJunchao Zhang __global__ void MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[]) 303219fbbafSJunchao Zhang { 304219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 305219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 306219fbbafSJunchao Zhang for (; i<nnz; i+= grid_size) buf[i] = kv[perm[i]]; 307219fbbafSJunchao Zhang } 308219fbbafSJunchao Zhang 309219fbbafSJunchao Zhang __global__ void MatAddCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount imap[],const PetscCount jmap[],const PetscCount perm[],PetscScalar a[]) 310219fbbafSJunchao Zhang { 311219fbbafSJunchao Zhang PetscCount i = blockIdx.x*blockDim.x + threadIdx.x; 312219fbbafSJunchao Zhang const PetscCount grid_size = gridDim.x * blockDim.x; 313219fbbafSJunchao Zhang for (; i<nnz; i += grid_size) {for (PetscCount k=jmap[i]; k<jmap[i+1]; k++) a[imap[i]] += kv[perm[k]];} 314219fbbafSJunchao Zhang } 315219fbbafSJunchao Zhang 316219fbbafSJunchao Zhang static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode) 317219fbbafSJunchao Zhang { 318219fbbafSJunchao Zhang Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ*>(mat->data); 319219fbbafSJunchao Zhang Mat_MPIAIJCUSPARSE *mpidev = static_cast<Mat_MPIAIJCUSPARSE*>(mpiaij->spptr); 320219fbbafSJunchao Zhang Mat A = mpiaij->A,B = mpiaij->B; 321219fbbafSJunchao Zhang PetscCount Annz1 = mpiaij->Annz1,Annz2 = mpiaij->Annz2,Bnnz1 = mpiaij->Bnnz1,Bnnz2 = mpiaij->Bnnz2; 322cbc6b225SStefano Zampini PetscScalar *Aa,*Ba = NULL; 323219fbbafSJunchao Zhang PetscScalar *vsend = mpidev->sendbuf_d,*v2 = mpidev->recvbuf_d; 324219fbbafSJunchao Zhang const PetscScalar *v1 = v; 325219fbbafSJunchao Zhang const PetscCount *Ajmap1 = mpidev->Ajmap1_d,*Ajmap2 = mpidev->Ajmap2_d,*Aimap1 = mpidev->Aimap1_d,*Aimap2 = mpidev->Aimap2_d; 326219fbbafSJunchao Zhang const PetscCount *Bjmap1 = mpidev->Bjmap1_d,*Bjmap2 = mpidev->Bjmap2_d,*Bimap1 = mpidev->Bimap1_d,*Bimap2 = mpidev->Bimap2_d; 327219fbbafSJunchao Zhang const PetscCount *Aperm1 = mpidev->Aperm1_d,*Aperm2 = mpidev->Aperm2_d,*Bperm1 = mpidev->Bperm1_d,*Bperm2 = mpidev->Bperm2_d; 328219fbbafSJunchao Zhang const PetscCount *Cperm1 = mpidev->Cperm1_d; 329219fbbafSJunchao Zhang PetscMemType memtype; 330219fbbafSJunchao Zhang 331219fbbafSJunchao Zhang PetscFunctionBegin; 332219fbbafSJunchao Zhang if (mpidev->use_extended_coo) { 333cbc6b225SStefano Zampini PetscMPIInt size; 334cbc6b225SStefano Zampini 3355f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)mat),&size)); 3365f80ce2aSJacob Faibussowitsch CHKERRQ(PetscGetMemType(v,&memtype)); 337219fbbafSJunchao Zhang if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */ 3385f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&v1,mpiaij->coo_n*sizeof(PetscScalar))); 3395f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy((void*)v1,v,mpiaij->coo_n*sizeof(PetscScalar),cudaMemcpyHostToDevice)); 340219fbbafSJunchao Zhang } 341219fbbafSJunchao Zhang 342219fbbafSJunchao Zhang if (imode == INSERT_VALUES) { 3435f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(A,&Aa)); /* write matrix values */ 3445f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemset(Aa,0,((Mat_SeqAIJ*)A->data)->nz*sizeof(PetscScalar))); 345cbc6b225SStefano Zampini if (size > 1) { 3465f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEGetArrayWrite(B,&Ba)); 3475f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemset(Ba,0,((Mat_SeqAIJ*)B->data)->nz*sizeof(PetscScalar))); 348cbc6b225SStefano Zampini } 349219fbbafSJunchao Zhang } else { 3505f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEGetArray(A,&Aa)); /* read & write matrix values */ 3515f80ce2aSJacob Faibussowitsch if (size > 1) CHKERRQ(MatSeqAIJCUSPARSEGetArray(B,&Ba)); 352219fbbafSJunchao Zhang } 353219fbbafSJunchao Zhang 354219fbbafSJunchao Zhang /* Pack entries to be sent to remote */ 355cbc6b225SStefano Zampini if (mpiaij->sendlen) { 356219fbbafSJunchao Zhang MatPackCOOValues<<<(mpiaij->sendlen+255)/256,256>>>(v1,mpiaij->sendlen,Cperm1,vsend); 357cbc6b225SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); 358cbc6b225SStefano Zampini } 359219fbbafSJunchao Zhang 360219fbbafSJunchao Zhang /* Send remote entries to their owner and overlap the communication with local computation */ 3615f80ce2aSJacob Faibussowitsch if (size > 1) CHKERRQ(PetscSFReduceWithMemTypeBegin(mpiaij->coo_sf,MPIU_SCALAR,PETSC_MEMTYPE_CUDA,vsend,PETSC_MEMTYPE_CUDA,v2,MPI_REPLACE)); 362219fbbafSJunchao Zhang /* Add local entries to A and B */ 363cbc6b225SStefano Zampini if (Annz1) { 364219fbbafSJunchao Zhang MatAddCOOValues<<<(Annz1+255)/256,256>>>(v1,Annz1,Aimap1,Ajmap1,Aperm1,Aa); 365cbc6b225SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); 366cbc6b225SStefano Zampini } 367cbc6b225SStefano Zampini if (Bnnz1) { 368219fbbafSJunchao Zhang MatAddCOOValues<<<(Bnnz1+255)/256,256>>>(v1,Bnnz1,Bimap1,Bjmap1,Bperm1,Ba); 369cbc6b225SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); 370cbc6b225SStefano Zampini } 3715f80ce2aSJacob Faibussowitsch if (size > 1) CHKERRQ(PetscSFReduceEnd(mpiaij->coo_sf,MPIU_SCALAR,vsend,v2,MPI_REPLACE)); 372219fbbafSJunchao Zhang 373219fbbafSJunchao Zhang /* Add received remote entries to A and B */ 374cbc6b225SStefano Zampini if (Annz2) { 375219fbbafSJunchao Zhang MatAddCOOValues<<<(Annz2+255)/256,256>>>(v2,Annz2,Aimap2,Ajmap2,Aperm2,Aa); 376cbc6b225SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); 377cbc6b225SStefano Zampini } 378cbc6b225SStefano Zampini if (Bnnz2) { 379219fbbafSJunchao Zhang MatAddCOOValues<<<(Bnnz2+255)/256,256>>>(v2,Bnnz2,Bimap2,Bjmap2,Bperm2,Ba); 380cbc6b225SStefano Zampini CHKERRCUDA(cudaPeekAtLastError()); 381cbc6b225SStefano Zampini } 382219fbbafSJunchao Zhang 383219fbbafSJunchao Zhang if (imode == INSERT_VALUES) { 3845f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(A,&Aa)); 3855f80ce2aSJacob Faibussowitsch if (size > 1) CHKERRQ(MatSeqAIJCUSPARSERestoreArrayWrite(B,&Ba)); 386219fbbafSJunchao Zhang } else { 3875f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSERestoreArray(A,&Aa)); 3885f80ce2aSJacob Faibussowitsch if (size > 1) CHKERRQ(MatSeqAIJCUSPARSERestoreArray(B,&Ba)); 389219fbbafSJunchao Zhang } 3905f80ce2aSJacob Faibussowitsch if (PetscMemTypeHost(memtype)) CHKERRCUDA(cudaFree((void*)v1)); 391219fbbafSJunchao Zhang } else { 3925f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetValuesCOO_MPIAIJCUSPARSE_Basic(mat,v,imode)); 393219fbbafSJunchao Zhang } 394cbc6b225SStefano Zampini mat->offloadmask = PETSC_OFFLOAD_GPU; 395219fbbafSJunchao Zhang PetscFunctionReturn(0); 396219fbbafSJunchao Zhang } 397219fbbafSJunchao Zhang 398ed502f03SStefano Zampini static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc) 399ed502f03SStefano Zampini { 400ed502f03SStefano Zampini Mat Ad,Ao; 401ed502f03SStefano Zampini const PetscInt *cmap; 402ed502f03SStefano Zampini 403ed502f03SStefano Zampini PetscFunctionBegin; 4045f80ce2aSJacob Faibussowitsch CHKERRQ(MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap)); 4055f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc)); 406ed502f03SStefano Zampini if (glob) { 407ed502f03SStefano Zampini PetscInt cst, i, dn, on, *gidx; 408ed502f03SStefano Zampini 4095f80ce2aSJacob Faibussowitsch CHKERRQ(MatGetLocalSize(Ad,NULL,&dn)); 4105f80ce2aSJacob Faibussowitsch CHKERRQ(MatGetLocalSize(Ao,NULL,&on)); 4115f80ce2aSJacob Faibussowitsch CHKERRQ(MatGetOwnershipRangeColumn(A,&cst,NULL)); 4125f80ce2aSJacob Faibussowitsch CHKERRQ(PetscMalloc1(dn+on,&gidx)); 413ed502f03SStefano Zampini for (i = 0; i<dn; i++) gidx[i] = cst + i; 414ed502f03SStefano Zampini for (i = 0; i<on; i++) gidx[i+dn] = cmap[i]; 4155f80ce2aSJacob Faibussowitsch CHKERRQ(ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob)); 416ed502f03SStefano Zampini } 417ed502f03SStefano Zampini PetscFunctionReturn(0); 418ed502f03SStefano Zampini } 419ed502f03SStefano Zampini 4209ae82921SPaul Mullowney PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[]) 4219ae82921SPaul Mullowney { 422bbf3fe20SPaul Mullowney Mat_MPIAIJ *b = (Mat_MPIAIJ*)B->data; 423bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr; 4249ae82921SPaul Mullowney PetscInt i; 4259ae82921SPaul Mullowney 4269ae82921SPaul Mullowney PetscFunctionBegin; 4275f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLayoutSetUp(B->rmap)); 4285f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLayoutSetUp(B->cmap)); 4297e8381f9SStefano Zampini if (PetscDefined(USE_DEBUG) && d_nnz) { 4309ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 4312c71b3e2SJacob Faibussowitsch PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]); 4329ae82921SPaul Mullowney } 4339ae82921SPaul Mullowney } 4347e8381f9SStefano Zampini if (PetscDefined(USE_DEBUG) && o_nnz) { 4359ae82921SPaul Mullowney for (i=0; i<B->rmap->n; i++) { 4362c71b3e2SJacob Faibussowitsch PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]); 4379ae82921SPaul Mullowney } 4389ae82921SPaul Mullowney } 4396a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE) 4405f80ce2aSJacob Faibussowitsch CHKERRQ(PetscTableDestroy(&b->colmap)); 4416a29ce69SStefano Zampini #else 4425f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(b->colmap)); 4436a29ce69SStefano Zampini #endif 4445f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(b->garray)); 4455f80ce2aSJacob Faibussowitsch CHKERRQ(VecDestroy(&b->lvec)); 4465f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterDestroy(&b->Mvctx)); 4476a29ce69SStefano Zampini /* Because the B will have been resized we simply destroy it and create a new one each time */ 4485f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy(&b->B)); 4496a29ce69SStefano Zampini if (!b->A) { 4505f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->A)); 4515f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n)); 4525f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->A)); 4536a29ce69SStefano Zampini } 4546a29ce69SStefano Zampini if (!b->B) { 4556a29ce69SStefano Zampini PetscMPIInt size; 4565f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B),&size)); 4575f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(PETSC_COMM_SELF,&b->B)); 4585f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0)); 4595f80ce2aSJacob Faibussowitsch CHKERRQ(PetscLogObjectParent((PetscObject)B,(PetscObject)b->B)); 4609ae82921SPaul Mullowney } 4615f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(b->A,MATSEQAIJCUSPARSE)); 4625f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(b->B,MATSEQAIJCUSPARSE)); 4635f80ce2aSJacob Faibussowitsch CHKERRQ(MatBindToCPU(b->A,B->boundtocpu)); 4645f80ce2aSJacob Faibussowitsch CHKERRQ(MatBindToCPU(b->B,B->boundtocpu)); 4655f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz)); 4665f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz)); 4675f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat)); 4685f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat)); 4695f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetHandle(b->A,cusparseStruct->handle)); 4705f80ce2aSJacob Faibussowitsch CHKERRQ(MatCUSPARSESetHandle(b->B,cusparseStruct->handle)); 4719ae82921SPaul Mullowney B->preallocated = PETSC_TRUE; 4729ae82921SPaul Mullowney PetscFunctionReturn(0); 4739ae82921SPaul Mullowney } 474e057df02SPaul Mullowney 4759ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 4769ae82921SPaul Mullowney { 4779ae82921SPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 4789ae82921SPaul Mullowney 4799ae82921SPaul Mullowney PetscFunctionBegin; 4805f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 4815f80ce2aSJacob Faibussowitsch CHKERRQ((*a->A->ops->mult)(a->A,xx,yy)); 4825f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 4835f80ce2aSJacob Faibussowitsch CHKERRQ((*a->B->ops->multadd)(a->B,a->lvec,yy,yy)); 4849ae82921SPaul Mullowney PetscFunctionReturn(0); 4859ae82921SPaul Mullowney } 486ca45077fSPaul Mullowney 4873fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A) 4883fa6b06aSMark Adams { 4893fa6b06aSMark Adams Mat_MPIAIJ *l = (Mat_MPIAIJ*)A->data; 4903fa6b06aSMark Adams 4913fa6b06aSMark Adams PetscFunctionBegin; 4925f80ce2aSJacob Faibussowitsch CHKERRQ(MatZeroEntries(l->A)); 4935f80ce2aSJacob Faibussowitsch CHKERRQ(MatZeroEntries(l->B)); 4943fa6b06aSMark Adams PetscFunctionReturn(0); 4953fa6b06aSMark Adams } 496042217e8SBarry Smith 497fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz) 498fdc842d1SBarry Smith { 499fdc842d1SBarry Smith Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 500fdc842d1SBarry Smith 501fdc842d1SBarry Smith PetscFunctionBegin; 5025f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 5035f80ce2aSJacob Faibussowitsch CHKERRQ((*a->A->ops->multadd)(a->A,xx,yy,zz)); 5045f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD)); 5055f80ce2aSJacob Faibussowitsch CHKERRQ((*a->B->ops->multadd)(a->B,a->lvec,zz,zz)); 506fdc842d1SBarry Smith PetscFunctionReturn(0); 507fdc842d1SBarry Smith } 508fdc842d1SBarry Smith 509ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy) 510ca45077fSPaul Mullowney { 511ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 512ca45077fSPaul Mullowney 513ca45077fSPaul Mullowney PetscFunctionBegin; 5145f80ce2aSJacob Faibussowitsch CHKERRQ((*a->B->ops->multtranspose)(a->B,xx,a->lvec)); 5155f80ce2aSJacob Faibussowitsch CHKERRQ((*a->A->ops->multtranspose)(a->A,xx,yy)); 5165f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 5175f80ce2aSJacob Faibussowitsch CHKERRQ(VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE)); 518ca45077fSPaul Mullowney PetscFunctionReturn(0); 519ca45077fSPaul Mullowney } 5209ae82921SPaul Mullowney 521e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format) 522ca45077fSPaul Mullowney { 523ca45077fSPaul Mullowney Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 524bbf3fe20SPaul Mullowney Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 525e057df02SPaul Mullowney 526ca45077fSPaul Mullowney PetscFunctionBegin; 527e057df02SPaul Mullowney switch (op) { 528e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_DIAG: 529e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 530045c96e1SPaul Mullowney break; 531e057df02SPaul Mullowney case MAT_CUSPARSE_MULT_OFFDIAG: 532e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 533045c96e1SPaul Mullowney break; 534e057df02SPaul Mullowney case MAT_CUSPARSE_ALL: 535e057df02SPaul Mullowney cusparseStruct->diagGPUMatFormat = format; 536e057df02SPaul Mullowney cusparseStruct->offdiagGPUMatFormat = format; 537045c96e1SPaul Mullowney break; 538e057df02SPaul Mullowney default: 53998921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op); 540045c96e1SPaul Mullowney } 541ca45077fSPaul Mullowney PetscFunctionReturn(0); 542ca45077fSPaul Mullowney } 543e057df02SPaul Mullowney 5444416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A) 545e057df02SPaul Mullowney { 546e057df02SPaul Mullowney MatCUSPARSEStorageFormat format; 547e057df02SPaul Mullowney PetscBool flg; 548a183c035SDominic Meiser Mat_MPIAIJ *a = (Mat_MPIAIJ*)A->data; 549a183c035SDominic Meiser Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr; 5505fd66863SKarl Rupp 551e057df02SPaul Mullowney PetscFunctionBegin; 5525f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options")); 553e057df02SPaul Mullowney if (A->factortype==MAT_FACTOR_NONE) { 5545f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", 5555f80ce2aSJacob Faibussowitsch "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg)); 5565f80ce2aSJacob Faibussowitsch if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format)); 5575f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 5585f80ce2aSJacob Faibussowitsch "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg)); 5595f80ce2aSJacob Faibussowitsch if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format)); 5605f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", 5615f80ce2aSJacob Faibussowitsch "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg)); 5625f80ce2aSJacob Faibussowitsch if (flg) CHKERRQ(MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format)); 563e057df02SPaul Mullowney } 5645f80ce2aSJacob Faibussowitsch CHKERRQ(PetscOptionsTail()); 565e057df02SPaul Mullowney PetscFunctionReturn(0); 566e057df02SPaul Mullowney } 567e057df02SPaul Mullowney 56834d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode) 56934d6c7a5SJose E. Roman { 5703fa6b06aSMark Adams Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ*)A->data; 571042217e8SBarry Smith Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr; 572042217e8SBarry Smith PetscObjectState onnz = A->nonzerostate; 573042217e8SBarry Smith 57434d6c7a5SJose E. Roman PetscFunctionBegin; 5755f80ce2aSJacob Faibussowitsch CHKERRQ(MatAssemblyEnd_MPIAIJ(A,mode)); 5765f80ce2aSJacob Faibussowitsch if (mpiaij->lvec) CHKERRQ(VecSetType(mpiaij->lvec,VECSEQCUDA)); 577042217e8SBarry Smith if (onnz != A->nonzerostate && cusp->deviceMat) { 578042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat; 579042217e8SBarry Smith 5805f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(A,"Destroy device mat since nonzerostate changed\n")); 5815f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&h_mat)); 5825f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost)); 5835f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->colmap)); 58499551766SMark Adams if (h_mat->allocated_indices) { 5855f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->diag.i)); 5865f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->diag.j)); 58799551766SMark Adams if (h_mat->offdiag.j) { 5885f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->offdiag.i)); 5895f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->offdiag.j)); 59099551766SMark Adams } 59199551766SMark Adams } 5925f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(d_mat)); 5935f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(h_mat)); 594042217e8SBarry Smith cusp->deviceMat = NULL; 5953fa6b06aSMark Adams } 59634d6c7a5SJose E. Roman PetscFunctionReturn(0); 59734d6c7a5SJose E. Roman } 59834d6c7a5SJose E. Roman 599bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A) 600bbf3fe20SPaul Mullowney { 6013fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 6023fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr; 603bbf3fe20SPaul Mullowney 604bbf3fe20SPaul Mullowney PetscFunctionBegin; 605*28b400f6SJacob Faibussowitsch PetscCheck(cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr"); 6063fa6b06aSMark Adams if (cusparseStruct->deviceMat) { 607042217e8SBarry Smith PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat; 608042217e8SBarry Smith 6095f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(A,"Have device matrix\n")); 6105f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&h_mat)); 6115f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost)); 6125f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->colmap)); 61399551766SMark Adams if (h_mat->allocated_indices) { 6145f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->diag.i)); 6155f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->diag.j)); 61699551766SMark Adams if (h_mat->offdiag.j) { 6175f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->offdiag.i)); 6185f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(h_mat->offdiag.j)); 61999551766SMark Adams } 62099551766SMark Adams } 6215f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaFree(d_mat)); 6225f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(h_mat)); 6233fa6b06aSMark Adams } 624bbf3fe20SPaul Mullowney try { 6255f80ce2aSJacob Faibussowitsch if (aij->A) CHKERRQ(MatCUSPARSEClearHandle(aij->A)); 6265f80ce2aSJacob Faibussowitsch if (aij->B) CHKERRQ(MatCUSPARSEClearHandle(aij->B)); 6275f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseDestroy(cusparseStruct->handle)); 628a0e72f99SJunchao Zhang /* We want cusparseStruct to use PetscDefaultCudaStream 62917403302SKarl Rupp if (cusparseStruct->stream) { 6305f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaStreamDestroy(cusparseStruct->stream)); 63117403302SKarl Rupp } 632a0e72f99SJunchao Zhang */ 633cbc6b225SStefano Zampini /* Free COO */ 6345f80ce2aSJacob Faibussowitsch CHKERRQ(MatResetPreallocationCOO_MPIAIJCUSPARSE(A)); 635bbf3fe20SPaul Mullowney delete cusparseStruct; 636bbf3fe20SPaul Mullowney } catch(char *ex) { 63798921bdaSJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex); 638bbf3fe20SPaul Mullowney } 6395f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL)); 6405f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL)); 6415f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL)); 6425f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL)); 6435f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL)); 6445f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL)); 6455f80ce2aSJacob Faibussowitsch CHKERRQ(MatDestroy_MPIAIJ(A)); 646bbf3fe20SPaul Mullowney PetscFunctionReturn(0); 647bbf3fe20SPaul Mullowney } 648ca45077fSPaul Mullowney 6493338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat) 6509ae82921SPaul Mullowney { 651bbf3fe20SPaul Mullowney Mat_MPIAIJ *a; 6523338378cSStefano Zampini Mat A; 6539ae82921SPaul Mullowney 6549ae82921SPaul Mullowney PetscFunctionBegin; 6555f80ce2aSJacob Faibussowitsch CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 6565f80ce2aSJacob Faibussowitsch if (reuse == MAT_INITIAL_MATRIX) CHKERRQ(MatDuplicate(B,MAT_COPY_VALUES,newmat)); 6575f80ce2aSJacob Faibussowitsch else if (reuse == MAT_REUSE_MATRIX) CHKERRQ(MatCopy(B,*newmat,SAME_NONZERO_PATTERN)); 6583338378cSStefano Zampini A = *newmat; 6596f3d89d0SStefano Zampini A->boundtocpu = PETSC_FALSE; 6605f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(A->defaultvectype)); 6615f80ce2aSJacob Faibussowitsch CHKERRQ(PetscStrallocpy(VECCUDA,&A->defaultvectype)); 66234136279SStefano Zampini 663bbf3fe20SPaul Mullowney a = (Mat_MPIAIJ*)A->data; 6645f80ce2aSJacob Faibussowitsch if (a->A) CHKERRQ(MatSetType(a->A,MATSEQAIJCUSPARSE)); 6655f80ce2aSJacob Faibussowitsch if (a->B) CHKERRQ(MatSetType(a->B,MATSEQAIJCUSPARSE)); 6665f80ce2aSJacob Faibussowitsch if (a->lvec) CHKERRQ(VecSetType(a->lvec,VECSEQCUDA)); 667d98d7c49SStefano Zampini 6683338378cSStefano Zampini if (reuse != MAT_REUSE_MATRIX && !a->spptr) { 669219fbbafSJunchao Zhang Mat_MPIAIJCUSPARSE *cusp = new Mat_MPIAIJCUSPARSE; 6705f80ce2aSJacob Faibussowitsch CHKERRCUSPARSE(cusparseCreate(&(cusp->handle))); 671219fbbafSJunchao Zhang a->spptr = cusp; 6723338378cSStefano Zampini } 6732205254eSKarl Rupp 67434d6c7a5SJose E. Roman A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE; 675bbf3fe20SPaul Mullowney A->ops->mult = MatMult_MPIAIJCUSPARSE; 676fdc842d1SBarry Smith A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE; 677bbf3fe20SPaul Mullowney A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE; 678bbf3fe20SPaul Mullowney A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE; 679bbf3fe20SPaul Mullowney A->ops->destroy = MatDestroy_MPIAIJCUSPARSE; 6803fa6b06aSMark Adams A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE; 6814e84afc0SStefano Zampini A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND; 6822205254eSKarl Rupp 6835f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE)); 6845f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE)); 6855f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE)); 6865f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE)); 6875f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE)); 6885f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE)); 689ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE) 6905f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE)); 691ae48a8d0SStefano Zampini #endif 6929ae82921SPaul Mullowney PetscFunctionReturn(0); 6939ae82921SPaul Mullowney } 6949ae82921SPaul Mullowney 6953338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A) 6963338378cSStefano Zampini { 6973338378cSStefano Zampini PetscFunctionBegin; 6985f80ce2aSJacob Faibussowitsch CHKERRQ(PetscDeviceInitialize(PETSC_DEVICE_CUDA)); 6995f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate_MPIAIJ(A)); 7005f80ce2aSJacob Faibussowitsch CHKERRQ(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A)); 7013338378cSStefano Zampini PetscFunctionReturn(0); 7023338378cSStefano Zampini } 7033338378cSStefano Zampini 7043f9c0db1SPaul Mullowney /*@ 7053f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 706e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 7073f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 708e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 709e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 710e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 7119ae82921SPaul Mullowney 712d083f849SBarry Smith Collective 713e057df02SPaul Mullowney 714e057df02SPaul Mullowney Input Parameters: 715e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 716e057df02SPaul Mullowney . m - number of rows 717e057df02SPaul Mullowney . n - number of columns 718e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 719e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 7200298fd71SBarry Smith (possibly different for each row) or NULL 721e057df02SPaul Mullowney 722e057df02SPaul Mullowney Output Parameter: 723e057df02SPaul Mullowney . A - the matrix 724e057df02SPaul Mullowney 725e057df02SPaul Mullowney It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(), 726e057df02SPaul Mullowney MatXXXXSetPreallocation() paradigm instead of this routine directly. 727e057df02SPaul Mullowney [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation] 728e057df02SPaul Mullowney 729e057df02SPaul Mullowney Notes: 730e057df02SPaul Mullowney If nnz is given then nz is ignored 731e057df02SPaul Mullowney 732e057df02SPaul Mullowney The AIJ format (also called the Yale sparse matrix format or 733e057df02SPaul Mullowney compressed row storage), is fully compatible with standard Fortran 77 734e057df02SPaul Mullowney storage. That is, the stored row and column indices can begin at 735e057df02SPaul Mullowney either one (as in Fortran) or zero. See the users' manual for details. 736e057df02SPaul Mullowney 737e057df02SPaul Mullowney Specify the preallocated storage with either nz or nnz (not both). 7380298fd71SBarry Smith Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory 739e057df02SPaul Mullowney allocation. For large problems you MUST preallocate memory or you 740e057df02SPaul Mullowney will get TERRIBLE performance, see the users' manual chapter on matrices. 741e057df02SPaul Mullowney 742e057df02SPaul Mullowney By default, this format uses inodes (identical nodes) when possible, to 743e057df02SPaul Mullowney improve numerical efficiency of matrix-vector products and solves. We 744e057df02SPaul Mullowney search for consecutive rows with the same nonzero structure, thereby 745e057df02SPaul Mullowney reusing matrix information to achieve increased efficiency. 746e057df02SPaul Mullowney 747e057df02SPaul Mullowney Level: intermediate 748e057df02SPaul Mullowney 749e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE 750e057df02SPaul Mullowney @*/ 7519ae82921SPaul Mullowney PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A) 7529ae82921SPaul Mullowney { 7539ae82921SPaul Mullowney PetscMPIInt size; 7549ae82921SPaul Mullowney 7559ae82921SPaul Mullowney PetscFunctionBegin; 7565f80ce2aSJacob Faibussowitsch CHKERRQ(MatCreate(comm,A)); 7575f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetSizes(*A,m,n,M,N)); 7585f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Comm_size(comm,&size)); 7599ae82921SPaul Mullowney if (size > 1) { 7605f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(*A,MATMPIAIJCUSPARSE)); 7615f80ce2aSJacob Faibussowitsch CHKERRQ(MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz)); 7629ae82921SPaul Mullowney } else { 7635f80ce2aSJacob Faibussowitsch CHKERRQ(MatSetType(*A,MATSEQAIJCUSPARSE)); 7645f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJSetPreallocation(*A,d_nz,d_nnz)); 7659ae82921SPaul Mullowney } 7669ae82921SPaul Mullowney PetscFunctionReturn(0); 7679ae82921SPaul Mullowney } 7689ae82921SPaul Mullowney 7693ca39a21SBarry Smith /*MC 7706bb69460SJunchao Zhang MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE. 771e057df02SPaul Mullowney 7722692e278SPaul Mullowney A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either 7732692e278SPaul Mullowney CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later. 7742692e278SPaul Mullowney All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library. 7759ae82921SPaul Mullowney 7769ae82921SPaul Mullowney This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator, 7779ae82921SPaul Mullowney and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators, 7789ae82921SPaul Mullowney MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported 7799ae82921SPaul Mullowney for communicators controlling multiple processes. It is recommended that you call both of 7809ae82921SPaul Mullowney the above preallocation routines for simplicity. 7819ae82921SPaul Mullowney 7829ae82921SPaul Mullowney Options Database Keys: 783e057df02SPaul Mullowney + -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions() 7848468deeeSKarl Rupp . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7858468deeeSKarl Rupp . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7868468deeeSKarl Rupp - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid). 7879ae82921SPaul Mullowney 7889ae82921SPaul Mullowney Level: beginner 7899ae82921SPaul Mullowney 7906bb69460SJunchao Zhang .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation 7916bb69460SJunchao Zhang M*/ 7926bb69460SJunchao Zhang 7936bb69460SJunchao Zhang /*MC 7946bb69460SJunchao Zhang MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE. 7956bb69460SJunchao Zhang 7966bb69460SJunchao Zhang Level: beginner 7976bb69460SJunchao Zhang 7986bb69460SJunchao Zhang .seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE 7999ae82921SPaul Mullowney M*/ 8003fa6b06aSMark Adams 801042217e8SBarry Smith // get GPU pointers to stripped down Mat. For both seq and MPI Mat. 802042217e8SBarry Smith PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B) 8033fa6b06aSMark Adams { 804042217e8SBarry Smith PetscSplitCSRDataStructure d_mat; 805042217e8SBarry Smith PetscMPIInt size; 806042217e8SBarry Smith int *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL; 807042217e8SBarry Smith PetscScalar *aa = NULL,*ba = NULL; 80899551766SMark Adams Mat_SeqAIJ *jaca = NULL, *jacb = NULL; 809042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructA = NULL; 810042217e8SBarry Smith CsrMatrix *matrixA = NULL,*matrixB = NULL; 8113fa6b06aSMark Adams 8123fa6b06aSMark Adams PetscFunctionBegin; 813*28b400f6SJacob Faibussowitsch PetscCheck(A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix"); 814042217e8SBarry Smith if (A->factortype != MAT_FACTOR_NONE) { 8153fa6b06aSMark Adams *B = NULL; 8163fa6b06aSMark Adams PetscFunctionReturn(0); 8173fa6b06aSMark Adams } 8185f80ce2aSJacob Faibussowitsch CHKERRMPI(MPI_Comm_size(PetscObjectComm((PetscObject)A),&size)); 81999551766SMark Adams // get jaca 8203fa6b06aSMark Adams if (size == 1) { 821042217e8SBarry Smith PetscBool isseqaij; 822042217e8SBarry Smith 8235f80ce2aSJacob Faibussowitsch CHKERRQ(PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij)); 824042217e8SBarry Smith if (isseqaij) { 8253fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)A->data; 826*28b400f6SJacob Faibussowitsch PetscCheck(jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 827042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr; 828042217e8SBarry Smith d_mat = cusparsestructA->deviceMat; 8295f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(A)); 8303fa6b06aSMark Adams } else { 8313fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 832*28b400f6SJacob Faibussowitsch PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 833042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 8343fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)aij->A->data; 835042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 836042217e8SBarry Smith d_mat = spptr->deviceMat; 8375f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 8383fa6b06aSMark Adams } 839042217e8SBarry Smith if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 840042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 841*28b400f6SJacob Faibussowitsch PetscCheck(matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 842042217e8SBarry Smith matrixA = (CsrMatrix*)matstruct->mat; 843042217e8SBarry Smith bi = NULL; 844042217e8SBarry Smith bj = NULL; 845042217e8SBarry Smith ba = NULL; 846042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 847042217e8SBarry Smith } else { 848042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 849*28b400f6SJacob Faibussowitsch PetscCheck(aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion"); 850042217e8SBarry Smith jaca = (Mat_SeqAIJ*)aij->A->data; 85199551766SMark Adams jacb = (Mat_SeqAIJ*)aij->B->data; 852042217e8SBarry Smith Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 8533fa6b06aSMark Adams 8542c71b3e2SJacob Faibussowitsch PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)"); 855042217e8SBarry Smith cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 856042217e8SBarry Smith Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 8572c71b3e2SJacob Faibussowitsch PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 858042217e8SBarry Smith if (cusparsestructB->format==MAT_CUSPARSE_CSR) { 8595f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->A)); 8605f80ce2aSJacob Faibussowitsch CHKERRQ(MatSeqAIJCUSPARSECopyToGPU(aij->B)); 861042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 862042217e8SBarry Smith Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 863*28b400f6SJacob Faibussowitsch PetscCheck(matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A"); 864*28b400f6SJacob Faibussowitsch PetscCheck(matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B"); 865042217e8SBarry Smith matrixA = (CsrMatrix*)matstructA->mat; 866042217e8SBarry Smith matrixB = (CsrMatrix*)matstructB->mat; 8673fa6b06aSMark Adams if (jacb->compressedrow.use) { 868042217e8SBarry Smith if (!cusparsestructB->rowoffsets_gpu) { 869042217e8SBarry Smith cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 870042217e8SBarry Smith cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1); 8713fa6b06aSMark Adams } 872042217e8SBarry Smith bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data()); 873042217e8SBarry Smith } else { 874042217e8SBarry Smith bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 875042217e8SBarry Smith } 876042217e8SBarry Smith bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 877042217e8SBarry Smith ba = thrust::raw_pointer_cast(matrixB->values->data()); 878042217e8SBarry Smith } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 879042217e8SBarry Smith d_mat = spptr->deviceMat; 880042217e8SBarry Smith } 8813fa6b06aSMark Adams if (jaca->compressedrow.use) { 882042217e8SBarry Smith if (!cusparsestructA->rowoffsets_gpu) { 883042217e8SBarry Smith cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1); 884042217e8SBarry Smith cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1); 8853fa6b06aSMark Adams } 886042217e8SBarry Smith ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data()); 8873fa6b06aSMark Adams } else { 888042217e8SBarry Smith ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 8893fa6b06aSMark Adams } 890042217e8SBarry Smith aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 891042217e8SBarry Smith aa = thrust::raw_pointer_cast(matrixA->values->data()); 892042217e8SBarry Smith 893042217e8SBarry Smith if (!d_mat) { 894042217e8SBarry Smith PetscSplitCSRDataStructure h_mat; 895042217e8SBarry Smith 896042217e8SBarry Smith // create and populate strucy on host and copy on device 8975f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(A,"Create device matrix\n")); 8985f80ce2aSJacob Faibussowitsch CHKERRQ(PetscNew(&h_mat)); 8995f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_mat,sizeof(*d_mat))); 900042217e8SBarry Smith if (size > 1) { /* need the colmap array */ 901042217e8SBarry Smith Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 90299551766SMark Adams PetscInt *colmap; 903042217e8SBarry Smith PetscInt ii,n = aij->B->cmap->n,N = A->cmap->N; 904042217e8SBarry Smith 9052c71b3e2SJacob Faibussowitsch PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 906042217e8SBarry Smith 9075f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCalloc1(N+1,&colmap)); 908042217e8SBarry Smith for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1); 909365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 910365b711fSMark Adams { // have to make a long version of these 91199551766SMark Adams int *h_bi32, *h_bj32; 91299551766SMark Adams PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64; 9135f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64)); 9145f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost)); 91599551766SMark Adams for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i]; 9165f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost)); 91799551766SMark Adams for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i]; 91899551766SMark Adams 9195f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64))); 9205f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice)); 9215f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64))); 9225f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice)); 92399551766SMark Adams 92499551766SMark Adams h_mat->offdiag.i = d_bi64; 92599551766SMark Adams h_mat->offdiag.j = d_bj64; 92699551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 92799551766SMark Adams 9285f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64)); 929365b711fSMark Adams } 930365b711fSMark Adams #else 93199551766SMark Adams h_mat->offdiag.i = (PetscInt*)bi; 93299551766SMark Adams h_mat->offdiag.j = (PetscInt*)bj; 93399551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 934365b711fSMark Adams #endif 935042217e8SBarry Smith h_mat->offdiag.a = ba; 936042217e8SBarry Smith h_mat->offdiag.n = A->rmap->n; 937042217e8SBarry Smith 9385f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap))); 9395f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice)); 9405f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(colmap)); 941042217e8SBarry Smith } 942042217e8SBarry Smith h_mat->rstart = A->rmap->rstart; 943042217e8SBarry Smith h_mat->rend = A->rmap->rend; 944042217e8SBarry Smith h_mat->cstart = A->cmap->rstart; 945042217e8SBarry Smith h_mat->cend = A->cmap->rend; 94649b994a9SMark Adams h_mat->M = A->cmap->N; 947365b711fSMark Adams #if defined(PETSC_USE_64BIT_INDICES) 948365b711fSMark Adams { 9492c71b3e2SJacob Faibussowitsch PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size pof PetscInt = %d",sizeof(PetscInt)); 95099551766SMark Adams int *h_ai32, *h_aj32; 95199551766SMark Adams PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64; 9525f80ce2aSJacob Faibussowitsch CHKERRQ(PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64)); 9535f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost)); 95499551766SMark Adams for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i]; 9555f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost)); 95699551766SMark Adams for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i]; 95799551766SMark Adams 9585f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64))); 9595f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice)); 9605f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64))); 9615f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice)); 96299551766SMark Adams 96399551766SMark Adams h_mat->diag.i = d_ai64; 96499551766SMark Adams h_mat->diag.j = d_aj64; 96599551766SMark Adams h_mat->allocated_indices = PETSC_TRUE; 96699551766SMark Adams 9675f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64)); 968365b711fSMark Adams } 969365b711fSMark Adams #else 97099551766SMark Adams h_mat->diag.i = (PetscInt*)ai; 97199551766SMark Adams h_mat->diag.j = (PetscInt*)aj; 97299551766SMark Adams h_mat->allocated_indices = PETSC_FALSE; 973365b711fSMark Adams #endif 974042217e8SBarry Smith h_mat->diag.a = aa; 975042217e8SBarry Smith h_mat->diag.n = A->rmap->n; 976042217e8SBarry Smith h_mat->rank = PetscGlobalRank; 977042217e8SBarry Smith // copy pointers and metadata to device 9785f80ce2aSJacob Faibussowitsch CHKERRCUDA(cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice)); 9795f80ce2aSJacob Faibussowitsch CHKERRQ(PetscFree(h_mat)); 980042217e8SBarry Smith } else { 9815f80ce2aSJacob Faibussowitsch CHKERRQ(PetscInfo(A,"Reusing device matrix\n")); 982042217e8SBarry Smith } 983042217e8SBarry Smith *B = d_mat; 984042217e8SBarry Smith A->offloadmask = PETSC_OFFLOAD_GPU; 9853fa6b06aSMark Adams PetscFunctionReturn(0); 9863fa6b06aSMark Adams } 987