#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <petscsf.h>

struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n = cusp->coo_nd + cusp->coo_no;
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v);
    } else {
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}

template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               *jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
  if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}
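
/*
   Usage sketch (illustrative only, not part of the implementation): the two COO
   routines above are not called directly; they back the generic
   MatSetPreallocationCOO()/MatSetValuesCOO() interfaces through the function
   compositions made in MatConvert_MPIAIJ_MPIAIJCUSPARSE() below. Assuming ncoo
   local entries with global indices coo_i[]/coo_j[] and values v[] (placeholder
   names), assembly looks like

     ierr = MatSetType(A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
     ierr = MatSetPreallocationCOO(A,ncoo,coo_i,coo_j);CHKERRQ(ierr);
     ierr = MatSetValuesCOO(A,v,INSERT_VALUES);CHKERRQ(ierr);  (can be repeated with new values)

   As the isCudaMem() check above shows, v may point to either host or device memory.
*/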

static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
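
/*
   Usage sketch (illustrative only): the merge routine above backs the generic
   MatMPIAIJGetLocalMatMerge() interface, which concatenates the diagonal and
   off-diagonal blocks into one sequential matrix; the IS returned in glob maps
   its local column numbering back to global indices:

     Mat A_loc;
     IS  glob;
     ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&A_loc);CHKERRQ(ierr);
     (use A_loc, translating its column indices through glob)
     ierr = ISDestroy(&glob);CHKERRQ(ierr);
     ierr = MatDestroy(&A_loc);CHKERRQ(ierr);
*/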

PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  /* overlap the scatter of the ghost values with the multiply by the diagonal block */
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
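
/*
   Usage sketch (illustrative only): per the implementations above, the storage
   format of the two blocks can be selected either programmatically,

     ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,MAT_CUSPARSE_ELL);CHKERRQ(ierr);

   or, equivalently, from the options database at MatSetFromOptions() time:

     -mat_cusparse_storage_format ell
*/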

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz = A->nonzerostate;

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}
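
/*
   Usage sketch (illustrative only): the converter above is registered for the
   generic MatConvert() interface, so an existing host matrix can be moved to
   the GPU with

     Mat B;
     ierr = MatConvert(A,MATMPIAIJCUSPARSE,MAT_INITIAL_MATRIX,&B);CHKERRQ(ierr);

   or converted in place with MAT_INPLACE_MATRIX, which is how
   MatCreate_MPIAIJCUSPARSE() below uses it.
*/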

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz and o_nz (or the arrays d_nnz and o_nnz). By setting these
   parameters accurately, performance during matrix assembly can be increased by
   more than a factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE to have it calculated from M)
.  n - number of local columns (or PETSC_DECIDE to have it calculated from N)
.  M - number of global rows (or PETSC_DETERMINE to have it calculated from m)
.  N - number of global columns (or PETSC_DETERMINE to have it calculated from n)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz = o_nz = PETSC_DEFAULT and d_nnz = o_nnz = NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
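
/*
   Usage sketch (illustrative only): creating a 100x100 global matrix with at
   most 5 nonzeros per row in the diagonal block and 2 in the off-diagonal block:

     Mat A;
     ierr = MatCreateAIJCUSPARSE(PETSC_COMM_WORLD,PETSC_DECIDE,PETSC_DECIDE,100,100,5,NULL,2,NULL,&A);CHKERRQ(ierr);
*/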

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

   Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as MATAIJCUSPARSE.

   Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

PETSC_INTERN PetscErrorCode MatSeqAIJCUSPARSECopyToGPU(Mat);

// get GPU pointers to a stripped-down Mat; works for both Seq and MPI Mat
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  if (!A->assembled) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      if (!jaca->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      if (!matstruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    if (!aij->roworiented) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ *jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    if (!A->nooffprocentries && !aij->donotstash) SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    if (cusparsestructA->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      if (!matstructA) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      if (!matstructB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t                cerr;
    PetscSplitCSRDataStructure h_mat;

    /* create and populate struct on host and copy to device */
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      int        *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      if (n && !aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);

      h_mat->offdiag.i = bi;
      h_mat->offdiag.j = bj;
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(int));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(int),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->N      = A->cmap->N;
    h_mat->diag.i = ai;
    h_mat->diag.j = aj;
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(0);
}
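
/*
   Usage sketch (illustrative only): MatCUSPARSEGetDeviceMatWrite() supports
   device-side assembly. The returned PetscSplitCSRDataStructure is a device
   pointer that can be passed to a user CUDA kernel, where entries are inserted
   with the device variant of MatSetValues() (see petscaijdevice.h). Note that,
   per the check above, the matrix must already be assembled (i.e., its nonzero
   pattern set) before this routine is called. A hypothetical host call sequence,
   with AssembleKernel being a user kernel calling MatSetValuesDevice(d_mat,...):

     PetscSplitCSRDataStructure d_mat;
     ierr = MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE);CHKERRQ(ierr);
     ierr = MatCUSPARSEGetDeviceMatWrite(A,&d_mat);CHKERRQ(ierr);
     AssembleKernel<<<nblocks,nthreads>>>(d_mat,...);
     ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
     ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
*/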