#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

/* Device/host functor for thrust::for_each over a zip iterator:
   copies tuple element <0> into tuple element <1> (gather-assign). */
struct VecCUDAEquals
{
  template <typename Tuple>
  __host__ __device__
  void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

/* Scatter COO values v[] into the diagonal (a->A) and off-diagonal (a->B) blocks.
   When a permutation (cusp->coo_p) was built at preallocation time, v is first
   permuted on the device into the work array cusp->coo_pw; host-side v is staged
   through a temporary device array. v == NULL is forwarded to the Seq kernels. */
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
  PetscInt           n     = cusp->coo_nd + cusp->coo_no; /* total COO entries: diag + offdiag */
  PetscErrorCode     ierr;

  PetscFunctionBegin;
  if (cusp->coo_p && v) {
    thrust::device_ptr<const PetscScalar> d_v;
    THRUSTARRAY                           *w = NULL;

    if (isCudaMem(v)) {
      d_v = thrust::device_pointer_cast(v); /* already on the device, use in place */
    } else {
      /* stage host values on the device */
      w = new THRUSTARRAY(n);
      w->assign(v,v+n);
      ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
      d_v = w->data();
    }

    /* coo_pw[k] = d_v[coo_p[k]] : apply the partition permutation to the values */
    auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
                                                              cusp->coo_pw->begin()));
    auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
                                                              cusp->coo_pw->end()));
    ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
    thrust::for_each(zibit,zieit,VecCUDAEquals());
    ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
    delete w;
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
  } else {
    /* values were already supplied in partitioned order (or v is NULL) */
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
    ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
  }
  ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
  A->num_ass++;
  A->assembled        = PETSC_TRUE;
  A->ass_nonzerostate = A->nonzerostate;
  A->offloadmask      = PETSC_OFFLOAD_GPU; /* fresh values live on the GPU only */
  PetscFunctionReturn(0);
}

/* Predicate on (i,j,perm) tuples: true when column j lies inside [_cstart,_cend),
   i.e. the entry belongs to the diagonal block. */
template <typename Tuple>
struct IsNotOffDiagT
{
  PetscInt _cstart,_cend;

  IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator()(Tuple t)
  {
    return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
  }
};

/* Predicate on a bare column index: true when it falls outside [_cstart,_cend). */
struct IsOffDiag
{
  PetscInt _cstart,_cend;

  IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
  __host__ __device__
  inline bool operator() (const PetscInt &c)
  {
    return c < _cstart || c >= _cend;
  }
};

/* Shift a global index down by the owning rank's range start -> local index. */
struct GlobToLoc
{
  PetscInt _start;

  GlobToLoc(PetscInt start) : _start(start) {}
  __host__ __device__
  inline PetscInt operator() (const PetscInt &c)
  {
    return c - _start;
  }
};

/* Build the diag/offdiag split of a COO description on the device:
   partition entries so diagonal-block ones come first (recording the permutation
   in cusp->coo_p when a reorder is needed), translate indices to local numbering,
   compact the off-diagonal column ids into b->garray, and create/preallocate the
   two sequential CUSPARSE blocks b->A and b->B. */
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscCount n, const PetscInt coo_i[], const PetscInt coo_j[])
{
  Mat_MPIAIJ             *b    = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode         ierr;
  PetscInt               N,*jj;
  size_t                 noff = 0;
  THRUSTINTARRAY         d_i(n); /* on device, storing partitioned coo_i with diagonal first, and off-diag next */
  THRUSTINTARRAY         d_j(n);
  ISLocalToGlobalMapping l2g;
  cudaError_t            cerr;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* drop any previous block matrices and scatter state before rebuilding */
  if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
  if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = MatDestroy(&b->A);CHKERRQ(ierr);
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);

  ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
  d_i.assign(coo_i,coo_i+n);
  d_j.assign(coo_j,coo_j+n);
  delete cusp->coo_p;
  delete cusp->coo_pw;
  cusp->coo_p  = NULL;
  cusp->coo_pw = NULL;
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
  if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
    /* entries are interleaved -> stable-partition (i,j) with an identity
       permutation alongside, so the same reorder can be replayed on values */
    cusp->coo_p  = new THRUSTINTARRAY(n);
    cusp->coo_pw = new THRUSTARRAY(n);
    thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
    auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
    auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
    auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
    firstoffd = mzipp.get_iterator_tuple().get<1>();
  }
  cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
  cusp->coo_no = thrust::distance(firstoffd,d_j.end());

  /* from global to local */
  thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
  thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);

  /* copy offdiag column indices to map on the CPU */
  ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr); /* jj[] will store compacted col ids of the offdiag part */
  cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  auto o_j = d_j.begin();
  ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
  thrust::advance(o_j,cusp->coo_nd); /* sort and unique offdiag col ids */
  thrust::sort(thrust::device,o_j,d_j.end());
  auto wit = thrust::unique(thrust::device,o_j,d_j.end()); /* return end iter of the unique range */
  ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
  noff = thrust::distance(o_j,wit);
  ierr = PetscMalloc1(noff,&b->garray);CHKERRQ(ierr);
  cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
  /* compress the offdiag global column ids jj[] into 0..noff-1 via a hash l2g map */
  ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
  ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
  ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&N,jj);CHKERRQ(ierr);
  PetscCheck(N == cusp->coo_no,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %" PetscInt_FMT " != %" PetscInt_FMT " coo size",N,cusp->coo_no);
  ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);

  ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
  ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
  ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);

  /* GPU memory, cusparse specific call handles it internally */
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
  ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
  ierr = PetscFree(jj);CHKERRQ(ierr);

  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
  /*
  ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
  */
  ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  B->nonzerostate++;

  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  B->offloadmask   = PETSC_OFFLOAD_CPU;
  B->assembled     = PETSC_FALSE;
  B->was_assembled = PETSC_FALSE;
  PetscFunctionReturn(0);
}

/* Merge the diag and offdiag blocks into one sequential CUSPARSE matrix; when
   requested, also return the global column indices of the merged matrix. */
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
{
  Mat            Ad,Ao;
  const PetscInt *cmap;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
  ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
    ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
    ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
    ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
    for (i=0; i<dn; i++) gidx[i]    = cst + i;   /* owned columns are contiguous */
    for (i=0; i<on; i++) gidx[i+dn] = cmap[i];   /* ghost columns from the map  */
    ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/* Classic d_nz/o_nz preallocation for the MPIAIJCUSPARSE type: (re)creates the
   sequential CUSPARSE blocks b->A/b->B, forwards the per-row counts, and wires
   storage formats and the shared cuSPARSE handle into both blocks. */
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* debug-only sanity checks on the per-row nonzero counts */
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(d_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,d_nnz[i]);
    }
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      PetscCheckFalse(o_nnz[i] < 0,PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT,i,o_nnz[i]);
    }
  }
#if defined(PETSC_USE_CTABLE)
  ierr = PetscTableDestroy(&b->colmap);CHKERRQ(ierr);
#else
  ierr = PetscFree(b->colmap);CHKERRQ(ierr);
#endif
  ierr = PetscFree(b->garray);CHKERRQ(ierr);
  ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
  ierr = VecScatterDestroy(&b->Mvctx);CHKERRQ(ierr);
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  ierr = MatDestroy(&b->B);CHKERRQ(ierr);
  if (!b->A) {
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
  }
  if (!b->B) {
    PetscMPIInt size;
    ierr = MPI_Comm_size(PetscObjectComm((PetscObject)B),&size);CHKERRMPI(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    /* sequential runs have no off-process part: give the offdiag block 0 columns */
    ierr = MatSetSizes(b->B,B->rmap->n,size > 1 ? B->cmap->N : 0,B->rmap->n,size > 1 ? B->cmap->N : 0);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  /* Let A, B use b's handle with pre-set stream
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
  */
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/* yy = A*xx : diagonal-block multiply overlapped with the ghost-value scatter,
   then accumulate the off-diagonal contribution. */
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  /* If A is bound to the CPU, the local vector used in the matrix multiplies should also be bound to the CPU. */
  if (A->boundtocpu) {ierr = VecBindToCPU(a->lvec,PETSC_TRUE);CHKERRQ(ierr);}
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Zero both sequential blocks. */
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = yy + A*xx : same overlap pattern as MatMult. */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->cmap->n,nt);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* yy = A^T*xx : local transpose multiplies, then a reverse-mode additive scatter
   pushes the off-diagonal contributions back to their owners. */
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  PetscCheckFalse(nt != A->rmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%" PetscInt_FMT ") and xx (%" PetscInt_FMT ")",A->rmap->n,nt);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Record the requested cuSPARSE storage format for the diag and/or offdiag block. */
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Handle the -mat_cusparse_*_storage_format options (non-factored matrices only). */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Finish MPI assembly, switch the ghost vector to CUDA, and drop the cached
   split-CSR device matrix when the nonzero pattern changed during assembly. */
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscObjectState   onnz    = A->nonzerostate; /* snapshot before assembly */

  PetscFunctionBegin;
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (mpiaij->lvec) { ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr); }
  if (onnz != A->nonzerostate && cusp->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusp->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Destroy device mat since nonzerostate changed\n");CHKERRQ(ierr);
    /* copy the device struct to the host to recover the embedded device pointers */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
    cusp->deviceMat = NULL;
  }
  PetscFunctionReturn(0);
}

/* Release all CUDA/cuSPARSE resources owned by the type, unregister the composed
   methods, then fall through to the generic MPIAIJ destructor. */
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  PetscCheckFalse(!cusparseStruct,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
  if (cusparseStruct->deviceMat) {
    PetscSplitCSRDataStructure d_mat = cusparseStruct->deviceMat, h_mat;
    cudaError_t                cerr;

    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* mirror the device struct on the host so its device pointers can be freed */
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMemcpy(h_mat,d_mat,sizeof(*d_mat),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
    cerr = cudaFree(h_mat->colmap);CHKERRCUDA(cerr);
    if (h_mat->allocated_indices) {
      cerr = cudaFree(h_mat->diag.i);CHKERRCUDA(cerr);
      cerr = cudaFree(h_mat->diag.j);CHKERRCUDA(cerr);
      if (h_mat->offdiag.j) {
        cerr = cudaFree(h_mat->offdiag.i);CHKERRCUDA(cerr);
        cerr = cudaFree(h_mat->offdiag.j);CHKERRCUDA(cerr);
      }
    }
    cerr = cudaFree(d_mat);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    /* We want cusparseStruct to use PetscDefaultCudaStream
    if (cusparseStruct->stream) {
      cudaError_t err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    */
    delete cusparseStruct->coo_p;
    delete cusparseStruct->coo_pw;
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Convert an MPIAIJ matrix into MPIAIJCUSPARSE: switch sub-matrices and ghost
   vector to CUDA types, allocate the per-matrix cuSPARSE context on first use,
   and install the CUSPARSE-specific operations and composed methods. */
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat; /* MAT_INPLACE_MATRIX falls through with *newmat == B */
  A->boundtocpu = PETSC_FALSE;
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
  if (a->lvec) {
    ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->coo_p               = NULL;
    cusparseStruct->coo_pw              = NULL;
    cusparseStruct->stream              = 0;
    cusparseStruct->deviceMat           = NULL;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
  }

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
#if defined(PETSC_HAVE_HYPRE)
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatConvert_mpiaijcusparse_hypre_C",MatConvert_AIJ_HYPRE);CHKERRQ(ierr);
#endif
  PetscFunctionReturn(0);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
5523338378cSStefano Zampini PetscErrorCode ierr; 5533338378cSStefano Zampini 5543338378cSStefano Zampini PetscFunctionBegin; 555a4af0ceeSJacob Faibussowitsch ierr = PetscDeviceInitialize(PETSC_DEVICE_CUDA);CHKERRQ(ierr); 5563338378cSStefano Zampini ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr); 5573338378cSStefano Zampini ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr); 5583338378cSStefano Zampini PetscFunctionReturn(0); 5593338378cSStefano Zampini } 5603338378cSStefano Zampini 5613f9c0db1SPaul Mullowney /*@ 5623f9c0db1SPaul Mullowney MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format 563e057df02SPaul Mullowney (the default parallel PETSc format). This matrix will ultimately pushed down 5643f9c0db1SPaul Mullowney to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix 565e057df02SPaul Mullowney assembly performance the user should preallocate the matrix storage by setting 566e057df02SPaul Mullowney the parameter nz (or the array nnz). By setting these parameters accurately, 567e057df02SPaul Mullowney performance during matrix assembly can be increased by more than a factor of 50. 5689ae82921SPaul Mullowney 569d083f849SBarry Smith Collective 570e057df02SPaul Mullowney 571e057df02SPaul Mullowney Input Parameters: 572e057df02SPaul Mullowney + comm - MPI communicator, set to PETSC_COMM_SELF 573e057df02SPaul Mullowney . m - number of rows 574e057df02SPaul Mullowney . n - number of columns 575e057df02SPaul Mullowney . nz - number of nonzeros per row (same for all rows) 576e057df02SPaul Mullowney - nnz - array containing the number of nonzeros in the various rows 5770298fd71SBarry Smith (possibly different for each row) or NULL 578e057df02SPaul Mullowney 579e057df02SPaul Mullowney Output Parameter: 580e057df02SPaul Mullowney . 
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (respectively o_nnz) is given then d_nz (respectively o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either d_nz/o_nz or d_nnz/o_nnz (not both).
   Set d_nz=PETSC_DEFAULT and d_nnz=NULL (and likewise for the o_ variants) for
   PETSc to control dynamic memory allocation. For large problems you MUST
   preallocate memory or you will get TERRIBLE performance, see the users'
   manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
  /* Pick the parallel or sequential CUSPARSE type to match the communicator */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATMPIAIJCUSPARSE.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is as same as MATAIJCUSPARSE.

   Level: beginner

.seealso: MATAIJCUSPARSE, MATSEQAIJCUSPARSE
M*/

// get GPU pointers to stripped down Mat. For both seq and MPI Mat.
//
// Builds (or reuses) a PetscSplitCSRDataStructure describing the diagonal
// (and, in parallel, off-diagonal) CSR blocks of an assembled AIJCUSPARSE
// matrix directly on the device, for use in device-side assembly.  On exit
// *B points to the device-resident descriptor (NULL for factored matrices)
// and the matrix offload mask is set to GPU since the caller may write the
// device values.  With 64-bit PetscInt the cuSPARSE 32-bit row/column index
// arrays are widened into freshly allocated device copies, which the
// descriptor then owns (allocated_indices = PETSC_TRUE).
PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure *B)
{
  PetscSplitCSRDataStructure d_mat;
  PetscMPIInt                size;
  PetscErrorCode             ierr;
  int                        *ai = NULL,*bi = NULL,*aj = NULL,*bj = NULL;
  PetscScalar                *aa = NULL,*ba = NULL;
  Mat_SeqAIJ                 *jaca = NULL, *jacb = NULL;
  Mat_SeqAIJCUSPARSE         *cusparsestructA = NULL;
  CsrMatrix                  *matrixA = NULL,*matrixB = NULL;

  PetscFunctionBegin;
  PetscCheckFalse(!A->assembled,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Need already assembled matrix");
  if (A->factortype != MAT_FACTOR_NONE) {
    *B = NULL;
    PetscFunctionReturn(0);
  }
  ierr = MPI_Comm_size(PetscObjectComm((PetscObject)A),&size);CHKERRMPI(ierr);
  // get jaca: locate the (local) diagonal block and its cuSPARSE struct
  if (size == 1) {
    PetscBool isseqaij;

    ierr = PetscObjectBaseTypeCompare((PetscObject)A,MATSEQAIJ,&isseqaij);CHKERRQ(ierr);
    if (isseqaij) {
      jaca = (Mat_SeqAIJ*)A->data;
      PetscCheckFalse(!jaca->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)A->spptr;
      d_mat = cusparsestructA->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(A);CHKERRQ(ierr);
    } else {
      // an MPIAIJ matrix on a one-process communicator: use its diagonal block
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
      Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
      jaca = (Mat_SeqAIJ*)aij->A->data;
      cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
      d_mat = spptr->deviceMat;
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
    }
    if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
      Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      PetscCheckFalse(!matstruct,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      matrixA = (CsrMatrix*)matstruct->mat;
      bi = NULL;    // no off-diagonal block in the sequential case
      bj = NULL;
      ba = NULL;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
  } else {
    Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
    PetscCheckFalse(!aij->roworiented,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support column oriented values insertion");
    jaca = (Mat_SeqAIJ*)aij->A->data;
    jacb = (Mat_SeqAIJ*)aij->B->data;
    Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;

    PetscCheckFalse(!A->nooffprocentries && !aij->donotstash,PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device assembly does not currently support offproc values insertion. Use MatSetOption(A,MAT_NO_OFF_PROC_ENTRIES,PETSC_TRUE) or MatSetOption(A,MAT_IGNORE_OFF_PROC_ENTRIES,PETSC_TRUE)");
    cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
    Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
    PetscCheckFalse(cusparsestructA->format!=MAT_CUSPARSE_CSR,PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
    if (cusparsestructB->format==MAT_CUSPARSE_CSR) {
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->A);CHKERRQ(ierr);
      ierr = MatSeqAIJCUSPARSECopyToGPU(aij->B);CHKERRQ(ierr);
      Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
      Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
      PetscCheckFalse(!matstructA,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for A");
      PetscCheckFalse(!matstructB,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Missing Mat_SeqAIJCUSPARSEMultStruct for B");
      matrixA = (CsrMatrix*)matstructA->mat;
      matrixB = (CsrMatrix*)matstructB->mat;
      if (jacb->compressedrow.use) {
        // lazily build full (uncompressed) row offsets on the device for B
        if (!cusparsestructB->rowoffsets_gpu) {
          cusparsestructB->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
          cusparsestructB->rowoffsets_gpu->assign(jacb->i,jacb->i+A->rmap->n+1);
        }
        bi = thrust::raw_pointer_cast(cusparsestructB->rowoffsets_gpu->data());
      } else {
        bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
      }
      bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
      ba = thrust::raw_pointer_cast(matrixB->values->data());
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
    d_mat = spptr->deviceMat;
  }
  if (jaca->compressedrow.use) {
    // lazily build full (uncompressed) row offsets on the device for A
    if (!cusparsestructA->rowoffsets_gpu) {
      cusparsestructA->rowoffsets_gpu = new THRUSTINTARRAY32(A->rmap->n+1);
      cusparsestructA->rowoffsets_gpu->assign(jaca->i,jaca->i+A->rmap->n+1);
    }
    ai = thrust::raw_pointer_cast(cusparsestructA->rowoffsets_gpu->data());
  } else {
    ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
  }
  aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
  aa = thrust::raw_pointer_cast(matrixA->values->data());

  if (!d_mat) {
    cudaError_t cerr;
    PetscSplitCSRDataStructure h_mat;

    // create and populate struct on host and copy to device
    ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
    ierr = PetscNew(&h_mat);CHKERRQ(ierr);
    cerr = cudaMalloc((void**)&d_mat,sizeof(*d_mat));CHKERRCUDA(cerr);
    if (size > 1) { /* need the colmap array */
      Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data;
      PetscInt   *colmap;
      PetscInt   ii,n = aij->B->cmap->n,N = A->cmap->N;

      PetscCheckFalse(n && !aij->garray,PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");

      // colmap maps a global column to (local off-diag column + 1); 0 means absent
      ierr = PetscCalloc1(N+1,&colmap);CHKERRQ(ierr);
      for (ii=0; ii<n; ii++) colmap[aij->garray[ii]] = (int)(ii+1);
#if defined(PETSC_USE_64BIT_INDICES)
      { // have to make a long version of these: cuSPARSE stores 32-bit indices
        int      *h_bi32, *h_bj32;
        PetscInt *h_bi64, *h_bj64, *d_bi64, *d_bj64;
        ierr = PetscCalloc4(A->rmap->n+1,&h_bi32,jacb->nz,&h_bj32,A->rmap->n+1,&h_bi64,jacb->nz,&h_bj64);CHKERRQ(ierr);
        cerr = cudaMemcpy(h_bi32, bi, (A->rmap->n+1)*sizeof(*h_bi32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<A->rmap->n+1;i++) h_bi64[i] = h_bi32[i];
        cerr = cudaMemcpy(h_bj32, bj, jacb->nz*sizeof(*h_bj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
        for (int i=0;i<jacb->nz;i++) h_bj64[i] = h_bj32[i];

        cerr = cudaMalloc((void**)&d_bi64,(A->rmap->n+1)*sizeof(*d_bi64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bi64, h_bi64,(A->rmap->n+1)*sizeof(*d_bi64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
        cerr = cudaMalloc((void**)&d_bj64,jacb->nz*sizeof(*d_bj64));CHKERRCUDA(cerr);
        cerr = cudaMemcpy(d_bj64, h_bj64,jacb->nz*sizeof(*d_bj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

        h_mat->offdiag.i = d_bi64;
        h_mat->offdiag.j = d_bj64;
        h_mat->allocated_indices = PETSC_TRUE;   // descriptor owns these copies

        ierr = PetscFree4(h_bi32,h_bj32,h_bi64,h_bj64);CHKERRQ(ierr);
      }
#else
      h_mat->offdiag.i = (PetscInt*)bi;          // borrow cuSPARSE's arrays directly
      h_mat->offdiag.j = (PetscInt*)bj;
      h_mat->allocated_indices = PETSC_FALSE;
#endif
      h_mat->offdiag.a = ba;
      h_mat->offdiag.n = A->rmap->n;

      cerr = cudaMalloc((void**)&h_mat->colmap,(N+1)*sizeof(*h_mat->colmap));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(h_mat->colmap,colmap,(N+1)*sizeof(*h_mat->colmap),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      ierr = PetscFree(colmap);CHKERRQ(ierr);
    }
    h_mat->rstart = A->rmap->rstart;
    h_mat->rend   = A->rmap->rend;
    h_mat->cstart = A->cmap->rstart;
    h_mat->cend   = A->cmap->rend;
    h_mat->M      = A->cmap->N;
#if defined(PETSC_USE_64BIT_INDICES)
    {
      // fixed: message typo ("pof") and %d fed a size_t (UB in varargs) — cast to int
      PetscCheckFalse(sizeof(PetscInt) != 8,PETSC_COMM_SELF,PETSC_ERR_PLIB,"size of PetscInt = %d",(int)sizeof(PetscInt));
      int      *h_ai32, *h_aj32;
      PetscInt *h_ai64, *h_aj64, *d_ai64, *d_aj64;
      ierr = PetscCalloc4(A->rmap->n+1,&h_ai32,jaca->nz,&h_aj32,A->rmap->n+1,&h_ai64,jaca->nz,&h_aj64);CHKERRQ(ierr);
      cerr = cudaMemcpy(h_ai32, ai, (A->rmap->n+1)*sizeof(*h_ai32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<A->rmap->n+1;i++) h_ai64[i] = h_ai32[i];
      cerr = cudaMemcpy(h_aj32, aj, jaca->nz*sizeof(*h_aj32),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
      for (int i=0;i<jaca->nz;i++) h_aj64[i] = h_aj32[i];

      cerr = cudaMalloc((void**)&d_ai64,(A->rmap->n+1)*sizeof(*d_ai64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_ai64, h_ai64,(A->rmap->n+1)*sizeof(*d_ai64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
      cerr = cudaMalloc((void**)&d_aj64,jaca->nz*sizeof(*d_aj64));CHKERRCUDA(cerr);
      cerr = cudaMemcpy(d_aj64, h_aj64,jaca->nz*sizeof(*d_aj64),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

      h_mat->diag.i = d_ai64;
      h_mat->diag.j = d_aj64;
      h_mat->allocated_indices = PETSC_TRUE;

      ierr = PetscFree4(h_ai32,h_aj32,h_ai64,h_aj64);CHKERRQ(ierr);
    }
#else
    h_mat->diag.i = (PetscInt*)ai;
    h_mat->diag.j = (PetscInt*)aj;
    h_mat->allocated_indices = PETSC_FALSE;
#endif
    h_mat->diag.a = aa;
    h_mat->diag.n = A->rmap->n;
    h_mat->rank   = PetscGlobalRank;
    // copy pointers and metadata to device
    cerr = cudaMemcpy(d_mat,h_mat,sizeof(*d_mat),cudaMemcpyHostToDevice);CHKERRCUDA(cerr);
    ierr = PetscFree(h_mat);CHKERRQ(ierr);
  } else {
    ierr = PetscInfo(A,"Reusing device matrix\n");CHKERRQ(ierr);
  }
  *B = d_mat;
  A->offloadmask = PETSC_OFFLOAD_GPU;  // caller gets write access on the device
  PetscFunctionReturn(0);
}