#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

/*
   Preallocation for MATMPIAIJCUSPARSE: validates the per-row nonzero counts,
   (lazily) creates the diagonal (b->A) and off-diagonal (b->B) blocks as
   MATSEQAIJCUSPARSE matrices, preallocates them, and propagates the storage
   format, cuSPARSE handle, and CUDA stream from the parallel matrix onto both
   sequential blocks.
*/
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*
   yy = A*xx.  The forward scatter of ghost values into a->lvec is started,
   the local diagonal-block product is computed while the scatter is in
   flight, then the off-diagonal block contribution is added after the
   scatter completes.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Zeros all entries.  If a split-CSR device mirror (spptr->deviceMat) exists,
   its diagonal and off-diagonal value arrays are zeroed directly on the GPU
   (the device struct holds device pointers, so the pointers themselves are
   first copied back to the host before cudaMemset).  The host-side blocks
   l->A and l->B are then zeroed through the usual MatZeroEntries path.
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device-side value-array pointer, then zero it in place on the GPU */
      err  = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,(nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err  = cudaMemcpy(&vals,&d_mat->offdiag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset(vals,0,(nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* zz = A*xx + yy, overlapping the ghost-value scatter with the local product as in MatMult */
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   yy = A^T*xx.  Both local transpose products are computed first, then the
   off-diagonal contribution in a->lvec is accumulated into yy with a
   reverse-mode ADD_VALUES scatter.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Records the requested cuSPARSE storage format for the diagonal and/or
   off-diagonal block; the format is applied to the sequential blocks at
   preallocation time.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* error message now matches the case labels handled above */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/* Processes -mat_cusparse_[mult_diag_|mult_offdiag_]storage_format options (ignored for factored matrices) */
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Standard MPIAIJ assembly plus GPU bookkeeping: the ghost vector lvec is
   switched to VECSEQCUDA on first final assembly, and if the device mirror's
   nonzerostate is ahead of the host's the offload mask is set to GPU
   (values were assembled on the device).
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij         = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat          = cusparseStruct->deviceMat;
  PetscInt                   nnz_state       = A->nonzerostate;

  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t err;
    err = cudaMemcpy(&nnz_state,&d_mat->nonzerostate,sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; /* if we assembled on the device */
  }
  PetscFunctionReturn(0);
}

/*
   Frees all GPU resources (the split-CSR device mirror and any device
   arrays it owns, the cuSPARSE handle, and the CUDA stream), clears the
   composed functions, and falls through to MatDestroy_MPIAIJ.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca  = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb  = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* copy the struct back so the device pointers it holds can be freed */
    err  = cudaMemcpy(&h_mat,d_mat,sizeof(PetscSplitCSRDataStructure),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* row-offset arrays were only device-allocated in the compressed-row case */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Converts a MATMPIAIJ matrix to MATMPIAIJCUSPARSE: switches the default
   vector type to VECCUDA, allocates the Mat_MPIAIJCUSPARSE context (with a
   fresh cuSPARSE handle) when needed, and installs the CUSPARSE operation
   table and composed functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;   /* MAT_INPLACE_MATRIX leaves *newmat == B */

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr       = new Mat_MPIAIJCUSPARSE;
    cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/* Constructor for -mat_type mpiaijcusparse: creates an MPIAIJ matrix and converts it in place */
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz (or d_nnz) and o_nz (or o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal portion (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the diagonal portion of the various rows
           (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal portion (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the off-diagonal portion of the various rows
           (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (resp. o_nnz) is given then d_nz (resp. o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
403*3fa6b06aSMark Adams PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 404*3fa6b06aSMark Adams { 405*3fa6b06aSMark Adams PetscSplitCSRDataStructure **p_d_mat; 406*3fa6b06aSMark Adams PetscMPIInt size,rank; 407*3fa6b06aSMark Adams MPI_Comm comm; 408*3fa6b06aSMark Adams PetscErrorCode ierr; 409*3fa6b06aSMark Adams int *ai,*bi,*aj,*bj; 410*3fa6b06aSMark Adams PetscScalar *aa,*ba; 411*3fa6b06aSMark Adams 412*3fa6b06aSMark Adams PetscFunctionBegin; 413*3fa6b06aSMark Adams ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 414*3fa6b06aSMark Adams ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 415*3fa6b06aSMark Adams ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 416*3fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 417*3fa6b06aSMark Adams CsrMatrix *matrixA,*matrixB=NULL; 418*3fa6b06aSMark Adams if (size == 1) { 419*3fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 420*3fa6b06aSMark Adams p_d_mat = &cusparsestruct->deviceMat; 421*3fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 422*3fa6b06aSMark Adams if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 423*3fa6b06aSMark Adams matrixA = (CsrMatrix*)matstruct->mat; 424*3fa6b06aSMark Adams bi = bj = NULL; ba = NULL; 425*3fa6b06aSMark Adams } else { 426*3fa6b06aSMark Adams SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 427*3fa6b06aSMark Adams } 428*3fa6b06aSMark Adams } else { 429*3fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 430*3fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 431*3fa6b06aSMark Adams p_d_mat = &spptr->deviceMat; 432*3fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr; 433*3fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 434*3fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstructA = 
(Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 435*3fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 436*3fa6b06aSMark Adams if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 437*3fa6b06aSMark Adams if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 438*3fa6b06aSMark Adams matrixA = (CsrMatrix*)matstructA->mat; 439*3fa6b06aSMark Adams matrixB = (CsrMatrix*)matstructB->mat; 440*3fa6b06aSMark Adams bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 441*3fa6b06aSMark Adams bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 442*3fa6b06aSMark Adams ba = thrust::raw_pointer_cast(matrixB->values->data()); 443*3fa6b06aSMark Adams if (rank==-1) { 444*3fa6b06aSMark Adams for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++) 445*3fa6b06aSMark Adams std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl; 446*3fa6b06aSMark Adams for(unsigned int i = 0; i < matrixB->column_indices->size(); i++) 447*3fa6b06aSMark Adams std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl; 448*3fa6b06aSMark Adams for(unsigned int i = 0; i < matrixB->values->size(); i++) 449*3fa6b06aSMark Adams std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl; 450*3fa6b06aSMark Adams } 451*3fa6b06aSMark Adams } else { 452*3fa6b06aSMark Adams SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 453*3fa6b06aSMark Adams } 454*3fa6b06aSMark Adams } 455*3fa6b06aSMark Adams ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 456*3fa6b06aSMark Adams aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 457*3fa6b06aSMark Adams aa = thrust::raw_pointer_cast(matrixA->values->data()); 458*3fa6b06aSMark Adams } else { 459*3fa6b06aSMark Adams *B = NULL; 460*3fa6b06aSMark Adams PetscFunctionReturn(0); 
461*3fa6b06aSMark Adams } 462*3fa6b06aSMark Adams // act like MatSetValues because not called on host 463*3fa6b06aSMark Adams if (A->assembled) { 464*3fa6b06aSMark Adams if (A->was_assembled) { 465*3fa6b06aSMark Adams ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 466*3fa6b06aSMark Adams } 467*3fa6b06aSMark Adams A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 468*3fa6b06aSMark Adams } else { 469*3fa6b06aSMark Adams SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 470*3fa6b06aSMark Adams } 471*3fa6b06aSMark Adams if (!*p_d_mat) { 472*3fa6b06aSMark Adams cudaError_t err; 473*3fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat, h_mat; 474*3fa6b06aSMark Adams Mat_SeqAIJ *jaca; 475*3fa6b06aSMark Adams PetscInt n = A->rmap->n, nnz; 476*3fa6b06aSMark Adams // create and copy 477*3fa6b06aSMark Adams ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 478*3fa6b06aSMark Adams err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 479*3fa6b06aSMark Adams err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 480*3fa6b06aSMark Adams *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up 481*3fa6b06aSMark Adams if (size == 1) { 482*3fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)A->data; 483*3fa6b06aSMark Adams h_mat.nonzerostate = A->nonzerostate; 484*3fa6b06aSMark Adams h_mat.rstart = 0; h_mat.rend = A->rmap->n; 485*3fa6b06aSMark Adams h_mat.cstart = 0; h_mat.cend = A->cmap->n; 486*3fa6b06aSMark Adams h_mat.offdiag.i = h_mat.offdiag.j = NULL; 487*3fa6b06aSMark Adams h_mat.offdiag.a = NULL; 488*3fa6b06aSMark Adams h_mat.seq = PETSC_TRUE; 489*3fa6b06aSMark Adams } else { 490*3fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 491*3fa6b06aSMark Adams Mat_SeqAIJ *jacb; 492*3fa6b06aSMark Adams h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE 493*3fa6b06aSMark Adams jaca = 
(Mat_SeqAIJ*)aij->A->data; 494*3fa6b06aSMark Adams jacb = (Mat_SeqAIJ*)aij->B->data; 495*3fa6b06aSMark Adams h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state? 496*3fa6b06aSMark Adams if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 497*3fa6b06aSMark Adams if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 498*3fa6b06aSMark Adams // create colmap - this is ussually done (lazy) in MatSetValues 499*3fa6b06aSMark Adams aij->donotstash = PETSC_TRUE; 500*3fa6b06aSMark Adams aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 501*3fa6b06aSMark Adams jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 502*3fa6b06aSMark Adams #if defined(PETSC_USE_CTABLE) 503*3fa6b06aSMark Adams SETERRQ(comm,PETSC_ERR_SUP,"Devioce metadata does not support ctable (--with-ctable=0)"); 504*3fa6b06aSMark Adams #else 505*3fa6b06aSMark Adams ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 506*3fa6b06aSMark Adams aij->colmap[A->cmap->N] = -9; 507*3fa6b06aSMark Adams ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 508*3fa6b06aSMark Adams { 509*3fa6b06aSMark Adams PetscInt ii; 510*3fa6b06aSMark Adams for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 511*3fa6b06aSMark Adams } 512*3fa6b06aSMark Adams if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 513*3fa6b06aSMark Adams #endif 514*3fa6b06aSMark Adams // allocate B copy data 515*3fa6b06aSMark Adams h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 516*3fa6b06aSMark Adams h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 517*3fa6b06aSMark Adams nnz = jacb->i[n]; 518*3fa6b06aSMark Adams 519*3fa6b06aSMark Adams if (jacb->compressedrow.use) { 520*3fa6b06aSMark Adams err = cudaMalloc((void 
**)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 521*3fa6b06aSMark Adams err = cudaMemcpy( h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 522*3fa6b06aSMark Adams } else { 523*3fa6b06aSMark Adams h_mat.offdiag.i = bi; 524*3fa6b06aSMark Adams } 525*3fa6b06aSMark Adams h_mat.offdiag.j = bj; 526*3fa6b06aSMark Adams h_mat.offdiag.a = ba; 527*3fa6b06aSMark Adams 528*3fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 529*3fa6b06aSMark Adams err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 530*3fa6b06aSMark Adams h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 531*3fa6b06aSMark Adams h_mat.offdiag.n = n; 532*3fa6b06aSMark Adams } 533*3fa6b06aSMark Adams // allocate A copy data 534*3fa6b06aSMark Adams nnz = jaca->i[n]; 535*3fa6b06aSMark Adams h_mat.diag.n = n; 536*3fa6b06aSMark Adams h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 537*3fa6b06aSMark Adams ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr); 538*3fa6b06aSMark Adams if (jaca->compressedrow.use) { 539*3fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 540*3fa6b06aSMark Adams err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 541*3fa6b06aSMark Adams } else { 542*3fa6b06aSMark Adams h_mat.diag.i = ai; 543*3fa6b06aSMark Adams } 544*3fa6b06aSMark Adams h_mat.diag.j = aj; 545*3fa6b06aSMark Adams h_mat.diag.a = aa; 546*3fa6b06aSMark Adams // copy pointers and metdata to device 547*3fa6b06aSMark Adams err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err); 548*3fa6b06aSMark Adams ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 549*3fa6b06aSMark Adams } else { 550*3fa6b06aSMark Adams *B = 
*p_d_mat; 551*3fa6b06aSMark Adams } 552*3fa6b06aSMark Adams A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues 553*3fa6b06aSMark Adams PetscFunctionReturn(0); 554*3fa6b06aSMark Adams } 555