#define PETSC_SKIP_SPINLOCK
#define PETSC_SKIP_CXX_COMPLEX_FIX
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>

/*
   Preallocation for MATMPIAIJCUSPARSE: validates d_nnz/o_nnz, creates the two
   sequential (diagonal and off-diagonal) blocks as MATSEQAIJCUSPARSE on first
   use, preallocates them, and pushes the stored GPU format/handle/stream down
   to both blocks.
*/
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
  PetscErrorCode     ierr;
  PetscInt           i;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* negative preallocation counts are user errors; report the offending row */
  if (d_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
    }
  }
  if (o_nnz) {
    for (i=0; i<B->rmap->n; i++) {
      if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create 2 MATSEQAIJCUSPARSE matrices for the diagonal (b->A)
       and off-diagonal (b->B) local blocks. */
    ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* propagate the formats chosen at the MPI level down to the sequential blocks */
  ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);

  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}

/*
   yy = A*xx for the MPI matrix: overlaps the scatter of the off-process part
   of xx into a->lvec with the diagonal-block multiply, then adds the
   off-diagonal contribution.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);              /* diagonal block while scatter is in flight */
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);   /* add off-diagonal contribution */
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Zeroes the matrix values. If a split-CSR device mirror exists (assembled on
   the GPU), its diagonal and off-diagonal value arrays are zeroed directly
   with cudaMemset; the host-side sequential blocks are always zeroed too.
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device value-array pointers from the device-resident struct,
         then zero the arrays in place on the GPU */
      err  = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
      err  = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err  = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);

  PetscFunctionReturn(0);
}

/*
   zz = yy + A*xx, with the same scatter/compute overlap as MatMult above.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   yy = A^T*xx: both local transpose multiplies run first, then the
   off-diagonal contribution is scattered back (reverse scatter, ADD_VALUES).
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nt;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
  ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
  ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Records the requested cuSPARSE storage format for the diagonal block, the
   off-diagonal block, or both; it is applied when the blocks are set up.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    /* message previously listed MAT_CUSPARSE_MULT_DIAG twice and a nonexistent
       MAT_CUSPARSE_MULT_ALL; corrected to the actual supported enum values */
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}

/*
   Processes -mat_cusparse_[mult_diag_|mult_offdiag_]storage_format options
   for non-factored matrices.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat format;
  PetscErrorCode           ierr;
  PetscBool                flg;
  Mat_MPIAIJ               *a              = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
    }
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
    if (flg) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Standard MPIAIJ assembly plus: makes lvec a CUDA vector on first final
   assembly, and, if a device mirror exists whose nonzerostate advanced past
   the host's, marks the matrix as resident on the GPU.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij         = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat          = cusparseStruct->deviceMat;
  PetscInt                   nnz_state       = A->nonzerostate;
  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t err;
    /* read the nonzerostate recorded on the device before host assembly updates it */
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}

/*
   Destroys the CUSPARSE-specific data: frees the device split-CSR mirror (and
   the buffers it owns), clears the sequential blocks' handles, destroys the
   cuSPARSE handle and optional stream, then falls through to the MPIAIJ
   destroy.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca  = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb  = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    err  = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* row-offset copies were only allocated when the host used compressed rows */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   Converts an MPIAIJ matrix to MPIAIJCUSPARSE: duplicates/copies as requested,
   switches the default vector type to CUDA, allocates and initializes the
   Mat_MPIAIJCUSPARSE context when needed, and installs the CUSPARSE-specific
   operations and composed functions.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  A = *newmat;

  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat           = NULL;
  }

  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
   Constructor registered for MATMPIAIJCUSPARSE: builds an MPIAIJ matrix and
   converts it in place to the CUSPARSE type (after checking CUDA is usable).
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format). This matrix will ultimately pushed down
   to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
   assembly performance the user should preallocate the matrix storage by setting
   the parameters d_nz/o_nz (or the arrays d_nnz/o_nnz). By setting these parameters
   accurately, performance during matrix assembly can be increased by more than a
   factor of 50.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or PETSC_DECIDE)
.  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
.  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           diagonal block (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the
           off-diagonal block (possibly different for each row) or NULL

   Output Parameter:
.  A - the matrix

   It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]

   Notes:
   If d_nnz (o_nnz) is given then d_nz (o_nz) is ignored

   The AIJ format (also called the Yale sparse matrix format or
   compressed row storage), is fully compatible with standard Fortran 77
   storage. That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero. See the users' manual for details.

   Specify the preallocated storage with either nz or nnz (not both).
   Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
   allocation. For large problems you MUST preallocate memory or you
   will get TERRIBLE performance, see the users' manual chapter on matrices.

   By default, this format uses inodes (identical nodes) when possible, to
   improve numerical efficiency of matrix-vector products and solves. We
   search for consecutive rows with the same nonzero structure, thereby
   reusing matrix information to achieve increased efficiency.

   Level: intermediate

.seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    size;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* the sequential type is used on a single process so MatSeqAIJSetPreallocation applies */
  if (size > 1) {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}

/*MC
   MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.

   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.

   This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
   and MATMPIAIJCUSPARSE otherwise. As a result, for single process communicators,
   MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
   for communicators controlling multiple processes. It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/

// get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
4033fa6b06aSMark Adams PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B) 4043fa6b06aSMark Adams { 405*9db3cbf9SStefano Zampini #if defined(PETSC_USE_CTABLE) 406*9db3cbf9SStefano Zampini SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)"); 407*9db3cbf9SStefano Zampini #else 4083fa6b06aSMark Adams PetscSplitCSRDataStructure **p_d_mat; 4093fa6b06aSMark Adams PetscMPIInt size,rank; 4103fa6b06aSMark Adams MPI_Comm comm; 4113fa6b06aSMark Adams PetscErrorCode ierr; 4123fa6b06aSMark Adams int *ai,*bi,*aj,*bj; 4133fa6b06aSMark Adams PetscScalar *aa,*ba; 4143fa6b06aSMark Adams 4153fa6b06aSMark Adams PetscFunctionBegin; 4163fa6b06aSMark Adams ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr); 4173fa6b06aSMark Adams ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr); 4183fa6b06aSMark Adams ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr); 4193fa6b06aSMark Adams if (A->factortype == MAT_FACTOR_NONE) { 4203fa6b06aSMark Adams CsrMatrix *matrixA,*matrixB=NULL; 4213fa6b06aSMark Adams if (size == 1) { 4223fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr; 4233fa6b06aSMark Adams p_d_mat = &cusparsestruct->deviceMat; 4243fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat; 4253fa6b06aSMark Adams if (cusparsestruct->format==MAT_CUSPARSE_CSR) { 4263fa6b06aSMark Adams matrixA = (CsrMatrix*)matstruct->mat; 4273fa6b06aSMark Adams bi = bj = NULL; ba = NULL; 4283fa6b06aSMark Adams } else { 4293fa6b06aSMark Adams SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR"); 4303fa6b06aSMark Adams } 4313fa6b06aSMark Adams } else { 4323fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 4333fa6b06aSMark Adams Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr; 4343fa6b06aSMark Adams p_d_mat = &spptr->deviceMat; 4353fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestructA = 
(Mat_SeqAIJCUSPARSE*)aij->A->spptr; 4363fa6b06aSMark Adams Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr; 4373fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat; 4383fa6b06aSMark Adams Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat; 4393fa6b06aSMark Adams if (cusparsestructA->format==MAT_CUSPARSE_CSR) { 4403fa6b06aSMark Adams if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR"); 4413fa6b06aSMark Adams matrixA = (CsrMatrix*)matstructA->mat; 4423fa6b06aSMark Adams matrixB = (CsrMatrix*)matstructB->mat; 4433fa6b06aSMark Adams bi = thrust::raw_pointer_cast(matrixB->row_offsets->data()); 4443fa6b06aSMark Adams bj = thrust::raw_pointer_cast(matrixB->column_indices->data()); 4453fa6b06aSMark Adams ba = thrust::raw_pointer_cast(matrixB->values->data()); 4463fa6b06aSMark Adams if (rank==-1) { 4473fa6b06aSMark Adams for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++) 4483fa6b06aSMark Adams std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl; 4493fa6b06aSMark Adams for(unsigned int i = 0; i < matrixB->column_indices->size(); i++) 4503fa6b06aSMark Adams std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl; 4513fa6b06aSMark Adams for(unsigned int i = 0; i < matrixB->values->size(); i++) 4523fa6b06aSMark Adams std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl; 4533fa6b06aSMark Adams } 4543fa6b06aSMark Adams } else { 4553fa6b06aSMark Adams SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR"); 4563fa6b06aSMark Adams } 4573fa6b06aSMark Adams } 4583fa6b06aSMark Adams ai = thrust::raw_pointer_cast(matrixA->row_offsets->data()); 4593fa6b06aSMark Adams aj = thrust::raw_pointer_cast(matrixA->column_indices->data()); 4603fa6b06aSMark Adams aa = 
thrust::raw_pointer_cast(matrixA->values->data()); 4613fa6b06aSMark Adams } else { 4623fa6b06aSMark Adams *B = NULL; 4633fa6b06aSMark Adams PetscFunctionReturn(0); 4643fa6b06aSMark Adams } 4653fa6b06aSMark Adams // act like MatSetValues because not called on host 4663fa6b06aSMark Adams if (A->assembled) { 4673fa6b06aSMark Adams if (A->was_assembled) { 4683fa6b06aSMark Adams ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr); 4693fa6b06aSMark Adams } 4703fa6b06aSMark Adams A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here? 4713fa6b06aSMark Adams } else { 4723fa6b06aSMark Adams SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix"); 4733fa6b06aSMark Adams } 4743fa6b06aSMark Adams if (!*p_d_mat) { 4753fa6b06aSMark Adams cudaError_t err; 4763fa6b06aSMark Adams PetscSplitCSRDataStructure *d_mat, h_mat; 4773fa6b06aSMark Adams Mat_SeqAIJ *jaca; 4783fa6b06aSMark Adams PetscInt n = A->rmap->n, nnz; 4793fa6b06aSMark Adams // create and copy 4803fa6b06aSMark Adams ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr); 4813fa6b06aSMark Adams err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 4823fa6b06aSMark Adams err = cudaMemset( d_mat, 0, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err); 4833fa6b06aSMark Adams *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up 4843fa6b06aSMark Adams if (size == 1) { 4853fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)A->data; 4863fa6b06aSMark Adams h_mat.nonzerostate = A->nonzerostate; 4873fa6b06aSMark Adams h_mat.rstart = 0; h_mat.rend = A->rmap->n; 4883fa6b06aSMark Adams h_mat.cstart = 0; h_mat.cend = A->cmap->n; 4893fa6b06aSMark Adams h_mat.offdiag.i = h_mat.offdiag.j = NULL; 4903fa6b06aSMark Adams h_mat.offdiag.a = NULL; 4913fa6b06aSMark Adams h_mat.seq = PETSC_TRUE; 4923fa6b06aSMark Adams } else { 4933fa6b06aSMark Adams Mat_MPIAIJ *aij = (Mat_MPIAIJ*)A->data; 4943fa6b06aSMark Adams 
Mat_SeqAIJ *jacb; 4953fa6b06aSMark Adams h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE 4963fa6b06aSMark Adams jaca = (Mat_SeqAIJ*)aij->A->data; 4973fa6b06aSMark Adams jacb = (Mat_SeqAIJ*)aij->B->data; 4983fa6b06aSMark Adams h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state? 4993fa6b06aSMark Adams if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray"); 5003fa6b06aSMark Adams if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n"); 5013fa6b06aSMark Adams // create colmap - this is ussually done (lazy) in MatSetValues 5023fa6b06aSMark Adams aij->donotstash = PETSC_TRUE; 5033fa6b06aSMark Adams aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE; 5043fa6b06aSMark Adams jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly 5053fa6b06aSMark Adams ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr); 5063fa6b06aSMark Adams aij->colmap[A->cmap->N] = -9; 5073fa6b06aSMark Adams ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr); 5083fa6b06aSMark Adams { 5093fa6b06aSMark Adams PetscInt ii; 5103fa6b06aSMark Adams for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1; 5113fa6b06aSMark Adams } 5123fa6b06aSMark Adams if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9"); 5133fa6b06aSMark Adams // allocate B copy data 5143fa6b06aSMark Adams h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend; 5153fa6b06aSMark Adams h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend; 5163fa6b06aSMark Adams nnz = jacb->i[n]; 5173fa6b06aSMark Adams 5183fa6b06aSMark Adams if (jacb->compressedrow.use) { 5193fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.offdiag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 5203fa6b06aSMark Adams err = cudaMemcpy( 
h_mat.offdiag.i, jacb->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 5213fa6b06aSMark Adams } else { 5223fa6b06aSMark Adams h_mat.offdiag.i = bi; 5233fa6b06aSMark Adams } 5243fa6b06aSMark Adams h_mat.offdiag.j = bj; 5253fa6b06aSMark Adams h_mat.offdiag.a = ba; 5263fa6b06aSMark Adams 5273fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.colmap, (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output 5283fa6b06aSMark Adams err = cudaMemcpy( h_mat.colmap, aij->colmap, (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err); 5293fa6b06aSMark Adams h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries; 5303fa6b06aSMark Adams h_mat.offdiag.n = n; 5313fa6b06aSMark Adams } 5323fa6b06aSMark Adams // allocate A copy data 5333fa6b06aSMark Adams nnz = jaca->i[n]; 5343fa6b06aSMark Adams h_mat.diag.n = n; 5353fa6b06aSMark Adams h_mat.diag.ignorezeroentries = jaca->ignorezeroentries; 5363fa6b06aSMark Adams ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr); 5373fa6b06aSMark Adams if (jaca->compressedrow.use) { 5383fa6b06aSMark Adams err = cudaMalloc((void **)&h_mat.diag.i, (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input 5393fa6b06aSMark Adams err = cudaMemcpy( h_mat.diag.i, jaca->i, (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err); 5403fa6b06aSMark Adams } else { 5413fa6b06aSMark Adams h_mat.diag.i = ai; 5423fa6b06aSMark Adams } 5433fa6b06aSMark Adams h_mat.diag.j = aj; 5443fa6b06aSMark Adams h_mat.diag.a = aa; 5453fa6b06aSMark Adams // copy pointers and metdata to device 5463fa6b06aSMark Adams err = cudaMemcpy( d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err); 5473fa6b06aSMark Adams ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr); 5483fa6b06aSMark Adams } else { 5493fa6b06aSMark Adams *B = *p_d_mat; 5503fa6b06aSMark Adams } 5513fa6b06aSMark Adams A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in 
normal MatSetValues 5523fa6b06aSMark Adams PetscFunctionReturn(0); 553*9db3cbf9SStefano Zampini #endif 5543fa6b06aSMark Adams } 555