xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 9db3cbf93c0c42e7090475b1bd59c84e4d5354ce)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
40fd2b57fSKarl Rupp 
53d13b8fdSMatthew G. Knepley #include <petscconf.h>
69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
99ae82921SPaul Mullowney 
/*
  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - preallocation routine for MPIAIJCUSPARSE matrices.

  Validates the user-supplied per-row nonzero counts, lazily creates the two
  sequential MATSEQAIJCUSPARSE blocks (diagonal and off-diagonal) the first
  time through, preallocates both, and pushes the parallel matrix's GPU
  storage formats, cuSPARSE handle and stream down to the blocks.

  Input Parameters:
+  B     - the parallel matrix being preallocated
.  d_nz  - nonzeros per row of the diagonal block (same for all rows), ignored if d_nnz is given
.  d_nnz - per-row nonzero counts for the diagonal block, or NULL
.  o_nz  - nonzeros per row of the off-diagonal block, ignored if o_nnz is given
-  o_nnz - per-row nonzero counts for the off-diagonal block, or NULL
*/
PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *mpiaij = (Mat_MPIAIJ*)B->data;
  Mat_MPIAIJCUSPARSE *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscErrorCode     ierr;
  PetscInt           row;

  PetscFunctionBegin;
  ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
  ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* reject negative per-row counts before any allocation takes place */
  if (d_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (d_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",row,d_nnz[row]);
    }
  }
  if (o_nnz) {
    for (row=0; row<B->rmap->n; row++) {
      if (o_nnz[row] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",row,o_nnz[row]);
    }
  }
  if (!B->preallocated) {
    /* Explicitly create the two MATSEQAIJCUSPARSE blocks on this rank */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->A);CHKERRQ(ierr);
    ierr = MatBindToCPU(mpiaij->A,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->A);CHKERRQ(ierr);
    /* off-diagonal block is local-rows by global-columns until assembly compresses it */
    ierr = MatCreate(PETSC_COMM_SELF,&mpiaij->B);CHKERRQ(ierr);
    ierr = MatBindToCPU(mpiaij->B,B->boundtocpu);CHKERRQ(ierr);
    ierr = MatSetSizes(mpiaij->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
    ierr = MatSetType(mpiaij->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)mpiaij->B);CHKERRQ(ierr);
  }
  ierr = MatSeqAIJSetPreallocation(mpiaij->A,d_nz,d_nnz);CHKERRQ(ierr);
  ierr = MatSeqAIJSetPreallocation(mpiaij->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* both blocks inherit the format, handle and stream of the parallel matrix */
  ierr = MatCUSPARSESetFormat(mpiaij->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetFormat(mpiaij->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->A,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetHandle(mpiaij->B,cusp->handle);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->A,cusp->stream);CHKERRQ(ierr);
  ierr = MatCUSPARSESetStream(mpiaij->B,cusp->stream);CHKERRQ(ierr);
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(0);
}
55e057df02SPaul Mullowney 
/*
  MatMult_MPIAIJCUSPARSE - computes yy = A*xx for a parallel AIJCUSPARSE matrix.

  Starts the forward scatter of ghost values and, while that communication is
  in flight, applies the diagonal block; the off-diagonal contribution is then
  added once the ghost values have arrived.
*/
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mat = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  if (nloc != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mat->Mvctx,xx);CHKERRQ(ierr);
  /* begin communicating ghost entries ... */
  ierr = VecScatterBegin(mat->Mvctx,xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* ... and overlap with the purely local diagonal-block product */
  ierr = (*mat->A->ops->mult)(mat->A,xx,yy);CHKERRQ(ierr);
  ierr = VecScatterEnd(mat->Mvctx,xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mat->B->ops->multadd)(mat->B,mat->lvec,yy,yy);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mat->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
73ca45077fSPaul Mullowney 
/*
  MatZeroEntries_MPIAIJCUSPARSE - zeroes all entries of a parallel AIJCUSPARSE matrix.

  If a stripped-down device-side mirror of the matrix exists (deviceMat), its
  diagonal and off-diagonal value arrays are zeroed directly on the GPU with
  cudaMemset before the regular host-side MatZeroEntries is applied to both
  sequential blocks.

  Fix: the PetscInfo message previously read "offfdiag" (typo).
*/
PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  if (A->factortype == MAT_FACTOR_NONE) {
    Mat_MPIAIJCUSPARSE         *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
    PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
    if (d_mat) {
      Mat_SeqAIJ  *a = (Mat_SeqAIJ*)l->A->data;
      Mat_SeqAIJ  *b = (Mat_SeqAIJ*)l->B->data;
      PetscInt    n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n]; /* CSR row offsets give total nnz */
      cudaError_t err;
      PetscScalar *vals;
      ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* fetch the device-side pointer to the diagonal values, then zero them in place */
      err = cudaMemcpy(&vals,&d_mat->diag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals,0,nnza*sizeof(PetscScalar));CHKERRCUDA(err);
      /* same for the off-diagonal values */
      err = cudaMemcpy(&vals,&d_mat->offdiag.a,sizeof(PetscScalar*),cudaMemcpyDeviceToHost);CHKERRCUDA(err);
      err = cudaMemset(vals,0,nnzb*sizeof(PetscScalar));CHKERRCUDA(err);
    }
  }
  /* zero the host/sequential representations of both blocks */
  ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
  ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
/*
  MatMultAdd_MPIAIJCUSPARSE - computes zz = A*xx + yy for a parallel AIJCUSPARSE matrix.

  Mirrors MatMult_MPIAIJCUSPARSE: the ghost-value scatter is overlapped with
  the diagonal-block multiply-add, then the off-diagonal contribution is
  accumulated into zz.
*/
PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
{
  Mat_MPIAIJ     *mat = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  if (nloc != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mat->Mvctx,xx);CHKERRQ(ierr);
  ierr = VecScatterBegin(mat->Mvctx,xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* local part overlaps the communication started above */
  ierr = (*mat->A->ops->multadd)(mat->A,xx,yy,zz);CHKERRQ(ierr);
  ierr = VecScatterEnd(mat->Mvctx,xx,mat->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  ierr = (*mat->B->ops->multadd)(mat->B,mat->lvec,zz,zz);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mat->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
118fdc842d1SBarry Smith 
/*
  MatMultTranspose_MPIAIJCUSPARSE - computes yy = A^T * xx for a parallel AIJCUSPARSE matrix.

  Both transpose products are applied first; the off-diagonal transpose
  produces contributions destined for other ranks (held in lvec), which a
  reverse scatter then adds into yy.
*/
PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
{
  Mat_MPIAIJ     *mat = (Mat_MPIAIJ*)A->data;
  PetscErrorCode ierr;
  PetscInt       nloc;

  PetscFunctionBegin;
  ierr = VecGetLocalSize(xx,&nloc);CHKERRQ(ierr);
  /* for A^T, the input must match the ROW layout of A */
  if (nloc != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nloc);
  ierr = VecScatterInitializeForGPU(mat->Mvctx,mat->lvec);CHKERRQ(ierr);
  ierr = (*mat->B->ops->multtranspose)(mat->B,xx,mat->lvec);CHKERRQ(ierr);
  ierr = (*mat->A->ops->multtranspose)(mat->A,xx,yy);CHKERRQ(ierr);
  /* reverse scatter accumulates the off-process contributions into yy */
  ierr = VecScatterBegin(mat->Mvctx,mat->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterEnd(mat->Mvctx,mat->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
  ierr = VecScatterFinalizeForGPU(mat->Mvctx);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
1369ae82921SPaul Mullowney 
/*
  MatCUSPARSESetFormat_MPIAIJCUSPARSE - records the requested cuSPARSE storage
  format for the diagonal block, the off-diagonal block, or both.

  Fix: the default-case error message previously listed MAT_CUSPARSE_MULT_DIAG
  twice and named a nonexistent "MAT_CUSPARSE_MULT_ALL"; it now lists the
  operations actually handled by this switch.
*/
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a    = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusp->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusp->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusp->diagGPUMatFormat    = format;
    cusp->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
  }
  PetscFunctionReturn(0);
}
159e057df02SPaul Mullowney 
/*
  MatSetFromOptions_MPIAIJCUSPARSE - processes the command-line options that
  select the cuSPARSE storage format for the diagonal block, the off-diagonal
  block, or both at once. Only unfactored matrices consult these options.
*/
PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
{
  MatCUSPARSEStorageFormat fmt;
  PetscErrorCode           ierr;
  PetscBool                set;
  Mat_MPIAIJ               *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE       *cusp   = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;

  PetscFunctionBegin;
  ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
  if (A->factortype==MAT_FACTOR_NONE) {
    /* format of the diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,fmt);CHKERRQ(ierr);
    }
    /* format of the off-diagonal block only */
    ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->offdiagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,fmt);CHKERRQ(ierr);
    }
    /* one option that sets both blocks at once */
    ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
                            "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusp->diagGPUMatFormat,(PetscEnum*)&fmt,&set);CHKERRQ(ierr);
    if (set) {
      ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,fmt);CHKERRQ(ierr);
    }
  }
  ierr = PetscOptionsTail();CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
190e057df02SPaul Mullowney 
/*
  MatAssemblyEnd_MPIAIJCUSPARSE - completes assembly of a parallel AIJCUSPARSE matrix.

  In addition to the generic MPIAIJ assembly, this: (1) reads the nonzero
  state recorded in the device-side matrix mirror (if one exists) BEFORE the
  host assembly runs, (2) switches the ghost-value vector lvec to VECSEQCUDA
  on first final assembly, and (3) marks the matrix as resident on the GPU
  when the device-recorded state is newer than the host's — i.e. the entries
  were assembled on the device and the host copy is stale.
*/
PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
{
  PetscErrorCode             ierr;
  Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
  PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  PetscInt                    nnz_state = A->nonzerostate;   /* host state; may be overwritten below */
  PetscFunctionBegin;
  if (d_mat) {
    cudaError_t                err;
    /* snapshot the device-side nonzero state before MatAssemblyEnd_MPIAIJ can bump A->nonzerostate */
    err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
  }
  ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
  if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
    /* first final assembly: make the ghost vector a CUDA vector so scatters stay on the GPU */
    ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
  }
  if (nnz_state > A->nonzerostate) {
    A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
  }

  PetscFunctionReturn(0);
}
21334d6c7a5SJose E. Roman 
/*
  MatDestroy_MPIAIJCUSPARSE - destructor for MPIAIJCUSPARSE matrices.

  Frees the optional device-side matrix mirror (deviceMat) and the arrays it
  owns, clears the per-block cuSPARSE handles, destroys the shared cuSPARSE
  handle and stream, removes the composed methods, and finally delegates to
  the generic MPIAIJ destructor.
*/
PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
  cudaError_t        err;
  cusparseStatus_t   stat;

  PetscFunctionBegin;
  if (cusparseStruct->deviceMat) {
    Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
    Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
    PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
    ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the device struct to the host so its embedded device pointers can be freed */
    err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* row-offset arrays were separately allocated only when compressed-row storage is in use */
    if (jaca->compressedrow.use) {
      err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
    }
    if (jacb->compressedrow.use) {
      err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
    }
    err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
    err = cudaFree(d_mat);CHKERRCUDA(err);
  }
  /* the cuSPARSE C++ layer may throw; convert any exception into a PETSc error */
  try {
    if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
    if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
    stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
    if (cusparseStruct->stream) {
      err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
    }
    delete cusparseStruct;
  } catch(char *ex) {
    SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
  }
  /* detach the methods composed in MatConvert_MPIAIJ_MPIAIJCUSPARSE */
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
  ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
254ca45077fSPaul Mullowney 
/*
  MatConvert_MPIAIJ_MPIAIJCUSPARSE - converts an MPIAIJ matrix into an
  MPIAIJCUSPARSE matrix.

  Depending on reuse, the target is a duplicate of B (MAT_INITIAL_MATRIX), a
  copy into an existing matrix (MAT_REUSE_MATRIX), or B itself
  (MAT_INPLACE_MATRIX, where *newmat == B). The default vector type is
  switched to CUDA, the Mat_MPIAIJCUSPARSE context is allocated and
  initialized (CSR formats, default stream, fresh cuSPARSE handle), the
  CUSPARSE-specific operations are installed in the ops table, and the
  type-specific methods are composed on the object.
*/
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
{
  PetscErrorCode     ierr;
  Mat_MPIAIJ         *a;
  Mat_MPIAIJCUSPARSE *cusparseStruct;
  cusparseStatus_t   stat;
  Mat                A;

  PetscFunctionBegin;
  if (reuse == MAT_INITIAL_MATRIX) {
    ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
  } else if (reuse == MAT_REUSE_MATRIX) {
    ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
  }
  /* for MAT_INPLACE_MATRIX, *newmat is B itself and is converted in place */
  A = *newmat;

  /* vectors created from this matrix default to CUDA vectors */
  ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
  ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);

  a = (Mat_MPIAIJ*)A->data;
  if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
    a->spptr = new Mat_MPIAIJCUSPARSE;

    cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
    cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
    cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
    cusparseStruct->stream              = 0;     /* default CUDA stream */
    stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
    cusparseStruct->deviceMat = NULL;            /* device mirror built lazily elsewhere */
  }

  /* install the CUSPARSE-specific implementations */
  A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult           = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;

  ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
  ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
2999ae82921SPaul Mullowney 
/*
  MatCreate_MPIAIJCUSPARSE - registered constructor for MATMPIAIJCUSPARSE.

  Checks that CUDA has been initialized, builds a plain MPIAIJ matrix, then
  converts it in place to the CUSPARSE type.
*/
PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
  ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
  ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
3103338378cSStefano Zampini 
3113f9c0db1SPaul Mullowney /*@
3123f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
3143f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
315e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
316e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
317e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
3189ae82921SPaul Mullowney 
319d083f849SBarry Smith    Collective
320e057df02SPaul Mullowney 
321e057df02SPaul Mullowney    Input Parameters:
322e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
323e057df02SPaul Mullowney .  m - number of rows
324e057df02SPaul Mullowney .  n - number of columns
325e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
326e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
3270298fd71SBarry Smith          (possibly different for each row) or NULL
328e057df02SPaul Mullowney 
329e057df02SPaul Mullowney    Output Parameter:
330e057df02SPaul Mullowney .  A - the matrix
331e057df02SPaul Mullowney 
332e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
333e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
334e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
335e057df02SPaul Mullowney 
336e057df02SPaul Mullowney    Notes:
337e057df02SPaul Mullowney    If nnz is given then nz is ignored
338e057df02SPaul Mullowney 
339e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
340e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
341e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
342e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
343e057df02SPaul Mullowney 
344e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
3450298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
346e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
347e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
348e057df02SPaul Mullowney 
349e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
350e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
351e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
352e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
353e057df02SPaul Mullowney 
354e057df02SPaul Mullowney    Level: intermediate
355e057df02SPaul Mullowney 
356e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
357e057df02SPaul Mullowney @*/
/*
  MatCreateAIJCUSPARSE - creates an AIJCUSPARSE matrix whose concrete type
  depends on the communicator size: MATSEQAIJCUSPARSE for one process,
  MATMPIAIJCUSPARSE otherwise. Preallocation is forwarded to the matching
  Seq/MPI preallocation routine (the sequential path uses only d_nz/d_nnz).
*/
PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
{
  PetscErrorCode ierr;
  PetscMPIInt    nproc;

  PetscFunctionBegin;
  ierr = MatCreate(comm,A);CHKERRQ(ierr);
  ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
  ierr = MPI_Comm_size(comm,&nproc);CHKERRQ(ierr);
  if (nproc == 1) {
    ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
  } else {
    ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
    ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
  }
  PetscFunctionReturn(0);
}
3769ae82921SPaul Mullowney 
3773ca39a21SBarry Smith /*MC
378e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
379e057df02SPaul Mullowney 
   A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3812692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3822692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3839ae82921SPaul Mullowney 
3849ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3859ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3869ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3879ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3889ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3899ae82921SPaul Mullowney 
3909ae82921SPaul Mullowney    Options Database Keys:
391e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3928468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3938468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3948468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3959ae82921SPaul Mullowney 
3969ae82921SPaul Mullowney   Level: beginner
3979ae82921SPaul Mullowney 
3988468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
M*/
4013fa6b06aSMark Adams 
4023fa6b06aSMark Adams // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
4033fa6b06aSMark Adams PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
4043fa6b06aSMark Adams {
405*9db3cbf9SStefano Zampini #if defined(PETSC_USE_CTABLE)
406*9db3cbf9SStefano Zampini   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
407*9db3cbf9SStefano Zampini #else
4083fa6b06aSMark Adams   PetscSplitCSRDataStructure **p_d_mat;
4093fa6b06aSMark Adams   PetscMPIInt                size,rank;
4103fa6b06aSMark Adams   MPI_Comm                   comm;
4113fa6b06aSMark Adams   PetscErrorCode             ierr;
4123fa6b06aSMark Adams   int                        *ai,*bi,*aj,*bj;
4133fa6b06aSMark Adams   PetscScalar                *aa,*ba;
4143fa6b06aSMark Adams 
4153fa6b06aSMark Adams   PetscFunctionBegin;
4163fa6b06aSMark Adams   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
4173fa6b06aSMark Adams   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
4183fa6b06aSMark Adams   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
4193fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
4203fa6b06aSMark Adams     CsrMatrix *matrixA,*matrixB=NULL;
4213fa6b06aSMark Adams     if (size == 1) {
4223fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
4233fa6b06aSMark Adams       p_d_mat = &cusparsestruct->deviceMat;
4243fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
4253fa6b06aSMark Adams       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
4263fa6b06aSMark Adams 	matrixA = (CsrMatrix*)matstruct->mat;
4273fa6b06aSMark Adams 	bi = bj = NULL; ba = NULL;
4283fa6b06aSMark Adams       } else {
4293fa6b06aSMark Adams 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
4303fa6b06aSMark Adams       }
4313fa6b06aSMark Adams     } else {
4323fa6b06aSMark Adams       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
4333fa6b06aSMark Adams       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
4343fa6b06aSMark Adams       p_d_mat = &spptr->deviceMat;
4353fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
4363fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
4373fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
4383fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
4393fa6b06aSMark Adams       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
4403fa6b06aSMark Adams 	if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
4413fa6b06aSMark Adams 	matrixA = (CsrMatrix*)matstructA->mat;
4423fa6b06aSMark Adams 	matrixB = (CsrMatrix*)matstructB->mat;
4433fa6b06aSMark Adams 	bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
4443fa6b06aSMark Adams 	bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
4453fa6b06aSMark Adams 	ba = thrust::raw_pointer_cast(matrixB->values->data());
4463fa6b06aSMark Adams 	if (rank==-1) {
4473fa6b06aSMark Adams 	  for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
4483fa6b06aSMark Adams 	    std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
4493fa6b06aSMark Adams 	  for(unsigned int i = 0; i < matrixB->column_indices->size(); i++)
4503fa6b06aSMark Adams 	    std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
4513fa6b06aSMark Adams 	  for(unsigned int i = 0; i < matrixB->values->size(); i++)
4523fa6b06aSMark Adams 	    std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
4533fa6b06aSMark Adams 	}
4543fa6b06aSMark Adams       } else {
4553fa6b06aSMark Adams 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
4563fa6b06aSMark Adams       }
4573fa6b06aSMark Adams     }
4583fa6b06aSMark Adams     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
4593fa6b06aSMark Adams     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
4603fa6b06aSMark Adams     aa = thrust::raw_pointer_cast(matrixA->values->data());
4613fa6b06aSMark Adams   } else {
4623fa6b06aSMark Adams     *B = NULL;
4633fa6b06aSMark Adams     PetscFunctionReturn(0);
4643fa6b06aSMark Adams   }
4653fa6b06aSMark Adams   // act like MatSetValues because not called on host
4663fa6b06aSMark Adams   if (A->assembled) {
4673fa6b06aSMark Adams     if (A->was_assembled) {
4683fa6b06aSMark Adams       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
4693fa6b06aSMark Adams     }
4703fa6b06aSMark Adams     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
4713fa6b06aSMark Adams   } else {
4723fa6b06aSMark Adams     SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
4733fa6b06aSMark Adams   }
4743fa6b06aSMark Adams   if (!*p_d_mat) {
4753fa6b06aSMark Adams     cudaError_t                 err;
4763fa6b06aSMark Adams     PetscSplitCSRDataStructure  *d_mat, h_mat;
4773fa6b06aSMark Adams     Mat_SeqAIJ                  *jaca;
4783fa6b06aSMark Adams     PetscInt                    n = A->rmap->n, nnz;
4793fa6b06aSMark Adams     // create and copy
4803fa6b06aSMark Adams     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
4813fa6b06aSMark Adams     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
4823fa6b06aSMark Adams     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
4833fa6b06aSMark Adams     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
4843fa6b06aSMark Adams     if (size == 1) {
4853fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
4863fa6b06aSMark Adams       h_mat.nonzerostate = A->nonzerostate;
4873fa6b06aSMark Adams       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
4883fa6b06aSMark Adams       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
4893fa6b06aSMark Adams       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
4903fa6b06aSMark Adams       h_mat.offdiag.a = NULL;
4913fa6b06aSMark Adams       h_mat.seq = PETSC_TRUE;
4923fa6b06aSMark Adams     } else {
4933fa6b06aSMark Adams       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
4943fa6b06aSMark Adams       Mat_SeqAIJ  *jacb;
4953fa6b06aSMark Adams       h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
4963fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
4973fa6b06aSMark Adams       jacb = (Mat_SeqAIJ*)aij->B->data;
4983fa6b06aSMark Adams       h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
4993fa6b06aSMark Adams       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
5003fa6b06aSMark Adams       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
5013fa6b06aSMark Adams       // create colmap - this is usually done (lazy) in MatSetValues
5023fa6b06aSMark Adams       aij->donotstash = PETSC_TRUE;
5033fa6b06aSMark Adams       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
5043fa6b06aSMark Adams       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
5053fa6b06aSMark Adams       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
5063fa6b06aSMark Adams       aij->colmap[A->cmap->N] = -9;
5073fa6b06aSMark Adams       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
5083fa6b06aSMark Adams       {
5093fa6b06aSMark Adams 	PetscInt ii;
5103fa6b06aSMark Adams 	for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
5113fa6b06aSMark Adams       }
5123fa6b06aSMark Adams       if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
5133fa6b06aSMark Adams       // allocate B copy data
5143fa6b06aSMark Adams       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
5153fa6b06aSMark Adams       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
5163fa6b06aSMark Adams       nnz = jacb->i[n];
5173fa6b06aSMark Adams 
5183fa6b06aSMark Adams       if (jacb->compressedrow.use) {
5193fa6b06aSMark Adams 	err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
5203fa6b06aSMark Adams 	err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
5213fa6b06aSMark Adams       } else {
5223fa6b06aSMark Adams 	h_mat.offdiag.i = bi;
5233fa6b06aSMark Adams       }
5243fa6b06aSMark Adams       h_mat.offdiag.j = bj;
5253fa6b06aSMark Adams       h_mat.offdiag.a = ba;
5263fa6b06aSMark Adams 
5273fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
5283fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
5293fa6b06aSMark Adams       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
5303fa6b06aSMark Adams       h_mat.offdiag.n = n;
5313fa6b06aSMark Adams     }
5323fa6b06aSMark Adams     // allocate A copy data
5333fa6b06aSMark Adams     nnz = jaca->i[n];
5343fa6b06aSMark Adams     h_mat.diag.n = n;
5353fa6b06aSMark Adams     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
5363fa6b06aSMark Adams     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
5373fa6b06aSMark Adams     if (jaca->compressedrow.use) {
5383fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
5393fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
5403fa6b06aSMark Adams     } else {
5413fa6b06aSMark Adams       h_mat.diag.i = ai;
5423fa6b06aSMark Adams     }
5433fa6b06aSMark Adams     h_mat.diag.j = aj;
5443fa6b06aSMark Adams     h_mat.diag.a = aa;
5453fa6b06aSMark Adams     // copy pointers and metadata to device
5463fa6b06aSMark Adams     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
5473fa6b06aSMark Adams     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
5483fa6b06aSMark Adams   } else {
5493fa6b06aSMark Adams     *B = *p_d_mat;
5503fa6b06aSMark Adams   }
5513fa6b06aSMark Adams   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
5523fa6b06aSMark Adams   PetscFunctionReturn(0);
553*9db3cbf9SStefano Zampini #endif
5543fa6b06aSMark Adams }
555