xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 3fa6b06a16b9e10b6e41dcaa1d4ab6dd90b29d60)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
40fd2b57fSKarl Rupp 
53d13b8fdSMatthew G. Knepley #include <petscconf.h>
69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
99ae82921SPaul Mullowney 
/* MatMPIAIJSetPreallocation_MPIAIJCUSPARSE - preallocate the diagonal (b->A) and
   off-diagonal (b->B) blocks of an MPIAIJCUSPARSE matrix as MATSEQAIJCUSPARSE
   matrices, then propagate the cuSPARSE storage formats, handle, and stream
   stored in the parent's Mat_MPIAIJCUSPARSE struct onto both blocks.

   Input: B     - the matrix being preallocated
          d_nz  - nonzeros per row of the diagonal block (same for all rows)
          d_nnz - per-row nonzero counts for the diagonal block, or NULL
          o_nz  - nonzeros per row of the off-diagonal block (same for all rows)
          o_nnz - per-row nonzero counts for the off-diagonal block, or NULL */
109ae82921SPaul Mullowney PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
119ae82921SPaul Mullowney {
12bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *b               = (Mat_MPIAIJ*)B->data;
13bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
149ae82921SPaul Mullowney   PetscErrorCode     ierr;
159ae82921SPaul Mullowney   PetscInt           i;
169ae82921SPaul Mullowney 
179ae82921SPaul Mullowney   PetscFunctionBegin;
189ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
199ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
  /* Validate user-supplied per-row counts: negative entries are rejected up front */
209ae82921SPaul Mullowney   if (d_nnz) {
219ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
229ae82921SPaul Mullowney       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
239ae82921SPaul Mullowney     }
249ae82921SPaul Mullowney   }
259ae82921SPaul Mullowney   if (o_nnz) {
269ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
279ae82921SPaul Mullowney       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
289ae82921SPaul Mullowney     }
299ae82921SPaul Mullowney   }
309ae82921SPaul Mullowney   if (!B->preallocated) {
31bbf3fe20SPaul Mullowney     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
329ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
    /* Blocks inherit the parent's CPU-binding state */
33b470e4b4SRichard Tran Mills     ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
349ae82921SPaul Mullowney     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
359ae82921SPaul Mullowney     ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
363bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
    /* Off-diagonal block spans the full global column space (local rows x N columns) */
379ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
38b470e4b4SRichard Tran Mills     ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
399ae82921SPaul Mullowney     ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
409ae82921SPaul Mullowney     ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
413bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
429ae82921SPaul Mullowney   }
439ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
449ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
  /* Push parent's cuSPARSE configuration down to both sequential blocks */
45e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
46e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
47b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
48b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
49b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
50b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
512205254eSKarl Rupp 
529ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
539ae82921SPaul Mullowney   PetscFunctionReturn(0);
549ae82921SPaul Mullowney }
55e057df02SPaul Mullowney 
/* MatMult_MPIAIJCUSPARSE - computes yy = A*xx.
   Overlaps the halo exchange of xx with the local product: the scatter of
   off-process entries into a->lvec is started, the diagonal-block multiply
   runs while it is in flight, the scatter is completed, and the
   off-diagonal contribution is accumulated with multadd. */
569ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
579ae82921SPaul Mullowney {
589ae82921SPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
599ae82921SPaul Mullowney   PetscErrorCode ierr;
609ae82921SPaul Mullowney   PetscInt       nt;
619ae82921SPaul Mullowney 
629ae82921SPaul Mullowney   PetscFunctionBegin;
639ae82921SPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
649ae82921SPaul Mullowney   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
65959dcdf5SJunchao Zhang   ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* begin halo exchange, then compute the diagonal block while it is in flight */
669ae82921SPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
674d55d066SJunchao Zhang   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
689ae82921SPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* yy += B * lvec : add the off-process column contributions */
699ae82921SPaul Mullowney   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
709ae82921SPaul Mullowney   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
719ae82921SPaul Mullowney   PetscFunctionReturn(0);
729ae82921SPaul Mullowney }
73ca45077fSPaul Mullowney 
/* MatZeroEntries_MPIAIJCUSPARSE - zero all stored entries of the matrix.
   If a split-CSR device mirror (spptr->deviceMat) exists (only for
   non-factored matrices), its diagonal and off-diagonal value arrays are
   zeroed directly on the GPU with cudaMemset; the host-side blocks l->A
   and l->B are always zeroed via MatZeroEntries.

   Fix: corrected the typo "offfdiag" -> "offdiag" in the PetscInfo message. */
74*3fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
75*3fa6b06aSMark Adams {
76*3fa6b06aSMark Adams   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
77*3fa6b06aSMark Adams   PetscErrorCode ierr;
78*3fa6b06aSMark Adams 
79*3fa6b06aSMark Adams   PetscFunctionBegin;
80*3fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
81*3fa6b06aSMark Adams     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
82*3fa6b06aSMark Adams     PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
83*3fa6b06aSMark Adams     if (d_mat) {
84*3fa6b06aSMark Adams       Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
85*3fa6b06aSMark Adams       Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
      /* nnz counts come from the host CSR row-offset arrays of each block */
86*3fa6b06aSMark Adams       PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
87*3fa6b06aSMark Adams       cudaError_t  err;
88*3fa6b06aSMark Adams       PetscScalar  *vals;
89*3fa6b06aSMark Adams       ierr = PetscInfo(A,"Zero device matrix diag and offdiag\n");CHKERRQ(ierr);
      /* d_mat->diag.a lives on the device and holds a device pointer; copy the
         pointer VALUE to the host (sizeof(PetscScalar*)), then memset the array
         it points to.  Same for the off-diagonal block below. */
90*3fa6b06aSMark Adams       err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
91*3fa6b06aSMark Adams       err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
92*3fa6b06aSMark Adams       err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
93*3fa6b06aSMark Adams       err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
94*3fa6b06aSMark Adams     }
95*3fa6b06aSMark Adams   }
  /* Always zero the (host-side) sequential blocks as well */
96*3fa6b06aSMark Adams   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
97*3fa6b06aSMark Adams   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
98*3fa6b06aSMark Adams 
99*3fa6b06aSMark Adams   PetscFunctionReturn(0);
100*3fa6b06aSMark Adams }
/* MatMultAdd_MPIAIJCUSPARSE - computes zz = yy + A*xx, overlapping the halo
   exchange of xx with the diagonal-block multadd exactly as in
   MatMult_MPIAIJCUSPARSE. */
101fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
102fdc842d1SBarry Smith {
103fdc842d1SBarry Smith   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
104fdc842d1SBarry Smith   PetscErrorCode ierr;
105fdc842d1SBarry Smith   PetscInt       nt;
106fdc842d1SBarry Smith 
107fdc842d1SBarry Smith   PetscFunctionBegin;
108fdc842d1SBarry Smith   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
109fdc842d1SBarry Smith   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
110fdc842d1SBarry Smith   ierr = VecScatterInitializeForGPU(a->Mvctx,xx);CHKERRQ(ierr);
  /* start halo exchange; zz = yy + A_diag*xx runs while it is in flight */
111fdc842d1SBarry Smith   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
1124d55d066SJunchao Zhang   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
113fdc842d1SBarry Smith   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
  /* zz += B * lvec : off-process column contributions */
114fdc842d1SBarry Smith   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
115fdc842d1SBarry Smith   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
116fdc842d1SBarry Smith   PetscFunctionReturn(0);
117fdc842d1SBarry Smith }
118fdc842d1SBarry Smith 
/* MatMultTranspose_MPIAIJCUSPARSE - computes yy = A^T * xx.
   Both block transpose-products are applied to the local xx; the off-diagonal
   result in a->lvec is then scattered in REVERSE with ADD_VALUES so each
   process accumulates the contributions destined for its rows of yy. */
119ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
120ca45077fSPaul Mullowney {
121ca45077fSPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
122ca45077fSPaul Mullowney   PetscErrorCode ierr;
123ca45077fSPaul Mullowney   PetscInt       nt;
124ca45077fSPaul Mullowney 
125ca45077fSPaul Mullowney   PetscFunctionBegin;
126ca45077fSPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
  /* For the transpose, xx must conform to the ROW layout of A */
127ccf5f80bSJunchao Zhang   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
128a3fdcf43SKarl Rupp   ierr = VecScatterInitializeForGPU(a->Mvctx,a->lvec);CHKERRQ(ierr);
1299b2db222SKarl Rupp   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
130ca45077fSPaul Mullowney   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
  /* reverse scatter sends lvec entries back to their owning rows, summing into yy */
1319b2db222SKarl Rupp   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
1329b2db222SKarl Rupp   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
133ca45077fSPaul Mullowney   ierr = VecScatterFinalizeForGPU(a->Mvctx);CHKERRQ(ierr);
134ca45077fSPaul Mullowney   PetscFunctionReturn(0);
135ca45077fSPaul Mullowney }
1369ae82921SPaul Mullowney 
/* MatCUSPARSESetFormat_MPIAIJCUSPARSE - record the requested cuSPARSE storage
   format for the diagonal block, the off-diagonal block, or both, in the
   matrix's Mat_MPIAIJCUSPARSE struct.  The formats are applied to the
   sequential blocks during preallocation.

   Fix: the SETERRQ1 message previously listed MAT_CUSPARSE_MULT_DIAG twice and
   referred to a nonexistent MAT_CUSPARSE_MULT_ALL; it now names the operations
   actually handled by the switch (MAT_CUSPARSE_MULT_DIAG,
   MAT_CUSPARSE_MULT_OFFDIAG, MAT_CUSPARSE_ALL). */
137e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
138ca45077fSPaul Mullowney {
139ca45077fSPaul Mullowney   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
140bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
141e057df02SPaul Mullowney 
142ca45077fSPaul Mullowney   PetscFunctionBegin;
143e057df02SPaul Mullowney   switch (op) {
144e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
145e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
146045c96e1SPaul Mullowney     break;
147e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
148e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
149045c96e1SPaul Mullowney     break;
150e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
151e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
152e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
153045c96e1SPaul Mullowney     break;
154e057df02SPaul Mullowney   default:
155e057df02SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.",op);
156045c96e1SPaul Mullowney   }
157ca45077fSPaul Mullowney   PetscFunctionReturn(0);
158ca45077fSPaul Mullowney }
159e057df02SPaul Mullowney 
/* MatSetFromOptions_MPIAIJCUSPARSE - process the -mat_cusparse_* options that
   select the GPU storage format (csr/ell/hyb) of the diagonal block, the
   off-diagonal block, or both.  The options are only consulted for
   non-factored matrices (A->factortype == MAT_FACTOR_NONE). */
1604416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
161e057df02SPaul Mullowney {
162e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
163e057df02SPaul Mullowney   PetscErrorCode           ierr;
164e057df02SPaul Mullowney   PetscBool                flg;
165a183c035SDominic Meiser   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
166a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
1675fd66863SKarl Rupp 
168e057df02SPaul Mullowney   PetscFunctionBegin;
169e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
170e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
    /* current format values serve as the defaults shown to the user */
171e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
172a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
173e057df02SPaul Mullowney     if (flg) {
174e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
175e057df02SPaul Mullowney     }
176e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
177a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
178e057df02SPaul Mullowney     if (flg) {
179e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
180e057df02SPaul Mullowney     }
    /* -mat_cusparse_storage_format sets BOTH blocks at once */
181e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
182a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
183e057df02SPaul Mullowney     if (flg) {
184e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
185e057df02SPaul Mullowney     }
186e057df02SPaul Mullowney   }
1870af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
188e057df02SPaul Mullowney   PetscFunctionReturn(0);
189e057df02SPaul Mullowney }
190e057df02SPaul Mullowney 
/* MatAssemblyEnd_MPIAIJCUSPARSE - finish assembly via MatAssemblyEnd_MPIAIJ.
   Before that, if a split-CSR device mirror exists, its nonzerostate is read
   back from the GPU; after assembly, a device state newer than the host's
   A->nonzerostate indicates the values were assembled on the device, so the
   offload mask is set to PETSC_OFFLOAD_GPU.  On first final assembly the
   halo vector lvec is switched to VECSEQCUDA. */
19134d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
19234d6c7a5SJose E. Roman {
19334d6c7a5SJose E. Roman   PetscErrorCode             ierr;
194*3fa6b06aSMark Adams   Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
195*3fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
196*3fa6b06aSMark Adams   PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
  /* defaults to the host state; overwritten from the device mirror below */
197*3fa6b06aSMark Adams   PetscInt                    nnz_state = A->nonzerostate;
19834d6c7a5SJose E. Roman   PetscFunctionBegin;
199*3fa6b06aSMark Adams   if (d_mat) {
200*3fa6b06aSMark Adams     cudaError_t                err;
201*3fa6b06aSMark Adams     err = cudaMemcpy( &nnz_state, &d_mat->nonzerostate, sizeof(PetscInt), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
202*3fa6b06aSMark Adams   }
20334d6c7a5SJose E. Roman   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
20434d6c7a5SJose E. Roman   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
20534d6c7a5SJose E. Roman     ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
20634d6c7a5SJose E. Roman   }
207*3fa6b06aSMark Adams   if (nnz_state > A->nonzerostate) {
208*3fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
209*3fa6b06aSMark Adams   }
210*3fa6b06aSMark Adams 
21134d6c7a5SJose E. Roman   PetscFunctionReturn(0);
21234d6c7a5SJose E. Roman }
21334d6c7a5SJose E. Roman 
/* MatDestroy_MPIAIJCUSPARSE - release all CUSPARSE-specific resources, then
   defer to MatDestroy_MPIAIJ.  If a split-CSR device mirror exists, the
   device-side struct is copied back to the host so the device arrays it
   points to (row offsets for compressed rows, the colmap, and the struct
   itself) can be cudaFree'd. */
214bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
215bbf3fe20SPaul Mullowney {
216bbf3fe20SPaul Mullowney   PetscErrorCode     ierr;
217*3fa6b06aSMark Adams   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
218*3fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
219b06137fdSPaul Mullowney   cudaError_t        err;
220b06137fdSPaul Mullowney   cusparseStatus_t   stat;
221bbf3fe20SPaul Mullowney 
222bbf3fe20SPaul Mullowney   PetscFunctionBegin;
223*3fa6b06aSMark Adams   if (cusparseStruct->deviceMat) {
224*3fa6b06aSMark Adams     Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
225*3fa6b06aSMark Adams     Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
226*3fa6b06aSMark Adams     PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
227*3fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
    /* pull the device-resident struct to the host to recover its pointers */
228*3fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
    /* the .i arrays were only allocated on the device when compressed rows are used */
229*3fa6b06aSMark Adams     if (jaca->compressedrow.use) {
230*3fa6b06aSMark Adams       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
231*3fa6b06aSMark Adams     }
232*3fa6b06aSMark Adams     if (jacb->compressedrow.use) {
233*3fa6b06aSMark Adams       err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
234*3fa6b06aSMark Adams     }
235*3fa6b06aSMark Adams     err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
236*3fa6b06aSMark Adams     err = cudaFree(d_mat);CHKERRCUDA(err);
237*3fa6b06aSMark Adams   }
  /* C++ destructor of Mat_MPIAIJCUSPARSE may throw; keep cleanup in a try block */
238bbf3fe20SPaul Mullowney   try {
239*3fa6b06aSMark Adams     if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
240*3fa6b06aSMark Adams     if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
24157d48284SJunchao Zhang     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
24217403302SKarl Rupp     if (cusparseStruct->stream) {
243c41cb2e2SAlejandro Lamas Daviña       err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
24417403302SKarl Rupp     }
245bbf3fe20SPaul Mullowney     delete cusparseStruct;
246bbf3fe20SPaul Mullowney   } catch(char *ex) {
247bbf3fe20SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
248bbf3fe20SPaul Mullowney   }
  /* remove the composed methods installed by MatConvert before base destroy */
2493338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
2503338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
251bbf3fe20SPaul Mullowney   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
252bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
253bbf3fe20SPaul Mullowney }
254ca45077fSPaul Mullowney 
/* MatConvert_MPIAIJ_MPIAIJCUSPARSE - convert an MPIAIJ matrix to MPIAIJCUSPARSE.
   Handles MAT_INITIAL_MATRIX (duplicate B), MAT_REUSE_MATRIX (copy into
   *newmat), and implicitly MAT_INPLACE_MATRIX (neither branch taken; *newmat
   is B itself).  Sets the default vector type to VECCUDA, allocates the
   Mat_MPIAIJCUSPARSE struct with a fresh cusparse handle on first conversion,
   installs the CUSPARSE-specific ops table, and composes the preallocation
   and format-setting methods. */
2553338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
2569ae82921SPaul Mullowney {
2579ae82921SPaul Mullowney   PetscErrorCode     ierr;
258bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
259bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct;
260b06137fdSPaul Mullowney   cusparseStatus_t   stat;
2613338378cSStefano Zampini   Mat                A;
2629ae82921SPaul Mullowney 
2639ae82921SPaul Mullowney   PetscFunctionBegin;
2643338378cSStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
2653338378cSStefano Zampini     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
2663338378cSStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
2673338378cSStefano Zampini     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
2683338378cSStefano Zampini   }
2693338378cSStefano Zampini   A = *newmat;
2703338378cSStefano Zampini 
  /* vectors created from this matrix should default to CUDA */
27134136279SStefano Zampini   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
27234136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
27334136279SStefano Zampini 
274bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ*)A->data;
  /* allocate CUSPARSE bookkeeping only once; reuse keeps the existing struct */
2753338378cSStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
276bbf3fe20SPaul Mullowney     a->spptr = new Mat_MPIAIJCUSPARSE;
2772205254eSKarl Rupp 
278bbf3fe20SPaul Mullowney     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
279e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
280e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
28117403302SKarl Rupp     cusparseStruct->stream              = 0;
28257d48284SJunchao Zhang     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
283*3fa6b06aSMark Adams     cusparseStruct->deviceMat = NULL;
2843338378cSStefano Zampini   }
2852205254eSKarl Rupp 
  /* install the CUSPARSE-specific implementations of the Mat operations */
28634d6c7a5SJose E. Roman   A->ops->assemblyend    = MatAssemblyEnd_MPIAIJCUSPARSE;
287bbf3fe20SPaul Mullowney   A->ops->mult           = MatMult_MPIAIJCUSPARSE;
288fdc842d1SBarry Smith   A->ops->multadd        = MatMultAdd_MPIAIJCUSPARSE;
289bbf3fe20SPaul Mullowney   A->ops->multtranspose  = MatMultTranspose_MPIAIJCUSPARSE;
290bbf3fe20SPaul Mullowney   A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
291bbf3fe20SPaul Mullowney   A->ops->destroy        = MatDestroy_MPIAIJCUSPARSE;
292*3fa6b06aSMark Adams   A->ops->zeroentries    = MatZeroEntries_MPIAIJCUSPARSE;
2932205254eSKarl Rupp 
294bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
2953338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
296bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
2979ae82921SPaul Mullowney   PetscFunctionReturn(0);
2989ae82921SPaul Mullowney }
2999ae82921SPaul Mullowney 
/* MatCreate_MPIAIJCUSPARSE - type constructor: verify CUDA is initialized,
   build a plain MPIAIJ matrix, then convert it in place to MPIAIJCUSPARSE. */
3003338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
3013338378cSStefano Zampini {
3023338378cSStefano Zampini   PetscErrorCode ierr;
3033338378cSStefano Zampini 
3043338378cSStefano Zampini   PetscFunctionBegin;
30505035670SJunchao Zhang   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
3063338378cSStefano Zampini   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
3073338378cSStefano Zampini   ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
3083338378cSStefano Zampini   PetscFunctionReturn(0);
3093338378cSStefano Zampini }
3103338378cSStefano Zampini 
3113f9c0db1SPaul Mullowney /*@
3123f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
313e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately be pushed down
3143f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
315e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
316e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
317e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
3189ae82921SPaul Mullowney 
319d083f849SBarry Smith    Collective
320e057df02SPaul Mullowney 
321e057df02SPaul Mullowney    Input Parameters:
322e057df02SPaul Mullowney +  comm - MPI communicator
323e057df02SPaul Mullowney .  m - number of local rows (or PETSC_DECIDE)
324e057df02SPaul Mullowney .  n - number of local columns (or PETSC_DECIDE)
.  M - number of global rows (or PETSC_DETERMINE)
.  N - number of global columns (or PETSC_DETERMINE)
325e057df02SPaul Mullowney .  d_nz - number of nonzeros per row in the diagonal block (same for all local rows)
326e057df02SPaul Mullowney .  d_nnz - array containing the number of nonzeros in the various rows of the diagonal block
3270298fd71SBarry Smith          (possibly different for each row) or NULL
.  o_nz - number of nonzeros per row in the off-diagonal block (same for all local rows)
-  o_nnz - array containing the number of nonzeros in the various rows of the off-diagonal block
         (possibly different for each row) or NULL
328e057df02SPaul Mullowney 
329e057df02SPaul Mullowney    Output Parameter:
330e057df02SPaul Mullowney .  A - the matrix
331e057df02SPaul Mullowney 
332e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
333e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
334e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
335e057df02SPaul Mullowney 
336e057df02SPaul Mullowney    Notes:
337e057df02SPaul Mullowney    If nnz is given then nz is ignored
338e057df02SPaul Mullowney 
339e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
340e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
341e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
342e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
343e057df02SPaul Mullowney 
344e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
3450298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
346e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
347e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
348e057df02SPaul Mullowney 
349e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
350e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
351e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
352e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
353e057df02SPaul Mullowney 
354e057df02SPaul Mullowney    Level: intermediate
355e057df02SPaul Mullowney 
356e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
357e057df02SPaul Mullowney @*/
3589ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
3599ae82921SPaul Mullowney {
3609ae82921SPaul Mullowney   PetscErrorCode ierr;
3619ae82921SPaul Mullowney   PetscMPIInt    size;
3629ae82921SPaul Mullowney 
3639ae82921SPaul Mullowney   PetscFunctionBegin;
3649ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
3659ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
3669ae82921SPaul Mullowney   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
  /* pick the MPI or Seq variant based on the communicator size; the Seq
     variant only needs the diagonal-block preallocation parameters */
3679ae82921SPaul Mullowney   if (size > 1) {
3689ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
3699ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
3709ae82921SPaul Mullowney   } else {
3719ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
3729ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
3739ae82921SPaul Mullowney   }
3749ae82921SPaul Mullowney   PetscFunctionReturn(0);
3759ae82921SPaul Mullowney }
3769ae82921SPaul Mullowney 
3773ca39a21SBarry Smith /*MC
378e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
379e057df02SPaul Mullowney 
3802692e278SPaul Mullowney    A matrix type whose data resides on Nvidia GPUs. These matrices can be in either
3812692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
3822692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
3839ae82921SPaul Mullowney 
3849ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
3859ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
3869ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
3879ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
3889ae82921SPaul Mullowney    the above preallocation routines for simplicity.
3899ae82921SPaul Mullowney 
3909ae82921SPaul Mullowney    Options Database Keys:
391e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
3928468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3938468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3948468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
3959ae82921SPaul Mullowney 
3969ae82921SPaul Mullowney   Level: beginner
3979ae82921SPaul Mullowney 
3988468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
3998468deeeSKarl Rupp M
4009ae82921SPaul Mullowney M*/
401*3fa6b06aSMark Adams 
402*3fa6b06aSMark Adams // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
403*3fa6b06aSMark Adams PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
404*3fa6b06aSMark Adams {
405*3fa6b06aSMark Adams   PetscSplitCSRDataStructure **p_d_mat;
406*3fa6b06aSMark Adams   PetscMPIInt                size,rank;
407*3fa6b06aSMark Adams   MPI_Comm                   comm;
408*3fa6b06aSMark Adams   PetscErrorCode             ierr;
409*3fa6b06aSMark Adams   int                        *ai,*bi,*aj,*bj;
410*3fa6b06aSMark Adams   PetscScalar                *aa,*ba;
411*3fa6b06aSMark Adams 
412*3fa6b06aSMark Adams   PetscFunctionBegin;
413*3fa6b06aSMark Adams   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
414*3fa6b06aSMark Adams   ierr = MPI_Comm_size(comm,&size);CHKERRQ(ierr);
415*3fa6b06aSMark Adams   ierr = MPI_Comm_rank(comm,&rank);CHKERRQ(ierr);
416*3fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
417*3fa6b06aSMark Adams     CsrMatrix *matrixA,*matrixB=NULL;
418*3fa6b06aSMark Adams     if (size == 1) {
419*3fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
420*3fa6b06aSMark Adams       p_d_mat = &cusparsestruct->deviceMat;
421*3fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
422*3fa6b06aSMark Adams       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
423*3fa6b06aSMark Adams 	matrixA = (CsrMatrix*)matstruct->mat;
424*3fa6b06aSMark Adams 	bi = bj = NULL; ba = NULL;
425*3fa6b06aSMark Adams       } else {
426*3fa6b06aSMark Adams 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
427*3fa6b06aSMark Adams       }
428*3fa6b06aSMark Adams     } else {
429*3fa6b06aSMark Adams       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
430*3fa6b06aSMark Adams       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
431*3fa6b06aSMark Adams       p_d_mat = &spptr->deviceMat;
432*3fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
433*3fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
434*3fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
435*3fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
436*3fa6b06aSMark Adams       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
437*3fa6b06aSMark Adams 	if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
438*3fa6b06aSMark Adams 	matrixA = (CsrMatrix*)matstructA->mat;
439*3fa6b06aSMark Adams 	matrixB = (CsrMatrix*)matstructB->mat;
440*3fa6b06aSMark Adams 	bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
441*3fa6b06aSMark Adams 	bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
442*3fa6b06aSMark Adams 	ba = thrust::raw_pointer_cast(matrixB->values->data());
443*3fa6b06aSMark Adams 	if (rank==-1) {
444*3fa6b06aSMark Adams 	  for(unsigned int i = 0; i < matrixB->row_offsets->size(); i++)
445*3fa6b06aSMark Adams 	    std::cout << "\trow_offsets[" << i << "] = " << (*matrixB->row_offsets)[i] << std::endl;
446*3fa6b06aSMark Adams 	  for(unsigned int i = 0; i < matrixB->column_indices->size(); i++)
447*3fa6b06aSMark Adams 	    std::cout << "\tcolumn_indices[" << i << "] = " << (*matrixB->column_indices)[i] << std::endl;
448*3fa6b06aSMark Adams 	  for(unsigned int i = 0; i < matrixB->values->size(); i++)
449*3fa6b06aSMark Adams 	    std::cout << "\tvalues[" << i << "] = " << (*matrixB->values)[i] << std::endl;
450*3fa6b06aSMark Adams 	}
451*3fa6b06aSMark Adams       } else {
452*3fa6b06aSMark Adams 	SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
453*3fa6b06aSMark Adams       }
454*3fa6b06aSMark Adams     }
455*3fa6b06aSMark Adams     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
456*3fa6b06aSMark Adams     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
457*3fa6b06aSMark Adams     aa = thrust::raw_pointer_cast(matrixA->values->data());
458*3fa6b06aSMark Adams   } else {
459*3fa6b06aSMark Adams     *B = NULL;
460*3fa6b06aSMark Adams     PetscFunctionReturn(0);
461*3fa6b06aSMark Adams   }
462*3fa6b06aSMark Adams   // act like MatSetValues because not called on host
463*3fa6b06aSMark Adams   if (A->assembled) {
464*3fa6b06aSMark Adams     if (A->was_assembled) {
465*3fa6b06aSMark Adams       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
466*3fa6b06aSMark Adams     }
467*3fa6b06aSMark Adams     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
468*3fa6b06aSMark Adams   } else {
469*3fa6b06aSMark Adams     SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
470*3fa6b06aSMark Adams   }
471*3fa6b06aSMark Adams   if (!*p_d_mat) {
472*3fa6b06aSMark Adams     cudaError_t                 err;
473*3fa6b06aSMark Adams     PetscSplitCSRDataStructure  *d_mat, h_mat;
474*3fa6b06aSMark Adams     Mat_SeqAIJ                  *jaca;
475*3fa6b06aSMark Adams     PetscInt                    n = A->rmap->n, nnz;
476*3fa6b06aSMark Adams     // create and copy
477*3fa6b06aSMark Adams     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
478*3fa6b06aSMark Adams     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
479*3fa6b06aSMark Adams     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
480*3fa6b06aSMark Adams     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
481*3fa6b06aSMark Adams     if (size == 1) {
482*3fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
483*3fa6b06aSMark Adams       h_mat.nonzerostate = A->nonzerostate;
484*3fa6b06aSMark Adams       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
485*3fa6b06aSMark Adams       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
486*3fa6b06aSMark Adams       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
487*3fa6b06aSMark Adams       h_mat.offdiag.a = NULL;
488*3fa6b06aSMark Adams       h_mat.seq = PETSC_TRUE;
489*3fa6b06aSMark Adams     } else {
490*3fa6b06aSMark Adams       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
491*3fa6b06aSMark Adams       Mat_SeqAIJ  *jacb;
492*3fa6b06aSMark Adams       h_mat.seq = PETSC_FALSE; // for MatAssemblyEnd_SeqAIJCUSPARSE
493*3fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
494*3fa6b06aSMark Adams       jacb = (Mat_SeqAIJ*)aij->B->data;
495*3fa6b06aSMark Adams       h_mat.nonzerostate = aij->A->nonzerostate; // just keep one nonzero state?
496*3fa6b06aSMark Adams       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
497*3fa6b06aSMark Adams       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
498*3fa6b06aSMark Adams       // create colmap - this is ussually done (lazy) in MatSetValues
499*3fa6b06aSMark Adams       aij->donotstash = PETSC_TRUE;
500*3fa6b06aSMark Adams       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
501*3fa6b06aSMark Adams       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
502*3fa6b06aSMark Adams #if defined(PETSC_USE_CTABLE)
503*3fa6b06aSMark Adams       SETERRQ(comm,PETSC_ERR_SUP,"Devioce metadata does not support ctable (--with-ctable=0)");
504*3fa6b06aSMark Adams #else
505*3fa6b06aSMark Adams       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
506*3fa6b06aSMark Adams       aij->colmap[A->cmap->N] = -9;
507*3fa6b06aSMark Adams       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
508*3fa6b06aSMark Adams       {
509*3fa6b06aSMark Adams 	PetscInt ii;
510*3fa6b06aSMark Adams 	for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
511*3fa6b06aSMark Adams       }
512*3fa6b06aSMark Adams       if(aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
513*3fa6b06aSMark Adams #endif
514*3fa6b06aSMark Adams       // allocate B copy data
515*3fa6b06aSMark Adams       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
516*3fa6b06aSMark Adams       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
517*3fa6b06aSMark Adams       nnz = jacb->i[n];
518*3fa6b06aSMark Adams 
519*3fa6b06aSMark Adams       if (jacb->compressedrow.use) {
520*3fa6b06aSMark Adams 	err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
521*3fa6b06aSMark Adams 	err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
522*3fa6b06aSMark Adams       } else {
523*3fa6b06aSMark Adams 	h_mat.offdiag.i = bi;
524*3fa6b06aSMark Adams       }
525*3fa6b06aSMark Adams       h_mat.offdiag.j = bj;
526*3fa6b06aSMark Adams       h_mat.offdiag.a = ba;
527*3fa6b06aSMark Adams 
528*3fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
529*3fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
530*3fa6b06aSMark Adams       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
531*3fa6b06aSMark Adams       h_mat.offdiag.n = n;
532*3fa6b06aSMark Adams     }
533*3fa6b06aSMark Adams     // allocate A copy data
534*3fa6b06aSMark Adams     nnz = jaca->i[n];
535*3fa6b06aSMark Adams     h_mat.diag.n = n;
536*3fa6b06aSMark Adams     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
537*3fa6b06aSMark Adams     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRQ(ierr);
538*3fa6b06aSMark Adams     if (jaca->compressedrow.use) {
539*3fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
540*3fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
541*3fa6b06aSMark Adams     } else {
542*3fa6b06aSMark Adams       h_mat.diag.i = ai;
543*3fa6b06aSMark Adams     }
544*3fa6b06aSMark Adams     h_mat.diag.j = aj;
545*3fa6b06aSMark Adams     h_mat.diag.a = aa;
546*3fa6b06aSMark Adams     // copy pointers and metdata to device
547*3fa6b06aSMark Adams     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
548*3fa6b06aSMark Adams     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
549*3fa6b06aSMark Adams   } else {
550*3fa6b06aSMark Adams     *B = *p_d_mat;
551*3fa6b06aSMark Adams   }
552*3fa6b06aSMark Adams   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
553*3fa6b06aSMark Adams   PetscFunctionReturn(0);
554*3fa6b06aSMark Adams }
555