xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 4eb5378f66f8820a2cbc12e4d62d4969c2b43f57)
1dced61a5SBarry Smith #define PETSC_SKIP_SPINLOCK
253800007SKarl Rupp #define PETSC_SKIP_CXX_COMPLEX_FIX
399acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
40fd2b57fSKarl Rupp 
53d13b8fdSMatthew G. Knepley #include <petscconf.h>
69ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h>   /*I "petscmat.h" I*/
757d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
83d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
97e8381f9SStefano Zampini #include <thrust/advance.h>
10*4eb5378fSStefano Zampini #include <petscsf.h>
117e8381f9SStefano Zampini 
127e8381f9SStefano Zampini struct VecCUDAEquals
137e8381f9SStefano Zampini {
147e8381f9SStefano Zampini   template <typename Tuple>
157e8381f9SStefano Zampini   __host__ __device__
167e8381f9SStefano Zampini   void operator()(Tuple t)
177e8381f9SStefano Zampini   {
187e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
197e8381f9SStefano Zampini   }
207e8381f9SStefano Zampini };
217e8381f9SStefano Zampini 
227e8381f9SStefano Zampini static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat A, const PetscScalar v[], InsertMode imode)
237e8381f9SStefano Zampini {
247e8381f9SStefano Zampini   Mat_MPIAIJ         *a = (Mat_MPIAIJ*)A->data;
257e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE *cusp = (Mat_MPIAIJCUSPARSE*)a->spptr;
267e8381f9SStefano Zampini   PetscInt           n = cusp->coo_nd + cusp->coo_no;
277e8381f9SStefano Zampini   PetscErrorCode     ierr;
287e8381f9SStefano Zampini   cudaError_t        cerr;
297e8381f9SStefano Zampini 
307e8381f9SStefano Zampini   PetscFunctionBegin;
317e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
32e61fc153SStefano Zampini   if (cusp->coo_p && v) {
3308391a17SStefano Zampini     thrust::device_ptr<const PetscScalar> d_v;
3408391a17SStefano Zampini     THRUSTARRAY                           *w = NULL;
3508391a17SStefano Zampini 
3608391a17SStefano Zampini     if (isCudaMem(v)) {
3708391a17SStefano Zampini       d_v = thrust::device_pointer_cast(v);
3808391a17SStefano Zampini     } else {
39e61fc153SStefano Zampini       w = new THRUSTARRAY(n);
40e61fc153SStefano Zampini       w->assign(v,v+n);
4108391a17SStefano Zampini       ierr = PetscLogCpuToGpu(n*sizeof(PetscScalar));CHKERRQ(ierr);
4208391a17SStefano Zampini       d_v = w->data();
4308391a17SStefano Zampini     }
4408391a17SStefano Zampini 
4508391a17SStefano Zampini     auto zibit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->begin()),
467e8381f9SStefano Zampini                                                               cusp->coo_pw->begin()));
4708391a17SStefano Zampini     auto zieit = thrust::make_zip_iterator(thrust::make_tuple(thrust::make_permutation_iterator(d_v,cusp->coo_p->end()),
487e8381f9SStefano Zampini                                                               cusp->coo_pw->end()));
4908391a17SStefano Zampini     ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
507e8381f9SStefano Zampini     thrust::for_each(zibit,zieit,VecCUDAEquals());
5108391a17SStefano Zampini     cerr = WaitForCUDA();CHKERRCUDA(cerr);
5208391a17SStefano Zampini     ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
53e61fc153SStefano Zampini     delete w;
547e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,cusp->coo_pw->data().get(),imode);CHKERRQ(ierr);
557e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,cusp->coo_pw->data().get()+cusp->coo_nd,imode);CHKERRQ(ierr);
567e8381f9SStefano Zampini   } else {
577e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->A,v,imode);CHKERRQ(ierr);
587e8381f9SStefano Zampini     ierr = MatSetValuesCOO_SeqAIJCUSPARSE(a->B,v ? v+cusp->coo_nd : NULL,imode);CHKERRQ(ierr);
597e8381f9SStefano Zampini   }
607e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSESetVCOO,A,0,0,0);CHKERRQ(ierr);
617e8381f9SStefano Zampini   ierr = PetscObjectStateIncrease((PetscObject)A);CHKERRQ(ierr);
627e8381f9SStefano Zampini   A->num_ass++;
637e8381f9SStefano Zampini   A->assembled        = PETSC_TRUE;
647e8381f9SStefano Zampini   A->ass_nonzerostate = A->nonzerostate;
65*4eb5378fSStefano Zampini   A->offloadmask      = PETSC_OFFLOAD_GPU;
667e8381f9SStefano Zampini   PetscFunctionReturn(0);
677e8381f9SStefano Zampini }
687e8381f9SStefano Zampini 
697e8381f9SStefano Zampini template <typename Tuple>
707e8381f9SStefano Zampini struct IsNotOffDiagT
717e8381f9SStefano Zampini {
727e8381f9SStefano Zampini   PetscInt _cstart,_cend;
737e8381f9SStefano Zampini 
747e8381f9SStefano Zampini   IsNotOffDiagT(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
757e8381f9SStefano Zampini   __host__ __device__
767e8381f9SStefano Zampini   inline bool operator()(Tuple t)
777e8381f9SStefano Zampini   {
787e8381f9SStefano Zampini     return !(thrust::get<1>(t) < _cstart || thrust::get<1>(t) >= _cend);
797e8381f9SStefano Zampini   }
807e8381f9SStefano Zampini };
817e8381f9SStefano Zampini 
827e8381f9SStefano Zampini struct IsOffDiag
837e8381f9SStefano Zampini {
847e8381f9SStefano Zampini   PetscInt _cstart,_cend;
857e8381f9SStefano Zampini 
867e8381f9SStefano Zampini   IsOffDiag(PetscInt cstart, PetscInt cend) : _cstart(cstart), _cend(cend) {}
877e8381f9SStefano Zampini   __host__ __device__
887e8381f9SStefano Zampini   inline bool operator() (const PetscInt &c)
897e8381f9SStefano Zampini   {
907e8381f9SStefano Zampini     return c < _cstart || c >= _cend;
917e8381f9SStefano Zampini   }
927e8381f9SStefano Zampini };
937e8381f9SStefano Zampini 
947e8381f9SStefano Zampini struct GlobToLoc
957e8381f9SStefano Zampini {
967e8381f9SStefano Zampini   PetscInt _start;
977e8381f9SStefano Zampini 
987e8381f9SStefano Zampini   GlobToLoc(PetscInt start) : _start(start) {}
997e8381f9SStefano Zampini   __host__ __device__
1007e8381f9SStefano Zampini   inline PetscInt operator() (const PetscInt &c)
1017e8381f9SStefano Zampini   {
1027e8381f9SStefano Zampini     return c - _start;
1037e8381f9SStefano Zampini   }
1047e8381f9SStefano Zampini };
1057e8381f9SStefano Zampini 
1067e8381f9SStefano Zampini static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat B, PetscInt n, const PetscInt coo_i[], const PetscInt coo_j[])
1077e8381f9SStefano Zampini {
1087e8381f9SStefano Zampini   Mat_MPIAIJ             *b = (Mat_MPIAIJ*)B->data;
1097e8381f9SStefano Zampini   Mat_MPIAIJCUSPARSE     *cusp = (Mat_MPIAIJCUSPARSE*)b->spptr;
1107e8381f9SStefano Zampini   PetscErrorCode         ierr;
1117e8381f9SStefano Zampini   PetscInt               *jj;
1127e8381f9SStefano Zampini   size_t                 noff = 0;
1137e8381f9SStefano Zampini   THRUSTINTARRAY         d_i(n);
1147e8381f9SStefano Zampini   THRUSTINTARRAY         d_j(n);
1157e8381f9SStefano Zampini   ISLocalToGlobalMapping l2g;
1167e8381f9SStefano Zampini   cudaError_t            cerr;
1177e8381f9SStefano Zampini 
1187e8381f9SStefano Zampini   PetscFunctionBegin;
1197e8381f9SStefano Zampini   ierr = PetscLogEventBegin(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
1207e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
1217e8381f9SStefano Zampini   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
1227e8381f9SStefano Zampini   if (b->A) { ierr = MatCUSPARSEClearHandle(b->A);CHKERRQ(ierr); }
1237e8381f9SStefano Zampini   if (b->B) { ierr = MatCUSPARSEClearHandle(b->B);CHKERRQ(ierr); }
1247e8381f9SStefano Zampini   ierr = PetscFree(b->garray);CHKERRQ(ierr);
1257e8381f9SStefano Zampini   ierr = VecDestroy(&b->lvec);CHKERRQ(ierr);
1267e8381f9SStefano Zampini   ierr = MatDestroy(&b->A);CHKERRQ(ierr);
1277e8381f9SStefano Zampini   ierr = MatDestroy(&b->B);CHKERRQ(ierr);
1287e8381f9SStefano Zampini 
1297e8381f9SStefano Zampini   ierr = PetscLogCpuToGpu(2.*n*sizeof(PetscInt));CHKERRQ(ierr);
1307e8381f9SStefano Zampini   d_i.assign(coo_i,coo_i+n);
1317e8381f9SStefano Zampini   d_j.assign(coo_j,coo_j+n);
1327e8381f9SStefano Zampini   delete cusp->coo_p;
1337e8381f9SStefano Zampini   delete cusp->coo_pw;
1347e8381f9SStefano Zampini   cusp->coo_p = NULL;
1357e8381f9SStefano Zampini   cusp->coo_pw = NULL;
13608391a17SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1377e8381f9SStefano Zampini   auto firstoffd = thrust::find_if(thrust::device,d_j.begin(),d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1387e8381f9SStefano Zampini   auto firstdiag = thrust::find_if_not(thrust::device,firstoffd,d_j.end(),IsOffDiag(B->cmap->rstart,B->cmap->rend));
1397e8381f9SStefano Zampini   if (firstoffd != d_j.end() && firstdiag != d_j.end()) {
1407e8381f9SStefano Zampini     cusp->coo_p = new THRUSTINTARRAY(n);
1417e8381f9SStefano Zampini     cusp->coo_pw = new THRUSTARRAY(n);
1427e8381f9SStefano Zampini     thrust::sequence(thrust::device,cusp->coo_p->begin(),cusp->coo_p->end(),0);
1437e8381f9SStefano Zampini     auto fzipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.begin(),d_j.begin(),cusp->coo_p->begin()));
1447e8381f9SStefano Zampini     auto ezipp = thrust::make_zip_iterator(thrust::make_tuple(d_i.end(),d_j.end(),cusp->coo_p->end()));
1457e8381f9SStefano Zampini     auto mzipp = thrust::partition(thrust::device,fzipp,ezipp,IsNotOffDiagT<thrust::tuple<PetscInt,PetscInt,PetscInt> >(B->cmap->rstart,B->cmap->rend));
1467e8381f9SStefano Zampini     firstoffd = mzipp.get_iterator_tuple().get<1>();
1477e8381f9SStefano Zampini   }
1487e8381f9SStefano Zampini   cusp->coo_nd = thrust::distance(d_j.begin(),firstoffd);
1497e8381f9SStefano Zampini   cusp->coo_no = thrust::distance(firstoffd,d_j.end());
1507e8381f9SStefano Zampini 
1517e8381f9SStefano Zampini   /* from global to local */
1527e8381f9SStefano Zampini   thrust::transform(thrust::device,d_i.begin(),d_i.end(),d_i.begin(),GlobToLoc(B->rmap->rstart));
1537e8381f9SStefano Zampini   thrust::transform(thrust::device,d_j.begin(),firstoffd,d_j.begin(),GlobToLoc(B->cmap->rstart));
15408391a17SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
15508391a17SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1567e8381f9SStefano Zampini 
1577e8381f9SStefano Zampini   /* copy offdiag column indices to map on the CPU */
1587e8381f9SStefano Zampini   ierr = PetscMalloc1(cusp->coo_no,&jj);CHKERRQ(ierr);
1597e8381f9SStefano Zampini   cerr = cudaMemcpy(jj,d_j.data().get()+cusp->coo_nd,cusp->coo_no*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1607e8381f9SStefano Zampini   auto o_j = d_j.begin();
16108391a17SStefano Zampini   ierr = PetscLogGpuTimeBegin();CHKERRQ(ierr);
1627e8381f9SStefano Zampini   thrust::advance(o_j,cusp->coo_nd);
1637e8381f9SStefano Zampini   thrust::sort(thrust::device,o_j,d_j.end());
1647e8381f9SStefano Zampini   auto wit = thrust::unique(thrust::device,o_j,d_j.end());
16508391a17SStefano Zampini   cerr = WaitForCUDA();CHKERRCUDA(cerr);
16608391a17SStefano Zampini   ierr = PetscLogGpuTimeEnd();CHKERRQ(ierr);
1677e8381f9SStefano Zampini   noff = thrust::distance(o_j,wit);
1687e8381f9SStefano Zampini   ierr = PetscMalloc1(noff+1,&b->garray);CHKERRQ(ierr);
1697e8381f9SStefano Zampini   cerr = cudaMemcpy(b->garray,d_j.data().get()+cusp->coo_nd,noff*sizeof(PetscInt),cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
1707e8381f9SStefano Zampini   ierr = PetscLogGpuToCpu((noff+cusp->coo_no)*sizeof(PetscInt));CHKERRQ(ierr);
1717e8381f9SStefano Zampini   ierr = ISLocalToGlobalMappingCreate(PETSC_COMM_SELF,1,noff,b->garray,PETSC_COPY_VALUES,&l2g);CHKERRQ(ierr);
1727e8381f9SStefano Zampini   ierr = ISLocalToGlobalMappingSetType(l2g,ISLOCALTOGLOBALMAPPINGHASH);CHKERRQ(ierr);
1737e8381f9SStefano Zampini   ierr = ISGlobalToLocalMappingApply(l2g,IS_GTOLM_DROP,cusp->coo_no,jj,&n,jj);CHKERRQ(ierr);
1747e8381f9SStefano Zampini   if (n != cusp->coo_no) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Unexpected is size %D != %D coo size",n,cusp->coo_no);
1757e8381f9SStefano Zampini   ierr = ISLocalToGlobalMappingDestroy(&l2g);CHKERRQ(ierr);
1767e8381f9SStefano Zampini 
1777e8381f9SStefano Zampini   ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
1787e8381f9SStefano Zampini   ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
1797e8381f9SStefano Zampini   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1807e8381f9SStefano Zampini   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
1817e8381f9SStefano Zampini   ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
1827e8381f9SStefano Zampini   ierr = MatSetSizes(b->B,B->rmap->n,noff,B->rmap->n,noff);CHKERRQ(ierr);
1837e8381f9SStefano Zampini   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
1847e8381f9SStefano Zampini   ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
1857e8381f9SStefano Zampini 
1867e8381f9SStefano Zampini   /* GPU memory, cusparse specific call handles it internally */
1877e8381f9SStefano Zampini   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->A,cusp->coo_nd,d_i.data().get(),d_j.data().get());CHKERRQ(ierr);
1887e8381f9SStefano Zampini   ierr = MatSetPreallocationCOO_SeqAIJCUSPARSE(b->B,cusp->coo_no,d_i.data().get()+cusp->coo_nd,jj);CHKERRQ(ierr);
1897e8381f9SStefano Zampini   ierr = PetscFree(jj);CHKERRQ(ierr);
1907e8381f9SStefano Zampini 
1917e8381f9SStefano Zampini   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusp->diagGPUMatFormat);CHKERRQ(ierr);
1927e8381f9SStefano Zampini   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusp->offdiagGPUMatFormat);CHKERRQ(ierr);
1937e8381f9SStefano Zampini   ierr = MatCUSPARSESetHandle(b->A,cusp->handle);CHKERRQ(ierr);
1947e8381f9SStefano Zampini   ierr = MatCUSPARSESetHandle(b->B,cusp->handle);CHKERRQ(ierr);
1957e8381f9SStefano Zampini   ierr = MatCUSPARSESetStream(b->A,cusp->stream);CHKERRQ(ierr);
1967e8381f9SStefano Zampini   ierr = MatCUSPARSESetStream(b->B,cusp->stream);CHKERRQ(ierr);
1977e8381f9SStefano Zampini   ierr = MatSetUpMultiply_MPIAIJ(B);CHKERRQ(ierr);
1987e8381f9SStefano Zampini   B->preallocated = PETSC_TRUE;
1997e8381f9SStefano Zampini   B->nonzerostate++;
2007e8381f9SStefano Zampini   ierr = PetscLogEventEnd(MAT_CUSPARSEPreallCOO,B,0,0,0);CHKERRQ(ierr);
2017e8381f9SStefano Zampini 
2027e8381f9SStefano Zampini   ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
2037e8381f9SStefano Zampini   ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
2047e8381f9SStefano Zampini   B->offloadmask = PETSC_OFFLOAD_CPU;
2057e8381f9SStefano Zampini   B->assembled = PETSC_FALSE;
2067e8381f9SStefano Zampini   B->was_assembled = PETSC_FALSE;
2077e8381f9SStefano Zampini   PetscFunctionReturn(0);
2087e8381f9SStefano Zampini }
2099ae82921SPaul Mullowney 
210ed502f03SStefano Zampini static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS *glob,Mat *A_loc)
211ed502f03SStefano Zampini {
212ed502f03SStefano Zampini   Mat            Ad,Ao;
213ed502f03SStefano Zampini   const PetscInt *cmap;
214ed502f03SStefano Zampini   PetscErrorCode ierr;
215ed502f03SStefano Zampini 
216ed502f03SStefano Zampini   PetscFunctionBegin;
217ed502f03SStefano Zampini   ierr = MatMPIAIJGetSeqAIJ(A,&Ad,&Ao,&cmap);CHKERRQ(ierr);
218ed502f03SStefano Zampini   ierr = MatSeqAIJCUSPARSEMergeMats(Ad,Ao,scall,A_loc);CHKERRQ(ierr);
219ed502f03SStefano Zampini   if (glob) {
220ed502f03SStefano Zampini     PetscInt cst, i, dn, on, *gidx;
221ed502f03SStefano Zampini 
222ed502f03SStefano Zampini     ierr = MatGetLocalSize(Ad,NULL,&dn);CHKERRQ(ierr);
223ed502f03SStefano Zampini     ierr = MatGetLocalSize(Ao,NULL,&on);CHKERRQ(ierr);
224ed502f03SStefano Zampini     ierr = MatGetOwnershipRangeColumn(A,&cst,NULL);CHKERRQ(ierr);
225ed502f03SStefano Zampini     ierr = PetscMalloc1(dn+on,&gidx);CHKERRQ(ierr);
226ed502f03SStefano Zampini     for (i=0; i<dn; i++) gidx[i]    = cst + i;
227ed502f03SStefano Zampini     for (i=0; i<on; i++) gidx[i+dn] = cmap[i];
228ed502f03SStefano Zampini     ierr = ISCreateGeneral(PetscObjectComm((PetscObject)Ad),dn+on,gidx,PETSC_OWN_POINTER,glob);CHKERRQ(ierr);
229ed502f03SStefano Zampini   }
230ed502f03SStefano Zampini   PetscFunctionReturn(0);
231ed502f03SStefano Zampini }
232ed502f03SStefano Zampini 
2339ae82921SPaul Mullowney PetscErrorCode  MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])
2349ae82921SPaul Mullowney {
235bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *b = (Mat_MPIAIJ*)B->data;
236bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)b->spptr;
2379ae82921SPaul Mullowney   PetscErrorCode     ierr;
2389ae82921SPaul Mullowney   PetscInt           i;
2399ae82921SPaul Mullowney 
2409ae82921SPaul Mullowney   PetscFunctionBegin;
2419ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->rmap);CHKERRQ(ierr);
2429ae82921SPaul Mullowney   ierr = PetscLayoutSetUp(B->cmap);CHKERRQ(ierr);
2437e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && d_nnz) {
2449ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
2459ae82921SPaul Mullowney       if (d_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"d_nnz cannot be less than 0: local row %D value %D",i,d_nnz[i]);
2469ae82921SPaul Mullowney     }
2479ae82921SPaul Mullowney   }
2487e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && o_nnz) {
2499ae82921SPaul Mullowney     for (i=0; i<B->rmap->n; i++) {
2509ae82921SPaul Mullowney       if (o_nnz[i] < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"o_nnz cannot be less than 0: local row %D value %D",i,o_nnz[i]);
2519ae82921SPaul Mullowney     }
2529ae82921SPaul Mullowney   }
2539ae82921SPaul Mullowney   if (!B->preallocated) {
254bbf3fe20SPaul Mullowney     /* Explicitly create 2 MATSEQAIJCUSPARSE matrices. */
2559ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->A);CHKERRQ(ierr);
256b470e4b4SRichard Tran Mills     ierr = MatBindToCPU(b->A,B->boundtocpu);CHKERRQ(ierr);
2579ae82921SPaul Mullowney     ierr = MatSetSizes(b->A,B->rmap->n,B->cmap->n,B->rmap->n,B->cmap->n);CHKERRQ(ierr);
2583bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->A);CHKERRQ(ierr);
2599ae82921SPaul Mullowney     ierr = MatCreate(PETSC_COMM_SELF,&b->B);CHKERRQ(ierr);
260b470e4b4SRichard Tran Mills     ierr = MatBindToCPU(b->B,B->boundtocpu);CHKERRQ(ierr);
2619ae82921SPaul Mullowney     ierr = MatSetSizes(b->B,B->rmap->n,B->cmap->N,B->rmap->n,B->cmap->N);CHKERRQ(ierr);
2623bb1ff40SBarry Smith     ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)b->B);CHKERRQ(ierr);
2639ae82921SPaul Mullowney   }
264d98d7c49SStefano Zampini   ierr = MatSetType(b->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
265d98d7c49SStefano Zampini   ierr = MatSetType(b->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
266d98d7c49SStefano Zampini   if (b->lvec) {
267d98d7c49SStefano Zampini     ierr = VecSetType(b->lvec,VECSEQCUDA);CHKERRQ(ierr);
268d98d7c49SStefano Zampini   }
2699ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->A,d_nz,d_nnz);CHKERRQ(ierr);
2709ae82921SPaul Mullowney   ierr = MatSeqAIJSetPreallocation(b->B,o_nz,o_nnz);CHKERRQ(ierr);
271e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->A,MAT_CUSPARSE_MULT,cusparseStruct->diagGPUMatFormat);CHKERRQ(ierr);
272e057df02SPaul Mullowney   ierr = MatCUSPARSESetFormat(b->B,MAT_CUSPARSE_MULT,cusparseStruct->offdiagGPUMatFormat);CHKERRQ(ierr);
273b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->A,cusparseStruct->handle);CHKERRQ(ierr);
274b06137fdSPaul Mullowney   ierr = MatCUSPARSESetHandle(b->B,cusparseStruct->handle);CHKERRQ(ierr);
275b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->A,cusparseStruct->stream);CHKERRQ(ierr);
276b06137fdSPaul Mullowney   ierr = MatCUSPARSESetStream(b->B,cusparseStruct->stream);CHKERRQ(ierr);
2772205254eSKarl Rupp 
2789ae82921SPaul Mullowney   B->preallocated = PETSC_TRUE;
2799ae82921SPaul Mullowney   PetscFunctionReturn(0);
2809ae82921SPaul Mullowney }
281e057df02SPaul Mullowney 
282*4eb5378fSStefano Zampini typedef struct {
283*4eb5378fSStefano Zampini   Mat         *mp;    /* intermediate products */
284*4eb5378fSStefano Zampini   PetscBool   *mptmp; /* is the intermediate product temporary ? */
285*4eb5378fSStefano Zampini   PetscInt    cp;     /* number of intermediate products */
286*4eb5378fSStefano Zampini 
287*4eb5378fSStefano Zampini   /* support for MatGetBrowsOfAoCols_MPIAIJ for P_oth */
288*4eb5378fSStefano Zampini   PetscInt    *startsj_s,*startsj_r;
289*4eb5378fSStefano Zampini   PetscScalar *bufa;
290*4eb5378fSStefano Zampini   Mat         P_oth;
291*4eb5378fSStefano Zampini 
292*4eb5378fSStefano Zampini   /* may take advantage of merging product->B */
293*4eb5378fSStefano Zampini   Mat Bloc;
294*4eb5378fSStefano Zampini 
295*4eb5378fSStefano Zampini   /* cusparse does not have support to split between symbolic and numeric phases
296*4eb5378fSStefano Zampini      When api_user is true, we don't need to update the numerical values
297*4eb5378fSStefano Zampini      of the temporary storage */
298*4eb5378fSStefano Zampini   PetscBool reusesym;
299*4eb5378fSStefano Zampini 
300*4eb5378fSStefano Zampini   /* support for COO values insertion */
301*4eb5378fSStefano Zampini   PetscScalar *coo_v,*coo_w;
302*4eb5378fSStefano Zampini   PetscSF     sf; /* if present, non-local values insertion (i.e. AtB or PtAP) */
303*4eb5378fSStefano Zampini } MatMatMPIAIJCUSPARSE;
304*4eb5378fSStefano Zampini 
305*4eb5378fSStefano Zampini PetscErrorCode MatDestroy_MatMatMPIAIJCUSPARSE(void *data)
306*4eb5378fSStefano Zampini {
307*4eb5378fSStefano Zampini   MatMatMPIAIJCUSPARSE *mmdata = (MatMatMPIAIJCUSPARSE*)data;
308*4eb5378fSStefano Zampini   PetscInt             i;
309*4eb5378fSStefano Zampini   PetscErrorCode       ierr;
310*4eb5378fSStefano Zampini 
311*4eb5378fSStefano Zampini   PetscFunctionBegin;
312*4eb5378fSStefano Zampini   ierr = PetscFree2(mmdata->startsj_s,mmdata->startsj_r);CHKERRQ(ierr);
313*4eb5378fSStefano Zampini   ierr = PetscFree(mmdata->bufa);CHKERRQ(ierr);
314*4eb5378fSStefano Zampini   ierr = PetscFree(mmdata->coo_v);CHKERRQ(ierr);
315*4eb5378fSStefano Zampini   ierr = PetscFree(mmdata->coo_w);CHKERRQ(ierr);
316*4eb5378fSStefano Zampini   ierr = MatDestroy(&mmdata->P_oth);CHKERRQ(ierr);
317*4eb5378fSStefano Zampini   ierr = MatDestroy(&mmdata->Bloc);CHKERRQ(ierr);
318*4eb5378fSStefano Zampini   ierr = PetscSFDestroy(&mmdata->sf);CHKERRQ(ierr);
319*4eb5378fSStefano Zampini   for (i = 0; i < mmdata->cp; i++) {
320*4eb5378fSStefano Zampini     ierr = MatDestroy(&mmdata->mp[i]);CHKERRQ(ierr);
321*4eb5378fSStefano Zampini   }
322*4eb5378fSStefano Zampini   ierr = PetscFree(mmdata->mp);CHKERRQ(ierr);
323*4eb5378fSStefano Zampini   ierr = PetscFree(mmdata->mptmp);CHKERRQ(ierr);
324*4eb5378fSStefano Zampini   ierr = PetscFree(mmdata);CHKERRQ(ierr);
325*4eb5378fSStefano Zampini   PetscFunctionReturn(0);
326*4eb5378fSStefano Zampini }
327*4eb5378fSStefano Zampini 
328*4eb5378fSStefano Zampini static PetscErrorCode MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
329*4eb5378fSStefano Zampini {
330*4eb5378fSStefano Zampini   MatMatMPIAIJCUSPARSE *mmdata;
331*4eb5378fSStefano Zampini   PetscScalar          *tmp;
332*4eb5378fSStefano Zampini   PetscInt             i,n;
333*4eb5378fSStefano Zampini   PetscErrorCode       ierr;
334*4eb5378fSStefano Zampini 
335*4eb5378fSStefano Zampini   PetscFunctionBegin;
336*4eb5378fSStefano Zampini   MatCheckProduct(C,1);
337*4eb5378fSStefano Zampini   if (!C->product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data empty");
338*4eb5378fSStefano Zampini   mmdata = (MatMatMPIAIJCUSPARSE*)C->product->data;
339*4eb5378fSStefano Zampini   tmp = mmdata->sf ? mmdata->coo_w : mmdata->coo_v;
340*4eb5378fSStefano Zampini   if (!mmdata->reusesym) { /* update temporary matrices */
341*4eb5378fSStefano Zampini     if (mmdata->P_oth) {
342*4eb5378fSStefano Zampini       ierr = MatGetBrowsOfAoCols_MPIAIJ(C->product->A,C->product->B,MAT_REUSE_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
343*4eb5378fSStefano Zampini     }
344*4eb5378fSStefano Zampini     if (mmdata->Bloc) {
345*4eb5378fSStefano Zampini       ierr = MatMPIAIJGetLocalMatMerge(C->product->B,MAT_REUSE_MATRIX,NULL,&mmdata->Bloc);CHKERRQ(ierr);
346*4eb5378fSStefano Zampini     }
347*4eb5378fSStefano Zampini   }
348*4eb5378fSStefano Zampini   mmdata->reusesym = PETSC_FALSE;
349*4eb5378fSStefano Zampini   for (i = 0, n = 0; i < mmdata->cp; i++) {
350*4eb5378fSStefano Zampini     Mat_SeqAIJ        *mm = (Mat_SeqAIJ*)mmdata->mp[i]->data;
351*4eb5378fSStefano Zampini     const PetscScalar *vv;
352*4eb5378fSStefano Zampini 
353*4eb5378fSStefano Zampini     if (!mmdata->mp[i]->ops->productnumeric) SETERRQ1(PetscObjectComm((PetscObject)mmdata->mp[i]),PETSC_ERR_PLIB,"Missing numeric op for %s",MatProductTypes[mmdata->mp[i]->product->type]);
354*4eb5378fSStefano Zampini     ierr = (*mmdata->mp[i]->ops->productnumeric)(mmdata->mp[i]);CHKERRQ(ierr);
355*4eb5378fSStefano Zampini     if (mmdata->mptmp[i]) continue;
356*4eb5378fSStefano Zampini     /* TODO: add support for using GPU data directly */
357*4eb5378fSStefano Zampini     ierr = MatSeqAIJGetArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
358*4eb5378fSStefano Zampini     ierr = PetscArraycpy(tmp + n,vv,mm->nz);CHKERRQ(ierr);
359*4eb5378fSStefano Zampini     ierr = MatSeqAIJRestoreArrayRead(mmdata->mp[i],&vv);CHKERRQ(ierr);
360*4eb5378fSStefano Zampini     n   += mm->nz;
361*4eb5378fSStefano Zampini   }
362*4eb5378fSStefano Zampini   if (mmdata->sf) { /* offprocess insertion */
363*4eb5378fSStefano Zampini     ierr = PetscSFGatherBegin(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
364*4eb5378fSStefano Zampini     ierr = PetscSFGatherEnd(mmdata->sf,MPIU_SCALAR,tmp,mmdata->coo_v);CHKERRQ(ierr);
365*4eb5378fSStefano Zampini   }
366*4eb5378fSStefano Zampini   ierr = MatSetValuesCOO(C,mmdata->coo_v,INSERT_VALUES);CHKERRQ(ierr);
367*4eb5378fSStefano Zampini   PetscFunctionReturn(0);
368*4eb5378fSStefano Zampini }
369*4eb5378fSStefano Zampini 
370*4eb5378fSStefano Zampini /* Pt * A or A * P */
371*4eb5378fSStefano Zampini #define MAX_NUMBER_INTERMEDIATE 4
372*4eb5378fSStefano Zampini static PetscErrorCode MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE(Mat C)
373*4eb5378fSStefano Zampini {
374*4eb5378fSStefano Zampini   Mat_Product            *product = C->product;
375*4eb5378fSStefano Zampini   Mat                    A,P,mp[MAX_NUMBER_INTERMEDIATE];
376*4eb5378fSStefano Zampini   Mat_MPIAIJ             *a,*p;
377*4eb5378fSStefano Zampini   MatMatMPIAIJCUSPARSE   *mmdata;
378*4eb5378fSStefano Zampini   ISLocalToGlobalMapping P_oth_l2g = NULL;
379*4eb5378fSStefano Zampini   IS                     glob = NULL;
380*4eb5378fSStefano Zampini   const PetscInt         *globidx,*P_oth_idx;
381*4eb5378fSStefano Zampini   const PetscInt         *cmapa[MAX_NUMBER_INTERMEDIATE],*rmapa[MAX_NUMBER_INTERMEDIATE];
382*4eb5378fSStefano Zampini   PetscInt               cp = 0,m,n,M,N,ncoo,*coo_i,*coo_j,cmapt[MAX_NUMBER_INTERMEDIATE],rmapt[MAX_NUMBER_INTERMEDIATE],i,j;
383*4eb5378fSStefano Zampini   MatProductType         ptype;
384*4eb5378fSStefano Zampini   PetscBool              mptmp[MAX_NUMBER_INTERMEDIATE],hasoffproc = PETSC_FALSE;
385*4eb5378fSStefano Zampini   PetscMPIInt            size;
386*4eb5378fSStefano Zampini   PetscErrorCode         ierr;
387*4eb5378fSStefano Zampini 
388*4eb5378fSStefano Zampini   PetscFunctionBegin;
389*4eb5378fSStefano Zampini   MatCheckProduct(C,1);
390*4eb5378fSStefano Zampini   if (product->data) SETERRQ(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Product data not empty");
391*4eb5378fSStefano Zampini   ptype = product->type;
392*4eb5378fSStefano Zampini   if (product->A->symmetric && ptype == MATPRODUCT_AtB) ptype = MATPRODUCT_AB;
393*4eb5378fSStefano Zampini   switch (ptype) {
394*4eb5378fSStefano Zampini   case MATPRODUCT_AB:
395*4eb5378fSStefano Zampini     A = product->A;
396*4eb5378fSStefano Zampini     P = product->B;
397*4eb5378fSStefano Zampini     m = A->rmap->n;
398*4eb5378fSStefano Zampini     n = P->cmap->n;
399*4eb5378fSStefano Zampini     M = A->rmap->N;
400*4eb5378fSStefano Zampini     N = P->cmap->N;
401*4eb5378fSStefano Zampini     break;
402*4eb5378fSStefano Zampini   case MATPRODUCT_AtB:
403*4eb5378fSStefano Zampini     P = product->A;
404*4eb5378fSStefano Zampini     A = product->B;
405*4eb5378fSStefano Zampini     m = P->cmap->n;
406*4eb5378fSStefano Zampini     n = A->cmap->n;
407*4eb5378fSStefano Zampini     M = P->cmap->N;
408*4eb5378fSStefano Zampini     N = A->cmap->N;
409*4eb5378fSStefano Zampini     hasoffproc = PETSC_TRUE;
410*4eb5378fSStefano Zampini     break;
411*4eb5378fSStefano Zampini   case MATPRODUCT_PtAP:
412*4eb5378fSStefano Zampini     A = product->A;
413*4eb5378fSStefano Zampini     P = product->B;
414*4eb5378fSStefano Zampini     m = P->cmap->n;
415*4eb5378fSStefano Zampini     n = P->cmap->n;
416*4eb5378fSStefano Zampini     M = P->cmap->N;
417*4eb5378fSStefano Zampini     N = P->cmap->N;
418*4eb5378fSStefano Zampini     hasoffproc = PETSC_TRUE;
419*4eb5378fSStefano Zampini     break;
420*4eb5378fSStefano Zampini   default:
421*4eb5378fSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
422*4eb5378fSStefano Zampini   }
423*4eb5378fSStefano Zampini   ierr = MPI_Comm_size(PetscObjectComm((PetscObject)C),&size);CHKERRQ(ierr);
424*4eb5378fSStefano Zampini   if (size == 1) hasoffproc = PETSC_FALSE;
425*4eb5378fSStefano Zampini 
426*4eb5378fSStefano Zampini   for (i=0;i<MAX_NUMBER_INTERMEDIATE;i++) {
427*4eb5378fSStefano Zampini     mp[i]    = NULL;
428*4eb5378fSStefano Zampini     mptmp[i] = PETSC_FALSE;
429*4eb5378fSStefano Zampini     rmapt[i] = 0;
430*4eb5378fSStefano Zampini     cmapt[i] = 0;
431*4eb5378fSStefano Zampini     rmapa[i] = NULL;
432*4eb5378fSStefano Zampini     cmapa[i] = NULL;
433*4eb5378fSStefano Zampini   }
434*4eb5378fSStefano Zampini 
435*4eb5378fSStefano Zampini   a = (Mat_MPIAIJ*)A->data;
436*4eb5378fSStefano Zampini   p = (Mat_MPIAIJ*)P->data;
437*4eb5378fSStefano Zampini   ierr = MatSetSizes(C,m,n,M,N);CHKERRQ(ierr);
438*4eb5378fSStefano Zampini   ierr = PetscLayoutSetUp(C->rmap);CHKERRQ(ierr);
439*4eb5378fSStefano Zampini   ierr = PetscLayoutSetUp(C->cmap);CHKERRQ(ierr);
440*4eb5378fSStefano Zampini   ierr = MatSetType(C,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
441*4eb5378fSStefano Zampini   ierr = PetscNew(&mmdata);CHKERRQ(ierr);
442*4eb5378fSStefano Zampini   mmdata->reusesym = product->api_user;
443*4eb5378fSStefano Zampini   switch (ptype) {
444*4eb5378fSStefano Zampini   case MATPRODUCT_AB: /* A * P */
445*4eb5378fSStefano Zampini     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
446*4eb5378fSStefano Zampini 
447*4eb5378fSStefano Zampini     if (1) { /* A_diag * P_loc and A_off * P_oth TODO: add customization for this */
448*4eb5378fSStefano Zampini       /* P is product->B */
449*4eb5378fSStefano Zampini       ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
450*4eb5378fSStefano Zampini       ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
451*4eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
452*4eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
453*4eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
454*4eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
455*4eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
456*4eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
457*4eb5378fSStefano Zampini       ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
458*4eb5378fSStefano Zampini       rmapt[cp] = 1;
459*4eb5378fSStefano Zampini       cmapt[cp] = 2;
460*4eb5378fSStefano Zampini       cmapa[cp] = globidx;
461*4eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
462*4eb5378fSStefano Zampini       cp++;
463*4eb5378fSStefano Zampini     } else { /* A_diag * P_diag and A_diag * P_off and A_off * P_oth */
464*4eb5378fSStefano Zampini       ierr = MatProductCreate(a->A,p->A,NULL,&mp[cp]);CHKERRQ(ierr);
465*4eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
466*4eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
467*4eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
468*4eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
469*4eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
470*4eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
471*4eb5378fSStefano Zampini       rmapt[cp] = 1;
472*4eb5378fSStefano Zampini       cmapt[cp] = 1;
473*4eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
474*4eb5378fSStefano Zampini       cp++;
475*4eb5378fSStefano Zampini       ierr = MatProductCreate(a->A,p->B,NULL,&mp[cp]);CHKERRQ(ierr);
476*4eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
477*4eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
478*4eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
479*4eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
480*4eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
481*4eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
482*4eb5378fSStefano Zampini       rmapt[cp] = 1;
483*4eb5378fSStefano Zampini       cmapt[cp] = 2;
484*4eb5378fSStefano Zampini       cmapa[cp] = p->garray;
485*4eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
486*4eb5378fSStefano Zampini       cp++;
487*4eb5378fSStefano Zampini     }
488*4eb5378fSStefano Zampini     if (mmdata->P_oth) {
489*4eb5378fSStefano Zampini       ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
490*4eb5378fSStefano Zampini       ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
491*4eb5378fSStefano Zampini       ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
492*4eb5378fSStefano Zampini       ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
493*4eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
494*4eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
495*4eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
496*4eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
497*4eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
498*4eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
499*4eb5378fSStefano Zampini       rmapt[cp] = 1;
500*4eb5378fSStefano Zampini       cmapt[cp] = 2;
501*4eb5378fSStefano Zampini       cmapa[cp] = P_oth_idx;
502*4eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
503*4eb5378fSStefano Zampini       cp++;
504*4eb5378fSStefano Zampini     }
505*4eb5378fSStefano Zampini     break;
506*4eb5378fSStefano Zampini   case MATPRODUCT_AtB: /* (P^t * A): P_diag * A_loc + P_off * A_loc */
507*4eb5378fSStefano Zampini     /* A is product->B */
508*4eb5378fSStefano Zampini     ierr = MatMPIAIJGetLocalMatMerge(A,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
509*4eb5378fSStefano Zampini     ierr = MatProductCreate(p->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
510*4eb5378fSStefano Zampini     ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
511*4eb5378fSStefano Zampini     ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
512*4eb5378fSStefano Zampini     mp[cp]->product->api_user = product->api_user;
513*4eb5378fSStefano Zampini     ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
514*4eb5378fSStefano Zampini     if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
515*4eb5378fSStefano Zampini     ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
516*4eb5378fSStefano Zampini     ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
517*4eb5378fSStefano Zampini     rmapt[cp] = 1;
518*4eb5378fSStefano Zampini     cmapt[cp] = 2;
519*4eb5378fSStefano Zampini     cmapa[cp] = globidx;
520*4eb5378fSStefano Zampini     mptmp[cp] = PETSC_FALSE;
521*4eb5378fSStefano Zampini     cp++;
522*4eb5378fSStefano Zampini     ierr = MatProductCreate(p->B,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
523*4eb5378fSStefano Zampini     ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
524*4eb5378fSStefano Zampini     ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
525*4eb5378fSStefano Zampini     mp[cp]->product->api_user = product->api_user;
526*4eb5378fSStefano Zampini     ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
527*4eb5378fSStefano Zampini     if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
528*4eb5378fSStefano Zampini     ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
529*4eb5378fSStefano Zampini     rmapt[cp] = 2;
530*4eb5378fSStefano Zampini     rmapa[cp] = p->garray;
531*4eb5378fSStefano Zampini     cmapt[cp] = 2;
532*4eb5378fSStefano Zampini     cmapa[cp] = globidx;
533*4eb5378fSStefano Zampini     mptmp[cp] = PETSC_FALSE;
534*4eb5378fSStefano Zampini     cp++;
535*4eb5378fSStefano Zampini     break;
536*4eb5378fSStefano Zampini   case MATPRODUCT_PtAP:
537*4eb5378fSStefano Zampini     ierr = MatGetBrowsOfAoCols_MPIAIJ(A,P,MAT_INITIAL_MATRIX,&mmdata->startsj_s,&mmdata->startsj_r,&mmdata->bufa,&mmdata->P_oth);CHKERRQ(ierr);
538*4eb5378fSStefano Zampini     /* P is product->B */
539*4eb5378fSStefano Zampini     ierr = MatMPIAIJGetLocalMatMerge(P,MAT_INITIAL_MATRIX,&glob,&mmdata->Bloc);CHKERRQ(ierr);
540*4eb5378fSStefano Zampini     ierr = MatProductCreate(a->A,mmdata->Bloc,NULL,&mp[cp]);CHKERRQ(ierr);
541*4eb5378fSStefano Zampini     ierr = MatProductSetType(mp[cp],MATPRODUCT_PtAP);CHKERRQ(ierr);
542*4eb5378fSStefano Zampini     ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
543*4eb5378fSStefano Zampini     mp[cp]->product->api_user = product->api_user;
544*4eb5378fSStefano Zampini     ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
545*4eb5378fSStefano Zampini     if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
546*4eb5378fSStefano Zampini     ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
547*4eb5378fSStefano Zampini     ierr = ISGetIndices(glob,&globidx);CHKERRQ(ierr);
548*4eb5378fSStefano Zampini     rmapt[cp] = 2;
549*4eb5378fSStefano Zampini     rmapa[cp] = globidx;
550*4eb5378fSStefano Zampini     cmapt[cp] = 2;
551*4eb5378fSStefano Zampini     cmapa[cp] = globidx;
552*4eb5378fSStefano Zampini     mptmp[cp] = PETSC_FALSE;
553*4eb5378fSStefano Zampini     cp++;
554*4eb5378fSStefano Zampini     if (mmdata->P_oth) {
555*4eb5378fSStefano Zampini       ierr = MatSeqAIJCompactOutExtraColumns_SeqAIJ(mmdata->P_oth,&P_oth_l2g);CHKERRQ(ierr);
556*4eb5378fSStefano Zampini       ierr = MatSetType(mmdata->P_oth,((PetscObject)(a->B))->type_name);CHKERRQ(ierr);
557*4eb5378fSStefano Zampini       ierr = ISLocalToGlobalMappingGetIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
558*4eb5378fSStefano Zampini       ierr = MatProductCreate(a->B,mmdata->P_oth,NULL,&mp[cp]);CHKERRQ(ierr);
559*4eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AB);CHKERRQ(ierr);
560*4eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
561*4eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
562*4eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
563*4eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
564*4eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
565*4eb5378fSStefano Zampini       mptmp[cp] = PETSC_TRUE;
566*4eb5378fSStefano Zampini       cp++;
567*4eb5378fSStefano Zampini       ierr = MatProductCreate(mmdata->Bloc,mp[1],NULL,&mp[cp]);CHKERRQ(ierr);
568*4eb5378fSStefano Zampini       ierr = MatProductSetType(mp[cp],MATPRODUCT_AtB);CHKERRQ(ierr);
569*4eb5378fSStefano Zampini       ierr = MatProductSetFill(mp[cp],product->fill);CHKERRQ(ierr);
570*4eb5378fSStefano Zampini       mp[cp]->product->api_user = product->api_user;
571*4eb5378fSStefano Zampini       ierr = MatProductSetFromOptions(mp[cp]);CHKERRQ(ierr);
572*4eb5378fSStefano Zampini       if (!mp[cp]->ops->productsymbolic) SETERRQ1(PetscObjectComm((PetscObject)mp[cp]),PETSC_ERR_PLIB,"Missing symbolic op for %s",MatProductTypes[mp[cp]->product->type]);
573*4eb5378fSStefano Zampini       ierr = (*mp[cp]->ops->productsymbolic)(mp[cp]);CHKERRQ(ierr);
574*4eb5378fSStefano Zampini       rmapt[cp] = 2;
575*4eb5378fSStefano Zampini       rmapa[cp] = globidx;
576*4eb5378fSStefano Zampini       cmapt[cp] = 2;
577*4eb5378fSStefano Zampini       cmapa[cp] = P_oth_idx;
578*4eb5378fSStefano Zampini       mptmp[cp] = PETSC_FALSE;
579*4eb5378fSStefano Zampini       cp++;
580*4eb5378fSStefano Zampini     }
581*4eb5378fSStefano Zampini     break;
582*4eb5378fSStefano Zampini   default:
583*4eb5378fSStefano Zampini     SETERRQ1(PetscObjectComm((PetscObject)C),PETSC_ERR_PLIB,"Not for product type %s",MatProductTypes[ptype]);
584*4eb5378fSStefano Zampini   }
585*4eb5378fSStefano Zampini   ierr = PetscMalloc1(cp,&mmdata->mp);CHKERRQ(ierr);
586*4eb5378fSStefano Zampini   for (i = 0; i < cp; i++) mmdata->mp[i] = mp[i];
587*4eb5378fSStefano Zampini   ierr = PetscMalloc1(cp,&mmdata->mptmp);CHKERRQ(ierr);
588*4eb5378fSStefano Zampini   for (i = 0; i < cp; i++) mmdata->mptmp[i] = mptmp[i];
589*4eb5378fSStefano Zampini   mmdata->cp = cp;
590*4eb5378fSStefano Zampini   C->product->data       = mmdata;
591*4eb5378fSStefano Zampini   C->product->destroy    = MatDestroy_MatMatMPIAIJCUSPARSE;
592*4eb5378fSStefano Zampini   C->ops->productnumeric = MatProductNumeric_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
593*4eb5378fSStefano Zampini 
594*4eb5378fSStefano Zampini   /* prepare coo coordinates for values insertion */
595*4eb5378fSStefano Zampini   ncoo = 0;
596*4eb5378fSStefano Zampini   for (cp = 0; cp < mmdata->cp; cp++) {
597*4eb5378fSStefano Zampini     Mat_SeqAIJ *mm = (Mat_SeqAIJ*)mp[cp]->data;
598*4eb5378fSStefano Zampini     if (mptmp[cp]) continue;
599*4eb5378fSStefano Zampini     ncoo += mm->nz;
600*4eb5378fSStefano Zampini   }
601*4eb5378fSStefano Zampini   ierr = PetscMalloc2(ncoo,&coo_i,ncoo,&coo_j);CHKERRQ(ierr);
602*4eb5378fSStefano Zampini   ncoo = 0;
603*4eb5378fSStefano Zampini   for (cp = 0; cp < mmdata->cp; cp++) {
604*4eb5378fSStefano Zampini     Mat_SeqAIJ     *mm = (Mat_SeqAIJ*)mp[cp]->data;
605*4eb5378fSStefano Zampini     PetscInt       *coi = coo_i + ncoo;
606*4eb5378fSStefano Zampini     PetscInt       *coj = coo_j + ncoo;
607*4eb5378fSStefano Zampini     const PetscInt mr = mp[cp]->rmap->n;
608*4eb5378fSStefano Zampini     const PetscInt *jj  = mm->j;
609*4eb5378fSStefano Zampini     const PetscInt *ii  = mm->i;
610*4eb5378fSStefano Zampini 
611*4eb5378fSStefano Zampini     if (mptmp[cp]) continue;
612*4eb5378fSStefano Zampini     /* rows coo */
613*4eb5378fSStefano Zampini     if (!rmapt[cp]) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"This should not happen");
614*4eb5378fSStefano Zampini     else if (rmapt[cp] == 1) { /* local to global for owned rows of  */
615*4eb5378fSStefano Zampini       const PetscInt rs = C->rmap->rstart;
616*4eb5378fSStefano Zampini       for (i = 0; i < mr; i++) {
617*4eb5378fSStefano Zampini         const PetscInt gr = i + rs;
618*4eb5378fSStefano Zampini         for (j = ii[i]; j < ii[i+1]; j++) {
619*4eb5378fSStefano Zampini           coi[j] = gr;
620*4eb5378fSStefano Zampini         }
621*4eb5378fSStefano Zampini       }
622*4eb5378fSStefano Zampini     } else { /* offprocess */
623*4eb5378fSStefano Zampini       const PetscInt *rmap = rmapa[cp];
624*4eb5378fSStefano Zampini       for (i = 0; i < mr; i++) {
625*4eb5378fSStefano Zampini         const PetscInt gr = rmap[i];
626*4eb5378fSStefano Zampini         for (j = ii[i]; j < ii[i+1]; j++) {
627*4eb5378fSStefano Zampini           coi[j] = gr;
628*4eb5378fSStefano Zampini         }
629*4eb5378fSStefano Zampini       }
630*4eb5378fSStefano Zampini     }
631*4eb5378fSStefano Zampini     /* columns coo */
632*4eb5378fSStefano Zampini     if (!cmapt[cp]) {
633*4eb5378fSStefano Zampini       ierr = PetscArraycpy(coj,jj,mm->nz);CHKERRQ(ierr);
634*4eb5378fSStefano Zampini     } else if (cmapt[cp] == 1) { /* local to global for owned columns of P */
635*4eb5378fSStefano Zampini       const PetscInt cs = P->cmap->rstart;
636*4eb5378fSStefano Zampini       for (j = 0; j < mm->nz; j++) {
637*4eb5378fSStefano Zampini         coj[j] = jj[j] + cs;
638*4eb5378fSStefano Zampini       }
639*4eb5378fSStefano Zampini     } else { /* offdiag */
640*4eb5378fSStefano Zampini       const PetscInt *cmap = cmapa[cp];
641*4eb5378fSStefano Zampini       for (j = 0; j < mm->nz; j++) {
642*4eb5378fSStefano Zampini         coj[j] = cmap[jj[j]];
643*4eb5378fSStefano Zampini       }
644*4eb5378fSStefano Zampini     }
645*4eb5378fSStefano Zampini     ncoo += mm->nz;
646*4eb5378fSStefano Zampini   }
647*4eb5378fSStefano Zampini   if (glob) {
648*4eb5378fSStefano Zampini     ierr = ISRestoreIndices(glob,&globidx);CHKERRQ(ierr);
649*4eb5378fSStefano Zampini   }
650*4eb5378fSStefano Zampini   ierr = ISDestroy(&glob);CHKERRQ(ierr);
651*4eb5378fSStefano Zampini   if (P_oth_l2g) {
652*4eb5378fSStefano Zampini     ierr = ISLocalToGlobalMappingRestoreIndices(P_oth_l2g,&P_oth_idx);CHKERRQ(ierr);
653*4eb5378fSStefano Zampini   }
654*4eb5378fSStefano Zampini   ierr = ISLocalToGlobalMappingDestroy(&P_oth_l2g);CHKERRQ(ierr);
655*4eb5378fSStefano Zampini 
656*4eb5378fSStefano Zampini   if (hasoffproc) { /* offproc values insertion */
657*4eb5378fSStefano Zampini     const PetscInt *sfdeg;
658*4eb5378fSStefano Zampini     const PetscInt n = P->cmap->n;
659*4eb5378fSStefano Zampini     PetscInt ncoo2,*coo_i2,*coo_j2;
660*4eb5378fSStefano Zampini 
661*4eb5378fSStefano Zampini     ierr = PetscSFCreate(PetscObjectComm((PetscObject)C),&mmdata->sf);CHKERRQ(ierr);
662*4eb5378fSStefano Zampini     ierr = PetscSFSetGraphLayout(mmdata->sf,P->cmap,ncoo,NULL,PETSC_OWN_POINTER,coo_i);CHKERRQ(ierr);
663*4eb5378fSStefano Zampini     ierr = PetscSFComputeDegreeBegin(mmdata->sf,&sfdeg);CHKERRQ(ierr);
664*4eb5378fSStefano Zampini     ierr = PetscSFComputeDegreeEnd(mmdata->sf,&sfdeg);CHKERRQ(ierr);
665*4eb5378fSStefano Zampini     for (i = 0, ncoo2 = 0; i < n; i++) ncoo2 += sfdeg[i];
666*4eb5378fSStefano Zampini     ierr = PetscMalloc2(ncoo2,&coo_i2,ncoo2,&coo_j2);CHKERRQ(ierr);
667*4eb5378fSStefano Zampini     ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
668*4eb5378fSStefano Zampini     ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_i,coo_i2);CHKERRQ(ierr);
669*4eb5378fSStefano Zampini     ierr = PetscSFGatherBegin(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
670*4eb5378fSStefano Zampini     ierr = PetscSFGatherEnd(mmdata->sf,MPIU_INT,coo_j,coo_j2);CHKERRQ(ierr);
671*4eb5378fSStefano Zampini     ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
672*4eb5378fSStefano Zampini     ierr = PetscMalloc1(ncoo,&mmdata->coo_w);CHKERRQ(ierr);
673*4eb5378fSStefano Zampini     coo_i = coo_i2;
674*4eb5378fSStefano Zampini     coo_j = coo_j2;
675*4eb5378fSStefano Zampini     ncoo  = ncoo2;
676*4eb5378fSStefano Zampini   }
677*4eb5378fSStefano Zampini 
678*4eb5378fSStefano Zampini   /* preallocate with COO data */
679*4eb5378fSStefano Zampini   ierr = MatSetPreallocationCOO(C,ncoo,coo_i,coo_j);CHKERRQ(ierr);
680*4eb5378fSStefano Zampini   ierr = PetscMalloc1(ncoo,&mmdata->coo_v);CHKERRQ(ierr);
681*4eb5378fSStefano Zampini   ierr = PetscFree2(coo_i,coo_j);CHKERRQ(ierr);
682*4eb5378fSStefano Zampini   PetscFunctionReturn(0);
683*4eb5378fSStefano Zampini }
684*4eb5378fSStefano Zampini 
685*4eb5378fSStefano Zampini static PetscErrorCode MatProductSetFromOptions_MPIAIJCUSPARSE(Mat mat)
686*4eb5378fSStefano Zampini {
687*4eb5378fSStefano Zampini   Mat_Product    *product = mat->product;
688*4eb5378fSStefano Zampini   PetscErrorCode ierr;
689*4eb5378fSStefano Zampini   PetscBool      Biscusp = PETSC_FALSE;
690*4eb5378fSStefano Zampini 
691*4eb5378fSStefano Zampini   PetscFunctionBegin;
692*4eb5378fSStefano Zampini   MatCheckProduct(mat,1);
693*4eb5378fSStefano Zampini   if (!product->B->boundtocpu) {
694*4eb5378fSStefano Zampini     ierr = PetscObjectTypeCompare((PetscObject)product->B,MATMPIAIJCUSPARSE,&Biscusp);CHKERRQ(ierr);
695*4eb5378fSStefano Zampini   }
696*4eb5378fSStefano Zampini   if (Biscusp) {
697*4eb5378fSStefano Zampini     switch (product->type) {
698*4eb5378fSStefano Zampini     case MATPRODUCT_AB:
699*4eb5378fSStefano Zampini     case MATPRODUCT_AtB:
700*4eb5378fSStefano Zampini     case MATPRODUCT_PtAP:
701*4eb5378fSStefano Zampini       mat->ops->productsymbolic = MatProductSymbolic_MPIAIJCUSPARSE_MPIAIJCUSPARSE;
702*4eb5378fSStefano Zampini       break;
703*4eb5378fSStefano Zampini     default:
704*4eb5378fSStefano Zampini       break;
705*4eb5378fSStefano Zampini     }
706*4eb5378fSStefano Zampini   }
707*4eb5378fSStefano Zampini   if (!mat->ops->productsymbolic) {
708*4eb5378fSStefano Zampini     ierr = MatProductSetFromOptions_MPIAIJ(mat);CHKERRQ(ierr);
709*4eb5378fSStefano Zampini   }
710*4eb5378fSStefano Zampini   PetscFunctionReturn(0);
711*4eb5378fSStefano Zampini }
712*4eb5378fSStefano Zampini 
7139ae82921SPaul Mullowney PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
7149ae82921SPaul Mullowney {
7159ae82921SPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
7169ae82921SPaul Mullowney   PetscErrorCode ierr;
7179ae82921SPaul Mullowney   PetscInt       nt;
7189ae82921SPaul Mullowney 
7199ae82921SPaul Mullowney   PetscFunctionBegin;
7209ae82921SPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
7219ae82921SPaul Mullowney   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
7229ae82921SPaul Mullowney   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
7234d55d066SJunchao Zhang   ierr = (*a->A->ops->mult)(a->A,xx,yy);CHKERRQ(ierr);
7249ae82921SPaul Mullowney   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
7259ae82921SPaul Mullowney   ierr = (*a->B->ops->multadd)(a->B,a->lvec,yy,yy);CHKERRQ(ierr);
7269ae82921SPaul Mullowney   PetscFunctionReturn(0);
7279ae82921SPaul Mullowney }
728ca45077fSPaul Mullowney 
7293fa6b06aSMark Adams PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
7303fa6b06aSMark Adams {
7313fa6b06aSMark Adams   Mat_MPIAIJ     *l = (Mat_MPIAIJ*)A->data;
7323fa6b06aSMark Adams   PetscErrorCode ierr;
7333fa6b06aSMark Adams 
7343fa6b06aSMark Adams   PetscFunctionBegin;
7353fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
7363fa6b06aSMark Adams     Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)l->spptr;
7373fa6b06aSMark Adams     PetscSplitCSRDataStructure *d_mat = spptr->deviceMat;
7383fa6b06aSMark Adams     if (d_mat) {
7393fa6b06aSMark Adams       Mat_SeqAIJ   *a = (Mat_SeqAIJ*)l->A->data;
7403fa6b06aSMark Adams       Mat_SeqAIJ   *b = (Mat_SeqAIJ*)l->B->data;
7413fa6b06aSMark Adams       PetscInt     n = A->rmap->n, nnza = a->i[n], nnzb = b->i[n];
7423fa6b06aSMark Adams       cudaError_t  err;
7433fa6b06aSMark Adams       PetscScalar  *vals;
7443fa6b06aSMark Adams       ierr = PetscInfo(A,"Zero device matrix diag and offfdiag\n");CHKERRQ(ierr);
7453fa6b06aSMark Adams       err = cudaMemcpy( &vals, &d_mat->diag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
7463fa6b06aSMark Adams       err = cudaMemset( vals, 0, (nnza)*sizeof(PetscScalar));CHKERRCUDA(err);
7473fa6b06aSMark Adams       err = cudaMemcpy( &vals, &d_mat->offdiag.a, sizeof(PetscScalar*), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
7483fa6b06aSMark Adams       err = cudaMemset( vals, 0, (nnzb)*sizeof(PetscScalar));CHKERRCUDA(err);
7493fa6b06aSMark Adams     }
7503fa6b06aSMark Adams   }
7513fa6b06aSMark Adams   ierr = MatZeroEntries(l->A);CHKERRQ(ierr);
7523fa6b06aSMark Adams   ierr = MatZeroEntries(l->B);CHKERRQ(ierr);
7533fa6b06aSMark Adams 
7543fa6b06aSMark Adams   PetscFunctionReturn(0);
7553fa6b06aSMark Adams }
756fdc842d1SBarry Smith PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)
757fdc842d1SBarry Smith {
758fdc842d1SBarry Smith   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
759fdc842d1SBarry Smith   PetscErrorCode ierr;
760fdc842d1SBarry Smith   PetscInt       nt;
761fdc842d1SBarry Smith 
762fdc842d1SBarry Smith   PetscFunctionBegin;
763fdc842d1SBarry Smith   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
764fdc842d1SBarry Smith   if (nt != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->cmap->n,nt);
765fdc842d1SBarry Smith   ierr = VecScatterBegin(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
7664d55d066SJunchao Zhang   ierr = (*a->A->ops->multadd)(a->A,xx,yy,zz);CHKERRQ(ierr);
767fdc842d1SBarry Smith   ierr = VecScatterEnd(a->Mvctx,xx,a->lvec,INSERT_VALUES,SCATTER_FORWARD);CHKERRQ(ierr);
768fdc842d1SBarry Smith   ierr = (*a->B->ops->multadd)(a->B,a->lvec,zz,zz);CHKERRQ(ierr);
769fdc842d1SBarry Smith   PetscFunctionReturn(0);
770fdc842d1SBarry Smith }
771fdc842d1SBarry Smith 
772ca45077fSPaul Mullowney PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)
773ca45077fSPaul Mullowney {
774ca45077fSPaul Mullowney   Mat_MPIAIJ     *a = (Mat_MPIAIJ*)A->data;
775ca45077fSPaul Mullowney   PetscErrorCode ierr;
776ca45077fSPaul Mullowney   PetscInt       nt;
777ca45077fSPaul Mullowney 
778ca45077fSPaul Mullowney   PetscFunctionBegin;
779ca45077fSPaul Mullowney   ierr = VecGetLocalSize(xx,&nt);CHKERRQ(ierr);
780ccf5f80bSJunchao Zhang   if (nt != A->rmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Incompatible partition of A (%D) and xx (%D)",A->rmap->n,nt);
7819b2db222SKarl Rupp   ierr = (*a->B->ops->multtranspose)(a->B,xx,a->lvec);CHKERRQ(ierr);
782ca45077fSPaul Mullowney   ierr = (*a->A->ops->multtranspose)(a->A,xx,yy);CHKERRQ(ierr);
7839b2db222SKarl Rupp   ierr = VecScatterBegin(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
7849b2db222SKarl Rupp   ierr = VecScatterEnd(a->Mvctx,a->lvec,yy,ADD_VALUES,SCATTER_REVERSE);CHKERRQ(ierr);
785ca45077fSPaul Mullowney   PetscFunctionReturn(0);
786ca45077fSPaul Mullowney }
7879ae82921SPaul Mullowney 
788e057df02SPaul Mullowney PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)
789ca45077fSPaul Mullowney {
790ca45077fSPaul Mullowney   Mat_MPIAIJ         *a               = (Mat_MPIAIJ*)A->data;
791bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE * cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
792e057df02SPaul Mullowney 
793ca45077fSPaul Mullowney   PetscFunctionBegin;
794e057df02SPaul Mullowney   switch (op) {
795e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_DIAG:
796e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat = format;
797045c96e1SPaul Mullowney     break;
798e057df02SPaul Mullowney   case MAT_CUSPARSE_MULT_OFFDIAG:
799e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
800045c96e1SPaul Mullowney     break;
801e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
802e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
803e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
804045c96e1SPaul Mullowney     break;
805e057df02SPaul Mullowney   default:
806e057df02SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.",op);
807045c96e1SPaul Mullowney   }
808ca45077fSPaul Mullowney   PetscFunctionReturn(0);
809ca45077fSPaul Mullowney }
810e057df02SPaul Mullowney 
8114416b707SBarry Smith PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(PetscOptionItems *PetscOptionsObject,Mat A)
812e057df02SPaul Mullowney {
813e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
814e057df02SPaul Mullowney   PetscErrorCode           ierr;
815e057df02SPaul Mullowney   PetscBool                flg;
816a183c035SDominic Meiser   Mat_MPIAIJ               *a = (Mat_MPIAIJ*)A->data;
817a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE       *cusparseStruct = (Mat_MPIAIJCUSPARSE*)a->spptr;
8185fd66863SKarl Rupp 
819e057df02SPaul Mullowney   PetscFunctionBegin;
820e55864a3SBarry Smith   ierr = PetscOptionsHead(PetscOptionsObject,"MPIAIJCUSPARSE options");CHKERRQ(ierr);
821e057df02SPaul Mullowney   if (A->factortype==MAT_FACTOR_NONE) {
822e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format","sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV",
823a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
824e057df02SPaul Mullowney     if (flg) {
825e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_DIAG,format);CHKERRQ(ierr);
826e057df02SPaul Mullowney     }
827e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format","sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
828a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->offdiagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
829e057df02SPaul Mullowney     if (flg) {
830e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_MULT_OFFDIAG,format);CHKERRQ(ierr);
831e057df02SPaul Mullowney     }
832e057df02SPaul Mullowney     ierr = PetscOptionsEnum("-mat_cusparse_storage_format","sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV",
833a183c035SDominic Meiser                             "MatCUSPARSESetFormat",MatCUSPARSEStorageFormats,(PetscEnum)cusparseStruct->diagGPUMatFormat,(PetscEnum*)&format,&flg);CHKERRQ(ierr);
834e057df02SPaul Mullowney     if (flg) {
835e057df02SPaul Mullowney       ierr = MatCUSPARSESetFormat(A,MAT_CUSPARSE_ALL,format);CHKERRQ(ierr);
836e057df02SPaul Mullowney     }
837e057df02SPaul Mullowney   }
8380af67c1bSStefano Zampini   ierr = PetscOptionsTail();CHKERRQ(ierr);
839e057df02SPaul Mullowney   PetscFunctionReturn(0);
840e057df02SPaul Mullowney }
841e057df02SPaul Mullowney 
84234d6c7a5SJose E. Roman PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)
84334d6c7a5SJose E. Roman {
84434d6c7a5SJose E. Roman   PetscErrorCode             ierr;
8453fa6b06aSMark Adams   Mat_MPIAIJ                 *mpiaij = (Mat_MPIAIJ*)A->data;
8463fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE         *cusparseStruct = (Mat_MPIAIJCUSPARSE*)mpiaij->spptr;
8473fa6b06aSMark Adams   PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat;
84834d6c7a5SJose E. Roman   PetscFunctionBegin;
84934d6c7a5SJose E. Roman   ierr = MatAssemblyEnd_MPIAIJ(A,mode);CHKERRQ(ierr);
85034d6c7a5SJose E. Roman   if (!A->was_assembled && mode == MAT_FINAL_ASSEMBLY) {
85134d6c7a5SJose E. Roman     ierr = VecSetType(mpiaij->lvec,VECSEQCUDA);CHKERRQ(ierr);
85234d6c7a5SJose E. Roman   }
853a587d139SMark   if (d_mat) {
8543fa6b06aSMark Adams     A->offloadmask = PETSC_OFFLOAD_GPU; // if we assembled on the device
8553fa6b06aSMark Adams   }
8563fa6b06aSMark Adams 
85734d6c7a5SJose E. Roman   PetscFunctionReturn(0);
85834d6c7a5SJose E. Roman }
85934d6c7a5SJose E. Roman 
860bbf3fe20SPaul Mullowney PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
861bbf3fe20SPaul Mullowney {
862bbf3fe20SPaul Mullowney   PetscErrorCode     ierr;
8633fa6b06aSMark Adams   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ*)A->data;
8643fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE*)aij->spptr;
865b06137fdSPaul Mullowney   cudaError_t        err;
866b06137fdSPaul Mullowney   cusparseStatus_t   stat;
867bbf3fe20SPaul Mullowney 
868bbf3fe20SPaul Mullowney   PetscFunctionBegin;
869d98d7c49SStefano Zampini   if (!cusparseStruct) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing spptr");
8703fa6b06aSMark Adams   if (cusparseStruct->deviceMat) {
8713fa6b06aSMark Adams     Mat_SeqAIJ                 *jaca = (Mat_SeqAIJ*)aij->A->data;
8723fa6b06aSMark Adams     Mat_SeqAIJ                 *jacb = (Mat_SeqAIJ*)aij->B->data;
8733fa6b06aSMark Adams     PetscSplitCSRDataStructure *d_mat = cusparseStruct->deviceMat, h_mat;
8743fa6b06aSMark Adams     ierr = PetscInfo(A,"Have device matrix\n");CHKERRQ(ierr);
8753fa6b06aSMark Adams     err = cudaMemcpy( &h_mat, d_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyDeviceToHost);CHKERRCUDA(err);
8763fa6b06aSMark Adams     if (jaca->compressedrow.use) {
8773fa6b06aSMark Adams       err = cudaFree(h_mat.diag.i);CHKERRCUDA(err);
8783fa6b06aSMark Adams     }
8793fa6b06aSMark Adams     if (jacb->compressedrow.use) {
8803fa6b06aSMark Adams       err = cudaFree(h_mat.offdiag.i);CHKERRCUDA(err);
8813fa6b06aSMark Adams     }
8823fa6b06aSMark Adams     err = cudaFree(h_mat.colmap);CHKERRCUDA(err);
8833fa6b06aSMark Adams     err = cudaFree(d_mat);CHKERRCUDA(err);
8843fa6b06aSMark Adams   }
885bbf3fe20SPaul Mullowney   try {
8863fa6b06aSMark Adams     if (aij->A) { ierr = MatCUSPARSEClearHandle(aij->A);CHKERRQ(ierr); }
8873fa6b06aSMark Adams     if (aij->B) { ierr = MatCUSPARSEClearHandle(aij->B);CHKERRQ(ierr); }
88857d48284SJunchao Zhang     stat = cusparseDestroy(cusparseStruct->handle);CHKERRCUSPARSE(stat);
88917403302SKarl Rupp     if (cusparseStruct->stream) {
890c41cb2e2SAlejandro Lamas Daviña       err = cudaStreamDestroy(cusparseStruct->stream);CHKERRCUDA(err);
89117403302SKarl Rupp     }
8927e8381f9SStefano Zampini     delete cusparseStruct->coo_p;
8937e8381f9SStefano Zampini     delete cusparseStruct->coo_pw;
894bbf3fe20SPaul Mullowney     delete cusparseStruct;
895bbf3fe20SPaul Mullowney   } catch(char *ex) {
896bbf3fe20SPaul Mullowney     SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"Mat_MPIAIJCUSPARSE error: %s", ex);
897bbf3fe20SPaul Mullowney   }
8983338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",NULL);CHKERRQ(ierr);
899ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",NULL);CHKERRQ(ierr);
9007e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",NULL);CHKERRQ(ierr);
9017e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",NULL);CHKERRQ(ierr);
9023338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",NULL);CHKERRQ(ierr);
903bbf3fe20SPaul Mullowney   ierr = MatDestroy_MPIAIJ(A);CHKERRQ(ierr);
904bbf3fe20SPaul Mullowney   PetscFunctionReturn(0);
905bbf3fe20SPaul Mullowney }
906ca45077fSPaul Mullowney 
9073338378cSStefano Zampini PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType mtype, MatReuse reuse, Mat* newmat)
9089ae82921SPaul Mullowney {
9099ae82921SPaul Mullowney   PetscErrorCode     ierr;
910bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *a;
911bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct;
912b06137fdSPaul Mullowney   cusparseStatus_t   stat;
9133338378cSStefano Zampini   Mat                A;
9149ae82921SPaul Mullowney 
9159ae82921SPaul Mullowney   PetscFunctionBegin;
9163338378cSStefano Zampini   if (reuse == MAT_INITIAL_MATRIX) {
9173338378cSStefano Zampini     ierr = MatDuplicate(B,MAT_COPY_VALUES,newmat);CHKERRQ(ierr);
9183338378cSStefano Zampini   } else if (reuse == MAT_REUSE_MATRIX) {
9193338378cSStefano Zampini     ierr = MatCopy(B,*newmat,SAME_NONZERO_PATTERN);CHKERRQ(ierr);
9203338378cSStefano Zampini   }
9213338378cSStefano Zampini   A = *newmat;
9223338378cSStefano Zampini 
92334136279SStefano Zampini   ierr = PetscFree(A->defaultvectype);CHKERRQ(ierr);
92434136279SStefano Zampini   ierr = PetscStrallocpy(VECCUDA,&A->defaultvectype);CHKERRQ(ierr);
92534136279SStefano Zampini 
926bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ*)A->data;
927d98d7c49SStefano Zampini   if (a->A) { ierr = MatSetType(a->A,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
928d98d7c49SStefano Zampini   if (a->B) { ierr = MatSetType(a->B,MATSEQAIJCUSPARSE);CHKERRQ(ierr); }
929d98d7c49SStefano Zampini   if (a->lvec) {
930d98d7c49SStefano Zampini     ierr = VecSetType(a->lvec,VECSEQCUDA);CHKERRQ(ierr);
931d98d7c49SStefano Zampini   }
932d98d7c49SStefano Zampini 
9333338378cSStefano Zampini   if (reuse != MAT_REUSE_MATRIX && !a->spptr) {
934bbf3fe20SPaul Mullowney     a->spptr = new Mat_MPIAIJCUSPARSE;
9352205254eSKarl Rupp 
936bbf3fe20SPaul Mullowney     cusparseStruct                      = (Mat_MPIAIJCUSPARSE*)a->spptr;
937e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = MAT_CUSPARSE_CSR;
938e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = MAT_CUSPARSE_CSR;
9397e8381f9SStefano Zampini     cusparseStruct->coo_p               = NULL;
9407e8381f9SStefano Zampini     cusparseStruct->coo_pw              = NULL;
94117403302SKarl Rupp     cusparseStruct->stream              = 0;
94257d48284SJunchao Zhang     stat = cusparseCreate(&(cusparseStruct->handle));CHKERRCUSPARSE(stat);
9433fa6b06aSMark Adams     cusparseStruct->deviceMat = NULL;
9443338378cSStefano Zampini   }
9452205254eSKarl Rupp 
94634d6c7a5SJose E. Roman   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
947bbf3fe20SPaul Mullowney   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
948fdc842d1SBarry Smith   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
949bbf3fe20SPaul Mullowney   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
950bbf3fe20SPaul Mullowney   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
951bbf3fe20SPaul Mullowney   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
9523fa6b06aSMark Adams   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
953*4eb5378fSStefano Zampini   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJCUSPARSE;
9542205254eSKarl Rupp 
955bbf3fe20SPaul Mullowney   ierr = PetscObjectChangeTypeName((PetscObject)A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
956ed502f03SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJGetLocalMatMerge_C",MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE);CHKERRQ(ierr);
9573338378cSStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatMPIAIJSetPreallocation_C",MatMPIAIJSetPreallocation_MPIAIJCUSPARSE);CHKERRQ(ierr);
958bdf89e91SBarry Smith   ierr = PetscObjectComposeFunction((PetscObject)A,"MatCUSPARSESetFormat_C",MatCUSPARSESetFormat_MPIAIJCUSPARSE);CHKERRQ(ierr);
9597e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetPreallocationCOO_C",MatSetPreallocationCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
9607e8381f9SStefano Zampini   ierr = PetscObjectComposeFunction((PetscObject)A,"MatSetValuesCOO_C",MatSetValuesCOO_MPIAIJCUSPARSE);CHKERRQ(ierr);
9619ae82921SPaul Mullowney   PetscFunctionReturn(0);
9629ae82921SPaul Mullowney }
9639ae82921SPaul Mullowney 
9643338378cSStefano Zampini PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
9653338378cSStefano Zampini {
9663338378cSStefano Zampini   PetscErrorCode ierr;
9673338378cSStefano Zampini 
9683338378cSStefano Zampini   PetscFunctionBegin;
96905035670SJunchao Zhang   ierr = PetscCUDAInitializeCheck();CHKERRQ(ierr);
9703338378cSStefano Zampini   ierr = MatCreate_MPIAIJ(A);CHKERRQ(ierr);
9713338378cSStefano Zampini   ierr = MatConvert_MPIAIJ_MPIAIJCUSPARSE(A,MATMPIAIJCUSPARSE,MAT_INPLACE_MATRIX,&A);CHKERRQ(ierr);
9723338378cSStefano Zampini   PetscFunctionReturn(0);
9733338378cSStefano Zampini }
9743338378cSStefano Zampini 
9753f9c0db1SPaul Mullowney /*@
9763f9c0db1SPaul Mullowney    MatCreateAIJCUSPARSE - Creates a sparse matrix in AIJ (compressed row) format
977e057df02SPaul Mullowney    (the default parallel PETSc format).  This matrix will ultimately pushed down
9783f9c0db1SPaul Mullowney    to NVidia GPUs and use the CUSPARSE library for calculations. For good matrix
979e057df02SPaul Mullowney    assembly performance the user should preallocate the matrix storage by setting
980e057df02SPaul Mullowney    the parameter nz (or the array nnz).  By setting these parameters accurately,
981e057df02SPaul Mullowney    performance during matrix assembly can be increased by more than a factor of 50.
9829ae82921SPaul Mullowney 
983d083f849SBarry Smith    Collective
984e057df02SPaul Mullowney 
985e057df02SPaul Mullowney    Input Parameters:
986e057df02SPaul Mullowney +  comm - MPI communicator, set to PETSC_COMM_SELF
987e057df02SPaul Mullowney .  m - number of rows
988e057df02SPaul Mullowney .  n - number of columns
989e057df02SPaul Mullowney .  nz - number of nonzeros per row (same for all rows)
990e057df02SPaul Mullowney -  nnz - array containing the number of nonzeros in the various rows
9910298fd71SBarry Smith          (possibly different for each row) or NULL
992e057df02SPaul Mullowney 
993e057df02SPaul Mullowney    Output Parameter:
994e057df02SPaul Mullowney .  A - the matrix
995e057df02SPaul Mullowney 
996e057df02SPaul Mullowney    It is recommended that one use the MatCreate(), MatSetType() and/or MatSetFromOptions(),
997e057df02SPaul Mullowney    MatXXXXSetPreallocation() paradigm instead of this routine directly.
998e057df02SPaul Mullowney    [MatXXXXSetPreallocation() is, for example, MatSeqAIJSetPreallocation]
999e057df02SPaul Mullowney 
1000e057df02SPaul Mullowney    Notes:
1001e057df02SPaul Mullowney    If nnz is given then nz is ignored
1002e057df02SPaul Mullowney 
1003e057df02SPaul Mullowney    The AIJ format (also called the Yale sparse matrix format or
1004e057df02SPaul Mullowney    compressed row storage), is fully compatible with standard Fortran 77
1005e057df02SPaul Mullowney    storage.  That is, the stored row and column indices can begin at
1006e057df02SPaul Mullowney    either one (as in Fortran) or zero.  See the users' manual for details.
1007e057df02SPaul Mullowney 
1008e057df02SPaul Mullowney    Specify the preallocated storage with either nz or nnz (not both).
10090298fd71SBarry Smith    Set nz=PETSC_DEFAULT and nnz=NULL for PETSc to control dynamic memory
1010e057df02SPaul Mullowney    allocation.  For large problems you MUST preallocate memory or you
1011e057df02SPaul Mullowney    will get TERRIBLE performance, see the users' manual chapter on matrices.
1012e057df02SPaul Mullowney 
1013e057df02SPaul Mullowney    By default, this format uses inodes (identical nodes) when possible, to
1014e057df02SPaul Mullowney    improve numerical efficiency of matrix-vector products and solves. We
1015e057df02SPaul Mullowney    search for consecutive rows with the same nonzero structure, thereby
1016e057df02SPaul Mullowney    reusing matrix information to achieve increased efficiency.
1017e057df02SPaul Mullowney 
1018e057df02SPaul Mullowney    Level: intermediate
1019e057df02SPaul Mullowney 
1020e057df02SPaul Mullowney .seealso: MatCreate(), MatCreateAIJ(), MatSetValues(), MatSeqAIJSetColumnIndices(), MatCreateSeqAIJWithArrays(), MatCreateAIJ(), MATMPIAIJCUSPARSE, MATAIJCUSPARSE
1021e057df02SPaul Mullowney @*/
10229ae82921SPaul Mullowney PetscErrorCode  MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat *A)
10239ae82921SPaul Mullowney {
10249ae82921SPaul Mullowney   PetscErrorCode ierr;
10259ae82921SPaul Mullowney   PetscMPIInt    size;
10269ae82921SPaul Mullowney 
10279ae82921SPaul Mullowney   PetscFunctionBegin;
10289ae82921SPaul Mullowney   ierr = MatCreate(comm,A);CHKERRQ(ierr);
10299ae82921SPaul Mullowney   ierr = MatSetSizes(*A,m,n,M,N);CHKERRQ(ierr);
1030ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
10319ae82921SPaul Mullowney   if (size > 1) {
10329ae82921SPaul Mullowney     ierr = MatSetType(*A,MATMPIAIJCUSPARSE);CHKERRQ(ierr);
10339ae82921SPaul Mullowney     ierr = MatMPIAIJSetPreallocation(*A,d_nz,d_nnz,o_nz,o_nnz);CHKERRQ(ierr);
10349ae82921SPaul Mullowney   } else {
10359ae82921SPaul Mullowney     ierr = MatSetType(*A,MATSEQAIJCUSPARSE);CHKERRQ(ierr);
10369ae82921SPaul Mullowney     ierr = MatSeqAIJSetPreallocation(*A,d_nz,d_nnz);CHKERRQ(ierr);
10379ae82921SPaul Mullowney   }
10389ae82921SPaul Mullowney   PetscFunctionReturn(0);
10399ae82921SPaul Mullowney }
10409ae82921SPaul Mullowney 
10413ca39a21SBarry Smith /*MC
1042e057df02SPaul Mullowney    MATAIJCUSPARSE - MATMPIAIJCUSPARSE = "aijcusparse" = "mpiaijcusparse" - A matrix type to be used for sparse matrices.
1043e057df02SPaul Mullowney 
10442692e278SPaul Mullowney    A matrix type type whose data resides on Nvidia GPUs. These matrices can be in either
10452692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
10462692e278SPaul Mullowney    All matrix calculations are performed on Nvidia GPUs using the CUSPARSE library.
10479ae82921SPaul Mullowney 
10489ae82921SPaul Mullowney    This matrix type is identical to MATSEQAIJCUSPARSE when constructed with a single process communicator,
10499ae82921SPaul Mullowney    and MATMPIAIJCUSPARSE otherwise.  As a result, for single process communicators,
10509ae82921SPaul Mullowney    MatSeqAIJSetPreallocation is supported, and similarly MatMPIAIJSetPreallocation is supported
10519ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
10529ae82921SPaul Mullowney    the above preallocation routines for simplicity.
10539ae82921SPaul Mullowney 
10549ae82921SPaul Mullowney    Options Database Keys:
1055e057df02SPaul Mullowney +  -mat_type mpiaijcusparse - sets the matrix type to "mpiaijcusparse" during a call to MatSetFromOptions()
10568468deeeSKarl Rupp .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
10578468deeeSKarl Rupp .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
10588468deeeSKarl Rupp -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix during a call to MatSetFromOptions(). Other options include ell (ellpack) or hyb (hybrid).
10599ae82921SPaul Mullowney 
10609ae82921SPaul Mullowney   Level: beginner
10619ae82921SPaul Mullowney 
10628468deeeSKarl Rupp  .seealso: MatCreateAIJCUSPARSE(), MATSEQAIJCUSPARSE, MatCreateSeqAIJCUSPARSE(), MatCUSPARSESetFormat(), MatCUSPARSEStorageFormat, MatCUSPARSEFormatOperation
10638468deeeSKarl Rupp M
10649ae82921SPaul Mullowney M*/
10653fa6b06aSMark Adams 
10663fa6b06aSMark Adams // get GPU pointer to stripped down Mat. For both Seq and MPI Mat.
10673fa6b06aSMark Adams PetscErrorCode MatCUSPARSEGetDeviceMatWrite(Mat A, PetscSplitCSRDataStructure **B)
10683fa6b06aSMark Adams {
10699db3cbf9SStefano Zampini #if defined(PETSC_USE_CTABLE)
10709db3cbf9SStefano Zampini   SETERRQ(PetscObjectComm((PetscObject)A),PETSC_ERR_SUP,"Device metadata does not support ctable (--with-ctable=0)");
10719db3cbf9SStefano Zampini #else
10723fa6b06aSMark Adams   PetscSplitCSRDataStructure **p_d_mat;
10733fa6b06aSMark Adams   PetscMPIInt                size,rank;
10743fa6b06aSMark Adams   MPI_Comm                   comm;
10753fa6b06aSMark Adams   PetscErrorCode             ierr;
10763fa6b06aSMark Adams   int                        *ai,*bi,*aj,*bj;
10773fa6b06aSMark Adams   PetscScalar                *aa,*ba;
10783fa6b06aSMark Adams 
10793fa6b06aSMark Adams   PetscFunctionBegin;
10803fa6b06aSMark Adams   ierr = PetscObjectGetComm((PetscObject)A,&comm);CHKERRQ(ierr);
1081ffc4695bSBarry Smith   ierr = MPI_Comm_size(comm,&size);CHKERRMPI(ierr);
1082ffc4695bSBarry Smith   ierr = MPI_Comm_rank(comm,&rank);CHKERRMPI(ierr);
10833fa6b06aSMark Adams   if (A->factortype == MAT_FACTOR_NONE) {
10843fa6b06aSMark Adams     CsrMatrix *matrixA,*matrixB=NULL;
10853fa6b06aSMark Adams     if (size == 1) {
10863fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestruct = (Mat_SeqAIJCUSPARSE*)A->spptr;
10873fa6b06aSMark Adams       p_d_mat = &cusparsestruct->deviceMat;
10883fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstruct = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestruct->mat;
10893fa6b06aSMark Adams       if (cusparsestruct->format==MAT_CUSPARSE_CSR) {
10903fa6b06aSMark Adams         matrixA = (CsrMatrix*)matstruct->mat;
10913fa6b06aSMark Adams         bi = bj = NULL; ba = NULL;
10926b78a27eSPierre Jolivet       } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat needs MAT_CUSPARSE_CSR");
10933fa6b06aSMark Adams     } else {
10943fa6b06aSMark Adams       Mat_MPIAIJ         *aij = (Mat_MPIAIJ*)A->data;
10953fa6b06aSMark Adams       Mat_MPIAIJCUSPARSE *spptr = (Mat_MPIAIJCUSPARSE*)aij->spptr;
10963fa6b06aSMark Adams       p_d_mat = &spptr->deviceMat;
10973fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructA = (Mat_SeqAIJCUSPARSE*)aij->A->spptr;
10983fa6b06aSMark Adams       Mat_SeqAIJCUSPARSE *cusparsestructB = (Mat_SeqAIJCUSPARSE*)aij->B->spptr;
10993fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructA = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructA->mat;
11003fa6b06aSMark Adams       Mat_SeqAIJCUSPARSEMultStruct *matstructB = (Mat_SeqAIJCUSPARSEMultStruct*)cusparsestructB->mat;
11013fa6b06aSMark Adams       if (cusparsestructA->format==MAT_CUSPARSE_CSR) {
11023fa6b06aSMark Adams         if (cusparsestructB->format!=MAT_CUSPARSE_CSR) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat B needs MAT_CUSPARSE_CSR");
11033fa6b06aSMark Adams         matrixA = (CsrMatrix*)matstructA->mat;
11043fa6b06aSMark Adams         matrixB = (CsrMatrix*)matstructB->mat;
11053fa6b06aSMark Adams         bi = thrust::raw_pointer_cast(matrixB->row_offsets->data());
11063fa6b06aSMark Adams         bj = thrust::raw_pointer_cast(matrixB->column_indices->data());
11073fa6b06aSMark Adams         ba = thrust::raw_pointer_cast(matrixB->values->data());
11086b78a27eSPierre Jolivet       } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Device Mat A needs MAT_CUSPARSE_CSR");
11093fa6b06aSMark Adams     }
11103fa6b06aSMark Adams     ai = thrust::raw_pointer_cast(matrixA->row_offsets->data());
11113fa6b06aSMark Adams     aj = thrust::raw_pointer_cast(matrixA->column_indices->data());
11123fa6b06aSMark Adams     aa = thrust::raw_pointer_cast(matrixA->values->data());
11133fa6b06aSMark Adams   } else {
11143fa6b06aSMark Adams     *B = NULL;
11153fa6b06aSMark Adams     PetscFunctionReturn(0);
11163fa6b06aSMark Adams   }
11173fa6b06aSMark Adams   // act like MatSetValues because not called on host
11183fa6b06aSMark Adams   if (A->assembled) {
11193fa6b06aSMark Adams     if (A->was_assembled) {
11203fa6b06aSMark Adams       ierr = PetscInfo(A,"Assemble more than once already\n");CHKERRQ(ierr);
11213fa6b06aSMark Adams     }
11223fa6b06aSMark Adams     A->was_assembled = PETSC_TRUE; // this is done (lazy) in MatAssemble but we are not calling it anymore - done in AIJ AssemblyEnd, need here?
11233fa6b06aSMark Adams   } else {
11243fa6b06aSMark Adams     SETERRQ(comm,PETSC_ERR_SUP,"Need assemble matrix");
11253fa6b06aSMark Adams   }
11263fa6b06aSMark Adams   if (!*p_d_mat) {
11273fa6b06aSMark Adams     cudaError_t                 err;
11283fa6b06aSMark Adams     PetscSplitCSRDataStructure  *d_mat, h_mat;
11293fa6b06aSMark Adams     Mat_SeqAIJ                  *jaca;
11303fa6b06aSMark Adams     PetscInt                    n = A->rmap->n, nnz;
11313fa6b06aSMark Adams     // create and copy
11323fa6b06aSMark Adams     ierr = PetscInfo(A,"Create device matrix\n");CHKERRQ(ierr);
11333fa6b06aSMark Adams     err = cudaMalloc((void **)&d_mat, sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
11343fa6b06aSMark Adams     err = cudaMemset( d_mat, 0,       sizeof(PetscSplitCSRDataStructure));CHKERRCUDA(err);
11353fa6b06aSMark Adams     *B = *p_d_mat = d_mat; // return it, set it in Mat, and set it up
11363fa6b06aSMark Adams     if (size == 1) {
11373fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)A->data;
11383fa6b06aSMark Adams       h_mat.rstart = 0; h_mat.rend = A->rmap->n;
11393fa6b06aSMark Adams       h_mat.cstart = 0; h_mat.cend = A->cmap->n;
11403fa6b06aSMark Adams       h_mat.offdiag.i = h_mat.offdiag.j = NULL;
11413fa6b06aSMark Adams       h_mat.offdiag.a = NULL;
11423fa6b06aSMark Adams     } else {
11433fa6b06aSMark Adams       Mat_MPIAIJ  *aij = (Mat_MPIAIJ*)A->data;
11443fa6b06aSMark Adams       Mat_SeqAIJ  *jacb;
11453fa6b06aSMark Adams       jaca = (Mat_SeqAIJ*)aij->A->data;
11463fa6b06aSMark Adams       jacb = (Mat_SeqAIJ*)aij->B->data;
11473fa6b06aSMark Adams       if (!aij->garray) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"MPIAIJ Matrix was assembled but is missing garray");
11483fa6b06aSMark Adams       if (aij->B->rmap->n != aij->A->rmap->n) SETERRQ(comm,PETSC_ERR_SUP,"Only support aij->B->rmap->n == aij->A->rmap->n");
11493fa6b06aSMark Adams       // create colmap - this is ussually done (lazy) in MatSetValues
11503fa6b06aSMark Adams       aij->donotstash = PETSC_TRUE;
11513fa6b06aSMark Adams       aij->A->nooffprocentries = aij->B->nooffprocentries = A->nooffprocentries = PETSC_TRUE;
11523fa6b06aSMark Adams       jaca->nonew = jacb->nonew = PETSC_TRUE; // no more dissassembly
11533fa6b06aSMark Adams       ierr = PetscCalloc1(A->cmap->N+1,&aij->colmap);CHKERRQ(ierr);
11543fa6b06aSMark Adams       aij->colmap[A->cmap->N] = -9;
11553fa6b06aSMark Adams       ierr = PetscLogObjectMemory((PetscObject)A,(A->cmap->N+1)*sizeof(PetscInt));CHKERRQ(ierr);
11563fa6b06aSMark Adams       {
11573fa6b06aSMark Adams         PetscInt ii;
11583fa6b06aSMark Adams         for (ii=0; ii<aij->B->cmap->n; ii++) aij->colmap[aij->garray[ii]] = ii+1;
11593fa6b06aSMark Adams       }
11603fa6b06aSMark Adams       if (aij->colmap[A->cmap->N] != -9) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"aij->colmap[A->cmap->N] != -9");
11613fa6b06aSMark Adams       // allocate B copy data
11623fa6b06aSMark Adams       h_mat.rstart = A->rmap->rstart; h_mat.rend = A->rmap->rend;
11633fa6b06aSMark Adams       h_mat.cstart = A->cmap->rstart; h_mat.cend = A->cmap->rend;
11643fa6b06aSMark Adams       nnz = jacb->i[n];
11653fa6b06aSMark Adams 
11663fa6b06aSMark Adams       if (jacb->compressedrow.use) {
11673fa6b06aSMark Adams         err = cudaMalloc((void **)&h_mat.offdiag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
11683fa6b06aSMark Adams         err = cudaMemcpy(          h_mat.offdiag.i,    jacb->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
11696b78a27eSPierre Jolivet       } else h_mat.offdiag.i = bi;
11703fa6b06aSMark Adams       h_mat.offdiag.j = bj;
11713fa6b06aSMark Adams       h_mat.offdiag.a = ba;
11723fa6b06aSMark Adams 
11733fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.colmap,                  (A->cmap->N+1)*sizeof(PetscInt));CHKERRCUDA(err); // kernel output
11743fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.colmap,    aij->colmap,  (A->cmap->N+1)*sizeof(PetscInt), cudaMemcpyHostToDevice);CHKERRCUDA(err);
11753fa6b06aSMark Adams       h_mat.offdiag.ignorezeroentries = jacb->ignorezeroentries;
11763fa6b06aSMark Adams       h_mat.offdiag.n = n;
11773fa6b06aSMark Adams     }
11783fa6b06aSMark Adams     // allocate A copy data
11793fa6b06aSMark Adams     nnz = jaca->i[n];
11803fa6b06aSMark Adams     h_mat.diag.n = n;
11813fa6b06aSMark Adams     h_mat.diag.ignorezeroentries = jaca->ignorezeroentries;
1182ffc4695bSBarry Smith     ierr = MPI_Comm_rank(comm,&h_mat.rank);CHKERRMPI(ierr);
11833fa6b06aSMark Adams     if (jaca->compressedrow.use) {
11843fa6b06aSMark Adams       err = cudaMalloc((void **)&h_mat.diag.i,               (n+1)*sizeof(int));CHKERRCUDA(err); // kernel input
11853fa6b06aSMark Adams       err = cudaMemcpy(          h_mat.diag.i,    jaca->i,   (n+1)*sizeof(int), cudaMemcpyHostToDevice);CHKERRCUDA(err);
11863fa6b06aSMark Adams     } else {
11873fa6b06aSMark Adams       h_mat.diag.i = ai;
11883fa6b06aSMark Adams     }
11893fa6b06aSMark Adams     h_mat.diag.j = aj;
11903fa6b06aSMark Adams     h_mat.diag.a = aa;
11913fa6b06aSMark Adams     // copy pointers and metdata to device
11923fa6b06aSMark Adams     err = cudaMemcpy(          d_mat, &h_mat, sizeof(PetscSplitCSRDataStructure), cudaMemcpyHostToDevice);CHKERRCUDA(err);
11933fa6b06aSMark Adams     ierr = PetscInfo2(A,"Create device Mat n=%D nnz=%D\n",h_mat.diag.n, nnz);CHKERRQ(ierr);
11943fa6b06aSMark Adams   } else {
11953fa6b06aSMark Adams     *B = *p_d_mat;
11963fa6b06aSMark Adams   }
11973fa6b06aSMark Adams   A->assembled = PETSC_FALSE; // ready to write with matsetvalues - this done (lazy) in normal MatSetValues
11983fa6b06aSMark Adams   PetscFunctionReturn(0);
11999db3cbf9SStefano Zampini #endif
12003fa6b06aSMark Adams }
1201