xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 49abdd8a111d9c2ef7fc48ade253ef64e07f9b37)
199acd6aaSStefano Zampini #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
20fd2b57fSKarl Rupp 
33d13b8fdSMatthew G. Knepley #include <petscconf.h>
49ae82921SPaul Mullowney #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
557d48284SJunchao Zhang #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
63d13b8fdSMatthew G. Knepley #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
77e8381f9SStefano Zampini #include <thrust/advance.h>
8a2cee5feSJed Brown #include <thrust/partition.h>
9a2cee5feSJed Brown #include <thrust/sort.h>
10a2cee5feSJed Brown #include <thrust/unique.h>
114eb5378fSStefano Zampini #include <petscsf.h>
127e8381f9SStefano Zampini 
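/* Thrust functor: assigns the first element of a tuple to the second (an element-wise copy when applied over zipped device iterators) */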
139371c9d4SSatish Balay struct VecCUDAEquals {
147e8381f9SStefano Zampini   template <typename Tuple>
15d71ae5a4SJacob Faibussowitsch   __host__ __device__ void operator()(Tuple t)
16d71ae5a4SJacob Faibussowitsch   {
177e8381f9SStefano Zampini     thrust::get<1>(t) = thrust::get<0>(t);
187e8381f9SStefano Zampini   }
197e8381f9SStefano Zampini };
207e8381f9SStefano Zampini 
21*49abdd8aSBarry Smith static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void **data)
22d71ae5a4SJacob Faibussowitsch {
23*49abdd8aSBarry Smith   MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)*data;
24cbc6b225SStefano Zampini 
25cbc6b225SStefano Zampini   PetscFunctionBegin;
262c4ab24aSJunchao Zhang   PetscCall(PetscSFDestroy(&coo->sf));
272c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Ajmap1));
282c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Aperm1));
292c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Bjmap1));
302c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Bperm1));
312c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Aimap2));
322c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Ajmap2));
332c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Aperm2));
342c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Bimap2));
352c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Bjmap2));
362c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Bperm2));
372c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->Cperm1));
382c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->sendbuf));
392c4ab24aSJunchao Zhang   PetscCallCUDA(cudaFree(coo->recvbuf));
402c4ab24aSJunchao Zhang   PetscCall(PetscFree(coo));
413ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
427e8381f9SStefano Zampini }
439ae82921SPaul Mullowney 
44d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
45d71ae5a4SJacob Faibussowitsch {
46cbc6b225SStefano Zampini   Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
472c4ab24aSJunchao Zhang   PetscBool            dev_ij = PETSC_FALSE;
482c4ab24aSJunchao Zhang   PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
492c4ab24aSJunchao Zhang   PetscInt            *i, *j;
50*49abdd8aSBarry Smith   PetscContainer       container_h;
512c4ab24aSJunchao Zhang   MatCOOStruct_MPIAIJ *coo_h, *coo_d;
52219fbbafSJunchao Zhang 
53219fbbafSJunchao Zhang   PetscFunctionBegin;
549566063dSJacob Faibussowitsch   PetscCall(PetscFree(mpiaij->garray));
559566063dSJacob Faibussowitsch   PetscCall(VecDestroy(&mpiaij->lvec));
56cbc6b225SStefano Zampini #if defined(PETSC_USE_CTABLE)
57eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
58cbc6b225SStefano Zampini #else
599566063dSJacob Faibussowitsch   PetscCall(PetscFree(mpiaij->colmap));
60cbc6b225SStefano Zampini #endif
619566063dSJacob Faibussowitsch   PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
62cbc6b225SStefano Zampini   mat->assembled     = PETSC_FALSE;
63cbc6b225SStefano Zampini   mat->was_assembled = PETSC_FALSE;
649566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(coo_i, &mtype));
652c4ab24aSJunchao Zhang   if (PetscMemTypeDevice(mtype)) {
662c4ab24aSJunchao Zhang     dev_ij = PETSC_TRUE;
672c4ab24aSJunchao Zhang     PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
682c4ab24aSJunchao Zhang     PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
692c4ab24aSJunchao Zhang     PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
70219fbbafSJunchao Zhang   } else {
712c4ab24aSJunchao Zhang     i = coo_i;
722c4ab24aSJunchao Zhang     j = coo_j;
732c4ab24aSJunchao Zhang   }
742c4ab24aSJunchao Zhang 
752c4ab24aSJunchao Zhang   PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
762c4ab24aSJunchao Zhang   if (dev_ij) PetscCall(PetscFree2(i, j));
77cbc6b225SStefano Zampini   mat->offloadmask = PETSC_OFFLOAD_CPU;
782c4ab24aSJunchao Zhang   // Create the GPU memory
799566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
809566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
81219fbbafSJunchao Zhang 
822c4ab24aSJunchao Zhang   // Copy the COO struct to device
832c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
842c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
852c4ab24aSJunchao Zhang   PetscCall(PetscMalloc1(1, &coo_d));
862c4ab24aSJunchao Zhang   *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d
87219fbbafSJunchao Zhang 
882c4ab24aSJunchao Zhang   PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
892c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
902c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
912c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
922c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
932c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
942c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
952c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
962c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
972c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
982c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
992c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
1002c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
1012c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));
102219fbbafSJunchao Zhang 
1032c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
1042c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
1052c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
1062c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
1072c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
1082c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
1092c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
1102c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
1112c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
1122c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
1132c4ab24aSJunchao Zhang   PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
114219fbbafSJunchao Zhang 
1152c4ab24aSJunchao Zhang   // Put the COO struct in a container and then attach that to the matrix
116*49abdd8aSBarry Smith   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
1173ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
118219fbbafSJunchao Zhang }
119219fbbafSJunchao Zhang 
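/* Pack kernel: a grid-stride loop gathers kv[perm[i]] into buf[i], filling the send buffer with the COO values owned by other MPI ranks */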
120d71ae5a4SJacob Faibussowitsch __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
121d71ae5a4SJacob Faibussowitsch {
122219fbbafSJunchao Zhang   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
123219fbbafSJunchao Zhang   const PetscCount grid_size = gridDim.x * blockDim.x;
124219fbbafSJunchao Zhang   for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
125219fbbafSJunchao Zhang }
126219fbbafSJunchao Zhang 
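/* For each owned nonzero of the diagonal (A) and off-diagonal (B) blocks, sum the locally provided COO entries selected by jmap1/perm1 and add the result to the current value (or to zero for INSERT_VALUES) */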
127d71ae5a4SJacob Faibussowitsch __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
128d71ae5a4SJacob Faibussowitsch {
129219fbbafSJunchao Zhang   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
130219fbbafSJunchao Zhang   const PetscCount grid_size = gridDim.x * blockDim.x;
131158ec288SJunchao Zhang   for (; i < Annz + Bnnz; i += grid_size) {
132158ec288SJunchao Zhang     PetscScalar sum = 0.0;
133158ec288SJunchao Zhang     if (i < Annz) {
134158ec288SJunchao Zhang       for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
135158ec288SJunchao Zhang       Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
136158ec288SJunchao Zhang     } else {
137158ec288SJunchao Zhang       i -= Annz;
138158ec288SJunchao Zhang       for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
139158ec288SJunchao Zhang       Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
140158ec288SJunchao Zhang     }
141158ec288SJunchao Zhang   }
142158ec288SJunchao Zhang }
143158ec288SJunchao Zhang 
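/* Add the COO entries received from other ranks: imap2 gives the destination nonzero in A or B, while jmap2/perm2 select the contributing entries of the receive buffer */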
144d71ae5a4SJacob Faibussowitsch __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
145d71ae5a4SJacob Faibussowitsch {
146158ec288SJunchao Zhang   PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
147158ec288SJunchao Zhang   const PetscCount grid_size = gridDim.x * blockDim.x;
148158ec288SJunchao Zhang   for (; i < Annz2 + Bnnz2; i += grid_size) {
149158ec288SJunchao Zhang     if (i < Annz2) {
150158ec288SJunchao Zhang       for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
151158ec288SJunchao Zhang     } else {
152158ec288SJunchao Zhang       i -= Annz2;
153158ec288SJunchao Zhang       for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
154158ec288SJunchao Zhang     }
155158ec288SJunchao Zhang   }
156219fbbafSJunchao Zhang }
157219fbbafSJunchao Zhang 
158d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
159d71ae5a4SJacob Faibussowitsch {
160219fbbafSJunchao Zhang   Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
161219fbbafSJunchao Zhang   Mat                  A = mpiaij->A, B = mpiaij->B;
1622c4ab24aSJunchao Zhang   PetscScalar         *Aa, *Ba;
163219fbbafSJunchao Zhang   const PetscScalar   *v1 = v;
164219fbbafSJunchao Zhang   PetscMemType         memtype;
1652c4ab24aSJunchao Zhang   PetscContainer       container;
1662c4ab24aSJunchao Zhang   MatCOOStruct_MPIAIJ *coo;
167219fbbafSJunchao Zhang 
168219fbbafSJunchao Zhang   PetscFunctionBegin;
1692c4ab24aSJunchao Zhang   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1702c4ab24aSJunchao Zhang   PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
1712c4ab24aSJunchao Zhang   PetscCall(PetscContainerGetPointer(container, (void **)&coo));
172cbc6b225SStefano Zampini 
1732c4ab24aSJunchao Zhang   const auto &Annz   = coo->Annz;
1742c4ab24aSJunchao Zhang   const auto &Annz2  = coo->Annz2;
1752c4ab24aSJunchao Zhang   const auto &Bnnz   = coo->Bnnz;
1762c4ab24aSJunchao Zhang   const auto &Bnnz2  = coo->Bnnz2;
1772c4ab24aSJunchao Zhang   const auto &vsend  = coo->sendbuf;
1782c4ab24aSJunchao Zhang   const auto &v2     = coo->recvbuf;
1792c4ab24aSJunchao Zhang   const auto &Ajmap1 = coo->Ajmap1;
1802c4ab24aSJunchao Zhang   const auto &Ajmap2 = coo->Ajmap2;
1812c4ab24aSJunchao Zhang   const auto &Aimap2 = coo->Aimap2;
1822c4ab24aSJunchao Zhang   const auto &Bjmap1 = coo->Bjmap1;
1832c4ab24aSJunchao Zhang   const auto &Bjmap2 = coo->Bjmap2;
1842c4ab24aSJunchao Zhang   const auto &Bimap2 = coo->Bimap2;
1852c4ab24aSJunchao Zhang   const auto &Aperm1 = coo->Aperm1;
1862c4ab24aSJunchao Zhang   const auto &Aperm2 = coo->Aperm2;
1872c4ab24aSJunchao Zhang   const auto &Bperm1 = coo->Bperm1;
1882c4ab24aSJunchao Zhang   const auto &Bperm2 = coo->Bperm2;
1892c4ab24aSJunchao Zhang   const auto &Cperm1 = coo->Cperm1;
1902c4ab24aSJunchao Zhang 
1919566063dSJacob Faibussowitsch   PetscCall(PetscGetMemType(v, &memtype));
192219fbbafSJunchao Zhang   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
1932c4ab24aSJunchao Zhang     PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
1942c4ab24aSJunchao Zhang     PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
195219fbbafSJunchao Zhang   }
196219fbbafSJunchao Zhang 
197219fbbafSJunchao Zhang   if (imode == INSERT_VALUES) {
1989566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
1999566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
200219fbbafSJunchao Zhang   } else {
2019566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
202158ec288SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
203219fbbafSJunchao Zhang   }
204219fbbafSJunchao Zhang 
20508bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeBegin());
206219fbbafSJunchao Zhang   /* Pack entries to be sent to remote */
2072c4ab24aSJunchao Zhang   if (coo->sendlen) {
2082c4ab24aSJunchao Zhang     MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
2099371c9d4SSatish Balay     PetscCallCUDA(cudaPeekAtLastError());
210cbc6b225SStefano Zampini   }
211219fbbafSJunchao Zhang 
212219fbbafSJunchao Zhang   /* Send remote entries to their owner and overlap the communication with local computation */
2132c4ab24aSJunchao Zhang   PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
214219fbbafSJunchao Zhang   /* Add local entries to A and B */
215158ec288SJunchao Zhang   if (Annz + Bnnz > 0) {
2166497c311SBarry Smith     MatAddLocalCOOValues<<<(int)((Annz + Bnnz + 255) / 256), 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
2179566063dSJacob Faibussowitsch     PetscCallCUDA(cudaPeekAtLastError());
218cbc6b225SStefano Zampini   }
2192c4ab24aSJunchao Zhang   PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));
220219fbbafSJunchao Zhang 
221219fbbafSJunchao Zhang   /* Add received remote entries to A and B */
222158ec288SJunchao Zhang   if (Annz2 + Bnnz2 > 0) {
2236497c311SBarry Smith     MatAddRemoteCOOValues<<<(int)((Annz2 + Bnnz2 + 255) / 256), 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
2249566063dSJacob Faibussowitsch     PetscCallCUDA(cudaPeekAtLastError());
225cbc6b225SStefano Zampini   }
22608bb9926SJunchao Zhang   PetscCall(PetscLogGpuTimeEnd());
227219fbbafSJunchao Zhang 
228219fbbafSJunchao Zhang   if (imode == INSERT_VALUES) {
2299566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
230158ec288SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
231219fbbafSJunchao Zhang   } else {
2329566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
233158ec288SJunchao Zhang     PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
234219fbbafSJunchao Zhang   }
2359566063dSJacob Faibussowitsch   if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
236cbc6b225SStefano Zampini   mat->offloadmask = PETSC_OFFLOAD_GPU;
2373ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
238219fbbafSJunchao Zhang }
239219fbbafSJunchao Zhang 
240d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
241d71ae5a4SJacob Faibussowitsch {
242ed502f03SStefano Zampini   Mat             Ad, Ao;
243ed502f03SStefano Zampini   const PetscInt *cmap;
244ed502f03SStefano Zampini 
245ed502f03SStefano Zampini   PetscFunctionBegin;
2469566063dSJacob Faibussowitsch   PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
2479566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
248ed502f03SStefano Zampini   if (glob) {
249ed502f03SStefano Zampini     PetscInt cst, i, dn, on, *gidx;
250ed502f03SStefano Zampini 
2519566063dSJacob Faibussowitsch     PetscCall(MatGetLocalSize(Ad, NULL, &dn));
2529566063dSJacob Faibussowitsch     PetscCall(MatGetLocalSize(Ao, NULL, &on));
2539566063dSJacob Faibussowitsch     PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
2549566063dSJacob Faibussowitsch     PetscCall(PetscMalloc1(dn + on, &gidx));
255ed502f03SStefano Zampini     for (i = 0; i < dn; i++) gidx[i] = cst + i;
256ed502f03SStefano Zampini     for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
2579566063dSJacob Faibussowitsch     PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
258ed502f03SStefano Zampini   }
2593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
260ed502f03SStefano Zampini }
261ed502f03SStefano Zampini 
26266976f2fSJacob Faibussowitsch static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
263d71ae5a4SJacob Faibussowitsch {
264bbf3fe20SPaul Mullowney   Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
265bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
2669ae82921SPaul Mullowney   PetscInt            i;
2679ae82921SPaul Mullowney 
2689ae82921SPaul Mullowney   PetscFunctionBegin;
2692f3c17daSJacob Faibussowitsch   if (B->hash_active) {
2702f3c17daSJacob Faibussowitsch     B->ops[0]      = b->cops;
2712f3c17daSJacob Faibussowitsch     B->hash_active = PETSC_FALSE;
2722f3c17daSJacob Faibussowitsch   }
2739566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(B->rmap));
2749566063dSJacob Faibussowitsch   PetscCall(PetscLayoutSetUp(B->cmap));
2757e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && d_nnz) {
276ad540459SPierre Jolivet     for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
2779ae82921SPaul Mullowney   }
2787e8381f9SStefano Zampini   if (PetscDefined(USE_DEBUG) && o_nnz) {
279ad540459SPierre Jolivet     for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
2809ae82921SPaul Mullowney   }
2816a29ce69SStefano Zampini #if defined(PETSC_USE_CTABLE)
282eec179cfSJacob Faibussowitsch   PetscCall(PetscHMapIDestroy(&b->colmap));
2836a29ce69SStefano Zampini #else
2849566063dSJacob Faibussowitsch   PetscCall(PetscFree(b->colmap));
2856a29ce69SStefano Zampini #endif
2869566063dSJacob Faibussowitsch   PetscCall(PetscFree(b->garray));
2879566063dSJacob Faibussowitsch   PetscCall(VecDestroy(&b->lvec));
2889566063dSJacob Faibussowitsch   PetscCall(VecScatterDestroy(&b->Mvctx));
2896a29ce69SStefano Zampini   /* Because the B will have been resized we simply destroy it and create a new one each time */
2909566063dSJacob Faibussowitsch   PetscCall(MatDestroy(&b->B));
2916a29ce69SStefano Zampini   if (!b->A) {
2929566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
2939566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
2946a29ce69SStefano Zampini   }
2956a29ce69SStefano Zampini   if (!b->B) {
2966a29ce69SStefano Zampini     PetscMPIInt size;
2979566063dSJacob Faibussowitsch     PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
2989566063dSJacob Faibussowitsch     PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
2999566063dSJacob Faibussowitsch     PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
3009ae82921SPaul Mullowney   }
3019566063dSJacob Faibussowitsch   PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
3029566063dSJacob Faibussowitsch   PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
3039566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU(b->A, B->boundtocpu));
3049566063dSJacob Faibussowitsch   PetscCall(MatBindToCPU(b->B, B->boundtocpu));
3059566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
3069566063dSJacob Faibussowitsch   PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
3079566063dSJacob Faibussowitsch   PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
3089566063dSJacob Faibussowitsch   PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
3099ae82921SPaul Mullowney   B->preallocated  = PETSC_TRUE;
3102f3c17daSJacob Faibussowitsch   B->was_assembled = PETSC_FALSE;
3112f3c17daSJacob Faibussowitsch   B->assembled     = PETSC_FALSE;
3123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3139ae82921SPaul Mullowney }
314e057df02SPaul Mullowney 
31566976f2fSJacob Faibussowitsch static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
316d71ae5a4SJacob Faibussowitsch {
3179ae82921SPaul Mullowney   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
3189ae82921SPaul Mullowney 
3199ae82921SPaul Mullowney   PetscFunctionBegin;
3209566063dSJacob Faibussowitsch   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
3219566063dSJacob Faibussowitsch   PetscCall((*a->A->ops->mult)(a->A, xx, yy));
3229566063dSJacob Faibussowitsch   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
3239566063dSJacob Faibussowitsch   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
3243ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3259ae82921SPaul Mullowney }
326ca45077fSPaul Mullowney 
32766976f2fSJacob Faibussowitsch static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
328d71ae5a4SJacob Faibussowitsch {
3293fa6b06aSMark Adams   Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
3303fa6b06aSMark Adams 
3313fa6b06aSMark Adams   PetscFunctionBegin;
3329566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(l->A));
3339566063dSJacob Faibussowitsch   PetscCall(MatZeroEntries(l->B));
3343ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
3353fa6b06aSMark Adams }
336042217e8SBarry Smith 
33766976f2fSJacob Faibussowitsch static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
338d71ae5a4SJacob Faibussowitsch {
339fdc842d1SBarry Smith   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
340fdc842d1SBarry Smith 
341fdc842d1SBarry Smith   PetscFunctionBegin;
3429566063dSJacob Faibussowitsch   PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
3439566063dSJacob Faibussowitsch   PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
3449566063dSJacob Faibussowitsch   PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
3459566063dSJacob Faibussowitsch   PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
3463ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
347fdc842d1SBarry Smith }
348fdc842d1SBarry Smith 
34966976f2fSJacob Faibussowitsch static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
350d71ae5a4SJacob Faibussowitsch {
351ca45077fSPaul Mullowney   Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
352ca45077fSPaul Mullowney 
353ca45077fSPaul Mullowney   PetscFunctionBegin;
3549566063dSJacob Faibussowitsch   PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
3559566063dSJacob Faibussowitsch   PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
3569566063dSJacob Faibussowitsch   PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
3579566063dSJacob Faibussowitsch   PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
3583ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
359ca45077fSPaul Mullowney }
3609ae82921SPaul Mullowney 
36166976f2fSJacob Faibussowitsch static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
362d71ae5a4SJacob Faibussowitsch {
363ca45077fSPaul Mullowney   Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
364bbf3fe20SPaul Mullowney   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
365e057df02SPaul Mullowney 
366ca45077fSPaul Mullowney   PetscFunctionBegin;
367e057df02SPaul Mullowney   switch (op) {
368d71ae5a4SJacob Faibussowitsch   case MAT_CUSPARSE_MULT_DIAG:
369d71ae5a4SJacob Faibussowitsch     cusparseStruct->diagGPUMatFormat = format;
370d71ae5a4SJacob Faibussowitsch     break;
371d71ae5a4SJacob Faibussowitsch   case MAT_CUSPARSE_MULT_OFFDIAG:
372d71ae5a4SJacob Faibussowitsch     cusparseStruct->offdiagGPUMatFormat = format;
373d71ae5a4SJacob Faibussowitsch     break;
374e057df02SPaul Mullowney   case MAT_CUSPARSE_ALL:
375e057df02SPaul Mullowney     cusparseStruct->diagGPUMatFormat    = format;
376e057df02SPaul Mullowney     cusparseStruct->offdiagGPUMatFormat = format;
377045c96e1SPaul Mullowney     break;
378d71ae5a4SJacob Faibussowitsch   default:
379d71ae5a4SJacob Faibussowitsch     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
380045c96e1SPaul Mullowney   }
3813ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
382ca45077fSPaul Mullowney }
383e057df02SPaul Mullowney 
38466976f2fSJacob Faibussowitsch static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
385d71ae5a4SJacob Faibussowitsch {
386e057df02SPaul Mullowney   MatCUSPARSEStorageFormat format;
387e057df02SPaul Mullowney   PetscBool                flg;
388a183c035SDominic Meiser   Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
389a183c035SDominic Meiser   Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
3905fd66863SKarl Rupp 
391e057df02SPaul Mullowney   PetscFunctionBegin;
392d0609cedSBarry Smith   PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
393e057df02SPaul Mullowney   if (A->factortype == MAT_FACTOR_NONE) {
3949371c9d4SSatish Balay     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
3959566063dSJacob Faibussowitsch     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
3969371c9d4SSatish Balay     PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
3979566063dSJacob Faibussowitsch     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
3989371c9d4SSatish Balay     PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
3999566063dSJacob Faibussowitsch     if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
400e057df02SPaul Mullowney   }
401d0609cedSBarry Smith   PetscOptionsHeadEnd();
4023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
403e057df02SPaul Mullowney }
404e057df02SPaul Mullowney 
40566976f2fSJacob Faibussowitsch static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
406d71ae5a4SJacob Faibussowitsch {
4073fa6b06aSMark Adams   Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
408042217e8SBarry Smith 
40934d6c7a5SJose E. Roman   PetscFunctionBegin;
4109566063dSJacob Faibussowitsch   PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
4119566063dSJacob Faibussowitsch   if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
4123ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41334d6c7a5SJose E. Roman }
41434d6c7a5SJose E. Roman 
41566976f2fSJacob Faibussowitsch static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
416d71ae5a4SJacob Faibussowitsch {
4173fa6b06aSMark Adams   Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
4183fa6b06aSMark Adams   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
419bbf3fe20SPaul Mullowney 
420bbf3fe20SPaul Mullowney   PetscFunctionBegin;
42128b400f6SJacob Faibussowitsch   PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
422acbf8a88SJunchao Zhang   PetscCallCXX(delete cusparseStruct);
4239566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
4249566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
4259566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
4269566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
4279566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
4289566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
4299566063dSJacob Faibussowitsch   PetscCall(MatDestroy_MPIAIJ(A));
4303ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
431bbf3fe20SPaul Mullowney }
432ca45077fSPaul Mullowney 
43326cec326SBarry Smith /* defines MatSetValues_MPICUSPARSE_Hash() */
43426cec326SBarry Smith #define TYPE AIJ
43526cec326SBarry Smith #define TYPE_AIJ
43626cec326SBarry Smith #define SUB_TYPE_CUSPARSE
43726cec326SBarry Smith #include "../src/mat/impls/aij/mpi/mpihashmat.h"
43826cec326SBarry Smith #undef TYPE
43926cec326SBarry Smith #undef TYPE_AIJ
44026cec326SBarry Smith #undef SUB_TYPE_CUSPARSE
44126cec326SBarry Smith 
44226cec326SBarry Smith static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
44326cec326SBarry Smith {
44426cec326SBarry Smith   Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
44526cec326SBarry Smith   Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
44626cec326SBarry Smith 
44726cec326SBarry Smith   PetscFunctionBegin;
44826cec326SBarry Smith   PetscCall(MatSetUp_MPI_Hash(A));
44926cec326SBarry Smith   PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
45026cec326SBarry Smith   PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
45126cec326SBarry Smith   A->preallocated = PETSC_TRUE;
45226cec326SBarry Smith   PetscFunctionReturn(PETSC_SUCCESS);
45326cec326SBarry Smith }
45426cec326SBarry Smith 
4558eb1d50fSPierre Jolivet PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
456d71ae5a4SJacob Faibussowitsch {
457bbf3fe20SPaul Mullowney   Mat_MPIAIJ *a;
4583338378cSStefano Zampini   Mat         A;
4599ae82921SPaul Mullowney 
4609ae82921SPaul Mullowney   PetscFunctionBegin;
4619566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
4629566063dSJacob Faibussowitsch   if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
4639566063dSJacob Faibussowitsch   else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
4643338378cSStefano Zampini   A             = *newmat;
4656f3d89d0SStefano Zampini   A->boundtocpu = PETSC_FALSE;
4669566063dSJacob Faibussowitsch   PetscCall(PetscFree(A->defaultvectype));
4679566063dSJacob Faibussowitsch   PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
46834136279SStefano Zampini 
469bbf3fe20SPaul Mullowney   a = (Mat_MPIAIJ *)A->data;
4709566063dSJacob Faibussowitsch   if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
4719566063dSJacob Faibussowitsch   if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
4729566063dSJacob Faibussowitsch   if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));
473d98d7c49SStefano Zampini 
47448a46eb9SPierre Jolivet   if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
4752205254eSKarl Rupp 
47634d6c7a5SJose E. Roman   A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
477bbf3fe20SPaul Mullowney   A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
478fdc842d1SBarry Smith   A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
479bbf3fe20SPaul Mullowney   A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
480bbf3fe20SPaul Mullowney   A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
481bbf3fe20SPaul Mullowney   A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
4823fa6b06aSMark Adams   A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
4834e84afc0SStefano Zampini   A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
48426cec326SBarry Smith   A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;
4852205254eSKarl Rupp 
4869566063dSJacob Faibussowitsch   PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
4879566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
4889566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
4899566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
4909566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
4919566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
492ae48a8d0SStefano Zampini #if defined(PETSC_HAVE_HYPRE)
4939566063dSJacob Faibussowitsch   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
494ae48a8d0SStefano Zampini #endif
4953ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
4969ae82921SPaul Mullowney }
4979ae82921SPaul Mullowney 
498d71ae5a4SJacob Faibussowitsch PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
499d71ae5a4SJacob Faibussowitsch {
5003338378cSStefano Zampini   PetscFunctionBegin;
5019566063dSJacob Faibussowitsch   PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
5029566063dSJacob Faibussowitsch   PetscCall(MatCreate_MPIAIJ(A));
5039566063dSJacob Faibussowitsch   PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
5043ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5053338378cSStefano Zampini }
5063338378cSStefano Zampini 
5073f9c0db1SPaul Mullowney /*@
50811a5261eSBarry Smith   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
509e057df02SPaul Mullowney   (the default parallel PETSc format).  This matrix will ultimately be pushed down
51020f4b53cSBarry Smith   to NVIDIA GPUs and use the CuSPARSE library for calculations.
5119ae82921SPaul Mullowney 
512d083f849SBarry Smith   Collective
513e057df02SPaul Mullowney 
514e057df02SPaul Mullowney   Input Parameters:
51511a5261eSBarry Smith + comm  - MPI communicator, set to `PETSC_COMM_SELF`
51620f4b53cSBarry Smith . m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
51720f4b53cSBarry Smith            This value should be the same as the local size used in creating the
51820f4b53cSBarry Smith            y vector for the matrix-vector product y = Ax.
51920f4b53cSBarry Smith . n     - This value should be the same as the local size used in creating the
52020f4b53cSBarry Smith        x vector for the matrix-vector product y = Ax (or `PETSC_DECIDE` to have
52120f4b53cSBarry Smith        calculated if `N` is given). For square matrices `n` is almost always `m`.
52220f4b53cSBarry Smith . M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
52320f4b53cSBarry Smith . N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
52420f4b53cSBarry Smith . d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
52520f4b53cSBarry Smith            (same value is used for all local rows)
52620f4b53cSBarry Smith . d_nnz - array containing the number of nonzeros in the various rows of the
52720f4b53cSBarry Smith            DIAGONAL portion of the local submatrix (possibly different for each row)
52820f4b53cSBarry Smith            or `NULL`, if `d_nz` is used to specify the nonzero structure.
52920f4b53cSBarry Smith            The size of this array is equal to the number of local rows, i.e `m`.
53020f4b53cSBarry Smith            For matrices you plan to factor you must leave room for the diagonal entry and
53120f4b53cSBarry Smith            put in the entry even if it is zero.
53220f4b53cSBarry Smith . o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
53320f4b53cSBarry Smith            submatrix (same value is used for all local rows).
53420f4b53cSBarry Smith - o_nnz - array containing the number of nonzeros in the various rows of the
53520f4b53cSBarry Smith            OFF-DIAGONAL portion of the local submatrix (possibly different for
53620f4b53cSBarry Smith            each row) or `NULL`, if `o_nz` is used to specify the nonzero
53720f4b53cSBarry Smith            structure. The size of this array is equal to the number
53820f4b53cSBarry Smith            of local rows, i.e `m`.
539e057df02SPaul Mullowney 
540e057df02SPaul Mullowney   Output Parameter:
541e057df02SPaul Mullowney . A - the matrix
542e057df02SPaul Mullowney 
5432ef1f0ffSBarry Smith   Level: intermediate
5442ef1f0ffSBarry Smith 
5452ef1f0ffSBarry Smith   Notes:
54611a5261eSBarry Smith   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
547e057df02SPaul Mullowney   MatXXXXSetPreallocation() paradigm instead of this routine directly.
54811a5261eSBarry Smith   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
549e057df02SPaul Mullowney 
55011a5261eSBarry Smith    The AIJ format (also called
5512ef1f0ffSBarry Smith   compressed row storage) is fully compatible with standard Fortran
552e057df02SPaul Mullowney   storage.  That is, the stored row and column indices can begin at
5532ef1f0ffSBarry Smith   either one (as in Fortran) or zero.
554e057df02SPaul Mullowney 
555fe59aa6dSJacob Faibussowitsch .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
556e057df02SPaul Mullowney @*/
557d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
558d71ae5a4SJacob Faibussowitsch {
5599ae82921SPaul Mullowney   PetscMPIInt size;
5609ae82921SPaul Mullowney 
5619ae82921SPaul Mullowney   PetscFunctionBegin;
5629566063dSJacob Faibussowitsch   PetscCall(MatCreate(comm, A));
5639566063dSJacob Faibussowitsch   PetscCall(MatSetSizes(*A, m, n, M, N));
5649566063dSJacob Faibussowitsch   PetscCallMPI(MPI_Comm_size(comm, &size));
5659ae82921SPaul Mullowney   if (size > 1) {
5669566063dSJacob Faibussowitsch     PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
5679566063dSJacob Faibussowitsch     PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
5689ae82921SPaul Mullowney   } else {
5699566063dSJacob Faibussowitsch     PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
5709566063dSJacob Faibussowitsch     PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
5719ae82921SPaul Mullowney   }
5723ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
5739ae82921SPaul Mullowney }
5749ae82921SPaul Mullowney 
5753ca39a21SBarry Smith /*MC
57611a5261eSBarry Smith    MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.
577e057df02SPaul Mullowney 
57815229ffcSPierre Jolivet    A matrix type whose data resides on NVIDIA GPUs. These matrices can be stored in
5792692e278SPaul Mullowney    CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
58011a5261eSBarry Smith    All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.
5819ae82921SPaul Mullowney 
58211a5261eSBarry Smith    This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
58311a5261eSBarry Smith    and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
58411a5261eSBarry Smith    `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
5859ae82921SPaul Mullowney    for communicators controlling multiple processes.  It is recommended that you call both of
5869ae82921SPaul Mullowney    the above preallocation routines for simplicity.
5879ae82921SPaul Mullowney 
5889ae82921SPaul Mullowney    Options Database Keys:
5892ef1f0ffSBarry Smith +  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
5902ef1f0ffSBarry Smith .  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
5912ef1f0ffSBarry Smith .  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
5922ef1f0ffSBarry Smith -  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
5939ae82921SPaul Mullowney 
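   Example Usage:
   A typical command line (the program name is illustrative) that selects this type and its storage format at runtime:
.vb
   ./my_app -mat_type mpiaijcusparse -vec_type cuda -mat_cusparse_storage_format csr
.ve
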
5949ae82921SPaul Mullowney   Level: beginner
5959ae82921SPaul Mullowney 
5961cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
5976bb69460SJunchao Zhang M*/
5986bb69460SJunchao Zhang 
5996bb69460SJunchao Zhang /*MC
60011a5261eSBarry Smith    MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.
6016bb69460SJunchao Zhang 
6026bb69460SJunchao Zhang   Level: beginner
6036bb69460SJunchao Zhang 
6041cc06b55SBarry Smith .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
6059ae82921SPaul Mullowney M*/
606