#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

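/* Thrust functor intended for use with zip iterators: copies the first tuple element into the second */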
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

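/* Destructor for the device-side COO struct attached to the matrix: frees the device arrays and drops the reference to the SF */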
static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(PetscCtxRt data)
{
  MatCOOStruct_MPIAIJ *coo = *(MatCOOStruct_MPIAIJ **)data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

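/* Build the host COO structure (via MatSetPreallocationCOO_MPIAIJ) and mirror it to the GPU so that MatSetValuesCOO can run entirely on device */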
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

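/* Grid-stride kernel: pack the values destined for other ranks, buf[i] = kv[perm[i]] */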
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

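/* Grid-stride kernel: sum the locally contributed COO values into the diagonal (Aa) and off-diagonal (Ba) block value arrays;
   jmap1 gives, per nonzero, the range of contributing entries and perm1 their positions in kv[] */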
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

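/* Grid-stride kernel: add values received from remote ranks; imap2 maps each received nonzero to its slot in Aa/Ba */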
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

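/* Assemble the COO values: pack and communicate off-rank entries through the PetscSF while local entries are summed on the GPU, then fold in the received values */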
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
  PetscCall(PetscContainerGetPointer(container, &coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, copy it to the device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  PetscCall(PetscLogGpuTimeBegin());
  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(int)((Annz + Bnnz + 255) / 256), 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(int)((Annz2 + Bnnz2 + 255) / 256), 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

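/* Typical user-side COO assembly driving the routines above (illustrative sketch; the index
   and value arrays may live in either host or device memory):

     PetscCall(MatSetPreallocationCOO(A, n, coo_i, coo_j)); // builds and attaches the host/device MatCOOStruct
     PetscCall(MatSetValuesCOO(A, v, ADD_VALUES));          // dispatches to MatSetValuesCOO_MPIAIJCUSPARSE for this type
*/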
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

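/* (Re)create the diagonal (A) and off-diagonal (B) blocks as MATSEQAIJCUSPARSE matrices and preallocate them */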
static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  if (B->hash_active) {
    B->ops[0]      = b->cops;
    B->hash_active = PETSC_FALSE;
  }
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated  = PETSC_TRUE;
  B->was_assembled = PETSC_FALSE;
  B->assembled     = PETSC_FALSE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

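/* y = A*x: overlap the scatter of the ghost values of x with the diagonal-block product, then add the off-diagonal contribution */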
static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

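/* y = A^T*x: apply both transposed blocks, then scatter-add the off-diagonal contribution back to the owning ranks */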
static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

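/* Convert a MATMPIAIJ matrix to MATMPIAIJCUSPARSE: switch the blocks and work vector to their CUDA types and install the CUSPARSE-specific operations and COO hooks */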
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;
  A->ops->getcurrentmemtype     = MatGetCurrentMemType_MPIAIJ;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
  MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format. This matrix will ultimately be pushed down
  to NVIDIA GPUs and use the cuSPARSE library for calculations.

  Collective

  Input Parameters:
+ comm  - MPI communicator
. m     - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
          This value should be the same as the local size used in creating the
          $y$ vector for the matrix-vector product $y = Ax$.
. n     - This value should be the same as the local size used in creating the
          $x$ vector for the matrix-vector product $y = Ax$ (or `PETSC_DECIDE` to have
          calculated if `N` is given). For square matrices `n` is almost always `m`.
. M     - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
. N     - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
. d_nz  - number of nonzeros per row in the DIAGONAL portion of the local submatrix
          (same value is used for all local rows)
. d_nnz - array containing the number of nonzeros in the various rows of the
          DIAGONAL portion of the local submatrix (possibly different for each row)
          or `NULL`, if `d_nz` is used to specify the nonzero structure.
          The size of this array is equal to the number of local rows, i.e. `m`.
          For matrices you plan to factor you must leave room for the diagonal entry and
          put in the entry even if it is zero.
. o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of the local
          submatrix (same value is used for all local rows).
- o_nnz - array containing the number of nonzeros in the various rows of the
          OFF-DIAGONAL portion of the local submatrix (possibly different for
          each row) or `NULL`, if `o_nz` is used to specify the nonzero
          structure. The size of this array is equal to the number
          of local rows, i.e. `m`.

  Output Parameter:
. A - the matrix

  Level: intermediate

  Notes:
  It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
  [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

  The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
  storage. That is, the stored row and column indices can begin at
  either one (as in Fortran) or zero.

  When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`.
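
  Example Usage:
  A minimal sketch of direct use (the global size and per-row nonzero estimates below are illustrative only):
.vb
  Mat A;
  PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
  /* ... set entries with MatSetValues() or MatSetValuesCOO(), then assemble ... */
  PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
  PetscCall(MatDestroy(&A));
.ve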

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*MC
  MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs.

  Options Database Keys:
+ -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
. -mat_cusparse_storage_format csr - sets the storage format of the diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
. -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
- -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

  Notes:
  These matrices can be in either CSR, ELL, or HYB format. The ELL and HYB formats require CUDA 4.2 or later.

  All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.

  Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
  if some integer values passed in do not fit in `int`.

.seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
  MATAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs; it is the same as `MATSEQAIJCUSPARSE` on one MPI process and `MATMPIAIJCUSPARSE` on multiple processes.

  Level: beginner

.seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/