/* xref: /petsc/src/mat/impls/aij/mpi/mpicusparse/mpiaijcusparse.cu (revision 2c4ab24a73fa188c4ad3835fde91f6d95fb2e34f) */
#define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1

#include <petscconf.h>
#include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
#include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
#include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
#include <thrust/advance.h>
#include <thrust/partition.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <petscsf.h>

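/* Thrust functor for use with zip iterators: copies the first tuple element into the second,
   i.e. an element-wise device-side assignment between two sequences */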
struct VecCUDAEquals {
  template <typename Tuple>
  __host__ __device__ void operator()(Tuple t)
  {
    thrust::get<1>(t) = thrust::get<0>(t);
  }
};

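/* Container destroy callback for the device-side copy of the COO assembly data: releases the
   SF and all cudaMalloc'ed maps, permutations, and send/receive buffers created in
   MatSetPreallocationCOO_MPIAIJCUSPARSE() */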
static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(void *data)
{
  MatCOOStruct_MPIAIJ *coo = (MatCOOStruct_MPIAIJ *)data;

  PetscFunctionBegin;
  PetscCall(PetscSFDestroy(&coo->sf));
  PetscCallCUDA(cudaFree(coo->Ajmap1));
  PetscCallCUDA(cudaFree(coo->Aperm1));
  PetscCallCUDA(cudaFree(coo->Bjmap1));
  PetscCallCUDA(cudaFree(coo->Bperm1));
  PetscCallCUDA(cudaFree(coo->Aimap2));
  PetscCallCUDA(cudaFree(coo->Ajmap2));
  PetscCallCUDA(cudaFree(coo->Aperm2));
  PetscCallCUDA(cudaFree(coo->Bimap2));
  PetscCallCUDA(cudaFree(coo->Bjmap2));
  PetscCallCUDA(cudaFree(coo->Bperm2));
  PetscCallCUDA(cudaFree(coo->Cperm1));
  PetscCallCUDA(cudaFree(coo->sendbuf));
  PetscCallCUDA(cudaFree(coo->recvbuf));
  PetscCall(PetscFree(coo));
  PetscFunctionReturn(PETSC_SUCCESS);
}

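/* Set up COO assembly for an MPIAIJCUSPARSE matrix: reset the matrix, run the host MPIAIJ COO
   preallocation (copying coo_i/coo_j to the host first if they live in device memory), then
   mirror the resulting host-side MatCOOStruct_MPIAIJ on the GPU and attach it to the matrix
   as the "__PETSc_MatCOOStruct_Device" container */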
static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_MPIAIJ          *mpiaij = (Mat_MPIAIJ *)mat->data;
  PetscBool            dev_ij = PETSC_FALSE;
  PetscMemType         mtype  = PETSC_MEMTYPE_HOST;
  PetscInt            *i, *j;
  PetscContainer       container_h, container_d;
  MatCOOStruct_MPIAIJ *coo_h, *coo_d;

  PetscFunctionBegin;
  PetscCall(PetscFree(mpiaij->garray));
  PetscCall(VecDestroy(&mpiaij->lvec));
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
#else
  PetscCall(PetscFree(mpiaij->colmap));
#endif
  PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
  mat->assembled     = PETSC_FALSE;
  mat->was_assembled = PETSC_FALSE;
  PetscCall(PetscGetMemType(coo_i, &mtype));
  if (PetscMemTypeDevice(mtype)) {
    dev_ij = PETSC_TRUE;
    PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
    PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
    PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
  } else {
    i = coo_i;
    j = coo_j;
  }

  PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
  if (dev_ij) PetscCall(PetscFree2(i, j));
  mat->offloadmask = PETSC_OFFLOAD_CPU;
  // Create the GPU memory
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
  PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, (void **)&coo_h));
  PetscCall(PetscMalloc1(1, &coo_d));
  *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d

  PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
  PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));

  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
  PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscContainerCreate(PETSC_COMM_SELF, &container_d));
  PetscCall(PetscContainerSetPointer(container_d, coo_d));
  PetscCall(PetscContainerSetUserDestroy(container_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
  PetscCall(PetscObjectCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject)container_d));
  PetscCall(PetscContainerDestroy(&container_d));
  PetscFunctionReturn(PETSC_SUCCESS);
}

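/* Grid-stride kernel: gather the COO values destined for other MPI ranks into a contiguous
   send buffer, buf[i] = kv[perm[i]] */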
__global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
}

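/* Grid-stride kernel: accumulate locally owned COO values into the diagonal (A) and
   off-diagonal (B) blocks. For nonzero i, jmap[i]..jmap[i+1] delimits the repeated COO entries
   that fold into it, and perm maps each of them back into the user-provided value array kv */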
__global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz + Bnnz; i += grid_size) {
    PetscScalar sum = 0.0;
    if (i < Annz) {
      for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
      Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
    } else {
      i -= Annz;
      for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
      Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
    }
  }
}

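/* Grid-stride kernel: add the COO values received from remote ranks into A and B; imap2 maps
   the i-th received nonzero to its position in the local matrix block */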
__global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
{
  PetscCount       i         = blockIdx.x * blockDim.x + threadIdx.x;
  const PetscCount grid_size = gridDim.x * blockDim.x;
  for (; i < Annz2 + Bnnz2; i += grid_size) {
    if (i < Annz2) {
      for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
    } else {
      i -= Annz2;
      for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
    }
  }
}

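/* Insert or add a batch of values for the COO pattern set up above: pack off-process entries
   and start the PetscSF reduction, overlap the communication with the kernel that accumulates
   the locally owned entries, then fold in the received remote entries */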
static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
{
  Mat_MPIAIJ          *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
  Mat                  A = mpiaij->A, B = mpiaij->B;
  PetscScalar         *Aa, *Ba;
  const PetscScalar   *v1 = v;
  PetscMemType         memtype;
  PetscContainer       container;
  MatCOOStruct_MPIAIJ *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "MatCOOStruct not found on this matrix");
  PetscCall(PetscContainerGetPointer(container, (void **)&coo));

  const auto &Annz   = coo->Annz;
  const auto &Annz2  = coo->Annz2;
  const auto &Bnnz   = coo->Bnnz;
  const auto &Bnnz2  = coo->Bnnz2;
  const auto &vsend  = coo->sendbuf;
  const auto &v2     = coo->recvbuf;
  const auto &Ajmap1 = coo->Ajmap1;
  const auto &Ajmap2 = coo->Ajmap2;
  const auto &Aimap2 = coo->Aimap2;
  const auto &Bjmap1 = coo->Bjmap1;
  const auto &Bjmap2 = coo->Bjmap2;
  const auto &Bimap2 = coo->Bimap2;
  const auto &Aperm1 = coo->Aperm1;
  const auto &Aperm2 = coo->Aperm2;
  const auto &Bperm1 = coo->Bperm1;
  const auto &Bperm2 = coo->Bperm2;
  const auto &Cperm1 = coo->Cperm1;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If the user gave v[] on the host, copy it to the device */
    PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
    PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
  }

  /* Pack entries to be sent to remote */
  if (coo->sendlen) {
    MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  /* Send remote entries to their owner and overlap the communication with local computation */
  PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
  /* Add local entries to A and B */
  if (Annz + Bnnz > 0) {
    MatAddLocalCOOValues<<<(Annz + Bnnz + 255) / 256, 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }
  PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));

  /* Add received remote entries to A and B */
  if (Annz2 + Bnnz2 > 0) {
    MatAddRemoteCOOValues<<<(Annz2 + Bnnz2 + 255) / 256, 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
    PetscCallCUDA(cudaPeekAtLastError());
  }

  if (imode == INSERT_VALUES) {
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
  } else {
    PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
    PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
  }
  if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
  mat->offloadmask = PETSC_OFFLOAD_GPU;
  PetscFunctionReturn(PETSC_SUCCESS);
}

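/* A minimal usage sketch of the COO assembly interface implemented by the two routines above.
   The matrix A, the sizes, and the index/value arrays below are illustrative only, not taken
   from this file; MatSetPreallocationCOO() and MatSetValuesCOO() dispatch to the
   _MPIAIJCUSPARSE implementations through the composed "MatSetPreallocationCOO_C" and
   "MatSetValuesCOO_C" methods:

     PetscInt    coo_i[] = {0, 0, 1};       // global row indices of the COO entries
     PetscInt    coo_j[] = {0, 2, 1};       // global column indices of the COO entries
     PetscScalar coo_v[] = {1.0, 2.0, 3.0}; // values; may be a host or device pointer
     PetscCall(MatSetPreallocationCOO(A, 3, coo_i, coo_j));
     PetscCall(MatSetValuesCOO(A, coo_v, ADD_VALUES));
*/
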
static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
{
  Mat             Ad, Ao;
  const PetscInt *cmap;

  PetscFunctionBegin;
  PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
  PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
  if (glob) {
    PetscInt cst, i, dn, on, *gidx;

    PetscCall(MatGetLocalSize(Ad, NULL, &dn));
    PetscCall(MatGetLocalSize(Ao, NULL, &on));
    PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
    PetscCall(PetscMalloc1(dn + on, &gidx));
    for (i = 0; i < dn; i++) gidx[i] = cst + i;
    for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
    PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

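/* MPIAIJ preallocation specialized for CUSPARSE: (re)create the diagonal block b->A and the
   off-diagonal block b->B as MATSEQAIJCUSPARSE matrices and propagate the requested GPU
   storage formats to them */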
PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)B->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
  PetscInt            i;

  PetscFunctionBegin;
  PetscCall(PetscLayoutSetUp(B->rmap));
  PetscCall(PetscLayoutSetUp(B->cmap));
  if (PetscDefined(USE_DEBUG) && d_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
  }
  if (PetscDefined(USE_DEBUG) && o_nnz) {
    for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
  }
#if defined(PETSC_USE_CTABLE)
  PetscCall(PetscHMapIDestroy(&b->colmap));
#else
  PetscCall(PetscFree(b->colmap));
#endif
  PetscCall(PetscFree(b->garray));
  PetscCall(VecDestroy(&b->lvec));
  PetscCall(VecScatterDestroy(&b->Mvctx));
  /* Because the B will have been resized we simply destroy it and create a new one each time */
  PetscCall(MatDestroy(&b->B));
  if (!b->A) {
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
    PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
  }
  if (!b->B) {
    PetscMPIInt size;
    PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
    PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
    PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
  }
  PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
  PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
  PetscCall(MatBindToCPU(b->A, B->boundtocpu));
  PetscCall(MatBindToCPU(b->B, B->boundtocpu));
  PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
  PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  B->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

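/* y = A*x: start the scatter of the needed off-process entries of x, multiply by the diagonal
   block while the communication proceeds, then add the off-diagonal block contribution */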
PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->mult)(a->A, xx, yy));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatZeroEntries(l->A));
  PetscCall(MatZeroEntries(l->B));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
  PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
  PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
{
  Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
  PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
  PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
  PetscFunctionReturn(PETSC_SUCCESS);
}

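/* Record the requested cuSPARSE storage format (CSR, ELL, or HYB) for the diagonal and/or
   off-diagonal block; the formats are applied to b->A and b->B at preallocation/setup time */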
PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
{
  Mat_MPIAIJ         *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  switch (op) {
  case MAT_CUSPARSE_MULT_DIAG:
    cusparseStruct->diagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_MULT_OFFDIAG:
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  case MAT_CUSPARSE_ALL:
    cusparseStruct->diagGPUMatFormat    = format;
    cusparseStruct->offdiagGPUMatFormat = format;
    break;
  default:
    SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_OFFDIAG, and MAT_CUSPARSE_ALL are currently supported.", op);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems *PetscOptionsObject)
{
  MatCUSPARSEStorageFormat format;
  PetscBool                flg;
  Mat_MPIAIJ              *a              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE      *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;

  PetscFunctionBegin;
  PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
  if (A->factortype == MAT_FACTOR_NONE) {
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
    PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
    if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
  }
  PetscOptionsHeadEnd();
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
{
  Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;

  PetscFunctionBegin;
  PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
  if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
{
  Mat_MPIAIJ         *aij            = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;

  PetscFunctionBegin;
  PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
  PetscCallCXX(delete cusparseStruct);
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
  PetscCall(MatDestroy_MPIAIJ(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* defines MatSetValues_MPICUSPARSE_Hash() */
#define TYPE AIJ
#define TYPE_AIJ
#define SUB_TYPE_CUSPARSE
#include "../src/mat/impls/aij/mpi/mpihashmat.h"
#undef TYPE
#undef TYPE_AIJ
#undef SUB_TYPE_CUSPARSE

static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
{
  Mat_MPIAIJ         *b              = (Mat_MPIAIJ *)A->data;
  Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;

  PetscFunctionBegin;
  PetscCall(MatSetUp_MPI_Hash(A));
  PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
  PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
  A->preallocated = PETSC_TRUE;
  PetscFunctionReturn(PETSC_SUCCESS);
}

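/* Convert an MPIAIJ matrix to MPIAIJCUSPARSE (in place or into a duplicate): switch the
   sub-matrices and the local work vector to their CUDA types, allocate the Mat_MPIAIJCUSPARSE
   spptr if needed, and install the GPU-aware operations and the COO/preallocation methods */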
PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
{
  Mat_MPIAIJ *a;
  Mat         A;

  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
  else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
  A             = *newmat;
  A->boundtocpu = PETSC_FALSE;
  PetscCall(PetscFree(A->defaultvectype));
  PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));

  a = (Mat_MPIAIJ *)A->data;
  if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
  if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
  if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));

  if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);

  A->ops->assemblyend           = MatAssemblyEnd_MPIAIJCUSPARSE;
  A->ops->mult                  = MatMult_MPIAIJCUSPARSE;
  A->ops->multadd               = MatMultAdd_MPIAIJCUSPARSE;
  A->ops->multtranspose         = MatMultTranspose_MPIAIJCUSPARSE;
  A->ops->setfromoptions        = MatSetFromOptions_MPIAIJCUSPARSE;
  A->ops->destroy               = MatDestroy_MPIAIJCUSPARSE;
  A->ops->zeroentries           = MatZeroEntries_MPIAIJCUSPARSE;
  A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
  A->ops->setup                 = MatSetUp_MPI_HASH_CUSPARSE;

  PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}

PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
  PetscCall(MatCreate_MPIAIJ(A));
  PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*@
   MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format
   (the default parallel PETSc format).  This matrix will ultimately be pushed down
   to NVIDIA GPUs and use the CuSPARSE library for calculations.

   Collective

   Input Parameters:
+  comm - MPI communicator
.  m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
           This value should be the same as the local size used in creating the
           y vector for the matrix-vector product y = Ax.
.  n - number of local columns (or `PETSC_DECIDE` to have calculated if `N` is given)
       This value should be the same as the local size used in creating the
       x vector for the matrix-vector product y = Ax. For square matrices `n` is almost always `m`.
.  M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
.  N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
.  d_nz  - number of nonzeros per row in DIAGONAL portion of local submatrix
           (same value is used for all local rows)
.  d_nnz - array containing the number of nonzeros in the various rows of the
           DIAGONAL portion of the local submatrix (possibly different for each row)
           or `NULL`, if `d_nz` is used to specify the nonzero structure.
           The size of this array is equal to the number of local rows, i.e. `m`.
           For matrices you plan to factor you must leave room for the diagonal entry and
           put in the entry even if it is zero.
.  o_nz  - number of nonzeros per row in the OFF-DIAGONAL portion of local
           submatrix (same value is used for all local rows).
-  o_nnz - array containing the number of nonzeros in the various rows of the
           OFF-DIAGONAL portion of the local submatrix (possibly different for
           each row) or `NULL`, if `o_nz` is used to specify the nonzero
           structure. The size of this array is equal to the number
           of local rows, i.e. `m`.

   Output Parameter:
.  A - the matrix

   Level: intermediate

   Notes:
   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
   MatXXXXSetPreallocation() paradigm instead of this routine directly.
   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]

   The AIJ format (also called compressed row storage) is fully compatible with standard Fortran
   storage.  That is, the stored row and column indices can begin at
   either one (as in Fortran) or zero.

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
@*/
PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
{
  PetscMPIInt size;

  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, M, N));
  PetscCallMPI(MPI_Comm_size(comm, &size));
  if (size > 1) {
    PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
    PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
  } else {
    PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
    PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
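
/* A minimal usage sketch (the sizes and preallocation numbers are illustrative, not taken from
   this file): create a 100 x 100 matrix distributed over PETSC_COMM_WORLD with at most 5
   nonzeros per row in the diagonal block and 2 per row in the off-diagonal block.

     Mat A;
     PetscCall(MatCreateAIJCUSPARSE(PETSC_COMM_WORLD, PETSC_DECIDE, PETSC_DECIDE, 100, 100, 5, NULL, 2, NULL, &A));
     // ... MatSetValues(), MatAssemblyBegin()/MatAssemblyEnd(), MatMult(), ...
     PetscCall(MatDestroy(&A));
*/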

/*MC
   MATAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATMPIAIJCUSPARSE`.

   A matrix type whose data resides on NVIDIA GPUs. These matrices can be in
   CSR, ELL, or Hybrid format. The ELL and HYB formats require CUDA 4.2 or later.
   All matrix calculations are performed on NVIDIA GPUs using the CuSPARSE library.

   This matrix type is identical to `MATSEQAIJCUSPARSE` when constructed with a single process communicator,
   and `MATMPIAIJCUSPARSE` otherwise.  As a result, for single process communicators,
   `MatSeqAIJSetPreallocation()` is supported, and similarly `MatMPIAIJSetPreallocation()` is supported
   for communicators controlling multiple processes.  It is recommended that you call both of
   the above preallocation routines for simplicity.

   Options Database Keys:
+  -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
.  -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
.  -mat_cusparse_mult_diag_storage_format csr - sets the storage format of the diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
-  -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of the off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
M*/

/*MC
   MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices; it is the same as `MATAIJCUSPARSE`.

  Level: beginner

.seealso: [](chapter_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
M*/