1 #define PETSC_SKIP_IMMINTRIN_H_CUDAWORKAROUND 1
2
3 #include <petscconf.h>
4 #include <../src/mat/impls/aij/mpi/mpiaij.h> /*I "petscmat.h" I*/
5 #include <../src/mat/impls/aij/seq/seqcusparse/cusparsematimpl.h>
6 #include <../src/mat/impls/aij/mpi/mpicusparse/mpicusparsematimpl.h>
7 #include <thrust/advance.h>
8 #include <thrust/partition.h>
9 #include <thrust/sort.h>
10 #include <thrust/unique.h>
11 #include <petscsf.h>
12
13 struct VecCUDAEquals {
14 template <typename Tuple>
operator ()VecCUDAEquals15 __host__ __device__ void operator()(Tuple t)
16 {
17 thrust::get<1>(t) = thrust::get<0>(t);
18 }
19 };
20
MatCOOStructDestroy_MPIAIJCUSPARSE(PetscCtxRt data)21 static PetscErrorCode MatCOOStructDestroy_MPIAIJCUSPARSE(PetscCtxRt data)
22 {
23 MatCOOStruct_MPIAIJ *coo = *(MatCOOStruct_MPIAIJ **)data;
24
25 PetscFunctionBegin;
26 PetscCall(PetscSFDestroy(&coo->sf));
27 PetscCallCUDA(cudaFree(coo->Ajmap1));
28 PetscCallCUDA(cudaFree(coo->Aperm1));
29 PetscCallCUDA(cudaFree(coo->Bjmap1));
30 PetscCallCUDA(cudaFree(coo->Bperm1));
31 PetscCallCUDA(cudaFree(coo->Aimap2));
32 PetscCallCUDA(cudaFree(coo->Ajmap2));
33 PetscCallCUDA(cudaFree(coo->Aperm2));
34 PetscCallCUDA(cudaFree(coo->Bimap2));
35 PetscCallCUDA(cudaFree(coo->Bjmap2));
36 PetscCallCUDA(cudaFree(coo->Bperm2));
37 PetscCallCUDA(cudaFree(coo->Cperm1));
38 PetscCallCUDA(cudaFree(coo->sendbuf));
39 PetscCallCUDA(cudaFree(coo->recvbuf));
40 PetscCall(PetscFree(coo));
41 PetscFunctionReturn(PETSC_SUCCESS);
42 }
43
MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat,PetscCount coo_n,PetscInt coo_i[],PetscInt coo_j[])44 static PetscErrorCode MatSetPreallocationCOO_MPIAIJCUSPARSE(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
45 {
46 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)mat->data;
47 PetscBool dev_ij = PETSC_FALSE;
48 PetscMemType mtype = PETSC_MEMTYPE_HOST;
49 PetscInt *i, *j;
50 PetscContainer container_h;
51 MatCOOStruct_MPIAIJ *coo_h, *coo_d;
52
53 PetscFunctionBegin;
54 PetscCall(PetscFree(mpiaij->garray));
55 PetscCall(VecDestroy(&mpiaij->lvec));
56 #if defined(PETSC_USE_CTABLE)
57 PetscCall(PetscHMapIDestroy(&mpiaij->colmap));
58 #else
59 PetscCall(PetscFree(mpiaij->colmap));
60 #endif
61 PetscCall(VecScatterDestroy(&mpiaij->Mvctx));
62 mat->assembled = PETSC_FALSE;
63 mat->was_assembled = PETSC_FALSE;
64 PetscCall(PetscGetMemType(coo_i, &mtype));
65 if (PetscMemTypeDevice(mtype)) {
66 dev_ij = PETSC_TRUE;
67 PetscCall(PetscMalloc2(coo_n, &i, coo_n, &j));
68 PetscCallCUDA(cudaMemcpy(i, coo_i, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
69 PetscCallCUDA(cudaMemcpy(j, coo_j, coo_n * sizeof(PetscInt), cudaMemcpyDeviceToHost));
70 } else {
71 i = coo_i;
72 j = coo_j;
73 }
74
75 PetscCall(MatSetPreallocationCOO_MPIAIJ(mat, coo_n, i, j));
76 if (dev_ij) PetscCall(PetscFree2(i, j));
77 mat->offloadmask = PETSC_OFFLOAD_CPU;
78 // Create the GPU memory
79 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->A));
80 PetscCall(MatSeqAIJCUSPARSECopyToGPU(mpiaij->B));
81
82 // Copy the COO struct to device
83 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
84 PetscCall(PetscContainerGetPointer(container_h, &coo_h));
85 PetscCall(PetscMalloc1(1, &coo_d));
86 *coo_d = *coo_h; // do a shallow copy and then amend fields in coo_d
87
88 PetscCall(PetscObjectReference((PetscObject)coo_d->sf)); // Since we destroy the sf in both coo_h and coo_d
89 PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount)));
90 PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm1, coo_h->Atot1 * sizeof(PetscCount)));
91 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount)));
92 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm1, coo_h->Btot1 * sizeof(PetscCount)));
93 PetscCallCUDA(cudaMalloc((void **)&coo_d->Aimap2, coo_h->Annz2 * sizeof(PetscCount)));
94 PetscCallCUDA(cudaMalloc((void **)&coo_d->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount)));
95 PetscCallCUDA(cudaMalloc((void **)&coo_d->Aperm2, coo_h->Atot2 * sizeof(PetscCount)));
96 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount)));
97 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount)));
98 PetscCallCUDA(cudaMalloc((void **)&coo_d->Bperm2, coo_h->Btot2 * sizeof(PetscCount)));
99 PetscCallCUDA(cudaMalloc((void **)&coo_d->Cperm1, coo_h->sendlen * sizeof(PetscCount)));
100 PetscCallCUDA(cudaMalloc((void **)&coo_d->sendbuf, coo_h->sendlen * sizeof(PetscScalar)));
101 PetscCallCUDA(cudaMalloc((void **)&coo_d->recvbuf, coo_h->recvlen * sizeof(PetscScalar)));
102
103 PetscCallCUDA(cudaMemcpy(coo_d->Ajmap1, coo_h->Ajmap1, (coo_h->Annz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
104 PetscCallCUDA(cudaMemcpy(coo_d->Aperm1, coo_h->Aperm1, coo_h->Atot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
105 PetscCallCUDA(cudaMemcpy(coo_d->Bjmap1, coo_h->Bjmap1, (coo_h->Bnnz + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
106 PetscCallCUDA(cudaMemcpy(coo_d->Bperm1, coo_h->Bperm1, coo_h->Btot1 * sizeof(PetscCount), cudaMemcpyHostToDevice));
107 PetscCallCUDA(cudaMemcpy(coo_d->Aimap2, coo_h->Aimap2, coo_h->Annz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
108 PetscCallCUDA(cudaMemcpy(coo_d->Ajmap2, coo_h->Ajmap2, (coo_h->Annz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
109 PetscCallCUDA(cudaMemcpy(coo_d->Aperm2, coo_h->Aperm2, coo_h->Atot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
110 PetscCallCUDA(cudaMemcpy(coo_d->Bimap2, coo_h->Bimap2, coo_h->Bnnz2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
111 PetscCallCUDA(cudaMemcpy(coo_d->Bjmap2, coo_h->Bjmap2, (coo_h->Bnnz2 + 1) * sizeof(PetscCount), cudaMemcpyHostToDevice));
112 PetscCallCUDA(cudaMemcpy(coo_d->Bperm2, coo_h->Bperm2, coo_h->Btot2 * sizeof(PetscCount), cudaMemcpyHostToDevice));
113 PetscCallCUDA(cudaMemcpy(coo_d->Cperm1, coo_h->Cperm1, coo_h->sendlen * sizeof(PetscCount), cudaMemcpyHostToDevice));
114
115 // Put the COO struct in a container and then attach that to the matrix
116 PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_MPIAIJCUSPARSE));
117 PetscFunctionReturn(PETSC_SUCCESS);
118 }
119
MatPackCOOValues(const PetscScalar kv[],PetscCount nnz,const PetscCount perm[],PetscScalar buf[])120 __global__ static void MatPackCOOValues(const PetscScalar kv[], PetscCount nnz, const PetscCount perm[], PetscScalar buf[])
121 {
122 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
123 const PetscCount grid_size = gridDim.x * blockDim.x;
124 for (; i < nnz; i += grid_size) buf[i] = kv[perm[i]];
125 }
126
MatAddLocalCOOValues(const PetscScalar kv[],InsertMode imode,PetscCount Annz,const PetscCount Ajmap1[],const PetscCount Aperm1[],PetscScalar Aa[],PetscCount Bnnz,const PetscCount Bjmap1[],const PetscCount Bperm1[],PetscScalar Ba[])127 __global__ static void MatAddLocalCOOValues(const PetscScalar kv[], InsertMode imode, PetscCount Annz, const PetscCount Ajmap1[], const PetscCount Aperm1[], PetscScalar Aa[], PetscCount Bnnz, const PetscCount Bjmap1[], const PetscCount Bperm1[], PetscScalar Ba[])
128 {
129 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
130 const PetscCount grid_size = gridDim.x * blockDim.x;
131 for (; i < Annz + Bnnz; i += grid_size) {
132 PetscScalar sum = 0.0;
133 if (i < Annz) {
134 for (PetscCount k = Ajmap1[i]; k < Ajmap1[i + 1]; k++) sum += kv[Aperm1[k]];
135 Aa[i] = (imode == INSERT_VALUES ? 0.0 : Aa[i]) + sum;
136 } else {
137 i -= Annz;
138 for (PetscCount k = Bjmap1[i]; k < Bjmap1[i + 1]; k++) sum += kv[Bperm1[k]];
139 Ba[i] = (imode == INSERT_VALUES ? 0.0 : Ba[i]) + sum;
140 }
141 }
142 }
143
MatAddRemoteCOOValues(const PetscScalar kv[],PetscCount Annz2,const PetscCount Aimap2[],const PetscCount Ajmap2[],const PetscCount Aperm2[],PetscScalar Aa[],PetscCount Bnnz2,const PetscCount Bimap2[],const PetscCount Bjmap2[],const PetscCount Bperm2[],PetscScalar Ba[])144 __global__ static void MatAddRemoteCOOValues(const PetscScalar kv[], PetscCount Annz2, const PetscCount Aimap2[], const PetscCount Ajmap2[], const PetscCount Aperm2[], PetscScalar Aa[], PetscCount Bnnz2, const PetscCount Bimap2[], const PetscCount Bjmap2[], const PetscCount Bperm2[], PetscScalar Ba[])
145 {
146 PetscCount i = blockIdx.x * blockDim.x + threadIdx.x;
147 const PetscCount grid_size = gridDim.x * blockDim.x;
148 for (; i < Annz2 + Bnnz2; i += grid_size) {
149 if (i < Annz2) {
150 for (PetscCount k = Ajmap2[i]; k < Ajmap2[i + 1]; k++) Aa[Aimap2[i]] += kv[Aperm2[k]];
151 } else {
152 i -= Annz2;
153 for (PetscCount k = Bjmap2[i]; k < Bjmap2[i + 1]; k++) Ba[Bimap2[i]] += kv[Bperm2[k]];
154 }
155 }
156 }
157
MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat,const PetscScalar v[],InsertMode imode)158 static PetscErrorCode MatSetValuesCOO_MPIAIJCUSPARSE(Mat mat, const PetscScalar v[], InsertMode imode)
159 {
160 Mat_MPIAIJ *mpiaij = static_cast<Mat_MPIAIJ *>(mat->data);
161 Mat A = mpiaij->A, B = mpiaij->B;
162 PetscScalar *Aa, *Ba;
163 const PetscScalar *v1 = v;
164 PetscMemType memtype;
165 PetscContainer container;
166 MatCOOStruct_MPIAIJ *coo;
167
168 PetscFunctionBegin;
169 PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
170 PetscCheck(container, PetscObjectComm((PetscObject)mat), PETSC_ERR_PLIB, "Not found MatCOOStruct on this matrix");
171 PetscCall(PetscContainerGetPointer(container, &coo));
172
173 const auto &Annz = coo->Annz;
174 const auto &Annz2 = coo->Annz2;
175 const auto &Bnnz = coo->Bnnz;
176 const auto &Bnnz2 = coo->Bnnz2;
177 const auto &vsend = coo->sendbuf;
178 const auto &v2 = coo->recvbuf;
179 const auto &Ajmap1 = coo->Ajmap1;
180 const auto &Ajmap2 = coo->Ajmap2;
181 const auto &Aimap2 = coo->Aimap2;
182 const auto &Bjmap1 = coo->Bjmap1;
183 const auto &Bjmap2 = coo->Bjmap2;
184 const auto &Bimap2 = coo->Bimap2;
185 const auto &Aperm1 = coo->Aperm1;
186 const auto &Aperm2 = coo->Aperm2;
187 const auto &Bperm1 = coo->Bperm1;
188 const auto &Bperm2 = coo->Bperm2;
189 const auto &Cperm1 = coo->Cperm1;
190
191 PetscCall(PetscGetMemType(v, &memtype));
192 if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we need to copy it to device */
193 PetscCallCUDA(cudaMalloc((void **)&v1, coo->n * sizeof(PetscScalar)));
194 PetscCallCUDA(cudaMemcpy((void *)v1, v, coo->n * sizeof(PetscScalar), cudaMemcpyHostToDevice));
195 }
196
197 if (imode == INSERT_VALUES) {
198 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(A, &Aa)); /* write matrix values */
199 PetscCall(MatSeqAIJCUSPARSEGetArrayWrite(B, &Ba));
200 } else {
201 PetscCall(MatSeqAIJCUSPARSEGetArray(A, &Aa)); /* read & write matrix values */
202 PetscCall(MatSeqAIJCUSPARSEGetArray(B, &Ba));
203 }
204
205 PetscCall(PetscLogGpuTimeBegin());
206 /* Pack entries to be sent to remote */
207 if (coo->sendlen) {
208 MatPackCOOValues<<<(coo->sendlen + 255) / 256, 256>>>(v1, coo->sendlen, Cperm1, vsend);
209 PetscCallCUDA(cudaPeekAtLastError());
210 }
211
212 /* Send remote entries to their owner and overlap the communication with local computation */
213 PetscCall(PetscSFReduceWithMemTypeBegin(coo->sf, MPIU_SCALAR, PETSC_MEMTYPE_CUDA, vsend, PETSC_MEMTYPE_CUDA, v2, MPI_REPLACE));
214 /* Add local entries to A and B */
215 if (Annz + Bnnz > 0) {
216 MatAddLocalCOOValues<<<(int)((Annz + Bnnz + 255) / 256), 256>>>(v1, imode, Annz, Ajmap1, Aperm1, Aa, Bnnz, Bjmap1, Bperm1, Ba);
217 PetscCallCUDA(cudaPeekAtLastError());
218 }
219 PetscCall(PetscSFReduceEnd(coo->sf, MPIU_SCALAR, vsend, v2, MPI_REPLACE));
220
221 /* Add received remote entries to A and B */
222 if (Annz2 + Bnnz2 > 0) {
223 MatAddRemoteCOOValues<<<(int)((Annz2 + Bnnz2 + 255) / 256), 256>>>(v2, Annz2, Aimap2, Ajmap2, Aperm2, Aa, Bnnz2, Bimap2, Bjmap2, Bperm2, Ba);
224 PetscCallCUDA(cudaPeekAtLastError());
225 }
226 PetscCall(PetscLogGpuTimeEnd());
227
228 if (imode == INSERT_VALUES) {
229 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(A, &Aa));
230 PetscCall(MatSeqAIJCUSPARSERestoreArrayWrite(B, &Ba));
231 } else {
232 PetscCall(MatSeqAIJCUSPARSERestoreArray(A, &Aa));
233 PetscCall(MatSeqAIJCUSPARSERestoreArray(B, &Ba));
234 }
235 if (PetscMemTypeHost(memtype)) PetscCallCUDA(cudaFree((void *)v1));
236 mat->offloadmask = PETSC_OFFLOAD_GPU;
237 PetscFunctionReturn(PETSC_SUCCESS);
238 }
239
MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A,MatReuse scall,IS * glob,Mat * A_loc)240 static PetscErrorCode MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE(Mat A, MatReuse scall, IS *glob, Mat *A_loc)
241 {
242 Mat Ad, Ao;
243 const PetscInt *cmap;
244
245 PetscFunctionBegin;
246 PetscCall(MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &cmap));
247 PetscCall(MatSeqAIJCUSPARSEMergeMats(Ad, Ao, scall, A_loc));
248 if (glob) {
249 PetscInt cst, i, dn, on, *gidx;
250
251 PetscCall(MatGetLocalSize(Ad, NULL, &dn));
252 PetscCall(MatGetLocalSize(Ao, NULL, &on));
253 PetscCall(MatGetOwnershipRangeColumn(A, &cst, NULL));
254 PetscCall(PetscMalloc1(dn + on, &gidx));
255 for (i = 0; i < dn; i++) gidx[i] = cst + i;
256 for (i = 0; i < on; i++) gidx[i + dn] = cmap[i];
257 PetscCall(ISCreateGeneral(PetscObjectComm((PetscObject)Ad), dn + on, gidx, PETSC_OWN_POINTER, glob));
258 }
259 PetscFunctionReturn(PETSC_SUCCESS);
260 }
261
MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[])262 static PetscErrorCode MatMPIAIJSetPreallocation_MPIAIJCUSPARSE(Mat B, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[])
263 {
264 Mat_MPIAIJ *b = (Mat_MPIAIJ *)B->data;
265 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
266 PetscInt i;
267
268 PetscFunctionBegin;
269 if (B->hash_active) {
270 B->ops[0] = b->cops;
271 B->hash_active = PETSC_FALSE;
272 }
273 PetscCall(PetscLayoutSetUp(B->rmap));
274 PetscCall(PetscLayoutSetUp(B->cmap));
275 if (PetscDefined(USE_DEBUG) && d_nnz) {
276 for (i = 0; i < B->rmap->n; i++) PetscCheck(d_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "d_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, d_nnz[i]);
277 }
278 if (PetscDefined(USE_DEBUG) && o_nnz) {
279 for (i = 0; i < B->rmap->n; i++) PetscCheck(o_nnz[i] >= 0, PETSC_COMM_SELF, PETSC_ERR_ARG_OUTOFRANGE, "o_nnz cannot be less than 0: local row %" PetscInt_FMT " value %" PetscInt_FMT, i, o_nnz[i]);
280 }
281 #if defined(PETSC_USE_CTABLE)
282 PetscCall(PetscHMapIDestroy(&b->colmap));
283 #else
284 PetscCall(PetscFree(b->colmap));
285 #endif
286 PetscCall(PetscFree(b->garray));
287 PetscCall(VecDestroy(&b->lvec));
288 PetscCall(VecScatterDestroy(&b->Mvctx));
289 /* Because the B will have been resized we simply destroy it and create a new one each time */
290 PetscCall(MatDestroy(&b->B));
291 if (!b->A) {
292 PetscCall(MatCreate(PETSC_COMM_SELF, &b->A));
293 PetscCall(MatSetSizes(b->A, B->rmap->n, B->cmap->n, B->rmap->n, B->cmap->n));
294 }
295 if (!b->B) {
296 PetscMPIInt size;
297 PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)B), &size));
298 PetscCall(MatCreate(PETSC_COMM_SELF, &b->B));
299 PetscCall(MatSetSizes(b->B, B->rmap->n, size > 1 ? B->cmap->N : 0, B->rmap->n, size > 1 ? B->cmap->N : 0));
300 }
301 PetscCall(MatSetType(b->A, MATSEQAIJCUSPARSE));
302 PetscCall(MatSetType(b->B, MATSEQAIJCUSPARSE));
303 PetscCall(MatBindToCPU(b->A, B->boundtocpu));
304 PetscCall(MatBindToCPU(b->B, B->boundtocpu));
305 PetscCall(MatSeqAIJSetPreallocation(b->A, d_nz, d_nnz));
306 PetscCall(MatSeqAIJSetPreallocation(b->B, o_nz, o_nnz));
307 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
308 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
309 B->preallocated = PETSC_TRUE;
310 B->was_assembled = PETSC_FALSE;
311 B->assembled = PETSC_FALSE;
312 PetscFunctionReturn(PETSC_SUCCESS);
313 }
314
MatMult_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)315 static PetscErrorCode MatMult_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
316 {
317 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
318
319 PetscFunctionBegin;
320 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
321 PetscCall((*a->A->ops->mult)(a->A, xx, yy));
322 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
323 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, yy, yy));
324 PetscFunctionReturn(PETSC_SUCCESS);
325 }
326
MatZeroEntries_MPIAIJCUSPARSE(Mat A)327 static PetscErrorCode MatZeroEntries_MPIAIJCUSPARSE(Mat A)
328 {
329 Mat_MPIAIJ *l = (Mat_MPIAIJ *)A->data;
330
331 PetscFunctionBegin;
332 PetscCall(MatZeroEntries(l->A));
333 PetscCall(MatZeroEntries(l->B));
334 PetscFunctionReturn(PETSC_SUCCESS);
335 }
336
MatMultAdd_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy,Vec zz)337 static PetscErrorCode MatMultAdd_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy, Vec zz)
338 {
339 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
340
341 PetscFunctionBegin;
342 PetscCall(VecScatterBegin(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
343 PetscCall((*a->A->ops->multadd)(a->A, xx, yy, zz));
344 PetscCall(VecScatterEnd(a->Mvctx, xx, a->lvec, INSERT_VALUES, SCATTER_FORWARD));
345 PetscCall((*a->B->ops->multadd)(a->B, a->lvec, zz, zz));
346 PetscFunctionReturn(PETSC_SUCCESS);
347 }
348
MatMultTranspose_MPIAIJCUSPARSE(Mat A,Vec xx,Vec yy)349 static PetscErrorCode MatMultTranspose_MPIAIJCUSPARSE(Mat A, Vec xx, Vec yy)
350 {
351 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
352
353 PetscFunctionBegin;
354 PetscCall((*a->B->ops->multtranspose)(a->B, xx, a->lvec));
355 PetscCall((*a->A->ops->multtranspose)(a->A, xx, yy));
356 PetscCall(VecScatterBegin(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
357 PetscCall(VecScatterEnd(a->Mvctx, a->lvec, yy, ADD_VALUES, SCATTER_REVERSE));
358 PetscFunctionReturn(PETSC_SUCCESS);
359 }
360
MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A,MatCUSPARSEFormatOperation op,MatCUSPARSEStorageFormat format)361 static PetscErrorCode MatCUSPARSESetFormat_MPIAIJCUSPARSE(Mat A, MatCUSPARSEFormatOperation op, MatCUSPARSEStorageFormat format)
362 {
363 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
364 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
365
366 PetscFunctionBegin;
367 switch (op) {
368 case MAT_CUSPARSE_MULT_DIAG:
369 cusparseStruct->diagGPUMatFormat = format;
370 break;
371 case MAT_CUSPARSE_MULT_OFFDIAG:
372 cusparseStruct->offdiagGPUMatFormat = format;
373 break;
374 case MAT_CUSPARSE_ALL:
375 cusparseStruct->diagGPUMatFormat = format;
376 cusparseStruct->offdiagGPUMatFormat = format;
377 break;
378 default:
379 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "unsupported operation %d for MatCUSPARSEFormatOperation. Only MAT_CUSPARSE_MULT_DIAG, MAT_CUSPARSE_MULT_DIAG, and MAT_CUSPARSE_MULT_ALL are currently supported.", op);
380 }
381 PetscFunctionReturn(PETSC_SUCCESS);
382 }
383
MatSetFromOptions_MPIAIJCUSPARSE(Mat A,PetscOptionItems PetscOptionsObject)384 static PetscErrorCode MatSetFromOptions_MPIAIJCUSPARSE(Mat A, PetscOptionItems PetscOptionsObject)
385 {
386 MatCUSPARSEStorageFormat format;
387 PetscBool flg;
388 Mat_MPIAIJ *a = (Mat_MPIAIJ *)A->data;
389 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)a->spptr;
390
391 PetscFunctionBegin;
392 PetscOptionsHeadBegin(PetscOptionsObject, "MPIAIJCUSPARSE options");
393 if (A->factortype == MAT_FACTOR_NONE) {
394 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_diag_storage_format", "sets storage format of the diagonal blocks of (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
395 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_DIAG, format));
396 PetscCall(PetscOptionsEnum("-mat_cusparse_mult_offdiag_storage_format", "sets storage format of the off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->offdiagGPUMatFormat, (PetscEnum *)&format, &flg));
397 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_MULT_OFFDIAG, format));
398 PetscCall(PetscOptionsEnum("-mat_cusparse_storage_format", "sets storage format of the diagonal and off-diagonal blocks (mpi)aijcusparse gpu matrices for SpMV", "MatCUSPARSESetFormat", MatCUSPARSEStorageFormats, (PetscEnum)cusparseStruct->diagGPUMatFormat, (PetscEnum *)&format, &flg));
399 if (flg) PetscCall(MatCUSPARSESetFormat(A, MAT_CUSPARSE_ALL, format));
400 }
401 PetscOptionsHeadEnd();
402 PetscFunctionReturn(PETSC_SUCCESS);
403 }
404
MatAssemblyEnd_MPIAIJCUSPARSE(Mat A,MatAssemblyType mode)405 static PetscErrorCode MatAssemblyEnd_MPIAIJCUSPARSE(Mat A, MatAssemblyType mode)
406 {
407 Mat_MPIAIJ *mpiaij = (Mat_MPIAIJ *)A->data;
408
409 PetscFunctionBegin;
410 PetscCall(MatAssemblyEnd_MPIAIJ(A, mode));
411 if (mpiaij->lvec) PetscCall(VecSetType(mpiaij->lvec, VECSEQCUDA));
412 PetscFunctionReturn(PETSC_SUCCESS);
413 }
414
MatDestroy_MPIAIJCUSPARSE(Mat A)415 static PetscErrorCode MatDestroy_MPIAIJCUSPARSE(Mat A)
416 {
417 Mat_MPIAIJ *aij = (Mat_MPIAIJ *)A->data;
418 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)aij->spptr;
419
420 PetscFunctionBegin;
421 PetscCheck(cusparseStruct, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing spptr");
422 PetscCallCXX(delete cusparseStruct);
423 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", NULL));
424 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", NULL));
425 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
426 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
427 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", NULL));
428 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", NULL));
429 PetscCall(MatDestroy_MPIAIJ(A));
430 PetscFunctionReturn(PETSC_SUCCESS);
431 }
432
433 /* defines MatSetValues_MPICUSPARSE_Hash() */
434 #define TYPE AIJ
435 #define TYPE_AIJ
436 #define SUB_TYPE_CUSPARSE
437 #include "../src/mat/impls/aij/mpi/mpihashmat.h"
438 #undef TYPE
439 #undef TYPE_AIJ
440 #undef SUB_TYPE_CUSPARSE
441
MatSetUp_MPI_HASH_CUSPARSE(Mat A)442 static PetscErrorCode MatSetUp_MPI_HASH_CUSPARSE(Mat A)
443 {
444 Mat_MPIAIJ *b = (Mat_MPIAIJ *)A->data;
445 Mat_MPIAIJCUSPARSE *cusparseStruct = (Mat_MPIAIJCUSPARSE *)b->spptr;
446
447 PetscFunctionBegin;
448 PetscCall(MatSetUp_MPI_Hash(A));
449 PetscCall(MatCUSPARSESetFormat(b->A, MAT_CUSPARSE_MULT, cusparseStruct->diagGPUMatFormat));
450 PetscCall(MatCUSPARSESetFormat(b->B, MAT_CUSPARSE_MULT, cusparseStruct->offdiagGPUMatFormat));
451 A->preallocated = PETSC_TRUE;
452 PetscFunctionReturn(PETSC_SUCCESS);
453 }
454
MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B,MatType,MatReuse reuse,Mat * newmat)455 PETSC_INTERN PetscErrorCode MatConvert_MPIAIJ_MPIAIJCUSPARSE(Mat B, MatType, MatReuse reuse, Mat *newmat)
456 {
457 Mat_MPIAIJ *a;
458 Mat A;
459
460 PetscFunctionBegin;
461 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
462 if (reuse == MAT_INITIAL_MATRIX) PetscCall(MatDuplicate(B, MAT_COPY_VALUES, newmat));
463 else if (reuse == MAT_REUSE_MATRIX) PetscCall(MatCopy(B, *newmat, SAME_NONZERO_PATTERN));
464 A = *newmat;
465 A->boundtocpu = PETSC_FALSE;
466 PetscCall(PetscFree(A->defaultvectype));
467 PetscCall(PetscStrallocpy(VECCUDA, &A->defaultvectype));
468
469 a = (Mat_MPIAIJ *)A->data;
470 if (a->A) PetscCall(MatSetType(a->A, MATSEQAIJCUSPARSE));
471 if (a->B) PetscCall(MatSetType(a->B, MATSEQAIJCUSPARSE));
472 if (a->lvec) PetscCall(VecSetType(a->lvec, VECSEQCUDA));
473
474 if (reuse != MAT_REUSE_MATRIX && !a->spptr) PetscCallCXX(a->spptr = new Mat_MPIAIJCUSPARSE);
475
476 A->ops->assemblyend = MatAssemblyEnd_MPIAIJCUSPARSE;
477 A->ops->mult = MatMult_MPIAIJCUSPARSE;
478 A->ops->multadd = MatMultAdd_MPIAIJCUSPARSE;
479 A->ops->multtranspose = MatMultTranspose_MPIAIJCUSPARSE;
480 A->ops->setfromoptions = MatSetFromOptions_MPIAIJCUSPARSE;
481 A->ops->destroy = MatDestroy_MPIAIJCUSPARSE;
482 A->ops->zeroentries = MatZeroEntries_MPIAIJCUSPARSE;
483 A->ops->productsetfromoptions = MatProductSetFromOptions_MPIAIJBACKEND;
484 A->ops->setup = MatSetUp_MPI_HASH_CUSPARSE;
485 A->ops->getcurrentmemtype = MatGetCurrentMemType_MPIAIJ;
486
487 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATMPIAIJCUSPARSE));
488 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJGetLocalMatMerge_C", MatMPIAIJGetLocalMatMerge_MPIAIJCUSPARSE));
489 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatMPIAIJSetPreallocation_C", MatMPIAIJSetPreallocation_MPIAIJCUSPARSE));
490 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatCUSPARSESetFormat_C", MatCUSPARSESetFormat_MPIAIJCUSPARSE));
491 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_MPIAIJCUSPARSE));
492 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_MPIAIJCUSPARSE));
493 #if defined(PETSC_HAVE_HYPRE)
494 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_mpiaijcusparse_hypre_C", MatConvert_AIJ_HYPRE));
495 #endif
496 PetscFunctionReturn(PETSC_SUCCESS);
497 }
498
MatCreate_MPIAIJCUSPARSE(Mat A)499 PETSC_EXTERN PetscErrorCode MatCreate_MPIAIJCUSPARSE(Mat A)
500 {
501 PetscFunctionBegin;
502 PetscCall(PetscDeviceInitialize(PETSC_DEVICE_CUDA));
503 PetscCall(MatCreate_MPIAIJ(A));
504 PetscCall(MatConvert_MPIAIJ_MPIAIJCUSPARSE(A, MATMPIAIJCUSPARSE, MAT_INPLACE_MATRIX, &A));
505 PetscFunctionReturn(PETSC_SUCCESS);
506 }
507
508 /*@
509 MatCreateAIJCUSPARSE - Creates a sparse matrix in `MATAIJCUSPARSE` (compressed row) format. This matrix will ultimately be pushed down
510 to NVIDIA GPUs and use the CuSPARSE library for calculations.
511
512 Collective
513
514 Input Parameters:
515 + comm - MPI communicator, set to `PETSC_COMM_SELF`
516 . m - number of local rows (or `PETSC_DECIDE` to have calculated if `M` is given)
517 This value should be the same as the local size used in creating the
518 $y$ vector for the matrix-vector product $y = Ax$.
519 . n - This value should be the same as the local size used in creating the
520 $x$ vector for the matrix-vector product $y = Ax$. (or `PETSC_DECIDE` to have
521 calculated if `N` is given) For square matrices `n` is almost always `m`.
522 . M - number of global rows (or `PETSC_DETERMINE` to have calculated if `m` is given)
523 . N - number of global columns (or `PETSC_DETERMINE` to have calculated if `n` is given)
524 . d_nz - number of nonzeros per row in DIAGONAL portion of local submatrix
525 (same value is used for all local rows)
526 . d_nnz - array containing the number of nonzeros in the various rows of the
527 DIAGONAL portion of the local submatrix (possibly different for each row)
528 or `NULL`, if `d_nz` is used to specify the nonzero structure.
529 The size of this array is equal to the number of local rows, i.e `m`.
530 For matrices you plan to factor you must leave room for the diagonal entry and
531 put in the entry even if it is zero.
532 . o_nz - number of nonzeros per row in the OFF-DIAGONAL portion of local
533 submatrix (same value is used for all local rows).
534 - o_nnz - array containing the number of nonzeros in the various rows of the
535 OFF-DIAGONAL portion of the local submatrix (possibly different for
536 each row) or `NULL`, if `o_nz` is used to specify the nonzero
537 structure. The size of this array is equal to the number
538 of local rows, i.e `m`.
539
540 Output Parameter:
541 . A - the matrix
542
543 Level: intermediate
544
545 Notes:
546 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
547 MatXXXXSetPreallocation() paradigm instead of this routine directly.
548 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
549
550 The AIJ format, also called the
551 compressed row storage), is fully compatible with standard Fortran
552 storage. That is, the stored row and column indices can begin at
553 either one (as in Fortran) or zero.
554
555 When working with matrices for GPUs, it is often better to use the `MatSetPreallocationCOO()` and `MatSetValuesCOO()` paradigm rather than using this routine and `MatSetValues()`
556
557 .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`, `MATMPIAIJCUSPARSE`
558 @*/
MatCreateAIJCUSPARSE(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt M,PetscInt N,PetscInt d_nz,const PetscInt d_nnz[],PetscInt o_nz,const PetscInt o_nnz[],Mat * A)559 PetscErrorCode MatCreateAIJCUSPARSE(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt M, PetscInt N, PetscInt d_nz, const PetscInt d_nnz[], PetscInt o_nz, const PetscInt o_nnz[], Mat *A)
560 {
561 PetscMPIInt size;
562
563 PetscFunctionBegin;
564 PetscCall(MatCreate(comm, A));
565 PetscCall(MatSetSizes(*A, m, n, M, N));
566 PetscCallMPI(MPI_Comm_size(comm, &size));
567 if (size > 1) {
568 PetscCall(MatSetType(*A, MATMPIAIJCUSPARSE));
569 PetscCall(MatMPIAIJSetPreallocation(*A, d_nz, d_nnz, o_nz, o_nnz));
570 } else {
571 PetscCall(MatSetType(*A, MATSEQAIJCUSPARSE));
572 PetscCall(MatSeqAIJSetPreallocation(*A, d_nz, d_nnz));
573 }
574 PetscFunctionReturn(PETSC_SUCCESS);
575 }
576
577 /*MC
578 MATMPIAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs.
579
580 Options Database Keys:
581 + -mat_type mpiaijcusparse - sets the matrix type to `MATMPIAIJCUSPARSE`
582 . -mat_cusparse_storage_format csr - sets the storage format of diagonal and off-diagonal matrices. Other options include ell (ellpack) or hyb (hybrid).
583 . -mat_cusparse_mult_diag_storage_format csr - sets the storage format of diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
584 - -mat_cusparse_mult_offdiag_storage_format csr - sets the storage format of off-diagonal matrix. Other options include ell (ellpack) or hyb (hybrid).
585
586 Level: beginner
587
588 Notes:
589 These matrices can be in either CSR, ELL, or HYB format. The ELL and HYB formats require CUDA 4.2 or later.
590
591 All matrix calculations are performed on NVIDIA GPUs using the cuSPARSE library.
592
593 Uses 32-bit integers internally. If PETSc is configured `--with-64-bit-indices`, the integer row and column indices are stored on the GPU with `int`. It is unclear what happens
594 if some integer values passed in do not fit in `int`.
595
596 .seealso: [](ch_matrices), `Mat`, `MatCreateAIJCUSPARSE()`, `MATSEQAIJCUSPARSE`, `MATMPIAIJCUSPARSE`, `MatCreateSeqAIJCUSPARSE()`, `MatCUSPARSESetFormat()`, `MatCUSPARSEStorageFormat`, `MatCUSPARSEFormatOperation`
597 M*/
598
599 /*MC
600 MATAIJCUSPARSE - A matrix type to be used for sparse matrices on NVIDIA GPUs; it is as same as `MATSEQAIJCUSPARSE` on one MPI process and `MATMPIAIJCUSPARSE` on multiple processes.
601
602 Level: beginner
603
604 .seealso: [](ch_matrices), `Mat`, `MATAIJCUSPARSE`, `MATSEQAIJCUSPARSE`
605 M*/
606