1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 #define DISABLE_CUSPARSE_DEPRECATED
20 #include <KokkosSparse_spmv.hpp>
21
22 #include <KokkosSparse_spiluk.hpp>
23 #include <KokkosSparse_sptrsv.hpp>
24 #include <KokkosSparse_spgemm.hpp>
25 #include <KokkosSparse_spadd.hpp>
26 #include <KokkosBatched_LU_Decl.hpp>
27 #include <KokkosBatched_InverseLU_Decl.hpp>
28
29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
30
31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
32 #include <KokkosSparse_Utils.hpp>
33 using KokkosSparse::sort_crs_matrix;
34 using KokkosSparse::Impl::transpose_matrix;
35 #else
36 #include <KokkosKernels_Sorting.hpp>
37 using KokkosKernels::sort_crs_matrix;
38 using KokkosKernels::Impl::transpose_matrix;
39 #endif
40
41 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
42 using KokkosSparse::spiluk_symbolic;
43 using KokkosSparse::spiluk_numeric;
44 using KokkosSparse::sptrsv_symbolic;
45 using KokkosSparse::sptrsv_solve;
46 using KokkosSparse::Experimental::SPTRSVAlgorithm;
47 using KokkosSparse::Experimental::SPILUKAlgorithm;
48 #else
49 using KokkosSparse::Experimental::spiluk_symbolic;
50 using KokkosSparse::Experimental::spiluk_numeric;
51 using KokkosSparse::Experimental::sptrsv_symbolic;
52 using KokkosSparse::Experimental::sptrsv_solve;
53 using KokkosSparse::Experimental::SPTRSVAlgorithm;
54 using KokkosSparse::Experimental::SPILUKAlgorithm;
55 #endif
56
57 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
58
59 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
60 we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
61 In the latter case, it is important to set a_dual's sync state correctly.
62 */
MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)63 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
64 {
65 Mat_SeqAIJ *aijseq;
66 Mat_SeqAIJKokkos *aijkok;
67
68 PetscFunctionBegin;
69 if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
70 PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
71
72 aijseq = static_cast<Mat_SeqAIJ *>(A->data);
73 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
74
75 /* If aijkok does not exist, we just copy i, j to device.
76 If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
77 In both cases, we build a new aijkok structure.
78 */
79 if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
80 if (aijkok && aijkok->host_aij_allocated_by_kokkos) { /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
81 PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
82 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
83 PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
84 PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
85 PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
86 PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
87 aijseq->free_a = PETSC_TRUE;
88 aijseq->free_ij = PETSC_TRUE;
89 /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
90 we have the CsrMatrix on device first and then copy to host, followed by
91 MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
92 One could improve it by not using NULL garray.
93 */
94 }
95 delete aijkok;
96 aijkok = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /* don't copy mat values to device */);
97 A->spptr = aijkok;
98 } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
99 const PetscInt *adiag;
100 /* the a->diag is created at assmebly here because the rest of the Kokkos AIJ code assumes it always exists. This needs to be fixed since it is now only created when needed! */
101 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
102 MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
103 auto diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
104 aijkok->diag_dual = MatRowMapKokkosDualView(diag_d, diag_h);
105 }
106 PetscFunctionReturn(PETSC_SUCCESS);
107 }
108
109 /* Sync CSR data to device if not yet */
MatSeqAIJKokkosSyncDevice(Mat A)110 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
111 {
112 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
113
114 PetscFunctionBegin;
115 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
116 PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
117 if (aijkok->a_dual.need_sync_device()) {
118 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
119 aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
120 aijkok->hermitian_updated = PETSC_FALSE;
121 }
122 PetscFunctionReturn(PETSC_SUCCESS);
123 }
124
125 /* Mark the CSR data on device as modified */
MatSeqAIJKokkosModifyDevice(Mat A)126 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
127 {
128 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
129
130 PetscFunctionBegin;
131 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
132 aijkok->a_dual.clear_sync_state();
133 aijkok->a_dual.modify_device();
134 aijkok->transpose_updated = PETSC_FALSE;
135 aijkok->hermitian_updated = PETSC_FALSE;
136 PetscCall(PetscObjectStateIncrease((PetscObject)A));
137 PetscFunctionReturn(PETSC_SUCCESS);
138 }
139
MatSeqAIJKokkosSyncHost(Mat A)140 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
141 {
142 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
143 auto exec = PetscGetKokkosExecutionSpace();
144
145 PetscFunctionBegin;
146 PetscCheckTypeName(A, MATSEQAIJKOKKOS);
147 /* We do not expect one needs factors on host */
148 PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
149 PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
150 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
151 PetscFunctionReturn(PETSC_SUCCESS);
152 }
153
MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar * array[])154 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
155 {
156 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
157
158 PetscFunctionBegin;
159 /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
160 Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
161 reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
162 must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
163 */
164 if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
165 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
166 *array = aijkok->a_dual.view_host().data();
167 } else { /* Happens when calling MatSetValues on a newly created matrix */
168 *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
169 }
170 PetscFunctionReturn(PETSC_SUCCESS);
171 }
172
MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar * array[])173 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
174 {
175 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
176
177 PetscFunctionBegin;
178 if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
179 PetscFunctionReturn(PETSC_SUCCESS);
180 }
181
MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar * array[])182 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
183 {
184 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185
186 PetscFunctionBegin;
187 if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
188 PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
189 *array = aijkok->a_dual.view_host().data();
190 } else {
191 *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
192 }
193 PetscFunctionReturn(PETSC_SUCCESS);
194 }
195
MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar * array[])196 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
197 {
198 PetscFunctionBegin;
199 *array = NULL;
200 PetscFunctionReturn(PETSC_SUCCESS);
201 }
202
MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar * array[])203 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
204 {
205 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
206
207 PetscFunctionBegin;
208 if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
209 *array = aijkok->a_dual.view_host().data();
210 } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
211 *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
212 }
213 PetscFunctionReturn(PETSC_SUCCESS);
214 }
215
MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar * array[])216 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
217 {
218 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
219
220 PetscFunctionBegin;
221 if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
222 aijkok->a_dual.clear_sync_state();
223 aijkok->a_dual.modify_host();
224 }
225 PetscFunctionReturn(PETSC_SUCCESS);
226 }
227
MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt ** i,const PetscInt ** j,PetscScalar ** a,PetscMemType * mtype)228 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
229 {
230 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
231
232 PetscFunctionBegin;
233 PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
234
235 if (i) *i = aijkok->i_device_data();
236 if (j) *j = aijkok->j_device_data();
237 if (a) {
238 PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
239 *a = aijkok->a_device_data();
240 }
241 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
242 PetscFunctionReturn(PETSC_SUCCESS);
243 }
244
MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A,PetscMemType * mtype)245 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
246 {
247 PetscFunctionBegin;
248 *mtype = PETSC_MEMTYPE_KOKKOS;
249 PetscFunctionReturn(PETSC_SUCCESS);
250 }
251
252 /*
253 Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
254
255 Input Parameter:
256 . A - the MATSEQAIJKOKKOS matrix
257
258 Output Parameters:
259 + perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
260 - T_d - the transpose on device, whose value array is allocated but not initialized
261 */
MatSeqAIJKokkosGenerateTransposeStructure(Mat A,MatRowMapKokkosView & perm_d,KokkosCsrMatrix & T_d)262 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
263 {
264 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
265 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
266 const PetscInt *Ai = aseq->i, *Aj = aseq->j;
267 MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
268 MatRowMapType *Ti = Ti_h.data();
269 MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
270 MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
271 PetscInt *Tj = Tj_h.data();
272 PetscInt *perm = perm_h.data();
273 PetscInt *offset;
274
275 PetscFunctionBegin;
276 // Populate Ti
277 PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
278 Ti++;
279 for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
280 Ti--;
281 for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
282
283 // Populate Tj and the permutation array
284 PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
285 for (PetscInt i = 0; i < m; i++) {
286 for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
287 PetscInt r = Aj[j]; // row r of T
288 PetscInt disp = Ti[r] + offset[r];
289
290 Tj[disp] = i; // col i of T
291 perm[disp] = j;
292 offset[r]++;
293 }
294 }
295 PetscCall(PetscFree(offset));
296
297 // Sort each row of T, along with the permutation array
298 for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
299
300 // Output perm and T on device
301 auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
302 auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
303 PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
304 PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
305 PetscFunctionReturn(PETSC_SUCCESS);
306 }
307
308 // Generate the transpose on device and cache it internally
309 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
MatSeqAIJKokkosGenerateTranspose_Private(Mat A,KokkosCsrMatrix * csrmatT)310 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
311 {
312 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
313 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
314 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
315 KokkosCsrMatrix &T = akok->csrmatT;
316
317 PetscFunctionBegin;
318 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
319 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
320
321 const auto &Aa = akok->a_dual.view_device();
322
323 if (A->symmetric == PETSC_BOOL3_TRUE) {
324 *csrmatT = akok->csrmat;
325 } else {
326 // See if we already have a cached transpose and its value is up to date
327 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
328 if (!akok->transpose_updated) { // if the value is out of date, update the cached version
329 const auto &perm = akok->transpose_perm; // get the permutation array
330 auto &Ta = T.values;
331
332 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
333 }
334 } else { // Generate T of size n x m for the first time
335 MatRowMapKokkosView perm;
336
337 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
338 akok->transpose_perm = perm; // cache the perm in this matrix for reuse
339 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
340 }
341 akok->transpose_updated = PETSC_TRUE;
342 *csrmatT = akok->csrmatT;
343 }
344 PetscFunctionReturn(PETSC_SUCCESS);
345 }
346
347 // Generate the Hermitian on device and cache it internally
MatSeqAIJKokkosGenerateHermitian_Private(Mat A,KokkosCsrMatrix * csrmatH)348 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
349 {
350 Mat_SeqAIJ *aseq = static_cast<Mat_SeqAIJ *>(A->data);
351 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
352 PetscInt nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
353 KokkosCsrMatrix &T = akok->csrmatH;
354
355 PetscFunctionBegin;
356 PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
357 PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
358
359 const auto &Aa = akok->a_dual.view_device();
360
361 if (A->hermitian == PETSC_BOOL3_TRUE) {
362 *csrmatH = akok->csrmat;
363 } else {
364 // See if we already have a cached Hermitian and its value is up to date
365 if (T.numRows() == n && T.numCols() == m) { // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
366 if (!akok->hermitian_updated) { // if the value is out of date, update the cached version
367 const auto &perm = akok->transpose_perm; // get the permutation array
368 auto &Ta = T.values;
369
370 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
371 }
372 } else { // Generate T of size n x m for the first time
373 MatRowMapKokkosView perm;
374
375 PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
376 akok->transpose_perm = perm; // cache the perm in this matrix for reuse
377 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
378 }
379 akok->hermitian_updated = PETSC_TRUE;
380 *csrmatH = akok->csrmatH;
381 }
382 PetscFunctionReturn(PETSC_SUCCESS);
383 }
384
385 /* y = A x */
MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)386 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
387 {
388 Mat_SeqAIJKokkos *aijkok;
389 ConstPetscScalarKokkosView xv;
390 PetscScalarKokkosView yv;
391
392 PetscFunctionBegin;
393 PetscCall(PetscLogGpuTimeBegin());
394 PetscCall(MatSeqAIJKokkosSyncDevice(A));
395 PetscCall(VecGetKokkosView(xx, &xv));
396 PetscCall(VecGetKokkosViewWrite(yy, &yv));
397 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
398 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
399 PetscCall(VecRestoreKokkosView(xx, &xv));
400 PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
401 /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
402 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
403 PetscCall(PetscLogGpuTimeEnd());
404 PetscFunctionReturn(PETSC_SUCCESS);
405 }
406
407 /* y = A^T x */
MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)408 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
409 {
410 Mat_SeqAIJKokkos *aijkok;
411 const char *mode;
412 ConstPetscScalarKokkosView xv;
413 PetscScalarKokkosView yv;
414 KokkosCsrMatrix csrmat;
415
416 PetscFunctionBegin;
417 PetscCall(PetscLogGpuTimeBegin());
418 PetscCall(MatSeqAIJKokkosSyncDevice(A));
419 PetscCall(VecGetKokkosView(xx, &xv));
420 PetscCall(VecGetKokkosViewWrite(yy, &yv));
421 if (A->form_explicit_transpose) {
422 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
423 mode = "N";
424 } else {
425 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
426 csrmat = aijkok->csrmat;
427 mode = "T";
428 }
429 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
430 PetscCall(VecRestoreKokkosView(xx, &xv));
431 PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
432 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
433 PetscCall(PetscLogGpuTimeEnd());
434 PetscFunctionReturn(PETSC_SUCCESS);
435 }
436
437 /* y = A^H x */
MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)438 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
439 {
440 Mat_SeqAIJKokkos *aijkok;
441 const char *mode;
442 ConstPetscScalarKokkosView xv;
443 PetscScalarKokkosView yv;
444 KokkosCsrMatrix csrmat;
445
446 PetscFunctionBegin;
447 PetscCall(PetscLogGpuTimeBegin());
448 PetscCall(MatSeqAIJKokkosSyncDevice(A));
449 PetscCall(VecGetKokkosView(xx, &xv));
450 PetscCall(VecGetKokkosViewWrite(yy, &yv));
451 if (A->form_explicit_transpose) {
452 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
453 mode = "N";
454 } else {
455 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
456 csrmat = aijkok->csrmat;
457 mode = "C";
458 }
459 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
460 PetscCall(VecRestoreKokkosView(xx, &xv));
461 PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
462 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
463 PetscCall(PetscLogGpuTimeEnd());
464 PetscFunctionReturn(PETSC_SUCCESS);
465 }
466
467 /* z = A x + y */
MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)468 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
469 {
470 Mat_SeqAIJKokkos *aijkok;
471 ConstPetscScalarKokkosView xv;
472 PetscScalarKokkosView zv;
473
474 PetscFunctionBegin;
475 PetscCall(PetscLogGpuTimeBegin());
476 PetscCall(MatSeqAIJKokkosSyncDevice(A));
477 if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
478 PetscCall(VecGetKokkosView(xx, &xv));
479 PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
480 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
481 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
482 PetscCall(VecRestoreKokkosView(xx, &xv));
483 PetscCall(VecRestoreKokkosView(zz, &zv));
484 PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
485 PetscCall(PetscLogGpuTimeEnd());
486 PetscFunctionReturn(PETSC_SUCCESS);
487 }
488
489 /* z = A^T x + y */
MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)490 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
491 {
492 Mat_SeqAIJKokkos *aijkok;
493 const char *mode;
494 ConstPetscScalarKokkosView xv;
495 PetscScalarKokkosView zv;
496 KokkosCsrMatrix csrmat;
497
498 PetscFunctionBegin;
499 PetscCall(PetscLogGpuTimeBegin());
500 PetscCall(MatSeqAIJKokkosSyncDevice(A));
501 if (zz != yy) PetscCall(VecCopy(yy, zz));
502 PetscCall(VecGetKokkosView(xx, &xv));
503 PetscCall(VecGetKokkosView(zz, &zv));
504 if (A->form_explicit_transpose) {
505 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
506 mode = "N";
507 } else {
508 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
509 csrmat = aijkok->csrmat;
510 mode = "T";
511 }
512 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
513 PetscCall(VecRestoreKokkosView(xx, &xv));
514 PetscCall(VecRestoreKokkosView(zz, &zv));
515 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
516 PetscCall(PetscLogGpuTimeEnd());
517 PetscFunctionReturn(PETSC_SUCCESS);
518 }
519
520 /* z = A^H x + y */
MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)521 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
522 {
523 Mat_SeqAIJKokkos *aijkok;
524 const char *mode;
525 ConstPetscScalarKokkosView xv;
526 PetscScalarKokkosView zv;
527 KokkosCsrMatrix csrmat;
528
529 PetscFunctionBegin;
530 PetscCall(PetscLogGpuTimeBegin());
531 PetscCall(MatSeqAIJKokkosSyncDevice(A));
532 if (zz != yy) PetscCall(VecCopy(yy, zz));
533 PetscCall(VecGetKokkosView(xx, &xv));
534 PetscCall(VecGetKokkosView(zz, &zv));
535 if (A->form_explicit_transpose) {
536 PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
537 mode = "N";
538 } else {
539 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
540 csrmat = aijkok->csrmat;
541 mode = "C";
542 }
543 PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
544 PetscCall(VecRestoreKokkosView(xx, &xv));
545 PetscCall(VecRestoreKokkosView(zz, &zv));
546 PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
547 PetscCall(PetscLogGpuTimeEnd());
548 PetscFunctionReturn(PETSC_SUCCESS);
549 }
550
MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)551 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
552 {
553 Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
554
555 PetscFunctionBegin;
556 switch (op) {
557 case MAT_FORM_EXPLICIT_TRANSPOSE:
558 /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
559 if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
560 A->form_explicit_transpose = flg;
561 break;
562 default:
563 PetscCall(MatSetOption_SeqAIJ(A, op, flg));
564 break;
565 }
566 PetscFunctionReturn(PETSC_SUCCESS);
567 }
568
569 /* Depending on reuse, either build a new mat, or use the existing mat */
MatConvert_SeqAIJ_SeqAIJKokkos(Mat A,MatType mtype,MatReuse reuse,Mat * newmat)570 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
571 {
572 Mat_SeqAIJ *aseq;
573
574 PetscFunctionBegin;
575 PetscCall(PetscKokkosInitializeCheck());
576 if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
577 PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
578 PetscCall(MatSetType(*newmat, mtype));
579 } else if (reuse == MAT_REUSE_MATRIX) { /* Reuse the mat created before */
580 PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
581 } else if (reuse == MAT_INPLACE_MATRIX) { /* newmat is A */
582 PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
583 PetscCall(PetscFree(A->defaultvectype));
584 PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
585 PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
586 PetscCall(MatSetOps_SeqAIJKokkos(A));
587 aseq = static_cast<Mat_SeqAIJ *>(A->data);
588 if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
589 PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
590 A->spptr = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
591 }
592 }
593 PetscFunctionReturn(PETSC_SUCCESS);
594 }
595
596 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
597 an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
598 */
MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat * B)599 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
600 {
601 Mat_SeqAIJ *bseq;
602 Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
603 Mat mat;
604
605 PetscFunctionBegin;
606 /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
607 PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
608 mat = *B;
609 if (A->assembled) {
610 bseq = static_cast<Mat_SeqAIJ *>(mat->data);
611 bkok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
612 bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
613 /* Now copy values to B if needed */
614 if (dupOption == MAT_COPY_VALUES) {
615 if (akok->a_dual.need_sync_device()) {
616 Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
617 bkok->a_dual.modify_host();
618 } else { /* If device has the latest data, we only copy data on device */
619 Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
620 bkok->a_dual.modify_device();
621 }
622 } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
623 /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
624 bkok->a_dual.modify_host();
625 }
626 mat->spptr = bkok;
627 }
628
629 PetscCall(PetscFree(mat->defaultvectype));
630 PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
631 PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
632 PetscCall(MatSetOps_SeqAIJKokkos(mat));
633 PetscFunctionReturn(PETSC_SUCCESS);
634 }
635
MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat * B)636 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
637 {
638 Mat At;
639 KokkosCsrMatrix internT;
640 Mat_SeqAIJKokkos *atkok, *bkok;
641
642 PetscFunctionBegin;
643 if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
644 PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
645 if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
646 /* Deep copy internT, as we want to isolate the internal transpose */
647 PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
648 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
649 if (reuse == MAT_INITIAL_MATRIX) *B = At;
650 else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
651 } else { /* MAT_REUSE_MATRIX, just need to copy values to B on device */
652 if ((*B)->assembled) {
653 bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
654 PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
655 PetscCall(MatSeqAIJKokkosModifyDevice(*B));
656 } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
657 Mat_SeqAIJ *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
658 MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
659 MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
660 PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
661 PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
662 } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
663 }
664 PetscFunctionReturn(PETSC_SUCCESS);
665 }
666
MatDestroy_SeqAIJKokkos(Mat A)667 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
668 {
669 Mat_SeqAIJKokkos *aijkok;
670
671 PetscFunctionBegin;
672 if (A->factortype == MAT_FACTOR_NONE) {
673 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
674 delete aijkok;
675 } else {
676 delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
677 }
678 A->spptr = NULL;
679 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
680 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
681 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
682 #if defined(PETSC_HAVE_HYPRE)
683 PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
684 #endif
685 PetscCall(MatDestroy_SeqAIJ(A));
686 PetscFunctionReturn(PETSC_SUCCESS);
687 }
688
689 /*MC
690 MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
691
692 A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
693
694 Options Database Key:
695 . -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
696
697 Level: beginner
698
699 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
700 M*/
/* Constructor for MATSEQAIJKOKKOS: create a plain SeqAIJ matrix, then convert it in place
   to the Kokkos type so it picks up the device-enabled operations table. */
PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck()); /* ensure Kokkos is initialized before any device work */
  PetscCall(MatCreate_SeqAIJ(A));
  PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
709
710 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
{
  Mat_SeqAIJ *a, *b;
  Mat_SeqAIJKokkos *akok, *bkok, *ckok;
  MatScalarKokkosView aa, ba, ca;
  MatRowMapKokkosView ai, bi, ci;
  MatColIdxKokkosView aj, bj, cj;
  PetscInt m, n, nnz, aN;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
  PetscAssertPointer(C, 4);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCheckTypeName(B, MATSEQAIJKOKKOS);
  PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
  PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");

  /* Make sure the device copies of A and B are up to date before reading them */
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  a = static_cast<Mat_SeqAIJ *>(A->data);
  b = static_cast<Mat_SeqAIJ *>(B->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  aa = akok->a_dual.view_device();
  ai = akok->i_dual.view_device();
  ba = bkok->a_dual.view_device();
  bi = bkok->i_dual.view_device();
  m = A->rmap->n; /* M, N and nnz of C */
  n = A->cmap->n + B->cmap->n;
  nnz = a->nz + b->nz;
  aN = A->cmap->n; /* N of A */
  if (reuse == MAT_INITIAL_MATRIX) {
    aj = akok->j_dual.view_device();
    bj = bkok->j_dual.view_device();
    /* NOTE: these `auto` views intentionally shadow the outer ca/ci/cj so the
       lambda below captures freshly allocated device views by value */
    auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
    auto ci = MatRowMapKokkosView("i", ai.extent(0));
    auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));

    /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); /* row i */
        PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);

        Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
          ci(i) = coffset;
          if (i == m - 1) ci(m) = ai(m) + bi(m); /* last team also writes the closing rowptr */
        });

        /* Row i of C is row i of A followed by row i of B (with B's columns shifted by aN) */
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) {
            ca(coffset + k) = aa(ai(i) + k);
            cj(coffset + k) = aj(ai(i) + k);
          } else {
            ca(coffset + k) = ba(bi(i) + k - alen);
            cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
          }
        });
      });
    PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
    PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
  } else if (reuse == MAT_REUSE_MATRIX) {
    /* Sparsity pattern of C already exists; only copy the numerical values */
    PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
    PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
    ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
    ca = ckok->a_dual.view_device();
    ci = ckok->i_dual.view_device();

    Kokkos::parallel_for(
      Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
        PetscInt i = t.league_rank(); /* row i */
        PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
        Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
          if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
          else ca(ci(i) + k) = ba(bi(i) + k - alen);
        });
      });
    PetscCall(MatSeqAIJKokkosModifyDevice(*C)); /* device values are now newer than host */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
793
MatProductCtxDestroy_SeqAIJKokkos(PetscCtxRt pdata)794 static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(PetscCtxRt pdata)
795 {
796 PetscFunctionBegin;
797 delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata);
798 PetscFunctionReturn(PETSC_SUCCESS);
799 }
800
/* Numeric phase of C = op(A)*op(B) using KokkosKernels spgemm; the symbolic phase must have
   run first and attached a MatProductCtx_SeqAIJKokkos to C->product->data. */
static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product *product = C->product;
  Mat A, B;
  bool transA, transB; /* use bool, since KK needs this type */
  Mat_SeqAIJKokkos *akok, *bkok, *ckok;
  Mat_SeqAIJ *c;
  MatProductCtx_SeqAIJKokkos *pdata;
  KokkosCsrMatrix csrmatA, csrmatB;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
  pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data);

  // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
  // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
  // we still do numeric.
  if (pdata->reusesym) { // numeric reuses results from symbolic
    pdata->reusesym = PETSC_FALSE;
    PetscFunctionReturn(PETSC_SUCCESS);
  }

  /* Map the product type to spgemm transpose flags */
  switch (product->type) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    break;
  default:
    SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }

  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); /* device values must be current before spgemm */
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);

  PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");

  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false; /* transpose already folded into csrmatA */
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
#endif

  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(MatSeqAIJKokkosModifyDevice(C)); /* results live on device only */
  /* shorter version of MatAssemblyEnd_SeqAIJ */
  c = (Mat_SeqAIJ *)C->data;
  PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
  PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
  PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
  c->reallocs = 0;
  C->info.mallocs = 0;
  C->info.nz_unneeded = 0;
  C->assembled = C->was_assembled = PETSC_TRUE;
  C->num_ass++;
  PetscFunctionReturn(PETSC_SUCCESS);
}
885
/* Symbolic phase of C = op(A)*op(B): compute C's sparsity pattern with KokkosKernels spgemm
   and attach a product context (holding the spgemm handle) to C->product->data. */
static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
{
  Mat_Product *product = C->product;
  MatProductType ptype;
  Mat A, B;
  bool transA, transB;
  Mat_SeqAIJKokkos *akok, *bkok, *ckok;
  MatProductCtx_SeqAIJKokkos *pdata;
  MPI_Comm comm;
  KokkosCsrMatrix csrmatA, csrmatB, csrmatC;

  PetscFunctionBegin;
  MatCheckProduct(C, 1);
  PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
  PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
  A = product->A;
  B = product->B;
  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  PetscCall(MatSeqAIJKokkosSyncDevice(B));
  akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
  csrmatA = akok->csrmat;
  csrmatB = bkok->csrmat;

  ptype = product->type;
  // Take advantage of the symmetry if true: op(M) = M for symmetric M, avoiding a transpose
  if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
  }
  if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
    ptype = MATPRODUCT_AB;
    product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
  }

  /* Set transpose flags and propagate block sizes from the operands to C */
  switch (ptype) {
  case MATPRODUCT_AB:
    transA = false;
    transB = false;
    PetscCall(MatSetBlockSizesFromMats(C, A, B));
    break;
  case MATPRODUCT_AtB:
    transA = true;
    transB = false;
    if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
    if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
    break;
  case MATPRODUCT_ABt:
    transA = false;
    transB = true;
    if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
    if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
    break;
  default:
    SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
  }
  PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos());
  pdata->reusesym = product->api_user; /* if user-driven, the first numeric call can reuse this symbolic's result */

  /* TODO: add command line options to select spgemm algorithms */
  auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */

  /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
  spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
  #endif
#endif
  PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));

  PetscCall(PetscLogGpuTimeBegin());
  /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
  if (transA) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
    transA = false;
  }

  if (transB) {
    PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
    transB = false;
  }

  PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
  /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
     calling new Mat_SeqAIJKokkos().
     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
  */
  PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
#if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
  /* Query if KK outputs a sorted matrix. If not, we need to sort it */
  auto spgemmHandle = pdata->kh.get_spgemm_handle();
  if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
#endif
  PetscCall(PetscLogGpuTimeEnd());

  PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
  C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos;
  PetscFunctionReturn(PETSC_SUCCESS);
}
987
988 /* handles sparse matrix matrix ops */
MatProductSetFromOptions_SeqAIJKokkos(Mat mat)989 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
990 {
991 Mat_Product *product = mat->product;
992 PetscBool Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
993
994 PetscFunctionBegin;
995 MatCheckProduct(mat, 1);
996 PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
997 if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
998 if (Biskok && Ciskok) {
999 switch (product->type) {
1000 case MATPRODUCT_AB:
1001 case MATPRODUCT_AtB:
1002 case MATPRODUCT_ABt:
1003 mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1004 break;
1005 case MATPRODUCT_PtAP:
1006 case MATPRODUCT_RARt:
1007 case MATPRODUCT_ABC:
1008 mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1009 break;
1010 default:
1011 break;
1012 }
1013 } else { /* fallback for AIJ */
1014 PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1015 }
1016 PetscFunctionReturn(PETSC_SUCCESS);
1017 }
1018
MatScale_SeqAIJKokkos(Mat A,PetscScalar a)1019 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1020 {
1021 Mat_SeqAIJKokkos *aijkok;
1022
1023 PetscFunctionBegin;
1024 PetscCall(PetscLogGpuTimeBegin());
1025 PetscCall(MatSeqAIJKokkosSyncDevice(A));
1026 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1027 KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1028 PetscCall(MatSeqAIJKokkosModifyDevice(A));
1029 PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1030 PetscCall(PetscLogGpuTimeEnd());
1031 PetscFunctionReturn(PETSC_SUCCESS);
1032 }
1033
// add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);

  PetscFunctionBegin;
  if (A->assembled && aijseq->diagDense) { // no missing diagonals, so we can update in place on the device
    PetscInt n = PetscMin(A->rmap->n, A->cmap->n); // length of the main diagonal

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(A));
    const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
    const auto &Aa    = aijkok->a_dual.view_device();
    const auto &Adiag = aijkok->diag_dual.view_device(); // Adiag(i) = position of row i's diagonal entry in Aa
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
    PetscCall(MatSeqAIJKokkosModifyDevice(A));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatShift_Basic(A, a));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1057
/* Set (INSERT_VALUES) or add (ADD_VALUES) the entries of D onto Y's main diagonal on the device;
   falls back to the slow host path when Y has missing diagonal entries */
static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
{
  Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);

  PetscFunctionBegin;
  if (Y->assembled && aijseq->diagDense) { // no missing diagonals
    ConstPetscScalarKokkosView dv;
    PetscInt n, nv;

    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(MatSeqAIJKokkosSyncDevice(Y));
    PetscCall(VecGetKokkosView(D, &dv));
    PetscCall(VecGetLocalSize(D, &nv));
    n = PetscMin(Y->rmap->n, Y->cmap->n); // length of the main diagonal
    PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");

    const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
    const auto &Aa    = aijkok->a_dual.view_device();
    const auto &Adiag = aijkok->diag_dual.view_device(); // Adiag(i) = position of row i's diagonal entry in Aa
    PetscCallCXX(Kokkos::parallel_for(
      Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
        if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
        else Aa(Adiag(i)) += dv(i);
      }));
    PetscCall(VecRestoreKokkosView(D, &dv));
    PetscCall(MatSeqAIJKokkosModifyDevice(Y));
    PetscCall(PetscLogGpuFlops(n));
    PetscCall(PetscLogGpuTimeEnd());
  } else { // need reassembly, very slow!
    PetscCall(MatDiagonalSet_Default(Y, D, is));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1091
MatDiagonalScale_SeqAIJKokkos(Mat A,Vec ll,Vec rr)1092 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1093 {
1094 Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1095 PetscInt m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1096 ConstPetscScalarKokkosView lv, rv;
1097
1098 PetscFunctionBegin;
1099 PetscCall(PetscLogGpuTimeBegin());
1100 PetscCall(MatSeqAIJKokkosSyncDevice(A));
1101 const auto aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1102 const auto &Aa = aijkok->a_dual.view_device();
1103 const auto &Ai = aijkok->i_dual.view_device();
1104 const auto &Aj = aijkok->j_dual.view_device();
1105 if (ll) {
1106 PetscCall(VecGetLocalSize(ll, &m));
1107 PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1108 PetscCall(VecGetKokkosView(ll, &lv));
1109 PetscCallCXX(Kokkos::parallel_for( // for each row
1110 Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1111 PetscInt i = t.league_rank(); // row i
1112 PetscInt len = Ai(i + 1) - Ai(i);
1113 // scale entries on the row
1114 Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1115 }));
1116 PetscCall(VecRestoreKokkosView(ll, &lv));
1117 PetscCall(PetscLogGpuFlops(nz));
1118 }
1119 if (rr) {
1120 PetscCall(VecGetLocalSize(rr, &n));
1121 PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1122 PetscCall(VecGetKokkosView(rr, &rv));
1123 PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1124 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1125 PetscCall(VecRestoreKokkosView(rr, &lv));
1126 PetscCall(PetscLogGpuFlops(nz));
1127 }
1128 PetscCall(MatSeqAIJKokkosModifyDevice(A));
1129 PetscCall(PetscLogGpuTimeEnd());
1130 PetscFunctionReturn(PETSC_SUCCESS);
1131 }
1132
MatZeroEntries_SeqAIJKokkos(Mat A)1133 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1134 {
1135 Mat_SeqAIJKokkos *aijkok;
1136
1137 PetscFunctionBegin;
1138 aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1139 if (aijkok) { /* Only zero the device if data is already there */
1140 KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1141 PetscCall(MatSeqAIJKokkosModifyDevice(A));
1142 } else { /* Might be preallocated but not assembled */
1143 PetscCall(MatZeroEntries_SeqAIJ(A));
1144 }
1145 PetscFunctionReturn(PETSC_SUCCESS);
1146 }
1147
/* Extract A's diagonal into x on the device; rows without a stored diagonal entry contribute 0 */
static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
{
  Mat_SeqAIJKokkos *aijkok;
  PetscInt n;
  PetscScalarKokkosView xv;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &n));
  PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
  PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");

  PetscCall(MatSeqAIJKokkosSyncDevice(A));
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);

  const auto &Aa    = aijkok->a_dual.view_device();
  const auto &Ai    = aijkok->i_dual.view_device();
  const auto &Adiag = aijkok->diag_dual.view_device();

  PetscCall(VecGetKokkosViewWrite(x, &xv)); /* overwrite x; no need to read its old values */
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
      /* a missing diagonal entry is flagged by Adiag(i) >= Ai(i + 1), i.e. past the end of row i */
      if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
      else xv(i) = 0;
    });
  PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1175
/* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); /* read access: make device values current first */
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}
1190
/* Restore the read-only device view obtained with MatSeqAIJGetKokkosView(); the view is
   const, so nothing was modified and no sync-state update is needed -- only validate args */
PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscFunctionReturn(PETSC_SUCCESS);
}
1199
/* Get a read-write device view of A's values; pair with MatSeqAIJRestoreKokkosView() */
PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); /* read-write access: device values must be current */
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}
1213
/* Restore the read-write device view; marks the device copy as modified so a later
   host access will trigger a device-to-host sync */
PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1223
/* Get a write-only device view of A's values; unlike the read/read-write getters, no
   host-to-device sync is done since existing values will be overwritten */
PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  Mat_SeqAIJKokkos *aijkok;

  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  *kv    = aijkok->a_dual.view_device();
  PetscFunctionReturn(PETSC_SUCCESS);
}
1236
/* Restore the write-only device view; marks the device copy as modified */
PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
{
  PetscFunctionBegin;
  PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
  PetscAssertPointer(kv, 2);
  PetscCheckTypeName(A, MATSEQAIJKOKKOS);
  PetscCall(MatSeqAIJKokkosModifyDevice(A));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1246
/* Create an m-by-n MATSEQAIJKOKKOS matrix directly from caller-provided device CSR views
   (rowmap i_d, column indices j_d, values a_d); the number of nonzeros is j_d.extent(0) */
PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
{
  Mat_SeqAIJKokkos *akok;

  PetscFunctionBegin;
  PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1257
1258 /* Computes Y += alpha X */
MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)1259 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1260 {
1261 Mat_SeqAIJ *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1262 Mat_SeqAIJKokkos *xkok, *ykok, *zkok;
1263 ConstMatScalarKokkosView Xa;
1264 MatScalarKokkosView Ya;
1265 auto exec = PetscGetKokkosExecutionSpace();
1266
1267 PetscFunctionBegin;
1268 PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1269 PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1270 PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1271 PetscCall(MatSeqAIJKokkosSyncDevice(X));
1272 PetscCall(PetscLogGpuTimeBegin());
1273
1274 if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1275 PetscBool e;
1276 PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1277 if (e) {
1278 PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1279 if (e) pattern = SAME_NONZERO_PATTERN;
1280 }
1281 }
1282
1283 /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1284 cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1285 If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1286 KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1287 */
1288 ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1289 xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1290 Xa = xkok->a_dual.view_device();
1291 Ya = ykok->a_dual.view_device();
1292
1293 if (pattern == SAME_NONZERO_PATTERN) {
1294 KokkosBlas::axpy(exec, alpha, Xa, Ya);
1295 PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1296 } else if (pattern == SUBSET_NONZERO_PATTERN) {
1297 MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1298 MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1299
1300 Kokkos::parallel_for(
1301 Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1302 PetscInt i = t.league_rank(); // row i
1303 Kokkos::single(Kokkos::PerTeam(t), [=]() {
1304 // Only one thread works in a team
1305 PetscInt p, q = Yi(i);
1306 for (p = Xi(i); p < Xi(i + 1); p++) { // For each nonzero on row i of X,
1307 while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1308 if (Xj(p) == Yj(q)) { // Found it
1309 Ya(q) += alpha * Xa(p);
1310 q++;
1311 } else {
1312 // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1313 // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1314 #if PETSC_PKG_KOKKOS_VERSION_GE(5, 0, 0)
1315 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = KokkosKernels::ArithTraits<PetscScalar>::nan();
1316 #elif PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1317 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1318 #else
1319 if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1320 #endif
1321 }
1322 }
1323 });
1324 });
1325 PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1326 } else { // different nonzero patterns
1327 Mat Z;
1328 KokkosCsrMatrix zcsr;
1329 KernelHandle kh;
1330 kh.create_spadd_handle(true); // X, Y are sorted
1331 KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1332 KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1333 zkok = new Mat_SeqAIJKokkos(zcsr);
1334 PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1335 PetscCall(MatHeaderReplace(Y, &Z));
1336 kh.destroy_spadd_handle();
1337 }
1338 PetscCall(PetscLogGpuTimeEnd());
1339 PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1340 PetscFunctionReturn(PETSC_SUCCESS);
1341 }
1342
/* Device-resident mirror of MatCOOStruct_SeqAIJ, used by MatSetValuesCOO_SeqAIJKokkos() */
struct MatCOOStruct_SeqAIJKokkos {
  PetscCount n;            // number of COO entries provided by the user
  PetscCount Atot;         // total length of the perm array
  PetscInt nz;             // number of nonzeros in the assembled matrix
  PetscCountKokkosView jmap; // jmap(i)..jmap(i+1) delimits the COO entries contributing to nonzero i
  PetscCountKokkosView perm; // permutation from COO input order to nonzero order

  // Deep-copy the host COO struct's arrays to device memory
  MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
  {
    nz   = coo_h->nz;
    n    = coo_h->n;
    Atot = coo_h->Atot;
    jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
    perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
  }
};
1359
/* Container destructor for the device COO struct; data is the address of the
   MatCOOStruct_SeqAIJKokkos pointer stored in the container */
static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(PetscCtxRt data)
{
  PetscFunctionBegin;
  PetscCallCXX(delete *static_cast<MatCOOStruct_SeqAIJKokkos **>(data));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1366
/* COO preallocation: let the host SeqAIJ implementation build the pattern and host COO maps,
   then rebuild the device CSR mirror and copy the COO maps to the device */
static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
{
  Mat_SeqAIJKokkos *akok;
  Mat_SeqAIJ *aseq;
  PetscContainer container_h;
  MatCOOStruct_SeqAIJ *coo_h;
  MatCOOStruct_SeqAIJKokkos *coo_d;

  PetscFunctionBegin;
  PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
  aseq = static_cast<Mat_SeqAIJ *>(mat->data);
  akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
  delete akok; /* discard the stale device mirror; the pattern was just rebuilt on the host */
  mat->spptr = akok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
  PetscCall(MatZeroEntries_SeqAIJKokkos(mat));

  // Copy the COO struct to device
  PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
  PetscCall(PetscContainerGetPointer(container_h, &coo_h));
  PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));

  // Put the COO struct in a container and then attach that to the matrix
  PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1392
/* Set matrix values from a COO value array v[] (host or device memory), accumulating
   repeated COO entries into each nonzero via the device jmap/perm maps built at preallocation */
static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
{
  MatScalarKokkosView Aa;
  ConstMatScalarKokkosView kv;
  PetscMemType memtype;
  PetscContainer container;
  MatCOOStruct_SeqAIJKokkos *coo;

  PetscFunctionBegin;
  PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
  PetscCall(PetscContainerGetPointer(container, &coo));

  const auto &n    = coo->n;
  const auto &Annz = coo->nz;
  const auto &jmap = coo->jmap;
  const auto &perm = coo->perm;

  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
  } else {
    kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
  }

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
  else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */

  PetscCall(PetscLogGpuTimeBegin());
  Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
      /* nonzero i receives the sum of all COO inputs mapped to it by perm/jmap */
      PetscScalar sum = 0.0;
      for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
      Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
    });
  PetscCall(PetscLogGpuTimeEnd());

  if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
  else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1433
// Install the MATSEQAIJKOKKOS implementation: override the Mat-level ops table (A->ops),
// the Mat_SeqAIJ-level array-access ops (a->ops), and compose the COO assembly functions.
static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
{
  Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;

  PetscFunctionBegin;
  A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
  A->boundtocpu = PETSC_FALSE;

  // Mat-level operations
  A->ops->assemblyend = MatAssemblyEnd_SeqAIJKokkos;
  A->ops->destroy = MatDestroy_SeqAIJKokkos;
  A->ops->duplicate = MatDuplicate_SeqAIJKokkos;
  A->ops->axpy = MatAXPY_SeqAIJKokkos;
  A->ops->scale = MatScale_SeqAIJKokkos;
  A->ops->zeroentries = MatZeroEntries_SeqAIJKokkos;
  A->ops->productsetfromoptions = MatProductSetFromOptions_SeqAIJKokkos;
  A->ops->mult = MatMult_SeqAIJKokkos;
  A->ops->multadd = MatMultAdd_SeqAIJKokkos;
  A->ops->multtranspose = MatMultTranspose_SeqAIJKokkos;
  A->ops->multtransposeadd = MatMultTransposeAdd_SeqAIJKokkos;
  A->ops->multhermitiantranspose = MatMultHermitianTranspose_SeqAIJKokkos;
  A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
  A->ops->productnumeric = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
  A->ops->transpose = MatTranspose_SeqAIJKokkos;
  A->ops->setoption = MatSetOption_SeqAIJKokkos;
  A->ops->getdiagonal = MatGetDiagonal_SeqAIJKokkos;
  A->ops->shift = MatShift_SeqAIJKokkos;
  A->ops->diagonalset = MatDiagonalSet_SeqAIJKokkos;
  A->ops->diagonalscale = MatDiagonalScale_SeqAIJKokkos;
  A->ops->getcurrentmemtype = MatGetCurrentMemType_SeqAIJKokkos;
  // Mat_SeqAIJ-level array access: route Get/Restore-array calls through Kokkos views
  a->ops->getarray = MatSeqAIJGetArray_SeqAIJKokkos;
  a->ops->restorearray = MatSeqAIJRestoreArray_SeqAIJKokkos;
  a->ops->getarrayread = MatSeqAIJGetArrayRead_SeqAIJKokkos;
  a->ops->restorearrayread = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
  a->ops->getarraywrite = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
  a->ops->restorearraywrite = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
  a->ops->getcsrandmemtype = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;

  // COO assembly entry points, dispatched by the generic MatSetPreallocationCOO()/MatSetValuesCOO()
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
#if defined(PETSC_HAVE_HYPRE)
  PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
#endif
  PetscFunctionReturn(PETSC_SUCCESS);
}
1478
1479 /*
  Extract the (prescribed) diagonal blocks of the matrix and then invert them
1481
1482 Input Parameters:
1483 + A - the MATSEQAIJKOKKOS matrix
1484 . bs - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1485 . bs2 - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1486 . blkMap - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1487 - work - a pre-allocated work buffer (as big as diagVal) for use by this routine
1488
1489 Output Parameter:
1490 . diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1491 */
PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
{
  Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
  PetscInt nblocks = bs.extent(0) - 1; // bs is in 'csr' format, so it has one more entry than the number of blocks

  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device

  // NOTE(review): blkMap is not referenced in this routine's body; it is kept for interface
  // consistency with the description above — confirm whether callers rely on it here.

  // Pull out the diagonal blocks of the matrix and then invert the blocks
  auto Aa = akok->a_dual.view_device();
  auto Ai = akok->i_dual.view_device();
  auto Aj = akok->j_dual.view_device();
  auto Adiag = akok->diag_dual.view_device(); // Adiag(i) is the index into Aa of row i's diagonal entry (may be out of row range if absent)
  // TODO: how to tune the team size?
#if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
  auto ts = Kokkos::AUTO();
#else
  auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
#endif
  // One team per block: the team cooperatively extracts the dense block, then LU-factors and inverts it
  PetscCallCXX(Kokkos::parallel_for(
    Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
      const PetscInt bid = teamMember.league_rank(); // block id
      const PetscInt rstart = bs(bid); // this block starts from this row
      const PetscInt m = bs(bid + 1) - bs(bid); // size of this block
      const auto &B = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order; aliases the output buffer
      const auto &W = PetscScalarKokkosView(&work(bs2(bid)), m * m); // scratch for TeamInverseLU, same size as B

      Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
        PetscInt i = rstart + r; // i-th row in A

        if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
          PetscInt first = Adiag(i) - r; // we start to check nonzeros from here along this row

          for (PetscInt c = 0; c < m; c++) { // walk m steps to see what column indices we will meet
            if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
              B(r, c) = 0.0;
            } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
              B(r, c) = Aa(first + c);
            } else { // this entry does not show up in the CSR
              B(r, c) = 0.0;
            }
          }
        } else { // rare case that the diagonal does not exist
          const PetscInt begin = Ai(i);
          const PetscInt end = Ai(i + 1);
          for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
          for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
            if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
            else if (Aj(j) >= rstart + m) break; // columns are sorted, nothing in the block past this point
          }
        }
      });

      // LU-decompose B (w/o pivoting) and then invert B; the inverse overwrites B, i.e., lands in diagVal
      // (the scalar 0.0 passed to TeamLU is its 'tiny' regularization parameter, i.e., no shift — TODO confirm against KokkosBatched docs)
      KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
      KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
    }));
  // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
  PetscFunctionReturn(PETSC_SUCCESS);
}
1552
MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos * akok)1553 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1554 {
1555 Mat_SeqAIJ *aseq;
1556 PetscInt i, m, n;
1557 auto exec = PetscGetKokkosExecutionSpace();
1558
1559 PetscFunctionBegin;
1560 PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1561
1562 m = akok->nrows();
1563 n = akok->ncols();
1564 PetscCall(MatSetSizes(A, m, n, m, n));
1565 PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1566
1567 /* Set up data structures of A as a MATSEQAIJ */
1568 PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1569 aseq = (Mat_SeqAIJ *)A->data;
1570
1571 PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1572 PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1573
1574 aseq->i = akok->i_host_data();
1575 aseq->j = akok->j_host_data();
1576 aseq->a = akok->a_host_data();
1577 aseq->nonew = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1578 aseq->free_a = PETSC_FALSE;
1579 aseq->free_ij = PETSC_FALSE;
1580 aseq->nz = akok->nnz();
1581 aseq->maxnz = aseq->nz;
1582
1583 PetscCall(PetscMalloc1(m, &aseq->imax));
1584 PetscCall(PetscMalloc1(m, &aseq->ilen));
1585 for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1586
1587 /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1588 akok->nonzerostate = A->nonzerostate;
1589 A->spptr = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1590 PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1591 PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1592 PetscFunctionReturn(PETSC_SUCCESS);
1593 }
1594
MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A,KokkosCsrMatrix * csr)1595 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1596 {
1597 PetscFunctionBegin;
1598 PetscCall(MatSeqAIJKokkosSyncDevice(A));
1599 *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1600 PetscFunctionReturn(PETSC_SUCCESS);
1601 }
1602
MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm,KokkosCsrMatrix csr,Mat * A)1603 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1604 {
1605 Mat_SeqAIJKokkos *akok;
1606
1607 PetscFunctionBegin;
1608 PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1609 PetscCall(MatCreate(comm, A));
1610 PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1611 PetscFunctionReturn(PETSC_SUCCESS);
1612 }
1613
/* Create a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1615
1616 Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1617 */
// Create a new MATSEQAIJKOKKOS matrix on comm and attach the given Mat_SeqAIJKokkos to it;
// the new matrix takes over akok (see MatSetSeqAIJKokkosWithCSRMatrix).
PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
{
  PetscFunctionBegin;
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1625
1626 /*@C
1627 MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1628 (the default parallel PETSc format). This matrix will ultimately be handled by
1629 Kokkos for calculations.
1630
1631 Collective
1632
1633 Input Parameters:
1634 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1635 . m - number of rows
1636 . n - number of columns
1637 . nz - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1638 - nnz - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1639
1640 Output Parameter:
1641 . A - the matrix
1642
1643 Level: intermediate
1644
1645 Notes:
1646 It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
  MatXXXXSetPreallocation() paradigm instead of this routine directly.
1648 [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1649
1650 The AIJ format, also called
1651 compressed row storage, is fully compatible with standard Fortran
1652 storage. That is, the stored row and column indices can begin at
1653 either one (as in Fortran) or zero.
1654
1655 Specify the preallocated storage with either `nz` or `nnz` (not both).
1656 Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1657 allocation.
1658
1659 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1660 @*/
PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck()); // Make sure Kokkos is initialized before any view is created
  PetscCall(MatCreate(comm, A));
  PetscCall(MatSetSizes(*A, m, n, m, n)); // sequential matrix: local sizes equal global sizes
  PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
  // Preallocate via the SeqAIJ routine; the cast drops const as the legacy interface takes PetscInt*
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1671
1672 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1673 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1674 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
// Similarly, in KK, sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
MatSeqAIJKokkosSolveCheck(Mat A)1676 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1677 {
1678 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1679 const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1680 const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1681
1682 PetscFunctionBegin;
1683 if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1684 if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1685 if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1686 factors->sptrsv_symbolic_completed = PETSC_TRUE;
1687 }
1688 PetscFunctionReturn(PETSC_SUCCESS);
1689 }
1690
// Build/refresh the transposes of the triangular factors (U^T, and L^T when L exists) and run
// sptrsv_symbolic() on them, so that transpose solves can be performed. A no-op when
// factors->transpose_updated is already set. Transposition happens on host or device,
// matching where the factorization itself was done.
static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
{
  const PetscInt n = A->rmap->n;
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  const PetscBool has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Cholesky
  const PetscBool has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Cholesky

  PetscFunctionBegin;
  if (!factors->transpose_updated) {
    if (has_upper) {
      if (!factors->iUt_d.extent(0)) { // Allocate Ut on device if not yet
        factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
        factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
        factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
      }

      if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
        if (!factors->U) {
          // First time: wrap the host factor arrays in a SeqAIJ matrix (no copy), transpose it,
          // and alias the transpose's arrays with host views so MAT_REUSE_MATRIX works later
          Mat_SeqAIJ *seq;

          PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
          PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));

          seq = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
          factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
          factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
          factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
        } else {
          PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
        }
        // Copy Ut from host to device
        PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
        PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
        PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
      } else { // If U was computed on device, we also compute the transpose there
        // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
        PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
                                                                                                                                                                                                                            factors->jU_d, factors->aU_d,
                                                                                                                                                                                                                            factors->iUt_d, factors->jUt_d,
                                                                                                                                                                                                                            factors->aUt_d));
        PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
      }
      // Redo the symbolic phase for U^T since its values/structure were just (re)built
      PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
    }

    // do the same for L with LU
    if (has_lower) {
      if (!factors->iLt_d.extent(0)) { // Allocate Lt on device if not yet
        factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
        factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
        factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
      }

      if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
        if (!factors->L) {
          Mat_SeqAIJ *seq;

          PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
          PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));

          seq = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
          factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
          factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
          factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
        } else {
          PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
        }
        // Copy Lt from host to device
        PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
        PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
        PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
      } else { // If L was computed on device, we also compute the transpose there
        // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
        PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
                                                                                                                                                                                                                            factors->jL_d, factors->aL_d,
                                                                                                                                                                                                                            factors->iLt_d, factors->jLt_d,
                                                                                                                                                                                                                            factors->aLt_d));
        PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
      }
      PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
    }

    factors->transpose_updated = PETSC_TRUE;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1777
1778 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1779 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
{
  auto exec = PetscGetKokkosExecutionSpace();
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  PetscInt m = A->rmap->n;
  PetscScalarKokkosView D = factors->D_d; // inverted diagonal of the U^T D U factorization
  PetscScalarKokkosView X, Y, B; // alias
  ConstPetscScalarKokkosView b;
  PetscScalarKokkosView x;
  PetscIntKokkosView &rowperm = factors->rowperm;
  PetscBool identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; // empty rowperm means no reordering

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
  PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
  PetscCall(VecGetKokkosView(bb, &b));
  PetscCall(VecGetKokkosViewWrite(xx, &x));

  // Solve U^T Y = B
  if (identity) { // no reordering: use b directly (read-only despite the const_cast) and put Y in the work vector
    B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
    Y = factors->workVector;
  } else { // reorder b with the row permutation into the work vector; x is free to hold Y
    B = factors->workVector;
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
    Y = x;
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

  // Solve diag(D) Y' = Y.
  // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
  PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));

  // Solve UX = Y
  if (identity) {
    X = x; // result goes straight into x
  } else {
    X = factors->workVector; // B is not needed anymore
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

  // Reorder X with the inverse column (row) permutation
  if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));

  PetscCall(VecRestoreKokkosView(bb, &b));
  PetscCall(VecRestoreKokkosViewWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
1830
1831 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1832 // R and C are represented by rowperm and colperm in factors.
1833 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
{
  auto exec = PetscGetKokkosExecutionSpace();
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  PetscInt m = A->rmap->n;
  PetscScalarKokkosView X, Y, B; // alias
  ConstPetscScalarKokkosView b;
  PetscScalarKokkosView x;
  PetscIntKokkosView &rowperm = factors->rowperm;
  PetscIntKokkosView &colperm = factors->colperm;
  PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; // empty perm views mean no reordering
  PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosSolveCheck(A));
  PetscCall(VecGetKokkosView(bb, &b));
  PetscCall(VecGetKokkosViewWrite(xx, &x));

  // Solve L Y = B (i.e., L (U C^- x) = R b). R b indicates applying the row permutation on b.
  if (row_identity) { // use b directly (read-only despite the const_cast), with Y in the work vector
    B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
    Y = factors->workVector;
  } else { // reorder b with the row permutation into the work vector; x is free to hold Y
    B = factors->workVector;
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
    Y = x;
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));

  // Solve U C^- x = Y
  if (col_identity) {
    X = x; // result goes straight into x
  } else {
    X = factors->workVector; // B is not needed anymore
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));

  // x = C X; Reorder X with the inverse col permutation
  if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));

  PetscCall(VecRestoreKokkosView(bb, &b));
  PetscCall(VecRestoreKokkosViewWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
1880
1881 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1882 // R and C are represented by rowperm and colperm in factors.
1883 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1884 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
{
  auto exec = PetscGetKokkosExecutionSpace();
  Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
  PetscInt m = A->rmap->n;
  PetscScalarKokkosView X, Y, B; // alias
  ConstPetscScalarKokkosView b;
  PetscScalarKokkosView x;
  PetscIntKokkosView &rowperm = factors->rowperm;
  PetscIntKokkosView &colperm = factors->colperm;
  PetscBool row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE; // empty perm views mean no reordering
  PetscBool col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
  PetscCall(VecGetKokkosView(bb, &b));
  PetscCall(VecGetKokkosViewWrite(xx, &x));

  // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b). Note C^- b = C^T b, which means applying the column permutation on b.
  if (col_identity) { // no column permutation: use b directly (read-only despite the const_cast)
    B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
    Y = factors->workVector;
  } else { // reorder b with the col permutation into the work vector; x is free to hold Y
    B = factors->workVector;
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
    Y = x;
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));

  // Solve L^T X = Y
  if (row_identity) {
    X = x; // result goes straight into x
  } else {
    X = factors->workVector; // B is not needed anymore
  }
  PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));

  // x = R^- X = R^T X; Reorder X with the inverse row permutation
  if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));

  PetscCall(VecRestoreKokkosView(bb, &b));
  PetscCall(VecRestoreKokkosViewWrite(xx, &x));
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
1931
// Numeric LU factorization: delegate the factorization itself to the host SeqAIJ routine, then
// (unless solving on host) split the packed SeqAIJ factor storage into separate L and U CSR
// factors and mirror them to the device for KK sptrsv solves.
static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncHost(A)); // factorization runs on host, so A's values must be current there
  PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));

  if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
    Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
    Mat_SeqAIJ *b = static_cast<Mat_SeqAIJ *>(B->data);
    const PetscInt *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
    const MatScalar *Ba = b->a;
    PetscInt m = B->rmap->n, n = B->cmap->n;

    if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
      // Allocate memory and copy the structure
      factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
      factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
      factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
      factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
      // Bdiag[] decreases with i (see ulen below), so Bdiag[0]-Bdiag[m] is the total U entry count, diagonal included
      factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
      factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));

      PetscInt *Li = factors->iL_h.data();
      PetscInt *Lj = factors->jL_h.data();
      PetscInt *Ui = factors->iU_h.data();
      PetscInt *Uj = factors->jU_h.data();

      // Build the column indices of L (adding the unit diagonal) and U (diagonal first) row by row
      Li[0] = Ui[0] = 0;
      for (PetscInt i = 0; i < m; i++) {
        PetscInt llen = Bi[i + 1] - Bi[i]; // exclusive of the diagonal entry
        PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry

        PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
        Lj[Li[i] + llen] = i; // diagonal entry of L

        Uj[Ui[i]] = i; // diagonal entry of U
        PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on the right of the diagonal

        Li[i + 1] = Li[i] + llen + 1;
        Ui[i + 1] = Ui[i] + ulen;
      }

      // Structure arrays are copied to device once; the value mirrors are created without copy
      // since values are deep_copy'ed below after every numeric factorization
      factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
      factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
      factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
      factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
      factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
      factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);

      // Copy row/col permutation to device
      IS rowperm = ((Mat_SeqAIJ *)B->data)->row;
      PetscBool row_identity;
      PetscCall(ISIdentity(rowperm, &row_identity));
      if (!row_identity) { // identity permutations are represented by an empty factors->rowperm
        const PetscInt *ip;

        PetscCall(ISGetIndices(rowperm, &ip));
        factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
        PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
        PetscCall(ISRestoreIndices(rowperm, &ip));
        PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
      }

      IS colperm = ((Mat_SeqAIJ *)B->data)->col;
      PetscBool col_identity;
      PetscCall(ISIdentity(colperm, &col_identity));
      if (!col_identity) {
        const PetscInt *ip;

        PetscCall(ISGetIndices(colperm, &ip));
        factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
        PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
        PetscCall(ISRestoreIndices(colperm, &ip));
        PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
      }

      /* Create sptrsv handles for L, U and their transpose */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
      auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
#else
      auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
#endif
      factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
      factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
      factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
      factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
    }

    // Copy the value (done every numeric factorization; L gets a unit diagonal, U's diagonal is stored inverted)
    for (PetscInt i = 0; i < m; i++) {
      PetscInt llen = Bi[i + 1] - Bi[i];
      PetscInt ulen = Bdiag[i] - Bdiag[i + 1];
      const PetscInt *Li = factors->iL_h.data();
      const PetscInt *Ui = factors->iU_h.data();

      PetscScalar *La = factors->aL_h.data();
      PetscScalar *Ua = factors->aU_h.data();

      PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
      La[Li[i] + llen] = 1.0; // diagonal entry

      Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]]; // diagonal entry; MatLUFactorNumeric_SeqAIJ stores it inverted already
      PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
    }

    PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
    PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
    // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
    factors->transpose_updated = PETSC_FALSE;
    factors->sptrsv_symbolic_completed = PETSC_FALSE;

    B->ops->solve = MatSolve_SeqAIJKokkos_LU;
    B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
  }

  B->ops->matsolve = NULL;
  B->ops->matsolvetranspose = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2051
MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B,Mat A,const MatFactorInfo * info)2052 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2053 {
2054 Mat_SeqAIJKokkos *aijkok = (Mat_SeqAIJKokkos *)A->spptr;
2055 Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2056 PetscInt fill_lev = info->levels;
2057
2058 PetscFunctionBegin;
2059 PetscCall(PetscLogGpuTimeBegin());
2060 PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2061 PetscCall(MatSeqAIJKokkosSyncDevice(A));
2062
2063 auto a_d = aijkok->a_dual.view_device();
2064 auto i_d = aijkok->i_dual.view_device();
2065 auto j_d = aijkok->j_dual.view_device();
2066
2067 PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2068
2069 B->assembled = PETSC_TRUE;
2070 B->preallocated = PETSC_TRUE;
2071 B->ops->solve = MatSolve_SeqAIJKokkos_LU;
2072 B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2073 B->ops->matsolve = NULL;
2074 B->ops->matsolvetranspose = NULL;
2075
2076 /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2077 factors->transpose_updated = PETSC_FALSE;
2078 factors->sptrsv_symbolic_completed = PETSC_FALSE;
2079 /* TODO: log flops, but how to know that? */
2080 PetscCall(PetscLogGpuTimeEnd());
2081 PetscFunctionReturn(PETSC_SUCCESS);
2082 }
2083
2084 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
// Symbolic ILU(0) factorization on device with Kokkos Kernels' spiluk_symbolic(); the two IS
// arguments (row/col permutations) are ignored since the caller guarantees natural ordering.
// Sets up factors->kh plus the iL/jL/iU/jU structure views and installs the numeric callback.
static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
{
  Mat_SeqAIJKokkos           *aijkok;
  Mat_SeqAIJ                 *b;
  Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
  PetscInt                    fill_lev = info->levels;
  PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
  PetscInt                    n        = A->rmap->n;

  PetscFunctionBegin;
  PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
  PetscCall(MatSeqAIJKokkosSyncDevice(A)); // spiluk_symbolic() reads A's structure from device views

  /* Create a spiluk handle and then do symbolic factorization */
  nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA); // initial nnz estimate from the user's fill ratio
  factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);

  auto spiluk_handle = factors->kh.get_spiluk_handle();

  Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
  Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
  Kokkos::realloc(factors->iU_d, n + 1);
  Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());

  aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
  auto i_d = aijkok->i_dual.view_device();
  auto j_d = aijkok->j_dual.view_device();
  PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
  /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */

  // After symbolic, the handle reports the exact nnz counts, which may differ from the estimate
  Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
  Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
  Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
  Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());

  /* TODO: add options to select sptrsv algorithms */
  /* Create sptrsv handles for L, U and their transpose */
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
  auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
#else
  auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
#endif

  factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
  factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
  factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
  factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);

  /* Fill fields of the factor matrix B */
  PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
  b                          = (Mat_SeqAIJ *)B->data;
  b->nz = b->maxnz           = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
  B->info.fill_ratio_given   = info->fill;
  B->info.fill_ratio_needed  = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0; // guard nnzA == 0

  B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2143
MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)2144 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2145 {
2146 PetscFunctionBegin;
2147 PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2148 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2149 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2150 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2151 PetscFunctionReturn(PETSC_SUCCESS);
2152 }
2153
MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)2154 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2155 {
2156 PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2157
2158 PetscFunctionBegin;
2159 if (!info->factoronhost) {
2160 PetscCall(ISIdentity(isrow, &row_identity));
2161 PetscCall(ISIdentity(iscol, &col_identity));
2162 }
2163
2164 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2165 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2166
2167 if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2168 PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2169 } else {
2170 PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2171 B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2172 }
2173 PetscFunctionReturn(PETSC_SUCCESS);
2174 }
2175
// Numeric Cholesky factorization: do the factorization itself on host via PETSc's SeqAIJ code,
// then (unless solving on host) mirror the resulting U factor and diagonal to device views so
// the Kokkos sptrsv-based solve callbacks can be used.
static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
{
  PetscFunctionBegin;
  PetscCall(MatSeqAIJKokkosSyncHost(A)); // host factorization needs A's latest values on host
  PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));

  if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
    Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
    Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
    const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
    const MatScalar            *Ba = b->a;
    PetscInt                    m  = B->rmap->n;

    if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
      // Allocate memory and copy the structure
      factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
      factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
      factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
      factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
      factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
      factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);

      // Build jU_h from the skewed Aj: put the diagonal first, then the off-diagonal entries of each row
      PetscInt *Uj = factors->jU_h.data();
      for (PetscInt i = 0; i < m; i++) {
        PetscInt ulen = Bi[i + 1] - Bi[i];
        Uj[Bi[i]]     = i;                                               // diagonal entry
        PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
      }

      // Copy iU, jU to device (the structure is fixed, so this happens only once)
      PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
      PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));

      // Copy row/col permutation to device
      IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
      PetscBool row_identity;
      PetscCall(ISIdentity(rowperm, &row_identity));
      if (!row_identity) { // only ship the permutation if it is nontrivial
        const PetscInt *ip;

        PetscCall(ISGetIndices(rowperm, &ip));
        PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
        PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
        PetscCall(ISRestoreIndices(rowperm, &ip));
        PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
      }

      // Create sptrsv handles for U and U^T
#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
      auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
#else
      auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
#endif
      factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
      factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
    }
    // These pointers were set by MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
    B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
    B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;

    // Copy the value
    PetscScalar *Ua = factors->aU_h.data();
    PetscScalar *D  = factors->D_h.data();
    for (PetscInt i = 0; i < m; i++) {
      D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
      Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
      // off-diagonal entries of U are negated relative to the host factor's storage
      for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
    }
    PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
    PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));

    factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
    factors->transpose_updated         = PETSC_FALSE;
  }

  B->ops->matsolve          = NULL;
  B->ops->matsolvetranspose = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}
2256
MatICCFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS perm,const MatFactorInfo * info)2257 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2258 {
2259 PetscFunctionBegin;
2260 if (info->solveonhost) {
2261 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2262 PetscCall(MatSetType(B, MATSEQSBAIJ));
2263 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2264 }
2265
2266 PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2267
2268 if (!info->solveonhost) {
2269 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2270 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2271 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2272 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2273 }
2274 PetscFunctionReturn(PETSC_SUCCESS);
2275 }
2276
MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS perm,const MatFactorInfo * info)2277 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2278 {
2279 PetscFunctionBegin;
2280 if (info->solveonhost) {
2281 // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2282 PetscCall(MatSetType(B, MATSEQSBAIJ));
2283 PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2284 }
2285
2286 PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2287
2288 if (!info->solveonhost) {
2289 // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2290 PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2291 PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2292 B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2293 }
2294 PetscFunctionReturn(PETSC_SUCCESS);
2295 }
2296
2297 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A,MatSolverType * type)2298 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2299 {
2300 PetscFunctionBegin;
2301 *type = MATSOLVERKOKKOS;
2302 PetscFunctionReturn(PETSC_SUCCESS);
2303 }
2304
2305 /*MC
2306 MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2307 on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2308
2309 Level: beginner
2310
2311 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`
2312 M*/
MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat * B)2313 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2314 {
2315 PetscInt n = A->rmap->n;
2316 MPI_Comm comm;
2317
2318 PetscFunctionBegin;
2319 PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2320 PetscCall(MatCreate(comm, B));
2321 PetscCall(MatSetSizes(*B, n, n, n, n));
2322 PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2323 (*B)->factortype = ftype;
2324 PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2325 PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2326 PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2327
2328 if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2329 (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqAIJKokkos;
2330 (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2331 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2332 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2333 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2334 } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2335 (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqAIJKokkos;
2336 (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2337 PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2338 PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2339 } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2340
2341 // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2342 (*B)->canuseordering = PETSC_TRUE;
2343 PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2344 PetscFunctionReturn(PETSC_SUCCESS);
2345 }
2346
MatSolverTypeRegister_Kokkos(void)2347 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2348 {
2349 PetscFunctionBegin;
2350 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2351 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2352 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2353 PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2354 PetscFunctionReturn(PETSC_SUCCESS);
2355 }
2356
2357 /* Utility to print out a KokkosCsrMatrix for debugging */
PrintCsrMatrix(const KokkosCsrMatrix & csrmat)2358 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2359 {
2360 const auto &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2361 const auto &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2362 const auto &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2363 const PetscInt *i = iv.data();
2364 const PetscInt *j = jv.data();
2365 const PetscScalar *a = av.data();
2366 PetscInt m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2367
2368 PetscFunctionBegin;
2369 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2370 for (PetscInt k = 0; k < m; k++) {
2371 PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2372 for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2373 PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2374 }
2375 PetscFunctionReturn(PETSC_SUCCESS);
2376 }
2377