xref: /petsc/src/mat/impls/aij/seq/kokkos/aijkok.kokkos.cxx (revision 4e8208cbcbc709572b8abe32f33c78b69c819375)
1 #include <petsc_kokkos.hpp>
2 #include <petscvec_kokkos.hpp>
3 #include <petscmat_kokkos.hpp>
4 #include <petscpkg_version.h>
5 #include <petsc/private/petscimpl.h>
6 #include <petsc/private/sfimpl.h>
7 #include <petsc/private/kokkosimpl.hpp>
8 #include <petscsys.h>
9 
10 #include <Kokkos_Core.hpp>
11 #include <KokkosBlas.hpp>
12 #include <KokkosSparse_CrsMatrix.hpp>
13 
14 // To suppress compiler warnings:
15 // /path/include/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp:434:63:
16 // warning: 'cusparseStatus_t cusparseDbsrmm(cusparseHandle_t, cusparseDirection_t, cusparseOperation_t,
17 // cusparseOperation_t, int, int, int, int, const double*, cusparseMatDescr_t, const double*, const int*, const int*,
18 // int, const double*, int, const double*, double*, int)' is deprecated: please use cusparseSpMM instead [-Wdeprecated-declarations]
19 #define DISABLE_CUSPARSE_DEPRECATED
20 #include <KokkosSparse_spmv.hpp>
21 
22 #include <KokkosSparse_spiluk.hpp>
23 #include <KokkosSparse_sptrsv.hpp>
24 #include <KokkosSparse_spgemm.hpp>
25 #include <KokkosSparse_spadd.hpp>
26 #include <KokkosBatched_LU_Decl.hpp>
27 #include <KokkosBatched_InverseLU_Decl.hpp>
28 
29 #include <../src/mat/impls/aij/seq/kokkos/aijkok.hpp>
30 
31 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(3, 7, 0)
32   #include <KokkosSparse_Utils.hpp>
33 using KokkosSparse::sort_crs_matrix;
34 using KokkosSparse::Impl::transpose_matrix;
35 #else
36   #include <KokkosKernels_Sorting.hpp>
37 using KokkosKernels::sort_crs_matrix;
38 using KokkosKernels::Impl::transpose_matrix;
39 #endif
40 
41 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_GE(4, 6, 0)
42 using KokkosSparse::spiluk_symbolic;
43 using KokkosSparse::spiluk_numeric;
44 using KokkosSparse::sptrsv_symbolic;
45 using KokkosSparse::sptrsv_solve;
46 using KokkosSparse::Experimental::SPTRSVAlgorithm;
47 using KokkosSparse::Experimental::SPILUKAlgorithm;
48 #else
49 using KokkosSparse::Experimental::spiluk_symbolic;
50 using KokkosSparse::Experimental::spiluk_numeric;
51 using KokkosSparse::Experimental::sptrsv_symbolic;
52 using KokkosSparse::Experimental::sptrsv_solve;
53 using KokkosSparse::Experimental::SPTRSVAlgorithm;
54 using KokkosSparse::Experimental::SPILUKAlgorithm;
55 #endif
56 
57 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat); /* Forward declaration */
58 
59 /* MatAssemblyEnd_SeqAIJKokkos() happens when we finalized nonzeros of the matrix, either after
60    we assembled the matrix on host, or after we directly produced the matrix data on device (ex., through MatMatMult).
61    In the latter case, it is important to set a_dual's sync state correctly.
62  */
MatAssemblyEnd_SeqAIJKokkos(Mat A,MatAssemblyType mode)63 static PetscErrorCode MatAssemblyEnd_SeqAIJKokkos(Mat A, MatAssemblyType mode)
64 {
65   Mat_SeqAIJ       *aijseq;
66   Mat_SeqAIJKokkos *aijkok;
67 
68   PetscFunctionBegin;
69   if (mode == MAT_FLUSH_ASSEMBLY) PetscFunctionReturn(PETSC_SUCCESS);
70   PetscCall(MatAssemblyEnd_SeqAIJ(A, mode));
71 
72   aijseq = static_cast<Mat_SeqAIJ *>(A->data);
73   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
74 
75   /* If aijkok does not exist, we just copy i, j to device.
76      If aijkok already exists, but the device's nonzero pattern does not match with the host's, we assume the latest data is on host.
77      In both cases, we build a new aijkok structure.
78   */
79   if (!aijkok || aijkok->nonzerostate != A->nonzerostate) { /* aijkok might not exist yet or nonzero pattern has changed */
80     if (aijkok && aijkok->host_aij_allocated_by_kokkos) {   /* Avoid accidentally freeing much needed a,i,j on host when deleting aijkok */
81       PetscCall(PetscShmgetAllocateArray(aijkok->nrows() + 1, sizeof(PetscInt), (void **)&aijseq->i));
82       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->j));
83       PetscCall(PetscShmgetAllocateArray(aijkok->nnz(), sizeof(PetscInt), (void **)&aijseq->a));
84       PetscCall(PetscArraycpy(aijseq->i, aijkok->i_host_data(), aijkok->nrows() + 1));
85       PetscCall(PetscArraycpy(aijseq->j, aijkok->j_host_data(), aijkok->nnz()));
86       PetscCall(PetscArraycpy(aijseq->a, aijkok->a_host_data(), aijkok->nnz()));
87       aijseq->free_a  = PETSC_TRUE;
88       aijseq->free_ij = PETSC_TRUE;
89       /* This arises from MatCreateSeqAIJKokkosWithKokkosCsrMatrix() used in MatMatMult, where
90          we have the CsrMatrix on device first and then copy to host, followed by
91          MatSetMPIAIJWithSplitSeqAIJ() with garray = NULL.
92          One could improve it by not using NULL garray.
93       */
94     }
95     delete aijkok;
96     aijkok   = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aijseq, A->nonzerostate, PETSC_FALSE /* don't copy mat values to device */);
97     A->spptr = aijkok;
98   } else if (A->rmap->n && aijkok->diag_dual.extent(0) == 0) { // MatProduct might directly produce AIJ on device, but not the diag.
99     const PetscInt *adiag;
100     /* the a->diag is created at assmebly here because the rest of the Kokkos AIJ code assumes it always exists. This needs to be fixed since it is now only created when needed! */
101     PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &adiag, NULL));
102     MatRowMapKokkosViewHost diag_h(aijseq->diag, A->rmap->n);
103     auto                    diag_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), diag_h);
104     aijkok->diag_dual              = MatRowMapKokkosDualView(diag_d, diag_h);
105   }
106   PetscFunctionReturn(PETSC_SUCCESS);
107 }
108 
109 /* Sync CSR data to device if not yet */
MatSeqAIJKokkosSyncDevice(Mat A)110 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosSyncDevice(Mat A)
111 {
112   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
113 
114   PetscFunctionBegin;
115   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from host to device");
116   PetscCheck(aijkok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
117   if (aijkok->a_dual.need_sync_device()) {
118     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
119     aijkok->transpose_updated = PETSC_FALSE; /* values of the transpose is out-of-date */
120     aijkok->hermitian_updated = PETSC_FALSE;
121   }
122   PetscFunctionReturn(PETSC_SUCCESS);
123 }
124 
125 /* Mark the CSR data on device as modified */
MatSeqAIJKokkosModifyDevice(Mat A)126 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosModifyDevice(Mat A)
127 {
128   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
129 
130   PetscFunctionBegin;
131   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Not supported for factorized matries");
132   aijkok->a_dual.clear_sync_state();
133   aijkok->a_dual.modify_device();
134   aijkok->transpose_updated = PETSC_FALSE;
135   aijkok->hermitian_updated = PETSC_FALSE;
136   PetscCall(PetscObjectStateIncrease((PetscObject)A));
137   PetscFunctionReturn(PETSC_SUCCESS);
138 }
139 
MatSeqAIJKokkosSyncHost(Mat A)140 static PetscErrorCode MatSeqAIJKokkosSyncHost(Mat A)
141 {
142   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
143   auto              exec   = PetscGetKokkosExecutionSpace();
144 
145   PetscFunctionBegin;
146   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
147   /* We do not expect one needs factors on host  */
148   PetscCheck(A->factortype == MAT_FACTOR_NONE, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Can't sync factorized matrix from device to host");
149   PetscCheck(aijkok, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "Missing AIJKOK");
150   PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, exec));
151   PetscFunctionReturn(PETSC_SUCCESS);
152 }
153 
MatSeqAIJGetArray_SeqAIJKokkos(Mat A,PetscScalar * array[])154 static PetscErrorCode MatSeqAIJGetArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
155 {
156   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
157 
158   PetscFunctionBegin;
159   /* aijkok contains valid pointers only if the host's nonzerostate matches with the device's.
160     Calling MatSeqAIJSetPreallocation() or MatSetValues() on host, where aijseq->{i,j,a} might be
161     reallocated, will lead to stale {i,j,a}_dual in aijkok. In both operations, the hosts's nonzerostate
162     must have been updated. The stale aijkok will be rebuilt during MatAssemblyEnd.
163   */
164   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
165     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
166     *array = aijkok->a_dual.view_host().data();
167   } else { /* Happens when calling MatSetValues on a newly created matrix */
168     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
169   }
170   PetscFunctionReturn(PETSC_SUCCESS);
171 }
172 
MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A,PetscScalar * array[])173 static PetscErrorCode MatSeqAIJRestoreArray_SeqAIJKokkos(Mat A, PetscScalar *array[])
174 {
175   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
176 
177   PetscFunctionBegin;
178   if (aijkok && A->nonzerostate == aijkok->nonzerostate) aijkok->a_dual.modify_host();
179   PetscFunctionReturn(PETSC_SUCCESS);
180 }
181 
MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A,const PetscScalar * array[])182 static PetscErrorCode MatSeqAIJGetArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
183 {
184   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
185 
186   PetscFunctionBegin;
187   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
188     PetscCall(KokkosDualViewSyncHost(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
189     *array = aijkok->a_dual.view_host().data();
190   } else {
191     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
192   }
193   PetscFunctionReturn(PETSC_SUCCESS);
194 }
195 
MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A,const PetscScalar * array[])196 static PetscErrorCode MatSeqAIJRestoreArrayRead_SeqAIJKokkos(Mat A, const PetscScalar *array[])
197 {
198   PetscFunctionBegin;
199   *array = NULL;
200   PetscFunctionReturn(PETSC_SUCCESS);
201 }
202 
MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A,PetscScalar * array[])203 static PetscErrorCode MatSeqAIJGetArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
204 {
205   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
206 
207   PetscFunctionBegin;
208   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
209     *array = aijkok->a_dual.view_host().data();
210   } else { /* Ex. happens with MatZeroEntries on a preallocated but not assembled matrix */
211     *array = static_cast<Mat_SeqAIJ *>(A->data)->a;
212   }
213   PetscFunctionReturn(PETSC_SUCCESS);
214 }
215 
MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A,PetscScalar * array[])216 static PetscErrorCode MatSeqAIJRestoreArrayWrite_SeqAIJKokkos(Mat A, PetscScalar *array[])
217 {
218   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
219 
220   PetscFunctionBegin;
221   if (aijkok && A->nonzerostate == aijkok->nonzerostate) {
222     aijkok->a_dual.clear_sync_state();
223     aijkok->a_dual.modify_host();
224   }
225   PetscFunctionReturn(PETSC_SUCCESS);
226 }
227 
MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A,const PetscInt ** i,const PetscInt ** j,PetscScalar ** a,PetscMemType * mtype)228 static PetscErrorCode MatSeqAIJGetCSRAndMemType_SeqAIJKokkos(Mat A, const PetscInt **i, const PetscInt **j, PetscScalar **a, PetscMemType *mtype)
229 {
230   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
231 
232   PetscFunctionBegin;
233   PetscCheck(aijkok != NULL, PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "aijkok is NULL");
234 
235   if (i) *i = aijkok->i_device_data();
236   if (j) *j = aijkok->j_device_data();
237   if (a) {
238     PetscCall(KokkosDualViewSyncDevice(aijkok->a_dual, PetscGetKokkosExecutionSpace()));
239     *a = aijkok->a_device_data();
240   }
241   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS;
242   PetscFunctionReturn(PETSC_SUCCESS);
243 }
244 
MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A,PetscMemType * mtype)245 static PetscErrorCode MatGetCurrentMemType_SeqAIJKokkos(PETSC_UNUSED Mat A, PetscMemType *mtype)
246 {
247   PetscFunctionBegin;
248   *mtype = PETSC_MEMTYPE_KOKKOS;
249   PetscFunctionReturn(PETSC_SUCCESS);
250 }
251 
252 /*
253   Generate the sparsity pattern of a MatSeqAIJKokkos matrix's transpose on device.
254 
255   Input Parameter:
256 .  A       - the MATSEQAIJKOKKOS matrix
257 
258   Output Parameters:
259 +  perm_d - the permutation array on device, which connects Ta(i) = Aa(perm(i))
260 -  T_d    - the transpose on device, whose value array is allocated but not initialized
261 */
MatSeqAIJKokkosGenerateTransposeStructure(Mat A,MatRowMapKokkosView & perm_d,KokkosCsrMatrix & T_d)262 static PetscErrorCode MatSeqAIJKokkosGenerateTransposeStructure(Mat A, MatRowMapKokkosView &perm_d, KokkosCsrMatrix &T_d)
263 {
264   Mat_SeqAIJ             *aseq = static_cast<Mat_SeqAIJ *>(A->data);
265   PetscInt                nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
266   const PetscInt         *Ai = aseq->i, *Aj = aseq->j;
267   MatRowMapKokkosViewHost Ti_h(NoInit("Ti"), n + 1);
268   MatRowMapType          *Ti = Ti_h.data();
269   MatColIdxKokkosViewHost Tj_h(NoInit("Tj"), nz);
270   MatRowMapKokkosViewHost perm_h(NoInit("permutation"), nz);
271   PetscInt               *Tj   = Tj_h.data();
272   PetscInt               *perm = perm_h.data();
273   PetscInt               *offset;
274 
275   PetscFunctionBegin;
276   // Populate Ti
277   PetscCallCXX(Kokkos::deep_copy(Ti_h, 0));
278   Ti++;
279   for (PetscInt i = 0; i < nz; i++) Ti[Aj[i]]++;
280   Ti--;
281   for (PetscInt i = 0; i < n; i++) Ti[i + 1] += Ti[i];
282 
283   // Populate Tj and the permutation array
284   PetscCall(PetscCalloc1(n, &offset)); // offset in each T row to fill in its column indices
285   for (PetscInt i = 0; i < m; i++) {
286     for (PetscInt j = Ai[i]; j < Ai[i + 1]; j++) { // A's (i,j) is T's (j,i)
287       PetscInt r    = Aj[j];                       // row r of T
288       PetscInt disp = Ti[r] + offset[r];
289 
290       Tj[disp]   = i; // col i of T
291       perm[disp] = j;
292       offset[r]++;
293     }
294   }
295   PetscCall(PetscFree(offset));
296 
297   // Sort each row of T, along with the permutation array
298   for (PetscInt i = 0; i < n; i++) PetscCall(PetscSortIntWithArray(Ti[i + 1] - Ti[i], Tj + Ti[i], perm + Ti[i]));
299 
300   // Output perm and T on device
301   auto Ti_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Ti_h);
302   auto Tj_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), Tj_h);
303   PetscCallCXX(T_d = KokkosCsrMatrix("csrmatT", n, m, nz, MatScalarKokkosView("Ta", nz), Ti_d, Tj_d));
304   PetscCallCXX(perm_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), perm_h));
305   PetscFunctionReturn(PETSC_SUCCESS);
306 }
307 
308 // Generate the transpose on device and cache it internally
309 // Note: KK transpose_matrix() does not have support symbolic/numeric transpose, so we do it on our own
MatSeqAIJKokkosGenerateTranspose_Private(Mat A,KokkosCsrMatrix * csrmatT)310 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGenerateTranspose_Private(Mat A, KokkosCsrMatrix *csrmatT)
311 {
312   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
313   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
314   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
315   KokkosCsrMatrix  &T = akok->csrmatT;
316 
317   PetscFunctionBegin;
318   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
319   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
320 
321   const auto &Aa = akok->a_dual.view_device();
322 
323   if (A->symmetric == PETSC_BOOL3_TRUE) {
324     *csrmatT = akok->csrmat;
325   } else {
326     // See if we already have a cached transpose and its value is up to date
327     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
328       if (!akok->transpose_updated) {            // if the value is out of date, update the cached version
329         const auto &perm = akok->transpose_perm; // get the permutation array
330         auto       &Ta   = T.values;
331 
332         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = Aa(perm(i)); }));
333       }
334     } else { // Generate T of size n x m for the first time
335       MatRowMapKokkosView perm;
336 
337       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
338       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
339       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = Aa(perm(i)); }));
340     }
341     akok->transpose_updated = PETSC_TRUE;
342     *csrmatT                = akok->csrmatT;
343   }
344   PetscFunctionReturn(PETSC_SUCCESS);
345 }
346 
347 // Generate the Hermitian on device and cache it internally
MatSeqAIJKokkosGenerateHermitian_Private(Mat A,KokkosCsrMatrix * csrmatH)348 static PetscErrorCode MatSeqAIJKokkosGenerateHermitian_Private(Mat A, KokkosCsrMatrix *csrmatH)
349 {
350   Mat_SeqAIJ       *aseq = static_cast<Mat_SeqAIJ *>(A->data);
351   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
352   PetscInt          nz = aseq->nz, m = A->rmap->N, n = A->cmap->n;
353   KokkosCsrMatrix  &T = akok->csrmatH;
354 
355   PetscFunctionBegin;
356   PetscCheck(akok, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Unexpected NULL (Mat_SeqAIJKokkos*)A->spptr");
357   PetscCall(KokkosDualViewSyncDevice(akok->a_dual, PetscGetKokkosExecutionSpace())); // Sync A's values since we are going to access them on device
358 
359   const auto &Aa = akok->a_dual.view_device();
360 
361   if (A->hermitian == PETSC_BOOL3_TRUE) {
362     *csrmatH = akok->csrmat;
363   } else {
364     // See if we already have a cached Hermitian and its value is up to date
365     if (T.numRows() == n && T.numCols() == m) {  // this indicates csrmatT had been generated before, otherwise T has 0 rows/cols after construction
366       if (!akok->hermitian_updated) {            // if the value is out of date, update the cached version
367         const auto &perm = akok->transpose_perm; // get the permutation array
368         auto       &Ta   = T.values;
369 
370         PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { Ta(i) = PetscConj(Aa(perm(i))); }));
371       }
372     } else { // Generate T of size n x m for the first time
373       MatRowMapKokkosView perm;
374 
375       PetscCall(MatSeqAIJKokkosGenerateTransposeStructure(A, perm, T));
376       akok->transpose_perm = perm; // cache the perm in this matrix for reuse
377       PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt i) { T.values(i) = PetscConj(Aa(perm(i))); }));
378     }
379     akok->hermitian_updated = PETSC_TRUE;
380     *csrmatH                = akok->csrmatH;
381   }
382   PetscFunctionReturn(PETSC_SUCCESS);
383 }
384 
385 /* y = A x */
MatMult_SeqAIJKokkos(Mat A,Vec xx,Vec yy)386 static PetscErrorCode MatMult_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
387 {
388   Mat_SeqAIJKokkos          *aijkok;
389   ConstPetscScalarKokkosView xv;
390   PetscScalarKokkosView      yv;
391 
392   PetscFunctionBegin;
393   PetscCall(PetscLogGpuTimeBegin());
394   PetscCall(MatSeqAIJKokkosSyncDevice(A));
395   PetscCall(VecGetKokkosView(xx, &xv));
396   PetscCall(VecGetKokkosViewWrite(yy, &yv));
397   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
398   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A x + beta y */
399   PetscCall(VecRestoreKokkosView(xx, &xv));
400   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
401   /* 2.0*nnz - numRows seems more accurate here but assumes there are no zero-rows. So a little sloppy here. */
402   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
403   PetscCall(PetscLogGpuTimeEnd());
404   PetscFunctionReturn(PETSC_SUCCESS);
405 }
406 
407 /* y = A^T x */
MatMultTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)408 static PetscErrorCode MatMultTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
409 {
410   Mat_SeqAIJKokkos          *aijkok;
411   const char                *mode;
412   ConstPetscScalarKokkosView xv;
413   PetscScalarKokkosView      yv;
414   KokkosCsrMatrix            csrmat;
415 
416   PetscFunctionBegin;
417   PetscCall(PetscLogGpuTimeBegin());
418   PetscCall(MatSeqAIJKokkosSyncDevice(A));
419   PetscCall(VecGetKokkosView(xx, &xv));
420   PetscCall(VecGetKokkosViewWrite(yy, &yv));
421   if (A->form_explicit_transpose) {
422     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
423     mode = "N";
424   } else {
425     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
426     csrmat = aijkok->csrmat;
427     mode   = "T";
428   }
429   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^T x + beta y */
430   PetscCall(VecRestoreKokkosView(xx, &xv));
431   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
432   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
433   PetscCall(PetscLogGpuTimeEnd());
434   PetscFunctionReturn(PETSC_SUCCESS);
435 }
436 
437 /* y = A^H x */
MatMultHermitianTranspose_SeqAIJKokkos(Mat A,Vec xx,Vec yy)438 static PetscErrorCode MatMultHermitianTranspose_SeqAIJKokkos(Mat A, Vec xx, Vec yy)
439 {
440   Mat_SeqAIJKokkos          *aijkok;
441   const char                *mode;
442   ConstPetscScalarKokkosView xv;
443   PetscScalarKokkosView      yv;
444   KokkosCsrMatrix            csrmat;
445 
446   PetscFunctionBegin;
447   PetscCall(PetscLogGpuTimeBegin());
448   PetscCall(MatSeqAIJKokkosSyncDevice(A));
449   PetscCall(VecGetKokkosView(xx, &xv));
450   PetscCall(VecGetKokkosViewWrite(yy, &yv));
451   if (A->form_explicit_transpose) {
452     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
453     mode = "N";
454   } else {
455     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
456     csrmat = aijkok->csrmat;
457     mode   = "C";
458   }
459   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 0.0 /*beta*/, yv)); /* y = alpha A^H x + beta y */
460   PetscCall(VecRestoreKokkosView(xx, &xv));
461   PetscCall(VecRestoreKokkosViewWrite(yy, &yv));
462   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
463   PetscCall(PetscLogGpuTimeEnd());
464   PetscFunctionReturn(PETSC_SUCCESS);
465 }
466 
467 /* z = A x + y */
MatMultAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)468 static PetscErrorCode MatMultAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
469 {
470   Mat_SeqAIJKokkos          *aijkok;
471   ConstPetscScalarKokkosView xv;
472   PetscScalarKokkosView      zv;
473 
474   PetscFunctionBegin;
475   PetscCall(PetscLogGpuTimeBegin());
476   PetscCall(MatSeqAIJKokkosSyncDevice(A));
477   if (zz != yy) PetscCall(VecCopy(yy, zz)); // depending on yy's sync flags, zz might get its latest data on host
478   PetscCall(VecGetKokkosView(xx, &xv));
479   PetscCall(VecGetKokkosView(zz, &zv)); // do after VecCopy(yy, zz) to get the latest data on device
480   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
481   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), "N", 1.0 /*alpha*/, aijkok->csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A x + beta z */
482   PetscCall(VecRestoreKokkosView(xx, &xv));
483   PetscCall(VecRestoreKokkosView(zz, &zv));
484   PetscCall(PetscLogGpuFlops(2.0 * aijkok->csrmat.nnz()));
485   PetscCall(PetscLogGpuTimeEnd());
486   PetscFunctionReturn(PETSC_SUCCESS);
487 }
488 
489 /* z = A^T x + y */
MatMultTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)490 static PetscErrorCode MatMultTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
491 {
492   Mat_SeqAIJKokkos          *aijkok;
493   const char                *mode;
494   ConstPetscScalarKokkosView xv;
495   PetscScalarKokkosView      zv;
496   KokkosCsrMatrix            csrmat;
497 
498   PetscFunctionBegin;
499   PetscCall(PetscLogGpuTimeBegin());
500   PetscCall(MatSeqAIJKokkosSyncDevice(A));
501   if (zz != yy) PetscCall(VecCopy(yy, zz));
502   PetscCall(VecGetKokkosView(xx, &xv));
503   PetscCall(VecGetKokkosView(zz, &zv));
504   if (A->form_explicit_transpose) {
505     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmat));
506     mode = "N";
507   } else {
508     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
509     csrmat = aijkok->csrmat;
510     mode   = "T";
511   }
512   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^T x + beta z */
513   PetscCall(VecRestoreKokkosView(xx, &xv));
514   PetscCall(VecRestoreKokkosView(zz, &zv));
515   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
516   PetscCall(PetscLogGpuTimeEnd());
517   PetscFunctionReturn(PETSC_SUCCESS);
518 }
519 
520 /* z = A^H x + y */
MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A,Vec xx,Vec yy,Vec zz)521 static PetscErrorCode MatMultHermitianTransposeAdd_SeqAIJKokkos(Mat A, Vec xx, Vec yy, Vec zz)
522 {
523   Mat_SeqAIJKokkos          *aijkok;
524   const char                *mode;
525   ConstPetscScalarKokkosView xv;
526   PetscScalarKokkosView      zv;
527   KokkosCsrMatrix            csrmat;
528 
529   PetscFunctionBegin;
530   PetscCall(PetscLogGpuTimeBegin());
531   PetscCall(MatSeqAIJKokkosSyncDevice(A));
532   if (zz != yy) PetscCall(VecCopy(yy, zz));
533   PetscCall(VecGetKokkosView(xx, &xv));
534   PetscCall(VecGetKokkosView(zz, &zv));
535   if (A->form_explicit_transpose) {
536     PetscCall(MatSeqAIJKokkosGenerateHermitian_Private(A, &csrmat));
537     mode = "N";
538   } else {
539     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
540     csrmat = aijkok->csrmat;
541     mode   = "C";
542   }
543   PetscCallCXX(KokkosSparse::spmv(PetscGetKokkosExecutionSpace(), mode, 1.0 /*alpha*/, csrmat, xv, 1.0 /*beta*/, zv)); /* z = alpha A^H x + beta z */
544   PetscCall(VecRestoreKokkosView(xx, &xv));
545   PetscCall(VecRestoreKokkosView(zz, &zv));
546   PetscCall(PetscLogGpuFlops(2.0 * csrmat.nnz()));
547   PetscCall(PetscLogGpuTimeEnd());
548   PetscFunctionReturn(PETSC_SUCCESS);
549 }
550 
MatSetOption_SeqAIJKokkos(Mat A,MatOption op,PetscBool flg)551 static PetscErrorCode MatSetOption_SeqAIJKokkos(Mat A, MatOption op, PetscBool flg)
552 {
553   Mat_SeqAIJKokkos *aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
554 
555   PetscFunctionBegin;
556   switch (op) {
557   case MAT_FORM_EXPLICIT_TRANSPOSE:
558     /* need to destroy the transpose matrix if present to prevent from logic errors if flg is set to true later */
559     if (A->form_explicit_transpose && !flg && aijkok) PetscCall(aijkok->DestroyMatTranspose());
560     A->form_explicit_transpose = flg;
561     break;
562   default:
563     PetscCall(MatSetOption_SeqAIJ(A, op, flg));
564     break;
565   }
566   PetscFunctionReturn(PETSC_SUCCESS);
567 }
568 
569 /* Depending on reuse, either build a new mat, or use the existing mat */
MatConvert_SeqAIJ_SeqAIJKokkos(Mat A,MatType mtype,MatReuse reuse,Mat * newmat)570 PETSC_INTERN PetscErrorCode MatConvert_SeqAIJ_SeqAIJKokkos(Mat A, MatType mtype, MatReuse reuse, Mat *newmat)
571 {
572   Mat_SeqAIJ *aseq;
573 
574   PetscFunctionBegin;
575   PetscCall(PetscKokkosInitializeCheck());
576   if (reuse == MAT_INITIAL_MATRIX) { /* Build a brand new mat */
577     PetscCall(MatDuplicate(A, MAT_COPY_VALUES, newmat));
578     PetscCall(MatSetType(*newmat, mtype));
579   } else if (reuse == MAT_REUSE_MATRIX) {                 /* Reuse the mat created before */
580     PetscCall(MatCopy(A, *newmat, SAME_NONZERO_PATTERN)); /* newmat is already a SeqAIJKokkos */
581   } else if (reuse == MAT_INPLACE_MATRIX) {               /* newmat is A */
582     PetscCheck(A == *newmat, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "A != *newmat with MAT_INPLACE_MATRIX");
583     PetscCall(PetscFree(A->defaultvectype));
584     PetscCall(PetscStrallocpy(VECKOKKOS, &A->defaultvectype)); /* Allocate and copy the string */
585     PetscCall(PetscObjectChangeTypeName((PetscObject)A, MATSEQAIJKOKKOS));
586     PetscCall(MatSetOps_SeqAIJKokkos(A));
587     aseq = static_cast<Mat_SeqAIJ *>(A->data);
588     if (A->assembled) { /* Copy i, j (but not values) to device for an assembled matrix if not yet */
589       PetscCheck(!A->spptr, PETSC_COMM_WORLD, PETSC_ERR_PLIB, "Expect NULL (Mat_SeqAIJKokkos*)A->spptr");
590       A->spptr = new Mat_SeqAIJKokkos(A, A->rmap->n, A->cmap->n, aseq, A->nonzerostate, PETSC_FALSE);
591     }
592   }
593   PetscFunctionReturn(PETSC_SUCCESS);
594 }
595 
596 /* MatDuplicate always creates a new matrix. MatDuplicate can be called either on an assembled matrix or
597    an unassembled matrix, even though MAT_COPY_VALUES is not allowed for unassembled matrix.
598  */
MatDuplicate_SeqAIJKokkos(Mat A,MatDuplicateOption dupOption,Mat * B)599 static PetscErrorCode MatDuplicate_SeqAIJKokkos(Mat A, MatDuplicateOption dupOption, Mat *B)
600 {
601   Mat_SeqAIJ       *bseq;
602   Mat_SeqAIJKokkos *akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr), *bkok;
603   Mat               mat;
604 
605   PetscFunctionBegin;
606   /* Do not copy values on host as A's latest values might be on device. We don't want to do sync blindly */
607   PetscCall(MatDuplicate_SeqAIJ(A, MAT_DO_NOT_COPY_VALUES, B));
608   mat = *B;
609   if (A->assembled) {
610     bseq = static_cast<Mat_SeqAIJ *>(mat->data);
611     bkok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, bseq, mat->nonzerostate, PETSC_FALSE);
612     bkok->a_dual.clear_sync_state(); /* Clear B's sync state as it will be decided below */
613     /* Now copy values to B if needed */
614     if (dupOption == MAT_COPY_VALUES) {
615       if (akok->a_dual.need_sync_device()) {
616         Kokkos::deep_copy(bkok->a_dual.view_host(), akok->a_dual.view_host());
617         bkok->a_dual.modify_host();
618       } else { /* If device has the latest data, we only copy data on device */
619         Kokkos::deep_copy(bkok->a_dual.view_device(), akok->a_dual.view_device());
620         bkok->a_dual.modify_device();
621       }
622     } else { /* MAT_DO_NOT_COPY_VALUES or MAT_SHARE_NONZERO_PATTERN. B's values should be zeroed */
623       /* B's values on host should be already zeroed by MatDuplicate_SeqAIJ() */
624       bkok->a_dual.modify_host();
625     }
626     mat->spptr = bkok;
627   }
628 
629   PetscCall(PetscFree(mat->defaultvectype));
630   PetscCall(PetscStrallocpy(VECKOKKOS, &mat->defaultvectype)); /* Allocate and copy the string */
631   PetscCall(PetscObjectChangeTypeName((PetscObject)mat, MATSEQAIJKOKKOS));
632   PetscCall(MatSetOps_SeqAIJKokkos(mat));
633   PetscFunctionReturn(PETSC_SUCCESS);
634 }
635 
MatTranspose_SeqAIJKokkos(Mat A,MatReuse reuse,Mat * B)636 static PetscErrorCode MatTranspose_SeqAIJKokkos(Mat A, MatReuse reuse, Mat *B)
637 {
638   Mat               At;
639   KokkosCsrMatrix   internT;
640   Mat_SeqAIJKokkos *atkok, *bkok;
641 
642   PetscFunctionBegin;
643   if (reuse == MAT_REUSE_MATRIX) PetscCall(MatTransposeCheckNonzeroState_Private(A, *B));
644   PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &internT)); /* Generate a transpose internally */
645   if (reuse == MAT_INITIAL_MATRIX || reuse == MAT_INPLACE_MATRIX) {
646     /* Deep copy internT, as we want to isolate the internal transpose */
647     PetscCallCXX(atkok = new Mat_SeqAIJKokkos(KokkosCsrMatrix("csrmat", internT)));
648     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PetscObjectComm((PetscObject)A), atkok, &At));
649     if (reuse == MAT_INITIAL_MATRIX) *B = At;
650     else PetscCall(MatHeaderReplace(A, &At)); /* Replace A with At inplace */
651   } else {                                    /* MAT_REUSE_MATRIX, just need to copy values to B on device */
652     if ((*B)->assembled) {
653       bkok = static_cast<Mat_SeqAIJKokkos *>((*B)->spptr);
654       PetscCallCXX(Kokkos::deep_copy(bkok->a_dual.view_device(), internT.values));
655       PetscCall(MatSeqAIJKokkosModifyDevice(*B));
656     } else if ((*B)->preallocated) { /* It is ok for B to be only preallocated, as needed in MatTranspose_MPIAIJ */
657       Mat_SeqAIJ             *bseq = static_cast<Mat_SeqAIJ *>((*B)->data);
658       MatScalarKokkosViewHost a_h(bseq->a, internT.nnz()); /* bseq->nz = 0 if unassembled */
659       MatColIdxKokkosViewHost j_h(bseq->j, internT.nnz());
660       PetscCallCXX(Kokkos::deep_copy(a_h, internT.values));
661       PetscCallCXX(Kokkos::deep_copy(j_h, internT.graph.entries));
662     } else SETERRQ(PetscObjectComm((PetscObject)A), PETSC_ERR_ARG_WRONGSTATE, "B must be assembled or preallocated");
663   }
664   PetscFunctionReturn(PETSC_SUCCESS);
665 }
666 
MatDestroy_SeqAIJKokkos(Mat A)667 static PetscErrorCode MatDestroy_SeqAIJKokkos(Mat A)
668 {
669   Mat_SeqAIJKokkos *aijkok;
670 
671   PetscFunctionBegin;
672   if (A->factortype == MAT_FACTOR_NONE) {
673     aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
674     delete aijkok;
675   } else {
676     delete static_cast<Mat_SeqAIJKokkosTriFactors *>(A->spptr);
677   }
678   A->spptr = NULL;
679   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatFactorGetSolverType_C", NULL));
680   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", NULL));
681   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", NULL));
682 #if defined(PETSC_HAVE_HYPRE)
683   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", NULL));
684 #endif
685   PetscCall(MatDestroy_SeqAIJ(A));
686   PetscFunctionReturn(PETSC_SUCCESS);
687 }
688 
689 /*MC
690    MATSEQAIJKOKKOS - MATAIJKOKKOS = "(seq)aijkokkos" - A matrix type to be used for sparse matrices with Kokkos
691 
692    A matrix type using Kokkos-Kernels CrsMatrix type for portability across different device types
693 
694    Options Database Key:
695 .  -mat_type aijkokkos - sets the matrix type to `MATSEQAIJKOKKOS` during a call to `MatSetFromOptions()`
696 
697   Level: beginner
698 
699 .seealso: [](ch_matrices), `Mat`, `MatCreateSeqAIJKokkos()`, `MATMPIAIJKOKKOS`
700 M*/
MatCreate_SeqAIJKokkos(Mat A)701 PETSC_EXTERN PetscErrorCode MatCreate_SeqAIJKokkos(Mat A)
702 {
703   PetscFunctionBegin;
704   PetscCall(PetscKokkosInitializeCheck());
705   PetscCall(MatCreate_SeqAIJ(A));
706   PetscCall(MatConvert_SeqAIJ_SeqAIJKokkos(A, MATSEQAIJKOKKOS, MAT_INPLACE_MATRIX, &A));
707   PetscFunctionReturn(PETSC_SUCCESS);
708 }
709 
710 /* Merge A, B into a matrix C. A is put before B. C's size would be A->rmap->n by (A->cmap->n + B->cmap->n) */
MatSeqAIJKokkosMergeMats(Mat A,Mat B,MatReuse reuse,Mat * C)711 PetscErrorCode MatSeqAIJKokkosMergeMats(Mat A, Mat B, MatReuse reuse, Mat *C)
712 {
713   Mat_SeqAIJ         *a, *b;
714   Mat_SeqAIJKokkos   *akok, *bkok, *ckok;
715   MatScalarKokkosView aa, ba, ca;
716   MatRowMapKokkosView ai, bi, ci;
717   MatColIdxKokkosView aj, bj, cj;
718   PetscInt            m, n, nnz, aN;
719 
720   PetscFunctionBegin;
721   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
722   PetscValidHeaderSpecific(B, MAT_CLASSID, 2);
723   PetscAssertPointer(C, 4);
724   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
725   PetscCheckTypeName(B, MATSEQAIJKOKKOS);
726   PetscCheck(A->rmap->n == B->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Invalid number or rows %" PetscInt_FMT " != %" PetscInt_FMT, A->rmap->n, B->rmap->n);
727   PetscCheck(reuse != MAT_INPLACE_MATRIX, PETSC_COMM_SELF, PETSC_ERR_SUP, "MAT_INPLACE_MATRIX not supported");
728 
729   PetscCall(MatSeqAIJKokkosSyncDevice(A));
730   PetscCall(MatSeqAIJKokkosSyncDevice(B));
731   a    = static_cast<Mat_SeqAIJ *>(A->data);
732   b    = static_cast<Mat_SeqAIJ *>(B->data);
733   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
734   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
735   aa   = akok->a_dual.view_device();
736   ai   = akok->i_dual.view_device();
737   ba   = bkok->a_dual.view_device();
738   bi   = bkok->i_dual.view_device();
739   m    = A->rmap->n; /* M, N and nnz of C */
740   n    = A->cmap->n + B->cmap->n;
741   nnz  = a->nz + b->nz;
742   aN   = A->cmap->n; /* N of A */
743   if (reuse == MAT_INITIAL_MATRIX) {
744     aj      = akok->j_dual.view_device();
745     bj      = bkok->j_dual.view_device();
746     auto ca = MatScalarKokkosView("a", aa.extent(0) + ba.extent(0));
747     auto ci = MatRowMapKokkosView("i", ai.extent(0));
748     auto cj = MatColIdxKokkosView("j", aj.extent(0) + bj.extent(0));
749 
750     /* Concatenate A and B in parallel using Kokkos hierarchical parallelism */
751     Kokkos::parallel_for(
752       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
753         PetscInt i       = t.league_rank(); /* row i */
754         PetscInt coffset = ai(i) + bi(i), alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
755 
756         Kokkos::single(Kokkos::PerTeam(t), [=]() { /* this side effect only happens once per whole team */
757                                                    ci(i) = coffset;
758                                                    if (i == m - 1) ci(m) = ai(m) + bi(m);
759         });
760 
761         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
762           if (k < alen) {
763             ca(coffset + k) = aa(ai(i) + k);
764             cj(coffset + k) = aj(ai(i) + k);
765           } else {
766             ca(coffset + k) = ba(bi(i) + k - alen);
767             cj(coffset + k) = bj(bi(i) + k - alen) + aN; /* Entries in B get new column indices in C */
768           }
769         });
770       });
771     PetscCallCXX(ckok = new Mat_SeqAIJKokkos(m, n, nnz, ci, cj, ca));
772     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, ckok, C));
773   } else if (reuse == MAT_REUSE_MATRIX) {
774     PetscValidHeaderSpecific(*C, MAT_CLASSID, 4);
775     PetscCheckTypeName(*C, MATSEQAIJKOKKOS);
776     ckok = static_cast<Mat_SeqAIJKokkos *>((*C)->spptr);
777     ca   = ckok->a_dual.view_device();
778     ci   = ckok->i_dual.view_device();
779 
780     Kokkos::parallel_for(
781       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
782         PetscInt i    = t.league_rank(); /* row i */
783         PetscInt alen = ai(i + 1) - ai(i), blen = bi(i + 1) - bi(i);
784         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, alen + blen), [&](PetscInt k) {
785           if (k < alen) ca(ci(i) + k) = aa(ai(i) + k);
786           else ca(ci(i) + k) = ba(bi(i) + k - alen);
787         });
788       });
789     PetscCall(MatSeqAIJKokkosModifyDevice(*C));
790   }
791   PetscFunctionReturn(PETSC_SUCCESS);
792 }
793 
MatProductCtxDestroy_SeqAIJKokkos(PetscCtxRt pdata)794 static PetscErrorCode MatProductCtxDestroy_SeqAIJKokkos(PetscCtxRt pdata)
795 {
796   PetscFunctionBegin;
797   delete *reinterpret_cast<MatProductCtx_SeqAIJKokkos **>(pdata);
798   PetscFunctionReturn(PETSC_SUCCESS);
799 }
800 
MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)801 static PetscErrorCode MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos(Mat C)
802 {
803   Mat_Product                *product = C->product;
804   Mat                         A, B;
805   bool                        transA, transB; /* use bool, since KK needs this type */
806   Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
807   Mat_SeqAIJ                 *c;
808   MatProductCtx_SeqAIJKokkos *pdata;
809   KokkosCsrMatrix             csrmatA, csrmatB;
810 
811   PetscFunctionBegin;
812   MatCheckProduct(C, 1);
813   PetscCheck(C->product->data, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Product data empty");
814   pdata = static_cast<MatProductCtx_SeqAIJKokkos *>(C->product->data);
815 
816   // See if numeric has already been done in symbolic (e.g., user calls MatMatMult(A,B,MAT_INITIAL_MATRIX,..,C)).
817   // If yes, skip the numeric, but reset the flag so that next time when user calls MatMatMult(E,F,MAT_REUSE_MATRIX,..,C),
818   // we still do numeric.
819   if (pdata->reusesym) { // numeric reuses results from symbolic
820     pdata->reusesym = PETSC_FALSE;
821     PetscFunctionReturn(PETSC_SUCCESS);
822   }
823 
824   switch (product->type) {
825   case MATPRODUCT_AB:
826     transA = false;
827     transB = false;
828     break;
829   case MATPRODUCT_AtB:
830     transA = true;
831     transB = false;
832     break;
833   case MATPRODUCT_ABt:
834     transA = false;
835     transB = true;
836     break;
837   default:
838     SETERRQ(PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
839   }
840 
841   A = product->A;
842   B = product->B;
843   PetscCall(MatSeqAIJKokkosSyncDevice(A));
844   PetscCall(MatSeqAIJKokkosSyncDevice(B));
845   akok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
846   bkok = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
847   ckok = static_cast<Mat_SeqAIJKokkos *>(C->spptr);
848 
849   PetscCheck(ckok, PetscObjectComm((PetscObject)C), PETSC_ERR_PLIB, "Device data structure spptr is empty");
850 
851   csrmatA = akok->csrmat;
852   csrmatB = bkok->csrmat;
853 
854   /* TODO: Once KK spgemm implements transpose, we can get rid of the explicit transpose here */
855   if (transA) {
856     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
857     transA = false;
858   }
859 
860   if (transB) {
861     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
862     transB = false;
863   }
864   PetscCall(PetscLogGpuTimeBegin());
865   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, ckok->csrmat));
866 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
867   auto spgemmHandle = pdata->kh.get_spgemm_handle();
868   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(ckok->csrmat)); /* without sort, mat_tests-ex62_14_seqaijkokkos fails */
869 #endif
870 
871   PetscCall(PetscLogGpuTimeEnd());
872   PetscCall(MatSeqAIJKokkosModifyDevice(C));
873   /* shorter version of MatAssemblyEnd_SeqAIJ */
874   c = (Mat_SeqAIJ *)C->data;
875   PetscCall(PetscInfo(C, "Matrix size: %" PetscInt_FMT " X %" PetscInt_FMT "; storage space: 0 unneeded, %" PetscInt_FMT " used\n", C->rmap->n, C->cmap->n, c->nz));
876   PetscCall(PetscInfo(C, "Number of mallocs during MatSetValues() is 0\n"));
877   PetscCall(PetscInfo(C, "Maximum nonzeros in any row is %" PetscInt_FMT "\n", c->rmax));
878   c->reallocs         = 0;
879   C->info.mallocs     = 0;
880   C->info.nz_unneeded = 0;
881   C->assembled = C->was_assembled = PETSC_TRUE;
882   C->num_ass++;
883   PetscFunctionReturn(PETSC_SUCCESS);
884 }
885 
MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)886 static PetscErrorCode MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos(Mat C)
887 {
888   Mat_Product                *product = C->product;
889   MatProductType              ptype;
890   Mat                         A, B;
891   bool                        transA, transB;
892   Mat_SeqAIJKokkos           *akok, *bkok, *ckok;
893   MatProductCtx_SeqAIJKokkos *pdata;
894   MPI_Comm                    comm;
895   KokkosCsrMatrix             csrmatA, csrmatB, csrmatC;
896 
897   PetscFunctionBegin;
898   MatCheckProduct(C, 1);
899   PetscCall(PetscObjectGetComm((PetscObject)C, &comm));
900   PetscCheck(!product->data, comm, PETSC_ERR_PLIB, "Product data not empty");
901   A = product->A;
902   B = product->B;
903   PetscCall(MatSeqAIJKokkosSyncDevice(A));
904   PetscCall(MatSeqAIJKokkosSyncDevice(B));
905   akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
906   bkok    = static_cast<Mat_SeqAIJKokkos *>(B->spptr);
907   csrmatA = akok->csrmat;
908   csrmatB = bkok->csrmat;
909 
910   ptype = product->type;
911   // Take advantage of the symmetry if true
912   if (A->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_AtB) {
913     ptype                                          = MATPRODUCT_AB;
914     product->symbolic_used_the_fact_A_is_symmetric = PETSC_TRUE;
915   }
916   if (B->symmetric == PETSC_BOOL3_TRUE && ptype == MATPRODUCT_ABt) {
917     ptype                                          = MATPRODUCT_AB;
918     product->symbolic_used_the_fact_B_is_symmetric = PETSC_TRUE;
919   }
920 
921   switch (ptype) {
922   case MATPRODUCT_AB:
923     transA = false;
924     transB = false;
925     PetscCall(MatSetBlockSizesFromMats(C, A, B));
926     break;
927   case MATPRODUCT_AtB:
928     transA = true;
929     transB = false;
930     if (A->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->cmap->bs));
931     if (B->cmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->cmap->bs));
932     break;
933   case MATPRODUCT_ABt:
934     transA = false;
935     transB = true;
936     if (A->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->rmap, A->rmap->bs));
937     if (B->rmap->bs > 0) PetscCall(PetscLayoutSetBlockSize(C->cmap, B->rmap->bs));
938     break;
939   default:
940     SETERRQ(comm, PETSC_ERR_PLIB, "Unsupported product type %s", MatProductTypes[product->type]);
941   }
942   PetscCallCXX(product->data = pdata = new MatProductCtx_SeqAIJKokkos());
943   pdata->reusesym = product->api_user;
944 
945   /* TODO: add command line options to select spgemm algorithms */
946   auto spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_DEFAULT; /* default alg is TPL if enabled, otherwise KK */
947 
948   /* CUDA-10.2's spgemm has bugs. We prefer the SpGEMMreuse APIs introduced in cuda-11.4 */
949 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
950   #if PETSC_PKG_CUDA_VERSION_LT(11, 4, 0)
951   spgemm_alg = KokkosSparse::SPGEMMAlgorithm::SPGEMM_KK;
952   #endif
953 #endif
954   PetscCallCXX(pdata->kh.create_spgemm_handle(spgemm_alg));
955 
956   PetscCall(PetscLogGpuTimeBegin());
957   /* TODO: Get rid of the explicit transpose once KK-spgemm implements the transpose option */
958   if (transA) {
959     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(A, &csrmatA));
960     transA = false;
961   }
962 
963   if (transB) {
964     PetscCall(MatSeqAIJKokkosGenerateTranspose_Private(B, &csrmatB));
965     transB = false;
966   }
967 
968   PetscCallCXX(KokkosSparse::spgemm_symbolic(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
969   /* spgemm_symbolic() only populates C's rowmap, but not C's column indices.
970     So we have to do a fake spgemm_numeric() here to get csrmatC.j_d setup, before
971     calling new Mat_SeqAIJKokkos().
972     TODO: Remove the fake spgemm_numeric() after KK fixed this problem.
973   */
974   PetscCallCXX(KokkosSparse::spgemm_numeric(pdata->kh, csrmatA, transA, csrmatB, transB, csrmatC));
975 #if PETSC_PKG_KOKKOS_KERNELS_VERSION_LT(4, 0, 0)
976   /* Query if KK outputs a sorted matrix. If not, we need to sort it */
977   auto spgemmHandle = pdata->kh.get_spgemm_handle();
978   if (spgemmHandle->get_sort_option() != 1) PetscCallCXX(sort_crs_matrix(csrmatC)); /* sort_option defaults to -1 in KK!*/
979 #endif
980   PetscCall(PetscLogGpuTimeEnd());
981 
982   PetscCallCXX(ckok = new Mat_SeqAIJKokkos(csrmatC));
983   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(C, ckok));
984   C->product->destroy = MatProductCtxDestroy_SeqAIJKokkos;
985   PetscFunctionReturn(PETSC_SUCCESS);
986 }
987 
988 /* handles sparse matrix matrix ops */
MatProductSetFromOptions_SeqAIJKokkos(Mat mat)989 static PetscErrorCode MatProductSetFromOptions_SeqAIJKokkos(Mat mat)
990 {
991   Mat_Product *product = mat->product;
992   PetscBool    Biskok = PETSC_FALSE, Ciskok = PETSC_TRUE;
993 
994   PetscFunctionBegin;
995   MatCheckProduct(mat, 1);
996   PetscCall(PetscObjectTypeCompare((PetscObject)product->B, MATSEQAIJKOKKOS, &Biskok));
997   if (product->type == MATPRODUCT_ABC) PetscCall(PetscObjectTypeCompare((PetscObject)product->C, MATSEQAIJKOKKOS, &Ciskok));
998   if (Biskok && Ciskok) {
999     switch (product->type) {
1000     case MATPRODUCT_AB:
1001     case MATPRODUCT_AtB:
1002     case MATPRODUCT_ABt:
1003       mat->ops->productsymbolic = MatProductSymbolic_SeqAIJKokkos_SeqAIJKokkos;
1004       break;
1005     case MATPRODUCT_PtAP:
1006     case MATPRODUCT_RARt:
1007     case MATPRODUCT_ABC:
1008       mat->ops->productsymbolic = MatProductSymbolic_ABC_Basic;
1009       break;
1010     default:
1011       break;
1012     }
1013   } else { /* fallback for AIJ */
1014     PetscCall(MatProductSetFromOptions_SeqAIJ(mat));
1015   }
1016   PetscFunctionReturn(PETSC_SUCCESS);
1017 }
1018 
MatScale_SeqAIJKokkos(Mat A,PetscScalar a)1019 static PetscErrorCode MatScale_SeqAIJKokkos(Mat A, PetscScalar a)
1020 {
1021   Mat_SeqAIJKokkos *aijkok;
1022 
1023   PetscFunctionBegin;
1024   PetscCall(PetscLogGpuTimeBegin());
1025   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1026   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1027   KokkosBlas::scal(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), a, aijkok->a_dual.view_device());
1028   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1029   PetscCall(PetscLogGpuFlops(aijkok->a_dual.extent(0)));
1030   PetscCall(PetscLogGpuTimeEnd());
1031   PetscFunctionReturn(PETSC_SUCCESS);
1032 }
1033 
1034 // add a to A's diagonal (if A is square) or main diagonal (if A is rectangular)
MatShift_SeqAIJKokkos(Mat A,PetscScalar a)1035 static PetscErrorCode MatShift_SeqAIJKokkos(Mat A, PetscScalar a)
1036 {
1037   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1038 
1039   PetscFunctionBegin;
1040   if (A->assembled && aijseq->diagDense) { // no missing diagonals
1041     PetscInt n = PetscMin(A->rmap->n, A->cmap->n);
1042 
1043     PetscCall(PetscLogGpuTimeBegin());
1044     PetscCall(MatSeqAIJKokkosSyncDevice(A));
1045     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1046     const auto &Aa     = aijkok->a_dual.view_device();
1047     const auto &Adiag  = aijkok->diag_dual.view_device();
1048     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) { Aa(Adiag(i)) += a; }));
1049     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1050     PetscCall(PetscLogGpuFlops(n));
1051     PetscCall(PetscLogGpuTimeEnd());
1052   } else { // need reassembly, very slow!
1053     PetscCall(MatShift_Basic(A, a));
1054   }
1055   PetscFunctionReturn(PETSC_SUCCESS);
1056 }
1057 
MatDiagonalSet_SeqAIJKokkos(Mat Y,Vec D,InsertMode is)1058 static PetscErrorCode MatDiagonalSet_SeqAIJKokkos(Mat Y, Vec D, InsertMode is)
1059 {
1060   Mat_SeqAIJ *aijseq = static_cast<Mat_SeqAIJ *>(Y->data);
1061 
1062   PetscFunctionBegin;
1063   if (Y->assembled && aijseq->diagDense) { // no missing diagonals
1064     ConstPetscScalarKokkosView dv;
1065     PetscInt                   n, nv;
1066 
1067     PetscCall(PetscLogGpuTimeBegin());
1068     PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1069     PetscCall(VecGetKokkosView(D, &dv));
1070     PetscCall(VecGetLocalSize(D, &nv));
1071     n = PetscMin(Y->rmap->n, Y->cmap->n);
1072     PetscCheck(n == nv, PetscObjectComm((PetscObject)Y), PETSC_ERR_ARG_SIZ, "Matrix size and vector size do not match");
1073 
1074     const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1075     const auto &Aa     = aijkok->a_dual.view_device();
1076     const auto &Adiag  = aijkok->diag_dual.view_device();
1077     PetscCallCXX(Kokkos::parallel_for(
1078       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1079         if (is == INSERT_VALUES) Aa(Adiag(i)) = dv(i);
1080         else Aa(Adiag(i)) += dv(i);
1081       }));
1082     PetscCall(VecRestoreKokkosView(D, &dv));
1083     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1084     PetscCall(PetscLogGpuFlops(n));
1085     PetscCall(PetscLogGpuTimeEnd());
1086   } else { // need reassembly, very slow!
1087     PetscCall(MatDiagonalSet_Default(Y, D, is));
1088   }
1089   PetscFunctionReturn(PETSC_SUCCESS);
1090 }
1091 
MatDiagonalScale_SeqAIJKokkos(Mat A,Vec ll,Vec rr)1092 static PetscErrorCode MatDiagonalScale_SeqAIJKokkos(Mat A, Vec ll, Vec rr)
1093 {
1094   Mat_SeqAIJ                *aijseq = static_cast<Mat_SeqAIJ *>(A->data);
1095   PetscInt                   m = A->rmap->n, n = A->cmap->n, nz = aijseq->nz;
1096   ConstPetscScalarKokkosView lv, rv;
1097 
1098   PetscFunctionBegin;
1099   PetscCall(PetscLogGpuTimeBegin());
1100   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1101   const auto  aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1102   const auto &Aa     = aijkok->a_dual.view_device();
1103   const auto &Ai     = aijkok->i_dual.view_device();
1104   const auto &Aj     = aijkok->j_dual.view_device();
1105   if (ll) {
1106     PetscCall(VecGetLocalSize(ll, &m));
1107     PetscCheck(m == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Left scaling vector wrong length");
1108     PetscCall(VecGetKokkosView(ll, &lv));
1109     PetscCallCXX(Kokkos::parallel_for( // for each row
1110       Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), m, Kokkos::AUTO()), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1111         PetscInt i   = t.league_rank(); // row i
1112         PetscInt len = Ai(i + 1) - Ai(i);
1113         // scale entries on the row
1114         Kokkos::parallel_for(Kokkos::TeamThreadRange(t, len), [&](PetscInt j) { Aa(Ai(i) + j) *= lv(i); });
1115       }));
1116     PetscCall(VecRestoreKokkosView(ll, &lv));
1117     PetscCall(PetscLogGpuFlops(nz));
1118   }
1119   if (rr) {
1120     PetscCall(VecGetLocalSize(rr, &n));
1121     PetscCheck(n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Right scaling vector wrong length");
1122     PetscCall(VecGetKokkosView(rr, &rv));
1123     PetscCallCXX(Kokkos::parallel_for( // for each nonzero
1124       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, nz), KOKKOS_LAMBDA(const PetscInt k) { Aa(k) *= rv(Aj(k)); }));
1125     PetscCall(VecRestoreKokkosView(rr, &lv));
1126     PetscCall(PetscLogGpuFlops(nz));
1127   }
1128   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1129   PetscCall(PetscLogGpuTimeEnd());
1130   PetscFunctionReturn(PETSC_SUCCESS);
1131 }
1132 
MatZeroEntries_SeqAIJKokkos(Mat A)1133 static PetscErrorCode MatZeroEntries_SeqAIJKokkos(Mat A)
1134 {
1135   Mat_SeqAIJKokkos *aijkok;
1136 
1137   PetscFunctionBegin;
1138   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1139   if (aijkok) { /* Only zero the device if data is already there */
1140     KokkosBlas::fill(PetscGetKokkosExecutionSpace(), aijkok->a_dual.view_device(), 0.0);
1141     PetscCall(MatSeqAIJKokkosModifyDevice(A));
1142   } else { /* Might be preallocated but not assembled */
1143     PetscCall(MatZeroEntries_SeqAIJ(A));
1144   }
1145   PetscFunctionReturn(PETSC_SUCCESS);
1146 }
1147 
MatGetDiagonal_SeqAIJKokkos(Mat A,Vec x)1148 static PetscErrorCode MatGetDiagonal_SeqAIJKokkos(Mat A, Vec x)
1149 {
1150   Mat_SeqAIJKokkos     *aijkok;
1151   PetscInt              n;
1152   PetscScalarKokkosView xv;
1153 
1154   PetscFunctionBegin;
1155   PetscCall(VecGetLocalSize(x, &n));
1156   PetscCheck(n == A->rmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "Nonconforming matrix and vector");
1157   PetscCheck(A->factortype == MAT_FACTOR_NONE, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetDiagonal_SeqAIJKokkos not supported on factored matrices");
1158 
1159   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1160   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1161 
1162   const auto &Aa    = aijkok->a_dual.view_device();
1163   const auto &Ai    = aijkok->i_dual.view_device();
1164   const auto &Adiag = aijkok->diag_dual.view_device();
1165 
1166   PetscCall(VecGetKokkosViewWrite(x, &xv));
1167   Kokkos::parallel_for(
1168     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt i) {
1169       if (Adiag(i) < Ai(i + 1)) xv(i) = Aa(Adiag(i));
1170       else xv(i) = 0;
1171     });
1172   PetscCall(VecRestoreKokkosViewWrite(x, &xv));
1173   PetscFunctionReturn(PETSC_SUCCESS);
1174 }
1175 
1176 /* Get a Kokkos View from a mat of type MatSeqAIJKokkos */
MatSeqAIJGetKokkosView(Mat A,ConstMatScalarKokkosView * kv)1177 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1178 {
1179   Mat_SeqAIJKokkos *aijkok;
1180 
1181   PetscFunctionBegin;
1182   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1183   PetscAssertPointer(kv, 2);
1184   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1185   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1186   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1187   *kv    = aijkok->a_dual.view_device();
1188   PetscFunctionReturn(PETSC_SUCCESS);
1189 }
1190 
MatSeqAIJRestoreKokkosView(Mat A,ConstMatScalarKokkosView * kv)1191 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, ConstMatScalarKokkosView *kv)
1192 {
1193   PetscFunctionBegin;
1194   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1195   PetscAssertPointer(kv, 2);
1196   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1197   PetscFunctionReturn(PETSC_SUCCESS);
1198 }
1199 
MatSeqAIJGetKokkosView(Mat A,MatScalarKokkosView * kv)1200 PetscErrorCode MatSeqAIJGetKokkosView(Mat A, MatScalarKokkosView *kv)
1201 {
1202   Mat_SeqAIJKokkos *aijkok;
1203 
1204   PetscFunctionBegin;
1205   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1206   PetscAssertPointer(kv, 2);
1207   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1208   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1209   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1210   *kv    = aijkok->a_dual.view_device();
1211   PetscFunctionReturn(PETSC_SUCCESS);
1212 }
1213 
MatSeqAIJRestoreKokkosView(Mat A,MatScalarKokkosView * kv)1214 PetscErrorCode MatSeqAIJRestoreKokkosView(Mat A, MatScalarKokkosView *kv)
1215 {
1216   PetscFunctionBegin;
1217   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1218   PetscAssertPointer(kv, 2);
1219   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1220   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1221   PetscFunctionReturn(PETSC_SUCCESS);
1222 }
1223 
MatSeqAIJGetKokkosViewWrite(Mat A,MatScalarKokkosView * kv)1224 PetscErrorCode MatSeqAIJGetKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1225 {
1226   Mat_SeqAIJKokkos *aijkok;
1227 
1228   PetscFunctionBegin;
1229   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1230   PetscAssertPointer(kv, 2);
1231   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1232   aijkok = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1233   *kv    = aijkok->a_dual.view_device();
1234   PetscFunctionReturn(PETSC_SUCCESS);
1235 }
1236 
MatSeqAIJRestoreKokkosViewWrite(Mat A,MatScalarKokkosView * kv)1237 PetscErrorCode MatSeqAIJRestoreKokkosViewWrite(Mat A, MatScalarKokkosView *kv)
1238 {
1239   PetscFunctionBegin;
1240   PetscValidHeaderSpecific(A, MAT_CLASSID, 1);
1241   PetscAssertPointer(kv, 2);
1242   PetscCheckTypeName(A, MATSEQAIJKOKKOS);
1243   PetscCall(MatSeqAIJKokkosModifyDevice(A));
1244   PetscFunctionReturn(PETSC_SUCCESS);
1245 }
1246 
MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm,PetscInt m,PetscInt n,Kokkos::View<PetscInt * > & i_d,Kokkos::View<PetscInt * > & j_d,Kokkos::View<PetscScalar * > & a_d,Mat * A)1247 PetscErrorCode MatCreateSeqAIJKokkosWithKokkosViews(MPI_Comm comm, PetscInt m, PetscInt n, Kokkos::View<PetscInt *> &i_d, Kokkos::View<PetscInt *> &j_d, Kokkos::View<PetscScalar *> &a_d, Mat *A)
1248 {
1249   Mat_SeqAIJKokkos *akok;
1250 
1251   PetscFunctionBegin;
1252   PetscCallCXX(akok = new Mat_SeqAIJKokkos(m, n, j_d.extent(0), i_d, j_d, a_d));
1253   PetscCall(MatCreate(comm, A));
1254   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1255   PetscFunctionReturn(PETSC_SUCCESS);
1256 }
1257 
1258 /* Computes Y += alpha X */
MatAXPY_SeqAIJKokkos(Mat Y,PetscScalar alpha,Mat X,MatStructure pattern)1259 static PetscErrorCode MatAXPY_SeqAIJKokkos(Mat Y, PetscScalar alpha, Mat X, MatStructure pattern)
1260 {
1261   Mat_SeqAIJ              *x = (Mat_SeqAIJ *)X->data, *y = (Mat_SeqAIJ *)Y->data;
1262   Mat_SeqAIJKokkos        *xkok, *ykok, *zkok;
1263   ConstMatScalarKokkosView Xa;
1264   MatScalarKokkosView      Ya;
1265   auto                     exec = PetscGetKokkosExecutionSpace();
1266 
1267   PetscFunctionBegin;
1268   PetscCheckTypeName(Y, MATSEQAIJKOKKOS);
1269   PetscCheckTypeName(X, MATSEQAIJKOKKOS);
1270   PetscCall(MatSeqAIJKokkosSyncDevice(Y));
1271   PetscCall(MatSeqAIJKokkosSyncDevice(X));
1272   PetscCall(PetscLogGpuTimeBegin());
1273 
1274   if (pattern != SAME_NONZERO_PATTERN && x->nz == y->nz) {
1275     PetscBool e;
1276     PetscCall(PetscArraycmp(x->i, y->i, Y->rmap->n + 1, &e));
1277     if (e) {
1278       PetscCall(PetscArraycmp(x->j, y->j, y->nz, &e));
1279       if (e) pattern = SAME_NONZERO_PATTERN;
1280     }
1281   }
1282 
1283   /* cusparseDcsrgeam2() computes C = alpha A + beta B. If one knew sparsity pattern of C, one can skip
1284     cusparseScsrgeam2_bufferSizeExt() / cusparseXcsrgeam2Nnz(), and directly call cusparseScsrgeam2().
1285     If X is SUBSET_NONZERO_PATTERN of Y, we could take advantage of this cusparse feature. However,
1286     KokkosSparse::spadd(alpha,A,beta,B,C) has symbolic and numeric phases, MatAXPY does not.
1287   */
1288   ykok = static_cast<Mat_SeqAIJKokkos *>(Y->spptr);
1289   xkok = static_cast<Mat_SeqAIJKokkos *>(X->spptr);
1290   Xa   = xkok->a_dual.view_device();
1291   Ya   = ykok->a_dual.view_device();
1292 
1293   if (pattern == SAME_NONZERO_PATTERN) {
1294     KokkosBlas::axpy(exec, alpha, Xa, Ya);
1295     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1296   } else if (pattern == SUBSET_NONZERO_PATTERN) {
1297     MatRowMapKokkosView Xi = xkok->i_dual.view_device(), Yi = ykok->i_dual.view_device();
1298     MatColIdxKokkosView Xj = xkok->j_dual.view_device(), Yj = ykok->j_dual.view_device();
1299 
1300     Kokkos::parallel_for(
1301       Kokkos::TeamPolicy<>(exec, Y->rmap->n, 1), KOKKOS_LAMBDA(const KokkosTeamMemberType &t) {
1302         PetscInt i = t.league_rank(); // row i
1303         Kokkos::single(Kokkos::PerTeam(t), [=]() {
1304           // Only one thread works in a team
1305           PetscInt p, q = Yi(i);
1306           for (p = Xi(i); p < Xi(i + 1); p++) {          // For each nonzero on row i of X,
1307             while (Xj(p) != Yj(q) && q < Yi(i + 1)) q++; // find the matching nonzero on row i of Y.
1308             if (Xj(p) == Yj(q)) {                        // Found it
1309               Ya(q) += alpha * Xa(p);
1310               q++;
1311             } else {
1312             // If not found, it indicates the input is wrong (X is not a SUBSET_NONZERO_PATTERN of Y).
1313             // Just insert a NaN at the beginning of row i if it is not empty, to make the result wrong.
1314 #if PETSC_PKG_KOKKOS_VERSION_GE(5, 0, 0)
1315               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = KokkosKernels::ArithTraits<PetscScalar>::nan();
1316 #elif PETSC_PKG_KOKKOS_VERSION_GE(3, 7, 0)
1317               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::ArithTraits<PetscScalar>::nan();
1318 #else
1319               if (Yi(i) != Yi(i + 1)) Ya(Yi(i)) = Kokkos::Experimental::nan("1");
1320 #endif
1321             }
1322           }
1323         });
1324       });
1325     PetscCall(MatSeqAIJKokkosModifyDevice(Y));
1326   } else { // different nonzero patterns
1327     Mat             Z;
1328     KokkosCsrMatrix zcsr;
1329     KernelHandle    kh;
1330     kh.create_spadd_handle(true); // X, Y are sorted
1331     KokkosSparse::spadd_symbolic(&kh, xkok->csrmat, ykok->csrmat, zcsr);
1332     KokkosSparse::spadd_numeric(&kh, alpha, xkok->csrmat, (PetscScalar)1.0, ykok->csrmat, zcsr);
1333     zkok = new Mat_SeqAIJKokkos(zcsr);
1334     PetscCall(MatCreateSeqAIJKokkosWithCSRMatrix(PETSC_COMM_SELF, zkok, &Z));
1335     PetscCall(MatHeaderReplace(Y, &Z));
1336     kh.destroy_spadd_handle();
1337   }
1338   PetscCall(PetscLogGpuTimeEnd());
1339   PetscCall(PetscLogGpuFlops(xkok->a_dual.extent(0) * 2)); // Because we scaled X and then added it to Y
1340   PetscFunctionReturn(PETSC_SUCCESS);
1341 }
1342 
1343 struct MatCOOStruct_SeqAIJKokkos {
1344   PetscCount           n;
1345   PetscCount           Atot;
1346   PetscInt             nz;
1347   PetscCountKokkosView jmap;
1348   PetscCountKokkosView perm;
1349 
MatCOOStruct_SeqAIJKokkosMatCOOStruct_SeqAIJKokkos1350   MatCOOStruct_SeqAIJKokkos(const MatCOOStruct_SeqAIJ *coo_h)
1351   {
1352     nz   = coo_h->nz;
1353     n    = coo_h->n;
1354     Atot = coo_h->Atot;
1355     jmap = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->jmap, nz + 1));
1356     perm = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), PetscCountKokkosViewHost(coo_h->perm, Atot));
1357   }
1358 };
1359 
MatCOOStructDestroy_SeqAIJKokkos(PetscCtxRt data)1360 static PetscErrorCode MatCOOStructDestroy_SeqAIJKokkos(PetscCtxRt data)
1361 {
1362   PetscFunctionBegin;
1363   PetscCallCXX(delete *static_cast<MatCOOStruct_SeqAIJKokkos **>(data));
1364   PetscFunctionReturn(PETSC_SUCCESS);
1365 }
1366 
MatSetPreallocationCOO_SeqAIJKokkos(Mat mat,PetscCount coo_n,PetscInt coo_i[],PetscInt coo_j[])1367 static PetscErrorCode MatSetPreallocationCOO_SeqAIJKokkos(Mat mat, PetscCount coo_n, PetscInt coo_i[], PetscInt coo_j[])
1368 {
1369   Mat_SeqAIJKokkos          *akok;
1370   Mat_SeqAIJ                *aseq;
1371   PetscContainer             container_h;
1372   MatCOOStruct_SeqAIJ       *coo_h;
1373   MatCOOStruct_SeqAIJKokkos *coo_d;
1374 
1375   PetscFunctionBegin;
1376   PetscCall(MatSetPreallocationCOO_SeqAIJ(mat, coo_n, coo_i, coo_j));
1377   aseq = static_cast<Mat_SeqAIJ *>(mat->data);
1378   akok = static_cast<Mat_SeqAIJKokkos *>(mat->spptr);
1379   delete akok;
1380   mat->spptr = akok = new Mat_SeqAIJKokkos(mat, mat->rmap->n, mat->cmap->n, aseq, mat->nonzerostate + 1, PETSC_FALSE);
1381   PetscCall(MatZeroEntries_SeqAIJKokkos(mat));
1382 
1383   // Copy the COO struct to device
1384   PetscCall(PetscObjectQuery((PetscObject)mat, "__PETSc_MatCOOStruct_Host", (PetscObject *)&container_h));
1385   PetscCall(PetscContainerGetPointer(container_h, &coo_h));
1386   PetscCallCXX(coo_d = new MatCOOStruct_SeqAIJKokkos(coo_h));
1387 
1388   // Put the COO struct in a container and then attach that to the matrix
1389   PetscCall(PetscObjectContainerCompose((PetscObject)mat, "__PETSc_MatCOOStruct_Device", coo_d, MatCOOStructDestroy_SeqAIJKokkos));
1390   PetscFunctionReturn(PETSC_SUCCESS);
1391 }
1392 
MatSetValuesCOO_SeqAIJKokkos(Mat A,const PetscScalar v[],InsertMode imode)1393 static PetscErrorCode MatSetValuesCOO_SeqAIJKokkos(Mat A, const PetscScalar v[], InsertMode imode)
1394 {
1395   MatScalarKokkosView        Aa;
1396   ConstMatScalarKokkosView   kv;
1397   PetscMemType               memtype;
1398   PetscContainer             container;
1399   MatCOOStruct_SeqAIJKokkos *coo;
1400 
1401   PetscFunctionBegin;
1402   PetscCall(PetscObjectQuery((PetscObject)A, "__PETSc_MatCOOStruct_Device", (PetscObject *)&container));
1403   PetscCall(PetscContainerGetPointer(container, &coo));
1404 
1405   const auto &n    = coo->n;
1406   const auto &Annz = coo->nz;
1407   const auto &jmap = coo->jmap;
1408   const auto &perm = coo->perm;
1409 
1410   PetscCall(PetscGetMemType(v, &memtype));
1411   if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
1412     kv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstMatScalarKokkosViewHost(v, n));
1413   } else {
1414     kv = ConstMatScalarKokkosView(v, n); /* Directly use v[]'s memory */
1415   }
1416 
1417   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJGetKokkosViewWrite(A, &Aa)); /* write matrix values */
1418   else PetscCall(MatSeqAIJGetKokkosView(A, &Aa));                             /* read & write matrix values */
1419 
1420   PetscCall(PetscLogGpuTimeBegin());
1421   Kokkos::parallel_for(
1422     Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, Annz), KOKKOS_LAMBDA(const PetscCount i) {
1423       PetscScalar sum = 0.0;
1424       for (PetscCount k = jmap(i); k < jmap(i + 1); k++) sum += kv(perm(k));
1425       Aa(i) = (imode == INSERT_VALUES ? 0.0 : Aa(i)) + sum;
1426     });
1427   PetscCall(PetscLogGpuTimeEnd());
1428 
1429   if (imode == INSERT_VALUES) PetscCall(MatSeqAIJRestoreKokkosViewWrite(A, &Aa));
1430   else PetscCall(MatSeqAIJRestoreKokkosView(A, &Aa));
1431   PetscFunctionReturn(PETSC_SUCCESS);
1432 }
1433 
MatSetOps_SeqAIJKokkos(Mat A)1434 static PetscErrorCode MatSetOps_SeqAIJKokkos(Mat A)
1435 {
1436   Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1437 
1438   PetscFunctionBegin;
1439   A->offloadmask = PETSC_OFFLOAD_KOKKOS; /* We do not really use this flag */
1440   A->boundtocpu  = PETSC_FALSE;
1441 
1442   A->ops->assemblyend               = MatAssemblyEnd_SeqAIJKokkos;
1443   A->ops->destroy                   = MatDestroy_SeqAIJKokkos;
1444   A->ops->duplicate                 = MatDuplicate_SeqAIJKokkos;
1445   A->ops->axpy                      = MatAXPY_SeqAIJKokkos;
1446   A->ops->scale                     = MatScale_SeqAIJKokkos;
1447   A->ops->zeroentries               = MatZeroEntries_SeqAIJKokkos;
1448   A->ops->productsetfromoptions     = MatProductSetFromOptions_SeqAIJKokkos;
1449   A->ops->mult                      = MatMult_SeqAIJKokkos;
1450   A->ops->multadd                   = MatMultAdd_SeqAIJKokkos;
1451   A->ops->multtranspose             = MatMultTranspose_SeqAIJKokkos;
1452   A->ops->multtransposeadd          = MatMultTransposeAdd_SeqAIJKokkos;
1453   A->ops->multhermitiantranspose    = MatMultHermitianTranspose_SeqAIJKokkos;
1454   A->ops->multhermitiantransposeadd = MatMultHermitianTransposeAdd_SeqAIJKokkos;
1455   A->ops->productnumeric            = MatProductNumeric_SeqAIJKokkos_SeqAIJKokkos;
1456   A->ops->transpose                 = MatTranspose_SeqAIJKokkos;
1457   A->ops->setoption                 = MatSetOption_SeqAIJKokkos;
1458   A->ops->getdiagonal               = MatGetDiagonal_SeqAIJKokkos;
1459   A->ops->shift                     = MatShift_SeqAIJKokkos;
1460   A->ops->diagonalset               = MatDiagonalSet_SeqAIJKokkos;
1461   A->ops->diagonalscale             = MatDiagonalScale_SeqAIJKokkos;
1462   A->ops->getcurrentmemtype         = MatGetCurrentMemType_SeqAIJKokkos;
1463   a->ops->getarray                  = MatSeqAIJGetArray_SeqAIJKokkos;
1464   a->ops->restorearray              = MatSeqAIJRestoreArray_SeqAIJKokkos;
1465   a->ops->getarrayread              = MatSeqAIJGetArrayRead_SeqAIJKokkos;
1466   a->ops->restorearrayread          = MatSeqAIJRestoreArrayRead_SeqAIJKokkos;
1467   a->ops->getarraywrite             = MatSeqAIJGetArrayWrite_SeqAIJKokkos;
1468   a->ops->restorearraywrite         = MatSeqAIJRestoreArrayWrite_SeqAIJKokkos;
1469   a->ops->getcsrandmemtype          = MatSeqAIJGetCSRAndMemType_SeqAIJKokkos;
1470 
1471   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetPreallocationCOO_C", MatSetPreallocationCOO_SeqAIJKokkos));
1472   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatSetValuesCOO_C", MatSetValuesCOO_SeqAIJKokkos));
1473 #if defined(PETSC_HAVE_HYPRE)
1474   PetscCall(PetscObjectComposeFunction((PetscObject)A, "MatConvert_seqaijkokkos_hypre_C", MatConvert_AIJ_HYPRE));
1475 #endif
1476   PetscFunctionReturn(PETSC_SUCCESS);
1477 }
1478 
1479 /*
1480    Extract the (prescribled) diagonal blocks of the matrix and then invert them
1481 
1482   Input Parameters:
1483 +  A       - the MATSEQAIJKOKKOS matrix
1484 .  bs      - block sizes in 'csr' format, i.e., the i-th block has size bs(i+1) - bs(i)
1485 .  bs2     - square of block sizes in 'csr' format, i.e., the i-th block should be stored at offset bs2(i) in diagVal[]
1486 .  blkMap  - map row ids to block ids, i.e., row i belongs to the block blkMap(i)
1487 -  work    - a pre-allocated work buffer (as big as diagVal) for use by this routine
1488 
1489   Output Parameter:
1490 .  diagVal - the (pre-allocated) buffer to store the inverted blocks (each block is stored in column-major order)
1491 */
MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A,const PetscIntKokkosView & bs,const PetscIntKokkosView & bs2,const PetscIntKokkosView & blkMap,PetscScalarKokkosView & work,PetscScalarKokkosView & diagVal)1492 PETSC_INTERN PetscErrorCode MatInvertVariableBlockDiagonal_SeqAIJKokkos(Mat A, const PetscIntKokkosView &bs, const PetscIntKokkosView &bs2, const PetscIntKokkosView &blkMap, PetscScalarKokkosView &work, PetscScalarKokkosView &diagVal)
1493 {
1494   Mat_SeqAIJKokkos *akok    = static_cast<Mat_SeqAIJKokkos *>(A->spptr);
1495   PetscInt          nblocks = bs.extent(0) - 1;
1496 
1497   PetscFunctionBegin;
1498   PetscCall(MatSeqAIJKokkosSyncDevice(A)); // Since we'll access A's value on device
1499 
1500   // Pull out the diagonal blocks of the matrix and then invert the blocks
1501   auto Aa    = akok->a_dual.view_device();
1502   auto Ai    = akok->i_dual.view_device();
1503   auto Aj    = akok->j_dual.view_device();
1504   auto Adiag = akok->diag_dual.view_device();
1505   // TODO: how to tune the team size?
1506 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1507   auto ts = Kokkos::AUTO();
1508 #else
1509   auto ts = 16; // improved performance 30% over Kokkos::AUTO() with CUDA, but failed with "Kokkos::abort: Requested Team Size is too large!" on CPUs
1510 #endif
1511   PetscCallCXX(Kokkos::parallel_for(
1512     Kokkos::TeamPolicy<>(PetscGetKokkosExecutionSpace(), nblocks, ts), KOKKOS_LAMBDA(const KokkosTeamMemberType &teamMember) {
1513       const PetscInt bid    = teamMember.league_rank();                                                   // block id
1514       const PetscInt rstart = bs(bid);                                                                    // this block starts from this row
1515       const PetscInt m      = bs(bid + 1) - bs(bid);                                                      // size of this block
1516       const auto    &B      = Kokkos::View<PetscScalar **, Kokkos::LayoutLeft>(&diagVal(bs2(bid)), m, m); // column-major order
1517       const auto    &W      = PetscScalarKokkosView(&work(bs2(bid)), m * m);
1518 
1519       Kokkos::parallel_for(Kokkos::TeamThreadRange(teamMember, m), [=](const PetscInt &r) { // r-th row in B
1520         PetscInt i = rstart + r;                                                            // i-th row in A
1521 
1522         if (Ai(i) <= Adiag(i) && Adiag(i) < Ai(i + 1)) { // if the diagonal exists (common case)
1523           PetscInt first = Adiag(i) - r;                 // we start to check nonzeros from here along this row
1524 
1525           for (PetscInt c = 0; c < m; c++) {                   // walk n steps to see what column indices we will meet
1526             if (first + c < Ai(i) || first + c >= Ai(i + 1)) { // this entry (first+c) is out of range of this row, in other words, its value is zero
1527               B(r, c) = 0.0;
1528             } else if (Aj(first + c) == rstart + c) { // this entry is right on the (rstart+c) column
1529               B(r, c) = Aa(first + c);
1530             } else { // this entry does not show up in the CSR
1531               B(r, c) = 0.0;
1532             }
1533           }
1534         } else { // rare case that the diagonal does not exist
1535           const PetscInt begin = Ai(i);
1536           const PetscInt end   = Ai(i + 1);
1537           for (PetscInt c = 0; c < m; c++) B(r, c) = 0.0;
1538           for (PetscInt j = begin; j < end; j++) { // scan the whole row; could use binary search but this is a rare case so we did not.
1539             if (rstart <= Aj(j) && Aj(j) < rstart + m) B(r, Aj(j) - rstart) = Aa(j);
1540             else if (Aj(j) >= rstart + m) break;
1541           }
1542         }
1543       });
1544 
1545       // LU-decompose B (w/o pivoting) and then invert B
1546       KokkosBatched::TeamLU<KokkosTeamMemberType, KokkosBatched::Algo::LU::Unblocked>::invoke(teamMember, B, 0.0);
1547       KokkosBatched::TeamInverseLU<KokkosTeamMemberType, KokkosBatched::Algo::InverseLU::Unblocked>::invoke(teamMember, B, W);
1548     }));
1549   // PetscLogGpuFlops() is done in the caller PCSetUp_VPBJacobi_Kokkos as we don't want to compute the flops in kernels
1550   PetscFunctionReturn(PETSC_SUCCESS);
1551 }
1552 
MatSetSeqAIJKokkosWithCSRMatrix(Mat A,Mat_SeqAIJKokkos * akok)1553 PETSC_INTERN PetscErrorCode MatSetSeqAIJKokkosWithCSRMatrix(Mat A, Mat_SeqAIJKokkos *akok)
1554 {
1555   Mat_SeqAIJ *aseq;
1556   PetscInt    i, m, n;
1557   auto        exec = PetscGetKokkosExecutionSpace();
1558 
1559   PetscFunctionBegin;
1560   PetscCheck(!A->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "A->spptr is supposed to be empty");
1561 
1562   m = akok->nrows();
1563   n = akok->ncols();
1564   PetscCall(MatSetSizes(A, m, n, m, n));
1565   PetscCall(MatSetType(A, MATSEQAIJKOKKOS));
1566 
1567   /* Set up data structures of A as a MATSEQAIJ */
1568   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(A, MAT_SKIP_ALLOCATION, NULL));
1569   aseq = (Mat_SeqAIJ *)A->data;
1570 
1571   PetscCall(KokkosDualViewSyncHost(akok->i_dual, exec)); /* We always need sync'ed i, j on host */
1572   PetscCall(KokkosDualViewSyncHost(akok->j_dual, exec));
1573 
1574   aseq->i       = akok->i_host_data();
1575   aseq->j       = akok->j_host_data();
1576   aseq->a       = akok->a_host_data();
1577   aseq->nonew   = -1; /*this indicates that inserting a new value in the matrix that generates a new nonzero is an error*/
1578   aseq->free_a  = PETSC_FALSE;
1579   aseq->free_ij = PETSC_FALSE;
1580   aseq->nz      = akok->nnz();
1581   aseq->maxnz   = aseq->nz;
1582 
1583   PetscCall(PetscMalloc1(m, &aseq->imax));
1584   PetscCall(PetscMalloc1(m, &aseq->ilen));
1585   for (i = 0; i < m; i++) aseq->ilen[i] = aseq->imax[i] = aseq->i[i + 1] - aseq->i[i];
1586 
1587   /* It is critical to set the nonzerostate, as we use it to check if sparsity pattern (hence data) has changed on host in MatAssemblyEnd */
1588   akok->nonzerostate = A->nonzerostate;
1589   A->spptr           = akok; /* Set A->spptr before MatAssembly so that A->spptr won't be allocated again there */
1590   PetscCall(MatAssemblyBegin(A, MAT_FINAL_ASSEMBLY));
1591   PetscCall(MatAssemblyEnd(A, MAT_FINAL_ASSEMBLY));
1592   PetscFunctionReturn(PETSC_SUCCESS);
1593 }
1594 
MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A,KokkosCsrMatrix * csr)1595 PETSC_INTERN PetscErrorCode MatSeqAIJKokkosGetKokkosCsrMatrix(Mat A, KokkosCsrMatrix *csr)
1596 {
1597   PetscFunctionBegin;
1598   PetscCall(MatSeqAIJKokkosSyncDevice(A));
1599   *csr = static_cast<Mat_SeqAIJKokkos *>(A->spptr)->csrmat;
1600   PetscFunctionReturn(PETSC_SUCCESS);
1601 }
1602 
MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm,KokkosCsrMatrix csr,Mat * A)1603 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithKokkosCsrMatrix(MPI_Comm comm, KokkosCsrMatrix csr, Mat *A)
1604 {
1605   Mat_SeqAIJKokkos *akok;
1606 
1607   PetscFunctionBegin;
1608   PetscCallCXX(akok = new Mat_SeqAIJKokkos(csr));
1609   PetscCall(MatCreate(comm, A));
1610   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1611   PetscFunctionReturn(PETSC_SUCCESS);
1612 }
1613 
1614 /* Crete a SEQAIJKOKKOS matrix with a Mat_SeqAIJKokkos data structure
1615 
1616    Note we have names like MatSeqAIJSetPreallocationCSR, so I use capitalized CSR
1617  */
MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm,Mat_SeqAIJKokkos * akok,Mat * A)1618 PETSC_INTERN PetscErrorCode MatCreateSeqAIJKokkosWithCSRMatrix(MPI_Comm comm, Mat_SeqAIJKokkos *akok, Mat *A)
1619 {
1620   PetscFunctionBegin;
1621   PetscCall(MatCreate(comm, A));
1622   PetscCall(MatSetSeqAIJKokkosWithCSRMatrix(*A, akok));
1623   PetscFunctionReturn(PETSC_SUCCESS);
1624 }
1625 
1626 /*@C
1627   MatCreateSeqAIJKokkos - Creates a sparse matrix in `MATSEQAIJKOKKOS` (compressed row) format
1628   (the default parallel PETSc format). This matrix will ultimately be handled by
1629   Kokkos for calculations.
1630 
1631   Collective
1632 
1633   Input Parameters:
1634 + comm - MPI communicator, set to `PETSC_COMM_SELF`
1635 . m    - number of rows
1636 . n    - number of columns
1637 . nz   - number of nonzeros per row (same for all rows), ignored if `nnz` is provided
1638 - nnz  - array containing the number of nonzeros in the various rows (possibly different for each row) or `NULL`
1639 
1640   Output Parameter:
1641 . A - the matrix
1642 
1643   Level: intermediate
1644 
1645   Notes:
1646   It is recommended that one use the `MatCreate()`, `MatSetType()` and/or `MatSetFromOptions()`,
1647   MatXXXXSetPreallocation() paradgm instead of this routine directly.
1648   [MatXXXXSetPreallocation() is, for example, `MatSeqAIJSetPreallocation()`]
1649 
1650   The AIJ format, also called
1651   compressed row storage, is fully compatible with standard Fortran
1652   storage.  That is, the stored row and column indices can begin at
1653   either one (as in Fortran) or zero.
1654 
1655   Specify the preallocated storage with either `nz` or `nnz` (not both).
1656   Set `nz` = `PETSC_DEFAULT` and `nnz` = `NULL` for PETSc to control dynamic memory
1657   allocation.
1658 
1659 .seealso: [](ch_matrices), `Mat`, `MatCreate()`, `MatCreateAIJ()`, `MatSetValues()`, `MatSeqAIJSetColumnIndices()`, `MatCreateSeqAIJWithArrays()`
1660 @*/
MatCreateSeqAIJKokkos(MPI_Comm comm,PetscInt m,PetscInt n,PetscInt nz,const PetscInt nnz[],Mat * A)1661 PetscErrorCode MatCreateSeqAIJKokkos(MPI_Comm comm, PetscInt m, PetscInt n, PetscInt nz, const PetscInt nnz[], Mat *A)
1662 {
1663   PetscFunctionBegin;
1664   PetscCall(PetscKokkosInitializeCheck());
1665   PetscCall(MatCreate(comm, A));
1666   PetscCall(MatSetSizes(*A, m, n, m, n));
1667   PetscCall(MatSetType(*A, MATSEQAIJKOKKOS));
1668   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(*A, nz, (PetscInt *)nnz));
1669   PetscFunctionReturn(PETSC_SUCCESS);
1670 }
1671 
1672 // After matrix numeric factorization, there are still steps to do before triangular solve can be called.
1673 // For example, for transpose solve, we might need to compute the transpose matrices if the solver does not support it (such as KK, while cusparse does).
1674 // In cusparse, one has to call cusparseSpSV_analysis() with updated triangular matrix values before calling cusparseSpSV_solve().
1675 // Simiarily, in KK sptrsv_symbolic() has to be called before sptrsv_solve(). We put these steps in MatSeqAIJKokkos{Transpose}SolveCheck.
MatSeqAIJKokkosSolveCheck(Mat A)1676 static PetscErrorCode MatSeqAIJKokkosSolveCheck(Mat A)
1677 {
1678   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1679   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1680   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU and Choleksy
1681 
1682   PetscFunctionBegin;
1683   if (!factors->sptrsv_symbolic_completed) { // If sptrsv_symbolic was not called yet
1684     if (has_upper) PetscCallCXX(sptrsv_symbolic(&factors->khU, factors->iU_d, factors->jU_d, factors->aU_d));
1685     if (has_lower) PetscCallCXX(sptrsv_symbolic(&factors->khL, factors->iL_d, factors->jL_d, factors->aL_d));
1686     factors->sptrsv_symbolic_completed = PETSC_TRUE;
1687   }
1688   PetscFunctionReturn(PETSC_SUCCESS);
1689 }
1690 
MatSeqAIJKokkosTransposeSolveCheck(Mat A)1691 static PetscErrorCode MatSeqAIJKokkosTransposeSolveCheck(Mat A)
1692 {
1693   const PetscInt              n         = A->rmap->n;
1694   Mat_SeqAIJKokkosTriFactors *factors   = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1695   const PetscBool             has_lower = factors->iL_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // false with Choleksy
1696   const PetscBool             has_upper = factors->iU_d.extent(0) ? PETSC_TRUE : PETSC_FALSE; // true with LU or Choleksy
1697 
1698   PetscFunctionBegin;
1699   if (!factors->transpose_updated) {
1700     if (has_upper) {
1701       if (!factors->iUt_d.extent(0)) {                                 // Allocate Ut on device if not yet
1702         factors->iUt_d = MatRowMapKokkosView("factors->iUt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1703         factors->jUt_d = MatColIdxKokkosView(NoInit("factors->jUt_d"), factors->jU_d.extent(0));
1704         factors->aUt_d = MatScalarKokkosView(NoInit("factors->aUt_d"), factors->aU_d.extent(0));
1705       }
1706 
1707       if (factors->iU_h.extent(0)) { // If U is on host (factorization was done on host), we also compute the transpose on host
1708         if (!factors->U) {
1709           Mat_SeqAIJ *seq;
1710 
1711           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iU_h.data(), factors->jU_h.data(), factors->aU_h.data(), &factors->U));
1712           PetscCall(MatTranspose(factors->U, MAT_INITIAL_MATRIX, &factors->Ut));
1713 
1714           seq            = static_cast<Mat_SeqAIJ *>(factors->Ut->data);
1715           factors->iUt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1716           factors->jUt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1717           factors->aUt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1718         } else {
1719           PetscCall(MatTranspose(factors->U, MAT_REUSE_MATRIX, &factors->Ut)); // Matrix Ut' data is aliased with {i, j, a}Ut_h
1720         }
1721         // Copy Ut from host to device
1722         PetscCallCXX(Kokkos::deep_copy(factors->iUt_d, factors->iUt_h));
1723         PetscCallCXX(Kokkos::deep_copy(factors->jUt_d, factors->jUt_h));
1724         PetscCallCXX(Kokkos::deep_copy(factors->aUt_d, factors->aUt_h));
1725       } else { // If U was computed on device, we also compute the transpose there
1726         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1727         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iU_d,
1728                                                                                                                                                                                                                                factors->jU_d, factors->aU_d,
1729                                                                                                                                                                                                                                factors->iUt_d, factors->jUt_d,
1730                                                                                                                                                                                                                                factors->aUt_d));
1731         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iUt_d, factors->jUt_d, factors->aUt_d));
1732       }
1733       PetscCallCXX(sptrsv_symbolic(&factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d));
1734     }
1735 
1736     // do the same for L with LU
1737     if (has_lower) {
1738       if (!factors->iLt_d.extent(0)) {                                 // Allocate Lt on device if not yet
1739         factors->iLt_d = MatRowMapKokkosView("factors->iLt_d", n + 1); // KK requires this view to be initialized to 0 to call transpose_matrix
1740         factors->jLt_d = MatColIdxKokkosView(NoInit("factors->jLt_d"), factors->jL_d.extent(0));
1741         factors->aLt_d = MatScalarKokkosView(NoInit("factors->aLt_d"), factors->aL_d.extent(0));
1742       }
1743 
1744       if (factors->iL_h.extent(0)) { // If L is on host, we also compute the transpose on host
1745         if (!factors->L) {
1746           Mat_SeqAIJ *seq;
1747 
1748           PetscCall(MatCreateSeqAIJWithArrays(PETSC_COMM_SELF, n, n, factors->iL_h.data(), factors->jL_h.data(), factors->aL_h.data(), &factors->L));
1749           PetscCall(MatTranspose(factors->L, MAT_INITIAL_MATRIX, &factors->Lt));
1750 
1751           seq            = static_cast<Mat_SeqAIJ *>(factors->Lt->data);
1752           factors->iLt_h = MatRowMapKokkosViewHost(seq->i, n + 1);
1753           factors->jLt_h = MatColIdxKokkosViewHost(seq->j, seq->nz);
1754           factors->aLt_h = MatScalarKokkosViewHost(seq->a, seq->nz);
1755         } else {
1756           PetscCall(MatTranspose(factors->L, MAT_REUSE_MATRIX, &factors->Lt)); // Matrix Lt' data is aliased with {i, j, a}Lt_h
1757         }
1758         // Copy Lt from host to device
1759         PetscCallCXX(Kokkos::deep_copy(factors->iLt_d, factors->iLt_h));
1760         PetscCallCXX(Kokkos::deep_copy(factors->jLt_d, factors->jLt_h));
1761         PetscCallCXX(Kokkos::deep_copy(factors->aLt_d, factors->aLt_h));
1762       } else { // If L was computed on device, we also compute the transpose there
1763         // TODO: KK transpose_matrix() does not sort column indices, however cusparse requires sorted indices. We have to sort the indices, until KK provides finer control options.
1764         PetscCallCXX(transpose_matrix<ConstMatRowMapKokkosView, ConstMatColIdxKokkosView, ConstMatScalarKokkosView, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView, MatRowMapKokkosView, DefaultExecutionSpace>(n, n, factors->iL_d,
1765                                                                                                                                                                                                                                factors->jL_d, factors->aL_d,
1766                                                                                                                                                                                                                                factors->iLt_d, factors->jLt_d,
1767                                                                                                                                                                                                                                factors->aLt_d));
1768         PetscCallCXX(sort_crs_matrix<DefaultExecutionSpace, MatRowMapKokkosView, MatColIdxKokkosView, MatScalarKokkosView>(factors->iLt_d, factors->jLt_d, factors->aLt_d));
1769       }
1770       PetscCallCXX(sptrsv_symbolic(&factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d));
1771     }
1772 
1773     factors->transpose_updated = PETSC_TRUE;
1774   }
1775   PetscFunctionReturn(PETSC_SUCCESS);
1776 }
1777 
1778 // Solve Ax = b, with RAR = U^T D U, where R is the row (and col) permutation matrix on A.
1779 // R is represented by rowperm in factors. If R is identity (i.e, no reordering), then rowperm is empty.
MatSolve_SeqAIJKokkos_Cholesky(Mat A,Vec bb,Vec xx)1780 static PetscErrorCode MatSolve_SeqAIJKokkos_Cholesky(Mat A, Vec bb, Vec xx)
1781 {
1782   auto                        exec    = PetscGetKokkosExecutionSpace();
1783   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1784   PetscInt                    m       = A->rmap->n;
1785   PetscScalarKokkosView       D       = factors->D_d;
1786   PetscScalarKokkosView       X, Y, B; // alias
1787   ConstPetscScalarKokkosView  b;
1788   PetscScalarKokkosView       x;
1789   PetscIntKokkosView         &rowperm  = factors->rowperm;
1790   PetscBool                   identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1791 
1792   PetscFunctionBegin;
1793   PetscCall(PetscLogGpuTimeBegin());
1794   PetscCall(MatSeqAIJKokkosSolveCheck(A));          // for UX = T
1795   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // for U^T Y = B
1796   PetscCall(VecGetKokkosView(bb, &b));
1797   PetscCall(VecGetKokkosViewWrite(xx, &x));
1798 
1799   // Solve U^T Y = B
1800   if (identity) { // Reorder b with the row permutation
1801     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1802     Y = factors->workVector;
1803   } else {
1804     B = factors->workVector;
1805     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1806     Y = x;
1807   }
1808   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1809 
1810   // Solve diag(D) Y' = Y.
1811   // Actually just do Y' = Y*D since D is already inverted in MatCholeskyFactorNumeric_SeqAIJ(). It is basically a vector element-wise multiplication.
1812   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { Y(i) = Y(i) * D(i); }));
1813 
1814   // Solve UX = Y
1815   if (identity) {
1816     X = x;
1817   } else {
1818     X = factors->workVector; // B is not needed anymore
1819   }
1820   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1821 
1822   // Reorder X with the inverse column (row) permutation
1823   if (!identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1824 
1825   PetscCall(VecRestoreKokkosView(bb, &b));
1826   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1827   PetscCall(PetscLogGpuTimeEnd());
1828   PetscFunctionReturn(PETSC_SUCCESS);
1829 }
1830 
1831 // Solve Ax = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1832 // R and C are represented by rowperm and colperm in factors.
1833 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
MatSolve_SeqAIJKokkos_LU(Mat A,Vec bb,Vec xx)1834 static PetscErrorCode MatSolve_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1835 {
1836   auto                        exec    = PetscGetKokkosExecutionSpace();
1837   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1838   PetscInt                    m       = A->rmap->n;
1839   PetscScalarKokkosView       X, Y, B; // alias
1840   ConstPetscScalarKokkosView  b;
1841   PetscScalarKokkosView       x;
1842   PetscIntKokkosView         &rowperm      = factors->rowperm;
1843   PetscIntKokkosView         &colperm      = factors->colperm;
1844   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1845   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1846 
1847   PetscFunctionBegin;
1848   PetscCall(PetscLogGpuTimeBegin());
1849   PetscCall(MatSeqAIJKokkosSolveCheck(A));
1850   PetscCall(VecGetKokkosView(bb, &b));
1851   PetscCall(VecGetKokkosViewWrite(xx, &x));
1852 
1853   // Solve L Y = B (i.e., L (U C^- x) = R b).  R b indicates applying the row permutation on b.
1854   if (row_identity) {
1855     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1856     Y = factors->workVector;
1857   } else {
1858     B = factors->workVector;
1859     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(rowperm(i)); }));
1860     Y = x;
1861   }
1862   PetscCallCXX(sptrsv_solve(exec, &factors->khL, factors->iL_d, factors->jL_d, factors->aL_d, B, Y));
1863 
1864   // Solve U C^- x = Y
1865   if (col_identity) {
1866     X = x;
1867   } else {
1868     X = factors->workVector;
1869   }
1870   PetscCallCXX(sptrsv_solve(exec, &factors->khU, factors->iU_d, factors->jU_d, factors->aU_d, Y, X));
1871 
1872   // x = C X; Reorder X with the inverse col permutation
1873   if (!col_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(colperm(i)) = X(i); }));
1874 
1875   PetscCall(VecRestoreKokkosView(bb, &b));
1876   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1877   PetscCall(PetscLogGpuTimeEnd());
1878   PetscFunctionReturn(PETSC_SUCCESS);
1879 }
1880 
1881 // Solve A^T x = b, with RAC = LU, where R and C are row and col permutation matrices on A respectively.
1882 // R and C are represented by rowperm and colperm in factors.
1883 // If R or C is identity (i.e, no reordering), then rowperm or colperm is empty.
1884 // A = R^-1 L U C^-1, so A^T = C^-T U^T L^T R^-T. But since C^- = C^T, R^- = R^T, we have A^T = C U^T L^T R.
MatSolveTranspose_SeqAIJKokkos_LU(Mat A,Vec bb,Vec xx)1885 static PetscErrorCode MatSolveTranspose_SeqAIJKokkos_LU(Mat A, Vec bb, Vec xx)
1886 {
1887   auto                        exec    = PetscGetKokkosExecutionSpace();
1888   Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)A->spptr;
1889   PetscInt                    m       = A->rmap->n;
1890   PetscScalarKokkosView       X, Y, B; // alias
1891   ConstPetscScalarKokkosView  b;
1892   PetscScalarKokkosView       x;
1893   PetscIntKokkosView         &rowperm      = factors->rowperm;
1894   PetscIntKokkosView         &colperm      = factors->colperm;
1895   PetscBool                   row_identity = rowperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1896   PetscBool                   col_identity = colperm.extent(0) ? PETSC_FALSE : PETSC_TRUE;
1897 
1898   PetscFunctionBegin;
1899   PetscCall(PetscLogGpuTimeBegin());
1900   PetscCall(MatSeqAIJKokkosTransposeSolveCheck(A)); // Update L^T, U^T if needed, and do sptrsv symbolic for L^T, U^T
1901   PetscCall(VecGetKokkosView(bb, &b));
1902   PetscCall(VecGetKokkosViewWrite(xx, &x));
1903 
1904   // Solve U^T Y = B (i.e., U^T (L^T R x) = C^- b).  Note C^- b = C^T b, which means applying the column permutation on b.
1905   if (col_identity) { // Reorder b with the col permutation
1906     B = PetscScalarKokkosView(const_cast<PetscScalar *>(b.data()), b.extent(0));
1907     Y = factors->workVector;
1908   } else {
1909     B = factors->workVector;
1910     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { B(i) = b(colperm(i)); }));
1911     Y = x;
1912   }
1913   PetscCallCXX(sptrsv_solve(exec, &factors->khUt, factors->iUt_d, factors->jUt_d, factors->aUt_d, B, Y));
1914 
1915   // Solve L^T X = Y
1916   if (row_identity) {
1917     X = x;
1918   } else {
1919     X = factors->workVector;
1920   }
1921   PetscCallCXX(sptrsv_solve(exec, &factors->khLt, factors->iLt_d, factors->jLt_d, factors->aLt_d, Y, X));
1922 
1923   // x = R^- X = R^T X; Reorder X with the inverse row permutation
1924   if (!row_identity) PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(exec, 0, m), KOKKOS_LAMBDA(const PetscInt i) { x(rowperm(i)) = X(i); }));
1925 
1926   PetscCall(VecRestoreKokkosView(bb, &b));
1927   PetscCall(VecRestoreKokkosViewWrite(xx, &x));
1928   PetscCall(PetscLogGpuTimeEnd());
1929   PetscFunctionReturn(PETSC_SUCCESS);
1930 }
1931 
MatLUFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo * info)1932 static PetscErrorCode MatLUFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
1933 {
1934   PetscFunctionBegin;
1935   PetscCall(MatSeqAIJKokkosSyncHost(A));
1936   PetscCall(MatLUFactorNumeric_SeqAIJ(B, A, info));
1937 
1938   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
1939     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
1940     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
1941     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
1942     const MatScalar            *Ba = b->a;
1943     PetscInt                    m = B->rmap->n, n = B->cmap->n;
1944 
1945     if (factors->iL_h.extent(0) == 0) { // Allocate memory and copy the L, U structure for the first time
1946       // Allocate memory and copy the structure
1947       factors->iL_h = MatRowMapKokkosViewHost(NoInit("iL_h"), m + 1);
1948       factors->jL_h = MatColIdxKokkosViewHost(NoInit("jL_h"), (Bi[m] - Bi[0]) + m); // + the diagonal entries
1949       factors->aL_h = MatScalarKokkosViewHost(NoInit("aL_h"), (Bi[m] - Bi[0]) + m);
1950       factors->iU_h = MatRowMapKokkosViewHost(NoInit("iU_h"), m + 1);
1951       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), (Bdiag[0] - Bdiag[m]));
1952       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), (Bdiag[0] - Bdiag[m]));
1953 
1954       PetscInt *Li = factors->iL_h.data();
1955       PetscInt *Lj = factors->jL_h.data();
1956       PetscInt *Ui = factors->iU_h.data();
1957       PetscInt *Uj = factors->jU_h.data();
1958 
1959       Li[0] = Ui[0] = 0;
1960       for (PetscInt i = 0; i < m; i++) {
1961         PetscInt llen = Bi[i + 1] - Bi[i];       // exclusive of the diagonal entry
1962         PetscInt ulen = Bdiag[i] - Bdiag[i + 1]; // inclusive of the diagonal entry
1963 
1964         PetscCall(PetscArraycpy(Lj + Li[i], Bj + Bi[i], llen)); // entries of L on the left of the diagonal
1965         Lj[Li[i] + llen] = i;                                   // diagonal entry of L
1966 
1967         Uj[Ui[i]] = i;                                                             // diagonal entry of U
1968         PetscCall(PetscArraycpy(Uj + Ui[i] + 1, Bj + Bdiag[i + 1] + 1, ulen - 1)); // entries of U on  the right of the diagonal
1969 
1970         Li[i + 1] = Li[i] + llen + 1;
1971         Ui[i + 1] = Ui[i] + ulen;
1972       }
1973 
1974       factors->iL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iL_h);
1975       factors->jL_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jL_h);
1976       factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h);
1977       factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h);
1978       factors->aL_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aL_h);
1979       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
1980 
1981       // Copy row/col permutation to device
1982       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
1983       PetscBool row_identity;
1984       PetscCall(ISIdentity(rowperm, &row_identity));
1985       if (!row_identity) {
1986         const PetscInt *ip;
1987 
1988         PetscCall(ISGetIndices(rowperm, &ip));
1989         factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m);
1990         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
1991         PetscCall(ISRestoreIndices(rowperm, &ip));
1992         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
1993       }
1994 
1995       IS        colperm = ((Mat_SeqAIJ *)B->data)->col;
1996       PetscBool col_identity;
1997       PetscCall(ISIdentity(colperm, &col_identity));
1998       if (!col_identity) {
1999         const PetscInt *ip;
2000 
2001         PetscCall(ISGetIndices(colperm, &ip));
2002         factors->colperm = PetscIntKokkosView(NoInit("colperm"), n);
2003         PetscCallCXX(Kokkos::deep_copy(factors->colperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), n)));
2004         PetscCall(ISRestoreIndices(colperm, &ip));
2005         PetscCall(PetscLogCpuToGpu(n * sizeof(PetscInt)));
2006       }
2007 
2008       /* Create sptrsv handles for L, U and their transpose */
2009 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2010       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2011 #else
2012       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2013 #endif
2014       factors->khL.create_sptrsv_handle(sptrsv_alg, m, true /* L is lower tri */);
2015       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2016       factors->khLt.create_sptrsv_handle(sptrsv_alg, m, false /* L^T is not lower tri */);
2017       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2018     }
2019 
2020     // Copy the value
2021     for (PetscInt i = 0; i < m; i++) {
2022       PetscInt        llen = Bi[i + 1] - Bi[i];
2023       PetscInt        ulen = Bdiag[i] - Bdiag[i + 1];
2024       const PetscInt *Li   = factors->iL_h.data();
2025       const PetscInt *Ui   = factors->iU_h.data();
2026 
2027       PetscScalar *La = factors->aL_h.data();
2028       PetscScalar *Ua = factors->aU_h.data();
2029 
2030       PetscCall(PetscArraycpy(La + Li[i], Ba + Bi[i], llen)); // entries of L
2031       La[Li[i] + llen] = 1.0;                                 // diagonal entry
2032 
2033       Ua[Ui[i]] = 1.0 / Ba[Bdiag[i]];                                            // diagonal entry
2034       PetscCall(PetscArraycpy(Ua + Ui[i] + 1, Ba + Bdiag[i + 1] + 1, ulen - 1)); // entries of U
2035     }
2036 
2037     PetscCallCXX(Kokkos::deep_copy(factors->aL_d, factors->aL_h));
2038     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2039     // Once the factors' values have changed, we need to update their transpose and redo sptrsv symbolic
2040     factors->transpose_updated         = PETSC_FALSE;
2041     factors->sptrsv_symbolic_completed = PETSC_FALSE;
2042 
2043     B->ops->solve          = MatSolve_SeqAIJKokkos_LU;
2044     B->ops->solvetranspose = MatSolveTranspose_SeqAIJKokkos_LU;
2045   }
2046 
2047   B->ops->matsolve          = NULL;
2048   B->ops->matsolvetranspose = NULL;
2049   PetscFunctionReturn(PETSC_SUCCESS);
2050 }
2051 
MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B,Mat A,const MatFactorInfo * info)2052 static PetscErrorCode MatILUFactorNumeric_SeqAIJKokkos_ILU0(Mat B, Mat A, const MatFactorInfo *info)
2053 {
2054   Mat_SeqAIJKokkos           *aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2055   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2056   PetscInt                    fill_lev = info->levels;
2057 
2058   PetscFunctionBegin;
2059   PetscCall(PetscLogGpuTimeBegin());
2060   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo.factoronhost should be false");
2061   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2062 
2063   auto a_d = aijkok->a_dual.view_device();
2064   auto i_d = aijkok->i_dual.view_device();
2065   auto j_d = aijkok->j_dual.view_device();
2066 
2067   PetscCallCXX(spiluk_numeric(&factors->kh, fill_lev, i_d, j_d, a_d, factors->iL_d, factors->jL_d, factors->aL_d, factors->iU_d, factors->jU_d, factors->aU_d));
2068 
2069   B->assembled              = PETSC_TRUE;
2070   B->preallocated           = PETSC_TRUE;
2071   B->ops->solve             = MatSolve_SeqAIJKokkos_LU;
2072   B->ops->solvetranspose    = MatSolveTranspose_SeqAIJKokkos_LU;
2073   B->ops->matsolve          = NULL;
2074   B->ops->matsolvetranspose = NULL;
2075 
2076   /* Once the factors' value changed, we need to update their transpose and sptrsv handle */
2077   factors->transpose_updated         = PETSC_FALSE;
2078   factors->sptrsv_symbolic_completed = PETSC_FALSE;
2079   /* TODO: log flops, but how to know that? */
2080   PetscCall(PetscLogGpuTimeEnd());
2081   PetscFunctionReturn(PETSC_SUCCESS);
2082 }
2083 
2084 // Use KK's spiluk_symbolic() to do ILU0 symbolic factorization, with no row/col reordering
MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B,Mat A,IS,IS,const MatFactorInfo * info)2085 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos_ILU0(Mat B, Mat A, IS, IS, const MatFactorInfo *info)
2086 {
2087   Mat_SeqAIJKokkos           *aijkok;
2088   Mat_SeqAIJ                 *b;
2089   Mat_SeqAIJKokkosTriFactors *factors  = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2090   PetscInt                    fill_lev = info->levels;
2091   PetscInt                    nnzA     = ((Mat_SeqAIJ *)A->data)->nz, nnzL, nnzU;
2092   PetscInt                    n        = A->rmap->n;
2093 
2094   PetscFunctionBegin;
2095   PetscCheck(!info->factoronhost, PetscObjectComm((PetscObject)A), PETSC_ERR_PLIB, "MatFactorInfo's factoronhost should be false as we are doing it on device right now");
2096   PetscCall(MatSeqAIJKokkosSyncDevice(A));
2097 
2098   /* Create a spiluk handle and then do symbolic factorization */
2099   nnzL = nnzU = PetscRealIntMultTruncate(info->fill, nnzA);
2100   factors->kh.create_spiluk_handle(SPILUKAlgorithm::SEQLVLSCHD_TP1, n, nnzL, nnzU);
2101 
2102   auto spiluk_handle = factors->kh.get_spiluk_handle();
2103 
2104   Kokkos::realloc(factors->iL_d, n + 1); /* Free old arrays and realloc */
2105   Kokkos::realloc(factors->jL_d, spiluk_handle->get_nnzL());
2106   Kokkos::realloc(factors->iU_d, n + 1);
2107   Kokkos::realloc(factors->jU_d, spiluk_handle->get_nnzU());
2108 
2109   aijkok   = (Mat_SeqAIJKokkos *)A->spptr;
2110   auto i_d = aijkok->i_dual.view_device();
2111   auto j_d = aijkok->j_dual.view_device();
2112   PetscCallCXX(spiluk_symbolic(&factors->kh, fill_lev, i_d, j_d, factors->iL_d, factors->jL_d, factors->iU_d, factors->jU_d));
2113   /* TODO: if spiluk_symbolic is asynchronous, do we need to sync before calling get_nnzL()? */
2114 
2115   Kokkos::resize(factors->jL_d, spiluk_handle->get_nnzL()); /* Shrink or expand, and retain old value */
2116   Kokkos::resize(factors->jU_d, spiluk_handle->get_nnzU());
2117   Kokkos::realloc(factors->aL_d, spiluk_handle->get_nnzL()); /* No need to retain old value */
2118   Kokkos::realloc(factors->aU_d, spiluk_handle->get_nnzU());
2119 
2120   /* TODO: add options to select sptrsv algorithms */
2121   /* Create sptrsv handles for L, U and their transpose */
2122 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2123   auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2124 #else
2125   auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2126 #endif
2127 
2128   factors->khL.create_sptrsv_handle(sptrsv_alg, n, true /* L is lower tri */);
2129   factors->khU.create_sptrsv_handle(sptrsv_alg, n, false /* U is not lower tri */);
2130   factors->khLt.create_sptrsv_handle(sptrsv_alg, n, false /* L^T is not lower tri */);
2131   factors->khUt.create_sptrsv_handle(sptrsv_alg, n, true /* U^T is lower tri */);
2132 
2133   /* Fill fields of the factor matrix B */
2134   PetscCall(MatSeqAIJSetPreallocation_SeqAIJ(B, MAT_SKIP_ALLOCATION, NULL));
2135   b     = (Mat_SeqAIJ *)B->data;
2136   b->nz = b->maxnz          = spiluk_handle->get_nnzL() + spiluk_handle->get_nnzU();
2137   B->info.fill_ratio_given  = info->fill;
2138   B->info.fill_ratio_needed = nnzA > 0 ? ((PetscReal)b->nz) / ((PetscReal)nnzA) : 1.0;
2139 
2140   B->ops->lufactornumeric = MatILUFactorNumeric_SeqAIJKokkos_ILU0;
2141   PetscFunctionReturn(PETSC_SUCCESS);
2142 }
2143 
MatLUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)2144 static PetscErrorCode MatLUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2145 {
2146   PetscFunctionBegin;
2147   PetscCall(MatLUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info));
2148   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2149   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2150   B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2151   PetscFunctionReturn(PETSC_SUCCESS);
2152 }
2153 
MatILUFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS isrow,IS iscol,const MatFactorInfo * info)2154 static PetscErrorCode MatILUFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS isrow, IS iscol, const MatFactorInfo *info)
2155 {
2156   PetscBool row_identity = PETSC_FALSE, col_identity = PETSC_FALSE;
2157 
2158   PetscFunctionBegin;
2159   if (!info->factoronhost) {
2160     PetscCall(ISIdentity(isrow, &row_identity));
2161     PetscCall(ISIdentity(iscol, &col_identity));
2162   }
2163 
2164   PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2165   PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2166 
2167   if (!info->factoronhost && !info->levels && row_identity && col_identity) { // if level 0 and no reordering
2168     PetscCall(MatILUFactorSymbolic_SeqAIJKokkos_ILU0(B, A, isrow, iscol, info));
2169   } else {
2170     PetscCall(MatILUFactorSymbolic_SeqAIJ(B, A, isrow, iscol, info)); // otherwise, use PETSc's ILU on host
2171     B->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJKokkos;
2172   }
2173   PetscFunctionReturn(PETSC_SUCCESS);
2174 }
2175 
MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B,Mat A,const MatFactorInfo * info)2176 static PetscErrorCode MatCholeskyFactorNumeric_SeqAIJKokkos(Mat B, Mat A, const MatFactorInfo *info)
2177 {
2178   PetscFunctionBegin;
2179   PetscCall(MatSeqAIJKokkosSyncHost(A));
2180   PetscCall(MatCholeskyFactorNumeric_SeqAIJ(B, A, info));
2181 
2182   if (!info->solveonhost) { // if solve on host, then we don't need to copy L, U to device
2183     Mat_SeqAIJKokkosTriFactors *factors = (Mat_SeqAIJKokkosTriFactors *)B->spptr;
2184     Mat_SeqAIJ                 *b       = static_cast<Mat_SeqAIJ *>(B->data);
2185     const PetscInt             *Bi = b->i, *Bj = b->j, *Bdiag = b->diag;
2186     const MatScalar            *Ba = b->a;
2187     PetscInt                    m  = B->rmap->n;
2188 
2189     if (factors->iU_h.extent(0) == 0) { // First time of numeric factorization
2190       // Allocate memory and copy the structure
2191       factors->iU_h = PetscIntKokkosViewHost(const_cast<PetscInt *>(Bi), m + 1); // wrap Bi as iU_h
2192       factors->jU_h = MatColIdxKokkosViewHost(NoInit("jU_h"), Bi[m]);
2193       factors->aU_h = MatScalarKokkosViewHost(NoInit("aU_h"), Bi[m]);
2194       factors->D_h  = MatScalarKokkosViewHost(NoInit("D_h"), m);
2195       factors->aU_d = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->aU_h);
2196       factors->D_d  = Kokkos::create_mirror_view(DefaultMemorySpace(), factors->D_h);
2197 
2198       // Build jU_h from the skewed Aj
2199       PetscInt *Uj = factors->jU_h.data();
2200       for (PetscInt i = 0; i < m; i++) {
2201         PetscInt ulen = Bi[i + 1] - Bi[i];
2202         Uj[Bi[i]]     = i;                                              // diagonal entry
2203         PetscCall(PetscArraycpy(Uj + Bi[i] + 1, Bj + Bi[i], ulen - 1)); // entries of U on the right of the diagonal
2204       }
2205 
2206       // Copy iU, jU to device
2207       PetscCallCXX(factors->iU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->iU_h));
2208       PetscCallCXX(factors->jU_d = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), factors->jU_h));
2209 
2210       // Copy row/col permutation to device
2211       IS        rowperm = ((Mat_SeqAIJ *)B->data)->row;
2212       PetscBool row_identity;
2213       PetscCall(ISIdentity(rowperm, &row_identity));
2214       if (!row_identity) {
2215         const PetscInt *ip;
2216 
2217         PetscCall(ISGetIndices(rowperm, &ip));
2218         PetscCallCXX(factors->rowperm = PetscIntKokkosView(NoInit("rowperm"), m));
2219         PetscCallCXX(Kokkos::deep_copy(factors->rowperm, PetscIntKokkosViewHost(const_cast<PetscInt *>(ip), m)));
2220         PetscCall(ISRestoreIndices(rowperm, &ip));
2221         PetscCall(PetscLogCpuToGpu(m * sizeof(PetscInt)));
2222       }
2223 
2224       // Create sptrsv handles for U and U^T
2225 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE)
2226       auto sptrsv_alg = SPTRSVAlgorithm::SPTRSV_CUSPARSE;
2227 #else
2228       auto sptrsv_alg = SPTRSVAlgorithm::SEQLVLSCHD_TP1;
2229 #endif
2230       factors->khU.create_sptrsv_handle(sptrsv_alg, m, false /* U is not lower tri */);
2231       factors->khUt.create_sptrsv_handle(sptrsv_alg, m, true /* U^T is lower tri */);
2232     }
2233     // These pointers were set MatCholeskyFactorNumeric_SeqAIJ(), so we always need to update them
2234     B->ops->solve          = MatSolve_SeqAIJKokkos_Cholesky;
2235     B->ops->solvetranspose = MatSolve_SeqAIJKokkos_Cholesky;
2236 
2237     // Copy the value
2238     PetscScalar *Ua = factors->aU_h.data();
2239     PetscScalar *D  = factors->D_h.data();
2240     for (PetscInt i = 0; i < m; i++) {
2241       D[i]      = Ba[Bdiag[i]];     // actually Aa[Adiag[i]] is the inverse of the diagonal
2242       Ua[Bi[i]] = (PetscScalar)1.0; // set the unit diagonal for U
2243       for (PetscInt k = 0; k < Bi[i + 1] - Bi[i] - 1; k++) Ua[Bi[i] + 1 + k] = -Ba[Bi[i] + k];
2244     }
2245     PetscCallCXX(Kokkos::deep_copy(factors->aU_d, factors->aU_h));
2246     PetscCallCXX(Kokkos::deep_copy(factors->D_d, factors->D_h));
2247 
2248     factors->sptrsv_symbolic_completed = PETSC_FALSE; // When numeric value changed, we must do these again
2249     factors->transpose_updated         = PETSC_FALSE;
2250   }
2251 
2252   B->ops->matsolve          = NULL;
2253   B->ops->matsolvetranspose = NULL;
2254   PetscFunctionReturn(PETSC_SUCCESS);
2255 }
2256 
MatICCFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS perm,const MatFactorInfo * info)2257 static PetscErrorCode MatICCFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2258 {
2259   PetscFunctionBegin;
2260   if (info->solveonhost) {
2261     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2262     PetscCall(MatSetType(B, MATSEQSBAIJ));
2263     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2264   }
2265 
2266   PetscCall(MatICCFactorSymbolic_SeqAIJ(B, A, perm, info));
2267 
2268   if (!info->solveonhost) {
2269     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2270     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2271     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2272     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2273   }
2274   PetscFunctionReturn(PETSC_SUCCESS);
2275 }
2276 
MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B,Mat A,IS perm,const MatFactorInfo * info)2277 static PetscErrorCode MatCholeskyFactorSymbolic_SeqAIJKokkos(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2278 {
2279   PetscFunctionBegin;
2280   if (info->solveonhost) {
2281     // If solve on host, we have to change the type, as eventually we need to call MatSolve_SeqSBAIJ_1_NaturalOrdering() etc.
2282     PetscCall(MatSetType(B, MATSEQSBAIJ));
2283     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
2284   }
2285 
2286   PetscCall(MatCholeskyFactorSymbolic_SeqAIJ(B, A, perm, info)); // it sets B's two ISes ((Mat_SeqAIJ*)B->data)->{row, col} to perm
2287 
2288   if (!info->solveonhost) {
2289     // If solve on device, B is still a MATSEQAIJKOKKOS, so we are good to allocate B->spptr
2290     PetscCheck(!B->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2291     PetscCallCXX(B->spptr = new Mat_SeqAIJKokkosTriFactors(B->rmap->n));
2292     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqAIJKokkos;
2293   }
2294   PetscFunctionReturn(PETSC_SUCCESS);
2295 }
2296 
2297 // The _Kokkos suffix means we will use Kokkos as a solver for the SeqAIJKokkos matrix
MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A,MatSolverType * type)2298 static PetscErrorCode MatFactorGetSolverType_SeqAIJKokkos_Kokkos(Mat A, MatSolverType *type)
2299 {
2300   PetscFunctionBegin;
2301   *type = MATSOLVERKOKKOS;
2302   PetscFunctionReturn(PETSC_SUCCESS);
2303 }
2304 
2305 /*MC
2306   MATSOLVERKOKKOS = "Kokkos" - A matrix solver type providing triangular solvers for sequential matrices
2307   on a single GPU of type, `MATSEQAIJKOKKOS`, `MATAIJKOKKOS`.
2308 
2309   Level: beginner
2310 
2311 .seealso: [](ch_matrices), `Mat`, `PCFactorSetMatSolverType()`, `MatSolverType`, `MatCreateSeqAIJKokkos()`, `MATAIJKOKKOS`
2312 M*/
MatGetFactor_SeqAIJKokkos_Kokkos(Mat A,MatFactorType ftype,Mat * B)2313 PETSC_EXTERN PetscErrorCode MatGetFactor_SeqAIJKokkos_Kokkos(Mat A, MatFactorType ftype, Mat *B) /* MatGetFactor_<MatType>_<MatSolverType> */
2314 {
2315   PetscInt n = A->rmap->n;
2316   MPI_Comm comm;
2317 
2318   PetscFunctionBegin;
2319   PetscCall(PetscObjectGetComm((PetscObject)A, &comm));
2320   PetscCall(MatCreate(comm, B));
2321   PetscCall(MatSetSizes(*B, n, n, n, n));
2322   PetscCall(MatSetBlockSizesFromMats(*B, A, A));
2323   (*B)->factortype = ftype;
2324   PetscCall(MatSetType(*B, MATSEQAIJKOKKOS));
2325   PetscCall(MatSeqAIJSetPreallocation(*B, MAT_SKIP_ALLOCATION, NULL));
2326   PetscCheck(!(*B)->spptr, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Expected a NULL spptr");
2327 
2328   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
2329     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqAIJKokkos;
2330     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqAIJKokkos;
2331     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
2332     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
2333     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
2334   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
2335     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqAIJKokkos;
2336     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqAIJKokkos;
2337     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
2338     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
2339   } else SETERRQ(comm, PETSC_ERR_SUP, "MatFactorType %s is not supported by MatType SeqAIJKokkos", MatFactorTypes[ftype]);
2340 
2341   // The factorization can use the ordering provided in MatLUFactorSymbolic(), MatCholeskyFactorSymbolic() etc, though we do it on host
2342   (*B)->canuseordering = PETSC_TRUE;
2343   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_SeqAIJKokkos_Kokkos));
2344   PetscFunctionReturn(PETSC_SUCCESS);
2345 }
2346 
MatSolverTypeRegister_Kokkos(void)2347 PETSC_INTERN PetscErrorCode MatSolverTypeRegister_Kokkos(void)
2348 {
2349   PetscFunctionBegin;
2350   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_LU, MatGetFactor_SeqAIJKokkos_Kokkos));
2351   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_CHOLESKY, MatGetFactor_SeqAIJKokkos_Kokkos));
2352   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ILU, MatGetFactor_SeqAIJKokkos_Kokkos));
2353   PetscCall(MatSolverTypeRegister(MATSOLVERKOKKOS, MATSEQAIJKOKKOS, MAT_FACTOR_ICC, MatGetFactor_SeqAIJKokkos_Kokkos));
2354   PetscFunctionReturn(PETSC_SUCCESS);
2355 }
2356 
2357 /* Utility to print out a KokkosCsrMatrix for debugging */
PrintCsrMatrix(const KokkosCsrMatrix & csrmat)2358 PETSC_INTERN PetscErrorCode PrintCsrMatrix(const KokkosCsrMatrix &csrmat)
2359 {
2360   const auto        &iv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.row_map);
2361   const auto        &jv = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.graph.entries);
2362   const auto        &av = Kokkos::create_mirror_view_and_copy(HostMirrorMemorySpace(), csrmat.values);
2363   const PetscInt    *i  = iv.data();
2364   const PetscInt    *j  = jv.data();
2365   const PetscScalar *a  = av.data();
2366   PetscInt           m = csrmat.numRows(), n = csrmat.numCols(), nnz = csrmat.nnz();
2367 
2368   PetscFunctionBegin;
2369   PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT " x %" PetscInt_FMT " SeqAIJKokkos, with %" PetscInt_FMT " nonzeros\n", m, n, nnz));
2370   for (PetscInt k = 0; k < m; k++) {
2371     PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT ": ", k));
2372     for (PetscInt p = i[k]; p < i[k + 1]; p++) PetscCall(PetscPrintf(PETSC_COMM_SELF, "%" PetscInt_FMT "(%.1f), ", j[p], (double)PetscRealPart(a[p])));
2373     PetscCall(PetscPrintf(PETSC_COMM_SELF, "\n"));
2374   }
2375   PetscFunctionReturn(PETSC_SUCCESS);
2376 }
2377