xref: /petsc/src/vec/vec/impls/seq/kokkos/veckok.kokkos.cxx (revision 27f169480a6668db3ba90f8ef6ef68d542d113fa)
1 /*
2    Implements the sequential Kokkos vectors.
3 */
4 #include <petsc_kokkos.hpp>
5 #include <petscvec_kokkos.hpp>
6 
7 #include <petsc/private/sfimpl.h>
8 #include <petsc/private/petscimpl.h>
9 #include <petscmath.h>
10 #include <petscviewer.h>
11 #include <KokkosBlas.hpp>
12 #include <Kokkos_Functional.hpp>
13 
14 #include <../src/vec/vec/impls/dvecimpl.h> /* for VecCreate_Seq_Private */
15 #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
16 
17 template <class MemorySpace>
VecGetKokkosView_Private(Vec v,PetscScalarKokkosViewType<MemorySpace> * kv,PetscBool overwrite)18 static PetscErrorCode VecGetKokkosView_Private(Vec v, PetscScalarKokkosViewType<MemorySpace> *kv, PetscBool overwrite)
19 {
20   Vec_Kokkos *veckok   = static_cast<Vec_Kokkos *>(v->spptr);
21   using ExecutionSpace = typename PetscScalarKokkosViewType<MemorySpace>::traits::device_type;
22 
23   PetscFunctionBegin;
24   VecErrorIfNotKokkos(v);
25   if (!overwrite) { /* If overwrite=true, no need to sync the space, since caller will overwrite the data */
26     PetscCallCXX(veckok->v_dual.sync<ExecutionSpace>());
27   }
28   PetscCallCXX(*kv = veckok->v_dual.view<ExecutionSpace>());
29   PetscFunctionReturn(PETSC_SUCCESS);
30 }
31 
32 template <class MemorySpace>
VecRestoreKokkosView_Private(Vec v,PetscScalarKokkosViewType<MemorySpace> * kv,PetscBool overwrite)33 static PetscErrorCode VecRestoreKokkosView_Private(Vec v, PetscScalarKokkosViewType<MemorySpace> *kv, PetscBool overwrite)
34 {
35   Vec_Kokkos *veckok   = static_cast<Vec_Kokkos *>(v->spptr);
36   using ExecutionSpace = typename PetscScalarKokkosViewType<MemorySpace>::traits::device_type;
37 
38   PetscFunctionBegin;
39   VecErrorIfNotKokkos(v);
40   if (overwrite) PetscCallCXX(veckok->v_dual.clear_sync_state()); /* If overwrite=true, clear the old sync state since user forced an overwrite */
41   PetscCallCXX(veckok->v_dual.modify<ExecutionSpace>());
42   PetscCall(PetscObjectStateIncrease((PetscObject)v));
43   PetscFunctionReturn(PETSC_SUCCESS);
44 }
45 
46 template <class MemorySpace>
VecGetKokkosView(Vec v,ConstPetscScalarKokkosViewType<MemorySpace> * kv)47 PetscErrorCode VecGetKokkosView(Vec v, ConstPetscScalarKokkosViewType<MemorySpace> *kv)
48 {
49   Vec_Kokkos *veckok   = static_cast<Vec_Kokkos *>(v->spptr);
50   using ExecutionSpace = typename PetscScalarKokkosViewType<MemorySpace>::traits::device_type;
51 
52   PetscFunctionBegin;
53   VecErrorIfNotKokkos(v);
54   PetscCallCXX(veckok->v_dual.sync<ExecutionSpace>());
55   PetscCallCXX(*kv = veckok->v_dual.view<ExecutionSpace>());
56   PetscFunctionReturn(PETSC_SUCCESS);
57 }
58 
/* Function template explicit instantiation */
template PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec, ConstPetscScalarKokkosView *);
/* Read-write access to the default-space view: sync the space first (overwrite=false) */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec v, PetscScalarKokkosView *kv)
{
  return VecGetKokkosView_Private(v, kv, PETSC_FALSE);
}
/* End read-write access: mark the space modified */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosView(Vec v, PetscScalarKokkosView *kv)
{
  return VecRestoreKokkosView_Private(v, kv, PETSC_FALSE);
}
/* Write-only access: skip the sync since the caller overwrites every entry (overwrite=true) */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosViewWrite(Vec v, PetscScalarKokkosView *kv)
{
  return VecGetKokkosView_Private(v, kv, PETSC_TRUE);
}
/* End write-only access: clear stale sync state, then mark the space modified */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosViewWrite(Vec v, PetscScalarKokkosView *kv)
{
  return VecRestoreKokkosView_Private(v, kv, PETSC_TRUE);
}
81 
#if !defined(KOKKOS_ENABLE_UNIFIED_MEMORY) /* Get host views if the default memory space is not host space */
template PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec, ConstPetscScalarKokkosViewHost *);
/* Read-write access to the host-space view: sync host data first (overwrite=false) */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec v, PetscScalarKokkosViewHost *kv)
{
  return VecGetKokkosView_Private(v, kv, PETSC_FALSE);
}
/* End read-write access to the host view: mark host modified */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosView(Vec v, PetscScalarKokkosViewHost *kv)
{
  return VecRestoreKokkosView_Private(v, kv, PETSC_FALSE);
}
/* Write-only access to the host view: no sync needed (overwrite=true) */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosViewWrite(Vec v, PetscScalarKokkosViewHost *kv)
{
  return VecGetKokkosView_Private(v, kv, PETSC_TRUE);
}
/* End write-only access to the host view: clear stale sync state, mark host modified */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosViewWrite(Vec v, PetscScalarKokkosViewHost *kv)
{
  return VecRestoreKokkosView_Private(v, kv, PETSC_TRUE);
}
#endif
105 
VecSetRandom_SeqKokkos(Vec xin,PetscRandom r)106 PetscErrorCode VecSetRandom_SeqKokkos(Vec xin, PetscRandom r)
107 {
108   const PetscInt n = xin->map->n;
109   PetscScalar   *xx;
110 
111   PetscFunctionBegin;
112   PetscCall(VecGetArrayWrite(xin, &xx)); /* TODO: generate randoms directly on device */
113   for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValue(r, &xx[i]));
114   PetscCall(VecRestoreArrayWrite(xin, &xx));
115   PetscFunctionReturn(PETSC_SUCCESS);
116 }
117 
118 /* x = |x| */
VecAbs_SeqKokkos(Vec xin)119 PetscErrorCode VecAbs_SeqKokkos(Vec xin)
120 {
121   PetscScalarKokkosView xv;
122   auto                  exec = PetscGetKokkosExecutionSpace();
123 
124   PetscFunctionBegin;
125   PetscCall(PetscLogGpuTimeBegin());
126   PetscCall(VecGetKokkosView(xin, &xv));
127   PetscCallCXX(KokkosBlas::abs(exec, xv, xv));
128   PetscCall(VecRestoreKokkosView(xin, &xv));
129   PetscCall(PetscLogGpuTimeEnd());
130   PetscFunctionReturn(PETSC_SUCCESS);
131 }
132 
133 /* x = 1/x */
VecReciprocal_SeqKokkos(Vec xin)134 PetscErrorCode VecReciprocal_SeqKokkos(Vec xin)
135 {
136   PetscScalarKokkosView xv;
137 
138   PetscFunctionBegin;
139   PetscCall(PetscLogGpuTimeBegin());
140   PetscCall(VecGetKokkosView(xin, &xv));
141   PetscCallCXX(Kokkos::parallel_for(
142     "VecReciprocal", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) {
143       if (xv(i) != (PetscScalar)0.0) xv(i) = (PetscScalar)1.0 / xv(i);
144     }));
145   PetscCall(VecRestoreKokkosView(xin, &xv));
146   PetscCall(PetscLogGpuTimeEnd());
147   PetscFunctionReturn(PETSC_SUCCESS);
148 }
149 
VecMin_SeqKokkos(Vec xin,PetscInt * p,PetscReal * val)150 PetscErrorCode VecMin_SeqKokkos(Vec xin, PetscInt *p, PetscReal *val)
151 {
152   ConstPetscScalarKokkosView                           xv;
153   Kokkos::MinFirstLoc<PetscReal, PetscInt>::value_type result;
154 
155   PetscFunctionBegin;
156   PetscCall(PetscLogGpuTimeBegin());
157   PetscCall(VecGetKokkosView(xin, &xv));
158   PetscCallCXX(Kokkos::parallel_reduce(
159     "VecMin", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n),
160     KOKKOS_LAMBDA(const PetscInt &i, Kokkos::MinFirstLoc<PetscReal, PetscInt>::value_type &lupdate) {
161       if (PetscRealPart(xv(i)) < lupdate.val) {
162         lupdate.val = PetscRealPart(xv(i));
163         lupdate.loc = i;
164       }
165     },
166     Kokkos::MinFirstLoc<PetscReal, PetscInt>(result)));
167   *val = result.val;
168   if (p) *p = result.loc;
169   PetscCall(VecRestoreKokkosView(xin, &xv));
170   PetscCall(PetscLogGpuTimeEnd());
171   PetscFunctionReturn(PETSC_SUCCESS);
172 }
173 
VecMax_SeqKokkos(Vec xin,PetscInt * p,PetscReal * val)174 PetscErrorCode VecMax_SeqKokkos(Vec xin, PetscInt *p, PetscReal *val)
175 {
176   ConstPetscScalarKokkosView                           xv;
177   Kokkos::MaxFirstLoc<PetscReal, PetscInt>::value_type result;
178 
179   PetscFunctionBegin;
180   PetscCall(PetscLogGpuTimeBegin());
181   PetscCall(VecGetKokkosView(xin, &xv));
182   PetscCallCXX(Kokkos::parallel_reduce(
183     "VecMax", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n),
184     KOKKOS_LAMBDA(const PetscInt &i, Kokkos::MaxFirstLoc<PetscReal, PetscInt>::value_type &lupdate) {
185       if (PetscRealPart(xv(i)) > lupdate.val) {
186         lupdate.val = PetscRealPart(xv(i));
187         lupdate.loc = i;
188       }
189     },
190     Kokkos::MaxFirstLoc<PetscReal, PetscInt>(result)));
191   *val = result.val;
192   if (p) *p = result.loc;
193   PetscCall(VecRestoreKokkosView(xin, &xv));
194   PetscCall(PetscLogGpuTimeEnd());
195   PetscFunctionReturn(PETSC_SUCCESS);
196 }
197 
VecSum_SeqKokkos(Vec xin,PetscScalar * sum)198 PetscErrorCode VecSum_SeqKokkos(Vec xin, PetscScalar *sum)
199 {
200   ConstPetscScalarKokkosView xv;
201 
202   PetscFunctionBegin;
203   PetscCall(PetscLogGpuTimeBegin());
204   PetscCall(VecGetKokkosView(xin, &xv));
205   PetscCallCXX(*sum = KokkosBlas::sum(PetscGetKokkosExecutionSpace(), xv));
206   PetscCall(VecRestoreKokkosView(xin, &xv));
207   PetscCall(PetscLogGpuTimeEnd());
208   PetscFunctionReturn(PETSC_SUCCESS);
209 }
210 
VecShift_SeqKokkos(Vec xin,PetscScalar shift)211 PetscErrorCode VecShift_SeqKokkos(Vec xin, PetscScalar shift)
212 {
213   PetscScalarKokkosView xv;
214 
215   PetscFunctionBegin;
216   PetscCall(PetscLogGpuTimeBegin());
217   PetscCall(VecGetKokkosView(xin, &xv));
218   PetscCallCXX(Kokkos::parallel_for("VecShift", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) { xv(i) += shift; }); PetscCall(VecRestoreKokkosView(xin, &xv)));
219   PetscCall(PetscLogGpuTimeEnd());
220   PetscCall(PetscLogGpuFlops(xin->map->n));
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
224 /* y = alpha x + y */
VecAXPY_SeqKokkos(Vec yin,PetscScalar alpha,Vec xin)225 PetscErrorCode VecAXPY_SeqKokkos(Vec yin, PetscScalar alpha, Vec xin)
226 {
227   PetscFunctionBegin;
228   if (alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
229   if (yin == xin) {
230     PetscCall(VecScale_SeqKokkos(yin, alpha + 1));
231   } else {
232     PetscBool xiskok, yiskok;
233 
234     PetscCall(PetscObjectTypeCompareAny((PetscObject)xin, &xiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
235     PetscCall(PetscObjectTypeCompareAny((PetscObject)yin, &yiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
236     if (xiskok && yiskok) {
237       PetscScalarKokkosView      yv;
238       ConstPetscScalarKokkosView xv;
239 
240       PetscCall(PetscLogGpuTimeBegin());
241       PetscCall(VecGetKokkosView(xin, &xv));
242       PetscCall(VecGetKokkosView(yin, &yv));
243       PetscCallCXX(KokkosBlas::axpy(PetscGetKokkosExecutionSpace(), alpha, xv, yv));
244       PetscCall(VecRestoreKokkosView(xin, &xv));
245       PetscCall(VecRestoreKokkosView(yin, &yv));
246       PetscCall(PetscLogGpuTimeEnd());
247       PetscCall(PetscLogGpuFlops(2.0 * yin->map->n));
248     } else {
249       PetscCall(VecAXPY_Seq(yin, alpha, xin));
250     }
251   }
252   PetscFunctionReturn(PETSC_SUCCESS);
253 }
254 
255 /* y = x + beta y */
VecAYPX_SeqKokkos(Vec yin,PetscScalar beta,Vec xin)256 PetscErrorCode VecAYPX_SeqKokkos(Vec yin, PetscScalar beta, Vec xin)
257 {
258   PetscFunctionBegin;
259   /* One needs to define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 to have optimizations for cases alpha/beta = 0,+/-1 */
260   PetscCall(VecAXPBY_SeqKokkos(yin, 1.0, beta, xin));
261   PetscFunctionReturn(PETSC_SUCCESS);
262 }
263 
264 /* z = y^T x */
VecTDot_SeqKokkos(Vec xin,Vec yin,PetscScalar * z)265 PetscErrorCode VecTDot_SeqKokkos(Vec xin, Vec yin, PetscScalar *z)
266 {
267   ConstPetscScalarKokkosView xv, yv;
268 
269   PetscFunctionBegin;
270   PetscCall(PetscLogGpuTimeBegin());
271   PetscCall(VecGetKokkosView(xin, &xv));
272   PetscCall(VecGetKokkosView(yin, &yv));
273   // Kokkos always overwrites z, so no need to init it
274   PetscCallCXX(Kokkos::parallel_reduce("VecTDot", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &update) { update += yv(i) * xv(i); }, *z));
275   PetscCall(VecRestoreKokkosView(yin, &yv));
276   PetscCall(VecRestoreKokkosView(xin, &xv));
277   PetscCall(PetscLogGpuTimeEnd());
278   if (xin->map->n > 0) PetscCall(PetscLogGpuFlops(2.0 * xin->map->n));
279   PetscFunctionReturn(PETSC_SUCCESS);
280 }
281 
/* Tag types that select which MDotFunctor::operator() overload a tagged RangePolicy dispatches:
   TransposeDotTag computes y^T x, ConjugateDotTag computes y^H x */
struct TransposeDotTag { };
struct ConjugateDotTag { };
284 
/* Reduction functor computing up to 8 dot products against a common x in a single pass.
   The policy's work tag (TransposeDotTag/ConjugateDotTag) picks transpose vs conjugate dots. */
template <PetscInt ValueCount>
struct MDotFunctor {
  static_assert(ValueCount >= 1 && ValueCount <= 8, "ValueCount must be in [1, 8]");
  /* Note the C++ notation for an array typedef: the reduction result is an array of scalars */
  typedef PetscScalar                           value_type[];
  typedef ConstPetscScalarKokkosView::size_type size_type;

  /* Tell Kokkos the result array's number of entries. This must be a public value in the functor */
  static constexpr size_type value_count = ValueCount;
  ConstPetscScalarKokkosView xv, yv[8]; /* only the first value_count entries of yv[] are ever accessed */

  MDotFunctor(ConstPetscScalarKokkosView &xv, ConstPetscScalarKokkosView &yv0, ConstPetscScalarKokkosView &yv1, ConstPetscScalarKokkosView &yv2, ConstPetscScalarKokkosView &yv3, ConstPetscScalarKokkosView &yv4, ConstPetscScalarKokkosView &yv5, ConstPetscScalarKokkosView &yv6, ConstPetscScalarKokkosView &yv7) :
    xv(xv)
  {
    yv[0] = yv0;
    yv[1] = yv1;
    yv[2] = yv2;
    yv[3] = yv3;
    yv[4] = yv4;
    yv[5] = yv5;
    yv[6] = yv6;
    yv[7] = yv7;
  }

  /* sum[j] += yv[j](i) * xv(i), no conjugation (y^T x) */
  KOKKOS_INLINE_FUNCTION void operator()(TransposeDotTag, const size_type i, value_type sum) const
  {
    PetscScalar xval = xv(i);
    for (size_type j = 0; j < value_count; ++j) sum[j] += yv[j](i) * xval;
  }

  /* sum[j] += conj(yv[j](i)) * xv(i)  (y^H x) */
  KOKKOS_INLINE_FUNCTION void operator()(ConjugateDotTag, const size_type i, value_type sum) const
  {
    PetscScalar xval = xv(i);
    for (size_type j = 0; j < value_count; ++j) sum[j] += PetscConj(yv[j](i)) * xval;
  }

  // Per https://kokkos.github.io/kokkos-core-wiki/API/core/parallel-dispatch/parallel_reduce.html#requirements
  // "when specifying a tag in the policy, the functor's potential init/join/final member functions must also be tagged"
  // So we have this kind of duplicated code.
  KOKKOS_INLINE_FUNCTION void join(TransposeDotTag, value_type dst, const value_type src) const { join(dst, src); }
  KOKKOS_INLINE_FUNCTION void join(ConjugateDotTag, value_type dst, const value_type src) const { join(dst, src); }

  KOKKOS_INLINE_FUNCTION void init(TransposeDotTag, value_type sum) const { init(sum); }
  KOKKOS_INLINE_FUNCTION void init(ConjugateDotTag, value_type sum) const { init(sum); }

  /* Combine two partial result arrays elementwise */
  KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
  {
    for (size_type j = 0; j < value_count; j++) dst[j] += src[j];
  }

  /* Set the reduction identity: all partial sums start at zero */
  KOKKOS_INLINE_FUNCTION void init(value_type sum) const
  {
    for (size_type j = 0; j < value_count; j++) sum[j] = 0.0;
  }
};
341 
/* Compute z[i] = dot(x, yin[i]) for nv vectors, processed 8 at a time with MDotFunctor;
   WorkTag (TransposeDotTag/ConjugateDotTag) selects the dot flavor. */
template <class WorkTag>
PetscErrorCode VecMultiDot_Private(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscInt                   i, j, cur = 0, ngroup = nv / 8, rem = nv % 8, N = xin->map->n;
  ConstPetscScalarKokkosView xv, yv[8];
  PetscScalarKokkosViewHost  zv(z, nv); /* host view wrapping the caller's output array z */
  auto                       exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCall(VecGetKokkosView(xin, &xv));
  for (i = 0; i < ngroup; i++) { /* 8 y's per group */
    for (j = 0; j < 8; j++) PetscCall(VecGetKokkosView(yin[cur + j], &yv[j]));
    MDotFunctor<8> mdot(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]); /* Hope Kokkos make it asynchronous */
    PetscCall(PetscLogGpuTimeBegin());
    /* reduce directly into the z[cur, cur+8) slice of the host view */
    PetscCallCXX(Kokkos::parallel_reduce(Kokkos::RangePolicy<WorkTag>(exec, 0, N), mdot, Kokkos::subview(zv, Kokkos::pair<PetscInt, PetscInt>(cur, cur + 8))));
    PetscCall(PetscLogGpuTimeEnd());
    for (j = 0; j < 8; j++) PetscCall(VecRestoreKokkosView(yin[cur + j], &yv[j]));
    cur += 8;
  }

  if (rem) { /* The remaining */
    for (j = 0; j < rem; j++) PetscCall(VecGetKokkosView(yin[cur + j], &yv[j]));
    Kokkos::RangePolicy<WorkTag> policy(exec, 0, N);
    auto                         results = Kokkos::subview(zv, Kokkos::pair<PetscInt, PetscInt>(cur, cur + rem));
    /* yv[rem..8) are default (empty) views; MDotFunctor<rem> only reads the first rem of them */
    // clang-format off
    PetscCall(PetscLogGpuTimeBegin());
    switch (rem) {
    case 1: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<1>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 2: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<2>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 3: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<3>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 4: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<4>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 5: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<5>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 6: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<6>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 7: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<7>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    }
    PetscCall(PetscLogGpuTimeEnd());
    // clang-format on
    for (j = 0; j < rem; j++) PetscCall(VecRestoreKokkosView(yin[cur + j], &yv[j]));
  }
  PetscCall(VecRestoreKokkosView(xin, &xv));
  exec.fence(); /* If reduce is async, then we need this fence to make sure z is ready for use on host */
  PetscFunctionReturn(PETSC_SUCCESS);
}
385 
/* z[i] = (x, y_i) with conjugation, computed by hand-unrolled parallel_reduce calls that take
   up to 8 reducers at once and write straight into the host array z. Kept alongside
   VecMultiDot_Private() because the functor version performs poorly with HIP (see VecMDot_SeqKokkos). */
static PetscErrorCode VecMultiDot_Verbose(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscInt                   ngroup = nv / 8, rem = nv % 8, N = xin->map->n;
  ConstPetscScalarKokkosView xv, y0, y1, y2, y3, y4, y5, y6, y7;
  PetscScalar               *zp = z;   /* walking output pointer, advanced 8 per group */
  const Vec                 *yp = yin; /* walking input pointer, advanced 8 per group */
  Kokkos::RangePolicy<>      policy(PetscGetKokkosExecutionSpace(), 0, N);

  // clang-format off
  PetscFunctionBegin;
  PetscCall(VecGetKokkosView(xin, &xv));
  for (PetscInt k = 0; k < ngroup; k++) { // 8 y's per group
    PetscCall(VecGetKokkosView(yp[0], &y0));
    PetscCall(VecGetKokkosView(yp[1], &y1));
    PetscCall(VecGetKokkosView(yp[2], &y2));
    PetscCall(VecGetKokkosView(yp[3], &y3));
    PetscCall(VecGetKokkosView(yp[4], &y4));
    PetscCall(VecGetKokkosView(yp[5], &y5));
    PetscCall(VecGetKokkosView(yp[6], &y6));
    PetscCall(VecGetKokkosView(yp[7], &y7));
    PetscCall(PetscLogGpuTimeBegin()); // only for GPU kernel execution
    Kokkos::parallel_reduce(
      "VecMDot8", policy,
      KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4, PetscScalar &lsum5, PetscScalar &lsum6, PetscScalar &lsum7) {
        lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
        lsum4 += xv(i) * PetscConj(y4(i)); lsum5 += xv(i) * PetscConj(y5(i)); lsum6 += xv(i) * PetscConj(y6(i)); lsum7 += xv(i) * PetscConj(y7(i));
      }, zp[0], zp[1], zp[2], zp[3], zp[4], zp[5], zp[6], zp[7]);
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuToCpu(8 * sizeof(PetscScalar))); // for copying to z[] on host
    PetscCall(VecRestoreKokkosView(yp[0], &y0));
    PetscCall(VecRestoreKokkosView(yp[1], &y1));
    PetscCall(VecRestoreKokkosView(yp[2], &y2));
    PetscCall(VecRestoreKokkosView(yp[3], &y3));
    PetscCall(VecRestoreKokkosView(yp[4], &y4));
    PetscCall(VecRestoreKokkosView(yp[5], &y5));
    PetscCall(VecRestoreKokkosView(yp[6], &y6));
    PetscCall(VecRestoreKokkosView(yp[7], &y7));
    yp += 8;
    zp += 8;
  }

  if (rem) { /* The remaining: one reduce with exactly rem reducers, picked by the switch */
    if (rem > 0) PetscCall(VecGetKokkosView(yp[0], &y0));
    if (rem > 1) PetscCall(VecGetKokkosView(yp[1], &y1));
    if (rem > 2) PetscCall(VecGetKokkosView(yp[2], &y2));
    if (rem > 3) PetscCall(VecGetKokkosView(yp[3], &y3));
    if (rem > 4) PetscCall(VecGetKokkosView(yp[4], &y4));
    if (rem > 5) PetscCall(VecGetKokkosView(yp[5], &y5));
    if (rem > 6) PetscCall(VecGetKokkosView(yp[6], &y6));
    PetscCall(PetscLogGpuTimeBegin());
    switch (rem) {
    case 7:
      Kokkos::parallel_reduce(
        "VecMDot7", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4, PetscScalar &lsum5, PetscScalar &lsum6) {
        lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
        lsum4 += xv(i) * PetscConj(y4(i)); lsum5 += xv(i) * PetscConj(y5(i)); lsum6 += xv(i) * PetscConj(y6(i));
      }, zp[0], zp[1], zp[2], zp[3], zp[4], zp[5], zp[6]);
      break;
    case 6:
      Kokkos::parallel_reduce(
        "VecMDot6", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4, PetscScalar &lsum5) {
        lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
        lsum4 += xv(i) * PetscConj(y4(i)); lsum5 += xv(i) * PetscConj(y5(i));
      }, zp[0], zp[1], zp[2], zp[3], zp[4], zp[5]);
      break;
    case 5:
      Kokkos::parallel_reduce(
        "VecMDot5", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4) {
        lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
        lsum4 += xv(i) * PetscConj(y4(i));
      }, zp[0], zp[1], zp[2], zp[3], zp[4]);
      break;
    case 4:
      Kokkos::parallel_reduce(
        "VecMDot4", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3) {
        lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
      }, zp[0], zp[1], zp[2], zp[3]);
      break;
    case 3:
      Kokkos::parallel_reduce(
        "VecMDot3", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2) {
        lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i));
      }, zp[0], zp[1], zp[2]);
      break;
    case 2:
      Kokkos::parallel_reduce(
        "VecMDot2", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1) {
        lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i));
      }, zp[0], zp[1]);
      break;
    case 1:
      Kokkos::parallel_reduce(
        "VecMDot1", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0) {
        lsum0 += xv(i) * PetscConj(y0(i));
      }, zp[0]);
      break;
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuToCpu(rem * sizeof(PetscScalar))); // for copying to z[] on host
    if (rem > 0) PetscCall(VecRestoreKokkosView(yp[0], &y0));
    if (rem > 1) PetscCall(VecRestoreKokkosView(yp[1], &y1));
    if (rem > 2) PetscCall(VecRestoreKokkosView(yp[2], &y2));
    if (rem > 3) PetscCall(VecRestoreKokkosView(yp[3], &y3));
    if (rem > 4) PetscCall(VecRestoreKokkosView(yp[4], &y4));
    if (rem > 5) PetscCall(VecRestoreKokkosView(yp[5], &y5));
    if (rem > 6) PetscCall(VecRestoreKokkosView(yp[6], &y6));
  }
  PetscCall(VecRestoreKokkosView(xin, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
  // clang-format on
}
504 
/* z[i] = (x,y_i) = y_i^H x */
PetscErrorCode VecMDot_SeqKokkos(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscFunctionBegin;
  // With no good reason, VecMultiDot_Private() performs much worse than VecMultiDot_Verbose() with HIP,
  // but they are on par with CUDA. Kokkos team is investigating this problem.
#if 0
  PetscCall(VecMultiDot_Private<ConjugateDotTag>(xin, nv, yin, z));
#else
  PetscCall(VecMultiDot_Verbose(xin, nv, yin, z));
#endif
  /* 2n-1 flops per dot product; PetscMax clamps at 0 for empty vectors (n = 0) */
  PetscCall(PetscLogGpuFlops(PetscMax(nv * (2.0 * xin->map->n - 1), 0.0)));
  PetscFunctionReturn(PETSC_SUCCESS);
}
519 
520 /* z[i] = (x,y_i) = y_i^T x */
VecMTDot_SeqKokkos(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)521 PetscErrorCode VecMTDot_SeqKokkos(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
522 {
523   PetscFunctionBegin;
524   PetscCall(VecMultiDot_Private<TransposeDotTag>(xin, nv, yin, z));
525   PetscCall(PetscLogGpuFlops(PetscMax(nv * (2.0 * xin->map->n - 1), 0.0)));
526   PetscFunctionReturn(PETSC_SUCCESS);
527 }
528 
// z[i] = (x,y_i) = y_i^H x OR y_i^T x
// Detects runs of y vectors laid out with a fixed stride (as produced by VecDuplicateVecs) and
// computes each run with a single KokkosBlas::gemv instead of nv individual dot products.
static PetscErrorCode VecMultiDot_SeqKokkos_GEMV(PetscBool conjugate, Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z_h)
{
  PetscInt                   i, j, nfail;
  ConstPetscScalarKokkosView xv, yfirst, ynext;
  const PetscScalar         *yarray;
  PetscBool                  stop  = PETSC_FALSE;
  PetscScalar               *z_d   = nullptr;            // device-side result buffer; with unified memory it aliases z_h
  const char                *trans = conjugate ? "C" : "T";
  PetscInt64                 lda   = 0;                  // 64-bit: pointer differences between unrelated allocations can be huge
  PetscInt                   m, n = xin->map->n;

  PetscFunctionBegin;
  PetscCall(VecGetKokkosView(xin, &xv));
#if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
  z_d = z_h;
#endif
  i = nfail = 0;
  while (i < nv) {
    // search a sequence of vectors with a fixed stride
    stop = PETSC_FALSE;
    PetscCall(VecGetKokkosView(yin[i], &yfirst));
    yarray = yfirst.data();
    for (j = i + 1; j < nv; j++) {
      PetscCall(VecGetKokkosView(yin[j], &ynext));
      if (j == i + 1) {
        lda = ynext.data() - yarray;                       // arbitrary ptrdiff could be very large
        if (lda < 0 || lda - n > 64) stop = PETSC_TRUE;    // avoid using arbitrary lda; 64 bytes are a big enough alignment in VecDuplicateVecs
      } else if (lda * (j - i) != ynext.data() - yarray) { // not in the same stride? if so, stop searching
        stop = PETSC_TRUE;
      }
      PetscCall(VecRestoreKokkosView(yin[j], &ynext));
      if (stop) break;
    }
    PetscCall(VecRestoreKokkosView(yin[i], &yfirst));

    // found m vectors yin[i..j) with a stride lda at address yarray
    m = j - i;
    if (m > 1) {
      if (!z_d) { // need a device buffer for gemv's result; grow the module-level pool if too small
        if (nv > PetscScalarPoolSize) { // rare case
          PetscScalarPoolSize = nv;
          PetscCallCXX(PetscScalarPool = static_cast<PetscScalar *>(Kokkos::kokkos_realloc(PetscScalarPool, PetscScalarPoolSize)));
        }
        z_d = PetscScalarPool;
      }
      // view the strided run as an n x m column-major matrix Y, then z[i..j) = Y^{T or H} x
      const auto &A  = Kokkos::View<const PetscScalar **, Kokkos::LayoutLeft>(yarray, lda, m);
      const auto &Y  = Kokkos::subview(A, std::pair<PetscInt, PetscInt>(0, n), Kokkos::ALL);
      auto        zv = PetscScalarKokkosDualView(PetscScalarKokkosView(z_d + i, m), PetscScalarKokkosViewHost(z_h + i, m));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCXX(KokkosBlas::gemv(PetscGetKokkosExecutionSpace(), trans, 1.0, Y, xv, 0.0, zv.view_device()));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCXX(zv.modify_device());
      PetscCall(KokkosDualViewSyncHost(zv, PetscGetKokkosExecutionSpace())); // bring the result back to z_h
      PetscCall(PetscLogGpuFlops(PetscMax(m * (2.0 * n - 1), 0.0)));
    } else {
      // we only allow falling back on VecDot once, to avoid doing VecMultiDot via individual VecDots
      if (nfail++ == 0) {
        if (conjugate) PetscCall(VecDot_SeqKokkos(xin, yin[i], z_h + i));
        else PetscCall(VecTDot_SeqKokkos(xin, yin[i], z_h + i));
      } else break; // break the while loop
    }
    i = j;
  }
  PetscCall(VecRestoreKokkosView(xin, &xv));
  if (i < nv) { // finish the remaining if any
    if (conjugate) PetscCall(VecMDot_SeqKokkos(xin, nv - i, yin + i, z_h + i));
    else PetscCall(VecMTDot_SeqKokkos(xin, nv - i, yin + i, z_h + i));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
600 
VecMDot_SeqKokkos_GEMV(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)601 PetscErrorCode VecMDot_SeqKokkos_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
602 {
603   PetscFunctionBegin;
604   PetscCall(VecMultiDot_SeqKokkos_GEMV(PETSC_TRUE, xin, nv, yin, z)); // conjugate
605   PetscFunctionReturn(PETSC_SUCCESS);
606 }
607 
VecMTDot_SeqKokkos_GEMV(Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z)608 PetscErrorCode VecMTDot_SeqKokkos_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
609 {
610   PetscFunctionBegin;
611   PetscCall(VecMultiDot_SeqKokkos_GEMV(PETSC_FALSE, xin, nv, yin, z)); // transpose
612   PetscFunctionReturn(PETSC_SUCCESS);
613 }
614 
615 /* x[:] = alpha */
VecSet_SeqKokkos(Vec xin,PetscScalar alpha)616 PetscErrorCode VecSet_SeqKokkos(Vec xin, PetscScalar alpha)
617 {
618   PetscScalarKokkosView xv;
619   auto                  exec = PetscGetKokkosExecutionSpace();
620 
621   PetscFunctionBegin;
622   PetscCall(PetscLogGpuTimeBegin());
623   PetscCall(VecGetKokkosViewWrite(xin, &xv));
624   PetscCallCXX(KokkosBlas::fill(exec, xv, alpha));
625   PetscCall(VecRestoreKokkosViewWrite(xin, &xv));
626   PetscCall(PetscLogGpuTimeEnd());
627   PetscFunctionReturn(PETSC_SUCCESS);
628 }
629 
630 /* x = alpha x */
VecScale_SeqKokkos(Vec xin,PetscScalar alpha)631 PetscErrorCode VecScale_SeqKokkos(Vec xin, PetscScalar alpha)
632 {
633   auto exec = PetscGetKokkosExecutionSpace();
634 
635   PetscFunctionBegin;
636   if (alpha == (PetscScalar)0.0) {
637     PetscCall(VecSet_SeqKokkos(xin, alpha));
638   } else if (alpha != (PetscScalar)1.0) {
639     PetscScalarKokkosView xv;
640 
641     PetscCall(PetscLogGpuTimeBegin());
642     PetscCall(VecGetKokkosView(xin, &xv));
643     PetscCallCXX(KokkosBlas::scal(exec, xv, alpha, xv));
644     PetscCall(VecRestoreKokkosView(xin, &xv));
645     PetscCall(PetscLogGpuTimeEnd());
646     PetscCall(PetscLogGpuFlops(xin->map->n));
647   }
648   PetscFunctionReturn(PETSC_SUCCESS);
649 }
650 
651 /* z = y^H x */
VecDot_SeqKokkos(Vec xin,Vec yin,PetscScalar * z)652 PetscErrorCode VecDot_SeqKokkos(Vec xin, Vec yin, PetscScalar *z)
653 {
654   ConstPetscScalarKokkosView xv, yv;
655   auto                       exec = PetscGetKokkosExecutionSpace();
656 
657   PetscFunctionBegin;
658   PetscCall(PetscLogGpuTimeBegin());
659   PetscCall(VecGetKokkosView(xin, &xv));
660   PetscCall(VecGetKokkosView(yin, &yv));
661   PetscCallCXX(*z = KokkosBlas::dot(exec, yv, xv)); /* KokkosBlas::dot(a,b) takes conjugate of a */
662   PetscCall(VecRestoreKokkosView(xin, &xv));
663   PetscCall(VecRestoreKokkosView(yin, &yv));
664   PetscCall(PetscLogGpuTimeEnd());
665   PetscCall(PetscLogGpuFlops(PetscMax(2.0 * xin->map->n - 1, 0.0)));
666   PetscFunctionReturn(PETSC_SUCCESS);
667 }
668 
/* y = x, where x is VECKOKKOS, but y may be not.
   Copies on whichever side (host or device) of x holds the freshest data, so no unnecessary transfer is made. */
PetscErrorCode VecCopy_SeqKokkos(Vec xin, Vec yin)
{
  auto exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (xin != yin) {
    Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(xin->spptr);
    if (yin->offloadmask == PETSC_OFFLOAD_KOKKOS) {
      /* y is also a VecKokkos */
      Vec_Kokkos *ykok = static_cast<Vec_Kokkos *>(yin->spptr);
      /* Kokkos rule: if x's host has newer data, it will copy to y's host view; otherwise to y's device view.
        In case x's host is newer while y's device is newer, deep_copy would error (though arguably it should
        not, since y is being overwritten anyway). So we just clear y's sync state first.
       */
      ykok->v_dual.clear_sync_state();
      PetscCallCXX(Kokkos::deep_copy(exec, ykok->v_dual, xkok->v_dual)); // either cpu2cpu or gpu2gpu (same-space copy), so don't log it
    } else {
      /* y is a plain host vector: wrap its array in an unmanaged host view and copy into that */
      PetscScalar *yarray;
      PetscCall(VecGetArrayWrite(yin, &yarray));
      PetscScalarKokkosViewHost yv(yarray, yin->map->n);
      if (xkok->v_dual.need_sync_host()) {                                     // x's device has newer data
        PetscCallCXX(Kokkos::deep_copy(exec, yv, xkok->v_dual.view_device())); // gpu2cpu
        PetscCallCXX(exec.fence());                                            // finish the asynchronous deep copy before y's host array is used
        PetscCall(PetscLogGpuToCpu(xkok->v_dual.extent(0) * sizeof(PetscScalar)));
      } else {
        PetscCallCXX(exec.fence());                                          // make sure xkok->v_dual.view_host() is ready for use on host; Kokkos might also call it inside deep_copy(). We do it here for safety.
        PetscCallCXX(Kokkos::deep_copy(exec, yv, xkok->v_dual.view_host())); // Host view to host view deep copy, done on host
      }
      PetscCall(VecRestoreArrayWrite(yin, &yarray));
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
705 
706 /* y[i] <--> x[i] */
VecSwap_SeqKokkos(Vec xin,Vec yin)707 PetscErrorCode VecSwap_SeqKokkos(Vec xin, Vec yin)
708 {
709   PetscFunctionBegin;
710   if (xin != yin) {
711     PetscScalarKokkosView xv, yv;
712 
713     PetscCall(PetscLogGpuTimeBegin());
714     PetscCall(VecGetKokkosView(xin, &xv));
715     PetscCall(VecGetKokkosView(yin, &yv));
716     PetscCallCXX(Kokkos::parallel_for(
717       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) {
718         PetscScalar tmp = xv(i);
719         xv(i)           = yv(i);
720         yv(i)           = tmp;
721       }));
722     PetscCall(VecRestoreKokkosView(xin, &xv));
723     PetscCall(VecRestoreKokkosView(yin, &yv));
724     PetscCall(PetscLogGpuTimeEnd());
725   }
726   PetscFunctionReturn(PETSC_SUCCESS);
727 }
728 
729 /*  w = alpha x + y */
VecWAXPY_SeqKokkos(Vec win,PetscScalar alpha,Vec xin,Vec yin)730 PetscErrorCode VecWAXPY_SeqKokkos(Vec win, PetscScalar alpha, Vec xin, Vec yin)
731 {
732   PetscFunctionBegin;
733   if (alpha == (PetscScalar)0.0) {
734     PetscCall(VecCopy_SeqKokkos(yin, win));
735   } else {
736     ConstPetscScalarKokkosView xv, yv;
737     PetscScalarKokkosView      wv;
738 
739     PetscCall(PetscLogGpuTimeBegin());
740     PetscCall(VecGetKokkosViewWrite(win, &wv));
741     PetscCall(VecGetKokkosView(xin, &xv));
742     PetscCall(VecGetKokkosView(yin, &yv));
743     PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, win->map->n), KOKKOS_LAMBDA(const PetscInt &i) { wv(i) = alpha * xv(i) + yv(i); }));
744     PetscCall(VecRestoreKokkosView(xin, &xv));
745     PetscCall(VecRestoreKokkosView(yin, &yv));
746     PetscCall(VecRestoreKokkosViewWrite(win, &wv));
747     PetscCall(PetscLogGpuTimeEnd());
748     PetscCall(PetscLogGpuFlops(2.0 * win->map->n));
749   }
750   PetscFunctionReturn(PETSC_SUCCESS);
751 }
752 
/* Functor for a fused multi-AXPY: y(i) += sum_{j < ValueCount} a[j] * xv[j](i).
   Up to 8 (coefficient, vector) pairs are folded into a single kernel launch. */
template <PetscInt ValueCount>
struct MAXPYFunctor {
  static_assert(ValueCount >= 1 && ValueCount <= 8, "ValueCount must be in [1, 8]");
  typedef ConstPetscScalarKokkosView::size_type size_type;

  PetscScalarKokkosView      yv;    // vector being updated
  PetscScalar                a[8];  // coefficients; only the first ValueCount entries are read
  ConstPetscScalarKokkosView xv[8]; // input vectors; only the first ValueCount entries are read

  /* All 8 slots are always passed so the constructor signature is the same for every ValueCount;
     the trailing unused arguments (default-constructed views) are never dereferenced by operator(). */
  MAXPYFunctor(PetscScalarKokkosView yv, PetscScalar a0, PetscScalar a1, PetscScalar a2, PetscScalar a3, PetscScalar a4, PetscScalar a5, PetscScalar a6, PetscScalar a7, ConstPetscScalarKokkosView xv0, ConstPetscScalarKokkosView xv1, ConstPetscScalarKokkosView xv2, ConstPetscScalarKokkosView xv3, ConstPetscScalarKokkosView xv4, ConstPetscScalarKokkosView xv5, ConstPetscScalarKokkosView xv6, ConstPetscScalarKokkosView xv7) :
    yv(yv)
  {
    a[0]  = a0;
    a[1]  = a1;
    a[2]  = a2;
    a[3]  = a3;
    a[4]  = a4;
    a[5]  = a5;
    a[6]  = a6;
    a[7]  = a7;
    xv[0] = xv0;
    xv[1] = xv1;
    xv[2] = xv2;
    xv[3] = xv3;
    xv[4] = xv4;
    xv[5] = xv5;
    xv[6] = xv6;
    xv[7] = xv7;
  }

  KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
  {
    // ValueCount is a compile-time constant, so this loop can be fully unrolled by the compiler
    for (PetscInt j = 0; j < ValueCount; ++j) yv(i) += a[j] * xv[j](i);
  }
};
788 
/*  y = y + sum alpha[i] x[i]
    The nv input vectors are processed in groups of 8 through MAXPYFunctor, fusing up to
    8 AXPYs per kernel launch; a final launch handles the remaining nv % 8 vectors. */
PetscErrorCode VecMAXPY_SeqKokkos(Vec yin, PetscInt nv, const PetscScalar *alpha, Vec *xin)
{
  PetscInt                   i, j, cur = 0, ngroup = nv / 8, rem = nv % 8, N = yin->map->n;
  PetscScalarKokkosView      yv;
  PetscScalar                a[8];
  ConstPetscScalarKokkosView xv[8];
  Kokkos::RangePolicy<>      policy(PetscGetKokkosExecutionSpace(), 0, N);

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecGetKokkosView(yin, &yv));
  for (i = 0; i < ngroup; i++) { /* 8 x's per group */
    for (j = 0; j < 8; j++) {    /* Fill the parameters */
      a[j] = alpha[cur + j];
      PetscCall(VecGetKokkosView(xin[cur + j], &xv[j]));
    }
    MAXPYFunctor<8> maxpy(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]);
    PetscCallCXX(Kokkos::parallel_for(policy, maxpy));
    for (j = 0; j < 8; j++) PetscCall(VecRestoreKokkosView(xin[cur + j], &xv[j]));
    cur += 8;
  }

  if (rem) { /* The remaining 1..7 vectors; dispatch on rem to pick the right compile-time ValueCount */
    for (j = 0; j < rem; j++) {
      a[j] = alpha[cur + j];
      PetscCall(VecGetKokkosView(xin[cur + j], &xv[j]));
    }
    /* Slots >= rem hold stale/default a[] and xv[] values, but MAXPYFunctor<rem> never reads them */
    // clang-format off
    switch (rem) {
    case 1: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<1>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 2: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<2>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 3: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<3>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 4: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<4>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 5: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<5>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 6: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<6>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 7: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<7>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    }
    // clang-format on
    for (j = 0; j < rem; j++) PetscCall(VecRestoreKokkosView(xin[cur + j], &xv[j]));
  }
  PetscCall(VecRestoreKokkosView(yin, &yv));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(nv * 2.0 * yin->map->n)); // one multiply and one add per entry per input vector
  PetscFunctionReturn(PETSC_SUCCESS);
}
835 
836 /*  y = y + sum alpha[i] x[i] */
VecMAXPY_SeqKokkos_GEMV(Vec yin,PetscInt nv,const PetscScalar * a_h,Vec * xin)837 PetscErrorCode VecMAXPY_SeqKokkos_GEMV(Vec yin, PetscInt nv, const PetscScalar *a_h, Vec *xin)
838 {
839   const PetscInt             n = yin->map->n;
840   PetscInt                   i, j, nfail;
841   PetscScalarKokkosView      yv;
842   ConstPetscScalarKokkosView xfirst, xnext;
843   PetscBool                  stop = PETSC_FALSE;
844   PetscInt                   lda  = 0, m;
845   const PetscScalar         *xarray;
846   PetscScalar               *a_d = nullptr;
847 
848   PetscFunctionBegin;
849   PetscCall(PetscLogGpuTimeBegin());
850   PetscCall(VecGetKokkosView(yin, &yv));
851 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
852   a_d = const_cast<PetscScalar *>(a_h);
853 #endif
854   i = nfail = 0;
855   while (i < nv) {
856     stop = PETSC_FALSE;
857     PetscCall(VecGetKokkosView(xin[i], &xfirst));
858     xarray = xfirst.data();
859     for (j = i + 1; j < nv; j++) {
860       PetscCall(VecGetKokkosView(xin[j], &xnext));
861       if (j == i + 1) {
862         lda = xnext.data() - xfirst.data();
863         if (lda < 0 || lda - n > 64) stop = PETSC_TRUE;    // avoid using arbitrary lda; 64 bytes are a big enough alignment in VecDuplicateVecs
864       } else if (lda * (j - i) != xnext.data() - xarray) { // not in the same stride? if so, stop here
865         stop = PETSC_TRUE;
866       }
867       PetscCall(VecRestoreKokkosView(xin[j], &xnext));
868       if (stop) break;
869     }
870     PetscCall(VecRestoreKokkosView(xin[i], &xfirst));
871 
872     m = j - i;
873     if (m > 1) {
874       if (!a_d) {
875         if (nv > PetscScalarPoolSize) { // rare case
876           PetscScalarPoolSize = nv;
877           PetscCallCXX(PetscScalarPool = static_cast<PetscScalar *>(Kokkos::kokkos_realloc(PetscScalarPool, PetscScalarPoolSize)));
878         }
879         a_d = PetscScalarPool;
880       }
881       const auto &B  = Kokkos::View<const PetscScalar **, Kokkos::LayoutLeft>(xarray, lda, m);
882       const auto &A  = Kokkos::subview(B, std::pair<PetscInt, PetscInt>(0, n), Kokkos::ALL);
883       auto        av = PetscScalarKokkosDualView(PetscScalarKokkosView(a_d + i, m), PetscScalarKokkosViewHost(const_cast<PetscScalar *>(a_h) + i, m));
884       av.modify_host();
885       PetscCall(KokkosDualViewSyncDevice(av, PetscGetKokkosExecutionSpace()));
886       PetscCallCXX(KokkosBlas::gemv(PetscGetKokkosExecutionSpace(), "N", 1.0, A, av.view_device(), 1.0, yv));
887       PetscCall(PetscLogGpuFlops(m * 2.0 * n));
888     } else {
889       // we only allow falling back on VecAXPY once
890       if (nfail++ == 0) PetscCall(VecAXPY_SeqKokkos(yin, a_h[i], xin[i]));
891       else break; // break the while loop
892     }
893     i = j;
894   }
895   // finish the remaining if any
896   PetscCall(VecRestoreKokkosView(yin, &yv));
897   if (i < nv) PetscCall(VecMAXPY_SeqKokkos(yin, nv - i, a_h + i, xin + i));
898   PetscCall(PetscLogGpuTimeEnd());
899   PetscFunctionReturn(PETSC_SUCCESS);
900 }
901 
902 /* y = alpha x + beta y */
VecAXPBY_SeqKokkos(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)903 PetscErrorCode VecAXPBY_SeqKokkos(Vec yin, PetscScalar alpha, PetscScalar beta, Vec xin)
904 {
905   PetscBool xiskok, yiskok;
906 
907   PetscFunctionBegin;
908   PetscCall(PetscObjectTypeCompareAny((PetscObject)xin, &xiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
909   PetscCall(PetscObjectTypeCompareAny((PetscObject)yin, &yiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
910   if (xiskok && yiskok) {
911     ConstPetscScalarKokkosView xv;
912     PetscScalarKokkosView      yv;
913 
914     PetscCall(PetscLogGpuTimeBegin());
915     PetscCall(VecGetKokkosView(xin, &xv));
916     PetscCall(VecGetKokkosView(yin, &yv));
917     PetscCallCXX(KokkosBlas::axpby(PetscGetKokkosExecutionSpace(), alpha, xv, beta, yv));
918     PetscCall(VecRestoreKokkosView(xin, &xv));
919     PetscCall(VecRestoreKokkosView(yin, &yv));
920     PetscCall(PetscLogGpuTimeEnd());
921     if (alpha == (PetscScalar)0.0 || beta == (PetscScalar)0.0) {
922       PetscCall(PetscLogGpuFlops(xin->map->n));
923     } else if (beta == (PetscScalar)1.0 || alpha == (PetscScalar)1.0) {
924       PetscCall(PetscLogGpuFlops(2.0 * xin->map->n));
925     } else {
926       PetscCall(PetscLogGpuFlops(3.0 * xin->map->n));
927     }
928   } else {
929     PetscCall(VecAXPBY_Seq(yin, alpha, beta, xin));
930   }
931   PetscFunctionReturn(PETSC_SUCCESS);
932 }
933 
/* z = alpha x + beta y + gamma z
   The gamma == 0 cases are handled with hand-written kernels that skip reading z;
   the general case uses KokkosBlas::update. */
PetscErrorCode VecAXPBYPCZ_SeqKokkos(Vec zin, PetscScalar alpha, PetscScalar beta, PetscScalar gamma, Vec xin, Vec yin)
{
  ConstPetscScalarKokkosView xv, yv;
  PetscScalarKokkosView      zv;
  Kokkos::RangePolicy<>      policy(PetscGetKokkosExecutionSpace(), 0, zin->map->n);

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecGetKokkosView(zin, &zv));
  PetscCall(VecGetKokkosView(xin, &xv));
  PetscCall(VecGetKokkosView(yin, &yv));
  if (gamma == (PetscScalar)0.0) { // a common case: z's old content is irrelevant
    if (alpha == -beta) {
      PetscCallCXX(Kokkos::parallel_for( // a common case: z = alpha (x - y), one multiply instead of two
        policy, KOKKOS_LAMBDA(const PetscInt &i) { zv(i) = alpha * (xv(i) - yv(i)); }));
    } else {
      PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i) { zv(i) = alpha * xv(i) + beta * yv(i); }));
    }
  } else {
    PetscCallCXX(KokkosBlas::update(PetscGetKokkosExecutionSpace(), alpha, xv, beta, yv, gamma, zv)); // full three-term update
  }
  PetscCall(VecRestoreKokkosView(xin, &xv));
  PetscCall(VecRestoreKokkosView(yin, &yv));
  PetscCall(VecRestoreKokkosView(zin, &zv));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(zin->map->n * 5.0)); // counts the general case: 3 multiplies + 2 adds per entry
  PetscFunctionReturn(PETSC_SUCCESS);
}
963 
/* w = x*y. Any subset of the x, y, and w may be the same vector.

  w is of type VecKokkos, but x, y may be not. If either x or y is not offloaded to Kokkos,
  the product is computed on the host to avoid pulling them to the device.
*/
PetscErrorCode VecPointwiseMult_SeqKokkos(Vec win, Vec xin, Vec yin)
{
  PetscInt n;

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecGetLocalSize(win, &n));
  if (xin->offloadmask != PETSC_OFFLOAD_KOKKOS || yin->offloadmask != PETSC_OFFLOAD_KOKKOS) {
    /* host path: wrap the raw host arrays in unmanaged host views */
    PetscScalarKokkosViewHost wv;
    const PetscScalar        *xp, *yp;
    PetscCall(VecGetArrayRead(xin, &xp));
    PetscCall(VecGetArrayRead(yin, &yp));
    PetscCall(VecGetKokkosViewWrite(win, &wv));

    ConstPetscScalarKokkosViewHost xv(xp, n), yv(yp, n);
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n), KOKKOS_LAMBDA(const PetscInt &i) { wv(i) = xv(i) * yv(i); }));

    PetscCall(VecRestoreArrayRead(xin, &xp));
    PetscCall(VecRestoreArrayRead(yin, &yp));
    PetscCall(VecRestoreKokkosViewWrite(win, &wv));
  } else {
    /* device path: all three vectors are Kokkos vectors */
    ConstPetscScalarKokkosView xv, yv;
    PetscScalarKokkosView      wv;

    PetscCall(VecGetKokkosViewWrite(win, &wv));
    PetscCall(VecGetKokkosView(xin, &xv));
    PetscCall(VecGetKokkosView(yin, &yv));
    PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt &i) { wv(i) = xv(i) * yv(i); }));
    PetscCall(VecRestoreKokkosView(yin, &yv));
    PetscCall(VecRestoreKokkosView(xin, &xv));
    PetscCall(VecRestoreKokkosViewWrite(win, &wv));
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(n)); // one multiply per entry
  PetscFunctionReturn(PETSC_SUCCESS);
}
1004 
1005 /* w = x/y */
VecPointwiseDivide_SeqKokkos(Vec win,Vec xin,Vec yin)1006 PetscErrorCode VecPointwiseDivide_SeqKokkos(Vec win, Vec xin, Vec yin)
1007 {
1008   PetscInt n;
1009 
1010   PetscFunctionBegin;
1011   PetscCall(PetscLogGpuTimeBegin());
1012   PetscCall(VecGetLocalSize(win, &n));
1013   if (xin->offloadmask != PETSC_OFFLOAD_KOKKOS || yin->offloadmask != PETSC_OFFLOAD_KOKKOS) {
1014     PetscScalarKokkosViewHost wv;
1015     const PetscScalar        *xp, *yp;
1016     PetscCall(VecGetArrayRead(xin, &xp));
1017     PetscCall(VecGetArrayRead(yin, &yp));
1018     PetscCall(VecGetKokkosViewWrite(win, &wv));
1019 
1020     ConstPetscScalarKokkosViewHost xv(xp, n), yv(yp, n);
1021     PetscCallCXX(Kokkos::parallel_for(
1022       Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n), KOKKOS_LAMBDA(const PetscInt &i) {
1023         if (yv(i) != 0.0) wv(i) = xv(i) / yv(i);
1024         else wv(i) = 0.0;
1025       }));
1026 
1027     PetscCall(VecRestoreArrayRead(xin, &xp));
1028     PetscCall(VecRestoreArrayRead(yin, &yp));
1029     PetscCall(VecRestoreKokkosViewWrite(win, &wv));
1030   } else {
1031     ConstPetscScalarKokkosView xv, yv;
1032     PetscScalarKokkosView      wv;
1033 
1034     PetscCall(VecGetKokkosViewWrite(win, &wv));
1035     PetscCall(VecGetKokkosView(xin, &xv));
1036     PetscCall(VecGetKokkosView(yin, &yv));
1037     PetscCallCXX(Kokkos::parallel_for(
1038       Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt &i) {
1039         if (yv(i) != 0.0) wv(i) = xv(i) / yv(i);
1040         else wv(i) = 0.0;
1041       }));
1042     PetscCall(VecRestoreKokkosView(yin, &yv));
1043     PetscCall(VecRestoreKokkosView(xin, &xv));
1044     PetscCall(VecRestoreKokkosViewWrite(win, &wv));
1045   }
1046   PetscCall(PetscLogGpuTimeEnd());
1047   PetscCall(PetscLogGpuFlops(win->map->n));
1048   PetscFunctionReturn(PETSC_SUCCESS);
1049 }
1050 
/* Compute the requested norm of x into z; NORM_1_AND_2 recurses and fills z[0] (1-norm) and z[1] (2-norm) */
PetscErrorCode VecNorm_SeqKokkos(Vec xin, NormType type, PetscReal *z)
{
  const PetscInt             n = xin->map->n;
  ConstPetscScalarKokkosView xv;
  auto                       exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  if (type == NORM_1_AND_2) {
    /* two separate reductions; z must have room for two reals */
    PetscCall(VecNorm_SeqKokkos(xin, NORM_1, z));
    PetscCall(VecNorm_SeqKokkos(xin, NORM_2, z + 1));
  } else {
    PetscCall(PetscLogGpuTimeBegin());
    PetscCall(VecGetKokkosView(xin, &xv));
    if (type == NORM_2 || type == NORM_FROBENIUS) {
      PetscCallCXX(*z = KokkosBlas::nrm2(exec, xv));
      PetscCall(PetscLogGpuFlops(PetscMax(2.0 * n - 1, 0.0))); // n multiplies + (n-1) adds
    } else if (type == NORM_1) {
      PetscCallCXX(*z = KokkosBlas::nrm1(exec, xv));
      PetscCall(PetscLogGpuFlops(PetscMax(n - 1.0, 0.0))); // (n-1) adds; abs not counted
    } else if (type == NORM_INFINITY) {
      PetscCallCXX(*z = KokkosBlas::nrminf(exec, xv)); // max reduction, no flops logged
    }
    PetscCall(VecRestoreKokkosView(xin, &xv));
    PetscCall(PetscLogGpuTimeEnd());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1078 
/* Compute weighted error norms between U and Y (or of a given error vector E).
   For each entry i with |y(i)| >= ignore_max and |u(i)| >= ignore_max:
     err  = |E(i)| if E is given, else |y(i) - u(i)|
     tola = atol (or vatol(i) if given)
     tolr = rtol (or vrtol(i) if given) * max(|u(i)|, |y(i)|)
     tol  = tola + tolr
   The ratios err/tol, err/tola, err/tolr are reduced with max (NORM_INFINITY) or
   sum-of-squares then sqrt (otherwise) into *norm, *norma, *normr; the *_loc outputs
   count the entries that contributed to each reduction (those with a positive tolerance). */
PetscErrorCode VecErrorWeightedNorms_SeqKokkos(Vec U, Vec Y, Vec E, NormType wnormtype, PetscReal atol, Vec vatol, PetscReal rtol, Vec vrtol, PetscReal ignore_max, PetscReal *norm, PetscInt *norm_loc, PetscReal *norma, PetscInt *norma_loc, PetscReal *normr, PetscInt *normr_loc)
{
  ConstPetscScalarKokkosView u, y, erra, atola, rtola;
  PetscBool                  has_E = PETSC_FALSE, has_atol = PETSC_FALSE, has_rtol = PETSC_FALSE;
  PetscInt                   n, n_loc = 0, na_loc = 0, nr_loc = 0;
  PetscReal                  nrm = 0, nrma = 0, nrmr = 0;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(U, &n));
  PetscCall(VecGetKokkosView(U, &u));
  PetscCall(VecGetKokkosView(Y, &y));
  /* E, vatol, vrtol are optional; flags tell the kernels which views are valid */
  if (E) {
    PetscCall(VecGetKokkosView(E, &erra));
    has_E = PETSC_TRUE;
  }
  if (vatol) {
    PetscCall(VecGetKokkosView(vatol, &atola));
    has_atol = PETSC_TRUE;
  }
  if (vrtol) {
    PetscCall(VecGetKokkosView(vrtol, &rtola));
    has_rtol = PETSC_TRUE;
  }

  if (wnormtype == NORM_INFINITY) {
    /* six-way reduction: three max-reductions over the ratios, three counters */
    PetscCallCXX(Kokkos::parallel_reduce(
      "VecErrorWeightedNorms_INFINITY", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n),
      KOKKOS_LAMBDA(const PetscInt &i, PetscReal &l_nrm, PetscReal &l_nrma, PetscReal &l_nrmr, PetscInt &l_n_loc, PetscInt &l_na_loc, PetscInt &l_nr_loc) {
        PetscReal err, tol, tola, tolr, l_atol, l_rtol;
        if (PetscAbsScalar(y(i)) >= ignore_max && PetscAbsScalar(u(i)) >= ignore_max) {
          l_atol = has_atol ? PetscRealPart(atola(i)) : atol;
          l_rtol = has_rtol ? PetscRealPart(rtola(i)) : rtol;
          err    = has_E ? PetscAbsScalar(erra(i)) : PetscAbsScalar(y(i) - u(i));
          tola   = l_atol;
          tolr   = l_rtol * PetscMax(PetscAbsScalar(u(i)), PetscAbsScalar(y(i)));
          tol    = tola + tolr;
          if (tola > 0.) { // only entries with a positive tolerance contribute
            l_nrma = PetscMax(l_nrma, err / tola);
            l_na_loc++;
          }
          if (tolr > 0.) {
            l_nrmr = PetscMax(l_nrmr, err / tolr);
            l_nr_loc++;
          }
          if (tol > 0.) {
            l_nrm = PetscMax(l_nrm, err / tol);
            l_n_loc++;
          }
        }
      },
      Kokkos::Max<PetscReal>(nrm), Kokkos::Max<PetscReal>(nrma), Kokkos::Max<PetscReal>(nrmr), n_loc, na_loc, nr_loc));
  } else {
    /* same structure, but accumulate sums of squared ratios (default sum reducers) */
    PetscCallCXX(Kokkos::parallel_reduce(
      "VecErrorWeightedNorms_NORM_2", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n),
      KOKKOS_LAMBDA(const PetscInt &i, PetscReal &l_nrm, PetscReal &l_nrma, PetscReal &l_nrmr, PetscInt &l_n_loc, PetscInt &l_na_loc, PetscInt &l_nr_loc) {
        PetscReal err, tol, tola, tolr, l_atol, l_rtol;
        if (PetscAbsScalar(y(i)) >= ignore_max && PetscAbsScalar(u(i)) >= ignore_max) {
          l_atol = has_atol ? PetscRealPart(atola(i)) : atol;
          l_rtol = has_rtol ? PetscRealPart(rtola(i)) : rtol;
          err    = has_E ? PetscAbsScalar(erra(i)) : PetscAbsScalar(y(i) - u(i));
          tola   = l_atol;
          tolr   = l_rtol * PetscMax(PetscAbsScalar(u(i)), PetscAbsScalar(y(i)));
          tol    = tola + tolr;
          if (tola > 0.) {
            l_nrma += PetscSqr(err / tola);
            l_na_loc++;
          }
          if (tolr > 0.) {
            l_nrmr += PetscSqr(err / tolr);
            l_nr_loc++;
          }
          if (tol > 0.) {
            l_nrm += PetscSqr(err / tol);
            l_n_loc++;
          }
        }
      },
      nrm, nrma, nrmr, n_loc, na_loc, nr_loc));
  }

  if (wnormtype == NORM_2) {
    /* finish the 2-norms; for other (non-infinity) norm types the raw sums are returned as-is */
    *norm  = PetscSqrtReal(nrm);
    *norma = PetscSqrtReal(nrma);
    *normr = PetscSqrtReal(nrmr);
  } else {
    *norm  = nrm;
    *norma = nrma;
    *normr = nrmr;
  }
  *norm_loc  = n_loc;
  *norma_loc = na_loc;
  *normr_loc = nr_loc;

  if (E) PetscCall(VecRestoreKokkosView(E, &erra));
  if (vatol) PetscCall(VecRestoreKokkosView(vatol, &atola));
  if (vrtol) PetscCall(VecRestoreKokkosView(vrtol, &rtola));
  PetscCall(VecRestoreKokkosView(U, &u));
  PetscCall(VecRestoreKokkosView(Y, &y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1179 
/* A functor for DotNorm2 so that we can compute dp and nm in one kernel.
   Uses Kokkos' array-valued reduction (value_type is PetscScalar[]); result[0]
   accumulates y^H x and result[1] accumulates y^H y. */
struct DotNorm2 {
  typedef PetscScalar                           value_type[]; // array reduction: value_count entries per thread
  typedef ConstPetscScalarKokkosView::size_type size_type;

  size_type                  value_count; // required by Kokkos for runtime-sized array reductions; always 2 here
  ConstPetscScalarKokkosView xv_, yv_;    /* first and second vectors in VecDotNorm2. The order matters. */

  DotNorm2(ConstPetscScalarKokkosView &xv, ConstPetscScalarKokkosView &yv) : value_count(2), xv_(xv), yv_(yv) { }

  /* per-element contribution: result[0] += conj(y_i) x_i, result[1] += conj(y_i) y_i */
  KOKKOS_INLINE_FUNCTION void operator()(const size_type i, value_type result) const
  {
    result[0] += PetscConj(yv_(i)) * xv_(i);
    result[1] += PetscConj(yv_(i)) * yv_(i);
  }

  /* combine partial results from two threads/teams */
  KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
  {
    dst[0] += src[0];
    dst[1] += src[1];
  }

  /* identity of the reduction */
  KOKKOS_INLINE_FUNCTION void init(value_type result) const
  {
    result[0] = 0.0;
    result[1] = 0.0;
  }
};
1208 
1209 /* dp = y^H x, nm = y^H y */
VecDotNorm2_SeqKokkos(Vec xin,Vec yin,PetscScalar * dp,PetscScalar * nm)1210 PetscErrorCode VecDotNorm2_SeqKokkos(Vec xin, Vec yin, PetscScalar *dp, PetscScalar *nm)
1211 {
1212   ConstPetscScalarKokkosView xv, yv;
1213   PetscScalar                result[2];
1214 
1215   PetscFunctionBegin;
1216   PetscCall(PetscLogGpuTimeBegin());
1217   PetscCall(VecGetKokkosView(xin, &xv));
1218   PetscCall(VecGetKokkosView(yin, &yv));
1219   DotNorm2 dn(xv, yv);
1220   PetscCallCXX(Kokkos::parallel_reduce(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), dn, result));
1221   *dp = result[0];
1222   *nm = result[1];
1223   PetscCall(VecRestoreKokkosView(yin, &yv));
1224   PetscCall(VecRestoreKokkosView(xin, &xv));
1225   PetscCall(PetscLogGpuTimeEnd());
1226   PetscCall(PetscLogGpuFlops(4.0 * xin->map->n));
1227   PetscFunctionReturn(PETSC_SUCCESS);
1228 }
1229 
VecConjugate_SeqKokkos(Vec xin)1230 PetscErrorCode VecConjugate_SeqKokkos(Vec xin)
1231 {
1232   PetscScalarKokkosView xv;
1233 
1234   PetscFunctionBegin;
1235   PetscCall(PetscLogGpuTimeBegin());
1236   PetscCall(VecGetKokkosView(xin, &xv));
1237   PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) { xv(i) = PetscConj(xv(i)); }));
1238   PetscCall(VecRestoreKokkosView(xin, &xv));
1239   PetscCall(PetscLogGpuTimeEnd());
1240   PetscFunctionReturn(PETSC_SUCCESS);
1241 }
1242 
/* Temporarily replace the array in vin with a[]. Return to the original array with a call to VecResetArray() */
PetscErrorCode VecPlaceArray_SeqKokkos(Vec vin, const PetscScalar *a)
{
  Vec_Seq    *vecseq = (Vec_Seq *)vin->data;
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(vin->spptr);

  PetscFunctionBegin;
  PetscCall(VecPlaceArray_Seq(vin, a));                                 // swap the user's array in at the Vec_Seq level
  PetscCall(veckok->UpdateArray<HostMirrorMemorySpace>(vecseq->array)); // rebuild the dual view's host side over the new array
  PetscFunctionReturn(PETSC_SUCCESS);
}
1254 
/* Undo VecPlaceArray_SeqKokkos: restore the vector's original host array */
PetscErrorCode VecResetArray_SeqKokkos(Vec vin)
{
  Vec_Seq    *vecseq = (Vec_Seq *)vin->data;
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(vin->spptr);

  PetscFunctionBegin;
  /* User wants to unhook the provided host array. Sync it so that user can get the latest */
  PetscCall(KokkosDualViewSyncHost(veckok->v_dual, PetscGetKokkosExecutionSpace()));
  PetscCall(VecResetArray_Seq(vin)); /* Swap back the old host array, assuming it has the latest value */
  PetscCall(veckok->UpdateArray<HostMirrorMemorySpace>(vecseq->array)); // rebuild the dual view's host side over the restored array
  PetscFunctionReturn(PETSC_SUCCESS);
}
1267 
1268 /*@C
1269   VecKokkosPlaceArray - Allows one to replace the device array in a VecKokkos vector with a
1270   device array provided by the user. This is useful to avoid copying an array into a vector.
1271 
1272   Logically Collective; No Fortran Support
1273 
1274   Input Parameters:
1275 + v - the VecKokkos vector
1276 - a - the device array
1277 
1278   Level: developer
1279 
  Notes:
  You can return to the original array with a call to `VecKokkosResetArray()`. `v` does not take
  ownership of `a` in any way.

  The user manages the device array, so PETSc does not care how it was allocated.

  The user must free `a` themselves, but must be careful not to
  do so before the vector has either been destroyed, had its original array restored with
  `VecKokkosResetArray()`, or had the array permanently replaced with `VecReplaceArray()`.
1290 
1291 .seealso: [](ch_vectors), `Vec`, `VecGetArray()`, `VecKokkosResetArray()`, `VecReplaceArray()`, `VecResetArray()`
1292 @*/
PetscErrorCode VecKokkosPlaceArray(Vec v, PetscScalar *a)
{
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);

  PetscFunctionBegin;
  VecErrorIfNotKokkos(v);
  // Sync the old device view before replacing it; so that when it is put back, it has the saved value.
  PetscCall(KokkosDualViewSyncDevice(veckok->v_dual, PetscGetKokkosExecutionSpace()));
  PetscCallCXX(veckok->unplaced_d = veckok->v_dual.view_device()); // stash the original device view for VecKokkosResetArray()
  // We assume a[] contains the latest data and discard the vector's old sync state
  PetscCall(veckok->UpdateArray<DefaultMemorySpace>(a));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1306 
1307 /*@C
1308   VecKokkosResetArray - Resets a vector to use its default memory. Call this
1309   after the use of `VecKokkosPlaceArray()`.
1310 
1311   Not Collective
1312 
1313   Input Parameter:
1314 . v - the vector
1315 
1316   Level: developer
1317 
  Notes:
  After this call, the original array placed in with `VecKokkosPlaceArray()` will contain the latest value of the vector.
  Note that device kernels are asynchronous. Users are responsible for syncing the device if they wish to have immediate
  access to the data in the array. Also, after this call, `v` will contain whatever data it had before `VecKokkosPlaceArray()` was called.
1323 
1324 .seealso: [](ch_vectors), `Vec`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecKokkosPlaceArray()`
1325 @*/
PetscErrorCode VecKokkosResetArray(Vec v)
{
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);

  PetscFunctionBegin;
  VecErrorIfNotKokkos(v);
  // User wants to unhook the provided device array. Sync it so that user can get the latest
  PetscCall(KokkosDualViewSyncDevice(veckok->v_dual, PetscGetKokkosExecutionSpace()));
  // Put the unplaced device array (saved by VecKokkosPlaceArray()) back, and set an appropriate modify flag
  PetscCall(veckok->UpdateArray<DefaultMemorySpace>(veckok->unplaced_d.data()));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1338 
/* Replace the array in vin with a[] that must be allocated by PetscMalloc. a[] is owned by vin afterwards. */
PetscErrorCode VecReplaceArray_SeqKokkos(Vec vin, const PetscScalar *a)
{
  Vec_Seq    *vecseq = (Vec_Seq *)vin->data;
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(vin->spptr);

  PetscFunctionBegin;
  /* Make sure the user's array has the latest values; only needed when the current array was
     placed by the user (i.e., differs from the one the vector allocated) */
  if (vecseq->array != vecseq->array_allocated) PetscCall(KokkosDualViewSyncHost(veckok->v_dual, PetscGetKokkosExecutionSpace()));
  PetscCall(VecReplaceArray_Seq(vin, a));
  PetscCall(veckok->UpdateArray<HostMirrorMemorySpace>(vecseq->array)); // rebuild the dual view's host side over the new array
  PetscFunctionReturn(PETSC_SUCCESS);
}
1352 
1353 /* Maps the local portion of vector v into vector w */
VecGetLocalVector_SeqKokkos(Vec v,Vec w)1354 PetscErrorCode VecGetLocalVector_SeqKokkos(Vec v, Vec w)
1355 {
1356   Vec_Seq    *vecseq = static_cast<Vec_Seq *>(w->data);
1357   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(w->spptr);
1358 
1359   PetscFunctionBegin;
1360   PetscCheckTypeNames(w, VECSEQKOKKOS, VECMPIKOKKOS);
1361   /* Destroy w->data, w->spptr */
1362   if (vecseq) {
1363     PetscCall(PetscFree(vecseq->array_allocated));
1364     PetscCall(PetscFree(w->data));
1365   }
1366   delete veckok;
1367 
1368   /* Replace with v's */
1369   w->data  = v->data;
1370   w->spptr = v->spptr;
1371   PetscCall(PetscObjectStateIncrease((PetscObject)w));
1372   PetscFunctionReturn(PETSC_SUCCESS);
1373 }
1374 
VecRestoreLocalVector_SeqKokkos(Vec v,Vec w)1375 PetscErrorCode VecRestoreLocalVector_SeqKokkos(Vec v, Vec w)
1376 {
1377   PetscFunctionBegin;
1378   PetscCheckTypeNames(w, VECSEQKOKKOS, VECMPIKOKKOS);
1379   v->data  = w->data;
1380   v->spptr = w->spptr;
1381   PetscCall(PetscObjectStateIncrease((PetscObject)v));
1382   /* TODO: need to think if setting w->data/spptr to NULL is safe */
1383   w->data  = NULL;
1384   w->spptr = NULL;
1385   PetscFunctionReturn(PETSC_SUCCESS);
1386 }
1387 
VecGetArray_SeqKokkos(Vec v,PetscScalar ** a)1388 PetscErrorCode VecGetArray_SeqKokkos(Vec v, PetscScalar **a)
1389 {
1390   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1391 
1392   PetscFunctionBegin;
1393   PetscCall(KokkosDualViewSyncHost(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1394   *a = *((PetscScalar **)v->data);
1395   PetscFunctionReturn(PETSC_SUCCESS);
1396 }
1397 
VecRestoreArray_SeqKokkos(Vec v,PetscScalar ** a)1398 PetscErrorCode VecRestoreArray_SeqKokkos(Vec v, PetscScalar **a)
1399 {
1400   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1401 
1402   PetscFunctionBegin;
1403   PetscCallCXX(veckok->v_dual.modify_host());
1404   PetscFunctionReturn(PETSC_SUCCESS);
1405 }
1406 
1407 /* Get array on host to overwrite, so no need to sync host. In VecRestoreArrayWrite() we will mark host is modified. */
VecGetArrayWrite_SeqKokkos(Vec v,PetscScalar ** a)1408 PetscErrorCode VecGetArrayWrite_SeqKokkos(Vec v, PetscScalar **a)
1409 {
1410   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1411 
1412   PetscFunctionBegin;
1413   PetscCallCXX(veckok->v_dual.clear_sync_state());
1414   *a = veckok->v_dual.view_host().data();
1415   PetscFunctionReturn(PETSC_SUCCESS);
1416 }
1417 
VecGetArrayAndMemType_SeqKokkos(Vec v,PetscScalar ** a,PetscMemType * mtype)1418 PetscErrorCode VecGetArrayAndMemType_SeqKokkos(Vec v, PetscScalar **a, PetscMemType *mtype)
1419 {
1420   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1421 
1422   PetscFunctionBegin;
1423   /* Always return up-to-date in the default memory space */
1424   PetscCall(KokkosDualViewSyncDevice(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1425   *a = veckok->v_dual.view_device().data();
1426   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; // Could be PETSC_MEMTYPE_HOST when Kokkos was not configured with cuda etc.
1427   PetscFunctionReturn(PETSC_SUCCESS);
1428 }
1429 
/* Counterpart of VecGetArrayAndMemType_SeqKokkos(): the pointer handed out lived in the
   default (Kokkos) memory space, so mark that side of the DualView as modified.

   Note the condition below is a configuration-time constant, not a per-call test:
   PETSC_MEMTYPE_KOKKOS equals PETSC_MEMTYPE_HOST when Kokkos was not configured with
   cuda etc. (see the comment in VecGetArrayAndMemType_SeqKokkos()), in which case it
   is the host side that was written. */
PetscErrorCode VecRestoreArrayAndMemType_SeqKokkos(Vec v, PetscScalar **a)
{
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);

  PetscFunctionBegin;
  if (PetscMemTypeHost(PETSC_MEMTYPE_KOKKOS)) {
    PetscCallCXX(veckok->v_dual.modify_host());
  } else {
    PetscCallCXX(veckok->v_dual.modify_device());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1442 
VecGetArrayWriteAndMemType_SeqKokkos(Vec v,PetscScalar ** a,PetscMemType * mtype)1443 PetscErrorCode VecGetArrayWriteAndMemType_SeqKokkos(Vec v, PetscScalar **a, PetscMemType *mtype)
1444 {
1445   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1446 
1447   PetscFunctionBegin;
1448   // Since the data will be overwritten, we clear the sync state to suppress potential memory copying in sync'ing
1449   PetscCallCXX(veckok->v_dual.clear_sync_state()); // So that in restore, we can safely modify_device()
1450   PetscCall(KokkosDualViewSyncDevice(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1451   *a = veckok->v_dual.view_device().data();
1452   if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; // Could be PETSC_MEMTYPE_HOST when Kokkos was not configured with cuda etc.
1453   PetscFunctionReturn(PETSC_SUCCESS);
1454 }
1455 
/* Copy xin's sync state to y.

   Note the inversion: need_sync_host() means the DEVICE side holds the newer data,
   so the corresponding state on y is "device modified", and vice versa. If x is fully
   synced, y's cleared state (no side modified) is already correct. */
static PetscErrorCode VecCopySyncState_Kokkos_Private(Vec xin, Vec yout)
{
  Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(xin->spptr);
  Vec_Kokkos *ykok = static_cast<Vec_Kokkos *>(yout->spptr);

  PetscFunctionBegin;
  PetscCallCXX(ykok->v_dual.clear_sync_state());
  if (xkok->v_dual.need_sync_host()) { /* x's device has the newer data */
    PetscCallCXX(ykok->v_dual.modify_device());
  } else if (xkok->v_dual.need_sync_device()) { /* x's host has the newer data */
    PetscCallCXX(ykok->v_dual.modify_host());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1471 
1472 static PetscErrorCode VecCreateSeqKokkosWithArrays_Private(MPI_Comm, PetscInt, PetscInt, const PetscScalar[], const PetscScalar[], Vec *);
1473 
/* Internal routine shared by VecGetSubVector_{SeqKokkos,MPIKokkos}.

   If the index set is contiguous, the subvector y is created in place, sharing x's host
   and device arrays offset by the start index (no copy); otherwise the selected entries
   are gathered into a stand-alone vector through a VecScatter. */
PetscErrorCode VecGetSubVector_Kokkos_Private(Vec x, PetscBool xIsMPI, IS is, Vec *y)
{
  PetscBool contig;            /* is the index set a contiguous range? */
  PetscInt  n, N, start, bs;   /* local/global sizes of is, start of the range, block size */
  MPI_Comm  comm;
  Vec       z;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
  PetscCall(ISGetLocalSize(is, &n));
  PetscCall(ISGetSize(is, &N));
  PetscCall(VecGetSubVectorContiguityAndBS_Private(x, is, &contig, &start, &bs));

  if (contig) { /* We can do a no-copy (in-place) implementation with y sharing x's arrays */
    Vec_Kokkos        *xkok    = static_cast<Vec_Kokkos *>(x->spptr);
    const PetscScalar *array_h = xkok->v_dual.view_host().data() + start;
    const PetscScalar *array_d = xkok->v_dual.view_device().data() + start;

    /* These calls assume the input arrays are synced */
    if (xIsMPI) PetscCall(VecCreateMPIKokkosWithArrays_Private(comm, bs, n, N, array_h, array_d, &z)); /* x could be MPI even when x's comm size = 1 */
    else PetscCall(VecCreateSeqKokkosWithArrays_Private(comm, bs, n, array_h, array_d, &z));

    PetscCall(VecCopySyncState_Kokkos_Private(x, z)); /* Copy x's sync state to z */

    /* This is relevant only in debug mode */
    PetscInt state = 0;
    PetscCall(VecLockGet(x, &state));
    if (state) { /* x is either in read or read/write mode, therefore z, overlapped with x, can only be in read mode */
      PetscCall(VecLockReadPush(z));
    }

    z->ops->placearray   = NULL; /* z's arrays can't be replaced, because z does not own them */
    z->ops->replacearray = NULL;

  } else { /* Have to create a VecScatter and a stand-alone vector */
    PetscCall(VecGetSubVectorThroughVecScatter_Private(x, is, bs, &z));
  }
  *y = z;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1515 
VecGetSubVector_SeqKokkos(Vec x,IS is,Vec * y)1516 static PetscErrorCode VecGetSubVector_SeqKokkos(Vec x, IS is, Vec *y)
1517 {
1518   PetscFunctionBegin;
1519   PetscCall(VecGetSubVector_Kokkos_Private(x, PETSC_FALSE, is, y));
1520   PetscFunctionReturn(PETSC_SUCCESS);
1521 }
1522 
/* Restore subvector y to x */
PetscErrorCode VecRestoreSubVector_SeqKokkos(Vec x, IS is, Vec *y)
{
  VecScatter                    vscat;
  PETSC_UNUSED PetscObjectState dummystate = 0;
  PetscBool                     unchanged;

  PetscFunctionBegin;
  PetscCall(PetscObjectComposedDataGetInt((PetscObject)*y, VecGetSubVectorSavedStateId, dummystate, unchanged));
  if (unchanged) PetscFunctionReturn(PETSC_SUCCESS); /* If y's state has not changed since VecGetSubVector(), we only need to destroy it */

  /* A composed scatter means y was created as a stand-alone vector; push its values back into x */
  PetscCall(PetscObjectQuery((PetscObject)*y, "VecGetSubVector_Scatter", (PetscObject *)&vscat));
  if (vscat) {
    PetscCall(VecScatterBegin(vscat, *y, x, INSERT_VALUES, SCATTER_REVERSE));
    PetscCall(VecScatterEnd(vscat, *y, x, INSERT_VALUES, SCATTER_REVERSE));
  } else { /* y and x's (host and device) arrays overlap */
    Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(x->spptr);
    Vec_Kokkos *ykok = static_cast<Vec_Kokkos *>((*y)->spptr);
    PetscInt    state;

    PetscCall(VecLockGet(x, &state));
    PetscCheck(!state, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "Vec x is locked for read-only or read/write access");

    /* The tricky part: one has to carefully sync the arrays. Since y aliases a slice of x,
       syncing y moves only the subvector's data, which suffices to make x consistent. */
    auto exec = PetscGetKokkosExecutionSpace();
    if (xkok->v_dual.need_sync_device()) { /* x's host has newer data */
      /* Move y's latest values to host (since y is just a subset of x) */
      PetscCall(KokkosDualViewSyncHost(ykok->v_dual, exec));
    } else if (xkok->v_dual.need_sync_host()) {                /* x's device has newer data */
      PetscCall(KokkosDualViewSyncDevice(ykok->v_dual, exec)); /* Move y's latest data to device */
    } else {                                                   /* x's host and device data is already sync'ed; Copy y's sync state to x */
      PetscCall(VecCopySyncState_Kokkos_Private(*y, x));
    }
    PetscCall(PetscObjectStateIncrease((PetscObject)x)); /* Since x is updated */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1560 
VecSetPreallocationCOO_SeqKokkos(Vec x,PetscCount ncoo,const PetscInt coo_i[])1561 static PetscErrorCode VecSetPreallocationCOO_SeqKokkos(Vec x, PetscCount ncoo, const PetscInt coo_i[])
1562 {
1563   Vec_Seq    *vecseq = static_cast<Vec_Seq *>(x->data);
1564   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(x->spptr);
1565   PetscInt    m;
1566 
1567   PetscFunctionBegin;
1568   PetscCall(VecSetPreallocationCOO_Seq(x, ncoo, coo_i));
1569   PetscCall(VecGetLocalSize(x, &m));
1570   PetscCall(veckok->SetUpCOO(vecseq, m));
1571   PetscFunctionReturn(PETSC_SUCCESS);
1572 }
1573 
/* Add (ADD_VALUES) or insert (INSERT_VALUES) the COO values v[] into x, using the maps built
   at preallocation time: entry i of x receives the sum of v[perm1(k)] for k in
   [jmap1(i), jmap1(i+1)). v[] may be given in host or device memory. */
static PetscErrorCode VecSetValuesCOO_SeqKokkos(Vec x, const PetscScalar v[], InsertMode imode)
{
  Vec_Seq                    *vecseq = static_cast<Vec_Seq *>(x->data);
  Vec_Kokkos                 *veckok = static_cast<Vec_Kokkos *>(x->spptr);
  const PetscCountKokkosView &jmap1  = veckok->jmap1_d; /* row pointers into perm1, built in VecSetPreallocationCOO_SeqKokkos() */
  const PetscCountKokkosView &perm1  = veckok->perm1_d; /* permutation from COO order to per-entry groups */
  PetscScalarKokkosView       xv; /* View for vector x */
  ConstPetscScalarKokkosView  vv; /* View for array v[] */
  PetscInt                    m;
  PetscMemType                memtype;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &m));
  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCXX(vv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstPetscScalarKokkosViewHost(v, vecseq->coo_n)));
  } else {
    PetscCallCXX(vv = ConstPetscScalarKokkosView(v, vecseq->coo_n)); /* Directly use v[]'s memory */
  }

  if (imode == INSERT_VALUES) PetscCall(VecGetKokkosViewWrite(x, &xv)); /* write vector */
  else PetscCall(VecGetKokkosView(x, &xv));                             /* read & write vector */

  /* One work item per vector entry; each sums its (possibly repeated) COO contributions */
  PetscCallCXX(Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, m), KOKKOS_LAMBDA(const PetscInt &i) {
      PetscScalar sum = 0.0;
      for (PetscCount k = jmap1(i); k < jmap1(i + 1); k++) sum += vv(perm1(k));
      xv(i) = (imode == INSERT_VALUES ? 0.0 : xv(i)) + sum; /* insert overwrites, add accumulates */
    }));

  if (imode == INSERT_VALUES) PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  else PetscCall(VecRestoreKokkosView(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1608 
1609 static PetscErrorCode VecCreate_SeqKokkos_Common(Vec); // forward declaration
1610 
1611 /* Duplicate layout etc but not the values in the input vector */
VecDuplicate_SeqKokkos(Vec win,Vec * v)1612 static PetscErrorCode VecDuplicate_SeqKokkos(Vec win, Vec *v)
1613 {
1614   PetscFunctionBegin;
1615   PetscCall(VecDuplicate_Seq(win, v)); /* It also dups ops of win */
1616   PetscCall(VecCreate_SeqKokkos_Common(*v));
1617   (*v)->ops[0] = win->ops[0]; // recover the ops[]. We need to always follow ops in win
1618   PetscFunctionReturn(PETSC_SUCCESS);
1619 }
1620 
VecDestroy_SeqKokkos(Vec v)1621 static PetscErrorCode VecDestroy_SeqKokkos(Vec v)
1622 {
1623   Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1624   Vec_Seq    *vecseq = static_cast<Vec_Seq *>(v->data);
1625 
1626   PetscFunctionBegin;
1627   delete veckok;
1628   v->spptr = NULL;
1629   if (vecseq) PetscCall(VecDestroy_Seq(v));
1630   PetscFunctionReturn(PETSC_SUCCESS);
1631 }
1632 
// Shared by all VecCreate/Duplicate routines for VecSeqKokkos.
// Installs the Kokkos implementations into the vector's ops table and marks the
// vector with the Kokkos offload mask.
static PetscErrorCode VecCreate_SeqKokkos_Common(Vec v)
{
  PetscFunctionBegin;
  // entry-wise and BLAS-1 style operations
  v->ops->abs             = VecAbs_SeqKokkos;
  v->ops->reciprocal      = VecReciprocal_SeqKokkos;
  v->ops->pointwisemult   = VecPointwiseMult_SeqKokkos;
  v->ops->min             = VecMin_SeqKokkos;
  v->ops->max             = VecMax_SeqKokkos;
  v->ops->sum             = VecSum_SeqKokkos;
  v->ops->shift           = VecShift_SeqKokkos;
  v->ops->norm            = VecNorm_SeqKokkos;
  v->ops->scale           = VecScale_SeqKokkos;
  v->ops->copy            = VecCopy_SeqKokkos;
  v->ops->set             = VecSet_SeqKokkos;
  v->ops->swap            = VecSwap_SeqKokkos;
  v->ops->axpy            = VecAXPY_SeqKokkos;
  v->ops->axpby           = VecAXPBY_SeqKokkos;
  v->ops->axpbypcz        = VecAXPBYPCZ_SeqKokkos;
  v->ops->pointwisedivide = VecPointwiseDivide_SeqKokkos;
  v->ops->setrandom       = VecSetRandom_SeqKokkos;

  // (multi-)dot products
  v->ops->dot   = VecDot_SeqKokkos;
  v->ops->tdot  = VecTDot_SeqKokkos;
  v->ops->mdot  = VecMDot_SeqKokkos;
  v->ops->mtdot = VecMTDot_SeqKokkos;

  // on one process the local variants coincide with the global ones
  v->ops->dot_local   = VecDot_SeqKokkos;
  v->ops->tdot_local  = VecTDot_SeqKokkos;
  v->ops->mdot_local  = VecMDot_SeqKokkos;
  v->ops->mtdot_local = VecMTDot_SeqKokkos;

  // norms, compound updates, array access and (sub)vector management
  v->ops->norm_local             = VecNorm_SeqKokkos;
  v->ops->maxpy                  = VecMAXPY_SeqKokkos;
  v->ops->aypx                   = VecAYPX_SeqKokkos;
  v->ops->waxpy                  = VecWAXPY_SeqKokkos;
  v->ops->dotnorm2               = VecDotNorm2_SeqKokkos;
  v->ops->errorwnorm             = VecErrorWeightedNorms_SeqKokkos;
  v->ops->placearray             = VecPlaceArray_SeqKokkos;
  v->ops->replacearray           = VecReplaceArray_SeqKokkos;
  v->ops->resetarray             = VecResetArray_SeqKokkos;
  v->ops->destroy                = VecDestroy_SeqKokkos;
  v->ops->duplicate              = VecDuplicate_SeqKokkos;
  v->ops->conjugate              = VecConjugate_SeqKokkos;
  v->ops->getlocalvector         = VecGetLocalVector_SeqKokkos;
  v->ops->restorelocalvector     = VecRestoreLocalVector_SeqKokkos;
  v->ops->getlocalvectorread     = VecGetLocalVector_SeqKokkos;
  v->ops->restorelocalvectorread = VecRestoreLocalVector_SeqKokkos;
  v->ops->getarraywrite          = VecGetArrayWrite_SeqKokkos;
  v->ops->getarray               = VecGetArray_SeqKokkos;
  v->ops->restorearray           = VecRestoreArray_SeqKokkos;

  v->ops->getarrayandmemtype      = VecGetArrayAndMemType_SeqKokkos;
  v->ops->restorearrayandmemtype  = VecRestoreArrayAndMemType_SeqKokkos;
  v->ops->getarraywriteandmemtype = VecGetArrayWriteAndMemType_SeqKokkos;
  v->ops->getsubvector            = VecGetSubVector_SeqKokkos;
  v->ops->restoresubvector        = VecRestoreSubVector_SeqKokkos;

  // COO assembly
  v->ops->setpreallocationcoo = VecSetPreallocationCOO_SeqKokkos;
  v->ops->setvaluescoo        = VecSetValuesCOO_SeqKokkos;

  v->offloadmask = PETSC_OFFLOAD_KOKKOS; // Mark this is a VECKOKKOS; We use this flag for cheap VECKOKKOS test.
  PetscFunctionReturn(PETSC_SUCCESS);
}
1697 
1698 /*@C
1699   VecCreateSeqKokkosWithArray - Creates a Kokkos sequential array-style vector,
1700   where the user provides the array space to store the vector values. The array
1701   provided must be a device array.
1702 
1703   Collective
1704 
1705   Input Parameters:
1706 + comm   - the communicator, should be PETSC_COMM_SELF
1707 . bs     - the block size
1708 . n      - the vector length
1709 - darray - device memory where the vector elements are to be stored.
1710 
1711   Output Parameter:
1712 . v - the vector
1713 
1714   Notes:
1715   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1716   same type as an existing vector.
1717 
1718   PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1719   The user should not free the array until the vector is destroyed.
1720 
1721   Level: intermediate
1722 
1723 .seealso: `VecCreateMPICUDAWithArray()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`,
1724           `VecCreateGhost()`, `VecCreateSeq()`, `VecCreateSeqWithArray()`,
1725           `VecCreateMPIWithArray()`
1726 @*/
VecCreateSeqKokkosWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar darray[],Vec * v)1727 PetscErrorCode VecCreateSeqKokkosWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar darray[], Vec *v)
1728 {
1729   PetscMPIInt  size;
1730   Vec          w;
1731   Vec_Kokkos  *veckok = NULL;
1732   PetscScalar *harray;
1733 
1734   PetscFunctionBegin;
1735   PetscCallMPI(MPI_Comm_size(comm, &size));
1736   PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create VECSEQKOKKOS on more than one process");
1737 
1738   PetscCall(PetscKokkosInitializeCheck());
1739   PetscCall(VecCreate(comm, &w));
1740   PetscCall(VecSetSizes(w, n, n));
1741   PetscCall(VecSetBlockSize(w, bs));
1742   if (!darray) { /* Allocate memory ourself if user provided NULL */
1743     PetscCall(VecSetType(w, VECSEQKOKKOS));
1744   } else {
1745     /* Build a VECSEQ, get its harray, and then build Vec_Kokkos along with darray */
1746     if (std::is_same<DefaultMemorySpace, HostMirrorMemorySpace>::value) {
1747       harray = const_cast<PetscScalar *>(darray);
1748       PetscCall(VecCreate_Seq_Private(w, harray)); /* Build a sequential vector with harray */
1749     } else {
1750       PetscCall(VecSetType(w, VECSEQ));
1751       harray = static_cast<Vec_Seq *>(w->data)->array;
1752     }
1753     PetscCall(PetscObjectChangeTypeName((PetscObject)w, VECSEQKOKKOS)); /* Change it to Kokkos */
1754     PetscCall(VecCreate_SeqKokkos_Common(w));
1755     PetscCallCXX(veckok = new Vec_Kokkos{n, harray, const_cast<PetscScalar *>(darray)});
1756     PetscCallCXX(veckok->v_dual.modify_device()); /* Mark the device is modified */
1757     w->spptr = static_cast<void *>(veckok);
1758   }
1759   *v = w;
1760   PetscFunctionReturn(PETSC_SUCCESS);
1761 }
1762 
VecConvert_Seq_SeqKokkos_inplace(Vec v)1763 PetscErrorCode VecConvert_Seq_SeqKokkos_inplace(Vec v)
1764 {
1765   Vec_Seq *vecseq;
1766 
1767   PetscFunctionBegin;
1768   PetscCall(PetscKokkosInitializeCheck());
1769   PetscCall(PetscLayoutSetUp(v->map));
1770   PetscCall(PetscObjectChangeTypeName((PetscObject)v, VECSEQKOKKOS));
1771   PetscCall(VecCreate_SeqKokkos_Common(v));
1772   PetscCheck(!v->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "v->spptr not NULL");
1773   vecseq = static_cast<Vec_Seq *>(v->data);
1774   PetscCallCXX(v->spptr = new Vec_Kokkos(v->map->n, vecseq->array, NULL));
1775   PetscFunctionReturn(PETSC_SUCCESS);
1776 }
1777 
// Create a VECSEQKOKKOS with layout and arrays.
// harray/darray are borrowed, not copied; the caller keeps ownership and must keep them
// alive until the vector is destroyed. The two arrays are assumed to hold the same data,
// see the notes of VecCreateSeqKokkosWithArrays_Private().
static PetscErrorCode VecCreateSeqKokkosWithLayoutAndArrays_Private(PetscLayout map, const PetscScalar harray[], const PetscScalar darray[], Vec *v)
{
  Vec w;

  PetscFunctionBegin;
  if (map->n > 0) PetscCheck(darray, map->comm, PETSC_ERR_ARG_WRONG, "darray cannot be NULL");
#if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
  /* With unified memory the host and device arrays must be one and the same allocation */
  PetscCheck(harray == darray, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "harray and darray must be the same");
#endif
  PetscCall(VecCreateSeqWithLayoutAndArray_Private(map, harray, &w)); /* Build the host (VECSEQ) part on harray */
  PetscCall(PetscObjectChangeTypeName((PetscObject)w, VECSEQKOKKOS)); // Change it to VECKOKKOS
  PetscCall(VecCreate_SeqKokkos_Common(w));
  PetscCallCXX(w->spptr = new Vec_Kokkos(map->n, const_cast<PetscScalar *>(harray), const_cast<PetscScalar *>(darray)));
  *v = w;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1795 
1796 /*
1797    VecCreateSeqKokkosWithArrays_Private - Creates a Kokkos sequential array-style vector
1798    with user-provided arrays on host and device.
1799 
1800    Collective
1801 
1802    Input Parameter:
1803 +  comm - the communicator, should be PETSC_COMM_SELF
1804 .  bs - the block size
1805 .  n - the vector length
1806 .  harray - host memory where the vector elements are to be stored.
1807 -  darray - device memory where the vector elements are to be stored.
1808 
1809    Output Parameter:
1810 .  v - the vector
1811 
1812    Notes:
1813    Unlike VecCreate{Seq,MPI}CUDAWithArrays(), this routine is private since we do not expect users to use it directly.
1814 
1815    If there is no device, then harray and darray must be the same.
1816    If n is not zero, then harray and darray must be allocated.
1817    After the call, the created vector is supposed to be in a synchronized state, i.e.,
1818    we suppose harray and darray have the same data.
1819 
1820    PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1821    Caller should not free the array until the vector is destroyed.
1822 */
VecCreateSeqKokkosWithArrays_Private(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar harray[],const PetscScalar darray[],Vec * v)1823 static PetscErrorCode VecCreateSeqKokkosWithArrays_Private(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar harray[], const PetscScalar darray[], Vec *v)
1824 {
1825   PetscMPIInt size;
1826   PetscLayout map;
1827 
1828   PetscFunctionBegin;
1829   PetscCall(PetscKokkosInitializeCheck());
1830   PetscCallMPI(MPI_Comm_size(comm, &size));
1831   PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create VECSEQKOKKOS on more than one process");
1832   PetscCall(PetscLayoutCreateFromSizes(comm, n, n, bs, &map));
1833   PetscCall(VecCreateSeqKokkosWithLayoutAndArrays_Private(map, harray, darray, v));
1834   PetscCall(PetscLayoutDestroy(&map));
1835   PetscFunctionReturn(PETSC_SUCCESS);
1836 }
1837 
1838 /* TODO: ftn-auto generates veckok.kokkosf.c */
1839 /*@C
1840   VecCreateSeqKokkos - Creates a standard, sequential array-style vector.
1841 
1842   Collective
1843 
1844   Input Parameters:
1845 + comm - the communicator, should be `PETSC_COMM_SELF`
1846 - n    - the vector length
1847 
1848   Output Parameter:
1849 . v - the vector
1850 
1851   Notes:
1852   Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1853   same type as an existing vector.
1854 
1855   Level: intermediate
1856 
1857 .seealso: `VecCreateMPI()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`
1858  @*/
VecCreateSeqKokkos(MPI_Comm comm,PetscInt n,Vec * v)1859 PetscErrorCode VecCreateSeqKokkos(MPI_Comm comm, PetscInt n, Vec *v)
1860 {
1861   PetscFunctionBegin;
1862   PetscCall(PetscKokkosInitializeCheck());
1863   PetscCall(VecCreate(comm, v));
1864   PetscCall(VecSetSizes(*v, n, n));
1865   PetscCall(VecSetType(*v, VECSEQKOKKOS)); /* Calls VecCreate_SeqKokkos */
1866   PetscFunctionReturn(PETSC_SUCCESS);
1867 }
1868 
// Duplicate a VECSEQKOKKOS m times, allocating all m vectors' host and device storage as one
// long contiguous DualView (columns of width lda), so that MDot/MAXPY can treat the group as
// a dense matrix and use GEMV kernels.
static PetscErrorCode VecDuplicateVecs_SeqKokkos_GEMV(Vec w, PetscInt m, Vec *V[])
{
  PetscInt64                lda; // use 64-bit as we will do "m * lda"
  PetscScalar              *array_h, *array_d;
  PetscLayout               map;
  PetscScalarKokkosDualView w_dual;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck()); // as we'll call kokkos_malloc()
  PetscCall(PetscMalloc1(m, V));
  PetscCall(VecGetLayout(w, &map));
  // NOTE(review): called without PetscCall(); confirm VecGetLocalSizeAligned() returns void,
  // otherwise its error code is silently dropped here
  VecGetLocalSizeAligned(w, 64, &lda); // get in lda the 64-bytes aligned local size
  // See comments in VecCreate_SeqKokkos() on why we use DualView to allocate the memory
  PetscCallCXX(w_dual = PetscScalarKokkosDualView("VecDuplicateVecs", m * lda)); // Kokkos init's w_dual to zero

  // create the m vectors with raw arrays from v_dual
  array_h = w_dual.view_host().data();
  array_d = w_dual.view_device().data();
  for (PetscInt i = 0; i < m; i++) {
    Vec v;
    // vector i wraps column i of the big arrays and shares w's layout, composed objects and ops
    PetscCall(VecCreateSeqKokkosWithLayoutAndArrays_Private(map, &array_h[i * lda], &array_d[i * lda], &v));
    PetscCall(PetscObjectListDuplicate(((PetscObject)w)->olist, &((PetscObject)v)->olist));
    PetscCall(PetscFunctionListDuplicate(((PetscObject)w)->qlist, &((PetscObject)v)->qlist));
    v->ops[0] = w->ops[0];
    (*V)[i]   = v;
  }

  // let the first vector own the long DualView, so when it is destroyed it will free the v_dual
  if (m) {
    Vec v = (*V)[0];

    static_cast<Vec_Kokkos *>(v->spptr)->w_dual = w_dual; // stash the memory
    // disable replacearray of the first vector, as freeing its memory also frees others in the group.
    // But replacearray of others is ok, as they don't own their array.
    if (m > 1) v->ops->replacearray = VecReplaceArray_Default_GEMV_Error;
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1908 
1909 /*MC
1910    VECSEQKOKKOS - VECSEQKOKKOS = "seqkokkos" - The basic sequential vector, modified to use Kokkos
1911 
1912    Options Database Keys:
1913 . -vec_type seqkokkos - sets the vector type to VECSEQKOKKOS during a call to VecSetFromOptions()
1914 
1915   Level: beginner
1916 
1917 .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`
1918 M*/
// Type constructor for VECSEQKOKKOS, called from VecSetType().
// Allocates host+device storage via a DualView, builds the Vec_Seq part on the host half,
// and optionally installs GEMV-based multi-vector kernels controlled by options.
PetscErrorCode VecCreate_SeqKokkos(Vec v)
{
  PetscBool                 mdot_use_gemv  = PETSC_TRUE;
  PetscBool                 maxpy_use_gemv = PETSC_FALSE; // default is false as we saw bad performance with vendors' GEMV with tall skinny matrices.
  PetscScalarKokkosDualView v_dual;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  PetscCall(PetscLayoutSetUp(v->map));

  // Use DualView to allocate both the host array and the device array.
  // DualView first allocates the device array and then mirrors it to host.
  // With unified memory (e.g., on AMD MI300A APU), the two arrays are the same, with the host array
  // sharing the device array allocated by hipMalloc(), but not the other way around if we call
  // VecCreate_Seq() first and let the device array share the host array allocated by malloc().
  // hipMalloc() has an advantage over malloc() as it gives great binding and page size settings automatically, see
  // https://hpc.llnl.gov/documentation/user-guides/using-el-capitan-systems/introduction-and-quickstart/pro-tips
  PetscCallCXX(v_dual = PetscScalarKokkosDualView("v_dual", v->map->n)); // Kokkos init's v_dual to zero

  PetscCall(VecCreate_Seq_Private(v, v_dual.view_host().data())); // build the Vec_Seq part on the DualView's host array
  PetscCall(PetscObjectChangeTypeName((PetscObject)v, VECSEQKOKKOS));
  PetscCall(VecCreate_SeqKokkos_Common(v));
  PetscCheck(!v->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "v->spptr not NULL");
  PetscCallCXX(v->spptr = new Vec_Kokkos(v_dual));

  // GEMV-based multi-vector kernels are selectable at runtime
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-vec_mdot_use_gemv", &mdot_use_gemv, NULL));
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-vec_maxpy_use_gemv", &maxpy_use_gemv, NULL));

  // allocate multiple vectors together (contiguously), a prerequisite for the GEMV kernels
  if (mdot_use_gemv || maxpy_use_gemv) v->ops[0].duplicatevecs = VecDuplicateVecs_SeqKokkos_GEMV;

  if (mdot_use_gemv) {
    v->ops[0].mdot        = VecMDot_SeqKokkos_GEMV;
    v->ops[0].mtdot       = VecMTDot_SeqKokkos_GEMV;
    v->ops[0].mdot_local  = VecMDot_SeqKokkos_GEMV;
    v->ops[0].mtdot_local = VecMTDot_SeqKokkos_GEMV;
  }
  if (maxpy_use_gemv) v->ops[0].maxpy = VecMAXPY_SeqKokkos_GEMV;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1959