1 /*
2 Implements the sequential Kokkos vectors.
3 */
4 #include <petsc_kokkos.hpp>
5 #include <petscvec_kokkos.hpp>
6
7 #include <petsc/private/sfimpl.h>
8 #include <petsc/private/petscimpl.h>
9 #include <petscmath.h>
10 #include <petscviewer.h>
11 #include <KokkosBlas.hpp>
12 #include <Kokkos_Functional.hpp>
13
14 #include <../src/vec/vec/impls/dvecimpl.h> /* for VecCreate_Seq_Private */
15 #include <../src/vec/vec/impls/seq/kokkos/veckokkosimpl.hpp>
16
/* Return in *kv a Kokkos view of v's data in MemorySpace.
   If overwrite is PETSC_FALSE the dual view is synced first so the caller sees up-to-date data;
   if PETSC_TRUE the sync is skipped because the caller promises to overwrite every entry. */
template <class MemorySpace>
static PetscErrorCode VecGetKokkosView_Private(Vec v, PetscScalarKokkosViewType<MemorySpace> *kv, PetscBool overwrite)
{
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
  using ExecutionSpace = typename PetscScalarKokkosViewType<MemorySpace>::traits::device_type;

  PetscFunctionBegin;
  VecErrorIfNotKokkos(v);
  if (!overwrite) { /* If overwrite=true, no need to sync the space, since caller will overwrite the data */
    PetscCallCXX(veckok->v_dual.sync<ExecutionSpace>());
  }
  PetscCallCXX(*kv = veckok->v_dual.view<ExecutionSpace>());
  PetscFunctionReturn(PETSC_SUCCESS);
}
31
/* Counterpart of VecGetKokkosView_Private(): mark v's data in MemorySpace as modified.
   overwrite=PETSC_TRUE additionally clears any stale sync state (the user forced an overwrite),
   so the other space's data is considered dead. Also bumps the object state for cache tracking. */
template <class MemorySpace>
static PetscErrorCode VecRestoreKokkosView_Private(Vec v, PetscScalarKokkosViewType<MemorySpace> *kv, PetscBool overwrite)
{
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
  using ExecutionSpace = typename PetscScalarKokkosViewType<MemorySpace>::traits::device_type;

  PetscFunctionBegin;
  VecErrorIfNotKokkos(v);
  if (overwrite) PetscCallCXX(veckok->v_dual.clear_sync_state()); /* If overwrite=true, clear the old sync state since user forced an overwrite */
  PetscCallCXX(veckok->v_dual.modify<ExecutionSpace>());
  PetscCall(PetscObjectStateIncrease((PetscObject)v));
  PetscFunctionReturn(PETSC_SUCCESS);
}
45
/* Read-only access: return in *kv a const view of v's data in MemorySpace.
   Always syncs first (the caller will read), and does NOT mark the data modified —
   hence no matching modify/state-increase is needed on the const path. */
template <class MemorySpace>
PetscErrorCode VecGetKokkosView(Vec v, ConstPetscScalarKokkosViewType<MemorySpace> *kv)
{
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
  using ExecutionSpace = typename PetscScalarKokkosViewType<MemorySpace>::traits::device_type;

  PetscFunctionBegin;
  VecErrorIfNotKokkos(v);
  PetscCallCXX(veckok->v_dual.sync<ExecutionSpace>());
  PetscCallCXX(*kv = veckok->v_dual.view<ExecutionSpace>());
  PetscFunctionReturn(PETSC_SUCCESS);
}
58
/* Function template explicit instantiation */
template PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec, ConstPetscScalarKokkosView *);
/* Non-const get: sync so the caller can read as well as write */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec v, PetscScalarKokkosView *kv)
{
  return VecGetKokkosView_Private(v, kv, PETSC_FALSE);
}
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosView(Vec v, PetscScalarKokkosView *kv)
{
  return VecRestoreKokkosView_Private(v, kv, PETSC_FALSE);
}
/* Write-only get: skip the sync, the caller overwrites everything */
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosViewWrite(Vec v, PetscScalarKokkosView *kv)
{
  return VecGetKokkosView_Private(v, kv, PETSC_TRUE);
}
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosViewWrite(Vec v, PetscScalarKokkosView *kv)
{
  return VecRestoreKokkosView_Private(v, kv, PETSC_TRUE);
}
81
#if !defined(KOKKOS_ENABLE_UNIFIED_MEMORY) /* Get host views if the default memory space is not host space */
/* Same four specializations as above, but yielding host-space views. With unified memory
   the host and device views alias, so these extra instantiations are unnecessary. */
template PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec, ConstPetscScalarKokkosViewHost *);
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosView(Vec v, PetscScalarKokkosViewHost *kv)
{
  return VecGetKokkosView_Private(v, kv, PETSC_FALSE);
}
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosView(Vec v, PetscScalarKokkosViewHost *kv)
{
  return VecRestoreKokkosView_Private(v, kv, PETSC_FALSE);
}
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecGetKokkosViewWrite(Vec v, PetscScalarKokkosViewHost *kv)
{
  return VecGetKokkosView_Private(v, kv, PETSC_TRUE);
}
template <>
PETSC_VISIBILITY_PUBLIC PetscErrorCode VecRestoreKokkosViewWrite(Vec v, PetscScalarKokkosViewHost *kv)
{
  return VecRestoreKokkosView_Private(v, kv, PETSC_TRUE);
}
#endif
105
VecSetRandom_SeqKokkos(Vec xin,PetscRandom r)106 PetscErrorCode VecSetRandom_SeqKokkos(Vec xin, PetscRandom r)
107 {
108 const PetscInt n = xin->map->n;
109 PetscScalar *xx;
110
111 PetscFunctionBegin;
112 PetscCall(VecGetArrayWrite(xin, &xx)); /* TODO: generate randoms directly on device */
113 for (PetscInt i = 0; i < n; i++) PetscCall(PetscRandomGetValue(r, &xx[i]));
114 PetscCall(VecRestoreArrayWrite(xin, &xx));
115 PetscFunctionReturn(PETSC_SUCCESS);
116 }
117
118 /* x = |x| */
VecAbs_SeqKokkos(Vec xin)119 PetscErrorCode VecAbs_SeqKokkos(Vec xin)
120 {
121 PetscScalarKokkosView xv;
122 auto exec = PetscGetKokkosExecutionSpace();
123
124 PetscFunctionBegin;
125 PetscCall(PetscLogGpuTimeBegin());
126 PetscCall(VecGetKokkosView(xin, &xv));
127 PetscCallCXX(KokkosBlas::abs(exec, xv, xv));
128 PetscCall(VecRestoreKokkosView(xin, &xv));
129 PetscCall(PetscLogGpuTimeEnd());
130 PetscFunctionReturn(PETSC_SUCCESS);
131 }
132
133 /* x = 1/x */
VecReciprocal_SeqKokkos(Vec xin)134 PetscErrorCode VecReciprocal_SeqKokkos(Vec xin)
135 {
136 PetscScalarKokkosView xv;
137
138 PetscFunctionBegin;
139 PetscCall(PetscLogGpuTimeBegin());
140 PetscCall(VecGetKokkosView(xin, &xv));
141 PetscCallCXX(Kokkos::parallel_for(
142 "VecReciprocal", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) {
143 if (xv(i) != (PetscScalar)0.0) xv(i) = (PetscScalar)1.0 / xv(i);
144 }));
145 PetscCall(VecRestoreKokkosView(xin, &xv));
146 PetscCall(PetscLogGpuTimeEnd());
147 PetscFunctionReturn(PETSC_SUCCESS);
148 }
149
VecMin_SeqKokkos(Vec xin,PetscInt * p,PetscReal * val)150 PetscErrorCode VecMin_SeqKokkos(Vec xin, PetscInt *p, PetscReal *val)
151 {
152 ConstPetscScalarKokkosView xv;
153 Kokkos::MinFirstLoc<PetscReal, PetscInt>::value_type result;
154
155 PetscFunctionBegin;
156 PetscCall(PetscLogGpuTimeBegin());
157 PetscCall(VecGetKokkosView(xin, &xv));
158 PetscCallCXX(Kokkos::parallel_reduce(
159 "VecMin", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n),
160 KOKKOS_LAMBDA(const PetscInt &i, Kokkos::MinFirstLoc<PetscReal, PetscInt>::value_type &lupdate) {
161 if (PetscRealPart(xv(i)) < lupdate.val) {
162 lupdate.val = PetscRealPart(xv(i));
163 lupdate.loc = i;
164 }
165 },
166 Kokkos::MinFirstLoc<PetscReal, PetscInt>(result)));
167 *val = result.val;
168 if (p) *p = result.loc;
169 PetscCall(VecRestoreKokkosView(xin, &xv));
170 PetscCall(PetscLogGpuTimeEnd());
171 PetscFunctionReturn(PETSC_SUCCESS);
172 }
173
VecMax_SeqKokkos(Vec xin,PetscInt * p,PetscReal * val)174 PetscErrorCode VecMax_SeqKokkos(Vec xin, PetscInt *p, PetscReal *val)
175 {
176 ConstPetscScalarKokkosView xv;
177 Kokkos::MaxFirstLoc<PetscReal, PetscInt>::value_type result;
178
179 PetscFunctionBegin;
180 PetscCall(PetscLogGpuTimeBegin());
181 PetscCall(VecGetKokkosView(xin, &xv));
182 PetscCallCXX(Kokkos::parallel_reduce(
183 "VecMax", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n),
184 KOKKOS_LAMBDA(const PetscInt &i, Kokkos::MaxFirstLoc<PetscReal, PetscInt>::value_type &lupdate) {
185 if (PetscRealPart(xv(i)) > lupdate.val) {
186 lupdate.val = PetscRealPart(xv(i));
187 lupdate.loc = i;
188 }
189 },
190 Kokkos::MaxFirstLoc<PetscReal, PetscInt>(result)));
191 *val = result.val;
192 if (p) *p = result.loc;
193 PetscCall(VecRestoreKokkosView(xin, &xv));
194 PetscCall(PetscLogGpuTimeEnd());
195 PetscFunctionReturn(PETSC_SUCCESS);
196 }
197
VecSum_SeqKokkos(Vec xin,PetscScalar * sum)198 PetscErrorCode VecSum_SeqKokkos(Vec xin, PetscScalar *sum)
199 {
200 ConstPetscScalarKokkosView xv;
201
202 PetscFunctionBegin;
203 PetscCall(PetscLogGpuTimeBegin());
204 PetscCall(VecGetKokkosView(xin, &xv));
205 PetscCallCXX(*sum = KokkosBlas::sum(PetscGetKokkosExecutionSpace(), xv));
206 PetscCall(VecRestoreKokkosView(xin, &xv));
207 PetscCall(PetscLogGpuTimeEnd());
208 PetscFunctionReturn(PETSC_SUCCESS);
209 }
210
VecShift_SeqKokkos(Vec xin,PetscScalar shift)211 PetscErrorCode VecShift_SeqKokkos(Vec xin, PetscScalar shift)
212 {
213 PetscScalarKokkosView xv;
214
215 PetscFunctionBegin;
216 PetscCall(PetscLogGpuTimeBegin());
217 PetscCall(VecGetKokkosView(xin, &xv));
218 PetscCallCXX(Kokkos::parallel_for("VecShift", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) { xv(i) += shift; }); PetscCall(VecRestoreKokkosView(xin, &xv)));
219 PetscCall(PetscLogGpuTimeEnd());
220 PetscCall(PetscLogGpuFlops(xin->map->n));
221 PetscFunctionReturn(PETSC_SUCCESS);
222 }
223
224 /* y = alpha x + y */
VecAXPY_SeqKokkos(Vec yin,PetscScalar alpha,Vec xin)225 PetscErrorCode VecAXPY_SeqKokkos(Vec yin, PetscScalar alpha, Vec xin)
226 {
227 PetscFunctionBegin;
228 if (alpha == (PetscScalar)0.0) PetscFunctionReturn(PETSC_SUCCESS);
229 if (yin == xin) {
230 PetscCall(VecScale_SeqKokkos(yin, alpha + 1));
231 } else {
232 PetscBool xiskok, yiskok;
233
234 PetscCall(PetscObjectTypeCompareAny((PetscObject)xin, &xiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
235 PetscCall(PetscObjectTypeCompareAny((PetscObject)yin, &yiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
236 if (xiskok && yiskok) {
237 PetscScalarKokkosView yv;
238 ConstPetscScalarKokkosView xv;
239
240 PetscCall(PetscLogGpuTimeBegin());
241 PetscCall(VecGetKokkosView(xin, &xv));
242 PetscCall(VecGetKokkosView(yin, &yv));
243 PetscCallCXX(KokkosBlas::axpy(PetscGetKokkosExecutionSpace(), alpha, xv, yv));
244 PetscCall(VecRestoreKokkosView(xin, &xv));
245 PetscCall(VecRestoreKokkosView(yin, &yv));
246 PetscCall(PetscLogGpuTimeEnd());
247 PetscCall(PetscLogGpuFlops(2.0 * yin->map->n));
248 } else {
249 PetscCall(VecAXPY_Seq(yin, alpha, xin));
250 }
251 }
252 PetscFunctionReturn(PETSC_SUCCESS);
253 }
254
/* y = x + beta y, implemented via VecAXPBY(y, 1.0, beta, x) */
PetscErrorCode VecAYPX_SeqKokkos(Vec yin, PetscScalar beta, Vec xin)
{
  PetscFunctionBegin;
  /* One needs to define KOKKOSBLAS_OPTIMIZATION_LEVEL_AXPBY > 2 to have optimizations for cases alpha/beta = 0,+/-1 */
  PetscCall(VecAXPBY_SeqKokkos(yin, 1.0, beta, xin));
  PetscFunctionReturn(PETSC_SUCCESS);
}
263
264 /* z = y^T x */
VecTDot_SeqKokkos(Vec xin,Vec yin,PetscScalar * z)265 PetscErrorCode VecTDot_SeqKokkos(Vec xin, Vec yin, PetscScalar *z)
266 {
267 ConstPetscScalarKokkosView xv, yv;
268
269 PetscFunctionBegin;
270 PetscCall(PetscLogGpuTimeBegin());
271 PetscCall(VecGetKokkosView(xin, &xv));
272 PetscCall(VecGetKokkosView(yin, &yv));
273 // Kokkos always overwrites z, so no need to init it
274 PetscCallCXX(Kokkos::parallel_reduce("VecTDot", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &update) { update += yv(i) * xv(i); }, *z));
275 PetscCall(VecRestoreKokkosView(yin, &yv));
276 PetscCall(VecRestoreKokkosView(xin, &xv));
277 PetscCall(PetscLogGpuTimeEnd());
278 if (xin->map->n > 0) PetscCall(PetscLogGpuFlops(2.0 * xin->map->n));
279 PetscFunctionReturn(PETSC_SUCCESS);
280 }
281
/* Dispatch tags selecting y^T x vs y^H x in MDotFunctor's tagged operator() */
struct TransposeDotTag { };
struct ConjugateDotTag { };

/* Reduction functor computing sum[j] = y_j^T x (TransposeDotTag) or y_j^H x (ConjugateDotTag)
   for ValueCount y-vectors in a single pass over x. Always stores 8 view slots; only the
   first ValueCount are accessed by the kernels. */
template <PetscInt ValueCount>
struct MDotFunctor {
  static_assert(ValueCount >= 1 && ValueCount <= 8, "ValueCount must be in [1, 8]");
  /* Note the C++ notation for an array typedef */
  typedef PetscScalar value_type[];
  typedef ConstPetscScalarKokkosView::size_type size_type;

  /* Tell Kokkos the result array's number of entries. This must be a public value in the functor */
  static constexpr size_type value_count = ValueCount;
  ConstPetscScalarKokkosView xv, yv[8];

  /* Always takes 8 y-views; callers pass default-constructed (empty) views for unused slots */
  MDotFunctor(ConstPetscScalarKokkosView &xv, ConstPetscScalarKokkosView &yv0, ConstPetscScalarKokkosView &yv1, ConstPetscScalarKokkosView &yv2, ConstPetscScalarKokkosView &yv3, ConstPetscScalarKokkosView &yv4, ConstPetscScalarKokkosView &yv5, ConstPetscScalarKokkosView &yv6, ConstPetscScalarKokkosView &yv7) :
    xv(xv)
  {
    yv[0] = yv0;
    yv[1] = yv1;
    yv[2] = yv2;
    yv[3] = yv3;
    yv[4] = yv4;
    yv[5] = yv5;
    yv[6] = yv6;
    yv[7] = yv7;
  }

  /* sum[j] += y_j(i) * x(i) */
  KOKKOS_INLINE_FUNCTION void operator()(TransposeDotTag, const size_type i, value_type sum) const
  {
    PetscScalar xval = xv(i);
    for (size_type j = 0; j < value_count; ++j) sum[j] += yv[j](i) * xval;
  }

  /* sum[j] += conj(y_j(i)) * x(i) */
  KOKKOS_INLINE_FUNCTION void operator()(ConjugateDotTag, const size_type i, value_type sum) const
  {
    PetscScalar xval = xv(i);
    for (size_type j = 0; j < value_count; ++j) sum[j] += PetscConj(yv[j](i)) * xval;
  }

  // Per https://kokkos.github.io/kokkos-core-wiki/API/core/parallel-dispatch/parallel_reduce.html#requirements
  // "when specifying a tag in the policy, the functor's potential init/join/final member functions must also be tagged"
  // So we have this kind of duplicated code.
  KOKKOS_INLINE_FUNCTION void join(TransposeDotTag, value_type dst, const value_type src) const { join(dst, src); }
  KOKKOS_INLINE_FUNCTION void join(ConjugateDotTag, value_type dst, const value_type src) const { join(dst, src); }

  KOKKOS_INLINE_FUNCTION void init(TransposeDotTag, value_type sum) const { init(sum); }
  KOKKOS_INLINE_FUNCTION void init(ConjugateDotTag, value_type sum) const { init(sum); }

  /* elementwise accumulate partial results */
  KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
  {
    for (size_type j = 0; j < value_count; j++) dst[j] += src[j];
  }

  /* identity of the sum reduction */
  KOKKOS_INLINE_FUNCTION void init(value_type sum) const
  {
    for (size_type j = 0; j < value_count; j++) sum[j] = 0.0;
  }
};
341
/* z[k] = (x, y_k) for k = 0..nv-1, with WorkTag selecting transpose (y^T x) or conjugate (y^H x)
   semantics via MDotFunctor's tagged operator(). Processes the y's in groups of 8 (one fused
   reduction per group) plus one reduction for the remainder, writing results directly into the
   host array z through an unmanaged host view. */
template <class WorkTag>
PetscErrorCode VecMultiDot_Private(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscInt i, j, cur = 0, ngroup = nv / 8, rem = nv % 8, N = xin->map->n;
  ConstPetscScalarKokkosView xv, yv[8];
  PetscScalarKokkosViewHost zv(z, nv); /* unmanaged view wrapping caller's z[] */
  auto exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCall(VecGetKokkosView(xin, &xv));
  for (i = 0; i < ngroup; i++) { /* 8 y's per group */
    for (j = 0; j < 8; j++) PetscCall(VecGetKokkosView(yin[cur + j], &yv[j]));
    MDotFunctor<8> mdot(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]); /* Hope Kokkos make it asynchronous */
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCXX(Kokkos::parallel_reduce(Kokkos::RangePolicy<WorkTag>(exec, 0, N), mdot, Kokkos::subview(zv, Kokkos::pair<PetscInt, PetscInt>(cur, cur + 8))));
    PetscCall(PetscLogGpuTimeEnd());
    for (j = 0; j < 8; j++) PetscCall(VecRestoreKokkosView(yin[cur + j], &yv[j]));
    cur += 8;
  }

  if (rem) { /* The remaining */
    for (j = 0; j < rem; j++) PetscCall(VecGetKokkosView(yin[cur + j], &yv[j]));
    Kokkos::RangePolicy<WorkTag> policy(exec, 0, N);
    auto results = Kokkos::subview(zv, Kokkos::pair<PetscInt, PetscInt>(cur, cur + rem));
    /* MDotFunctor's ValueCount is a compile-time constant, hence the switch over rem;
       the unused trailing yv[] slots are empty views the functor never touches */
    // clang-format off
    PetscCall(PetscLogGpuTimeBegin());
    switch (rem) {
    case 1: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<1>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 2: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<2>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 3: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<3>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 4: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<4>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 5: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<5>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 6: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<6>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    case 7: PetscCallCXX(Kokkos::parallel_reduce(policy, MDotFunctor<7>(xv, yv[0], yv[1], yv[2], yv[3], yv[4], yv[5], yv[6], yv[7]), results)); break;
    }
    PetscCall(PetscLogGpuTimeEnd());
    // clang-format on
    for (j = 0; j < rem; j++) PetscCall(VecRestoreKokkosView(yin[cur + j], &yv[j]));
  }
  PetscCall(VecRestoreKokkosView(xin, &xv));
  exec.fence(); /* If reduce is async, then we need this fence to make sure z is ready for use on host */
  PetscFunctionReturn(PETSC_SUCCESS);
}
385
/* z[k] = y_k^H x for k = 0..nv-1, deliberately hand-unrolled: one lambda with 8 named scalar
   reducers per group of 8 y's, and a switch with a dedicated lambda per remainder size.
   Functionally equivalent to VecMultiDot_Private<ConjugateDotTag>(), but kept because it
   performs better on some backends (see the note in VecMDot_SeqKokkos). Reductions target
   host scalars directly, so each parallel_reduce completes before z is read. */
static PetscErrorCode VecMultiDot_Verbose(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscInt ngroup = nv / 8, rem = nv % 8, N = xin->map->n;
  ConstPetscScalarKokkosView xv, y0, y1, y2, y3, y4, y5, y6, y7;
  PetscScalar *zp = z;   /* advances by 8 per processed group */
  const Vec *yp = yin;   /* advances in lockstep with zp */
  Kokkos::RangePolicy<> policy(PetscGetKokkosExecutionSpace(), 0, N);

  // clang-format off
  PetscFunctionBegin;
  PetscCall(VecGetKokkosView(xin, &xv));
  for (PetscInt k = 0; k < ngroup; k++) { // 8 y's per group
    PetscCall(VecGetKokkosView(yp[0], &y0));
    PetscCall(VecGetKokkosView(yp[1], &y1));
    PetscCall(VecGetKokkosView(yp[2], &y2));
    PetscCall(VecGetKokkosView(yp[3], &y3));
    PetscCall(VecGetKokkosView(yp[4], &y4));
    PetscCall(VecGetKokkosView(yp[5], &y5));
    PetscCall(VecGetKokkosView(yp[6], &y6));
    PetscCall(VecGetKokkosView(yp[7], &y7));
    PetscCall(PetscLogGpuTimeBegin()); // only for GPU kernel execution
    Kokkos::parallel_reduce(
      "VecMDot8", policy,
      KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4, PetscScalar &lsum5, PetscScalar &lsum6, PetscScalar &lsum7) {
        lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
        lsum4 += xv(i) * PetscConj(y4(i)); lsum5 += xv(i) * PetscConj(y5(i)); lsum6 += xv(i) * PetscConj(y6(i)); lsum7 += xv(i) * PetscConj(y7(i));
      }, zp[0], zp[1], zp[2], zp[3], zp[4], zp[5], zp[6], zp[7]);
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuToCpu(8 * sizeof(PetscScalar))); // for copying to z[] on host
    PetscCall(VecRestoreKokkosView(yp[0], &y0));
    PetscCall(VecRestoreKokkosView(yp[1], &y1));
    PetscCall(VecRestoreKokkosView(yp[2], &y2));
    PetscCall(VecRestoreKokkosView(yp[3], &y3));
    PetscCall(VecRestoreKokkosView(yp[4], &y4));
    PetscCall(VecRestoreKokkosView(yp[5], &y5));
    PetscCall(VecRestoreKokkosView(yp[6], &y6));
    PetscCall(VecRestoreKokkosView(yp[7], &y7));
    yp += 8;
    zp += 8;
  }

  if (rem) { /* The remaining */
    if (rem > 0) PetscCall(VecGetKokkosView(yp[0], &y0));
    if (rem > 1) PetscCall(VecGetKokkosView(yp[1], &y1));
    if (rem > 2) PetscCall(VecGetKokkosView(yp[2], &y2));
    if (rem > 3) PetscCall(VecGetKokkosView(yp[3], &y3));
    if (rem > 4) PetscCall(VecGetKokkosView(yp[4], &y4));
    if (rem > 5) PetscCall(VecGetKokkosView(yp[5], &y5));
    if (rem > 6) PetscCall(VecGetKokkosView(yp[6], &y6));
    PetscCall(PetscLogGpuTimeBegin());
    switch (rem) { /* one dedicated kernel per remainder size; reducer arity must be compile-time */
    case 7:
      Kokkos::parallel_reduce(
        "VecMDot7", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4, PetscScalar &lsum5, PetscScalar &lsum6) {
          lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
          lsum4 += xv(i) * PetscConj(y4(i)); lsum5 += xv(i) * PetscConj(y5(i)); lsum6 += xv(i) * PetscConj(y6(i));
        }, zp[0], zp[1], zp[2], zp[3], zp[4], zp[5], zp[6]);
      break;
    case 6:
      Kokkos::parallel_reduce(
        "VecMDot6", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4, PetscScalar &lsum5) {
          lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
          lsum4 += xv(i) * PetscConj(y4(i)); lsum5 += xv(i) * PetscConj(y5(i));
        }, zp[0], zp[1], zp[2], zp[3], zp[4], zp[5]);
      break;
    case 5:
      Kokkos::parallel_reduce(
        "VecMDot5", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3, PetscScalar &lsum4) {
          lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
          lsum4 += xv(i) * PetscConj(y4(i));
        }, zp[0], zp[1], zp[2], zp[3], zp[4]);
      break;
    case 4:
      Kokkos::parallel_reduce(
        "VecMDot4", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2, PetscScalar &lsum3) {
          lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i)); lsum3 += xv(i) * PetscConj(y3(i));
        }, zp[0], zp[1], zp[2], zp[3]);
      break;
    case 3:
      Kokkos::parallel_reduce(
        "VecMDot3", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1, PetscScalar &lsum2) {
          lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i)); lsum2 += xv(i) * PetscConj(y2(i));
        }, zp[0], zp[1], zp[2]);
      break;
    case 2:
      Kokkos::parallel_reduce(
        "VecMDot2", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0, PetscScalar &lsum1) {
          lsum0 += xv(i) * PetscConj(y0(i)); lsum1 += xv(i) * PetscConj(y1(i));
        }, zp[0], zp[1]);
      break;
    case 1:
      Kokkos::parallel_reduce(
        "VecMDot1", policy,
        KOKKOS_LAMBDA(const PetscInt &i, PetscScalar &lsum0) {
          lsum0 += xv(i) * PetscConj(y0(i));
        }, zp[0]);
      break;
    }
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuToCpu(rem * sizeof(PetscScalar))); // for copying to z[] on host
    if (rem > 0) PetscCall(VecRestoreKokkosView(yp[0], &y0));
    if (rem > 1) PetscCall(VecRestoreKokkosView(yp[1], &y1));
    if (rem > 2) PetscCall(VecRestoreKokkosView(yp[2], &y2));
    if (rem > 3) PetscCall(VecRestoreKokkosView(yp[3], &y3));
    if (rem > 4) PetscCall(VecRestoreKokkosView(yp[4], &y4));
    if (rem > 5) PetscCall(VecRestoreKokkosView(yp[5], &y5));
    if (rem > 6) PetscCall(VecRestoreKokkosView(yp[6], &y6));
  }
  PetscCall(VecRestoreKokkosView(xin, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
  // clang-format on
}
504
/* z[i] = (x,y_i) = y_i^H x */
PetscErrorCode VecMDot_SeqKokkos(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscFunctionBegin;
  // With no good reason, VecMultiDot_Private() performs much worse than VecMultiDot_Verbose() with HIP,
  // but they are on par with CUDA. Kokkos team is investigating this problem.
  // The two branches are functionally equivalent; only the Verbose one is currently enabled.
#if 0
  PetscCall(VecMultiDot_Private<ConjugateDotTag>(xin, nv, yin, z));
#else
  PetscCall(VecMultiDot_Verbose(xin, nv, yin, z));
#endif
  PetscCall(PetscLogGpuFlops(PetscMax(nv * (2.0 * xin->map->n - 1), 0.0))); /* 2n-1 flops per dot; clamp at 0 for empty vectors */
  PetscFunctionReturn(PETSC_SUCCESS);
}
519
/* z[i] = (x,y_i) = y_i^T x (unconjugated variant of VecMDot_SeqKokkos) */
PetscErrorCode VecMTDot_SeqKokkos(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscFunctionBegin;
  PetscCall(VecMultiDot_Private<TransposeDotTag>(xin, nv, yin, z));
  PetscCall(PetscLogGpuFlops(PetscMax(nv * (2.0 * xin->map->n - 1), 0.0))); /* 2n-1 flops per dot; clamp at 0 for empty vectors */
  PetscFunctionReturn(PETSC_SUCCESS);
}
528
529 // z[i] = (x,y_i) = y_i^H x OR y_i^T x
VecMultiDot_SeqKokkos_GEMV(PetscBool conjugate,Vec xin,PetscInt nv,const Vec yin[],PetscScalar * z_h)530 static PetscErrorCode VecMultiDot_SeqKokkos_GEMV(PetscBool conjugate, Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z_h)
531 {
532 PetscInt i, j, nfail;
533 ConstPetscScalarKokkosView xv, yfirst, ynext;
534 const PetscScalar *yarray;
535 PetscBool stop = PETSC_FALSE;
536 PetscScalar *z_d = nullptr;
537 const char *trans = conjugate ? "C" : "T";
538 PetscInt64 lda = 0;
539 PetscInt m, n = xin->map->n;
540
541 PetscFunctionBegin;
542 PetscCall(VecGetKokkosView(xin, &xv));
543 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
544 z_d = z_h;
545 #endif
546 i = nfail = 0;
547 while (i < nv) {
548 // search a sequence of vectors with a fixed stride
549 stop = PETSC_FALSE;
550 PetscCall(VecGetKokkosView(yin[i], &yfirst));
551 yarray = yfirst.data();
552 for (j = i + 1; j < nv; j++) {
553 PetscCall(VecGetKokkosView(yin[j], &ynext));
554 if (j == i + 1) {
555 lda = ynext.data() - yarray; // arbitrary ptrdiff could be very large
556 if (lda < 0 || lda - n > 64) stop = PETSC_TRUE; // avoid using arbitrary lda; 64 bytes are a big enough alignment in VecDuplicateVecs
557 } else if (lda * (j - i) != ynext.data() - yarray) { // not in the same stride? if so, stop searching
558 stop = PETSC_TRUE;
559 }
560 PetscCall(VecRestoreKokkosView(yin[j], &ynext));
561 if (stop) break;
562 }
563 PetscCall(VecRestoreKokkosView(yin[i], &yfirst));
564
565 // found m vectors yin[i..j) with a stride lda at address yarray
566 m = j - i;
567 if (m > 1) {
568 if (!z_d) {
569 if (nv > PetscScalarPoolSize) { // rare case
570 PetscScalarPoolSize = nv;
571 PetscCallCXX(PetscScalarPool = static_cast<PetscScalar *>(Kokkos::kokkos_realloc(PetscScalarPool, PetscScalarPoolSize)));
572 }
573 z_d = PetscScalarPool;
574 }
575 const auto &A = Kokkos::View<const PetscScalar **, Kokkos::LayoutLeft>(yarray, lda, m);
576 const auto &Y = Kokkos::subview(A, std::pair<PetscInt, PetscInt>(0, n), Kokkos::ALL);
577 auto zv = PetscScalarKokkosDualView(PetscScalarKokkosView(z_d + i, m), PetscScalarKokkosViewHost(z_h + i, m));
578 PetscCall(PetscLogGpuTimeBegin());
579 PetscCallCXX(KokkosBlas::gemv(PetscGetKokkosExecutionSpace(), trans, 1.0, Y, xv, 0.0, zv.view_device()));
580 PetscCall(PetscLogGpuTimeEnd());
581 PetscCallCXX(zv.modify_device());
582 PetscCall(KokkosDualViewSyncHost(zv, PetscGetKokkosExecutionSpace()));
583 PetscCall(PetscLogGpuFlops(PetscMax(m * (2.0 * n - 1), 0.0)));
584 } else {
585 // we only allow falling back on VecDot once, to avoid doing VecMultiDot via individual VecDots
586 if (nfail++ == 0) {
587 if (conjugate) PetscCall(VecDot_SeqKokkos(xin, yin[i], z_h + i));
588 else PetscCall(VecTDot_SeqKokkos(xin, yin[i], z_h + i));
589 } else break; // break the while loop
590 }
591 i = j;
592 }
593 PetscCall(VecRestoreKokkosView(xin, &xv));
594 if (i < nv) { // finish the remaining if any
595 if (conjugate) PetscCall(VecMDot_SeqKokkos(xin, nv - i, yin + i, z_h + i));
596 else PetscCall(VecMTDot_SeqKokkos(xin, nv - i, yin + i, z_h + i));
597 }
598 PetscFunctionReturn(PETSC_SUCCESS);
599 }
600
/* z[i] = y_i^H x via the GEMV-batched path (conjugated) */
PetscErrorCode VecMDot_SeqKokkos_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscFunctionBegin;
  PetscCall(VecMultiDot_SeqKokkos_GEMV(PETSC_TRUE, xin, nv, yin, z)); // conjugate
  PetscFunctionReturn(PETSC_SUCCESS);
}
607
/* z[i] = y_i^T x via the GEMV-batched path (unconjugated) */
PetscErrorCode VecMTDot_SeqKokkos_GEMV(Vec xin, PetscInt nv, const Vec yin[], PetscScalar *z)
{
  PetscFunctionBegin;
  PetscCall(VecMultiDot_SeqKokkos_GEMV(PETSC_FALSE, xin, nv, yin, z)); // transpose
  PetscFunctionReturn(PETSC_SUCCESS);
}
614
615 /* x[:] = alpha */
VecSet_SeqKokkos(Vec xin,PetscScalar alpha)616 PetscErrorCode VecSet_SeqKokkos(Vec xin, PetscScalar alpha)
617 {
618 PetscScalarKokkosView xv;
619 auto exec = PetscGetKokkosExecutionSpace();
620
621 PetscFunctionBegin;
622 PetscCall(PetscLogGpuTimeBegin());
623 PetscCall(VecGetKokkosViewWrite(xin, &xv));
624 PetscCallCXX(KokkosBlas::fill(exec, xv, alpha));
625 PetscCall(VecRestoreKokkosViewWrite(xin, &xv));
626 PetscCall(PetscLogGpuTimeEnd());
627 PetscFunctionReturn(PETSC_SUCCESS);
628 }
629
630 /* x = alpha x */
VecScale_SeqKokkos(Vec xin,PetscScalar alpha)631 PetscErrorCode VecScale_SeqKokkos(Vec xin, PetscScalar alpha)
632 {
633 auto exec = PetscGetKokkosExecutionSpace();
634
635 PetscFunctionBegin;
636 if (alpha == (PetscScalar)0.0) {
637 PetscCall(VecSet_SeqKokkos(xin, alpha));
638 } else if (alpha != (PetscScalar)1.0) {
639 PetscScalarKokkosView xv;
640
641 PetscCall(PetscLogGpuTimeBegin());
642 PetscCall(VecGetKokkosView(xin, &xv));
643 PetscCallCXX(KokkosBlas::scal(exec, xv, alpha, xv));
644 PetscCall(VecRestoreKokkosView(xin, &xv));
645 PetscCall(PetscLogGpuTimeEnd());
646 PetscCall(PetscLogGpuFlops(xin->map->n));
647 }
648 PetscFunctionReturn(PETSC_SUCCESS);
649 }
650
651 /* z = y^H x */
VecDot_SeqKokkos(Vec xin,Vec yin,PetscScalar * z)652 PetscErrorCode VecDot_SeqKokkos(Vec xin, Vec yin, PetscScalar *z)
653 {
654 ConstPetscScalarKokkosView xv, yv;
655 auto exec = PetscGetKokkosExecutionSpace();
656
657 PetscFunctionBegin;
658 PetscCall(PetscLogGpuTimeBegin());
659 PetscCall(VecGetKokkosView(xin, &xv));
660 PetscCall(VecGetKokkosView(yin, &yv));
661 PetscCallCXX(*z = KokkosBlas::dot(exec, yv, xv)); /* KokkosBlas::dot(a,b) takes conjugate of a */
662 PetscCall(VecRestoreKokkosView(xin, &xv));
663 PetscCall(VecRestoreKokkosView(yin, &yv));
664 PetscCall(PetscLogGpuTimeEnd());
665 PetscCall(PetscLogGpuFlops(PetscMax(2.0 * xin->map->n - 1, 0.0)));
666 PetscFunctionReturn(PETSC_SUCCESS);
667 }
668
/* y = x, where x is VECKOKKOS, but y may be not.
   Three cases: self-copy (no-op); y is also Kokkos (DualView deep copy after clearing y's
   sync state); y is a plain host vector (copy whichever of x's spaces holds the fresh data). */
PetscErrorCode VecCopy_SeqKokkos(Vec xin, Vec yin)
{
  auto exec = PetscGetKokkosExecutionSpace();

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  if (xin != yin) {
    Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(xin->spptr);
    if (yin->offloadmask == PETSC_OFFLOAD_KOKKOS) {
      /* y is also a VecKokkos */
      Vec_Kokkos *ykok = static_cast<Vec_Kokkos *>(yin->spptr);
      /* Kokkos rule: if x's host has newer data, it will copy to y's host view; otherwise to y's device view
         In case x's host is newer, y's device is newer, it will error (though should not, I think). So we just
         clear y's sync state.
      */
      ykok->v_dual.clear_sync_state();
      PetscCallCXX(Kokkos::deep_copy(exec, ykok->v_dual, xkok->v_dual)); // either cpu2cpu or gpu2cpu, so don't log it
    } else {
      /* y is a plain (host) vector: wrap its array in an unmanaged host view and copy into it */
      PetscScalar *yarray;
      PetscCall(VecGetArrayWrite(yin, &yarray));
      PetscScalarKokkosViewHost yv(yarray, yin->map->n);
      if (xkok->v_dual.need_sync_host()) { // x's device has newer data
        PetscCallCXX(Kokkos::deep_copy(exec, yv, xkok->v_dual.view_device())); // gpu2cpu
        PetscCallCXX(exec.fence()); // finish the deep copy
        PetscCall(PetscLogGpuToCpu(xkok->v_dual.extent(0) * sizeof(PetscScalar)));
      } else {
        PetscCallCXX(exec.fence()); // make sure xkok->v_dual.view_host() in ready for use on host; Kokkos might also call it inside deep_copy(). We do it here for safety.
        PetscCallCXX(Kokkos::deep_copy(exec, yv, xkok->v_dual.view_host())); // Host view to host view deep copy, done on host
      }
      PetscCall(VecRestoreArrayWrite(yin, &yarray));
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  PetscFunctionReturn(PETSC_SUCCESS);
}
705
706 /* y[i] <--> x[i] */
VecSwap_SeqKokkos(Vec xin,Vec yin)707 PetscErrorCode VecSwap_SeqKokkos(Vec xin, Vec yin)
708 {
709 PetscFunctionBegin;
710 if (xin != yin) {
711 PetscScalarKokkosView xv, yv;
712
713 PetscCall(PetscLogGpuTimeBegin());
714 PetscCall(VecGetKokkosView(xin, &xv));
715 PetscCall(VecGetKokkosView(yin, &yv));
716 PetscCallCXX(Kokkos::parallel_for(
717 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) {
718 PetscScalar tmp = xv(i);
719 xv(i) = yv(i);
720 yv(i) = tmp;
721 }));
722 PetscCall(VecRestoreKokkosView(xin, &xv));
723 PetscCall(VecRestoreKokkosView(yin, &yv));
724 PetscCall(PetscLogGpuTimeEnd());
725 }
726 PetscFunctionReturn(PETSC_SUCCESS);
727 }
728
729 /* w = alpha x + y */
VecWAXPY_SeqKokkos(Vec win,PetscScalar alpha,Vec xin,Vec yin)730 PetscErrorCode VecWAXPY_SeqKokkos(Vec win, PetscScalar alpha, Vec xin, Vec yin)
731 {
732 PetscFunctionBegin;
733 if (alpha == (PetscScalar)0.0) {
734 PetscCall(VecCopy_SeqKokkos(yin, win));
735 } else {
736 ConstPetscScalarKokkosView xv, yv;
737 PetscScalarKokkosView wv;
738
739 PetscCall(PetscLogGpuTimeBegin());
740 PetscCall(VecGetKokkosViewWrite(win, &wv));
741 PetscCall(VecGetKokkosView(xin, &xv));
742 PetscCall(VecGetKokkosView(yin, &yv));
743 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, win->map->n), KOKKOS_LAMBDA(const PetscInt &i) { wv(i) = alpha * xv(i) + yv(i); }));
744 PetscCall(VecRestoreKokkosView(xin, &xv));
745 PetscCall(VecRestoreKokkosView(yin, &yv));
746 PetscCall(VecRestoreKokkosViewWrite(win, &wv));
747 PetscCall(PetscLogGpuTimeEnd());
748 PetscCall(PetscLogGpuFlops(2.0 * win->map->n));
749 }
750 PetscFunctionReturn(PETSC_SUCCESS);
751 }
752
/* Functor computing yv(i) += sum_{j=0}^{ValueCount-1} a[j] * xv[j](i) for up to 8 input vectors.

   ValueCount is a compile-time constant so the inner loop can be unrolled by the compiler.
   The constructor always takes 8 coefficient/view pairs; only the first ValueCount are used,
   the rest are ignored (callers pass whatever happens to be in their scratch arrays).
*/
template <PetscInt ValueCount>
struct MAXPYFunctor {
  static_assert(ValueCount >= 1 && ValueCount <= 8, "ValueCount must be in [1, 8]");
  typedef ConstPetscScalarKokkosView::size_type size_type;

  PetscScalarKokkosView yv;         /* the vector being updated */
  PetscScalar a[8];                 /* coefficients; entries at index >= ValueCount are unused */
  ConstPetscScalarKokkosView xv[8]; /* input vectors; entries at index >= ValueCount are unused */

  MAXPYFunctor(PetscScalarKokkosView yv, PetscScalar a0, PetscScalar a1, PetscScalar a2, PetscScalar a3, PetscScalar a4, PetscScalar a5, PetscScalar a6, PetscScalar a7, ConstPetscScalarKokkosView xv0, ConstPetscScalarKokkosView xv1, ConstPetscScalarKokkosView xv2, ConstPetscScalarKokkosView xv3, ConstPetscScalarKokkosView xv4, ConstPetscScalarKokkosView xv5, ConstPetscScalarKokkosView xv6, ConstPetscScalarKokkosView xv7) :
    yv(yv)
  {
    a[0] = a0;
    a[1] = a1;
    a[2] = a2;
    a[3] = a3;
    a[4] = a4;
    a[5] = a5;
    a[6] = a6;
    a[7] = a7;
    xv[0] = xv0;
    xv[1] = xv1;
    xv[2] = xv2;
    xv[3] = xv3;
    xv[4] = xv4;
    xv[5] = xv5;
    xv[6] = xv6;
    xv[7] = xv7;
  }

  /* accumulate the first ValueCount terms into yv(i); loop bound is a compile-time constant */
  KOKKOS_INLINE_FUNCTION void operator()(const size_type i) const
  {
    for (PetscInt j = 0; j < ValueCount; ++j) yv(i) += a[j] * xv[j](i);
  }
};
788
/* y = y + sum alpha[i] x[i]

   The nv input vectors are processed in groups of 8 (the capacity of MAXPYFunctor) so that
   each kernel launch fuses up to 8 AXPY updates; the remainder (nv % 8) is dispatched to the
   matching compile-time instantiation of MAXPYFunctor.
*/
PetscErrorCode VecMAXPY_SeqKokkos(Vec yin, PetscInt nv, const PetscScalar *alpha, Vec *xin)
{
  PetscInt i, j, cur = 0, ngroup = nv / 8, rem = nv % 8, N = yin->map->n;
  PetscScalarKokkosView yv;
  PetscScalar a[8];
  ConstPetscScalarKokkosView xv[8];
  Kokkos::RangePolicy<> policy(PetscGetKokkosExecutionSpace(), 0, N);

  PetscFunctionBegin;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCall(VecGetKokkosView(yin, &yv));
  for (i = 0; i < ngroup; i++) { /* 8 x's per group */
    for (j = 0; j < 8; j++) {    /* Fill the parameters */
      a[j] = alpha[cur + j];
      PetscCall(VecGetKokkosView(xin[cur + j], &xv[j]));
    }
    MAXPYFunctor<8> maxpy(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]);
    PetscCallCXX(Kokkos::parallel_for(policy, maxpy));
    for (j = 0; j < 8; j++) PetscCall(VecRestoreKokkosView(xin[cur + j], &xv[j]));
    cur += 8;
  }

  if (rem) { /* The remaining 1..7 vectors; a[]/xv[] entries beyond rem are stale but unused by MAXPYFunctor<rem> */
    for (j = 0; j < rem; j++) {
      a[j] = alpha[cur + j];
      PetscCall(VecGetKokkosView(xin[cur + j], &xv[j]));
    }
    // clang-format off
    switch (rem) {
    case 1: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<1>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 2: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<2>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 3: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<3>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 4: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<4>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 5: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<5>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 6: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<6>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    case 7: PetscCallCXX(Kokkos::parallel_for(policy, MAXPYFunctor<7>(yv, a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7], xv[0], xv[1], xv[2], xv[3], xv[4], xv[5], xv[6], xv[7]))); break;
    }
    // clang-format on
    for (j = 0; j < rem; j++) PetscCall(VecRestoreKokkosView(xin[cur + j], &xv[j]));
  }
  PetscCall(VecRestoreKokkosView(yin, &yv));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(nv * 2.0 * yin->map->n)); /* one multiply-add per entry per input vector */
  PetscFunctionReturn(PETSC_SUCCESS);
}
835
836 /* y = y + sum alpha[i] x[i] */
VecMAXPY_SeqKokkos_GEMV(Vec yin,PetscInt nv,const PetscScalar * a_h,Vec * xin)837 PetscErrorCode VecMAXPY_SeqKokkos_GEMV(Vec yin, PetscInt nv, const PetscScalar *a_h, Vec *xin)
838 {
839 const PetscInt n = yin->map->n;
840 PetscInt i, j, nfail;
841 PetscScalarKokkosView yv;
842 ConstPetscScalarKokkosView xfirst, xnext;
843 PetscBool stop = PETSC_FALSE;
844 PetscInt lda = 0, m;
845 const PetscScalar *xarray;
846 PetscScalar *a_d = nullptr;
847
848 PetscFunctionBegin;
849 PetscCall(PetscLogGpuTimeBegin());
850 PetscCall(VecGetKokkosView(yin, &yv));
851 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
852 a_d = const_cast<PetscScalar *>(a_h);
853 #endif
854 i = nfail = 0;
855 while (i < nv) {
856 stop = PETSC_FALSE;
857 PetscCall(VecGetKokkosView(xin[i], &xfirst));
858 xarray = xfirst.data();
859 for (j = i + 1; j < nv; j++) {
860 PetscCall(VecGetKokkosView(xin[j], &xnext));
861 if (j == i + 1) {
862 lda = xnext.data() - xfirst.data();
863 if (lda < 0 || lda - n > 64) stop = PETSC_TRUE; // avoid using arbitrary lda; 64 bytes are a big enough alignment in VecDuplicateVecs
864 } else if (lda * (j - i) != xnext.data() - xarray) { // not in the same stride? if so, stop here
865 stop = PETSC_TRUE;
866 }
867 PetscCall(VecRestoreKokkosView(xin[j], &xnext));
868 if (stop) break;
869 }
870 PetscCall(VecRestoreKokkosView(xin[i], &xfirst));
871
872 m = j - i;
873 if (m > 1) {
874 if (!a_d) {
875 if (nv > PetscScalarPoolSize) { // rare case
876 PetscScalarPoolSize = nv;
877 PetscCallCXX(PetscScalarPool = static_cast<PetscScalar *>(Kokkos::kokkos_realloc(PetscScalarPool, PetscScalarPoolSize)));
878 }
879 a_d = PetscScalarPool;
880 }
881 const auto &B = Kokkos::View<const PetscScalar **, Kokkos::LayoutLeft>(xarray, lda, m);
882 const auto &A = Kokkos::subview(B, std::pair<PetscInt, PetscInt>(0, n), Kokkos::ALL);
883 auto av = PetscScalarKokkosDualView(PetscScalarKokkosView(a_d + i, m), PetscScalarKokkosViewHost(const_cast<PetscScalar *>(a_h) + i, m));
884 av.modify_host();
885 PetscCall(KokkosDualViewSyncDevice(av, PetscGetKokkosExecutionSpace()));
886 PetscCallCXX(KokkosBlas::gemv(PetscGetKokkosExecutionSpace(), "N", 1.0, A, av.view_device(), 1.0, yv));
887 PetscCall(PetscLogGpuFlops(m * 2.0 * n));
888 } else {
889 // we only allow falling back on VecAXPY once
890 if (nfail++ == 0) PetscCall(VecAXPY_SeqKokkos(yin, a_h[i], xin[i]));
891 else break; // break the while loop
892 }
893 i = j;
894 }
895 // finish the remaining if any
896 PetscCall(VecRestoreKokkosView(yin, &yv));
897 if (i < nv) PetscCall(VecMAXPY_SeqKokkos(yin, nv - i, a_h + i, xin + i));
898 PetscCall(PetscLogGpuTimeEnd());
899 PetscFunctionReturn(PETSC_SUCCESS);
900 }
901
902 /* y = alpha x + beta y */
VecAXPBY_SeqKokkos(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)903 PetscErrorCode VecAXPBY_SeqKokkos(Vec yin, PetscScalar alpha, PetscScalar beta, Vec xin)
904 {
905 PetscBool xiskok, yiskok;
906
907 PetscFunctionBegin;
908 PetscCall(PetscObjectTypeCompareAny((PetscObject)xin, &xiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
909 PetscCall(PetscObjectTypeCompareAny((PetscObject)yin, &yiskok, VECSEQKOKKOS, VECMPIKOKKOS, ""));
910 if (xiskok && yiskok) {
911 ConstPetscScalarKokkosView xv;
912 PetscScalarKokkosView yv;
913
914 PetscCall(PetscLogGpuTimeBegin());
915 PetscCall(VecGetKokkosView(xin, &xv));
916 PetscCall(VecGetKokkosView(yin, &yv));
917 PetscCallCXX(KokkosBlas::axpby(PetscGetKokkosExecutionSpace(), alpha, xv, beta, yv));
918 PetscCall(VecRestoreKokkosView(xin, &xv));
919 PetscCall(VecRestoreKokkosView(yin, &yv));
920 PetscCall(PetscLogGpuTimeEnd());
921 if (alpha == (PetscScalar)0.0 || beta == (PetscScalar)0.0) {
922 PetscCall(PetscLogGpuFlops(xin->map->n));
923 } else if (beta == (PetscScalar)1.0 || alpha == (PetscScalar)1.0) {
924 PetscCall(PetscLogGpuFlops(2.0 * xin->map->n));
925 } else {
926 PetscCall(PetscLogGpuFlops(3.0 * xin->map->n));
927 }
928 } else {
929 PetscCall(VecAXPBY_Seq(yin, alpha, beta, xin));
930 }
931 PetscFunctionReturn(PETSC_SUCCESS);
932 }
933
934 /* z = alpha x + beta y + gamma z */
VecAXPBYPCZ_SeqKokkos(Vec zin,PetscScalar alpha,PetscScalar beta,PetscScalar gamma,Vec xin,Vec yin)935 PetscErrorCode VecAXPBYPCZ_SeqKokkos(Vec zin, PetscScalar alpha, PetscScalar beta, PetscScalar gamma, Vec xin, Vec yin)
936 {
937 ConstPetscScalarKokkosView xv, yv;
938 PetscScalarKokkosView zv;
939 Kokkos::RangePolicy<> policy(PetscGetKokkosExecutionSpace(), 0, zin->map->n);
940
941 PetscFunctionBegin;
942 PetscCall(PetscLogGpuTimeBegin());
943 PetscCall(VecGetKokkosView(zin, &zv));
944 PetscCall(VecGetKokkosView(xin, &xv));
945 PetscCall(VecGetKokkosView(yin, &yv));
946 if (gamma == (PetscScalar)0.0) { // a common case
947 if (alpha == -beta) {
948 PetscCallCXX(Kokkos::parallel_for( // a common case
949 policy, KOKKOS_LAMBDA(const PetscInt &i) { zv(i) = alpha * (xv(i) - yv(i)); }));
950 } else {
951 PetscCallCXX(Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const PetscInt &i) { zv(i) = alpha * xv(i) + beta * yv(i); }));
952 }
953 } else {
954 PetscCallCXX(KokkosBlas::update(PetscGetKokkosExecutionSpace(), alpha, xv, beta, yv, gamma, zv));
955 }
956 PetscCall(VecRestoreKokkosView(xin, &xv));
957 PetscCall(VecRestoreKokkosView(yin, &yv));
958 PetscCall(VecRestoreKokkosView(zin, &zv));
959 PetscCall(PetscLogGpuTimeEnd());
960 PetscCall(PetscLogGpuFlops(zin->map->n * 5.0));
961 PetscFunctionReturn(PETSC_SUCCESS);
962 }
963
964 /* w = x*y. Any subset of the x, y, and w may be the same vector.
965
966 w is of type VecKokkos, but x, y may be not.
967 */
VecPointwiseMult_SeqKokkos(Vec win,Vec xin,Vec yin)968 PetscErrorCode VecPointwiseMult_SeqKokkos(Vec win, Vec xin, Vec yin)
969 {
970 PetscInt n;
971
972 PetscFunctionBegin;
973 PetscCall(PetscLogGpuTimeBegin());
974 PetscCall(VecGetLocalSize(win, &n));
975 if (xin->offloadmask != PETSC_OFFLOAD_KOKKOS || yin->offloadmask != PETSC_OFFLOAD_KOKKOS) {
976 PetscScalarKokkosViewHost wv;
977 const PetscScalar *xp, *yp;
978 PetscCall(VecGetArrayRead(xin, &xp));
979 PetscCall(VecGetArrayRead(yin, &yp));
980 PetscCall(VecGetKokkosViewWrite(win, &wv));
981
982 ConstPetscScalarKokkosViewHost xv(xp, n), yv(yp, n);
983 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n), KOKKOS_LAMBDA(const PetscInt &i) { wv(i) = xv(i) * yv(i); }));
984
985 PetscCall(VecRestoreArrayRead(xin, &xp));
986 PetscCall(VecRestoreArrayRead(yin, &yp));
987 PetscCall(VecRestoreKokkosViewWrite(win, &wv));
988 } else {
989 ConstPetscScalarKokkosView xv, yv;
990 PetscScalarKokkosView wv;
991
992 PetscCall(VecGetKokkosViewWrite(win, &wv));
993 PetscCall(VecGetKokkosView(xin, &xv));
994 PetscCall(VecGetKokkosView(yin, &yv));
995 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt &i) { wv(i) = xv(i) * yv(i); }));
996 PetscCall(VecRestoreKokkosView(yin, &yv));
997 PetscCall(VecRestoreKokkosView(xin, &xv));
998 PetscCall(VecRestoreKokkosViewWrite(win, &wv));
999 }
1000 PetscCall(PetscLogGpuTimeEnd());
1001 PetscCall(PetscLogGpuFlops(n));
1002 PetscFunctionReturn(PETSC_SUCCESS);
1003 }
1004
1005 /* w = x/y */
VecPointwiseDivide_SeqKokkos(Vec win,Vec xin,Vec yin)1006 PetscErrorCode VecPointwiseDivide_SeqKokkos(Vec win, Vec xin, Vec yin)
1007 {
1008 PetscInt n;
1009
1010 PetscFunctionBegin;
1011 PetscCall(PetscLogGpuTimeBegin());
1012 PetscCall(VecGetLocalSize(win, &n));
1013 if (xin->offloadmask != PETSC_OFFLOAD_KOKKOS || yin->offloadmask != PETSC_OFFLOAD_KOKKOS) {
1014 PetscScalarKokkosViewHost wv;
1015 const PetscScalar *xp, *yp;
1016 PetscCall(VecGetArrayRead(xin, &xp));
1017 PetscCall(VecGetArrayRead(yin, &yp));
1018 PetscCall(VecGetKokkosViewWrite(win, &wv));
1019
1020 ConstPetscScalarKokkosViewHost xv(xp, n), yv(yp, n);
1021 PetscCallCXX(Kokkos::parallel_for(
1022 Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, n), KOKKOS_LAMBDA(const PetscInt &i) {
1023 if (yv(i) != 0.0) wv(i) = xv(i) / yv(i);
1024 else wv(i) = 0.0;
1025 }));
1026
1027 PetscCall(VecRestoreArrayRead(xin, &xp));
1028 PetscCall(VecRestoreArrayRead(yin, &yp));
1029 PetscCall(VecRestoreKokkosViewWrite(win, &wv));
1030 } else {
1031 ConstPetscScalarKokkosView xv, yv;
1032 PetscScalarKokkosView wv;
1033
1034 PetscCall(VecGetKokkosViewWrite(win, &wv));
1035 PetscCall(VecGetKokkosView(xin, &xv));
1036 PetscCall(VecGetKokkosView(yin, &yv));
1037 PetscCallCXX(Kokkos::parallel_for(
1038 Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n), KOKKOS_LAMBDA(const PetscInt &i) {
1039 if (yv(i) != 0.0) wv(i) = xv(i) / yv(i);
1040 else wv(i) = 0.0;
1041 }));
1042 PetscCall(VecRestoreKokkosView(yin, &yv));
1043 PetscCall(VecRestoreKokkosView(xin, &xv));
1044 PetscCall(VecRestoreKokkosViewWrite(win, &wv));
1045 }
1046 PetscCall(PetscLogGpuTimeEnd());
1047 PetscCall(PetscLogGpuFlops(win->map->n));
1048 PetscFunctionReturn(PETSC_SUCCESS);
1049 }
1050
VecNorm_SeqKokkos(Vec xin,NormType type,PetscReal * z)1051 PetscErrorCode VecNorm_SeqKokkos(Vec xin, NormType type, PetscReal *z)
1052 {
1053 const PetscInt n = xin->map->n;
1054 ConstPetscScalarKokkosView xv;
1055 auto exec = PetscGetKokkosExecutionSpace();
1056
1057 PetscFunctionBegin;
1058 if (type == NORM_1_AND_2) {
1059 PetscCall(VecNorm_SeqKokkos(xin, NORM_1, z));
1060 PetscCall(VecNorm_SeqKokkos(xin, NORM_2, z + 1));
1061 } else {
1062 PetscCall(PetscLogGpuTimeBegin());
1063 PetscCall(VecGetKokkosView(xin, &xv));
1064 if (type == NORM_2 || type == NORM_FROBENIUS) {
1065 PetscCallCXX(*z = KokkosBlas::nrm2(exec, xv));
1066 PetscCall(PetscLogGpuFlops(PetscMax(2.0 * n - 1, 0.0)));
1067 } else if (type == NORM_1) {
1068 PetscCallCXX(*z = KokkosBlas::nrm1(exec, xv));
1069 PetscCall(PetscLogGpuFlops(PetscMax(n - 1.0, 0.0)));
1070 } else if (type == NORM_INFINITY) {
1071 PetscCallCXX(*z = KokkosBlas::nrminf(exec, xv));
1072 }
1073 PetscCall(VecRestoreKokkosView(xin, &xv));
1074 PetscCall(PetscLogGpuTimeEnd());
1075 }
1076 PetscFunctionReturn(PETSC_SUCCESS);
1077 }
1078
VecErrorWeightedNorms_SeqKokkos(Vec U,Vec Y,Vec E,NormType wnormtype,PetscReal atol,Vec vatol,PetscReal rtol,Vec vrtol,PetscReal ignore_max,PetscReal * norm,PetscInt * norm_loc,PetscReal * norma,PetscInt * norma_loc,PetscReal * normr,PetscInt * normr_loc)1079 PetscErrorCode VecErrorWeightedNorms_SeqKokkos(Vec U, Vec Y, Vec E, NormType wnormtype, PetscReal atol, Vec vatol, PetscReal rtol, Vec vrtol, PetscReal ignore_max, PetscReal *norm, PetscInt *norm_loc, PetscReal *norma, PetscInt *norma_loc, PetscReal *normr, PetscInt *normr_loc)
1080 {
1081 ConstPetscScalarKokkosView u, y, erra, atola, rtola;
1082 PetscBool has_E = PETSC_FALSE, has_atol = PETSC_FALSE, has_rtol = PETSC_FALSE;
1083 PetscInt n, n_loc = 0, na_loc = 0, nr_loc = 0;
1084 PetscReal nrm = 0, nrma = 0, nrmr = 0;
1085
1086 PetscFunctionBegin;
1087 PetscCall(VecGetLocalSize(U, &n));
1088 PetscCall(VecGetKokkosView(U, &u));
1089 PetscCall(VecGetKokkosView(Y, &y));
1090 if (E) {
1091 PetscCall(VecGetKokkosView(E, &erra));
1092 has_E = PETSC_TRUE;
1093 }
1094 if (vatol) {
1095 PetscCall(VecGetKokkosView(vatol, &atola));
1096 has_atol = PETSC_TRUE;
1097 }
1098 if (vrtol) {
1099 PetscCall(VecGetKokkosView(vrtol, &rtola));
1100 has_rtol = PETSC_TRUE;
1101 }
1102
1103 if (wnormtype == NORM_INFINITY) {
1104 PetscCallCXX(Kokkos::parallel_reduce(
1105 "VecErrorWeightedNorms_INFINITY", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n),
1106 KOKKOS_LAMBDA(const PetscInt &i, PetscReal &l_nrm, PetscReal &l_nrma, PetscReal &l_nrmr, PetscInt &l_n_loc, PetscInt &l_na_loc, PetscInt &l_nr_loc) {
1107 PetscReal err, tol, tola, tolr, l_atol, l_rtol;
1108 if (PetscAbsScalar(y(i)) >= ignore_max && PetscAbsScalar(u(i)) >= ignore_max) {
1109 l_atol = has_atol ? PetscRealPart(atola(i)) : atol;
1110 l_rtol = has_rtol ? PetscRealPart(rtola(i)) : rtol;
1111 err = has_E ? PetscAbsScalar(erra(i)) : PetscAbsScalar(y(i) - u(i));
1112 tola = l_atol;
1113 tolr = l_rtol * PetscMax(PetscAbsScalar(u(i)), PetscAbsScalar(y(i)));
1114 tol = tola + tolr;
1115 if (tola > 0.) {
1116 l_nrma = PetscMax(l_nrma, err / tola);
1117 l_na_loc++;
1118 }
1119 if (tolr > 0.) {
1120 l_nrmr = PetscMax(l_nrmr, err / tolr);
1121 l_nr_loc++;
1122 }
1123 if (tol > 0.) {
1124 l_nrm = PetscMax(l_nrm, err / tol);
1125 l_n_loc++;
1126 }
1127 }
1128 },
1129 Kokkos::Max<PetscReal>(nrm), Kokkos::Max<PetscReal>(nrma), Kokkos::Max<PetscReal>(nrmr), n_loc, na_loc, nr_loc));
1130 } else {
1131 PetscCallCXX(Kokkos::parallel_reduce(
1132 "VecErrorWeightedNorms_NORM_2", Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, n),
1133 KOKKOS_LAMBDA(const PetscInt &i, PetscReal &l_nrm, PetscReal &l_nrma, PetscReal &l_nrmr, PetscInt &l_n_loc, PetscInt &l_na_loc, PetscInt &l_nr_loc) {
1134 PetscReal err, tol, tola, tolr, l_atol, l_rtol;
1135 if (PetscAbsScalar(y(i)) >= ignore_max && PetscAbsScalar(u(i)) >= ignore_max) {
1136 l_atol = has_atol ? PetscRealPart(atola(i)) : atol;
1137 l_rtol = has_rtol ? PetscRealPart(rtola(i)) : rtol;
1138 err = has_E ? PetscAbsScalar(erra(i)) : PetscAbsScalar(y(i) - u(i));
1139 tola = l_atol;
1140 tolr = l_rtol * PetscMax(PetscAbsScalar(u(i)), PetscAbsScalar(y(i)));
1141 tol = tola + tolr;
1142 if (tola > 0.) {
1143 l_nrma += PetscSqr(err / tola);
1144 l_na_loc++;
1145 }
1146 if (tolr > 0.) {
1147 l_nrmr += PetscSqr(err / tolr);
1148 l_nr_loc++;
1149 }
1150 if (tol > 0.) {
1151 l_nrm += PetscSqr(err / tol);
1152 l_n_loc++;
1153 }
1154 }
1155 },
1156 nrm, nrma, nrmr, n_loc, na_loc, nr_loc));
1157 }
1158
1159 if (wnormtype == NORM_2) {
1160 *norm = PetscSqrtReal(nrm);
1161 *norma = PetscSqrtReal(nrma);
1162 *normr = PetscSqrtReal(nrmr);
1163 } else {
1164 *norm = nrm;
1165 *norma = nrma;
1166 *normr = nrmr;
1167 }
1168 *norm_loc = n_loc;
1169 *norma_loc = na_loc;
1170 *normr_loc = nr_loc;
1171
1172 if (E) PetscCall(VecRestoreKokkosView(E, &erra));
1173 if (vatol) PetscCall(VecRestoreKokkosView(vatol, &atola));
1174 if (vrtol) PetscCall(VecRestoreKokkosView(vrtol, &rtola));
1175 PetscCall(VecRestoreKokkosView(U, &u));
1176 PetscCall(VecRestoreKokkosView(Y, &y));
1177 PetscFunctionReturn(PETSC_SUCCESS);
1178 }
1179
/* A functor for DotNorm2 so that we can compute dp and nm in one kernel */
struct DotNorm2 {
  typedef PetscScalar value_type[]; /* Kokkos array-reduction: the result is a pair of scalars */
  typedef ConstPetscScalarKokkosView::size_type size_type;

  size_type value_count;               /* length of the reduction result; always 2 (required by Kokkos for runtime-sized array reductions) */
  ConstPetscScalarKokkosView xv_, yv_; /* first and second vectors in VecDotNorm2. The order matters. */

  DotNorm2(ConstPetscScalarKokkosView &xv, ConstPetscScalarKokkosView &yv) : value_count(2), xv_(xv), yv_(yv) { }

  /* per-entry contribution: result[0] accumulates conj(y)*x, result[1] accumulates conj(y)*y */
  KOKKOS_INLINE_FUNCTION void operator()(const size_type i, value_type result) const
  {
    result[0] += PetscConj(yv_(i)) * xv_(i);
    result[1] += PetscConj(yv_(i)) * yv_(i);
  }

  /* combine partial results from different threads */
  KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const
  {
    dst[0] += src[0];
    dst[1] += src[1];
  }

  /* identity element of the reduction */
  KOKKOS_INLINE_FUNCTION void init(value_type result) const
  {
    result[0] = 0.0;
    result[1] = 0.0;
  }
};
1208
1209 /* dp = y^H x, nm = y^H y */
VecDotNorm2_SeqKokkos(Vec xin,Vec yin,PetscScalar * dp,PetscScalar * nm)1210 PetscErrorCode VecDotNorm2_SeqKokkos(Vec xin, Vec yin, PetscScalar *dp, PetscScalar *nm)
1211 {
1212 ConstPetscScalarKokkosView xv, yv;
1213 PetscScalar result[2];
1214
1215 PetscFunctionBegin;
1216 PetscCall(PetscLogGpuTimeBegin());
1217 PetscCall(VecGetKokkosView(xin, &xv));
1218 PetscCall(VecGetKokkosView(yin, &yv));
1219 DotNorm2 dn(xv, yv);
1220 PetscCallCXX(Kokkos::parallel_reduce(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), dn, result));
1221 *dp = result[0];
1222 *nm = result[1];
1223 PetscCall(VecRestoreKokkosView(yin, &yv));
1224 PetscCall(VecRestoreKokkosView(xin, &xv));
1225 PetscCall(PetscLogGpuTimeEnd());
1226 PetscCall(PetscLogGpuFlops(4.0 * xin->map->n));
1227 PetscFunctionReturn(PETSC_SUCCESS);
1228 }
1229
VecConjugate_SeqKokkos(Vec xin)1230 PetscErrorCode VecConjugate_SeqKokkos(Vec xin)
1231 {
1232 PetscScalarKokkosView xv;
1233
1234 PetscFunctionBegin;
1235 PetscCall(PetscLogGpuTimeBegin());
1236 PetscCall(VecGetKokkosView(xin, &xv));
1237 PetscCallCXX(Kokkos::parallel_for(Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, xin->map->n), KOKKOS_LAMBDA(const PetscInt &i) { xv(i) = PetscConj(xv(i)); }));
1238 PetscCall(VecRestoreKokkosView(xin, &xv));
1239 PetscCall(PetscLogGpuTimeEnd());
1240 PetscFunctionReturn(PETSC_SUCCESS);
1241 }
1242
/* Temporarily replace the array in vin with a[]. Return to the original array with a call to VecResetArray() */
PetscErrorCode VecPlaceArray_SeqKokkos(Vec vin, const PetscScalar *a)
{
  Vec_Seq *vecseq = (Vec_Seq *)vin->data;
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(vin->spptr);

  PetscFunctionBegin;
  PetscCall(VecPlaceArray_Seq(vin, a));                               /* stashes the old host array and installs a[] into vecseq */
  PetscCall(veckok->UpdateArray<HostMirrorMemorySpace>(vecseq->array)); /* re-point the dual view's host side at the newly placed array */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1254
VecResetArray_SeqKokkos(Vec vin)1255 PetscErrorCode VecResetArray_SeqKokkos(Vec vin)
1256 {
1257 Vec_Seq *vecseq = (Vec_Seq *)vin->data;
1258 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(vin->spptr);
1259
1260 PetscFunctionBegin;
1261 /* User wants to unhook the provided host array. Sync it so that user can get the latest */
1262 PetscCall(KokkosDualViewSyncHost(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1263 PetscCall(VecResetArray_Seq(vin)); /* Swap back the old host array, assuming its has the latest value */
1264 PetscCall(veckok->UpdateArray<HostMirrorMemorySpace>(vecseq->array));
1265 PetscFunctionReturn(PETSC_SUCCESS);
1266 }
1267
1268 /*@C
1269 VecKokkosPlaceArray - Allows one to replace the device array in a VecKokkos vector with a
1270 device array provided by the user. This is useful to avoid copying an array into a vector.
1271
1272 Logically Collective; No Fortran Support
1273
1274 Input Parameters:
1275 + v - the VecKokkos vector
1276 - a - the device array
1277
1278 Level: developer
1279
1280 Notes:
1281
You can return to the original array with a call to `VecKokkosResetArray()`. `v` does not take
ownership of `a` in any way.

The user manages the device array so PETSc doesn't care how it was allocated.

The user must free `a` themselves but be careful not to
do so before the vector has either been destroyed, had its original array restored with
`VecKokkosResetArray()` or permanently replaced with `VecReplaceArray()`.
1290
1291 .seealso: [](ch_vectors), `Vec`, `VecGetArray()`, `VecKokkosResetArray()`, `VecReplaceArray()`, `VecResetArray()`
1292 @*/
VecKokkosPlaceArray(Vec v,PetscScalar * a)1293 PetscErrorCode VecKokkosPlaceArray(Vec v, PetscScalar *a)
1294 {
1295 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1296
1297 PetscFunctionBegin;
1298 VecErrorIfNotKokkos(v);
1299 // Sync the old device view before replacing it; so that when it is put back, it has the saved value.
1300 PetscCall(KokkosDualViewSyncDevice(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1301 PetscCallCXX(veckok->unplaced_d = veckok->v_dual.view_device());
1302 // We assume a[] contains the latest data and discard the vector's old sync state
1303 PetscCall(veckok->UpdateArray<DefaultMemorySpace>(a));
1304 PetscFunctionReturn(PETSC_SUCCESS);
1305 }
1306
1307 /*@C
1308 VecKokkosResetArray - Resets a vector to use its default memory. Call this
1309 after the use of `VecKokkosPlaceArray()`.
1310
1311 Not Collective
1312
1313 Input Parameter:
1314 . v - the vector
1315
1316 Level: developer
1317
1318 Notes:
1319
After the call, the original array placed in with `VecKokkosPlaceArray()` will contain the latest value of the vector.
Note that device kernels are asynchronous. Users are responsible for syncing the device if they wish to have immediate access
to the data in the array. Also, after the call, `v` will contain whatever data it held before `VecKokkosPlaceArray()`.
1323
1324 .seealso: [](ch_vectors), `Vec`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecKokkosPlaceArray()`
1325 @*/
VecKokkosResetArray(Vec v)1326 PetscErrorCode VecKokkosResetArray(Vec v)
1327 {
1328 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1329
1330 PetscFunctionBegin;
1331 VecErrorIfNotKokkos(v);
1332 // User wants to unhook the provided device array. Sync it so that user can get the latest
1333 PetscCall(KokkosDualViewSyncDevice(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1334 // Put the unplaced device array back, and set an appropriate modify flag
1335 PetscCall(veckok->UpdateArray<DefaultMemorySpace>(veckok->unplaced_d.data()));
1336 PetscFunctionReturn(PETSC_SUCCESS);
1337 }
1338
/* Replace the array in vin with a[] that must be allocated by PetscMalloc. a[] is owned by vin afterwards. */
PetscErrorCode VecReplaceArray_SeqKokkos(Vec vin, const PetscScalar *a)
{
  Vec_Seq *vecseq = (Vec_Seq *)vin->data;
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(vin->spptr);

  PetscFunctionBegin;
  /* Make sure the users array has the latest values; only needed when the current array is user-provided
     (i.e., differs from the one the vector allocated itself) */
  if (vecseq->array != vecseq->array_allocated) PetscCall(KokkosDualViewSyncHost(veckok->v_dual, PetscGetKokkosExecutionSpace()));
  PetscCall(VecReplaceArray_Seq(vin, a));
  PetscCall(veckok->UpdateArray<HostMirrorMemorySpace>(vecseq->array)); /* re-point the dual view's host side at the replacement */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1352
/* Maps the local portion of vector v into vector w */
PetscErrorCode VecGetLocalVector_SeqKokkos(Vec v, Vec w)
{
  Vec_Seq *vecseq = static_cast<Vec_Seq *>(w->data);
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(w->spptr);

  PetscFunctionBegin;
  PetscCheckTypeNames(w, VECSEQKOKKOS, VECMPIKOKKOS);
  /* Destroy w->data, w->spptr so that w can alias v's below */
  if (vecseq) {
    PetscCall(PetscFree(vecseq->array_allocated));
    PetscCall(PetscFree(w->data));
  }
  delete veckok;

  /* Replace with v's; VecRestoreLocalVector_SeqKokkos() hands them back */
  w->data = v->data;
  w->spptr = v->spptr;
  PetscCall(PetscObjectStateIncrease((PetscObject)w)); /* w's contents changed, bump its state */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1374
VecRestoreLocalVector_SeqKokkos(Vec v,Vec w)1375 PetscErrorCode VecRestoreLocalVector_SeqKokkos(Vec v, Vec w)
1376 {
1377 PetscFunctionBegin;
1378 PetscCheckTypeNames(w, VECSEQKOKKOS, VECMPIKOKKOS);
1379 v->data = w->data;
1380 v->spptr = w->spptr;
1381 PetscCall(PetscObjectStateIncrease((PetscObject)v));
1382 /* TODO: need to think if setting w->data/spptr to NULL is safe */
1383 w->data = NULL;
1384 w->spptr = NULL;
1385 PetscFunctionReturn(PETSC_SUCCESS);
1386 }
1387
VecGetArray_SeqKokkos(Vec v,PetscScalar ** a)1388 PetscErrorCode VecGetArray_SeqKokkos(Vec v, PetscScalar **a)
1389 {
1390 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1391
1392 PetscFunctionBegin;
1393 PetscCall(KokkosDualViewSyncHost(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1394 *a = *((PetscScalar **)v->data);
1395 PetscFunctionReturn(PETSC_SUCCESS);
1396 }
1397
VecRestoreArray_SeqKokkos(Vec v,PetscScalar ** a)1398 PetscErrorCode VecRestoreArray_SeqKokkos(Vec v, PetscScalar **a)
1399 {
1400 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1401
1402 PetscFunctionBegin;
1403 PetscCallCXX(veckok->v_dual.modify_host());
1404 PetscFunctionReturn(PETSC_SUCCESS);
1405 }
1406
/* Get array on host to overwrite, so no need to sync host. In VecRestoreArrayWrite() we will mark host is modified. */
PetscErrorCode VecGetArrayWrite_SeqKokkos(Vec v, PetscScalar **a)
{
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);

  PetscFunctionBegin;
  PetscCallCXX(veckok->v_dual.clear_sync_state()); /* the data will be overwritten, so drop any pending sync */
  *a = veckok->v_dual.view_host().data();
  PetscFunctionReturn(PETSC_SUCCESS);
}
1417
VecGetArrayAndMemType_SeqKokkos(Vec v,PetscScalar ** a,PetscMemType * mtype)1418 PetscErrorCode VecGetArrayAndMemType_SeqKokkos(Vec v, PetscScalar **a, PetscMemType *mtype)
1419 {
1420 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1421
1422 PetscFunctionBegin;
1423 /* Always return up-to-date in the default memory space */
1424 PetscCall(KokkosDualViewSyncDevice(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1425 *a = veckok->v_dual.view_device().data();
1426 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; // Could be PETSC_MEMTYPE_HOST when Kokkos was not configured with cuda etc.
1427 PetscFunctionReturn(PETSC_SUCCESS);
1428 }
1429
VecRestoreArrayAndMemType_SeqKokkos(Vec v,PetscScalar ** a)1430 PetscErrorCode VecRestoreArrayAndMemType_SeqKokkos(Vec v, PetscScalar **a)
1431 {
1432 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1433
1434 PetscFunctionBegin;
1435 if (PetscMemTypeHost(PETSC_MEMTYPE_KOKKOS)) {
1436 PetscCallCXX(veckok->v_dual.modify_host());
1437 } else {
1438 PetscCallCXX(veckok->v_dual.modify_device());
1439 }
1440 PetscFunctionReturn(PETSC_SUCCESS);
1441 }
1442
VecGetArrayWriteAndMemType_SeqKokkos(Vec v,PetscScalar ** a,PetscMemType * mtype)1443 PetscErrorCode VecGetArrayWriteAndMemType_SeqKokkos(Vec v, PetscScalar **a, PetscMemType *mtype)
1444 {
1445 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1446
1447 PetscFunctionBegin;
1448 // Since the data will be overwritten, we clear the sync state to suppress potential memory copying in sync'ing
1449 PetscCallCXX(veckok->v_dual.clear_sync_state()); // So that in restore, we can safely modify_device()
1450 PetscCall(KokkosDualViewSyncDevice(veckok->v_dual, PetscGetKokkosExecutionSpace()));
1451 *a = veckok->v_dual.view_device().data();
1452 if (mtype) *mtype = PETSC_MEMTYPE_KOKKOS; // Could be PETSC_MEMTYPE_HOST when Kokkos was not configured with cuda etc.
1453 PetscFunctionReturn(PETSC_SUCCESS);
1454 }
1455
/* Copy xin's sync state to y: whichever of x's sides is stale, mark the same side stale on y
   (Kokkos expresses this by marking the *other* side modified) */
static PetscErrorCode VecCopySyncState_Kokkos_Private(Vec xin, Vec yout)
{
  Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(xin->spptr);
  Vec_Kokkos *ykok = static_cast<Vec_Kokkos *>(yout->spptr);

  PetscFunctionBegin;
  PetscCallCXX(ykok->v_dual.clear_sync_state());
  if (xkok->v_dual.need_sync_host()) { /* x's device side is newer */
    PetscCallCXX(ykok->v_dual.modify_device());
  } else if (xkok->v_dual.need_sync_device()) { /* x's host side is newer */
    PetscCallCXX(ykok->v_dual.modify_host());
  }
  /* otherwise x is fully synced and y stays with a clear sync state */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1471
1472 static PetscErrorCode VecCreateSeqKokkosWithArrays_Private(MPI_Comm, PetscInt, PetscInt, const PetscScalar[], const PetscScalar[], Vec *);
1473
1474 /* Internal routine shared by VecGetSubVector_{SeqKokkos,MPIKokkos} */
/*
  Build subvector y = x[is].

  If the index set is found contiguous (with a compatible block size), y aliases x's host
  and device arrays at offset `start` -- no data is copied, and y is created assuming
  those arrays are in a synced state (its sync flags are then copied from x). Otherwise
  the selected entries are gathered into a stand-alone vector via a VecScatter.
*/
PetscErrorCode VecGetSubVector_Kokkos_Private(Vec x, PetscBool xIsMPI, IS is, Vec *y)
{
  PetscBool contig;
  PetscInt n, N, start, bs;
  MPI_Comm comm;
  Vec z;

  PetscFunctionBegin;
  PetscCall(PetscObjectGetComm((PetscObject)x, &comm));
  PetscCall(ISGetLocalSize(is, &n)); /* local and global sizes of the subvector */
  PetscCall(ISGetSize(is, &N));
  PetscCall(VecGetSubVectorContiguityAndBS_Private(x, is, &contig, &start, &bs));

  if (contig) { /* We can do a no-copy (in-place) implementation with y sharing x's arrays */
    Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(x->spptr);
    const PetscScalar *array_h = xkok->v_dual.view_host().data() + start;
    const PetscScalar *array_d = xkok->v_dual.view_device().data() + start;

    /* These calls assume the input arrays are synced */
    if (xIsMPI) PetscCall(VecCreateMPIKokkosWithArrays_Private(comm, bs, n, N, array_h, array_d, &z)); /* x could be MPI even when x's comm size = 1 */
    else PetscCall(VecCreateSeqKokkosWithArrays_Private(comm, bs, n, array_h, array_d, &z));

    PetscCall(VecCopySyncState_Kokkos_Private(x, z)); /* Copy x's sync state to z */

    /* This is relevant only in debug mode */
    PetscInt state = 0;
    PetscCall(VecLockGet(x, &state));
    if (state) { /* x is either in read or read/write mode, therefore z, overlapped with x, can only be in read mode */
      PetscCall(VecLockReadPush(z));
    }

    z->ops->placearray = NULL; /* z's arrays can't be replaced, because z does not own them */
    z->ops->replacearray = NULL;

  } else { /* Have to create a VecScatter and a stand-alone vector */
    PetscCall(VecGetSubVectorThroughVecScatter_Private(x, is, bs, &z));
  }
  *y = z;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1515
/* Get subvector y = x[is] for a sequential vector x */
static PetscErrorCode VecGetSubVector_SeqKokkos(Vec x, IS is, Vec *y)
{
  PetscFunctionBegin;
  /* xIsMPI = PETSC_FALSE: x is a sequential (single-process) vector */
  PetscCall(VecGetSubVector_Kokkos_Private(x, PETSC_FALSE, is, y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1522
1523 /* Restore subvector y to x */
PetscErrorCode VecRestoreSubVector_SeqKokkos(Vec x, IS is, Vec *y)
{
  VecScatter vscat;
  PETSC_UNUSED PetscObjectState dummystate = 0;
  PetscBool unchanged;

  PetscFunctionBegin;
  /* If y's object state still matches the one recorded at VecGetSubVector() time,
     y was never modified, so there is nothing to copy back into x */
  PetscCall(PetscObjectComposedDataGetInt((PetscObject)*y, VecGetSubVectorSavedStateId, dummystate, unchanged));
  if (unchanged) PetscFunctionReturn(PETSC_SUCCESS); /* If y's state has not changed since VecGetSubVector(), we only need to destroy it */

  /* A composed scatter means y was a stand-alone copy; reverse-scatter its values back */
  PetscCall(PetscObjectQuery((PetscObject)*y, "VecGetSubVector_Scatter", (PetscObject *)&vscat));
  if (vscat) {
    PetscCall(VecScatterBegin(vscat, *y, x, INSERT_VALUES, SCATTER_REVERSE));
    PetscCall(VecScatterEnd(vscat, *y, x, INSERT_VALUES, SCATTER_REVERSE));
  } else { /* y and x's (host and device) arrays overlap */
    Vec_Kokkos *xkok = static_cast<Vec_Kokkos *>(x->spptr);
    Vec_Kokkos *ykok = static_cast<Vec_Kokkos *>((*y)->spptr);
    PetscInt state;

    PetscCall(VecLockGet(x, &state));
    PetscCheck(!state, PetscObjectComm((PetscObject)x), PETSC_ERR_ARG_WRONGSTATE, "Vec x is locked for read-only or read/write access");

    /* The tricky part: one has to carefully sync the arrays */
    auto exec = PetscGetKokkosExecutionSpace();
    if (xkok->v_dual.need_sync_device()) { /* x's host has newer data */
      /* Move y's latest values to host (since y is just a subset of x) */
      PetscCall(KokkosDualViewSyncHost(ykok->v_dual, exec));
    } else if (xkok->v_dual.need_sync_host()) { /* x's device has newer data */
      PetscCall(KokkosDualViewSyncDevice(ykok->v_dual, exec)); /* Move y's latest data to device */
    } else { /* x's host and device data is already sync'ed; Copy y's sync state to x */
      PetscCall(VecCopySyncState_Kokkos_Private(*y, x));
    }
    PetscCall(PetscObjectStateIncrease((PetscObject)x)); /* Since x is updated */
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
1560
VecSetPreallocationCOO_SeqKokkos(Vec x,PetscCount ncoo,const PetscInt coo_i[])1561 static PetscErrorCode VecSetPreallocationCOO_SeqKokkos(Vec x, PetscCount ncoo, const PetscInt coo_i[])
1562 {
1563 Vec_Seq *vecseq = static_cast<Vec_Seq *>(x->data);
1564 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(x->spptr);
1565 PetscInt m;
1566
1567 PetscFunctionBegin;
1568 PetscCall(VecSetPreallocationCOO_Seq(x, ncoo, coo_i));
1569 PetscCall(VecGetLocalSize(x, &m));
1570 PetscCall(veckok->SetUpCOO(vecseq, m));
1571 PetscFunctionReturn(PETSC_SUCCESS);
1572 }
1573
/* Add (imode==ADD_VALUES) or insert (imode==INSERT_VALUES) the COO values v[] into x,
   using the device-side COO maps built by VecSetPreallocationCOO_SeqKokkos().
   v[] may reside in host or device memory. */
static PetscErrorCode VecSetValuesCOO_SeqKokkos(Vec x, const PetscScalar v[], InsertMode imode)
{
  Vec_Seq *vecseq = static_cast<Vec_Seq *>(x->data);
  Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(x->spptr);
  const PetscCountKokkosView &jmap1 = veckok->jmap1_d; /* [jmap1(i), jmap1(i+1)) brackets the entries contributing to x(i) */
  const PetscCountKokkosView &perm1 = veckok->perm1_d; /* maps those entries back to positions in v[] */
  PetscScalarKokkosView xv; /* View for vector x */
  ConstPetscScalarKokkosView vv; /* View for array v[] */
  PetscInt m;
  PetscMemType memtype;

  PetscFunctionBegin;
  PetscCall(VecGetLocalSize(x, &m));
  PetscCall(PetscGetMemType(v, &memtype));
  if (PetscMemTypeHost(memtype)) { /* If user gave v[] in host, we might need to copy it to device if any */
    PetscCallCXX(vv = Kokkos::create_mirror_view_and_copy(DefaultMemorySpace(), ConstPetscScalarKokkosViewHost(v, vecseq->coo_n)));
  } else {
    PetscCallCXX(vv = ConstPetscScalarKokkosView(v, vecseq->coo_n)); /* Directly use v[]'s memory */
  }

  if (imode == INSERT_VALUES) PetscCall(VecGetKokkosViewWrite(x, &xv)); /* write vector */
  else PetscCall(VecGetKokkosView(x, &xv)); /* read & write vector */

  /* One work item per entry of x: sum all COO contributions to x(i) */
  PetscCallCXX(Kokkos::parallel_for(
    Kokkos::RangePolicy<>(PetscGetKokkosExecutionSpace(), 0, m), KOKKOS_LAMBDA(const PetscInt &i) {
      PetscScalar sum = 0.0;
      for (PetscCount k = jmap1(i); k < jmap1(i + 1); k++) sum += vv(perm1(k));
      xv(i) = (imode == INSERT_VALUES ? 0.0 : xv(i)) + sum; /* INSERT discards the old value */
    }));

  if (imode == INSERT_VALUES) PetscCall(VecRestoreKokkosViewWrite(x, &xv));
  else PetscCall(VecRestoreKokkosView(x, &xv));
  PetscFunctionReturn(PETSC_SUCCESS);
}
1608
1609 static PetscErrorCode VecCreate_SeqKokkos_Common(Vec); // forward declaration
1610
1611 /* Duplicate layout etc but not the values in the input vector */
VecDuplicate_SeqKokkos(Vec win,Vec * v)1612 static PetscErrorCode VecDuplicate_SeqKokkos(Vec win, Vec *v)
1613 {
1614 PetscFunctionBegin;
1615 PetscCall(VecDuplicate_Seq(win, v)); /* It also dups ops of win */
1616 PetscCall(VecCreate_SeqKokkos_Common(*v));
1617 (*v)->ops[0] = win->ops[0]; // recover the ops[]. We need to always follow ops in win
1618 PetscFunctionReturn(PETSC_SUCCESS);
1619 }
1620
VecDestroy_SeqKokkos(Vec v)1621 static PetscErrorCode VecDestroy_SeqKokkos(Vec v)
1622 {
1623 Vec_Kokkos *veckok = static_cast<Vec_Kokkos *>(v->spptr);
1624 Vec_Seq *vecseq = static_cast<Vec_Seq *>(v->data);
1625
1626 PetscFunctionBegin;
1627 delete veckok;
1628 v->spptr = NULL;
1629 if (vecseq) PetscCall(VecDestroy_Seq(v));
1630 PetscFunctionReturn(PETSC_SUCCESS);
1631 }
1632
1633 // Shared by all VecCreate/Duplicate routines for VecSeqKokkos
/* Install the Kokkos implementations into v's ops table and mark v as a VECKOKKOS.
   Called by every VecCreate/Duplicate/Convert path that produces a VECSEQKOKKOS. */
static PetscErrorCode VecCreate_SeqKokkos_Common(Vec v)
{
  PetscFunctionBegin;
  /* Pointwise and BLAS-1 style operations */
  v->ops->abs = VecAbs_SeqKokkos;
  v->ops->reciprocal = VecReciprocal_SeqKokkos;
  v->ops->pointwisemult = VecPointwiseMult_SeqKokkos;
  v->ops->min = VecMin_SeqKokkos;
  v->ops->max = VecMax_SeqKokkos;
  v->ops->sum = VecSum_SeqKokkos;
  v->ops->shift = VecShift_SeqKokkos;
  v->ops->norm = VecNorm_SeqKokkos;
  v->ops->scale = VecScale_SeqKokkos;
  v->ops->copy = VecCopy_SeqKokkos;
  v->ops->set = VecSet_SeqKokkos;
  v->ops->swap = VecSwap_SeqKokkos;
  v->ops->axpy = VecAXPY_SeqKokkos;
  v->ops->axpby = VecAXPBY_SeqKokkos;
  v->ops->axpbypcz = VecAXPBYPCZ_SeqKokkos;
  v->ops->pointwisedivide = VecPointwiseDivide_SeqKokkos;
  v->ops->setrandom = VecSetRandom_SeqKokkos;

  /* (Multi-)dot products */
  v->ops->dot = VecDot_SeqKokkos;
  v->ops->tdot = VecTDot_SeqKokkos;
  v->ops->mdot = VecMDot_SeqKokkos;
  v->ops->mtdot = VecMTDot_SeqKokkos;

  /* The *_local variants reuse the same routines (sequential vector) */
  v->ops->dot_local = VecDot_SeqKokkos;
  v->ops->tdot_local = VecTDot_SeqKokkos;
  v->ops->mdot_local = VecMDot_SeqKokkos;
  v->ops->mtdot_local = VecMTDot_SeqKokkos;

  v->ops->norm_local = VecNorm_SeqKokkos;
  v->ops->maxpy = VecMAXPY_SeqKokkos;
  v->ops->aypx = VecAYPX_SeqKokkos;
  v->ops->waxpy = VecWAXPY_SeqKokkos;
  v->ops->dotnorm2 = VecDotNorm2_SeqKokkos;
  v->ops->errorwnorm = VecErrorWeightedNorms_SeqKokkos;
  v->ops->placearray = VecPlaceArray_SeqKokkos;
  v->ops->replacearray = VecReplaceArray_SeqKokkos;
  v->ops->resetarray = VecResetArray_SeqKokkos;
  v->ops->destroy = VecDestroy_SeqKokkos;
  v->ops->duplicate = VecDuplicate_SeqKokkos;
  v->ops->conjugate = VecConjugate_SeqKokkos;
  /* The read variants reuse the read/write local-vector implementations */
  v->ops->getlocalvector = VecGetLocalVector_SeqKokkos;
  v->ops->restorelocalvector = VecRestoreLocalVector_SeqKokkos;
  v->ops->getlocalvectorread = VecGetLocalVector_SeqKokkos;
  v->ops->restorelocalvectorread = VecRestoreLocalVector_SeqKokkos;
  v->ops->getarraywrite = VecGetArrayWrite_SeqKokkos;
  v->ops->getarray = VecGetArray_SeqKokkos;
  v->ops->restorearray = VecRestoreArray_SeqKokkos;

  /* Memtype-aware array access and subvector support */
  v->ops->getarrayandmemtype = VecGetArrayAndMemType_SeqKokkos;
  v->ops->restorearrayandmemtype = VecRestoreArrayAndMemType_SeqKokkos;
  v->ops->getarraywriteandmemtype = VecGetArrayWriteAndMemType_SeqKokkos;
  v->ops->getsubvector = VecGetSubVector_SeqKokkos;
  v->ops->restoresubvector = VecRestoreSubVector_SeqKokkos;

  /* COO assembly */
  v->ops->setpreallocationcoo = VecSetPreallocationCOO_SeqKokkos;
  v->ops->setvaluescoo = VecSetValuesCOO_SeqKokkos;

  v->offloadmask = PETSC_OFFLOAD_KOKKOS; // Mark this is a VECKOKKOS; We use this flag for cheap VECKOKKOS test.
  PetscFunctionReturn(PETSC_SUCCESS);
}
1697
1698 /*@C
1699 VecCreateSeqKokkosWithArray - Creates a Kokkos sequential array-style vector,
1700 where the user provides the array space to store the vector values. The array
1701 provided must be a device array.
1702
1703 Collective
1704
1705 Input Parameters:
1706 + comm - the communicator, should be PETSC_COMM_SELF
1707 . bs - the block size
1708 . n - the vector length
1709 - darray - device memory where the vector elements are to be stored.
1710
1711 Output Parameter:
1712 . v - the vector
1713
1714 Notes:
1715 Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1716 same type as an existing vector.
1717
1718 PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1719 The user should not free the array until the vector is destroyed.
1720
1721 Level: intermediate
1722
1723 .seealso: `VecCreateMPICUDAWithArray()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`,
1724 `VecCreateGhost()`, `VecCreateSeq()`, `VecCreateSeqWithArray()`,
1725 `VecCreateMPIWithArray()`
1726 @*/
PetscErrorCode VecCreateSeqKokkosWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar darray[], Vec *v)
{
  PetscMPIInt size;
  Vec w;
  Vec_Kokkos *veckok = NULL;
  PetscScalar *harray;

  PetscFunctionBegin;
  PetscCallMPI(MPI_Comm_size(comm, &size));
  PetscCheck(size <= 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create VECSEQKOKKOS on more than one process");

  PetscCall(PetscKokkosInitializeCheck());
  PetscCall(VecCreate(comm, &w));
  PetscCall(VecSetSizes(w, n, n));
  PetscCall(VecSetBlockSize(w, bs));
  if (!darray) { /* Allocate memory ourself if user provided NULL */
    PetscCall(VecSetType(w, VECSEQKOKKOS));
  } else {
    /* Build a VECSEQ, get its harray, and then build Vec_Kokkos along with darray */
    if (std::is_same<DefaultMemorySpace, HostMirrorMemorySpace>::value) {
      /* Host and device memory spaces coincide, so darray doubles as the host array */
      harray = const_cast<PetscScalar *>(darray);
      PetscCall(VecCreate_Seq_Private(w, harray)); /* Build a sequential vector with harray */
    } else {
      /* Distinct memory spaces: let VECSEQ allocate its own host array to mirror darray */
      PetscCall(VecSetType(w, VECSEQ));
      harray = static_cast<Vec_Seq *>(w->data)->array;
    }
    PetscCall(PetscObjectChangeTypeName((PetscObject)w, VECSEQKOKKOS)); /* Change it to Kokkos */
    PetscCall(VecCreate_SeqKokkos_Common(w));
    PetscCallCXX(veckok = new Vec_Kokkos{n, harray, const_cast<PetscScalar *>(darray)});
    PetscCallCXX(veckok->v_dual.modify_device()); /* Mark the device is modified */
    w->spptr = static_cast<void *>(veckok);
  }
  *v = w;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1762
VecConvert_Seq_SeqKokkos_inplace(Vec v)1763 PetscErrorCode VecConvert_Seq_SeqKokkos_inplace(Vec v)
1764 {
1765 Vec_Seq *vecseq;
1766
1767 PetscFunctionBegin;
1768 PetscCall(PetscKokkosInitializeCheck());
1769 PetscCall(PetscLayoutSetUp(v->map));
1770 PetscCall(PetscObjectChangeTypeName((PetscObject)v, VECSEQKOKKOS));
1771 PetscCall(VecCreate_SeqKokkos_Common(v));
1772 PetscCheck(!v->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "v->spptr not NULL");
1773 vecseq = static_cast<Vec_Seq *>(v->data);
1774 PetscCallCXX(v->spptr = new Vec_Kokkos(v->map->n, vecseq->array, NULL));
1775 PetscFunctionReturn(PETSC_SUCCESS);
1776 }
1777
1778 // Create a VECSEQKOKKOS with layout and arrays
VecCreateSeqKokkosWithLayoutAndArrays_Private(PetscLayout map,const PetscScalar harray[],const PetscScalar darray[],Vec * v)1779 static PetscErrorCode VecCreateSeqKokkosWithLayoutAndArrays_Private(PetscLayout map, const PetscScalar harray[], const PetscScalar darray[], Vec *v)
1780 {
1781 Vec w;
1782
1783 PetscFunctionBegin;
1784 if (map->n > 0) PetscCheck(darray, map->comm, PETSC_ERR_ARG_WRONG, "darray cannot be NULL");
1785 #if defined(KOKKOS_ENABLE_UNIFIED_MEMORY)
1786 PetscCheck(harray == darray, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "harray and darray must be the same");
1787 #endif
1788 PetscCall(VecCreateSeqWithLayoutAndArray_Private(map, harray, &w));
1789 PetscCall(PetscObjectChangeTypeName((PetscObject)w, VECSEQKOKKOS)); // Change it to VECKOKKOS
1790 PetscCall(VecCreate_SeqKokkos_Common(w));
1791 PetscCallCXX(w->spptr = new Vec_Kokkos(map->n, const_cast<PetscScalar *>(harray), const_cast<PetscScalar *>(darray)));
1792 *v = w;
1793 PetscFunctionReturn(PETSC_SUCCESS);
1794 }
1795
1796 /*
1797 VecCreateSeqKokkosWithArrays_Private - Creates a Kokkos sequential array-style vector
1798 with user-provided arrays on host and device.
1799
1800 Collective
1801
  Input Parameters:
1803 + comm - the communicator, should be PETSC_COMM_SELF
1804 . bs - the block size
1805 . n - the vector length
1806 . harray - host memory where the vector elements are to be stored.
1807 - darray - device memory where the vector elements are to be stored.
1808
1809 Output Parameter:
1810 . v - the vector
1811
1812 Notes:
1813 Unlike VecCreate{Seq,MPI}CUDAWithArrays(), this routine is private since we do not expect users to use it directly.
1814
1815 If there is no device, then harray and darray must be the same.
1816 If n is not zero, then harray and darray must be allocated.
1817 After the call, the created vector is supposed to be in a synchronized state, i.e.,
1818 we suppose harray and darray have the same data.
1819
1820 PETSc does NOT free the array when the vector is destroyed via VecDestroy().
1821 Caller should not free the array until the vector is destroyed.
1822 */
VecCreateSeqKokkosWithArrays_Private(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar harray[],const PetscScalar darray[],Vec * v)1823 static PetscErrorCode VecCreateSeqKokkosWithArrays_Private(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar harray[], const PetscScalar darray[], Vec *v)
1824 {
1825 PetscMPIInt size;
1826 PetscLayout map;
1827
1828 PetscFunctionBegin;
1829 PetscCall(PetscKokkosInitializeCheck());
1830 PetscCallMPI(MPI_Comm_size(comm, &size));
1831 PetscCheck(size == 1, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Cannot create VECSEQKOKKOS on more than one process");
1832 PetscCall(PetscLayoutCreateFromSizes(comm, n, n, bs, &map));
1833 PetscCall(VecCreateSeqKokkosWithLayoutAndArrays_Private(map, harray, darray, v));
1834 PetscCall(PetscLayoutDestroy(&map));
1835 PetscFunctionReturn(PETSC_SUCCESS);
1836 }
1837
1838 /* TODO: ftn-auto generates veckok.kokkosf.c */
1839 /*@C
1840 VecCreateSeqKokkos - Creates a standard, sequential array-style vector.
1841
1842 Collective
1843
1844 Input Parameters:
1845 + comm - the communicator, should be `PETSC_COMM_SELF`
1846 - n - the vector length
1847
1848 Output Parameter:
1849 . v - the vector
1850
1851 Notes:
1852 Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1853 same type as an existing vector.
1854
1855 Level: intermediate
1856
1857 .seealso: `VecCreateMPI()`, `VecCreate()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`
1858 @*/
PetscErrorCode VecCreateSeqKokkos(MPI_Comm comm, PetscInt n, Vec *v)
{
  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck()); /* Ensure Kokkos is initialized before any Kokkos allocation */
  PetscCall(VecCreate(comm, v));
  PetscCall(VecSetSizes(*v, n, n)); /* sequential: local size == global size */
  PetscCall(VecSetType(*v, VECSEQKOKKOS)); /* Calls VecCreate_SeqKokkos */
  PetscFunctionReturn(PETSC_SUCCESS);
}
1868
1869 // Duplicate a VECSEQKOKKOS
VecDuplicateVecs_SeqKokkos_GEMV(Vec w,PetscInt m,Vec * V[])1870 static PetscErrorCode VecDuplicateVecs_SeqKokkos_GEMV(Vec w, PetscInt m, Vec *V[])
1871 {
1872 PetscInt64 lda; // use 64-bit as we will do "m * lda"
1873 PetscScalar *array_h, *array_d;
1874 PetscLayout map;
1875 PetscScalarKokkosDualView w_dual;
1876
1877 PetscFunctionBegin;
1878 PetscCall(PetscKokkosInitializeCheck()); // as we'll call kokkos_malloc()
1879 PetscCall(PetscMalloc1(m, V));
1880 PetscCall(VecGetLayout(w, &map));
1881 VecGetLocalSizeAligned(w, 64, &lda); // get in lda the 64-bytes aligned local size
1882 // See comments in VecCreate_SeqKokkos() on why we use DualView to allocate the memory
1883 PetscCallCXX(w_dual = PetscScalarKokkosDualView("VecDuplicateVecs", m * lda)); // Kokkos init's w_dual to zero
1884
1885 // create the m vectors with raw arrays from v_dual
1886 array_h = w_dual.view_host().data();
1887 array_d = w_dual.view_device().data();
1888 for (PetscInt i = 0; i < m; i++) {
1889 Vec v;
1890 PetscCall(VecCreateSeqKokkosWithLayoutAndArrays_Private(map, &array_h[i * lda], &array_d[i * lda], &v));
1891 PetscCall(PetscObjectListDuplicate(((PetscObject)w)->olist, &((PetscObject)v)->olist));
1892 PetscCall(PetscFunctionListDuplicate(((PetscObject)w)->qlist, &((PetscObject)v)->qlist));
1893 v->ops[0] = w->ops[0];
1894 (*V)[i] = v;
1895 }
1896
1897 // let the first vector own the long DualView, so when it is destroyed it will free the v_dual
1898 if (m) {
1899 Vec v = (*V)[0];
1900
1901 static_cast<Vec_Kokkos *>(v->spptr)->w_dual = w_dual; // stash the memory
1902 // disable replacearray of the first vector, as freeing its memory also frees others in the group.
1903 // But replacearray of others is ok, as they don't own their array.
1904 if (m > 1) v->ops->replacearray = VecReplaceArray_Default_GEMV_Error;
1905 }
1906 PetscFunctionReturn(PETSC_SUCCESS);
1907 }
1908
1909 /*MC
1910 VECSEQKOKKOS - VECSEQKOKKOS = "seqkokkos" - The basic sequential vector, modified to use Kokkos
1911
1912 Options Database Keys:
1913 . -vec_type seqkokkos - sets the vector type to VECSEQKOKKOS during a call to VecSetFromOptions()
1914
1915 Level: beginner
1916
1917 .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECMPI`, `VecType`, `VecCreateMPI()`
1918 M*/
/* Type constructor for VECSEQKOKKOS, invoked by VecSetType(v, VECSEQKOKKOS) */
PetscErrorCode VecCreate_SeqKokkos(Vec v)
{
  PetscBool mdot_use_gemv = PETSC_TRUE;
  PetscBool maxpy_use_gemv = PETSC_FALSE; // default is false as we saw bad performance with vendors' GEMV with tall skinny matrices.
  PetscScalarKokkosDualView v_dual;

  PetscFunctionBegin;
  PetscCall(PetscKokkosInitializeCheck());
  PetscCall(PetscLayoutSetUp(v->map));

  // Use DualView to allocate both the host array and the device array.
  // DualView first allocates the device array and then mirrors it to host.
  // With unified memory (e.g., on AMD MI300A APU), the two arrays are the same, with the host array
  // sharing the device array allocated by hipMalloc(), but not the other way around if we call
  // VecCreate_Seq() first and let the device array share the host array allocated by malloc().
  // hipMalloc() has an advantage over malloc() as it gives great binding and page size settings automatically, see
  // https://hpc.llnl.gov/documentation/user-guides/using-el-capitan-systems/introduction-and-quickstart/pro-tips
  PetscCallCXX(v_dual = PetscScalarKokkosDualView("v_dual", v->map->n)); // Kokkos init's v_dual to zero

  PetscCall(VecCreate_Seq_Private(v, v_dual.view_host().data())); // the VECSEQ part wraps the DualView's host array
  PetscCall(PetscObjectChangeTypeName((PetscObject)v, VECSEQKOKKOS));
  PetscCall(VecCreate_SeqKokkos_Common(v));
  PetscCheck(!v->spptr, PETSC_COMM_SELF, PETSC_ERR_PLIB, "v->spptr not NULL");
  PetscCallCXX(v->spptr = new Vec_Kokkos(v_dual)); // hand the DualView over to Vec_Kokkos

  // Optionally route multi-vector kernels through GEMV on vectors allocated together
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-vec_mdot_use_gemv", &mdot_use_gemv, NULL));
  PetscCall(PetscOptionsGetBool(NULL, NULL, "-vec_maxpy_use_gemv", &maxpy_use_gemv, NULL));

  // allocate multiple vectors together
  if (mdot_use_gemv || maxpy_use_gemv) v->ops[0].duplicatevecs = VecDuplicateVecs_SeqKokkos_GEMV;

  if (mdot_use_gemv) {
    v->ops[0].mdot = VecMDot_SeqKokkos_GEMV;
    v->ops[0].mtdot = VecMTDot_SeqKokkos_GEMV;
    v->ops[0].mdot_local = VecMDot_SeqKokkos_GEMV;
    v->ops[0].mtdot_local = VecMTDot_SeqKokkos_GEMV;
  }
  if (maxpy_use_gemv) v->ops[0].maxpy = VecMAXPY_SeqKokkos_GEMV;
  PetscFunctionReturn(PETSC_SUCCESS);
}
1959