xref: /petsc/src/vec/vec/impls/seq/cupm/cuda/vecseqcupm.cu (revision 6a97282b1349fb5a3dfe842cdf1bbb3075a9a414)
1 #include "../vecseqcupm.hpp" /*I <petscvec.h> I*/
2 #include "../vecseqcupm_impl.hpp"
3 
4 using namespace ::Petsc::vec::cupm;
5 using ::Petsc::device::cupm::DeviceType;
6 
7 template class impl::VecSeq_CUPM<DeviceType::CUDA>;
8 
9 static constexpr auto VecSeq_CUDA = impl::VecSeq_CUPM<DeviceType::CUDA>{};
10 
11 /*MC
12   VECSEQCUDA - VECSEQCUDA = "seqcuda" - The basic sequential vector, modified to use CUDA
13 
14   Options Database Key:
15 . -vec_type seqcuda - sets the vector type to `VECSEQCUDA` during a call to `VecSetFromOptions()`
16 
17   Level: beginner
18 
19 .seealso: `VecCreate()`, `VecSetType()`, `VecSetFromOptions()`, `VecCreateMPIWithArray()`, `VECSEQ`,
20 `VecType`, `VecCreateMPI()`, `VecSetPinnedMemoryMin()`, `VECCUDA`, `VECHIP`, `VECMPICUDA`, `VECMPIHIP`, `VECSEQHIP`
21 M*/
22 
VecCreate_SeqCUDA(Vec v)23 PetscErrorCode VecCreate_SeqCUDA(Vec v)
24 {
25   PetscFunctionBegin;
26   PetscCall(VecSeq_CUDA.Create(v));
27   PetscFunctionReturn(PETSC_SUCCESS);
28 }
29 
VecConvert_Seq_SeqCUDA_inplace(Vec v)30 PetscErrorCode VecConvert_Seq_SeqCUDA_inplace(Vec v)
31 {
32   PetscFunctionBegin;
33   PetscCall(VecSeq_CUDA.Convert_IMPL_IMPLCUPM(v));
34   PetscFunctionReturn(PETSC_SUCCESS);
35 }
36 
37 // PetscClangLinter pragma disable: -fdoc-internal-linkage
38 /*@
39   VecCreateSeqCUDA - Creates a standard, sequential, array-style vector.
40 
41   Collective, Possibly Synchronous
42 
43   Input Parameters:
44 + comm - the communicator, must be `PETSC_COMM_SELF`
45 - n    - the vector length
46 
47   Output Parameter:
48 . v - the vector
49 
50   Level: intermediate
51 
52   Notes:
53   Use `VecDuplicate()` or `VecDuplicateVecs()` to form additional vectors of the same type as an
54   existing vector.
55 
56   This function may initialize `PetscDevice`, which may incur a device synchronization.
57 
58 .seealso: [](ch_vectors), `PetscDeviceInitialize()`, `VecCreate()`, `VecCreateSeq()`, `VecCreateSeqCUDAWithArray()`,
59           `VecCreateMPI()`, `VecCreateMPICUDA()`, `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`
60 @*/
VecCreateSeqCUDA(MPI_Comm comm,PetscInt n,Vec * v)61 PetscErrorCode VecCreateSeqCUDA(MPI_Comm comm, PetscInt n, Vec *v)
62 {
63   PetscFunctionBegin;
64   PetscCall(VecCreateSeqCUPMAsync<DeviceType::CUDA>(comm, n, v));
65   PetscFunctionReturn(PETSC_SUCCESS);
66 }
67 
68 // PetscClangLinter pragma disable: -fdoc-internal-linkage
69 /*@C
70   VecCreateSeqCUDAWithArrays - Creates a sequential, array-style vector using CUDA, where the
71   user provides the complete array space to store the vector values.
72 
73   Collective, Possibly Synchronous
74 
75   Input Parameters:
76 + comm     - the communicator, must be `PETSC_COMM_SELF`
77 . bs       - the block size
78 . n        - the local vector length
79 . cpuarray - CPU memory where the vector elements are to be stored (or `NULL`)
80 - gpuarray - GPU memory where the vector elements are to be stored (or `NULL`)
81 
82   Output Parameter:
83 . v - the vector
84 
85   Level: intermediate
86 
87   Notes:
88   If the user-provided array is `NULL`, then `VecCUDAPlaceArray()` can be used at a later stage to
89   SET the array for storing the vector values. Otherwise, the array must be allocated on the
90   device.
91 
92   If both cpuarray and gpuarray are provided, the provided arrays must have identical
93   values.
94 
95   The arrays are NOT freed when the vector is destroyed via `VecDestroy()`. The user must free
96   them themselves, but not until the vector is destroyed.
97 
98   This function may initialize `PetscDevice`, which may incur a device synchronization.
99 
100 .seealso: [](ch_vectors), `PetscDeviceInitialize()`, `VecCreate()`, `VecCreateSeqWithArray()`, `VecCreateSeqCUDA()`,
101           `VecCreateSeqCUDAWithArray()`, `VecCreateMPICUDA()`, `VecCreateMPICUDAWithArray()`,
102           `VecCreateMPICUDAWithArrays()`, `VecCUDAPlaceArray()`
103 @*/
VecCreateSeqCUDAWithArrays(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar cpuarray[],const PetscScalar gpuarray[],Vec * v)104 PetscErrorCode VecCreateSeqCUDAWithArrays(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar cpuarray[], const PetscScalar gpuarray[], Vec *v)
105 {
106   PetscFunctionBegin;
107   PetscCall(VecCreateSeqCUPMWithArraysAsync<DeviceType::CUDA>(comm, bs, n, cpuarray, gpuarray, v));
108   PetscFunctionReturn(PETSC_SUCCESS);
109 }
110 
111 // PetscClangLinter pragma disable: -fdoc-internal-linkage
112 /*@C
113   VecCreateSeqCUDAWithArray - Creates a sequential, array-style vector using CUDA, where the
114   user provides the device array space to store the vector values.
115 
116   Collective, Possibly Synchronous
117 
118   Input Parameters:
119 + comm     - the communicator, must be `PETSC_COMM_SELF`
120 . bs       - the block size
121 . n        - the vector length
122 - gpuarray - GPU memory where the vector elements are to be stored (or `NULL`)
123 
124   Output Parameter:
125 . v - the vector
126 
127   Level: intermediate
128 
129   Notes:
130   If the user-provided array is `NULL`, then `VecCUDAPlaceArray()` can be used at a later stage to
131   SET the array for storing the vector values. Otherwise, the array must be allocated on the
132   device.
133 
134   The array is NOT freed when the vector is destroyed via `VecDestroy()`. The user must free the
135   array themselves, but not until the vector is destroyed.
136 
137   Use `VecDuplicate()` or `VecDuplicateVecs()` to form additional vectors of the same type as an
138   existing vector.
139 
140   This function may initialize `PetscDevice`, which may incur a device synchronization.
141 
142 .seealso: [](ch_vectors), `PetscDeviceInitialize()`, `VecCreate()`, `VecCreateSeq()`, `VecCreateSeqWithArray()`,
143           `VecCreateMPIWithArray()`, `VecCreateSeqCUDA()`, `VecCreateMPICUDAWithArray()`, `VecCUDAPlaceArray()`,
144           `VecDuplicate()`, `VecDuplicateVecs()`, `VecCreateGhost()`
145 @*/
VecCreateSeqCUDAWithArray(MPI_Comm comm,PetscInt bs,PetscInt n,const PetscScalar gpuarray[],Vec * v)146 PetscErrorCode VecCreateSeqCUDAWithArray(MPI_Comm comm, PetscInt bs, PetscInt n, const PetscScalar gpuarray[], Vec *v)
147 {
148   PetscFunctionBegin;
149   PetscCall(VecCreateSeqCUDAWithArrays(comm, bs, n, nullptr, gpuarray, v));
150   PetscFunctionReturn(PETSC_SUCCESS);
151 }
152 
153 // PetscClangLinter pragma disable: -fdoc-internal-linkage
154 /*@C
155   VecCUDAGetArray - Provides access to the device buffer inside a vector
156 
157   Logically Collective; Asynchronous; No Fortran Support
158 
159   Input Parameter:
160 . v - the vector
161 
162   Output Parameter:
163 . a - the device buffer
164 
165   Level: intermediate
166 
167   Notes:
168   This routine has semantics similar to `VecGetArray()`; the returned buffer points to a
169   consistent view of the vector data. This may involve copying data from the host to the device
170   if the data on the device is out of date. It is also assumed that the returned buffer is
171   immediately modified, marking the host data out of date. This is similar to intent(inout) in
172   Fortran.
173 
174   If the user does require strong memory guarantees, they are encouraged to use
175   `VecCUDAGetArrayRead()` and/or `VecCUDAGetArrayWrite()` instead.
176 
177   The user must call `VecCUDARestoreArray()` when they are finished using the array.
178 
179   Developer Note:
180   If the device memory hasn't been allocated previously it will be allocated as part of this
181   routine.
182 
183 .seealso: [](ch_vectors), `VecCUDARestoreArray()`, `VecCUDAGetArrayRead()`, `VecCUDAGetArrayWrite()`, `VecGetArray()`,
184           `VecGetArrayRead()`, `VecGetArrayWrite()`
185 @*/
VecCUDAGetArray(Vec v,PetscScalar ** a)186 PetscErrorCode VecCUDAGetArray(Vec v, PetscScalar **a)
187 {
188   PetscFunctionBegin;
189   PetscCall(VecCUPMGetArrayAsync<DeviceType::CUDA>(v, a));
190   PetscFunctionReturn(PETSC_SUCCESS);
191 }
192 
193 // PetscClangLinter pragma disable: -fdoc-internal-linkage
194 /*@C
195   VecCUDARestoreArray - Restore a device buffer previously acquired with `VecCUDAGetArray()`.
196 
197   Not Collective; Asynchronous; No Fortran Support
198 
199   Input Parameters:
200 + v - the vector
201 - a - the device buffer
202 
203   Level: intermediate
204 
205   Note:
206   The restored pointer is invalid after this function returns. This function also marks the
207   host data as out of date. Subsequent access to the vector data on the host side via
208   `VecGetArray()` will incur a (synchronous) data transfer.
209 
210 .seealso: [](ch_vectors), `VecCUDAGetArray()`, `VecCUDAGetArrayRead()`, `VecCUDAGetArrayWrite()`, `VecGetArray()`,
211           `VecRestoreArray()`, `VecGetArrayRead()`
212 @*/
VecCUDARestoreArray(Vec v,PetscScalar ** a)213 PetscErrorCode VecCUDARestoreArray(Vec v, PetscScalar **a)
214 {
215   PetscFunctionBegin;
216   PetscCall(VecCUPMRestoreArrayAsync<DeviceType::CUDA>(v, a));
217   PetscFunctionReturn(PETSC_SUCCESS);
218 }
219 
220 // PetscClangLinter pragma disable: -fdoc-internal-linkage
221 /*@C
222   VecCUDAGetArrayRead - Provides read access to the CUDA buffer inside a vector.
223 
224   Not Collective; Asynchronous; No Fortran Support
225 
226   Input Parameter:
227 . v - the vector
228 
229   Output Parameter:
230 . a - the CUDA pointer.
231 
232   Level: intermediate
233 
234   Notes:
235   See `VecCUDAGetArray()` for data movement semantics of this function.
236 
237   This function assumes that the user will not modify the vector data. This is analogous to
238   intent(in) in Fortran.
239 
240   The device pointer must be restored by calling `VecCUDARestoreArrayRead()`. If the data on the
241   host side was previously up to date it will remain so, i.e. data on both the device and the
242   host is up to date. Accessing data on the host side does not incur a device to host data
243   transfer.
244 
245 .seealso: [](ch_vectors), `VecCUDARestoreArrayRead()`, `VecCUDAGetArray()`, `VecCUDAGetArrayWrite()`, `VecGetArray()`,
246           `VecGetArrayRead()`
247 @*/
VecCUDAGetArrayRead(Vec v,const PetscScalar ** a)248 PetscErrorCode VecCUDAGetArrayRead(Vec v, const PetscScalar **a)
249 {
250   PetscFunctionBegin;
251   PetscCall(VecCUPMGetArrayReadAsync<DeviceType::CUDA>(v, a));
252   PetscFunctionReturn(PETSC_SUCCESS);
253 }
254 
255 // PetscClangLinter pragma disable: -fdoc-internal-linkage
256 /*@C
257   VecCUDARestoreArrayRead - Restore a CUDA device pointer previously acquired with
258   `VecCUDAGetArrayRead()`.
259 
260   Not Collective; Asynchronous; No Fortran Support
261 
262   Input Parameters:
263 + v - the vector
264 - a - the CUDA device pointer
265 
266   Level: intermediate
267 
268   Note:
269   This routine does not modify the corresponding array on the host in any way. The pointer is
270   invalid after this function returns.
271 
272 .seealso: [](ch_vectors), `VecCUDAGetArrayRead()`, `VecCUDAGetArrayWrite()`, `VecCUDAGetArray()`, `VecGetArray()`,
273           `VecRestoreArray()`, `VecGetArrayRead()`
274 @*/
VecCUDARestoreArrayRead(Vec v,const PetscScalar ** a)275 PetscErrorCode VecCUDARestoreArrayRead(Vec v, const PetscScalar **a)
276 {
277   PetscFunctionBegin;
278   PetscCall(VecCUPMRestoreArrayReadAsync<DeviceType::CUDA>(v, a));
279   PetscFunctionReturn(PETSC_SUCCESS);
280 }
281 
282 // PetscClangLinter pragma disable: -fdoc-internal-linkage
283 /*@C
284   VecCUDAGetArrayWrite - Provides write access to the CUDA buffer inside a vector.
285 
286   Logically Collective; Asynchronous; No Fortran Support
287 
288   Input Parameter:
289 . v - the vector
290 
291   Output Parameter:
292 . a - the CUDA pointer
293 
294   Level: advanced
295 
296   Notes:
297   The data pointed to by the device pointer is uninitialized. The user may not read from this
298   data. Furthermore, the entire array needs to be filled by the user to obtain well-defined
299   behaviour. The device memory will be allocated by this function if it hasn't been allocated
300   previously. This is analogous to intent(out) in Fortran.
301 
302   The device pointer needs to be released with `VecCUDARestoreArrayWrite()`. When the pointer is
303   released the host data of the vector is marked as out of date. Subsequent access of the host
304   data with e.g. `VecGetArray()` incurs a device to host data transfer.
305 
306 .seealso: [](ch_vectors), `VecCUDARestoreArrayWrite()`, `VecCUDAGetArray()`, `VecCUDAGetArrayRead()`,
307           `VecCUDAGetArrayWrite()`, `VecGetArray()`, `VecGetArrayRead()`
308 @*/
VecCUDAGetArrayWrite(Vec v,PetscScalar ** a)309 PetscErrorCode VecCUDAGetArrayWrite(Vec v, PetscScalar **a)
310 {
311   PetscFunctionBegin;
312   PetscCall(VecCUPMGetArrayWriteAsync<DeviceType::CUDA>(v, a));
313   PetscFunctionReturn(PETSC_SUCCESS);
314 }
315 
316 // PetscClangLinter pragma disable: -fdoc-internal-linkage
317 /*@C
318   VecCUDARestoreArrayWrite - Restore a CUDA device pointer previously acquired with
319   `VecCUDAGetArrayWrite()`.
320 
321   Logically Collective; Asynchronous; No Fortran Support
322 
323   Input Parameters:
324 + v - the vector
325 - a - the CUDA device pointer.  This pointer is invalid after `VecCUDARestoreArrayWrite()` returns.
326 
327   Level: intermediate
328 
329   Note:
330   Data on the host will be marked as out of date. Subsequent access of the data on the host
331   side e.g. with `VecGetArray()` will incur a device to host data transfer.
332 
333 .seealso: [](ch_vectors), `VecCUDAGetArrayWrite()`, `VecCUDAGetArray()`, `VecCUDAGetArrayRead()`,
334           `VecGetArray()`, `VecRestoreArray()`, `VecGetArrayRead()`
335 @*/
VecCUDARestoreArrayWrite(Vec v,PetscScalar ** a)336 PetscErrorCode VecCUDARestoreArrayWrite(Vec v, PetscScalar **a)
337 {
338   PetscFunctionBegin;
339   PetscCall(VecCUPMRestoreArrayWriteAsync<DeviceType::CUDA>(v, a));
340   PetscFunctionReturn(PETSC_SUCCESS);
341 }
342 
343 // PetscClangLinter pragma disable: -fdoc-internal-linkage
344 /*@C
345   VecCUDAPlaceArray - Allows one to replace the GPU array in a vector with a GPU array provided
346   by the user.
347 
348   Logically Collective; Asynchronous; No Fortran Support
349 
350   Input Parameters:
351 + vec   - the vector
352 - array - the GPU array
353 
354   Level: advanced
355 
356   Notes:
357   Adding `const` to `array` was an oversight, see notes in `VecPlaceArray()`.
358 
359   This routine is useful to avoid copying an array into a vector, though you can return to the
360   original GPU array with a call to `VecCUDAResetArray()`.
361 
362   It is not possible to use `VecCUDAPlaceArray()` and `VecPlaceArray()` at the same time on the
363   same vector.
364 
365   `vec` does not take ownership of `array` in any way. The user must free `array` themselves
366   but be careful not to do so before the vector has either been destroyed, had its original
367   array restored with `VecCUDAResetArray()` or permanently replaced with
368   `VecCUDAReplaceArray()`.
369 
370 .seealso: [](ch_vectors), `VecPlaceArray()`, `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`,
371           `VecResetArray()`, `VecCUDAResetArray()`, `VecCUDAReplaceArray()`
372 @*/
VecCUDAPlaceArray(Vec vin,const PetscScalar array[])373 PetscErrorCode VecCUDAPlaceArray(Vec vin, const PetscScalar array[])
374 {
375   PetscFunctionBegin;
376   PetscCall(VecCUPMPlaceArrayAsync<DeviceType::CUDA>(vin, array));
377   PetscFunctionReturn(PETSC_SUCCESS);
378 }
379 
380 // PetscClangLinter pragma disable: -fdoc-internal-linkage
381 /*@C
382   VecCUDAReplaceArray - Permanently replace the GPU array in a vector with a GPU array provided
383   by the user.
384 
385   Logically Collective; No Fortran Support
386 
387   Input Parameters:
388 + vec   - the vector
389 - array - the GPU array
390 
391   Level: advanced
392 
393   Notes:
394   Adding `const` to `array` was an oversight, see notes in `VecPlaceArray()`.
395 
396   This is useful to avoid copying a GPU array into a vector.
397 
398   This frees the memory associated with the old GPU array. The vector takes ownership of the
399   passed array so it CANNOT be freed by the user. It will be freed when the vector is
400   destroyed.
401 
402 .seealso: [](ch_vectors), `VecGetArray()`, `VecRestoreArray()`, `VecPlaceArray()`, `VecResetArray()`,
403           `VecCUDAResetArray()`, `VecCUDAPlaceArray()`, `VecReplaceArray()`
404 @*/
VecCUDAReplaceArray(Vec vin,const PetscScalar array[])405 PetscErrorCode VecCUDAReplaceArray(Vec vin, const PetscScalar array[])
406 {
407   PetscFunctionBegin;
408   PetscCall(VecCUPMReplaceArrayAsync<DeviceType::CUDA>(vin, array));
409   PetscFunctionReturn(PETSC_SUCCESS);
410 }
411 
412 // PetscClangLinter pragma disable: -fdoc-internal-linkage
413 /*@C
414   VecCUDAResetArray - Resets a vector to use its default memory.
415 
416   Logically Collective; No Fortran Support
417 
418   Input Parameter:
419 . vec - the vector
420 
421   Level: advanced
422 
423   Note:
424   Call this after the use of `VecCUDAPlaceArray()`.
425 
426 .seealso: [](ch_vectors), `VecGetArray()`, `VecRestoreArray()`, `VecReplaceArray()`, `VecPlaceArray()`,
427           `VecResetArray()`, `VecCUDAPlaceArray()`, `VecCUDAReplaceArray()`
428 @*/
VecCUDAResetArray(Vec vin)429 PetscErrorCode VecCUDAResetArray(Vec vin)
430 {
431   PetscFunctionBegin;
432   PetscCall(VecCUPMResetArrayAsync<DeviceType::CUDA>(vin));
433   PetscFunctionReturn(PETSC_SUCCESS);
434 }
435