xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-vector.c (revision fc0f7cc68128f3536a834f19d72828f4c59a4439)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <cuda_runtime.h>
11 #include <math.h>
12 #include <stdbool.h>
13 #include <string.h>
14 
15 #include "../cuda/ceed-cuda-common.h"
16 #include "ceed-cuda-ref.h"
17 
18 //------------------------------------------------------------------------------
19 // Check if host/device sync is needed
20 //------------------------------------------------------------------------------
21 static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_type, bool *need_sync) {
22   bool             has_valid_array = false;
23   CeedVector_Cuda *impl;
24 
25   CeedCallBackend(CeedVectorGetData(vec, &impl));
26   CeedCallBackend(CeedVectorHasValidArray(vec, &has_valid_array));
27   switch (mem_type) {
28     case CEED_MEM_HOST:
29       *need_sync = has_valid_array && !impl->h_array;
30       break;
31     case CEED_MEM_DEVICE:
32       *need_sync = has_valid_array && !impl->d_array;
33       break;
34   }
35   return CEED_ERROR_SUCCESS;
36 }
37 
38 //------------------------------------------------------------------------------
39 // Sync host to device
40 //------------------------------------------------------------------------------
41 static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
42   CeedSize         length;
43   size_t           bytes;
44   Ceed             ceed;
45   CeedVector_Cuda *impl;
46 
47   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
48   CeedCallBackend(CeedVectorGetData(vec, &impl));
49 
50   CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device");
51 
52   CeedCallBackend(CeedVectorGetLength(vec, &length));
53   bytes = length * sizeof(CeedScalar);
54   if (impl->d_array_borrowed) {
55     impl->d_array = impl->d_array_borrowed;
56   } else if (impl->d_array_owned) {
57     impl->d_array = impl->d_array_owned;
58   } else {
59     CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes));
60     impl->d_array = impl->d_array_owned;
61   }
62   CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice));
63   return CEED_ERROR_SUCCESS;
64 }
65 
66 //------------------------------------------------------------------------------
67 // Sync device to host
68 //------------------------------------------------------------------------------
69 static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) {
70   CeedSize         length;
71   Ceed             ceed;
72   CeedVector_Cuda *impl;
73 
74   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
75   CeedCallBackend(CeedVectorGetData(vec, &impl));
76 
77   CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
78 
79   if (impl->h_array_borrowed) {
80     impl->h_array = impl->h_array_borrowed;
81   } else if (impl->h_array_owned) {
82     impl->h_array = impl->h_array_owned;
83   } else {
84     CeedSize length;
85 
86     CeedCallBackend(CeedVectorGetLength(vec, &length));
87     CeedCallBackend(CeedCalloc(length, &impl->h_array_owned));
88     impl->h_array = impl->h_array_owned;
89   }
90 
91   CeedCallBackend(CeedVectorGetLength(vec, &length));
92   size_t bytes = length * sizeof(CeedScalar);
93 
94   CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost));
95   return CEED_ERROR_SUCCESS;
96 }
97 
98 //------------------------------------------------------------------------------
99 // Sync arrays
100 //------------------------------------------------------------------------------
101 static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type) {
102   bool need_sync = false;
103 
104   // Check whether device/host sync is needed
105   CeedCallBackend(CeedVectorNeedSync_Cuda(vec, mem_type, &need_sync));
106   if (!need_sync) return CEED_ERROR_SUCCESS;
107 
108   switch (mem_type) {
109     case CEED_MEM_HOST:
110       return CeedVectorSyncD2H_Cuda(vec);
111     case CEED_MEM_DEVICE:
112       return CeedVectorSyncH2D_Cuda(vec);
113   }
114   return CEED_ERROR_UNSUPPORTED;
115 }
116 
117 //------------------------------------------------------------------------------
118 // Set all pointers as invalid
119 //------------------------------------------------------------------------------
120 static inline int CeedVectorSetAllInvalid_Cuda(const CeedVector vec) {
121   CeedVector_Cuda *impl;
122 
123   CeedCallBackend(CeedVectorGetData(vec, &impl));
124   impl->h_array = NULL;
125   impl->d_array = NULL;
126   return CEED_ERROR_SUCCESS;
127 }
128 
129 //------------------------------------------------------------------------------
130 // Check if CeedVector has any valid pointer
131 //------------------------------------------------------------------------------
132 static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, bool *has_valid_array) {
133   CeedVector_Cuda *impl;
134 
135   CeedCallBackend(CeedVectorGetData(vec, &impl));
136   *has_valid_array = impl->h_array || impl->d_array;
137   return CEED_ERROR_SUCCESS;
138 }
139 
140 //------------------------------------------------------------------------------
141 // Check if has array of given type
142 //------------------------------------------------------------------------------
143 static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) {
144   CeedVector_Cuda *impl;
145 
146   CeedCallBackend(CeedVectorGetData(vec, &impl));
147   switch (mem_type) {
148     case CEED_MEM_HOST:
149       *has_array_of_type = impl->h_array_borrowed || impl->h_array_owned;
150       break;
151     case CEED_MEM_DEVICE:
152       *has_array_of_type = impl->d_array_borrowed || impl->d_array_owned;
153       break;
154   }
155   return CEED_ERROR_SUCCESS;
156 }
157 
158 //------------------------------------------------------------------------------
159 // Check if has borrowed array of given type
160 //------------------------------------------------------------------------------
161 static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) {
162   CeedVector_Cuda *impl;
163 
164   CeedCallBackend(CeedVectorGetData(vec, &impl));
165   switch (mem_type) {
166     case CEED_MEM_HOST:
167       *has_borrowed_array_of_type = impl->h_array_borrowed;
168       break;
169     case CEED_MEM_DEVICE:
170       *has_borrowed_array_of_type = impl->d_array_borrowed;
171       break;
172   }
173   return CEED_ERROR_SUCCESS;
174 }
175 
176 //------------------------------------------------------------------------------
177 // Set array from host
178 //------------------------------------------------------------------------------
179 static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
180   CeedSize         length;
181   CeedVector_Cuda *impl;
182 
183   CeedCallBackend(CeedVectorGetData(vec, &impl));
184   CeedCallBackend(CeedVectorGetLength(vec, &length));
185 
186   CeedCallBackend(CeedSetHostCeedScalarArray(array, copy_mode, length, (const CeedScalar **)&impl->h_array_owned,
187                                              (const CeedScalar **)&impl->h_array_borrowed, (const CeedScalar **)&impl->h_array));
188   return CEED_ERROR_SUCCESS;
189 }
190 
191 //------------------------------------------------------------------------------
192 // Set array from device
193 //------------------------------------------------------------------------------
194 static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
195   CeedSize         length;
196   Ceed             ceed;
197   CeedVector_Cuda *impl;
198 
199   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
200   CeedCallBackend(CeedVectorGetData(vec, &impl));
201   CeedCallBackend(CeedVectorGetLength(vec, &length));
202 
203   CeedCallBackend(CeedSetDeviceCeedScalarArray_Cuda(ceed, array, copy_mode, length, (const CeedScalar **)&impl->d_array_owned,
204                                                     (const CeedScalar **)&impl->d_array_borrowed, (const CeedScalar **)&impl->d_array));
205   return CEED_ERROR_SUCCESS;
206 }
207 
208 //------------------------------------------------------------------------------
209 // Set the array used by a vector,
210 //   freeing any previously allocated array if applicable
211 //------------------------------------------------------------------------------
212 static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) {
213   CeedVector_Cuda *impl;
214 
215   CeedCallBackend(CeedVectorGetData(vec, &impl));
216   CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec));
217   switch (mem_type) {
218     case CEED_MEM_HOST:
219       return CeedVectorSetArrayHost_Cuda(vec, copy_mode, array);
220     case CEED_MEM_DEVICE:
221       return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array);
222   }
223   return CEED_ERROR_UNSUPPORTED;
224 }
225 
226 //------------------------------------------------------------------------------
227 // Set host array to value
228 //------------------------------------------------------------------------------
229 static int CeedHostSetValue_Cuda(CeedScalar *h_array, CeedSize length, CeedScalar val) {
230   for (CeedSize i = 0; i < length; i++) h_array[i] = val;
231   return CEED_ERROR_SUCCESS;
232 }
233 
234 //------------------------------------------------------------------------------
235 // Set device array to value (impl in .cu file)
236 //------------------------------------------------------------------------------
237 int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val);
238 
239 //------------------------------------------------------------------------------
240 // Set a vector to a value
241 //------------------------------------------------------------------------------
242 static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
243   CeedSize         length;
244   CeedVector_Cuda *impl;
245 
246   CeedCallBackend(CeedVectorGetData(vec, &impl));
247   CeedCallBackend(CeedVectorGetLength(vec, &length));
248   // Set value for synced device/host array
249   if (!impl->d_array && !impl->h_array) {
250     if (impl->d_array_borrowed) {
251       impl->d_array = impl->d_array_borrowed;
252     } else if (impl->h_array_borrowed) {
253       impl->h_array = impl->h_array_borrowed;
254     } else if (impl->d_array_owned) {
255       impl->d_array = impl->d_array_owned;
256     } else if (impl->h_array_owned) {
257       impl->h_array = impl->h_array_owned;
258     } else {
259       CeedCallBackend(CeedVectorSetArray(vec, CEED_MEM_DEVICE, CEED_COPY_VALUES, NULL));
260     }
261   }
262   if (impl->d_array) {
263     CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val));
264     impl->h_array = NULL;
265   }
266   if (impl->h_array) {
267     CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val));
268     impl->d_array = NULL;
269   }
270   return CEED_ERROR_SUCCESS;
271 }
272 
273 //------------------------------------------------------------------------------
274 // Vector Take Array
275 //------------------------------------------------------------------------------
276 static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
277   CeedVector_Cuda *impl;
278 
279   CeedCallBackend(CeedVectorGetData(vec, &impl));
280   // Sync array to requested mem_type
281   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
282   // Update pointer
283   switch (mem_type) {
284     case CEED_MEM_HOST:
285       (*array)               = impl->h_array_borrowed;
286       impl->h_array_borrowed = NULL;
287       impl->h_array          = NULL;
288       break;
289     case CEED_MEM_DEVICE:
290       (*array)               = impl->d_array_borrowed;
291       impl->d_array_borrowed = NULL;
292       impl->d_array          = NULL;
293       break;
294   }
295   return CEED_ERROR_SUCCESS;
296 }
297 
298 //------------------------------------------------------------------------------
299 // Core logic for array syncronization for GetArray.
300 //   If a different memory type is most up to date, this will perform a copy
301 //------------------------------------------------------------------------------
302 static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
303   CeedVector_Cuda *impl;
304 
305   CeedCallBackend(CeedVectorGetData(vec, &impl));
306   // Sync array to requested mem_type
307   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
308   // Update pointer
309   switch (mem_type) {
310     case CEED_MEM_HOST:
311       *array = impl->h_array;
312       break;
313     case CEED_MEM_DEVICE:
314       *array = impl->d_array;
315       break;
316   }
317   return CEED_ERROR_SUCCESS;
318 }
319 
320 //------------------------------------------------------------------------------
321 // Get read-only access to a vector via the specified mem_type
322 //------------------------------------------------------------------------------
323 static int CeedVectorGetArrayRead_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedScalar **array) {
324   return CeedVectorGetArrayCore_Cuda(vec, mem_type, (CeedScalar **)array);
325 }
326 
327 //------------------------------------------------------------------------------
328 // Get read/write access to a vector via the specified mem_type
329 //------------------------------------------------------------------------------
330 static int CeedVectorGetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
331   CeedVector_Cuda *impl;
332 
333   CeedCallBackend(CeedVectorGetData(vec, &impl));
334   CeedCallBackend(CeedVectorGetArrayCore_Cuda(vec, mem_type, array));
335   CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec));
336   switch (mem_type) {
337     case CEED_MEM_HOST:
338       impl->h_array = *array;
339       break;
340     case CEED_MEM_DEVICE:
341       impl->d_array = *array;
342       break;
343   }
344   return CEED_ERROR_SUCCESS;
345 }
346 
347 //------------------------------------------------------------------------------
348 // Get write access to a vector via the specified mem_type
349 //------------------------------------------------------------------------------
350 static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
351   bool             has_array_of_type = true;
352   CeedVector_Cuda *impl;
353 
354   CeedCallBackend(CeedVectorGetData(vec, &impl));
355   CeedCallBackend(CeedVectorHasArrayOfType_Cuda(vec, mem_type, &has_array_of_type));
356   if (!has_array_of_type) {
357     // Allocate if array is not yet allocated
358     CeedCallBackend(CeedVectorSetArray(vec, mem_type, CEED_COPY_VALUES, NULL));
359   } else {
360     // Select dirty array
361     switch (mem_type) {
362       case CEED_MEM_HOST:
363         if (impl->h_array_borrowed) impl->h_array = impl->h_array_borrowed;
364         else impl->h_array = impl->h_array_owned;
365         break;
366       case CEED_MEM_DEVICE:
367         if (impl->d_array_borrowed) impl->d_array = impl->d_array_borrowed;
368         else impl->d_array = impl->d_array_owned;
369     }
370   }
371   return CeedVectorGetArray_Cuda(vec, mem_type, array);
372 }
373 
374 //------------------------------------------------------------------------------
375 // Get the norm of a CeedVector
376 //------------------------------------------------------------------------------
377 static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) {
378   Ceed     ceed;
379   CeedSize length;
380 #if CUDA_VERSION < 12000
381   CeedSize num_calls;
382 #endif
383   const CeedScalar *d_array;
384   CeedVector_Cuda  *impl;
385   cublasHandle_t    handle;
386 
387   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
388   CeedCallBackend(CeedVectorGetData(vec, &impl));
389   CeedCallBackend(CeedVectorGetLength(vec, &length));
390   CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
391 
392 #if CUDA_VERSION < 12000
393   // With CUDA 12, we can use the 64-bit integer interface. Prior to that,
394   // we need to check if the vector is too long to handle with int32,
395   // and if so, divide it into subsections for repeated cuBLAS calls.
396   num_calls = length / INT_MAX;
397   if (length % INT_MAX > 0) num_calls += 1;
398 #endif
399 
400   // Compute norm
401   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
402   switch (type) {
403     case CEED_NORM_1: {
404       *norm = 0.0;
405       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
406 #if CUDA_VERSION >= 12000  // We have CUDA 12, and can use 64-bit integers
407         CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
408 #else
409         float  sub_norm = 0.0;
410         float *d_array_start;
411 
412         for (CeedInt i = 0; i < num_calls; i++) {
413           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
414           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
415           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
416 
417           CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
418           *norm += sub_norm;
419         }
420 #endif
421       } else {
422 #if CUDA_VERSION >= 12000
423         CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
424 #else
425         double  sub_norm = 0.0;
426         double *d_array_start;
427 
428         for (CeedInt i = 0; i < num_calls; i++) {
429           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
430           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
431           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
432 
433           CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
434           *norm += sub_norm;
435         }
436 #endif
437       }
438       break;
439     }
440     case CEED_NORM_2: {
441       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
442 #if CUDA_VERSION >= 12000
443         CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
444 #else
445         float  sub_norm = 0.0, norm_sum = 0.0;
446         float *d_array_start;
447 
448         for (CeedInt i = 0; i < num_calls; i++) {
449           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
450           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
451           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
452 
453           CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
454           norm_sum += sub_norm * sub_norm;
455         }
456         *norm = sqrt(norm_sum);
457 #endif
458       } else {
459 #if CUDA_VERSION >= 12000
460         CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
461 #else
462         double  sub_norm = 0.0, norm_sum = 0.0;
463         double *d_array_start;
464 
465         for (CeedInt i = 0; i < num_calls; i++) {
466           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
467           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
468           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
469 
470           CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
471           norm_sum += sub_norm * sub_norm;
472         }
473         *norm = sqrt(norm_sum);
474 #endif
475       }
476       break;
477     }
478     case CEED_NORM_MAX: {
479       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
480 #if CUDA_VERSION >= 12000
481         int64_t    index;
482         CeedScalar norm_no_abs;
483 
484         CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
485         CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
486         *norm = fabs(norm_no_abs);
487 #else
488         CeedInt index;
489         float   sub_max = 0.0, current_max = 0.0;
490         float  *d_array_start;
491 
492         for (CeedInt i = 0; i < num_calls; i++) {
493           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
494           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
495           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
496 
497           CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
498           CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
499           if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
500         }
501         *norm = current_max;
502 #endif
503       } else {
504 #if CUDA_VERSION >= 12000
505         int64_t    index;
506         CeedScalar norm_no_abs;
507 
508         CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
509         CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
510         *norm = fabs(norm_no_abs);
511 #else
512         CeedInt index;
513         double  sub_max = 0.0, current_max = 0.0;
514         double *d_array_start;
515 
516         for (CeedInt i = 0; i < num_calls; i++) {
517           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
518           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
519           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
520 
521           CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
522           CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
523           if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
524         }
525         *norm = current_max;
526 #endif
527       }
528       break;
529     }
530   }
531   CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array));
532   return CEED_ERROR_SUCCESS;
533 }
534 
535 //------------------------------------------------------------------------------
536 // Take reciprocal of a vector on host
537 //------------------------------------------------------------------------------
538 static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedSize length) {
539   for (CeedSize i = 0; i < length; i++) {
540     if (fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i];
541   }
542   return CEED_ERROR_SUCCESS;
543 }
544 
545 //------------------------------------------------------------------------------
546 // Take reciprocal of a vector on device (impl in .cu file)
547 //------------------------------------------------------------------------------
548 int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length);
549 
550 //------------------------------------------------------------------------------
551 // Take reciprocal of a vector
552 //------------------------------------------------------------------------------
553 static int CeedVectorReciprocal_Cuda(CeedVector vec) {
554   CeedSize         length;
555   CeedVector_Cuda *impl;
556 
557   CeedCallBackend(CeedVectorGetData(vec, &impl));
558   CeedCallBackend(CeedVectorGetLength(vec, &length));
559   // Set value for synced device/host array
560   if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Cuda(impl->d_array, length));
561   if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Cuda(impl->h_array, length));
562   return CEED_ERROR_SUCCESS;
563 }
564 
565 //------------------------------------------------------------------------------
566 // Compute x = alpha x on the host
567 //------------------------------------------------------------------------------
568 static int CeedHostScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
569   for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha;
570   return CEED_ERROR_SUCCESS;
571 }
572 
573 //------------------------------------------------------------------------------
574 // Compute x = alpha x on device (impl in .cu file)
575 //------------------------------------------------------------------------------
576 int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length);
577 
578 //------------------------------------------------------------------------------
579 // Compute x = alpha x
580 //------------------------------------------------------------------------------
581 static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) {
582   CeedSize         length;
583   CeedVector_Cuda *x_impl;
584 
585   CeedCallBackend(CeedVectorGetData(x, &x_impl));
586   CeedCallBackend(CeedVectorGetLength(x, &length));
587   // Set value for synced device/host array
588   if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length));
589   if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length));
590   return CEED_ERROR_SUCCESS;
591 }
592 
593 //------------------------------------------------------------------------------
594 // Compute y = alpha x + y on the host
595 //------------------------------------------------------------------------------
596 static int CeedHostAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
597   for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i];
598   return CEED_ERROR_SUCCESS;
599 }
600 
601 //------------------------------------------------------------------------------
602 // Compute y = alpha x + y on device (impl in .cu file)
603 //------------------------------------------------------------------------------
604 int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length);
605 
606 //------------------------------------------------------------------------------
607 // Compute y = alpha x + y
608 //------------------------------------------------------------------------------
609 static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
610   Ceed             ceed;
611   CeedSize         length;
612   CeedVector_Cuda *y_impl, *x_impl;
613 
614   CeedCallBackend(CeedVectorGetCeed(y, &ceed));
615   CeedCallBackend(CeedVectorGetData(y, &y_impl));
616   CeedCallBackend(CeedVectorGetData(x, &x_impl));
617   CeedCallBackend(CeedVectorGetLength(y, &length));
618   // Set value for synced device/host array
619   if (y_impl->d_array) {
620     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
621     CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length));
622   }
623   if (y_impl->h_array) {
624     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
625     CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length));
626   }
627   return CEED_ERROR_SUCCESS;
628 }
629 
630 //------------------------------------------------------------------------------
631 // Compute y = alpha x + beta y on the host
632 //------------------------------------------------------------------------------
633 static int CeedHostAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
634   for (CeedSize i = 0; i < length; i++) y_array[i] = alpha * x_array[i] + beta * y_array[i];
635   return CEED_ERROR_SUCCESS;
636 }
637 
638 //------------------------------------------------------------------------------
639 // Compute y = alpha x + beta y on device (impl in .cu file)
640 //------------------------------------------------------------------------------
641 int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length);
642 
643 //------------------------------------------------------------------------------
644 // Compute y = alpha x + beta y
645 //------------------------------------------------------------------------------
646 static int CeedVectorAXPBY_Cuda(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) {
647   CeedSize         length;
648   CeedVector_Cuda *y_impl, *x_impl;
649 
650   CeedCallBackend(CeedVectorGetData(y, &y_impl));
651   CeedCallBackend(CeedVectorGetData(x, &x_impl));
652   CeedCallBackend(CeedVectorGetLength(y, &length));
653   // Set value for synced device/host array
654   if (y_impl->d_array) {
655     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
656     CeedCallBackend(CeedDeviceAXPBY_Cuda(y_impl->d_array, alpha, beta, x_impl->d_array, length));
657   }
658   if (y_impl->h_array) {
659     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
660     CeedCallBackend(CeedHostAXPBY_Cuda(y_impl->h_array, alpha, beta, x_impl->h_array, length));
661   }
662   return CEED_ERROR_SUCCESS;
663 }
664 
665 //------------------------------------------------------------------------------
666 // Compute the pointwise multiplication w = x .* y on the host
667 //------------------------------------------------------------------------------
668 static int CeedHostPointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
669   for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i];
670   return CEED_ERROR_SUCCESS;
671 }
672 
673 //------------------------------------------------------------------------------
674 // Compute the pointwise multiplication w = x .* y on device (impl in .cu file)
675 //------------------------------------------------------------------------------
676 int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length);
677 
678 //------------------------------------------------------------------------------
679 // Compute the pointwise multiplication w = x .* y
680 //------------------------------------------------------------------------------
681 static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, CeedVector y) {
682   CeedSize         length;
683   CeedVector_Cuda *w_impl, *x_impl, *y_impl;
684 
685   CeedCallBackend(CeedVectorGetData(w, &w_impl));
686   CeedCallBackend(CeedVectorGetData(x, &x_impl));
687   CeedCallBackend(CeedVectorGetData(y, &y_impl));
688   CeedCallBackend(CeedVectorGetLength(w, &length));
689   // Set value for synced device/host array
690   if (!w_impl->d_array && !w_impl->h_array) {
691     CeedCallBackend(CeedVectorSetValue(w, 0.0));
692   }
693   if (w_impl->d_array) {
694     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
695     CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_DEVICE));
696     CeedCallBackend(CeedDevicePointwiseMult_Cuda(w_impl->d_array, x_impl->d_array, y_impl->d_array, length));
697   }
698   if (w_impl->h_array) {
699     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
700     CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST));
701     CeedCallBackend(CeedHostPointwiseMult_Cuda(w_impl->h_array, x_impl->h_array, y_impl->h_array, length));
702   }
703   return CEED_ERROR_SUCCESS;
704 }
705 
706 //------------------------------------------------------------------------------
707 // Destroy the vector
708 //------------------------------------------------------------------------------
709 static int CeedVectorDestroy_Cuda(const CeedVector vec) {
710   CeedVector_Cuda *impl;
711 
712   CeedCallBackend(CeedVectorGetData(vec, &impl));
713   CeedCallCuda(CeedVectorReturnCeed(vec), cudaFree(impl->d_array_owned));
714   CeedCallBackend(CeedFree(&impl->h_array_owned));
715   CeedCallBackend(CeedFree(&impl));
716   return CEED_ERROR_SUCCESS;
717 }
718 
719 //------------------------------------------------------------------------------
720 // Create a vector of the specified length (does not allocate memory)
721 //------------------------------------------------------------------------------
722 int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) {
723   CeedVector_Cuda *impl;
724   Ceed             ceed;
725 
726   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
727   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Cuda));
728   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda));
729   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda));
730   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Cuda));
731   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Cuda));
732   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda));
733   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda));
734   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda));
735   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Cuda));
736   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Cuda));
737   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Cuda));
738   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Cuda));
739   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Cuda));
740   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Cuda));
741   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda));
742   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda));
743   CeedCallBackend(CeedCalloc(1, &impl));
744   CeedCallBackend(CeedVectorSetData(vec, impl));
745   return CEED_ERROR_SUCCESS;
746 }
747 
748 //------------------------------------------------------------------------------
749