xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-vector.c (revision 0c9ac183098ed55d0b8f535d4ee42b5752ebbe4d) !
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <cuda_runtime.h>
11 #include <math.h>
12 #include <stdbool.h>
13 #include <string.h>
14 
15 #include "../cuda/ceed-cuda-common.h"
16 #include "ceed-cuda-ref.h"
17 
18 //------------------------------------------------------------------------------
19 // Check if host/device sync is needed
20 //------------------------------------------------------------------------------
21 static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_type, bool *need_sync) {
22   bool             has_valid_array = false;
23   CeedVector_Cuda *impl;
24 
25   CeedCallBackend(CeedVectorGetData(vec, &impl));
26   CeedCallBackend(CeedVectorHasValidArray(vec, &has_valid_array));
27   switch (mem_type) {
28     case CEED_MEM_HOST:
29       *need_sync = has_valid_array && !impl->h_array;
30       break;
31     case CEED_MEM_DEVICE:
32       *need_sync = has_valid_array && !impl->d_array;
33       break;
34   }
35   return CEED_ERROR_SUCCESS;
36 }
37 
38 //------------------------------------------------------------------------------
39 // Sync host to device
40 //------------------------------------------------------------------------------
41 static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
42   CeedSize         length;
43   size_t           bytes;
44   Ceed             ceed;
45   CeedVector_Cuda *impl;
46 
47   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
48   CeedCallBackend(CeedVectorGetData(vec, &impl));
49 
50   CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device");
51 
52   CeedCallBackend(CeedVectorGetLength(vec, &length));
53   bytes = length * sizeof(CeedScalar);
54   if (impl->d_array_borrowed) {
55     impl->d_array = impl->d_array_borrowed;
56   } else if (impl->d_array_owned) {
57     impl->d_array = impl->d_array_owned;
58   } else {
59     CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes));
60     impl->d_array = impl->d_array_owned;
61   }
62   CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice));
63   return CEED_ERROR_SUCCESS;
64 }
65 
66 //------------------------------------------------------------------------------
67 // Sync device to host
68 //------------------------------------------------------------------------------
69 static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) {
70   CeedSize         length;
71   Ceed             ceed;
72   CeedVector_Cuda *impl;
73 
74   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
75   CeedCallBackend(CeedVectorGetData(vec, &impl));
76 
77   CeedCheck(impl->d_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid device data to sync to host");
78 
79   if (impl->h_array_borrowed) {
80     impl->h_array = impl->h_array_borrowed;
81   } else if (impl->h_array_owned) {
82     impl->h_array = impl->h_array_owned;
83   } else {
84     CeedSize length;
85 
86     CeedCallBackend(CeedVectorGetLength(vec, &length));
87     CeedCallBackend(CeedCalloc(length, &impl->h_array_owned));
88     impl->h_array = impl->h_array_owned;
89   }
90 
91   CeedCallBackend(CeedVectorGetLength(vec, &length));
92   size_t bytes = length * sizeof(CeedScalar);
93 
94   CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost));
95   return CEED_ERROR_SUCCESS;
96 }
97 
98 //------------------------------------------------------------------------------
99 // Sync arrays
100 //------------------------------------------------------------------------------
101 static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type) {
102   bool need_sync = false;
103 
104   // Check whether device/host sync is needed
105   CeedCallBackend(CeedVectorNeedSync_Cuda(vec, mem_type, &need_sync));
106   if (!need_sync) return CEED_ERROR_SUCCESS;
107 
108   switch (mem_type) {
109     case CEED_MEM_HOST:
110       return CeedVectorSyncD2H_Cuda(vec);
111     case CEED_MEM_DEVICE:
112       return CeedVectorSyncH2D_Cuda(vec);
113   }
114   return CEED_ERROR_UNSUPPORTED;
115 }
116 
117 //------------------------------------------------------------------------------
118 // Set all pointers as invalid
119 //------------------------------------------------------------------------------
120 static inline int CeedVectorSetAllInvalid_Cuda(const CeedVector vec) {
121   CeedVector_Cuda *impl;
122 
123   CeedCallBackend(CeedVectorGetData(vec, &impl));
124   impl->h_array = NULL;
125   impl->d_array = NULL;
126   return CEED_ERROR_SUCCESS;
127 }
128 
129 //------------------------------------------------------------------------------
130 // Check if CeedVector has any valid pointer
131 //------------------------------------------------------------------------------
132 static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, bool *has_valid_array) {
133   CeedVector_Cuda *impl;
134 
135   CeedCallBackend(CeedVectorGetData(vec, &impl));
136   *has_valid_array = impl->h_array || impl->d_array;
137   return CEED_ERROR_SUCCESS;
138 }
139 
140 //------------------------------------------------------------------------------
141 // Check if has array of given type
142 //------------------------------------------------------------------------------
143 static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) {
144   CeedVector_Cuda *impl;
145 
146   CeedCallBackend(CeedVectorGetData(vec, &impl));
147   switch (mem_type) {
148     case CEED_MEM_HOST:
149       *has_array_of_type = impl->h_array_borrowed || impl->h_array_owned;
150       break;
151     case CEED_MEM_DEVICE:
152       *has_array_of_type = impl->d_array_borrowed || impl->d_array_owned;
153       break;
154   }
155   return CEED_ERROR_SUCCESS;
156 }
157 
158 //------------------------------------------------------------------------------
159 // Check if has borrowed array of given type
160 //------------------------------------------------------------------------------
161 static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) {
162   CeedVector_Cuda *impl;
163 
164   CeedCallBackend(CeedVectorGetData(vec, &impl));
165   switch (mem_type) {
166     case CEED_MEM_HOST:
167       *has_borrowed_array_of_type = impl->h_array_borrowed;
168       break;
169     case CEED_MEM_DEVICE:
170       *has_borrowed_array_of_type = impl->d_array_borrowed;
171       break;
172   }
173   return CEED_ERROR_SUCCESS;
174 }
175 
176 //------------------------------------------------------------------------------
177 // Set array from host
178 //------------------------------------------------------------------------------
179 static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
180   CeedVector_Cuda *impl;
181 
182   CeedCallBackend(CeedVectorGetData(vec, &impl));
183   switch (copy_mode) {
184     case CEED_COPY_VALUES: {
185       if (!impl->h_array_owned) {
186         CeedSize length;
187 
188         CeedCallBackend(CeedVectorGetLength(vec, &length));
189         CeedCallBackend(CeedMalloc(length, &impl->h_array_owned));
190       }
191       impl->h_array_borrowed = NULL;
192       impl->h_array          = impl->h_array_owned;
193       if (array) {
194         CeedSize length;
195         size_t   bytes;
196 
197         CeedCallBackend(CeedVectorGetLength(vec, &length));
198         bytes = length * sizeof(CeedScalar);
199         memcpy(impl->h_array, array, bytes);
200       }
201     } break;
202     case CEED_OWN_POINTER:
203       CeedCallBackend(CeedFree(&impl->h_array_owned));
204       impl->h_array_owned    = array;
205       impl->h_array_borrowed = NULL;
206       impl->h_array          = array;
207       break;
208     case CEED_USE_POINTER:
209       CeedCallBackend(CeedFree(&impl->h_array_owned));
210       impl->h_array_borrowed = array;
211       impl->h_array          = array;
212       break;
213   }
214   return CEED_ERROR_SUCCESS;
215 }
216 
217 //------------------------------------------------------------------------------
218 // Set array from device
219 //------------------------------------------------------------------------------
220 static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
221   Ceed             ceed;
222   CeedVector_Cuda *impl;
223 
224   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
225   CeedCallBackend(CeedVectorGetData(vec, &impl));
226   switch (copy_mode) {
227     case CEED_COPY_VALUES: {
228       CeedSize length;
229       size_t   bytes;
230 
231       CeedCallBackend(CeedVectorGetLength(vec, &length));
232       bytes = length * sizeof(CeedScalar);
233       if (!impl->d_array_owned) {
234         CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes));
235       }
236       impl->d_array_borrowed = NULL;
237       impl->d_array          = impl->d_array_owned;
238       if (array) CeedCallCuda(ceed, cudaMemcpy(impl->d_array, array, bytes, cudaMemcpyDeviceToDevice));
239     } break;
240     case CEED_OWN_POINTER:
241       CeedCallCuda(ceed, cudaFree(impl->d_array_owned));
242       impl->d_array_owned    = array;
243       impl->d_array_borrowed = NULL;
244       impl->d_array          = array;
245       break;
246     case CEED_USE_POINTER:
247       CeedCallCuda(ceed, cudaFree(impl->d_array_owned));
248       impl->d_array_owned    = NULL;
249       impl->d_array_borrowed = array;
250       impl->d_array          = array;
251       break;
252   }
253   return CEED_ERROR_SUCCESS;
254 }
255 
256 //------------------------------------------------------------------------------
257 // Set the array used by a vector,
258 //   freeing any previously allocated array if applicable
259 //------------------------------------------------------------------------------
260 static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) {
261   CeedVector_Cuda *impl;
262 
263   CeedCallBackend(CeedVectorGetData(vec, &impl));
264   CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec));
265   switch (mem_type) {
266     case CEED_MEM_HOST:
267       return CeedVectorSetArrayHost_Cuda(vec, copy_mode, array);
268     case CEED_MEM_DEVICE:
269       return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array);
270   }
271   return CEED_ERROR_UNSUPPORTED;
272 }
273 
274 //------------------------------------------------------------------------------
275 // Set host array to value
276 //------------------------------------------------------------------------------
277 static int CeedHostSetValue_Cuda(CeedScalar *h_array, CeedSize length, CeedScalar val) {
278   for (CeedSize i = 0; i < length; i++) h_array[i] = val;
279   return CEED_ERROR_SUCCESS;
280 }
281 
282 //------------------------------------------------------------------------------
283 // Set device array to value (impl in .cu file)
284 //------------------------------------------------------------------------------
285 int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val);
286 
287 //------------------------------------------------------------------------------
288 // Set a vector to a value
289 //------------------------------------------------------------------------------
290 static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
291   CeedSize         length;
292   CeedVector_Cuda *impl;
293 
294   CeedCallBackend(CeedVectorGetData(vec, &impl));
295   CeedCallBackend(CeedVectorGetLength(vec, &length));
296   // Set value for synced device/host array
297   if (!impl->d_array && !impl->h_array) {
298     if (impl->d_array_borrowed) {
299       impl->d_array = impl->d_array_borrowed;
300     } else if (impl->h_array_borrowed) {
301       impl->h_array = impl->h_array_borrowed;
302     } else if (impl->d_array_owned) {
303       impl->d_array = impl->d_array_owned;
304     } else if (impl->h_array_owned) {
305       impl->h_array = impl->h_array_owned;
306     } else {
307       CeedCallBackend(CeedVectorSetArray(vec, CEED_MEM_DEVICE, CEED_COPY_VALUES, NULL));
308     }
309   }
310   if (impl->d_array) {
311     CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val));
312     impl->h_array = NULL;
313   }
314   if (impl->h_array) {
315     CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val));
316     impl->d_array = NULL;
317   }
318   return CEED_ERROR_SUCCESS;
319 }
320 
321 //------------------------------------------------------------------------------
322 // Vector Take Array
323 //------------------------------------------------------------------------------
324 static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
325   CeedVector_Cuda *impl;
326 
327   CeedCallBackend(CeedVectorGetData(vec, &impl));
328   // Sync array to requested mem_type
329   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
330   // Update pointer
331   switch (mem_type) {
332     case CEED_MEM_HOST:
333       (*array)               = impl->h_array_borrowed;
334       impl->h_array_borrowed = NULL;
335       impl->h_array          = NULL;
336       break;
337     case CEED_MEM_DEVICE:
338       (*array)               = impl->d_array_borrowed;
339       impl->d_array_borrowed = NULL;
340       impl->d_array          = NULL;
341       break;
342   }
343   return CEED_ERROR_SUCCESS;
344 }
345 
346 //------------------------------------------------------------------------------
347 // Core logic for array syncronization for GetArray.
348 //   If a different memory type is most up to date, this will perform a copy
349 //------------------------------------------------------------------------------
350 static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
351   CeedVector_Cuda *impl;
352 
353   CeedCallBackend(CeedVectorGetData(vec, &impl));
354   // Sync array to requested mem_type
355   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
356   // Update pointer
357   switch (mem_type) {
358     case CEED_MEM_HOST:
359       *array = impl->h_array;
360       break;
361     case CEED_MEM_DEVICE:
362       *array = impl->d_array;
363       break;
364   }
365   return CEED_ERROR_SUCCESS;
366 }
367 
368 //------------------------------------------------------------------------------
369 // Get read-only access to a vector via the specified mem_type
370 //------------------------------------------------------------------------------
371 static int CeedVectorGetArrayRead_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedScalar **array) {
372   return CeedVectorGetArrayCore_Cuda(vec, mem_type, (CeedScalar **)array);
373 }
374 
375 //------------------------------------------------------------------------------
376 // Get read/write access to a vector via the specified mem_type
377 //------------------------------------------------------------------------------
378 static int CeedVectorGetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
379   CeedVector_Cuda *impl;
380 
381   CeedCallBackend(CeedVectorGetData(vec, &impl));
382   CeedCallBackend(CeedVectorGetArrayCore_Cuda(vec, mem_type, array));
383   CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec));
384   switch (mem_type) {
385     case CEED_MEM_HOST:
386       impl->h_array = *array;
387       break;
388     case CEED_MEM_DEVICE:
389       impl->d_array = *array;
390       break;
391   }
392   return CEED_ERROR_SUCCESS;
393 }
394 
395 //------------------------------------------------------------------------------
396 // Get write access to a vector via the specified mem_type
397 //------------------------------------------------------------------------------
398 static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
399   bool             has_array_of_type = true;
400   CeedVector_Cuda *impl;
401 
402   CeedCallBackend(CeedVectorGetData(vec, &impl));
403   CeedCallBackend(CeedVectorHasArrayOfType_Cuda(vec, mem_type, &has_array_of_type));
404   if (!has_array_of_type) {
405     // Allocate if array is not yet allocated
406     CeedCallBackend(CeedVectorSetArray(vec, mem_type, CEED_COPY_VALUES, NULL));
407   } else {
408     // Select dirty array
409     switch (mem_type) {
410       case CEED_MEM_HOST:
411         if (impl->h_array_borrowed) impl->h_array = impl->h_array_borrowed;
412         else impl->h_array = impl->h_array_owned;
413         break;
414       case CEED_MEM_DEVICE:
415         if (impl->d_array_borrowed) impl->d_array = impl->d_array_borrowed;
416         else impl->d_array = impl->d_array_owned;
417     }
418   }
419   return CeedVectorGetArray_Cuda(vec, mem_type, array);
420 }
421 
422 //------------------------------------------------------------------------------
423 // Get the norm of a CeedVector
424 //------------------------------------------------------------------------------
425 static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) {
426   Ceed     ceed;
427   CeedSize length;
428 #if CUDA_VERSION < 12000
429   CeedSize num_calls;
430 #endif
431   const CeedScalar *d_array;
432   CeedVector_Cuda  *impl;
433   cublasHandle_t    handle;
434 
435   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
436   CeedCallBackend(CeedVectorGetData(vec, &impl));
437   CeedCallBackend(CeedVectorGetLength(vec, &length));
438   CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
439 
440 #if CUDA_VERSION < 12000
441   // With CUDA 12, we can use the 64-bit integer interface. Prior to that,
442   // we need to check if the vector is too long to handle with int32,
443   // and if so, divide it into subsections for repeated cuBLAS calls.
444   num_calls = length / INT_MAX;
445   if (length % INT_MAX > 0) num_calls += 1;
446 #endif
447 
448   // Compute norm
449   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
450   switch (type) {
451     case CEED_NORM_1: {
452       *norm = 0.0;
453       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
454 #if CUDA_VERSION >= 12000  // We have CUDA 12, and can use 64-bit integers
455         CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
456 #else
457         float  sub_norm = 0.0;
458         float *d_array_start;
459 
460         for (CeedInt i = 0; i < num_calls; i++) {
461           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
462           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
463           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
464 
465           CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
466           *norm += sub_norm;
467         }
468 #endif
469       } else {
470 #if CUDA_VERSION >= 12000
471         CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
472 #else
473         double  sub_norm = 0.0;
474         double *d_array_start;
475 
476         for (CeedInt i = 0; i < num_calls; i++) {
477           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
478           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
479           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
480 
481           CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
482           *norm += sub_norm;
483         }
484 #endif
485       }
486       break;
487     }
488     case CEED_NORM_2: {
489       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
490 #if CUDA_VERSION >= 12000
491         CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
492 #else
493         float  sub_norm = 0.0, norm_sum = 0.0;
494         float *d_array_start;
495 
496         for (CeedInt i = 0; i < num_calls; i++) {
497           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
498           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
499           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
500 
501           CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
502           norm_sum += sub_norm * sub_norm;
503         }
504         *norm = sqrt(norm_sum);
505 #endif
506       } else {
507 #if CUDA_VERSION >= 12000
508         CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
509 #else
510         double  sub_norm = 0.0, norm_sum = 0.0;
511         double *d_array_start;
512 
513         for (CeedInt i = 0; i < num_calls; i++) {
514           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
515           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
516           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
517 
518           CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
519           norm_sum += sub_norm * sub_norm;
520         }
521         *norm = sqrt(norm_sum);
522 #endif
523       }
524       break;
525     }
526     case CEED_NORM_MAX: {
527       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
528 #if CUDA_VERSION >= 12000
529         int64_t    index;
530         CeedScalar norm_no_abs;
531 
532         CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
533         CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
534         *norm = fabs(norm_no_abs);
535 #else
536         CeedInt index;
537         float   sub_max = 0.0, current_max = 0.0;
538         float  *d_array_start;
539 
540         for (CeedInt i = 0; i < num_calls; i++) {
541           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
542           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
543           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
544 
545           CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
546           CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
547           if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
548         }
549         *norm = current_max;
550 #endif
551       } else {
552 #if CUDA_VERSION >= 12000
553         int64_t    index;
554         CeedScalar norm_no_abs;
555 
556         CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
557         CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
558         *norm = fabs(norm_no_abs);
559 #else
560         CeedInt index;
561         double  sub_max = 0.0, current_max = 0.0;
562         double *d_array_start;
563 
564         for (CeedInt i = 0; i < num_calls; i++) {
565           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
566           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
567           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
568 
569           CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
570           CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
571           if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
572         }
573         *norm = current_max;
574 #endif
575       }
576       break;
577     }
578   }
579   CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array));
580   return CEED_ERROR_SUCCESS;
581 }
582 
583 //------------------------------------------------------------------------------
584 // Take reciprocal of a vector on host
585 //------------------------------------------------------------------------------
586 static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedSize length) {
587   for (CeedSize i = 0; i < length; i++) {
588     if (fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i];
589   }
590   return CEED_ERROR_SUCCESS;
591 }
592 
593 //------------------------------------------------------------------------------
594 // Take reciprocal of a vector on device (impl in .cu file)
595 //------------------------------------------------------------------------------
596 int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length);
597 
598 //------------------------------------------------------------------------------
599 // Take reciprocal of a vector
600 //------------------------------------------------------------------------------
601 static int CeedVectorReciprocal_Cuda(CeedVector vec) {
602   CeedSize         length;
603   CeedVector_Cuda *impl;
604 
605   CeedCallBackend(CeedVectorGetData(vec, &impl));
606   CeedCallBackend(CeedVectorGetLength(vec, &length));
607   // Set value for synced device/host array
608   if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Cuda(impl->d_array, length));
609   if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Cuda(impl->h_array, length));
610   return CEED_ERROR_SUCCESS;
611 }
612 
613 //------------------------------------------------------------------------------
614 // Compute x = alpha x on the host
615 //------------------------------------------------------------------------------
616 static int CeedHostScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
617   for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha;
618   return CEED_ERROR_SUCCESS;
619 }
620 
621 //------------------------------------------------------------------------------
622 // Compute x = alpha x on device (impl in .cu file)
623 //------------------------------------------------------------------------------
624 int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length);
625 
626 //------------------------------------------------------------------------------
627 // Compute x = alpha x
628 //------------------------------------------------------------------------------
629 static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) {
630   CeedSize         length;
631   CeedVector_Cuda *x_impl;
632 
633   CeedCallBackend(CeedVectorGetData(x, &x_impl));
634   CeedCallBackend(CeedVectorGetLength(x, &length));
635   // Set value for synced device/host array
636   if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length));
637   if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length));
638   return CEED_ERROR_SUCCESS;
639 }
640 
641 //------------------------------------------------------------------------------
642 // Compute y = alpha x + y on the host
643 //------------------------------------------------------------------------------
644 static int CeedHostAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
645   for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i];
646   return CEED_ERROR_SUCCESS;
647 }
648 
649 //------------------------------------------------------------------------------
650 // Compute y = alpha x + y on device (impl in .cu file)
651 //------------------------------------------------------------------------------
652 int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length);
653 
654 //------------------------------------------------------------------------------
655 // Compute y = alpha x + y
656 //------------------------------------------------------------------------------
657 static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
658   Ceed             ceed;
659   CeedSize         length;
660   CeedVector_Cuda *y_impl, *x_impl;
661 
662   CeedCallBackend(CeedVectorGetCeed(y, &ceed));
663   CeedCallBackend(CeedVectorGetData(y, &y_impl));
664   CeedCallBackend(CeedVectorGetData(x, &x_impl));
665   CeedCallBackend(CeedVectorGetLength(y, &length));
666   // Set value for synced device/host array
667   if (y_impl->d_array) {
668     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
669     CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length));
670   }
671   if (y_impl->h_array) {
672     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
673     CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length));
674   }
675   return CEED_ERROR_SUCCESS;
676 }
677 
678 //------------------------------------------------------------------------------
679 // Compute y = alpha x + beta y on the host
680 //------------------------------------------------------------------------------
681 static int CeedHostAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
682   for (CeedSize i = 0; i < length; i++) y_array[i] = alpha * x_array[i] + beta * y_array[i];
683   return CEED_ERROR_SUCCESS;
684 }
685 
686 //------------------------------------------------------------------------------
687 // Compute y = alpha x + beta y on device (impl in .cu file)
688 //------------------------------------------------------------------------------
689 int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length);
690 
691 //------------------------------------------------------------------------------
692 // Compute y = alpha x + beta y
693 //------------------------------------------------------------------------------
694 static int CeedVectorAXPBY_Cuda(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) {
695   CeedSize         length;
696   CeedVector_Cuda *y_impl, *x_impl;
697 
698   CeedCallBackend(CeedVectorGetData(y, &y_impl));
699   CeedCallBackend(CeedVectorGetData(x, &x_impl));
700   CeedCallBackend(CeedVectorGetLength(y, &length));
701   // Set value for synced device/host array
702   if (y_impl->d_array) {
703     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
704     CeedCallBackend(CeedDeviceAXPBY_Cuda(y_impl->d_array, alpha, beta, x_impl->d_array, length));
705   }
706   if (y_impl->h_array) {
707     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
708     CeedCallBackend(CeedHostAXPBY_Cuda(y_impl->h_array, alpha, beta, x_impl->h_array, length));
709   }
710   return CEED_ERROR_SUCCESS;
711 }
712 
713 //------------------------------------------------------------------------------
714 // Compute the pointwise multiplication w = x .* y on the host
715 //------------------------------------------------------------------------------
716 static int CeedHostPointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
717   for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i];
718   return CEED_ERROR_SUCCESS;
719 }
720 
721 //------------------------------------------------------------------------------
722 // Compute the pointwise multiplication w = x .* y on device (impl in .cu file)
723 //------------------------------------------------------------------------------
724 int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length);
725 
726 //------------------------------------------------------------------------------
727 // Compute the pointwise multiplication w = x .* y
728 //------------------------------------------------------------------------------
729 static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, CeedVector y) {
730   CeedSize         length;
731   CeedVector_Cuda *w_impl, *x_impl, *y_impl;
732 
733   CeedCallBackend(CeedVectorGetData(w, &w_impl));
734   CeedCallBackend(CeedVectorGetData(x, &x_impl));
735   CeedCallBackend(CeedVectorGetData(y, &y_impl));
736   CeedCallBackend(CeedVectorGetLength(w, &length));
737   // Set value for synced device/host array
738   if (!w_impl->d_array && !w_impl->h_array) {
739     CeedCallBackend(CeedVectorSetValue(w, 0.0));
740   }
741   if (w_impl->d_array) {
742     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
743     CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_DEVICE));
744     CeedCallBackend(CeedDevicePointwiseMult_Cuda(w_impl->d_array, x_impl->d_array, y_impl->d_array, length));
745   }
746   if (w_impl->h_array) {
747     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
748     CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST));
749     CeedCallBackend(CeedHostPointwiseMult_Cuda(w_impl->h_array, x_impl->h_array, y_impl->h_array, length));
750   }
751   return CEED_ERROR_SUCCESS;
752 }
753 
754 //------------------------------------------------------------------------------
755 // Destroy the vector
756 //------------------------------------------------------------------------------
757 static int CeedVectorDestroy_Cuda(const CeedVector vec) {
758   CeedVector_Cuda *impl;
759 
760   CeedCallBackend(CeedVectorGetData(vec, &impl));
761   CeedCallCuda(CeedVectorReturnCeed(vec), cudaFree(impl->d_array_owned));
762   CeedCallBackend(CeedFree(&impl->h_array_owned));
763   CeedCallBackend(CeedFree(&impl));
764   return CEED_ERROR_SUCCESS;
765 }
766 
767 //------------------------------------------------------------------------------
768 // Create a vector of the specified length (does not allocate memory)
769 //------------------------------------------------------------------------------
770 int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) {
771   CeedVector_Cuda *impl;
772   Ceed             ceed;
773 
774   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
775   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Cuda));
776   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda));
777   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda));
778   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Cuda));
779   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Cuda));
780   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda));
781   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda));
782   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda));
783   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Cuda));
784   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Cuda));
785   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Cuda));
786   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Cuda));
787   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Cuda));
788   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Cuda));
789   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda));
790   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda));
791   CeedCallBackend(CeedCalloc(1, &impl));
792   CeedCallBackend(CeedVectorSetData(vec, impl));
793   return CEED_ERROR_SUCCESS;
794 }
795 
796 //------------------------------------------------------------------------------
797