xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-vector.c (revision f59ebe5e436dfb5c2d53e11c74c720481a8777c4) !
1 // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <cuda_runtime.h>
11 #include <math.h>
12 #include <stdbool.h>
13 #include <string.h>
14 
15 #include "../cuda/ceed-cuda-common.h"
16 #include "ceed-cuda-ref.h"
17 
18 //------------------------------------------------------------------------------
19 // Check if host/device sync is needed
20 //------------------------------------------------------------------------------
21 static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_type, bool *need_sync) {
22   bool             has_valid_array = false;
23   CeedVector_Cuda *impl;
24 
25   CeedCallBackend(CeedVectorGetData(vec, &impl));
26   CeedCallBackend(CeedVectorHasValidArray(vec, &has_valid_array));
27   switch (mem_type) {
28     case CEED_MEM_HOST:
29       *need_sync = has_valid_array && !impl->h_array;
30       break;
31     case CEED_MEM_DEVICE:
32       *need_sync = has_valid_array && !impl->d_array;
33       break;
34   }
35   return CEED_ERROR_SUCCESS;
36 }
37 
38 //------------------------------------------------------------------------------
39 // Sync host to device
40 //------------------------------------------------------------------------------
41 static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
42   CeedSize         length;
43   size_t           bytes;
44   Ceed             ceed;
45   CeedVector_Cuda *impl;
46 
47   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
48   CeedCallBackend(CeedVectorGetData(vec, &impl));
49 
50   CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device");
51 
52   CeedCallBackend(CeedVectorGetLength(vec, &length));
53   bytes = length * sizeof(CeedScalar);
54   if (impl->d_array_borrowed) {
55     impl->d_array = impl->d_array_borrowed;
56   } else if (impl->d_array_owned) {
57     impl->d_array = impl->d_array_owned;
58   } else {
59     CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes));
60     impl->d_array = impl->d_array_owned;
61   }
62   CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice));
63   return CEED_ERROR_SUCCESS;
64 }
65 
66 //------------------------------------------------------------------------------
67 // Sync device to host
68 //------------------------------------------------------------------------------
69 static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) {
70   CeedSize         length;
71   Ceed             ceed;
72   CeedVector_Cuda *impl;
73 
74   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
75   CeedCallBackend(CeedVectorGetData(vec, &impl));
76 
77   CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
78 
79   if (impl->h_array_borrowed) {
80     impl->h_array = impl->h_array_borrowed;
81   } else if (impl->h_array_owned) {
82     impl->h_array = impl->h_array_owned;
83   } else {
84     CeedSize length;
85 
86     CeedCallBackend(CeedVectorGetLength(vec, &length));
87     CeedCallBackend(CeedCalloc(length, &impl->h_array_owned));
88     impl->h_array = impl->h_array_owned;
89   }
90 
91   CeedCallBackend(CeedVectorGetLength(vec, &length));
92   size_t bytes = length * sizeof(CeedScalar);
93 
94   CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost));
95   return CEED_ERROR_SUCCESS;
96 }
97 
98 //------------------------------------------------------------------------------
99 // Sync arrays
100 //------------------------------------------------------------------------------
101 static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type) {
102   bool need_sync = false;
103 
104   // Check whether device/host sync is needed
105   CeedCallBackend(CeedVectorNeedSync_Cuda(vec, mem_type, &need_sync));
106   if (!need_sync) return CEED_ERROR_SUCCESS;
107 
108   switch (mem_type) {
109     case CEED_MEM_HOST:
110       return CeedVectorSyncD2H_Cuda(vec);
111     case CEED_MEM_DEVICE:
112       return CeedVectorSyncH2D_Cuda(vec);
113   }
114   return CEED_ERROR_UNSUPPORTED;
115 }
116 
117 //------------------------------------------------------------------------------
118 // Set all pointers as invalid
119 //------------------------------------------------------------------------------
120 static inline int CeedVectorSetAllInvalid_Cuda(const CeedVector vec) {
121   CeedVector_Cuda *impl;
122 
123   CeedCallBackend(CeedVectorGetData(vec, &impl));
124   impl->h_array = NULL;
125   impl->d_array = NULL;
126   return CEED_ERROR_SUCCESS;
127 }
128 
129 //------------------------------------------------------------------------------
130 // Check if CeedVector has any valid pointer
131 //------------------------------------------------------------------------------
132 static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, bool *has_valid_array) {
133   CeedVector_Cuda *impl;
134 
135   CeedCallBackend(CeedVectorGetData(vec, &impl));
136   *has_valid_array = impl->h_array || impl->d_array;
137   return CEED_ERROR_SUCCESS;
138 }
139 
140 //------------------------------------------------------------------------------
141 // Check if has array of given type
142 //------------------------------------------------------------------------------
143 static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) {
144   CeedVector_Cuda *impl;
145 
146   CeedCallBackend(CeedVectorGetData(vec, &impl));
147   switch (mem_type) {
148     case CEED_MEM_HOST:
149       *has_array_of_type = impl->h_array_borrowed || impl->h_array_owned;
150       break;
151     case CEED_MEM_DEVICE:
152       *has_array_of_type = impl->d_array_borrowed || impl->d_array_owned;
153       break;
154   }
155   return CEED_ERROR_SUCCESS;
156 }
157 
158 //------------------------------------------------------------------------------
159 // Check if has borrowed array of given type
160 //------------------------------------------------------------------------------
161 static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) {
162   CeedVector_Cuda *impl;
163 
164   CeedCallBackend(CeedVectorGetData(vec, &impl));
165   switch (mem_type) {
166     case CEED_MEM_HOST:
167       *has_borrowed_array_of_type = impl->h_array_borrowed;
168       break;
169     case CEED_MEM_DEVICE:
170       *has_borrowed_array_of_type = impl->d_array_borrowed;
171       break;
172   }
173   return CEED_ERROR_SUCCESS;
174 }
175 
176 //------------------------------------------------------------------------------
177 // Set array from host
178 //------------------------------------------------------------------------------
179 static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
180   CeedSize         length;
181   CeedVector_Cuda *impl;
182 
183   CeedCallBackend(CeedVectorGetData(vec, &impl));
184   CeedCallBackend(CeedVectorGetLength(vec, &length));
185 
186   switch (copy_mode) {
187     case CEED_COPY_VALUES:
188       if (!impl->h_array_owned) CeedCallBackend(CeedMalloc(length, &impl->h_array_owned));
189       if (array) memcpy(impl->h_array_owned, array, length * sizeof(array[0]));
190       impl->h_array_borrowed = NULL;
191       impl->h_array          = impl->h_array_owned;
192       break;
193     case CEED_OWN_POINTER:
194       CeedCallBackend(CeedFree(&impl->h_array_owned));
195       impl->h_array_owned    = array;
196       impl->h_array_borrowed = NULL;
197       impl->h_array          = impl->h_array_owned;
198       break;
199     case CEED_USE_POINTER:
200       CeedCallBackend(CeedFree(&impl->h_array_owned));
201       impl->h_array_borrowed = array;
202       impl->h_array          = impl->h_array_borrowed;
203       break;
204   }
205   return CEED_ERROR_SUCCESS;
206 }
207 
208 //------------------------------------------------------------------------------
209 // Set array from device
210 //------------------------------------------------------------------------------
211 static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
212   CeedSize         length;
213   Ceed             ceed;
214   CeedVector_Cuda *impl;
215 
216   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
217   CeedCallBackend(CeedVectorGetData(vec, &impl));
218   CeedCallBackend(CeedVectorGetLength(vec, &length));
219 
220   switch (copy_mode) {
221     case CEED_COPY_VALUES:
222       if (!impl->d_array_owned) CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, length * sizeof(array[0])));
223       if (array) CeedCallCuda(ceed, cudaMemcpy(impl->d_array_owned, array, length * sizeof(array[0]), cudaMemcpyDeviceToDevice));
224       impl->d_array_borrowed = NULL;
225       impl->d_array          = impl->d_array_owned;
226       break;
227     case CEED_OWN_POINTER:
228       CeedCallCuda(ceed, cudaFree(impl->d_array_owned));
229       impl->d_array_owned    = array;
230       impl->d_array_borrowed = NULL;
231       impl->d_array          = impl->d_array_owned;
232       break;
233     case CEED_USE_POINTER:
234       CeedCallCuda(ceed, cudaFree(impl->d_array_owned));
235       impl->d_array_owned    = NULL;
236       impl->d_array_borrowed = array;
237       impl->d_array          = impl->d_array_borrowed;
238       break;
239   }
240   return CEED_ERROR_SUCCESS;
241 }
242 
243 //------------------------------------------------------------------------------
244 // Set the array used by a vector,
245 //   freeing any previously allocated array if applicable
246 //------------------------------------------------------------------------------
247 static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) {
248   CeedVector_Cuda *impl;
249 
250   CeedCallBackend(CeedVectorGetData(vec, &impl));
251   CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec));
252   switch (mem_type) {
253     case CEED_MEM_HOST:
254       return CeedVectorSetArrayHost_Cuda(vec, copy_mode, array);
255     case CEED_MEM_DEVICE:
256       return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array);
257   }
258   return CEED_ERROR_UNSUPPORTED;
259 }
260 
261 //------------------------------------------------------------------------------
262 // Set host array to value
263 //------------------------------------------------------------------------------
264 static int CeedHostSetValue_Cuda(CeedScalar *h_array, CeedSize length, CeedScalar val) {
265   for (CeedSize i = 0; i < length; i++) h_array[i] = val;
266   return CEED_ERROR_SUCCESS;
267 }
268 
269 //------------------------------------------------------------------------------
270 // Set device array to value (impl in .cu file)
271 //------------------------------------------------------------------------------
272 int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val);
273 
274 //------------------------------------------------------------------------------
275 // Set a vector to a value
276 //------------------------------------------------------------------------------
277 static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
278   CeedSize         length;
279   CeedVector_Cuda *impl;
280 
281   CeedCallBackend(CeedVectorGetData(vec, &impl));
282   CeedCallBackend(CeedVectorGetLength(vec, &length));
283   // Set value for synced device/host array
284   if (!impl->d_array && !impl->h_array) {
285     if (impl->d_array_borrowed) {
286       impl->d_array = impl->d_array_borrowed;
287     } else if (impl->h_array_borrowed) {
288       impl->h_array = impl->h_array_borrowed;
289     } else if (impl->d_array_owned) {
290       impl->d_array = impl->d_array_owned;
291     } else if (impl->h_array_owned) {
292       impl->h_array = impl->h_array_owned;
293     } else {
294       CeedCallBackend(CeedVectorSetArray(vec, CEED_MEM_DEVICE, CEED_COPY_VALUES, NULL));
295     }
296   }
297   if (impl->d_array) {
298     CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val));
299     impl->h_array = NULL;
300   }
301   if (impl->h_array) {
302     CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val));
303     impl->d_array = NULL;
304   }
305   return CEED_ERROR_SUCCESS;
306 }
307 
308 //------------------------------------------------------------------------------
309 // Vector Take Array
310 //------------------------------------------------------------------------------
311 static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
312   CeedVector_Cuda *impl;
313 
314   CeedCallBackend(CeedVectorGetData(vec, &impl));
315   // Sync array to requested mem_type
316   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
317   // Update pointer
318   switch (mem_type) {
319     case CEED_MEM_HOST:
320       (*array)               = impl->h_array_borrowed;
321       impl->h_array_borrowed = NULL;
322       impl->h_array          = NULL;
323       break;
324     case CEED_MEM_DEVICE:
325       (*array)               = impl->d_array_borrowed;
326       impl->d_array_borrowed = NULL;
327       impl->d_array          = NULL;
328       break;
329   }
330   return CEED_ERROR_SUCCESS;
331 }
332 
333 //------------------------------------------------------------------------------
334 // Core logic for array syncronization for GetArray.
335 //   If a different memory type is most up to date, this will perform a copy
336 //------------------------------------------------------------------------------
337 static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
338   CeedVector_Cuda *impl;
339 
340   CeedCallBackend(CeedVectorGetData(vec, &impl));
341   // Sync array to requested mem_type
342   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
343   // Update pointer
344   switch (mem_type) {
345     case CEED_MEM_HOST:
346       *array = impl->h_array;
347       break;
348     case CEED_MEM_DEVICE:
349       *array = impl->d_array;
350       break;
351   }
352   return CEED_ERROR_SUCCESS;
353 }
354 
355 //------------------------------------------------------------------------------
356 // Get read-only access to a vector via the specified mem_type
357 //------------------------------------------------------------------------------
358 static int CeedVectorGetArrayRead_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedScalar **array) {
359   return CeedVectorGetArrayCore_Cuda(vec, mem_type, (CeedScalar **)array);
360 }
361 
362 //------------------------------------------------------------------------------
363 // Get read/write access to a vector via the specified mem_type
364 //------------------------------------------------------------------------------
365 static int CeedVectorGetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
366   CeedVector_Cuda *impl;
367 
368   CeedCallBackend(CeedVectorGetData(vec, &impl));
369   CeedCallBackend(CeedVectorGetArrayCore_Cuda(vec, mem_type, array));
370   CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec));
371   switch (mem_type) {
372     case CEED_MEM_HOST:
373       impl->h_array = *array;
374       break;
375     case CEED_MEM_DEVICE:
376       impl->d_array = *array;
377       break;
378   }
379   return CEED_ERROR_SUCCESS;
380 }
381 
382 //------------------------------------------------------------------------------
383 // Get write access to a vector via the specified mem_type
384 //------------------------------------------------------------------------------
385 static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
386   bool             has_array_of_type = true;
387   CeedVector_Cuda *impl;
388 
389   CeedCallBackend(CeedVectorGetData(vec, &impl));
390   CeedCallBackend(CeedVectorHasArrayOfType_Cuda(vec, mem_type, &has_array_of_type));
391   if (!has_array_of_type) {
392     // Allocate if array is not yet allocated
393     CeedCallBackend(CeedVectorSetArray(vec, mem_type, CEED_COPY_VALUES, NULL));
394   } else {
395     // Select dirty array
396     switch (mem_type) {
397       case CEED_MEM_HOST:
398         if (impl->h_array_borrowed) impl->h_array = impl->h_array_borrowed;
399         else impl->h_array = impl->h_array_owned;
400         break;
401       case CEED_MEM_DEVICE:
402         if (impl->d_array_borrowed) impl->d_array = impl->d_array_borrowed;
403         else impl->d_array = impl->d_array_owned;
404     }
405   }
406   return CeedVectorGetArray_Cuda(vec, mem_type, array);
407 }
408 
409 //------------------------------------------------------------------------------
410 // Get the norm of a CeedVector
411 //------------------------------------------------------------------------------
412 static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) {
413   Ceed     ceed;
414   CeedSize length;
415 #if CUDA_VERSION < 12000
416   CeedSize num_calls;
417 #endif
418   const CeedScalar *d_array;
419   CeedVector_Cuda  *impl;
420   cublasHandle_t    handle;
421 
422   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
423   CeedCallBackend(CeedVectorGetData(vec, &impl));
424   CeedCallBackend(CeedVectorGetLength(vec, &length));
425   CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
426 
427 #if CUDA_VERSION < 12000
428   // With CUDA 12, we can use the 64-bit integer interface. Prior to that,
429   // we need to check if the vector is too long to handle with int32,
430   // and if so, divide it into subsections for repeated cuBLAS calls.
431   num_calls = length / INT_MAX;
432   if (length % INT_MAX > 0) num_calls += 1;
433 #endif
434 
435   // Compute norm
436   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
437   switch (type) {
438     case CEED_NORM_1: {
439       *norm = 0.0;
440       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
441 #if CUDA_VERSION >= 12000  // We have CUDA 12, and can use 64-bit integers
442         CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
443 #else
444         float  sub_norm = 0.0;
445         float *d_array_start;
446 
447         for (CeedInt i = 0; i < num_calls; i++) {
448           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
449           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
450           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
451 
452           CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
453           *norm += sub_norm;
454         }
455 #endif
456       } else {
457 #if CUDA_VERSION >= 12000
458         CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
459 #else
460         double  sub_norm = 0.0;
461         double *d_array_start;
462 
463         for (CeedInt i = 0; i < num_calls; i++) {
464           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
465           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
466           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
467 
468           CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
469           *norm += sub_norm;
470         }
471 #endif
472       }
473       break;
474     }
475     case CEED_NORM_2: {
476       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
477 #if CUDA_VERSION >= 12000
478         CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
479 #else
480         float  sub_norm = 0.0, norm_sum = 0.0;
481         float *d_array_start;
482 
483         for (CeedInt i = 0; i < num_calls; i++) {
484           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
485           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
486           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
487 
488           CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
489           norm_sum += sub_norm * sub_norm;
490         }
491         *norm = sqrt(norm_sum);
492 #endif
493       } else {
494 #if CUDA_VERSION >= 12000
495         CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
496 #else
497         double  sub_norm = 0.0, norm_sum = 0.0;
498         double *d_array_start;
499 
500         for (CeedInt i = 0; i < num_calls; i++) {
501           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
502           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
503           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
504 
505           CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
506           norm_sum += sub_norm * sub_norm;
507         }
508         *norm = sqrt(norm_sum);
509 #endif
510       }
511       break;
512     }
513     case CEED_NORM_MAX: {
514       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
515 #if CUDA_VERSION >= 12000
516         int64_t    index;
517         CeedScalar norm_no_abs;
518 
519         CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
520         CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
521         *norm = fabs(norm_no_abs);
522 #else
523         CeedInt index;
524         float   sub_max = 0.0, current_max = 0.0;
525         float  *d_array_start;
526 
527         for (CeedInt i = 0; i < num_calls; i++) {
528           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
529           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
530           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
531 
532           CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
533           CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
534           if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
535         }
536         *norm = current_max;
537 #endif
538       } else {
539 #if CUDA_VERSION >= 12000
540         int64_t    index;
541         CeedScalar norm_no_abs;
542 
543         CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
544         CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
545         *norm = fabs(norm_no_abs);
546 #else
547         CeedInt index;
548         double  sub_max = 0.0, current_max = 0.0;
549         double *d_array_start;
550 
551         for (CeedInt i = 0; i < num_calls; i++) {
552           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
553           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
554           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
555 
556           CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
557           CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
558           if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
559         }
560         *norm = current_max;
561 #endif
562       }
563       break;
564     }
565   }
566   CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array));
567   return CEED_ERROR_SUCCESS;
568 }
569 
570 //------------------------------------------------------------------------------
571 // Take reciprocal of a vector on host
572 //------------------------------------------------------------------------------
573 static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedSize length) {
574   for (CeedSize i = 0; i < length; i++) {
575     if (fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i];
576   }
577   return CEED_ERROR_SUCCESS;
578 }
579 
580 //------------------------------------------------------------------------------
581 // Take reciprocal of a vector on device (impl in .cu file)
582 //------------------------------------------------------------------------------
583 int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length);
584 
585 //------------------------------------------------------------------------------
586 // Take reciprocal of a vector
587 //------------------------------------------------------------------------------
588 static int CeedVectorReciprocal_Cuda(CeedVector vec) {
589   CeedSize         length;
590   CeedVector_Cuda *impl;
591 
592   CeedCallBackend(CeedVectorGetData(vec, &impl));
593   CeedCallBackend(CeedVectorGetLength(vec, &length));
594   // Set value for synced device/host array
595   if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Cuda(impl->d_array, length));
596   if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Cuda(impl->h_array, length));
597   return CEED_ERROR_SUCCESS;
598 }
599 
600 //------------------------------------------------------------------------------
601 // Compute x = alpha x on the host
602 //------------------------------------------------------------------------------
603 static int CeedHostScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
604   for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha;
605   return CEED_ERROR_SUCCESS;
606 }
607 
608 //------------------------------------------------------------------------------
609 // Compute x = alpha x on device (impl in .cu file)
610 //------------------------------------------------------------------------------
611 int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length);
612 
613 //------------------------------------------------------------------------------
614 // Compute x = alpha x
615 //------------------------------------------------------------------------------
616 static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) {
617   CeedSize         length;
618   CeedVector_Cuda *x_impl;
619 
620   CeedCallBackend(CeedVectorGetData(x, &x_impl));
621   CeedCallBackend(CeedVectorGetLength(x, &length));
622   // Set value for synced device/host array
623   if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length));
624   if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length));
625   return CEED_ERROR_SUCCESS;
626 }
627 
628 //------------------------------------------------------------------------------
629 // Compute y = alpha x + y on the host
630 //------------------------------------------------------------------------------
631 static int CeedHostAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
632   for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i];
633   return CEED_ERROR_SUCCESS;
634 }
635 
636 //------------------------------------------------------------------------------
637 // Compute y = alpha x + y on device (impl in .cu file)
638 //------------------------------------------------------------------------------
639 int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length);
640 
641 //------------------------------------------------------------------------------
642 // Compute y = alpha x + y
643 //------------------------------------------------------------------------------
644 static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
645   Ceed             ceed;
646   CeedSize         length;
647   CeedVector_Cuda *y_impl, *x_impl;
648 
649   CeedCallBackend(CeedVectorGetCeed(y, &ceed));
650   CeedCallBackend(CeedVectorGetData(y, &y_impl));
651   CeedCallBackend(CeedVectorGetData(x, &x_impl));
652   CeedCallBackend(CeedVectorGetLength(y, &length));
653   // Set value for synced device/host array
654   if (y_impl->d_array) {
655     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
656     CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length));
657   }
658   if (y_impl->h_array) {
659     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
660     CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length));
661   }
662   return CEED_ERROR_SUCCESS;
663 }
664 
665 //------------------------------------------------------------------------------
666 // Compute y = alpha x + beta y on the host
667 //------------------------------------------------------------------------------
668 static int CeedHostAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
669   for (CeedSize i = 0; i < length; i++) y_array[i] = alpha * x_array[i] + beta * y_array[i];
670   return CEED_ERROR_SUCCESS;
671 }
672 
673 //------------------------------------------------------------------------------
674 // Compute y = alpha x + beta y on device (impl in .cu file)
675 //------------------------------------------------------------------------------
676 int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length);
677 
678 //------------------------------------------------------------------------------
679 // Compute y = alpha x + beta y
680 //------------------------------------------------------------------------------
681 static int CeedVectorAXPBY_Cuda(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) {
682   CeedSize         length;
683   CeedVector_Cuda *y_impl, *x_impl;
684 
685   CeedCallBackend(CeedVectorGetData(y, &y_impl));
686   CeedCallBackend(CeedVectorGetData(x, &x_impl));
687   CeedCallBackend(CeedVectorGetLength(y, &length));
688   // Set value for synced device/host array
689   if (y_impl->d_array) {
690     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
691     CeedCallBackend(CeedDeviceAXPBY_Cuda(y_impl->d_array, alpha, beta, x_impl->d_array, length));
692   }
693   if (y_impl->h_array) {
694     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
695     CeedCallBackend(CeedHostAXPBY_Cuda(y_impl->h_array, alpha, beta, x_impl->h_array, length));
696   }
697   return CEED_ERROR_SUCCESS;
698 }
699 
700 //------------------------------------------------------------------------------
701 // Compute the pointwise multiplication w = x .* y on the host
702 //------------------------------------------------------------------------------
703 static int CeedHostPointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
704   for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i];
705   return CEED_ERROR_SUCCESS;
706 }
707 
708 //------------------------------------------------------------------------------
709 // Compute the pointwise multiplication w = x .* y on device (impl in .cu file)
710 //------------------------------------------------------------------------------
711 int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length);
712 
713 //------------------------------------------------------------------------------
714 // Compute the pointwise multiplication w = x .* y
715 //------------------------------------------------------------------------------
716 static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, CeedVector y) {
717   CeedSize         length;
718   CeedVector_Cuda *w_impl, *x_impl, *y_impl;
719 
720   CeedCallBackend(CeedVectorGetData(w, &w_impl));
721   CeedCallBackend(CeedVectorGetData(x, &x_impl));
722   CeedCallBackend(CeedVectorGetData(y, &y_impl));
723   CeedCallBackend(CeedVectorGetLength(w, &length));
724   // Set value for synced device/host array
725   if (!w_impl->d_array && !w_impl->h_array) {
726     CeedCallBackend(CeedVectorSetValue(w, 0.0));
727   }
728   if (w_impl->d_array) {
729     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
730     CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_DEVICE));
731     CeedCallBackend(CeedDevicePointwiseMult_Cuda(w_impl->d_array, x_impl->d_array, y_impl->d_array, length));
732   }
733   if (w_impl->h_array) {
734     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
735     CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST));
736     CeedCallBackend(CeedHostPointwiseMult_Cuda(w_impl->h_array, x_impl->h_array, y_impl->h_array, length));
737   }
738   return CEED_ERROR_SUCCESS;
739 }
740 
741 //------------------------------------------------------------------------------
742 // Destroy the vector
743 //------------------------------------------------------------------------------
744 static int CeedVectorDestroy_Cuda(const CeedVector vec) {
745   CeedVector_Cuda *impl;
746 
747   CeedCallBackend(CeedVectorGetData(vec, &impl));
748   CeedCallCuda(CeedVectorReturnCeed(vec), cudaFree(impl->d_array_owned));
749   CeedCallBackend(CeedFree(&impl->h_array_owned));
750   CeedCallBackend(CeedFree(&impl));
751   return CEED_ERROR_SUCCESS;
752 }
753 
754 //------------------------------------------------------------------------------
755 // Create a vector of the specified length (does not allocate memory)
756 //------------------------------------------------------------------------------
757 int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) {
758   CeedVector_Cuda *impl;
759   Ceed             ceed;
760 
761   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
762   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Cuda));
763   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda));
764   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda));
765   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Cuda));
766   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Cuda));
767   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda));
768   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda));
769   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda));
770   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Cuda));
771   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Cuda));
772   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Cuda));
773   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Cuda));
774   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Cuda));
775   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Cuda));
776   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda));
777   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda));
778   CeedCallBackend(CeedCalloc(1, &impl));
779   CeedCallBackend(CeedVectorSetData(vec, impl));
780   return CEED_ERROR_SUCCESS;
781 }
782 
783 //------------------------------------------------------------------------------
784