xref: /libCEED/backends/cuda-ref/ceed-cuda-ref-vector.c (revision 3196072fa7d47cffcf474f8b421f80aad90fa9c5)
1 // Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include <ceed.h>
9 #include <ceed/backend.h>
10 #include <cuda_runtime.h>
11 #include <math.h>
12 #include <stdbool.h>
13 #include <string.h>
14 
15 #include "../cuda/ceed-cuda-common.h"
16 #include "ceed-cuda-ref.h"
17 
18 //------------------------------------------------------------------------------
19 // Check if host/device sync is needed
20 //------------------------------------------------------------------------------
21 static inline int CeedVectorNeedSync_Cuda(const CeedVector vec, CeedMemType mem_type, bool *need_sync) {
22   bool             has_valid_array = false;
23   CeedVector_Cuda *impl;
24 
25   CeedCallBackend(CeedVectorGetData(vec, &impl));
26   CeedCallBackend(CeedVectorHasValidArray(vec, &has_valid_array));
27   switch (mem_type) {
28     case CEED_MEM_HOST:
29       *need_sync = has_valid_array && !impl->h_array;
30       break;
31     case CEED_MEM_DEVICE:
32       *need_sync = has_valid_array && !impl->d_array;
33       break;
34   }
35   return CEED_ERROR_SUCCESS;
36 }
37 
38 //------------------------------------------------------------------------------
39 // Sync host to device
40 //------------------------------------------------------------------------------
41 static inline int CeedVectorSyncH2D_Cuda(const CeedVector vec) {
42   CeedSize         length;
43   size_t           bytes;
44   Ceed             ceed;
45   CeedVector_Cuda *impl;
46 
47   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
48   CeedCallBackend(CeedVectorGetData(vec, &impl));
49 
50   CeedCheck(impl->h_array, CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "No valid host data to sync to device");
51 
52   CeedCallBackend(CeedVectorGetLength(vec, &length));
53   bytes = length * sizeof(CeedScalar);
54   if (impl->d_array_borrowed) {
55     impl->d_array = impl->d_array_borrowed;
56   } else if (impl->d_array_owned) {
57     impl->d_array = impl->d_array_owned;
58   } else {
59     CeedCallCuda(ceed, cudaMalloc((void **)&impl->d_array_owned, bytes));
60     impl->d_array = impl->d_array_owned;
61   }
62   CeedCallCuda(ceed, cudaMemcpy(impl->d_array, impl->h_array, bytes, cudaMemcpyHostToDevice));
63   return CEED_ERROR_SUCCESS;
64 }
65 
66 //------------------------------------------------------------------------------
67 // Sync device to host
68 //------------------------------------------------------------------------------
69 static inline int CeedVectorSyncD2H_Cuda(const CeedVector vec) {
70   CeedSize         length;
71   Ceed             ceed;
72   CeedVector_Cuda *impl;
73 
74   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
75   CeedCallBackend(CeedVectorGetData(vec, &impl));
76 
77   CeedCheck(impl->d_array, ceed, CEED_ERROR_BACKEND, "No valid device data to sync to host");
78 
79   if (impl->h_array_borrowed) {
80     impl->h_array = impl->h_array_borrowed;
81   } else if (impl->h_array_owned) {
82     impl->h_array = impl->h_array_owned;
83   } else {
84     CeedSize length;
85 
86     CeedCallBackend(CeedVectorGetLength(vec, &length));
87     CeedCallBackend(CeedCalloc(length, &impl->h_array_owned));
88     impl->h_array = impl->h_array_owned;
89   }
90 
91   CeedCallBackend(CeedVectorGetLength(vec, &length));
92   size_t bytes = length * sizeof(CeedScalar);
93 
94   CeedCallCuda(ceed, cudaMemcpy(impl->h_array, impl->d_array, bytes, cudaMemcpyDeviceToHost));
95   return CEED_ERROR_SUCCESS;
96 }
97 
98 //------------------------------------------------------------------------------
99 // Sync arrays
100 //------------------------------------------------------------------------------
101 static int CeedVectorSyncArray_Cuda(const CeedVector vec, CeedMemType mem_type) {
102   bool need_sync = false;
103 
104   // Check whether device/host sync is needed
105   CeedCallBackend(CeedVectorNeedSync_Cuda(vec, mem_type, &need_sync));
106   if (!need_sync) return CEED_ERROR_SUCCESS;
107 
108   switch (mem_type) {
109     case CEED_MEM_HOST:
110       return CeedVectorSyncD2H_Cuda(vec);
111     case CEED_MEM_DEVICE:
112       return CeedVectorSyncH2D_Cuda(vec);
113   }
114   return CEED_ERROR_UNSUPPORTED;
115 }
116 
117 //------------------------------------------------------------------------------
118 // Set all pointers as invalid
119 //------------------------------------------------------------------------------
120 static inline int CeedVectorSetAllInvalid_Cuda(const CeedVector vec) {
121   CeedVector_Cuda *impl;
122 
123   CeedCallBackend(CeedVectorGetData(vec, &impl));
124   impl->h_array = NULL;
125   impl->d_array = NULL;
126   return CEED_ERROR_SUCCESS;
127 }
128 
129 //------------------------------------------------------------------------------
130 // Check if CeedVector has any valid pointer
131 //------------------------------------------------------------------------------
132 static inline int CeedVectorHasValidArray_Cuda(const CeedVector vec, bool *has_valid_array) {
133   CeedVector_Cuda *impl;
134 
135   CeedCallBackend(CeedVectorGetData(vec, &impl));
136   *has_valid_array = impl->h_array || impl->d_array;
137   return CEED_ERROR_SUCCESS;
138 }
139 
140 //------------------------------------------------------------------------------
141 // Check if has array of given type
142 //------------------------------------------------------------------------------
143 static inline int CeedVectorHasArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_array_of_type) {
144   CeedVector_Cuda *impl;
145 
146   CeedCallBackend(CeedVectorGetData(vec, &impl));
147   switch (mem_type) {
148     case CEED_MEM_HOST:
149       *has_array_of_type = impl->h_array_borrowed || impl->h_array_owned;
150       break;
151     case CEED_MEM_DEVICE:
152       *has_array_of_type = impl->d_array_borrowed || impl->d_array_owned;
153       break;
154   }
155   return CEED_ERROR_SUCCESS;
156 }
157 
158 //------------------------------------------------------------------------------
159 // Check if has borrowed array of given type
160 //------------------------------------------------------------------------------
161 static inline int CeedVectorHasBorrowedArrayOfType_Cuda(const CeedVector vec, CeedMemType mem_type, bool *has_borrowed_array_of_type) {
162   CeedVector_Cuda *impl;
163 
164   CeedCallBackend(CeedVectorGetData(vec, &impl));
165   switch (mem_type) {
166     case CEED_MEM_HOST:
167       *has_borrowed_array_of_type = impl->h_array_borrowed;
168       break;
169     case CEED_MEM_DEVICE:
170       *has_borrowed_array_of_type = impl->d_array_borrowed;
171       break;
172   }
173   return CEED_ERROR_SUCCESS;
174 }
175 
176 //------------------------------------------------------------------------------
177 // Set array from host
178 //------------------------------------------------------------------------------
179 static int CeedVectorSetArrayHost_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
180   CeedSize         length;
181   CeedVector_Cuda *impl;
182 
183   CeedCallBackend(CeedVectorGetData(vec, &impl));
184   CeedCallBackend(CeedVectorGetLength(vec, &length));
185 
186   CeedCallBackend(CeedSetHostCeedScalarArray(array, copy_mode, length, (const CeedScalar **)&impl->h_array_owned,
187                                              (const CeedScalar **)&impl->h_array_borrowed, (const CeedScalar **)&impl->h_array));
188   return CEED_ERROR_SUCCESS;
189 }
190 
191 //------------------------------------------------------------------------------
192 // Set array from device
193 //------------------------------------------------------------------------------
194 static int CeedVectorSetArrayDevice_Cuda(const CeedVector vec, const CeedCopyMode copy_mode, CeedScalar *array) {
195   CeedSize         length;
196   Ceed             ceed;
197   CeedVector_Cuda *impl;
198 
199   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
200   CeedCallBackend(CeedVectorGetData(vec, &impl));
201   CeedCallBackend(CeedVectorGetLength(vec, &length));
202 
203   CeedCallBackend(CeedSetDeviceCeedScalarArray_Cuda(ceed, array, copy_mode, length, (const CeedScalar **)&impl->d_array_owned,
204                                                     (const CeedScalar **)&impl->d_array_borrowed, (const CeedScalar **)&impl->d_array));
205   return CEED_ERROR_SUCCESS;
206 }
207 
208 //------------------------------------------------------------------------------
209 // Set the array used by a vector,
210 //   freeing any previously allocated array if applicable
211 //------------------------------------------------------------------------------
212 static int CeedVectorSetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedCopyMode copy_mode, CeedScalar *array) {
213   CeedVector_Cuda *impl;
214 
215   CeedCallBackend(CeedVectorGetData(vec, &impl));
216   CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec));
217   switch (mem_type) {
218     case CEED_MEM_HOST:
219       return CeedVectorSetArrayHost_Cuda(vec, copy_mode, array);
220     case CEED_MEM_DEVICE:
221       return CeedVectorSetArrayDevice_Cuda(vec, copy_mode, array);
222   }
223   return CEED_ERROR_UNSUPPORTED;
224 }
225 
226 //------------------------------------------------------------------------------
227 // Copy host array to value strided
228 //------------------------------------------------------------------------------
229 static int CeedHostCopyStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *h_copy_array) {
230   for (CeedSize i = start; i < length; i += step) h_copy_array[i] = h_array[i];
231   return CEED_ERROR_SUCCESS;
232 }
233 
234 //------------------------------------------------------------------------------
235 // Copy device array to value strided (impl in .cu file)
236 //------------------------------------------------------------------------------
237 int CeedDeviceCopyStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar *d_copy_array);
238 
239 //------------------------------------------------------------------------------
240 // Copy a vector to a value strided
241 //------------------------------------------------------------------------------
242 static int CeedVectorCopyStrided_Cuda(CeedVector vec, CeedSize start, CeedSize step, CeedVector vec_copy) {
243   CeedSize         length;
244   CeedVector_Cuda *impl;
245 
246   CeedCallBackend(CeedVectorGetData(vec, &impl));
247   CeedCallBackend(CeedVectorGetLength(vec, &length));
248   // Set value for synced device/host array
249   if (impl->d_array) {
250     CeedScalar *copy_array;
251 
252     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_DEVICE, &copy_array));
253     CeedCallBackend(CeedDeviceCopyStrided_Cuda(impl->d_array, start, step, length, copy_array));
254     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
255   } else if (impl->h_array) {
256     CeedScalar *copy_array;
257 
258     CeedCallBackend(CeedVectorGetArray(vec_copy, CEED_MEM_HOST, &copy_array));
259     CeedCallBackend(CeedHostCopyStrided_Cuda(impl->h_array, start, step, length, copy_array));
260     CeedCallBackend(CeedVectorRestoreArray(vec_copy, &copy_array));
261   } else {
262     return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
263   }
264   return CEED_ERROR_SUCCESS;
265 }
266 
267 //------------------------------------------------------------------------------
268 // Set host array to value
269 //------------------------------------------------------------------------------
270 static int CeedHostSetValue_Cuda(CeedScalar *h_array, CeedSize length, CeedScalar val) {
271   for (CeedSize i = 0; i < length; i++) h_array[i] = val;
272   return CEED_ERROR_SUCCESS;
273 }
274 
275 //------------------------------------------------------------------------------
276 // Set device array to value (impl in .cu file)
277 //------------------------------------------------------------------------------
278 int CeedDeviceSetValue_Cuda(CeedScalar *d_array, CeedSize length, CeedScalar val);
279 
280 //------------------------------------------------------------------------------
281 // Set a vector to a value
282 //------------------------------------------------------------------------------
283 static int CeedVectorSetValue_Cuda(CeedVector vec, CeedScalar val) {
284   CeedSize         length;
285   CeedVector_Cuda *impl;
286 
287   CeedCallBackend(CeedVectorGetData(vec, &impl));
288   CeedCallBackend(CeedVectorGetLength(vec, &length));
289   // Set value for synced device/host array
290   if (!impl->d_array && !impl->h_array) {
291     if (impl->d_array_borrowed) {
292       impl->d_array = impl->d_array_borrowed;
293     } else if (impl->h_array_borrowed) {
294       impl->h_array = impl->h_array_borrowed;
295     } else if (impl->d_array_owned) {
296       impl->d_array = impl->d_array_owned;
297     } else if (impl->h_array_owned) {
298       impl->h_array = impl->h_array_owned;
299     } else {
300       CeedCallBackend(CeedVectorSetArray(vec, CEED_MEM_DEVICE, CEED_COPY_VALUES, NULL));
301     }
302   }
303   if (impl->d_array) {
304     CeedCallBackend(CeedDeviceSetValue_Cuda(impl->d_array, length, val));
305     impl->h_array = NULL;
306   }
307   if (impl->h_array) {
308     CeedCallBackend(CeedHostSetValue_Cuda(impl->h_array, length, val));
309     impl->d_array = NULL;
310   }
311   return CEED_ERROR_SUCCESS;
312 }
313 
314 //------------------------------------------------------------------------------
315 // Set host array to value strided
316 //------------------------------------------------------------------------------
317 static int CeedHostSetValueStrided_Cuda(CeedScalar *h_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val) {
318   for (CeedSize i = start; i < length; i += step) h_array[i] = val;
319   return CEED_ERROR_SUCCESS;
320 }
321 
322 //------------------------------------------------------------------------------
323 // Set device array to value strided (impl in .cu file)
324 //------------------------------------------------------------------------------
325 int CeedDeviceSetValueStrided_Cuda(CeedScalar *d_array, CeedSize start, CeedSize step, CeedSize length, CeedScalar val);
326 
327 //------------------------------------------------------------------------------
328 // Set a vector to a value strided
329 //------------------------------------------------------------------------------
330 static int CeedVectorSetValueStrided_Cuda(CeedVector vec, CeedSize start, CeedSize step, CeedScalar val) {
331   CeedSize         length;
332   CeedVector_Cuda *impl;
333 
334   CeedCallBackend(CeedVectorGetData(vec, &impl));
335   CeedCallBackend(CeedVectorGetLength(vec, &length));
336   // Set value for synced device/host array
337   if (impl->d_array) {
338     CeedCallBackend(CeedDeviceSetValueStrided_Cuda(impl->d_array, start, step, length, val));
339     impl->h_array = NULL;
340   } else if (impl->h_array) {
341     CeedCallBackend(CeedHostSetValueStrided_Cuda(impl->h_array, start, step, length, val));
342     impl->d_array = NULL;
343   } else {
344     return CeedError(CeedVectorReturnCeed(vec), CEED_ERROR_BACKEND, "CeedVector must have valid data set");
345   }
346   return CEED_ERROR_SUCCESS;
347 }
348 
349 //------------------------------------------------------------------------------
350 // Vector Take Array
351 //------------------------------------------------------------------------------
352 static int CeedVectorTakeArray_Cuda(CeedVector vec, CeedMemType mem_type, CeedScalar **array) {
353   CeedVector_Cuda *impl;
354 
355   CeedCallBackend(CeedVectorGetData(vec, &impl));
356   // Sync array to requested mem_type
357   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
358   // Update pointer
359   switch (mem_type) {
360     case CEED_MEM_HOST:
361       (*array)               = impl->h_array_borrowed;
362       impl->h_array_borrowed = NULL;
363       impl->h_array          = NULL;
364       break;
365     case CEED_MEM_DEVICE:
366       (*array)               = impl->d_array_borrowed;
367       impl->d_array_borrowed = NULL;
368       impl->d_array          = NULL;
369       break;
370   }
371   return CEED_ERROR_SUCCESS;
372 }
373 
374 //------------------------------------------------------------------------------
375 // Core logic for array syncronization for GetArray.
376 //   If a different memory type is most up to date, this will perform a copy
377 //------------------------------------------------------------------------------
378 static int CeedVectorGetArrayCore_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
379   CeedVector_Cuda *impl;
380 
381   CeedCallBackend(CeedVectorGetData(vec, &impl));
382   // Sync array to requested mem_type
383   CeedCallBackend(CeedVectorSyncArray(vec, mem_type));
384   // Update pointer
385   switch (mem_type) {
386     case CEED_MEM_HOST:
387       *array = impl->h_array;
388       break;
389     case CEED_MEM_DEVICE:
390       *array = impl->d_array;
391       break;
392   }
393   return CEED_ERROR_SUCCESS;
394 }
395 
396 //------------------------------------------------------------------------------
397 // Get read-only access to a vector via the specified mem_type
398 //------------------------------------------------------------------------------
399 static int CeedVectorGetArrayRead_Cuda(const CeedVector vec, const CeedMemType mem_type, const CeedScalar **array) {
400   return CeedVectorGetArrayCore_Cuda(vec, mem_type, (CeedScalar **)array);
401 }
402 
403 //------------------------------------------------------------------------------
404 // Get read/write access to a vector via the specified mem_type
405 //------------------------------------------------------------------------------
406 static int CeedVectorGetArray_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
407   CeedVector_Cuda *impl;
408 
409   CeedCallBackend(CeedVectorGetData(vec, &impl));
410   CeedCallBackend(CeedVectorGetArrayCore_Cuda(vec, mem_type, array));
411   CeedCallBackend(CeedVectorSetAllInvalid_Cuda(vec));
412   switch (mem_type) {
413     case CEED_MEM_HOST:
414       impl->h_array = *array;
415       break;
416     case CEED_MEM_DEVICE:
417       impl->d_array = *array;
418       break;
419   }
420   return CEED_ERROR_SUCCESS;
421 }
422 
423 //------------------------------------------------------------------------------
424 // Get write access to a vector via the specified mem_type
425 //------------------------------------------------------------------------------
426 static int CeedVectorGetArrayWrite_Cuda(const CeedVector vec, const CeedMemType mem_type, CeedScalar **array) {
427   bool             has_array_of_type = true;
428   CeedVector_Cuda *impl;
429 
430   CeedCallBackend(CeedVectorGetData(vec, &impl));
431   CeedCallBackend(CeedVectorHasArrayOfType_Cuda(vec, mem_type, &has_array_of_type));
432   if (!has_array_of_type) {
433     // Allocate if array is not yet allocated
434     CeedCallBackend(CeedVectorSetArray(vec, mem_type, CEED_COPY_VALUES, NULL));
435   } else {
436     // Select dirty array
437     switch (mem_type) {
438       case CEED_MEM_HOST:
439         if (impl->h_array_borrowed) impl->h_array = impl->h_array_borrowed;
440         else impl->h_array = impl->h_array_owned;
441         break;
442       case CEED_MEM_DEVICE:
443         if (impl->d_array_borrowed) impl->d_array = impl->d_array_borrowed;
444         else impl->d_array = impl->d_array_owned;
445     }
446   }
447   return CeedVectorGetArray_Cuda(vec, mem_type, array);
448 }
449 
450 //------------------------------------------------------------------------------
451 // Get the norm of a CeedVector
452 //------------------------------------------------------------------------------
453 static int CeedVectorNorm_Cuda(CeedVector vec, CeedNormType type, CeedScalar *norm) {
454   Ceed     ceed;
455   CeedSize length;
456 #if CUDA_VERSION < 12000
457   CeedSize num_calls;
458 #endif
459   const CeedScalar *d_array;
460   CeedVector_Cuda  *impl;
461   cublasHandle_t    handle;
462 
463   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
464   CeedCallBackend(CeedVectorGetData(vec, &impl));
465   CeedCallBackend(CeedVectorGetLength(vec, &length));
466   CeedCallBackend(CeedGetCublasHandle_Cuda(ceed, &handle));
467 
468 #if CUDA_VERSION < 12000
469   // With CUDA 12, we can use the 64-bit integer interface. Prior to that,
470   // we need to check if the vector is too long to handle with int32,
471   // and if so, divide it into subsections for repeated cuBLAS calls.
472   num_calls = length / INT_MAX;
473   if (length % INT_MAX > 0) num_calls += 1;
474 #endif
475 
476   // Compute norm
477   CeedCallBackend(CeedVectorGetArrayRead(vec, CEED_MEM_DEVICE, &d_array));
478   switch (type) {
479     case CEED_NORM_1: {
480       *norm = 0.0;
481       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
482 #if CUDA_VERSION >= 12000  // We have CUDA 12, and can use 64-bit integers
483         CeedCallCublas(ceed, cublasSasum_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
484 #else
485         float  sub_norm = 0.0;
486         float *d_array_start;
487 
488         for (CeedInt i = 0; i < num_calls; i++) {
489           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
490           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
491           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
492 
493           CeedCallCublas(ceed, cublasSasum(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
494           *norm += sub_norm;
495         }
496 #endif
497       } else {
498 #if CUDA_VERSION >= 12000
499         CeedCallCublas(ceed, cublasDasum_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
500 #else
501         double  sub_norm = 0.0;
502         double *d_array_start;
503 
504         for (CeedInt i = 0; i < num_calls; i++) {
505           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
506           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
507           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
508 
509           CeedCallCublas(ceed, cublasDasum(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
510           *norm += sub_norm;
511         }
512 #endif
513       }
514       break;
515     }
516     case CEED_NORM_2: {
517       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
518 #if CUDA_VERSION >= 12000
519         CeedCallCublas(ceed, cublasSnrm2_64(handle, (int64_t)length, (float *)d_array, 1, (float *)norm));
520 #else
521         float  sub_norm = 0.0, norm_sum = 0.0;
522         float *d_array_start;
523 
524         for (CeedInt i = 0; i < num_calls; i++) {
525           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
526           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
527           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
528 
529           CeedCallCublas(ceed, cublasSnrm2(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &sub_norm));
530           norm_sum += sub_norm * sub_norm;
531         }
532         *norm = sqrt(norm_sum);
533 #endif
534       } else {
535 #if CUDA_VERSION >= 12000
536         CeedCallCublas(ceed, cublasDnrm2_64(handle, (int64_t)length, (double *)d_array, 1, (double *)norm));
537 #else
538         double  sub_norm = 0.0, norm_sum = 0.0;
539         double *d_array_start;
540 
541         for (CeedInt i = 0; i < num_calls; i++) {
542           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
543           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
544           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
545 
546           CeedCallCublas(ceed, cublasDnrm2(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &sub_norm));
547           norm_sum += sub_norm * sub_norm;
548         }
549         *norm = sqrt(norm_sum);
550 #endif
551       }
552       break;
553     }
554     case CEED_NORM_MAX: {
555       if (CEED_SCALAR_TYPE == CEED_SCALAR_FP32) {
556 #if CUDA_VERSION >= 12000
557         int64_t    index;
558         CeedScalar norm_no_abs;
559 
560         CeedCallCublas(ceed, cublasIsamax_64(handle, (int64_t)length, (float *)d_array, 1, &index));
561         CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
562         *norm = fabs(norm_no_abs);
563 #else
564         CeedInt index;
565         float   sub_max = 0.0, current_max = 0.0;
566         float  *d_array_start;
567 
568         for (CeedInt i = 0; i < num_calls; i++) {
569           d_array_start             = (float *)d_array + (CeedSize)(i)*INT_MAX;
570           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
571           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
572 
573           CeedCallCublas(ceed, cublasIsamax(handle, (CeedInt)sub_length, (float *)d_array_start, 1, &index));
574           CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
575           if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
576         }
577         *norm = current_max;
578 #endif
579       } else {
580 #if CUDA_VERSION >= 12000
581         int64_t    index;
582         CeedScalar norm_no_abs;
583 
584         CeedCallCublas(ceed, cublasIdamax_64(handle, (int64_t)length, (double *)d_array, 1, &index));
585         CeedCallCuda(ceed, cudaMemcpy(&norm_no_abs, impl->d_array + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
586         *norm = fabs(norm_no_abs);
587 #else
588         CeedInt index;
589         double  sub_max = 0.0, current_max = 0.0;
590         double *d_array_start;
591 
592         for (CeedInt i = 0; i < num_calls; i++) {
593           d_array_start             = (double *)d_array + (CeedSize)(i)*INT_MAX;
594           CeedSize remaining_length = length - (CeedSize)(i)*INT_MAX;
595           CeedInt  sub_length       = (i == num_calls - 1) ? (CeedInt)(remaining_length) : INT_MAX;
596 
597           CeedCallCublas(ceed, cublasIdamax(handle, (CeedInt)sub_length, (double *)d_array_start, 1, &index));
598           CeedCallCuda(ceed, cudaMemcpy(&sub_max, d_array_start + index - 1, sizeof(CeedScalar), cudaMemcpyDeviceToHost));
599           if (fabs(sub_max) > current_max) current_max = fabs(sub_max);
600         }
601         *norm = current_max;
602 #endif
603       }
604       break;
605     }
606   }
607   CeedCallBackend(CeedVectorRestoreArrayRead(vec, &d_array));
608   return CEED_ERROR_SUCCESS;
609 }
610 
611 //------------------------------------------------------------------------------
612 // Take reciprocal of a vector on host
613 //------------------------------------------------------------------------------
614 static int CeedHostReciprocal_Cuda(CeedScalar *h_array, CeedSize length) {
615   for (CeedSize i = 0; i < length; i++) {
616     if (fabs(h_array[i]) > CEED_EPSILON) h_array[i] = 1. / h_array[i];
617   }
618   return CEED_ERROR_SUCCESS;
619 }
620 
621 //------------------------------------------------------------------------------
622 // Take reciprocal of a vector on device (impl in .cu file)
623 //------------------------------------------------------------------------------
624 int CeedDeviceReciprocal_Cuda(CeedScalar *d_array, CeedSize length);
625 
626 //------------------------------------------------------------------------------
627 // Take reciprocal of a vector
628 //------------------------------------------------------------------------------
629 static int CeedVectorReciprocal_Cuda(CeedVector vec) {
630   CeedSize         length;
631   CeedVector_Cuda *impl;
632 
633   CeedCallBackend(CeedVectorGetData(vec, &impl));
634   CeedCallBackend(CeedVectorGetLength(vec, &length));
635   // Set value for synced device/host array
636   if (impl->d_array) CeedCallBackend(CeedDeviceReciprocal_Cuda(impl->d_array, length));
637   if (impl->h_array) CeedCallBackend(CeedHostReciprocal_Cuda(impl->h_array, length));
638   return CEED_ERROR_SUCCESS;
639 }
640 
641 //------------------------------------------------------------------------------
642 // Compute x = alpha x on the host
643 //------------------------------------------------------------------------------
644 static int CeedHostScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length) {
645   for (CeedSize i = 0; i < length; i++) x_array[i] *= alpha;
646   return CEED_ERROR_SUCCESS;
647 }
648 
649 //------------------------------------------------------------------------------
650 // Compute x = alpha x on device (impl in .cu file)
651 //------------------------------------------------------------------------------
652 int CeedDeviceScale_Cuda(CeedScalar *x_array, CeedScalar alpha, CeedSize length);
653 
654 //------------------------------------------------------------------------------
655 // Compute x = alpha x
656 //------------------------------------------------------------------------------
657 static int CeedVectorScale_Cuda(CeedVector x, CeedScalar alpha) {
658   CeedSize         length;
659   CeedVector_Cuda *x_impl;
660 
661   CeedCallBackend(CeedVectorGetData(x, &x_impl));
662   CeedCallBackend(CeedVectorGetLength(x, &length));
663   // Set value for synced device/host array
664   if (x_impl->d_array) CeedCallBackend(CeedDeviceScale_Cuda(x_impl->d_array, alpha, length));
665   if (x_impl->h_array) CeedCallBackend(CeedHostScale_Cuda(x_impl->h_array, alpha, length));
666   return CEED_ERROR_SUCCESS;
667 }
668 
669 //------------------------------------------------------------------------------
670 // Compute y = alpha x + y on the host
671 //------------------------------------------------------------------------------
672 static int CeedHostAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length) {
673   for (CeedSize i = 0; i < length; i++) y_array[i] += alpha * x_array[i];
674   return CEED_ERROR_SUCCESS;
675 }
676 
677 //------------------------------------------------------------------------------
678 // Compute y = alpha x + y on device (impl in .cu file)
679 //------------------------------------------------------------------------------
680 int CeedDeviceAXPY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar *x_array, CeedSize length);
681 
682 //------------------------------------------------------------------------------
683 // Compute y = alpha x + y
684 //------------------------------------------------------------------------------
685 static int CeedVectorAXPY_Cuda(CeedVector y, CeedScalar alpha, CeedVector x) {
686   Ceed             ceed;
687   CeedSize         length;
688   CeedVector_Cuda *y_impl, *x_impl;
689 
690   CeedCallBackend(CeedVectorGetCeed(y, &ceed));
691   CeedCallBackend(CeedVectorGetData(y, &y_impl));
692   CeedCallBackend(CeedVectorGetData(x, &x_impl));
693   CeedCallBackend(CeedVectorGetLength(y, &length));
694   // Set value for synced device/host array
695   if (y_impl->d_array) {
696     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
697     CeedCallBackend(CeedDeviceAXPY_Cuda(y_impl->d_array, alpha, x_impl->d_array, length));
698   }
699   if (y_impl->h_array) {
700     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
701     CeedCallBackend(CeedHostAXPY_Cuda(y_impl->h_array, alpha, x_impl->h_array, length));
702   }
703   return CEED_ERROR_SUCCESS;
704 }
705 
706 //------------------------------------------------------------------------------
707 // Compute y = alpha x + beta y on the host
708 //------------------------------------------------------------------------------
709 static int CeedHostAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length) {
710   for (CeedSize i = 0; i < length; i++) y_array[i] = alpha * x_array[i] + beta * y_array[i];
711   return CEED_ERROR_SUCCESS;
712 }
713 
714 //------------------------------------------------------------------------------
715 // Compute y = alpha x + beta y on device (impl in .cu file)
716 //------------------------------------------------------------------------------
717 int CeedDeviceAXPBY_Cuda(CeedScalar *y_array, CeedScalar alpha, CeedScalar beta, CeedScalar *x_array, CeedSize length);
718 
719 //------------------------------------------------------------------------------
720 // Compute y = alpha x + beta y
721 //------------------------------------------------------------------------------
722 static int CeedVectorAXPBY_Cuda(CeedVector y, CeedScalar alpha, CeedScalar beta, CeedVector x) {
723   CeedSize         length;
724   CeedVector_Cuda *y_impl, *x_impl;
725 
726   CeedCallBackend(CeedVectorGetData(y, &y_impl));
727   CeedCallBackend(CeedVectorGetData(x, &x_impl));
728   CeedCallBackend(CeedVectorGetLength(y, &length));
729   // Set value for synced device/host array
730   if (y_impl->d_array) {
731     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
732     CeedCallBackend(CeedDeviceAXPBY_Cuda(y_impl->d_array, alpha, beta, x_impl->d_array, length));
733   }
734   if (y_impl->h_array) {
735     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
736     CeedCallBackend(CeedHostAXPBY_Cuda(y_impl->h_array, alpha, beta, x_impl->h_array, length));
737   }
738   return CEED_ERROR_SUCCESS;
739 }
740 
741 //------------------------------------------------------------------------------
742 // Compute the pointwise multiplication w = x .* y on the host
743 //------------------------------------------------------------------------------
744 static int CeedHostPointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length) {
745   for (CeedSize i = 0; i < length; i++) w_array[i] = x_array[i] * y_array[i];
746   return CEED_ERROR_SUCCESS;
747 }
748 
749 //------------------------------------------------------------------------------
750 // Compute the pointwise multiplication w = x .* y on device (impl in .cu file)
751 //------------------------------------------------------------------------------
752 int CeedDevicePointwiseMult_Cuda(CeedScalar *w_array, CeedScalar *x_array, CeedScalar *y_array, CeedSize length);
753 
754 //------------------------------------------------------------------------------
755 // Compute the pointwise multiplication w = x .* y
756 //------------------------------------------------------------------------------
757 static int CeedVectorPointwiseMult_Cuda(CeedVector w, CeedVector x, CeedVector y) {
758   CeedSize         length;
759   CeedVector_Cuda *w_impl, *x_impl, *y_impl;
760 
761   CeedCallBackend(CeedVectorGetData(w, &w_impl));
762   CeedCallBackend(CeedVectorGetData(x, &x_impl));
763   CeedCallBackend(CeedVectorGetData(y, &y_impl));
764   CeedCallBackend(CeedVectorGetLength(w, &length));
765   // Set value for synced device/host array
766   if (!w_impl->d_array && !w_impl->h_array) {
767     CeedCallBackend(CeedVectorSetValue(w, 0.0));
768   }
769   if (w_impl->d_array) {
770     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_DEVICE));
771     CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_DEVICE));
772     CeedCallBackend(CeedDevicePointwiseMult_Cuda(w_impl->d_array, x_impl->d_array, y_impl->d_array, length));
773   }
774   if (w_impl->h_array) {
775     CeedCallBackend(CeedVectorSyncArray(x, CEED_MEM_HOST));
776     CeedCallBackend(CeedVectorSyncArray(y, CEED_MEM_HOST));
777     CeedCallBackend(CeedHostPointwiseMult_Cuda(w_impl->h_array, x_impl->h_array, y_impl->h_array, length));
778   }
779   return CEED_ERROR_SUCCESS;
780 }
781 
782 //------------------------------------------------------------------------------
783 // Destroy the vector
784 //------------------------------------------------------------------------------
785 static int CeedVectorDestroy_Cuda(const CeedVector vec) {
786   CeedVector_Cuda *impl;
787 
788   CeedCallBackend(CeedVectorGetData(vec, &impl));
789   CeedCallCuda(CeedVectorReturnCeed(vec), cudaFree(impl->d_array_owned));
790   CeedCallBackend(CeedFree(&impl->h_array_owned));
791   CeedCallBackend(CeedFree(&impl));
792   return CEED_ERROR_SUCCESS;
793 }
794 
795 //------------------------------------------------------------------------------
796 // Create a vector of the specified length (does not allocate memory)
797 //------------------------------------------------------------------------------
798 int CeedVectorCreate_Cuda(CeedSize n, CeedVector vec) {
799   CeedVector_Cuda *impl;
800   Ceed             ceed;
801 
802   CeedCallBackend(CeedVectorGetCeed(vec, &ceed));
803   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasValidArray", CeedVectorHasValidArray_Cuda));
804   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "HasBorrowedArrayOfType", CeedVectorHasBorrowedArrayOfType_Cuda));
805   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetArray", CeedVectorSetArray_Cuda));
806   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "TakeArray", CeedVectorTakeArray_Cuda));
807   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "CopyStrided", (int (*)())CeedVectorCopyStrided_Cuda));
808   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValue", (int (*)())CeedVectorSetValue_Cuda));
809   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SetValueStrided", (int (*)())CeedVectorSetValueStrided_Cuda));
810   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "SyncArray", CeedVectorSyncArray_Cuda));
811   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArray", CeedVectorGetArray_Cuda));
812   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayRead", CeedVectorGetArrayRead_Cuda));
813   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "GetArrayWrite", CeedVectorGetArrayWrite_Cuda));
814   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Norm", CeedVectorNorm_Cuda));
815   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Reciprocal", CeedVectorReciprocal_Cuda));
816   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Scale", (int (*)())CeedVectorScale_Cuda));
817   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPY", (int (*)())CeedVectorAXPY_Cuda));
818   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "AXPBY", (int (*)())CeedVectorAXPBY_Cuda));
819   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "PointwiseMult", CeedVectorPointwiseMult_Cuda));
820   CeedCallBackend(CeedSetBackendFunction(ceed, "Vector", vec, "Destroy", CeedVectorDestroy_Cuda));
821   CeedCallBackend(CeedCalloc(1, &impl));
822   CeedCallBackend(CeedVectorSetData(vec, impl));
823   return CEED_ERROR_SUCCESS;
824 }
825 
826 //------------------------------------------------------------------------------
827