xref: /libCEED/backends/cuda/ceed-cuda-compile.cpp (revision d416dc2b8eb8ab8cb4fa3546f1e63962299dc06a)
1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include "ceed-cuda-compile.h"
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 #include <cuda_runtime.h>
14 #include <dirent.h>
15 #include <nvrtc.h>
16 #include <stdarg.h>
17 #include <stdio.h>
18 #include <string.h>
19 #include <sys/stat.h>
20 #include <sys/types.h>
21 
22 #include <cstdlib>
23 #include <fstream>
24 #include <iostream>
25 #include <sstream>
26 #include <string>
27 
28 #include "ceed-cuda-common.h"
29 
// Check an NVRTC status code; on failure, return a CEED backend error carrying the NVRTC error string
#define CeedChk_Nvrtc(ceed, x)                                                                              \
  do {                                                                                                      \
    nvrtcResult result = static_cast<nvrtcResult>(x);                                                       \
    if (result != NVRTC_SUCCESS) return CeedError((ceed), CEED_ERROR_BACKEND, nvrtcGetErrorString(result)); \
  } while (0)

// Evaluate an NVRTC call exactly once and route its status through CeedChk_Nvrtc
#define CeedCallNvrtc(ceed, ...)  \
  do {                            \
    int ierr_q_ = __VA_ARGS__;    \
    CeedChk_Nvrtc(ceed, ierr_q_); \
  } while (0)

// Run a shell command via CeedCallSystem_Core, turning a nonzero exit status into a backend error
// that includes `message` and the captured command output
#define CeedCallSystem(ceed, command, message) CeedCallBackend(CeedCallSystem_Core(ceed, command, message))
43 
44 //------------------------------------------------------------------------------
45 // Call system command and capture stdout + stderr
46 //------------------------------------------------------------------------------
47 static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
48   CeedDebug(ceed, "Running command:\n$ %s", command);
49   FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
50 
51   CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s", message, command);
52 
53   char        line[CEED_MAX_RESOURCE_LEN] = "";
54   std::string output                      = "";
55 
56   while (fgets(line, sizeof(line), output_stream) != nullptr) {
57     output += line;
58   }
59   CeedDebug(ceed, "output:\n%s\n", output.c_str());
60   CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s\nerror:\n%s", message, command, output.c_str());
61   return CEED_ERROR_SUCCESS;
62 }
63 
64 //------------------------------------------------------------------------------
65 // Compile CUDA kernel
66 //------------------------------------------------------------------------------
67 using std::ifstream;
68 using std::ofstream;
69 using std::ostringstream;
70 
// Shared implementation for CeedCompile_Cuda/CeedTryCompile_Cuda.
// Assembles the JiT source (kernel-specific #defines + libCEED CUDA JiT header + `source`), then
// compiles it either with NVRTC or — when the Clang backend is enabled — with a shell-driven
// Clang/Rust pipeline, and loads the result into *module.
// When throw_error is false, compile failures set *is_compile_good = false and return
// CEED_ERROR_SUCCESS so the caller may fall back to another backend.
static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
                                const CeedInt num_defines, va_list args) {
  size_t                ptx_size;
  char                 *ptx;
  const int             num_opts            = 4;  // fixed options: opts[0..3] below
  CeedInt               num_jit_source_dirs = 0, num_jit_defines = 0;
  const char          **opts;
  nvrtcProgram          prog;
  struct cudaDeviceProp prop;
  Ceed_Cuda            *ceed_data;

  cudaFree(0);  // Make sure a Context exists for nvrtc

  std::ostringstream code;
  bool               using_clang;

  CeedCallBackend(CeedGetIsClang(ceed, &using_clang));

  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS,
               using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)"
                           : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment "
                             "variable GPU_CLANG=1");

  // Get kernel specific options, such as kernel constants
  // The variadic arguments come in (char *name, int value) pairs, one pair per define
  if (num_defines > 0) {
    char *name;
    int   val;

    for (int i = 0; i < num_defines; i++) {
      name = va_arg(args, char *);
      val  = va_arg(args, int);
      code << "#define " << name << " " << val << "\n";
    }
  }

  // Standard libCEED definitions for CUDA backends
  code << "#include <ceed/jit-source/cuda/cuda-jit.h>\n\n";

  // Non-macro options
  CeedCallBackend(CeedCalloc(num_opts, &opts));
  opts[0] = "-default-device";
  CeedCallBackend(CeedGetData(ceed, &ceed_data));
  CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
  // Target the actual device's compute capability (e.g. sm_80 for prop 8.0)
  std::string arch_arg =
#if CUDA_VERSION >= 11010
      // NVRTC used to support only virtual architectures through the option
      // -arch, since it was only emitting PTX. It will now support actual
      // architectures as well to emit SASS.
      // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
      "-arch=sm_"
#else
      "-arch=compute_"
#endif
      + std::to_string(prop.major) + std::to_string(prop.minor);
  // NOTE: opts[1] borrows arch_arg's buffer; arch_arg must outlive every use of opts
  opts[1] = arch_arg.c_str();
  opts[2] = "-Dint32_t=int";
  opts[3] = "-DCEED_RUNNING_JIT_PASS=1";
  // Additional include dirs
  // Layout from here on: opts[0..3] fixed, opts[4..4+dirs) include dirs, then user defines
  {
    const char **jit_source_dirs;

    CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
      std::ostringstream include_dir_arg;

      include_dir_arg << "-I" << jit_source_dirs[i];
      // Owned copy; freed after nvrtcCompileProgram on the NVRTC path
      CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i]));
    }
    CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
  }
  // User defines
  {
    const char **jit_defines;

    CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts));
    for (CeedInt i = 0; i < num_jit_defines; i++) {
      std::ostringstream define_arg;

      define_arg << "-D" << jit_defines[i];
      CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i]));
    }
    CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
  }

  // Add string source argument provided in call
  code << source;

  // Compile kernel
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
  CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");

  if (!using_clang) {
    // NVRTC path: compile in-process, then load the CUBIN/PTX via the driver API
    CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));

    if (CeedDebugFlag(ceed)) {
      // LCOV_EXCL_START
      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
      for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
        CeedDebug(ceed, "Option %d: %s", i, opts[i]);
      }
      CeedDebug(ceed, "");
      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
      // LCOV_EXCL_STOP
    }

    nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);

    // Release the dynamically-allocated option strings and the option array itself
    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
      CeedCallBackend(CeedFree(&opts[num_opts + i]));
    }
    for (CeedInt i = 0; i < num_jit_defines; i++) {
      CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
    }
    CeedCallBackend(CeedFree(&opts));
    *is_compile_good = result == NVRTC_SUCCESS;
    if (!*is_compile_good) {
      char  *log;
      size_t log_size;

      // Retrieve the compiler log so the failure is actionable
      CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
      CeedCallBackend(CeedMalloc(log_size, &log));
      CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
      if (throw_error) {
        return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
      } else {
        // LCOV_EXCL_START
        // Non-throwing mode: log the failure and report it via *is_compile_good
        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
        CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
        CeedCallBackend(CeedFree(&log));
        CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
        return CEED_ERROR_SUCCESS;
        // LCOV_EXCL_STOP
      }
    }

    // Extract the compiled image (SASS CUBIN on CUDA >= 11.1, PTX otherwise)
#if CUDA_VERSION >= 11010
    CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size));
    CeedCallBackend(CeedMalloc(ptx_size, &ptx));
    CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx));
#else
    CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
    CeedCallBackend(CeedMalloc(ptx_size, &ptx));
    CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
#endif
    CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));

    CeedCallCuda(ceed, cuModuleLoadData(module, ptx));
    CeedCallBackend(CeedFree(&ptx));
    return CEED_ERROR_SUCCESS;
  } else {
    // Clang/Rust path: drive external tools through the shell, staging files in ./temp
    // NOTE(review): srand(time(NULL)) + rand() — processes launched in the same second draw the
    // same build_id, so their temp files collide; consider mkstemp() or mixing in the PID
    srand(time(NULL));
    const int build_id = rand();

    // Create temp dir if needed
    {
      DIR *dir = opendir("temp");

      if (dir) {
        closedir(dir);
      } else {
        // In parallel multiple processes may attempt
        // Only one process needs to succeed
        mkdir("temp", 0777);
        chmod("temp", 0777);
      }
    }
    // Write code to temp file
    {
      std::string filename = std::string("temp/kernel_") + std::to_string(build_id) + std::string("_0_source.cu");
      FILE       *file     = fopen(filename.c_str(), "w");

      CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang");
      fputs(code.str().c_str(), file);
      fclose(file);
    }

    // Get rust crate directories
    const char **rust_source_dirs     = nullptr;
    int          num_rust_source_dirs = 0;

    CeedCallBackend(CeedGetRustSourceRoots(ceed, &num_rust_source_dirs, &rust_source_dirs));

    // NOTE(review): fixed capacity of 10 — overruns if num_rust_source_dirs > 10; confirm upstream bound
    std::string rust_dirs[10];

    if (num_rust_source_dirs > 0) {
      CeedDebug(ceed, "There are %d source dirs, including %s\n", num_rust_source_dirs, rust_source_dirs[0]);
    }

    // Copy the dir strings before restoring the borrowed array
    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
      rust_dirs[i] = std::string(rust_source_dirs[i]);
    }

    CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs));

    // Default to the nightly toolchain unless RUST_TOOLCHAIN is already set
    char *rust_toolchain = std::getenv("RUST_TOOLCHAIN");

    if (rust_toolchain == nullptr) {
      rust_toolchain = (char *)"nightly";
      setenv("RUST_TOOLCHAIN", "nightly", 0);
    }

    // Compile Rust crate(s) needed
    std::string command;

    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
      command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] +
                "/.cargo/config.toml --manifest-path " + rust_dirs[i] + "/Cargo.toml";
      CeedCallSystem(ceed, command.c_str(), "build Rust crate");
    }

    // Get Clang version
    // Cached on ceed_data: llvm_version == 0 means "not yet detected", -1 means detection failed
    bool use_llvm_version = ceed_data->use_llvm_version;
    int  llvm_version     = ceed_data->llvm_version;

    if (llvm_version == 0) {
      command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) --version";
      CeedDebug(ceed, "Attempting to detect Rust LLVM version.\ncommand:\n$ %s", command.c_str());
      FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");

      CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version");

      char        line[CEED_MAX_RESOURCE_LEN] = "";
      std::string output                      = "";

      while (fgets(line, sizeof(line), output_stream) != nullptr) {
        output += line;
      }
      CeedDebug(ceed, "output:\n%s", output.c_str());
      CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version\ncommand:\n$ %s\nerror:\n%s",
                command.c_str(), output.c_str());

      // NOTE(review): strstr() returns NULL when the marker is absent — the += 13 below would then
      // be undefined behavior; verify tool output always contains "LLVM version "
      const char *version_substring = strstr(output.c_str(), "LLVM version ");

      // Skip past the 13-character "LLVM version " prefix to the major version digits
      version_substring += 13;

      char *next_dot = strchr((char *)version_substring, '.');

      if (next_dot) {
        // Truncate at the first '.' so std::stoi parses only the major version
        next_dot[0]             = '\0';
        ceed_data->llvm_version = llvm_version = std::stoi(version_substring);
        CeedDebug(ceed, "Rust LLVM version number: %d\n", llvm_version);

        // Probe whether a matching versioned clang++ binary exists (e.g. clang++-17)
        command                     = std::string("clang++-") + std::to_string(llvm_version);
        output_stream               = popen((command + std::string(" 2>&1")).c_str(), "r");
        ceed_data->use_llvm_version = use_llvm_version = pclose(output_stream) == 0;
      } else {
        ceed_data->llvm_version     = -1;
        ceed_data->use_llvm_version = use_llvm_version = false;
      }
    }

    // Compile wrapper kernel
    command = "clang++" + (use_llvm_version ? (std::string("-") + std::to_string(llvm_version)) : "") + " -flto=thin --cuda-gpu-arch=sm_" +
              std::to_string(prop.major) + std::to_string(prop.minor) + " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) +
              "_0_source.cu -o temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll ";
    // NOTE(review): opts[4] is the first JiT include dir — this reads out of bounds when
    // num_jit_source_dirs == 0, and silently drops all further include dirs and user defines;
    // also, opts and its owned strings are never freed on this path — confirm intent
    command += opts[4];
    CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll").c_str(), "update JiT file permissions");

    // Find Rust's llvm-link tool and run it
    command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
              std::to_string(build_id) +
              "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o "
              "temp/kernel_" +
              std::to_string(build_id) + "_2_linked.ll ";

    // Searches for .a files in Rust directory
    // Note: Rust crate names may not match the folder they are in
    // TODO: If libCEED switches to c++17, use std::filesystem here
    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
      std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release";
      DIR        *dp  = opendir(dir.c_str());

      CeedCheck(dp != nullptr, ceed, CEED_ERROR_BACKEND, "Could not open directory: %s", dir.c_str());
      struct dirent *entry;

      // Find files ending in .a
      while ((entry = readdir(dp)) != nullptr) {
        std::string filename(entry->d_name);

        if (filename.size() >= 2 && filename.substr(filename.size() - 2) == ".a") {
          command += dir + "/" + filename + " ";
        }
      }
      closedir(dp);
    }

    // Link, optimize, and compile final CUDA kernel
    CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
    CeedCallSystem(ceed,
                   ("$(find $(rustup run " + std::string(rust_toolchain) +
                    " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" + std::to_string(build_id) +
                    "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
                       .c_str(),
                   "optimize linked C and Rust source");
    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_2_linked.ll").c_str(), "update JiT file permissions");
    CeedCallSystem(ceed,
                   ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
                    std::to_string(prop.major) + std::to_string(prop.minor) + " temp/kernel_" + std::to_string(build_id) +
                    "_3_opt.bc -o temp/kernel_" + std::to_string(build_id) + "_4_final.ptx")
                       .c_str(),
                   "compile final CUDA kernel");
    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_4_final.ptx").c_str(), "update JiT file permissions");

    // Load module from final PTX
    ifstream      ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx");
    ostringstream sstr;

    sstr << ptxfile.rdbuf();

    auto ptx_data = sstr.str();
    // NOTE(review): ptx_size is assigned but never used on this path
    ptx_size      = ptx_data.length();

    int result = cuModuleLoadData(module, ptx_data.c_str());

    *is_compile_good = result == 0;
    if (!*is_compile_good) {
      if (throw_error) {
        return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to load module data");
      } else {
        // LCOV_EXCL_START
        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
        CeedDebug(ceed, "Error: Failed to load module data");
        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
        return CEED_ERROR_SUCCESS;
        // LCOV_EXCL_STOP
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
406 
407 int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
408   bool    is_compile_good = true;
409   va_list args;
410 
411   va_start(args, num_defines);
412   const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args);
413 
414   va_end(args);
415   CeedCallBackend(ierr);
416   return CEED_ERROR_SUCCESS;
417 }
418 
419 int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) {
420   va_list args;
421 
422   va_start(args, num_defines);
423   const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args);
424 
425   va_end(args);
426   CeedCallBackend(ierr);
427   return CEED_ERROR_SUCCESS;
428 }
429 
430 //------------------------------------------------------------------------------
431 // Get CUDA kernel
432 //------------------------------------------------------------------------------
int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel) {
  // Look up the entry point `name` in a previously loaded module
  CeedCallCuda(ceed, cuModuleGetFunction(kernel, module, name));
  return CEED_ERROR_SUCCESS;
}
437 
438 //------------------------------------------------------------------------------
439 // Run CUDA kernel with block size selected automatically based on the kernel
440 //     (which may use enough registers to require a smaller block size than the
441 //      hardware is capable)
442 //------------------------------------------------------------------------------
int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void **args) {
  int min_grid_size, max_block_size;

  // Ask the driver for the occupancy-maximizing block size for this kernel,
  // with dynamic shared memory capped at 0x10000 bytes (64 KiB)
  CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, kernel, NULL, 0, 0x10000));
  // Grid size = ceil(points / max_block_size)
  // NOTE(review): `points` is size_t — confirm CeedDivUpInt does not truncate for very large counts
  CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args));
  return CEED_ERROR_SUCCESS;
}
450 
451 //------------------------------------------------------------------------------
452 // Run CUDA kernel
453 //------------------------------------------------------------------------------
int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) {
  // 1D launch: delegate with y/z block dims of 1, the default stream, and no dynamic shared memory
  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args));
  return CEED_ERROR_SUCCESS;
}
458 
459 //------------------------------------------------------------------------------
460 // Run CUDA kernel for spatial dimension
461 //------------------------------------------------------------------------------
int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z,
                          void **args) {
  // 3D-block launch on the default stream with no dynamic shared memory
  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
  return CEED_ERROR_SUCCESS;
}
467 
468 //------------------------------------------------------------------------------
469 // Run CUDA kernel for spatial dimension with shared memory
470 //------------------------------------------------------------------------------
// Launch `kernel` on `stream` with a (grid_size, 1, 1) grid, the given 3D block shape, and
// `shared_mem_size` bytes of dynamic shared memory.
// CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES is handled specially: when throw_error is true it becomes a
// CEED error with kernel resource diagnostics; otherwise it is logged and reported via
// *is_good_run. All other launch errors are raised through CeedChk_Cu regardless of throw_error.
static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x,
                                           const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
                                           bool *is_good_run, void **args) {
#if CUDA_VERSION >= 9000
  // Opt in to dynamic shared memory beyond the default per-block limit
  // NOTE(review): return value intentionally(?) unchecked — confirm failures here are benign
  cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
#endif
  CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);

  if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
    // Gather the kernel's resource usage so the diagnostic is actionable
    int max_threads_per_block, shared_size_bytes, num_regs;

    cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
    cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
    cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
    if (throw_error) {
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
                       max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
    } else {
      // LCOV_EXCL_START
      // Non-throwing mode: log diagnostics, then fall through to mark the run as bad
      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
      CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
                max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
      // LCOV_EXCL_STOP
    }
    *is_good_run = false;
  } else CeedChk_Cu(ceed, result);
  return CEED_ERROR_SUCCESS;
}
501 
502 int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
503                                 const int block_size_z, const int shared_mem_size, void **args) {
504   bool is_good_run = true;
505 
506   CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
507                                                   &is_good_run, args));
508   return CEED_ERROR_SUCCESS;
509 }
510 
int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
                                   const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
  // Non-throwing variant: out-of-resources launch failures are reported via is_good_run (throw_error == false)
  CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
                                                  is_good_run, args));
  return CEED_ERROR_SUCCESS;
}
517 
518 //------------------------------------------------------------------------------
519