xref: /libCEED/backends/cuda/ceed-cuda-compile.cpp (revision fa619ecc52f58ebd3ff3ef012ebe7a24b3c56483)
1 // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED:  http://github.com/ceed
7 
8 #include "ceed-cuda-compile.h"
9 
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 #include <cuda_runtime.h>
14 #include <dirent.h>
15 #include <nvrtc.h>
16 #include <stdarg.h>
17 #include <stdio.h>
18 #include <string.h>
19 #include <sys/stat.h>
20 #include <sys/types.h>
21 
22 #include <cstdlib>
23 #include <fstream>
24 #include <iostream>
25 #include <sstream>
26 #include <string>
27 
28 #include "ceed-cuda-common.h"
29 
//------------------------------------------------------------------------------
// Check an NVRTC return code; on failure, return a CeedError carrying the
// human-readable NVRTC error string
//------------------------------------------------------------------------------
#define CeedChk_Nvrtc(ceed, x)                                                                              \
  do {                                                                                                      \
    nvrtcResult result = static_cast<nvrtcResult>(x);                                                       \
    if (result != NVRTC_SUCCESS) return CeedError((ceed), CEED_ERROR_BACKEND, nvrtcGetErrorString(result)); \
  } while (0)

//------------------------------------------------------------------------------
// Evaluate an NVRTC call exactly once and check its result with CeedChk_Nvrtc
//------------------------------------------------------------------------------
#define CeedCallNvrtc(ceed, ...)  \
  do {                            \
    int ierr_q_ = __VA_ARGS__;    \
    CeedChk_Nvrtc(ceed, ierr_q_); \
  } while (0)

//------------------------------------------------------------------------------
// Run a shell command via CeedCallSystem_Core, reporting `message` on failure
//------------------------------------------------------------------------------
#define CeedCallSystem(ceed, command, message) CeedCallBackend(CeedCallSystem_Core(ceed, command, message))
43 
44 //------------------------------------------------------------------------------
45 // Call system command and capture stdout + stderr
46 //------------------------------------------------------------------------------
47 static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
48   CeedDebug(ceed, "Running command:\n$ %s", command);
49   FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
50 
51   CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s", message, command);
52 
53   char        line[CEED_MAX_RESOURCE_LEN] = "";
54   std::string output                      = "";
55 
56   while (fgets(line, sizeof(line), output_stream) != nullptr) {
57     output += line;
58   }
59   CeedDebug(ceed, "output:\n%s\n", output.c_str());
60   CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s\nerror:\n%s", message, command, output.c_str());
61   return CEED_ERROR_SUCCESS;
62 }
63 
64 //------------------------------------------------------------------------------
65 // Compile CUDA kernel
66 //------------------------------------------------------------------------------
67 using std::ifstream;
68 using std::ofstream;
69 using std::ostringstream;
70 
// Core JiT compilation routine shared by CeedCompile_Cuda and CeedTryCompile_Cuda.
//
// Assembles the full kernel source (user #defines + libCEED JiT header + the
// `source` string), then compiles and loads it into `*module` via one of two paths:
//   - NVRTC (default): in-memory compile to CUBIN (CUDA >= 11.1) or PTX
//   - Clang (selected by CeedGetIsClang): on-disk compile under ./temp that also
//     builds and links Rust QFunction crates via cargo/llvm-link/opt/llc
//
// When `throw_error` is true a compile failure raises a CeedError; otherwise the
// failure is only logged, `*is_compile_good` is set false, and CEED_ERROR_SUCCESS
// is returned so the caller may fall back to another backend.
static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
                                const CeedInt num_defines, va_list args) {
  size_t                ptx_size;
  char                 *ptx;
  const int             num_opts            = 4;  // count of fixed compiler options filled in below
  CeedInt               num_jit_source_dirs = 0, num_jit_defines = 0;
  const char          **opts;
  nvrtcProgram          prog;
  struct cudaDeviceProp prop;
  Ceed_Cuda            *ceed_data;

  cudaFree(0);  // Make sure a Context exists for nvrtc

  std::ostringstream code;
  bool               using_clang;

  CeedCallBackend(CeedGetIsClang(ceed, &using_clang));

  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS,
               using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)"
                           : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment "
                             "variable GPU_CLANG=1");

  // Get kernel specific options, such as kernel constants
  // Each define is a (char *name, int value) pair pulled from the va_list
  if (num_defines > 0) {
    char *name;
    int   val;

    for (int i = 0; i < num_defines; i++) {
      name = va_arg(args, char *);
      val  = va_arg(args, int);
      code << "#define " << name << " " << val << "\n";
    }
  }

  // Standard libCEED definitions for CUDA backends
  code << "#include <ceed/jit-source/cuda/cuda-jit.h>\n\n";

  // Non-macro options
  CeedCallBackend(CeedCalloc(num_opts, &opts));
  opts[0] = "-default-device";
  CeedCallBackend(CeedGetData(ceed, &ceed_data));
  CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
  // arch_arg is a function-scope std::string, so the c_str() stored in opts[1]
  // stays valid through the nvrtcCompileProgram call below
  std::string arch_arg =
#if CUDA_VERSION >= 11010
      // NVRTC used to support only virtual architectures through the option
      // -arch, since it was only emitting PTX. It will now support actual
      // architectures as well to emit SASS.
      // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
      "-arch=sm_"
#else
      "-arch=compute_"
#endif
      + std::to_string(prop.major) + std::to_string(prop.minor);
  opts[1] = arch_arg.c_str();
  opts[2] = "-Dint32_t=int";
  opts[3] = "-DCEED_RUNNING_JIT_PASS=1";
  // Additional include dirs
  // opts is grown to num_opts + num_jit_source_dirs; entries past num_opts are heap copies
  {
    const char **jit_source_dirs;

    CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
      std::ostringstream include_dir_arg;

      include_dir_arg << "-I" << jit_source_dirs[i];
      CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i]));
    }
    CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
  }
  // User defines
  // opts grows again; define entries follow the include-dir entries
  {
    const char **jit_defines;

    CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
    CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts));
    for (CeedInt i = 0; i < num_jit_defines; i++) {
      std::ostringstream define_arg;

      define_arg << "-D" << jit_defines[i];
      CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i]));
    }
    CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
  }

  // Add string source argument provided in call
  code << source;

  // Compile kernel
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
  CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
  CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");

  if (!using_clang) {
    // ---- NVRTC path: compile in memory and load the resulting CUBIN/PTX ----
    CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));

    if (CeedDebugFlag(ceed)) {
      // LCOV_EXCL_START
      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
      for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
        CeedDebug(ceed, "Option %d: %s", i, opts[i]);
      }
      CeedDebug(ceed, "");
      CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
      // LCOV_EXCL_STOP
    }

    nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);

    // Free the heap-allocated option strings and the opts array itself before
    // inspecting the compile result
    for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
      CeedCallBackend(CeedFree(&opts[num_opts + i]));
    }
    for (CeedInt i = 0; i < num_jit_defines; i++) {
      CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
    }
    CeedCallBackend(CeedFree(&opts));
    *is_compile_good = result == NVRTC_SUCCESS;
    if (!*is_compile_good) {
      char  *log;
      size_t log_size;

      CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
      CeedCallBackend(CeedMalloc(log_size, &log));
      CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
      if (throw_error) {
        // NOTE(review): `log` is not freed on this error path — presumably acceptable
        // since the error aborts the operation, but worth confirming
        return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
      } else {
        // LCOV_EXCL_START
        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
        CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
        CeedCallBackend(CeedFree(&log));
        CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
        return CEED_ERROR_SUCCESS;
        // LCOV_EXCL_STOP
      }
    }

#if CUDA_VERSION >= 11010
    // CUDA >= 11.1: retrieve SASS (CUBIN) directly since -arch=sm_* was used
    CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size));
    CeedCallBackend(CeedMalloc(ptx_size, &ptx));
    CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx));
#else
    CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
    CeedCallBackend(CeedMalloc(ptx_size, &ptx));
    CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
#endif
    CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));

    CeedCallCuda(ceed, cuModuleLoadData(module, ptx));
    CeedCallBackend(CeedFree(&ptx));
    return CEED_ERROR_SUCCESS;
  } else {
    // ---- Clang path: compile on disk, building and linking Rust crates ----
    // NOTE(review): the `opts` array allocated above is never freed on this path,
    // nor are the heap-allocated include-dir/define entries — possible leak
    // NOTE(review): rand() seeded with time(NULL) can collide for processes
    // started in the same second; build_id uniqueness is best-effort
    srand(time(NULL));
    const int build_id = rand();

    // Create temp dir if needed
    {
      DIR *dir = opendir("temp");

      if (dir) {
        closedir(dir);
      } else {
        // In parallel multiple processes may attempt
        // Only one process needs to succeed
        mkdir("temp", 0777);
        chmod("temp", 0777);
      }
    }
    // Write code to temp file
    {
      std::string filename = std::string("temp/kernel_") + std::to_string(build_id) + std::string("_0_source.cu");
      FILE       *file     = fopen(filename.c_str(), "w");

      CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang");
      fputs(code.str().c_str(), file);
      fclose(file);
    }

    // Get rust crate directories
    const char **rust_source_dirs     = nullptr;
    int          num_rust_source_dirs = 0;

    CeedCallBackend(CeedGetRustSourceRoots(ceed, &num_rust_source_dirs, &rust_source_dirs));

    // NOTE(review): fixed capacity of 10 — overflows if more than 10 Rust source
    // roots are registered; TODO confirm an upstream limit exists
    std::string rust_dirs[10];

    if (num_rust_source_dirs > 0) {
      CeedDebug(ceed, "There are %d source dirs, including %s\n", num_rust_source_dirs, rust_source_dirs[0]);
    }

    // Copy the dir strings before restoring the backend-owned array
    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
      rust_dirs[i] = std::string(rust_source_dirs[i]);
    }

    CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs));

    // Default to the nightly toolchain when RUST_TOOLCHAIN is unset
    char *rust_toolchain = std::getenv("RUST_TOOLCHAIN");

    if (rust_toolchain == nullptr) {
      rust_toolchain = (char *)"nightly";
      setenv("RUST_TOOLCHAIN", "nightly", 0);
    }

    // Compile Rust crate(s) needed
    std::string command;

    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
      command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] +
                "/.cargo/config.toml --manifest-path " + rust_dirs[i] + "/Cargo.toml";
      CeedCallSystem(ceed, command.c_str(), "build Rust crate");
    }

    // Get Clang version
    // Cached on ceed_data after first detection; llvm_version == 0 means "not yet detected"
    bool use_llvm_version = ceed_data->use_llvm_version;
    int  llvm_version     = ceed_data->llvm_version;

    if (llvm_version == 0) {
      command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) --version";
      CeedDebug(ceed, "Attempting to detect Rust LLVM version.\ncommand:\n$ %s", command.c_str());
      FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");

      CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version");

      char        line[CEED_MAX_RESOURCE_LEN] = "";
      std::string output                      = "";

      while (fgets(line, sizeof(line), output_stream) != nullptr) {
        output += line;
      }
      CeedDebug(ceed, "output:\n%s", output.c_str());
      CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version\ncommand:\n$ %s\nerror:\n%s",
                command.c_str(), output.c_str());

      // Parse the major version from e.g. "LLVM version 17.0.6"
      // NOTE(review): strstr may return NULL if the marker is absent, and the
      // += 13 / strchr sequence would then dereference invalid pointers — the
      // pclose check above presumably guarantees well-formed output; verify
      const char *version_substring = strstr(output.c_str(), "LLVM version ");

      version_substring += 13;  // skip past "LLVM version " (13 characters)

      char *next_dot = strchr((char *)version_substring, '.');

      next_dot[0]             = '\0';
      ceed_data->llvm_version = llvm_version = std::stoi(version_substring);
      CeedDebug(ceed, "Rust LLVM version number: %d\n", llvm_version);

      // Probe whether a version-suffixed clang++ binary (e.g. clang++-17) exists;
      // pclose() == 0 means the probe command ran successfully
      command                     = std::string("clang++-") + std::to_string(llvm_version);
      output_stream               = popen((command + std::string(" 2>&1")).c_str(), "r");
      ceed_data->use_llvm_version = use_llvm_version = pclose(output_stream) == 0;
    }

    // Compile wrapper kernel
    command = "clang++" + (use_llvm_version ? (std::string("-") + std::to_string(llvm_version)) : "") + " -flto=thin --cuda-gpu-arch=sm_" +
              std::to_string(prop.major) + std::to_string(prop.minor) + " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) +
              "_0_source.cu -o temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll ";
    // NOTE(review): opts[4] is the first JiT include-dir option, which only exists
    // when num_jit_source_dirs > 0 — otherwise this reads past the 4-entry opts
    // allocation. Additional include dirs and all -D defines are also not
    // forwarded to clang. TODO confirm at least one JiT source root is guaranteed
    command += opts[4];
    CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll").c_str(), "update JiT file permissions");

    // Find Rust's llvm-link tool and runs it
    command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
              std::to_string(build_id) +
              "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o "
              "temp/kernel_" +
              std::to_string(build_id) + "_2_linked.ll ";

    // Searches for .a files in rust directoy
    // Note: this is necessary because Rust crate names may not match the folder they are in
    for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
      std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release";
      DIR        *dp  = opendir(dir.c_str());

      CeedCheck(dp != nullptr, ceed, CEED_ERROR_BACKEND, "Could not open directory: %s", dir.c_str());
      struct dirent *entry;

      // Find files ending in .a
      while ((entry = readdir(dp)) != nullptr) {
        std::string filename(entry->d_name);

        if (filename.size() >= 2 && filename.substr(filename.size() - 2) == ".a") {
          command += dir + "/" + filename + " ";
        }
      }
      closedir(dp);
      // TODO: when libCEED switches to c++17, switch to std::filesystem for the loop above
    }

    // Link, optimize, and compile final CUDA kernel
    CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
    CeedCallSystem(
        ceed,
        ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" +
         std::to_string(build_id) + "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
            .c_str(),
        "optimize linked C and Rust source");
    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_2_linked.ll").c_str(), "update JiT file permissions");
    CeedCallSystem(ceed,
                   ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
                    std::to_string(prop.major) + std::to_string(prop.minor) + " temp/kernel_" + std::to_string(build_id) +
                    "_3_opt.bc -o temp/kernel_" + std::to_string(build_id) + "_4_final.ptx")
                       .c_str(),
                   "compile final CUDA kernel");
    CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_4_final.ptx").c_str(), "update JiT file permissions");

    // Read the final PTX back from disk and load it as a module
    ifstream      ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx");
    ostringstream sstr;

    sstr << ptxfile.rdbuf();

    auto ptx_data = sstr.str();
    // NOTE(review): ptx_size is assigned here but never read on this path
    ptx_size      = ptx_data.length();

    int result = cuModuleLoadData(module, ptx_data.c_str());

    *is_compile_good = result == 0;
    if (!*is_compile_good) {
      if (throw_error) {
        return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to load module data");
      } else {
        // LCOV_EXCL_START
        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
        CeedDebug(ceed, "Error: Failed to load module data");
        CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
        return CEED_ERROR_SUCCESS;
        // LCOV_EXCL_STOP
      }
    }
  }
  return CEED_ERROR_SUCCESS;
}
400 
401 int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
402   bool    is_compile_good = true;
403   va_list args;
404 
405   va_start(args, num_defines);
406   const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args);
407 
408   va_end(args);
409   CeedCallBackend(ierr);
410   return CEED_ERROR_SUCCESS;
411 }
412 
413 int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) {
414   va_list args;
415 
416   va_start(args, num_defines);
417   const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args);
418 
419   va_end(args);
420   CeedCallBackend(ierr);
421   return CEED_ERROR_SUCCESS;
422 }
423 
//------------------------------------------------------------------------------
// Get CUDA kernel
//
// Look up the function handle for `name` in a previously compiled module.
// Any CUDA driver error (e.g. kernel name not found) propagates via CeedCallCuda.
//------------------------------------------------------------------------------
int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel) {
  CeedCallCuda(ceed, cuModuleGetFunction(kernel, module, name));
  return CEED_ERROR_SUCCESS;
}
431 
//------------------------------------------------------------------------------
// Run CUDA kernel with block size selected automatically based on the kernel
//     (which may use enough registers to require a smaller block size than the
//      hardware is capable)
//
// NOTE(review): name lacks the `_Cuda` suffix used by the sibling run helpers;
// renaming would break existing callers, so left as-is
//------------------------------------------------------------------------------
int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void **args) {
  int min_grid_size, max_block_size;

  // Ask the driver for the occupancy-maximizing block size (no dynamic shared
  // memory; candidate block sizes capped at 0x10000), then launch with enough
  // blocks to cover `points`
  CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, kernel, NULL, 0, 0x10000));
  CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args));
  return CEED_ERROR_SUCCESS;
}
444 
//------------------------------------------------------------------------------
// Run CUDA kernel
//
// 1D launch convenience wrapper: grid_size blocks of block_size threads, no
// shared memory, default stream (NULL)
//------------------------------------------------------------------------------
int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) {
  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args));
  return CEED_ERROR_SUCCESS;
}
452 
//------------------------------------------------------------------------------
// Run CUDA kernel for spatial dimension
//
// 3D-block launch wrapper with no dynamic shared memory, default stream (NULL)
//------------------------------------------------------------------------------
int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z,
                          void **args) {
  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
  return CEED_ERROR_SUCCESS;
}
461 
//------------------------------------------------------------------------------
// Run CUDA kernel for spatial dimension with shared memory
//
// Core launch helper: launches `kernel` on `stream` with a (x, y, z) block and
// a 1D grid of grid_size blocks. When the launch fails with
// CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, either raises a detailed CeedError
// (throw_error = true) or logs the details and sets *is_good_run = false
// (throw_error = false). Any other launch error propagates via CeedChk_Cu.
//------------------------------------------------------------------------------
static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x,
                                           const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
                                           bool *is_good_run, void **args) {
#if CUDA_VERSION >= 9000
  // Best effort: raise the kernel's dynamic shared memory limit; the return
  // value is deliberately ignored since the launch below reports any real issue
  cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
#endif
  CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);

  if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
    int max_threads_per_block, shared_size_bytes, num_regs;

    // Gather kernel resource attributes to make the diagnostic actionable
    cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
    cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
    cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
    if (throw_error) {
      return CeedError(ceed, CEED_ERROR_BACKEND,
                       "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
                       max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
    } else {
      // LCOV_EXCL_START
      CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
      CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
                max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
      CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
      // LCOV_EXCL_STOP
    }
    // Only reached on the non-throwing path (the throw path returned above)
    *is_good_run = false;
  } else CeedChk_Cu(ceed, result);
  return CEED_ERROR_SUCCESS;
}
495 
496 int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
497                                 const int block_size_z, const int shared_mem_size, void **args) {
498   bool is_good_run = true;
499 
500   CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
501                                                   &is_good_run, args));
502   return CEED_ERROR_SUCCESS;
503 }
504 
//------------------------------------------------------------------------------
// Run CUDA kernel for spatial dimension with shared memory; on a launch that
// runs out of resources, set *is_good_run false instead of raising an error so
// the caller may retry with a different configuration
//------------------------------------------------------------------------------
int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
                                   const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
  CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
                                                  is_good_run, args));
  return CEED_ERROR_SUCCESS;
}
511 
512 //------------------------------------------------------------------------------
513