1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3 //
4 // SPDX-License-Identifier: BSD-2-Clause
5 //
6 // This file is part of CEED: http://github.com/ceed
7
8 #include "ceed-cuda-compile.h"
9
10 #include <ceed.h>
11 #include <ceed/backend.h>
12 #include <ceed/jit-tools.h>
13 #include <cuda_runtime.h>
14 #include <dirent.h>
15 #include <nvrtc.h>
16 #include <stdarg.h>
17 #include <stdio.h>
18 #include <string.h>
19 #include <sys/stat.h>
20 #include <sys/types.h>
21
22 #include <cstdlib>
23 #include <fstream>
24 #include <iostream>
25 #include <sstream>
26 #include <string>
27
28 #include "ceed-cuda-common.h"
29
// Check the result of an NVRTC call; on failure, return a CeedError carrying
// the NVRTC error string for the failing result code
#define CeedChk_Nvrtc(ceed, x) \
  do { \
    nvrtcResult result = static_cast<nvrtcResult>(x); \
    if (result != NVRTC_SUCCESS) return CeedError((ceed), CEED_ERROR_BACKEND, nvrtcGetErrorString(result)); \
  } while (0)

// Evaluate an NVRTC expression exactly once and route its result through
// CeedChk_Nvrtc (the temporary avoids re-evaluating the argument)
#define CeedCallNvrtc(ceed, ...) \
  do { \
    int ierr_q_ = __VA_ARGS__; \
    CeedChk_Nvrtc(ceed, ierr_q_); \
  } while (0)

// Run a shell command via CeedCallSystem_Core, turning a nonzero exit status
// into a backend error that includes `message` and the captured output
#define CeedCallSystem(ceed, command, message) CeedCallBackend(CeedCallSystem_Core(ceed, command, message))
43
44 //------------------------------------------------------------------------------
45 // Call system command and capture stdout + stderr
46 //------------------------------------------------------------------------------
CeedCallSystem_Core(Ceed ceed,const char * command,const char * message)47 static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
48 CeedDebug(ceed, "Running command:\n$ %s", command);
49 FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
50
51 CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s", message, command);
52
53 char line[CEED_MAX_RESOURCE_LEN] = "";
54 std::string output = "";
55
56 while (fgets(line, sizeof(line), output_stream) != nullptr) {
57 output += line;
58 }
59 CeedDebug(ceed, "output:\n%s\n", output.c_str());
60 CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s\nerror:\n%s", message, command, output.c_str());
61 return CEED_ERROR_SUCCESS;
62 }
63
64 //------------------------------------------------------------------------------
65 // Compile CUDA kernel
66 //------------------------------------------------------------------------------
67 using std::ifstream;
68 using std::ofstream;
69 using std::ostringstream;
70
CeedCompileCore_Cuda(Ceed ceed,const char * source,const bool throw_error,bool * is_compile_good,CUmodule * module,const CeedInt num_defines,va_list args)71 static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
72 const CeedInt num_defines, va_list args) {
73 size_t ptx_size;
74 char *ptx;
75 const int num_opts = 4;
76 CeedInt num_jit_source_dirs = 0, num_jit_defines = 0;
77 const char **opts;
78 nvrtcProgram prog;
79 struct cudaDeviceProp prop;
80 Ceed_Cuda *ceed_data;
81
82 cudaFree(0); // Make sure a Context exists for nvrtc
83
84 std::ostringstream code;
85 bool using_clang;
86
87 CeedCallBackend(CeedGetIsClang(ceed, &using_clang));
88
89 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS,
90 using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)"
91 : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment "
92 "variable GPU_CLANG=1");
93
94 // Get kernel specific options, such as kernel constants
95 if (num_defines > 0) {
96 char *name;
97 int val;
98
99 for (int i = 0; i < num_defines; i++) {
100 name = va_arg(args, char *);
101 val = va_arg(args, int);
102 code << "#define " << name << " " << val << "\n";
103 }
104 }
105
106 // Standard libCEED definitions for CUDA backends
107 code << "#include <ceed/jit-source/cuda/cuda-jit.h>\n\n";
108
109 // Non-macro options
110 CeedCallBackend(CeedCalloc(num_opts, &opts));
111 opts[0] = "-default-device";
112 CeedCallBackend(CeedGetData(ceed, &ceed_data));
113 CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
114 std::string arch_arg =
115 #if CUDA_VERSION >= 11010
116 // NVRTC used to support only virtual architectures through the option
117 // -arch, since it was only emitting PTX. It will now support actual
118 // architectures as well to emit SASS.
119 // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
120 "-arch=sm_"
121 #else
122 "-arch=compute_"
123 #endif
124 + std::to_string(prop.major) + std::to_string(prop.minor);
125 opts[1] = arch_arg.c_str();
126 opts[2] = "-Dint32_t=int";
127 opts[3] = "-DCEED_RUNNING_JIT_PASS=1";
128 // Additional include dirs
129 {
130 const char **jit_source_dirs;
131
132 CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
133 CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
134 for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
135 std::ostringstream include_dir_arg;
136
137 include_dir_arg << "-I" << jit_source_dirs[i];
138 CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i]));
139 }
140 CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
141 }
142 // User defines
143 {
144 const char **jit_defines;
145
146 CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
147 CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts));
148 for (CeedInt i = 0; i < num_jit_defines; i++) {
149 std::ostringstream define_arg;
150
151 define_arg << "-D" << jit_defines[i];
152 CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i]));
153 }
154 CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
155 }
156
157 // Add string source argument provided in call
158 code << source;
159
160 // Compile kernel
161 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
162 CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
163 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");
164
165 if (!using_clang) {
166 CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
167
168 if (CeedDebugFlag(ceed)) {
169 // LCOV_EXCL_START
170 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
171 for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
172 CeedDebug(ceed, "Option %d: %s", i, opts[i]);
173 }
174 CeedDebug(ceed, "");
175 CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
176 // LCOV_EXCL_STOP
177 }
178
179 nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
180
181 for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
182 CeedCallBackend(CeedFree(&opts[num_opts + i]));
183 }
184 for (CeedInt i = 0; i < num_jit_defines; i++) {
185 CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
186 }
187 CeedCallBackend(CeedFree(&opts));
188 *is_compile_good = result == NVRTC_SUCCESS;
189 if (!*is_compile_good) {
190 char *log;
191 size_t log_size;
192
193 CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
194 CeedCallBackend(CeedMalloc(log_size, &log));
195 CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
196 if (throw_error) {
197 return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
198 } else {
199 // LCOV_EXCL_START
200 CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
201 CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
202 CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
203 CeedCallBackend(CeedFree(&log));
204 CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
205 return CEED_ERROR_SUCCESS;
206 // LCOV_EXCL_STOP
207 }
208 }
209
210 #if CUDA_VERSION >= 11010
211 CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size));
212 CeedCallBackend(CeedMalloc(ptx_size, &ptx));
213 CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx));
214 #else
215 CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
216 CeedCallBackend(CeedMalloc(ptx_size, &ptx));
217 CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
218 #endif
219 CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
220
221 CeedCallCuda(ceed, cuModuleLoadData(module, ptx));
222 CeedCallBackend(CeedFree(&ptx));
223 return CEED_ERROR_SUCCESS;
224 } else {
225 srand(time(NULL));
226 const int build_id = rand();
227
228 // Create temp dir if needed
229 {
230 DIR *dir = opendir("temp");
231
232 if (dir) {
233 closedir(dir);
234 } else {
235 // In parallel multiple processes may attempt
236 // Only one process needs to succeed
237 mkdir("temp", 0777);
238 chmod("temp", 0777);
239 }
240 }
241 // Write code to temp file
242 {
243 std::string filename = std::string("temp/kernel_") + std::to_string(build_id) + std::string("_0_source.cu");
244 FILE *file = fopen(filename.c_str(), "w");
245
246 CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang");
247 fputs(code.str().c_str(), file);
248 fclose(file);
249 }
250
251 // Get rust crate directories
252 const char **rust_source_dirs = nullptr;
253 int num_rust_source_dirs = 0;
254
255 CeedCallBackend(CeedGetRustSourceRoots(ceed, &num_rust_source_dirs, &rust_source_dirs));
256
257 std::string rust_dirs[10];
258
259 if (num_rust_source_dirs > 0) {
260 CeedDebug(ceed, "There are %d source dirs, including %s\n", num_rust_source_dirs, rust_source_dirs[0]);
261 }
262
263 for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
264 rust_dirs[i] = std::string(rust_source_dirs[i]);
265 }
266
267 CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs));
268
269 char *rust_toolchain = std::getenv("RUST_TOOLCHAIN");
270
271 if (rust_toolchain == nullptr) {
272 rust_toolchain = (char *)"nightly";
273 setenv("RUST_TOOLCHAIN", "nightly", 0);
274 }
275
276 // Compile Rust crate(s) needed
277 std::string command;
278
279 for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
280 command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] +
281 "/.cargo/config.toml --manifest-path " + rust_dirs[i] + "/Cargo.toml";
282 CeedCallSystem(ceed, command.c_str(), "build Rust crate");
283 }
284
285 // Get Clang version
286 bool use_llvm_version = ceed_data->use_llvm_version;
287 int llvm_version = ceed_data->llvm_version;
288
289 if (llvm_version == 0) {
290 command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) --version";
291 CeedDebug(ceed, "Attempting to detect Rust LLVM version.\ncommand:\n$ %s", command.c_str());
292 FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
293
294 CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version");
295
296 char line[CEED_MAX_RESOURCE_LEN] = "";
297 std::string output = "";
298
299 while (fgets(line, sizeof(line), output_stream) != nullptr) {
300 output += line;
301 }
302 CeedDebug(ceed, "output:\n%s", output.c_str());
303 CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version\ncommand:\n$ %s\nerror:\n%s",
304 command.c_str(), output.c_str());
305
306 const char *version_substring = strstr(output.c_str(), "LLVM version ");
307
308 version_substring += 13;
309
310 char *next_dot = strchr((char *)version_substring, '.');
311
312 if (next_dot) {
313 next_dot[0] = '\0';
314 ceed_data->llvm_version = llvm_version = std::stoi(version_substring);
315 CeedDebug(ceed, "Rust LLVM version number: %d\n", llvm_version);
316
317 command = std::string("clang++-") + std::to_string(llvm_version);
318 output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
319 ceed_data->use_llvm_version = use_llvm_version = pclose(output_stream) == 0;
320 } else {
321 ceed_data->llvm_version = -1;
322 ceed_data->use_llvm_version = use_llvm_version = false;
323 }
324 }
325
326 // Compile wrapper kernel
327 command = "clang++" + (use_llvm_version ? (std::string("-") + std::to_string(llvm_version)) : "") + " -flto=thin --cuda-gpu-arch=sm_" +
328 std::to_string(prop.major) + std::to_string(prop.minor) + " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) +
329 "_0_source.cu -o temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll ";
330 command += opts[4];
331 CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
332 CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll").c_str(), "update JiT file permissions");
333
334 // Find Rust's llvm-link tool and run it
335 command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
336 std::to_string(build_id) +
337 "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o "
338 "temp/kernel_" +
339 std::to_string(build_id) + "_2_linked.ll ";
340
341 // Searches for .a files in Rust directory
342 // Note: Rust crate names may not match the folder they are in
343 // TODO: If libCEED switches to c++17, use std::filesystem here
344 for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
345 std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release";
346 DIR *dp = opendir(dir.c_str());
347
348 CeedCheck(dp != nullptr, ceed, CEED_ERROR_BACKEND, "Could not open directory: %s", dir.c_str());
349 struct dirent *entry;
350
351 // Find files ending in .a
352 while ((entry = readdir(dp)) != nullptr) {
353 std::string filename(entry->d_name);
354
355 if (filename.size() >= 2 && filename.substr(filename.size() - 2) == ".a") {
356 command += dir + "/" + filename + " ";
357 }
358 }
359 closedir(dp);
360 }
361
362 // Link, optimize, and compile final CUDA kernel
363 CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
364 CeedCallSystem(ceed,
365 ("$(find $(rustup run " + std::string(rust_toolchain) +
366 " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" + std::to_string(build_id) +
367 "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
368 .c_str(),
369 "optimize linked C and Rust source");
370 CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_2_linked.ll").c_str(), "update JiT file permissions");
371 CeedCallSystem(ceed,
372 ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
373 std::to_string(prop.major) + std::to_string(prop.minor) + " temp/kernel_" + std::to_string(build_id) +
374 "_3_opt.bc -o temp/kernel_" + std::to_string(build_id) + "_4_final.ptx")
375 .c_str(),
376 "compile final CUDA kernel");
377 CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_4_final.ptx").c_str(), "update JiT file permissions");
378
379 // Load module from final PTX
380 ifstream ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx");
381 ostringstream sstr;
382
383 sstr << ptxfile.rdbuf();
384
385 auto ptx_data = sstr.str();
386 ptx_size = ptx_data.length();
387
388 int result = cuModuleLoadData(module, ptx_data.c_str());
389
390 *is_compile_good = result == 0;
391 if (!*is_compile_good) {
392 if (throw_error) {
393 return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to load module data");
394 } else {
395 // LCOV_EXCL_START
396 CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
397 CeedDebug(ceed, "Error: Failed to load module data");
398 CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
399 return CEED_ERROR_SUCCESS;
400 // LCOV_EXCL_STOP
401 }
402 }
403 }
404 return CEED_ERROR_SUCCESS;
405 }
406
CeedCompile_Cuda(Ceed ceed,const char * source,CUmodule * module,const CeedInt num_defines,...)407 int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
408 bool is_compile_good = true;
409 va_list args;
410
411 va_start(args, num_defines);
412 const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args);
413
414 va_end(args);
415 CeedCallBackend(ierr);
416 return CEED_ERROR_SUCCESS;
417 }
418
CeedTryCompile_Cuda(Ceed ceed,const char * source,bool * is_compile_good,CUmodule * module,const CeedInt num_defines,...)419 int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) {
420 va_list args;
421
422 va_start(args, num_defines);
423 const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args);
424
425 va_end(args);
426 CeedCallBackend(ierr);
427 return CEED_ERROR_SUCCESS;
428 }
429
430 //------------------------------------------------------------------------------
431 // Get CUDA kernel
432 //------------------------------------------------------------------------------
// Look up the kernel `name` in a previously compiled module; a lookup failure
// surfaces as a backend error through CeedCallCuda
int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel) {
  CeedCallCuda(ceed, cuModuleGetFunction(kernel, module, name));
  return CEED_ERROR_SUCCESS;
}
437
438 //------------------------------------------------------------------------------
439 // Run CUDA kernel with block size selected automatically based on the kernel
440 // (which may use enough registers to require a smaller block size than the
441 // hardware is capable)
442 //------------------------------------------------------------------------------
int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void **args) {
  int min_grid_size, max_block_size;

  // Ask the driver for the largest block size this kernel can launch with
  // (register/shared-memory usage may force it below the hardware maximum);
  // 0x10000 is presumably an effectively-unbounded block size limit — TODO confirm
  CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, kernel, NULL, 0, 0x10000));
  // One thread per point, with the grid size rounded up to cover all points
  CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args));
  return CEED_ERROR_SUCCESS;
}
450
451 //------------------------------------------------------------------------------
452 // Run CUDA kernel
453 //------------------------------------------------------------------------------
// Launch a kernel with a 1-D block shape and no dynamic shared memory on the
// default stream (NULL)
int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) {
  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args));
  return CEED_ERROR_SUCCESS;
}
458
459 //------------------------------------------------------------------------------
460 // Run CUDA kernel for spatial dimension
461 //------------------------------------------------------------------------------
// Launch a kernel with a 3-D block shape and no dynamic shared memory on the
// default stream (NULL)
int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z,
                          void **args) {
  CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
  return CEED_ERROR_SUCCESS;
}
467
468 //------------------------------------------------------------------------------
469 // Run CUDA kernel for spatial dimension with shared memory
470 //------------------------------------------------------------------------------
CeedRunKernelDimSharedCore_Cuda(Ceed ceed,CUfunction kernel,CUstream stream,const int grid_size,const int block_size_x,const int block_size_y,const int block_size_z,const int shared_mem_size,const bool throw_error,bool * is_good_run,void ** args)471 static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x,
472 const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
473 bool *is_good_run, void **args) {
474 #if CUDA_VERSION >= 9000
475 cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
476 #endif
477 CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
478
479 if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
480 int max_threads_per_block, shared_size_bytes, num_regs;
481
482 cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
483 cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
484 cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
485 if (throw_error) {
486 return CeedError(ceed, CEED_ERROR_BACKEND,
487 "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
488 max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
489 } else {
490 // LCOV_EXCL_START
491 CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
492 CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
493 max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
494 CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
495 // LCOV_EXCL_STOP
496 }
497 *is_good_run = false;
498 } else CeedChk_Cu(ceed, result);
499 return CEED_ERROR_SUCCESS;
500 }
501
CeedRunKernelDimShared_Cuda(Ceed ceed,CUfunction kernel,CUstream stream,const int grid_size,const int block_size_x,const int block_size_y,const int block_size_z,const int shared_mem_size,void ** args)502 int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
503 const int block_size_z, const int shared_mem_size, void **args) {
504 bool is_good_run = true;
505
506 CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
507 &is_good_run, args));
508 return CEED_ERROR_SUCCESS;
509 }
510
// Non-throwing launch variant: an out-of-resources launch sets
// *is_good_run = false instead of raising an error, so callers may retry with
// a different configuration or fall back
int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
                                   const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
  CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
                                                  is_good_run, args));
  return CEED_ERROR_SUCCESS;
}
517
518 //------------------------------------------------------------------------------
519