xref: /libCEED/rust/libceed-sys/c-src/backends/cuda/ceed-cuda-compile.cpp (revision 1a8516d00062e8132c3db0515cc9f5fa064f6664)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2c9c2c079SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3c9c2c079SJeremy L Thompson //
4c9c2c079SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5c9c2c079SJeremy L Thompson //
6c9c2c079SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7c9c2c079SJeremy L Thompson 
#include "ceed-cuda-compile.h"

#include <ceed.h>
#include <ceed/backend.h>
#include <ceed/jit-tools.h>
#include <cuda_runtime.h>
#include <dirent.h>
#include <nvrtc.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>

#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

#include "ceed-cuda-common.h"
29c9c2c079SJeremy L Thompson 
// Check an NVRTC status code `x`.
// On anything other than NVRTC_SUCCESS, return from the enclosing function with a CEED_ERROR_BACKEND error whose message is the NVRTC error string.
// The string is passed through "%s" so that it is never interpreted as a printf-style format.
#define CeedChk_Nvrtc(ceed, x)                                                                       \
  do {                                                                                               \
    nvrtcResult ceed_nvrtc_result_ = static_cast<nvrtcResult>(x);                                    \
    if (ceed_nvrtc_result_ != NVRTC_SUCCESS) {                                                       \
      return CeedError((ceed), CEED_ERROR_BACKEND, "%s", nvrtcGetErrorString(ceed_nvrtc_result_));   \
    }                                                                                                \
  } while (0)
35c9c2c079SJeremy L Thompson 
// Evaluate an NVRTC call exactly once and hand its status to CeedChk_Nvrtc,
// which returns from the enclosing function if the call did not succeed.
#define CeedCallNvrtc(ceed, ...)             \
  do {                                       \
    int ceed_nvrtc_status_ = __VA_ARGS__;    \
    CeedChk_Nvrtc(ceed, ceed_nvrtc_status_); \
  } while (0)
412b730f8bSJeremy L Thompson 
// Run a shell command via CeedCallSystem_Core and propagate a failure through CeedCallBackend
#define CeedCallSystem(ceed, command, message) \
  CeedCallBackend(CeedCallSystem_Core((ceed), (command), (message)))
432027fb9dSSirAlienTheGreat 
442027fb9dSSirAlienTheGreat //------------------------------------------------------------------------------
452027fb9dSSirAlienTheGreat // Call system command and capture stdout + stderr
462027fb9dSSirAlienTheGreat //------------------------------------------------------------------------------
472027fb9dSSirAlienTheGreat static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
48fa619eccSJeremy L Thompson   CeedDebug(ceed, "Running command:\n$ %s", command);
492027fb9dSSirAlienTheGreat   FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
502027fb9dSSirAlienTheGreat 
51fa619eccSJeremy L Thompson   CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s", message, command);
522027fb9dSSirAlienTheGreat 
53fa619eccSJeremy L Thompson   char        line[CEED_MAX_RESOURCE_LEN] = "";
54fa619eccSJeremy L Thompson   std::string output                      = "";
552027fb9dSSirAlienTheGreat 
56fa619eccSJeremy L Thompson   while (fgets(line, sizeof(line), output_stream) != nullptr) {
57fa619eccSJeremy L Thompson     output += line;
582027fb9dSSirAlienTheGreat   }
59fa619eccSJeremy L Thompson   CeedDebug(ceed, "output:\n%s\n", output.c_str());
60fa619eccSJeremy L Thompson   CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s\ncommand:\n$ %s\nerror:\n%s", message, command, output.c_str());
612027fb9dSSirAlienTheGreat   return CEED_ERROR_SUCCESS;
622027fb9dSSirAlienTheGreat }
632027fb9dSSirAlienTheGreat 
64c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
65c9c2c079SJeremy L Thompson // Compile CUDA kernel
66c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
672027fb9dSSirAlienTheGreat using std::ifstream;
682027fb9dSSirAlienTheGreat using std::ofstream;
692027fb9dSSirAlienTheGreat using std::ostringstream;
702027fb9dSSirAlienTheGreat 
71ddae5012SJeremy L Thompson static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
72ddae5012SJeremy L Thompson                                 const CeedInt num_defines, va_list args) {
73ca735530SJeremy L Thompson   size_t                ptx_size;
7422070f95SJeremy L Thompson   char                 *ptx;
75a491a57eSJeremy L Thompson   const int             num_opts            = 4;
764753b775SJeremy L Thompson   CeedInt               num_jit_source_dirs = 0, num_jit_defines = 0;
77b13efd58SJeremy L Thompson   const char          **opts;
78c9c2c079SJeremy L Thompson   nvrtcProgram          prog;
79ca735530SJeremy L Thompson   struct cudaDeviceProp prop;
80ca735530SJeremy L Thompson   Ceed_Cuda            *ceed_data;
81ca735530SJeremy L Thompson 
82ca735530SJeremy L Thompson   cudaFree(0);  // Make sure a Context exists for nvrtc
83c9c2c079SJeremy L Thompson 
84c9c2c079SJeremy L Thompson   std::ostringstream code;
852027fb9dSSirAlienTheGreat   bool               using_clang;
862027fb9dSSirAlienTheGreat 
872027fb9dSSirAlienTheGreat   CeedCallBackend(CeedGetIsClang(ceed, &using_clang));
882027fb9dSSirAlienTheGreat 
892027fb9dSSirAlienTheGreat   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS,
902027fb9dSSirAlienTheGreat                using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)"
912027fb9dSSirAlienTheGreat                            : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment "
922027fb9dSSirAlienTheGreat                              "variable GPU_CLANG=1");
93c9c2c079SJeremy L Thompson 
94c9c2c079SJeremy L Thompson   // Get kernel specific options, such as kernel constants
95c9c2c079SJeremy L Thompson   if (num_defines > 0) {
96c9c2c079SJeremy L Thompson     char *name;
97c9c2c079SJeremy L Thompson     int   val;
98ca735530SJeremy L Thompson 
99c9c2c079SJeremy L Thompson     for (int i = 0; i < num_defines; i++) {
100c9c2c079SJeremy L Thompson       name = va_arg(args, char *);
101c9c2c079SJeremy L Thompson       val  = va_arg(args, int);
102c9c2c079SJeremy L Thompson       code << "#define " << name << " " << val << "\n";
103c9c2c079SJeremy L Thompson     }
104c9c2c079SJeremy L Thompson   }
105c9c2c079SJeremy L Thompson 
106c9c2c079SJeremy L Thompson   // Standard libCEED definitions for CUDA backends
10791adc9c8SJeremy L Thompson   code << "#include <ceed/jit-source/cuda/cuda-jit.h>\n\n";
108c9c2c079SJeremy L Thompson 
109c9c2c079SJeremy L Thompson   // Non-macro options
110b13efd58SJeremy L Thompson   CeedCallBackend(CeedCalloc(num_opts, &opts));
111c9c2c079SJeremy L Thompson   opts[0] = "-default-device";
1122b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &ceed_data));
1132b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
11429ec485eSJed Brown   std::string arch_arg =
11529ec485eSJed Brown #if CUDA_VERSION >= 11010
11629ec485eSJed Brown       // NVRTC used to support only virtual architectures through the option
11729ec485eSJed Brown       // -arch, since it was only emitting PTX. It will now support actual
11829ec485eSJed Brown       // architectures as well to emit SASS.
11929ec485eSJed Brown       // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
12029ec485eSJed Brown       "-arch=sm_"
12129ec485eSJed Brown #else
12229ec485eSJed Brown       "-arch=compute_"
12329ec485eSJed Brown #endif
12429ec485eSJed Brown       + std::to_string(prop.major) + std::to_string(prop.minor);
125c9c2c079SJeremy L Thompson   opts[1] = arch_arg.c_str();
126c9c2c079SJeremy L Thompson   opts[2] = "-Dint32_t=int";
127a491a57eSJeremy L Thompson   opts[3] = "-DCEED_RUNNING_JIT_PASS=1";
1284753b775SJeremy L Thompson   // Additional include dirs
129b13efd58SJeremy L Thompson   {
130b13efd58SJeremy L Thompson     const char **jit_source_dirs;
131b13efd58SJeremy L Thompson 
132b13efd58SJeremy L Thompson     CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
133b13efd58SJeremy L Thompson     CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
134b13efd58SJeremy L Thompson     for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
1354753b775SJeremy L Thompson       std::ostringstream include_dir_arg;
136b13efd58SJeremy L Thompson 
1374753b775SJeremy L Thompson       include_dir_arg << "-I" << jit_source_dirs[i];
1384753b775SJeremy L Thompson       CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i]));
139b13efd58SJeremy L Thompson     }
140b13efd58SJeremy L Thompson     CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
141b13efd58SJeremy L Thompson   }
1424753b775SJeremy L Thompson   // User defines
1434753b775SJeremy L Thompson   {
1444753b775SJeremy L Thompson     const char **jit_defines;
1454753b775SJeremy L Thompson 
1464753b775SJeremy L Thompson     CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
1474753b775SJeremy L Thompson     CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts));
1484753b775SJeremy L Thompson     for (CeedInt i = 0; i < num_jit_defines; i++) {
1494753b775SJeremy L Thompson       std::ostringstream define_arg;
1504753b775SJeremy L Thompson 
1514753b775SJeremy L Thompson       define_arg << "-D" << jit_defines[i];
1524753b775SJeremy L Thompson       CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i]));
1534753b775SJeremy L Thompson     }
1544753b775SJeremy L Thompson     CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
1554753b775SJeremy L Thompson   }
156c9c2c079SJeremy L Thompson 
157c9c2c079SJeremy L Thompson   // Add string source argument provided in call
158c9c2c079SJeremy L Thompson   code << source;
159c9c2c079SJeremy L Thompson 
160c9c2c079SJeremy L Thompson   // Compile kernel
161c21e34e2SJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
16226ef7cdaSJeremy L Thompson   CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
163c21e34e2SJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");
1642027fb9dSSirAlienTheGreat 
1652027fb9dSSirAlienTheGreat   if (!using_clang) {
1662027fb9dSSirAlienTheGreat     CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
1672027fb9dSSirAlienTheGreat 
168bdcc2728SJeremy L Thompson     if (CeedDebugFlag(ceed)) {
169bdcc2728SJeremy L Thompson       // LCOV_EXCL_START
170c21e34e2SJeremy L Thompson       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
171bdcc2728SJeremy L Thompson       for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
172bdcc2728SJeremy L Thompson         CeedDebug(ceed, "Option %d: %s", i, opts[i]);
173bdcc2728SJeremy L Thompson       }
174bdcc2728SJeremy L Thompson       CeedDebug(ceed, "");
175c21e34e2SJeremy L Thompson       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
176bdcc2728SJeremy L Thompson       // LCOV_EXCL_STOP
177bdcc2728SJeremy L Thompson     }
1782027fb9dSSirAlienTheGreat 
1794753b775SJeremy L Thompson     nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
180ca735530SJeremy L Thompson 
181b13efd58SJeremy L Thompson     for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
182b13efd58SJeremy L Thompson       CeedCallBackend(CeedFree(&opts[num_opts + i]));
183b13efd58SJeremy L Thompson     }
1844753b775SJeremy L Thompson     for (CeedInt i = 0; i < num_jit_defines; i++) {
1854753b775SJeremy L Thompson       CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
1864753b775SJeremy L Thompson     }
187b13efd58SJeremy L Thompson     CeedCallBackend(CeedFree(&opts));
188ddae5012SJeremy L Thompson     *is_compile_good = result == NVRTC_SUCCESS;
18928c1f747SJeremy L Thompson     if (!*is_compile_good) {
190c9c2c079SJeremy L Thompson       char  *log;
191ca735530SJeremy L Thompson       size_t log_size;
192ca735530SJeremy L Thompson 
193ca735530SJeremy L Thompson       CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
1942b730f8bSJeremy L Thompson       CeedCallBackend(CeedMalloc(log_size, &log));
1952b730f8bSJeremy L Thompson       CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
19628c1f747SJeremy L Thompson       if (throw_error) {
1972b730f8bSJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
19828c1f747SJeremy L Thompson       } else {
199c49dc7a7SJeremy L Thompson         // LCOV_EXCL_START
20028c1f747SJeremy L Thompson         CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
20128c1f747SJeremy L Thompson         CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
2022027fb9dSSirAlienTheGreat         CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
20328c1f747SJeremy L Thompson         CeedCallBackend(CeedFree(&log));
20428c1f747SJeremy L Thompson         CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
20528c1f747SJeremy L Thompson         return CEED_ERROR_SUCCESS;
206c49dc7a7SJeremy L Thompson         // LCOV_EXCL_STOP
20728c1f747SJeremy L Thompson       }
208c9c2c079SJeremy L Thompson     }
209c9c2c079SJeremy L Thompson 
21029ec485eSJed Brown #if CUDA_VERSION >= 11010
21129ec485eSJed Brown     CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size));
21229ec485eSJed Brown     CeedCallBackend(CeedMalloc(ptx_size, &ptx));
21329ec485eSJed Brown     CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx));
21429ec485eSJed Brown #else
2152b730f8bSJeremy L Thompson     CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
2162b730f8bSJeremy L Thompson     CeedCallBackend(CeedMalloc(ptx_size, &ptx));
2172b730f8bSJeremy L Thompson     CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
21829ec485eSJed Brown #endif
2192b730f8bSJeremy L Thompson     CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
220c9c2c079SJeremy L Thompson 
2212b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cuModuleLoadData(module, ptx));
2222b730f8bSJeremy L Thompson     CeedCallBackend(CeedFree(&ptx));
223c9c2c079SJeremy L Thompson     return CEED_ERROR_SUCCESS;
2242027fb9dSSirAlienTheGreat   } else {
2259b5f41c8SJeremy L Thompson     srand(time(NULL));
2269b5f41c8SJeremy L Thompson     const int build_id = rand();
2272027fb9dSSirAlienTheGreat 
2289b5f41c8SJeremy L Thompson     // Create temp dir if needed
2299b5f41c8SJeremy L Thompson     {
2309b5f41c8SJeremy L Thompson       DIR *dir = opendir("temp");
2319b5f41c8SJeremy L Thompson 
2329b5f41c8SJeremy L Thompson       if (dir) {
2339b5f41c8SJeremy L Thompson         closedir(dir);
2349b5f41c8SJeremy L Thompson       } else {
23533cc410dSJeremy L Thompson         // In parallel multiple processes may attempt
23633cc410dSJeremy L Thompson         // Only one process needs to succeed
2379b5f41c8SJeremy L Thompson         mkdir("temp", 0777);
2389b5f41c8SJeremy L Thompson         chmod("temp", 0777);
2399b5f41c8SJeremy L Thompson       }
2409b5f41c8SJeremy L Thompson     }
2419b5f41c8SJeremy L Thompson     // Write code to temp file
2429b5f41c8SJeremy L Thompson     {
2439b5f41c8SJeremy L Thompson       std::string filename = std::string("temp/kernel_") + std::to_string(build_id) + std::string("_0_source.cu");
2449b5f41c8SJeremy L Thompson       FILE       *file     = fopen(filename.c_str(), "w");
2459b5f41c8SJeremy L Thompson 
2469b5f41c8SJeremy L Thompson       CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang");
2472027fb9dSSirAlienTheGreat       fputs(code.str().c_str(), file);
2482027fb9dSSirAlienTheGreat       fclose(file);
2499b5f41c8SJeremy L Thompson     }
2502027fb9dSSirAlienTheGreat 
2512027fb9dSSirAlienTheGreat     // Get rust crate directories
2522027fb9dSSirAlienTheGreat     const char **rust_source_dirs     = nullptr;
2532027fb9dSSirAlienTheGreat     int          num_rust_source_dirs = 0;
2542027fb9dSSirAlienTheGreat 
2552027fb9dSSirAlienTheGreat     CeedCallBackend(CeedGetRustSourceRoots(ceed, &num_rust_source_dirs, &rust_source_dirs));
2562027fb9dSSirAlienTheGreat 
2572027fb9dSSirAlienTheGreat     std::string rust_dirs[10];
2582027fb9dSSirAlienTheGreat 
2592027fb9dSSirAlienTheGreat     if (num_rust_source_dirs > 0) {
2602027fb9dSSirAlienTheGreat       CeedDebug(ceed, "There are %d source dirs, including %s\n", num_rust_source_dirs, rust_source_dirs[0]);
2612027fb9dSSirAlienTheGreat     }
2622027fb9dSSirAlienTheGreat 
2632027fb9dSSirAlienTheGreat     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
2642027fb9dSSirAlienTheGreat       rust_dirs[i] = std::string(rust_source_dirs[i]);
2652027fb9dSSirAlienTheGreat     }
2662027fb9dSSirAlienTheGreat 
2672027fb9dSSirAlienTheGreat     CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs));
2682027fb9dSSirAlienTheGreat 
2692027fb9dSSirAlienTheGreat     char *rust_toolchain = std::getenv("RUST_TOOLCHAIN");
2702027fb9dSSirAlienTheGreat 
2712027fb9dSSirAlienTheGreat     if (rust_toolchain == nullptr) {
2722027fb9dSSirAlienTheGreat       rust_toolchain = (char *)"nightly";
2732027fb9dSSirAlienTheGreat       setenv("RUST_TOOLCHAIN", "nightly", 0);
2742027fb9dSSirAlienTheGreat     }
2752027fb9dSSirAlienTheGreat 
2762027fb9dSSirAlienTheGreat     // Compile Rust crate(s) needed
2772027fb9dSSirAlienTheGreat     std::string command;
2782027fb9dSSirAlienTheGreat 
2792027fb9dSSirAlienTheGreat     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
2802027fb9dSSirAlienTheGreat       command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] +
2812027fb9dSSirAlienTheGreat                 "/.cargo/config.toml --manifest-path " + rust_dirs[i] + "/Cargo.toml";
2822027fb9dSSirAlienTheGreat       CeedCallSystem(ceed, command.c_str(), "build Rust crate");
2832027fb9dSSirAlienTheGreat     }
2842027fb9dSSirAlienTheGreat 
285fa619eccSJeremy L Thompson     // Get Clang version
286fa619eccSJeremy L Thompson     bool use_llvm_version = ceed_data->use_llvm_version;
287fa619eccSJeremy L Thompson     int  llvm_version     = ceed_data->llvm_version;
288fa619eccSJeremy L Thompson 
289fa619eccSJeremy L Thompson     if (llvm_version == 0) {
290fa619eccSJeremy L Thompson       command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) --version";
291fa619eccSJeremy L Thompson       CeedDebug(ceed, "Attempting to detect Rust LLVM version.\ncommand:\n$ %s", command.c_str());
292fa619eccSJeremy L Thompson       FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
293fa619eccSJeremy L Thompson 
294fa619eccSJeremy L Thompson       CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version");
295fa619eccSJeremy L Thompson 
296fa619eccSJeremy L Thompson       char        line[CEED_MAX_RESOURCE_LEN] = "";
297fa619eccSJeremy L Thompson       std::string output                      = "";
298fa619eccSJeremy L Thompson 
299fa619eccSJeremy L Thompson       while (fgets(line, sizeof(line), output_stream) != nullptr) {
300fa619eccSJeremy L Thompson         output += line;
301fa619eccSJeremy L Thompson       }
302fa619eccSJeremy L Thompson       CeedDebug(ceed, "output:\n%s", output.c_str());
303fa619eccSJeremy L Thompson       CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to detect Rust LLVM version\ncommand:\n$ %s\nerror:\n%s",
304fa619eccSJeremy L Thompson                 command.c_str(), output.c_str());
305fa619eccSJeremy L Thompson 
306fa619eccSJeremy L Thompson       const char *version_substring = strstr(output.c_str(), "LLVM version ");
307fa619eccSJeremy L Thompson 
308fa619eccSJeremy L Thompson       version_substring += 13;
309fa619eccSJeremy L Thompson 
310fa619eccSJeremy L Thompson       char *next_dot = strchr((char *)version_substring, '.');
311fa619eccSJeremy L Thompson 
312f03c7eeeSJeremy L Thompson       if (next_dot) {
313fa619eccSJeremy L Thompson         next_dot[0]             = '\0';
314fa619eccSJeremy L Thompson         ceed_data->llvm_version = llvm_version = std::stoi(version_substring);
315fa619eccSJeremy L Thompson         CeedDebug(ceed, "Rust LLVM version number: %d\n", llvm_version);
316fa619eccSJeremy L Thompson 
317fa619eccSJeremy L Thompson         command                     = std::string("clang++-") + std::to_string(llvm_version);
318fa619eccSJeremy L Thompson         output_stream               = popen((command + std::string(" 2>&1")).c_str(), "r");
319fa619eccSJeremy L Thompson         ceed_data->use_llvm_version = use_llvm_version = pclose(output_stream) == 0;
320f03c7eeeSJeremy L Thompson       } else {
321f03c7eeeSJeremy L Thompson         ceed_data->llvm_version     = -1;
322f03c7eeeSJeremy L Thompson         ceed_data->use_llvm_version = use_llvm_version = false;
323f03c7eeeSJeremy L Thompson       }
324fa619eccSJeremy L Thompson     }
325fa619eccSJeremy L Thompson 
3262027fb9dSSirAlienTheGreat     // Compile wrapper kernel
327fa619eccSJeremy L Thompson     command = "clang++" + (use_llvm_version ? (std::string("-") + std::to_string(llvm_version)) : "") + " -flto=thin --cuda-gpu-arch=sm_" +
328fa619eccSJeremy L Thompson               std::to_string(prop.major) + std::to_string(prop.minor) + " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) +
329fa619eccSJeremy L Thompson               "_0_source.cu -o temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll ";
3302027fb9dSSirAlienTheGreat     command += opts[4];
3312027fb9dSSirAlienTheGreat     CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
33233cc410dSJeremy L Thompson     CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_1_wrapped.ll").c_str(), "update JiT file permissions");
3332027fb9dSSirAlienTheGreat 
334f03c7eeeSJeremy L Thompson     // Find Rust's llvm-link tool and run it
3359b5f41c8SJeremy L Thompson     command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
3369b5f41c8SJeremy L Thompson               std::to_string(build_id) +
3379b5f41c8SJeremy L Thompson               "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o "
3389b5f41c8SJeremy L Thompson               "temp/kernel_" +
3399b5f41c8SJeremy L Thompson               std::to_string(build_id) + "_2_linked.ll ";
3402027fb9dSSirAlienTheGreat 
341f03c7eeeSJeremy L Thompson     // Searches for .a files in Rust directory
342f03c7eeeSJeremy L Thompson     // Note: Rust crate names may not match the folder they are in
343f03c7eeeSJeremy L Thompson     // TODO: If libCEED switches to c++17, use std::filesystem here
3442027fb9dSSirAlienTheGreat     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
3452027fb9dSSirAlienTheGreat       std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release";
3462027fb9dSSirAlienTheGreat       DIR        *dp  = opendir(dir.c_str());
3472027fb9dSSirAlienTheGreat 
3482027fb9dSSirAlienTheGreat       CeedCheck(dp != nullptr, ceed, CEED_ERROR_BACKEND, "Could not open directory: %s", dir.c_str());
3492027fb9dSSirAlienTheGreat       struct dirent *entry;
3502027fb9dSSirAlienTheGreat 
351fa619eccSJeremy L Thompson       // Find files ending in .a
3522027fb9dSSirAlienTheGreat       while ((entry = readdir(dp)) != nullptr) {
3532027fb9dSSirAlienTheGreat         std::string filename(entry->d_name);
3542027fb9dSSirAlienTheGreat 
3552027fb9dSSirAlienTheGreat         if (filename.size() >= 2 && filename.substr(filename.size() - 2) == ".a") {
3562027fb9dSSirAlienTheGreat           command += dir + "/" + filename + " ";
3572027fb9dSSirAlienTheGreat         }
3582027fb9dSSirAlienTheGreat       }
3592027fb9dSSirAlienTheGreat       closedir(dp);
3602027fb9dSSirAlienTheGreat     }
3612027fb9dSSirAlienTheGreat 
3622027fb9dSSirAlienTheGreat     // Link, optimize, and compile final CUDA kernel
3632027fb9dSSirAlienTheGreat     CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
364*1a8516d0SJames Wright     CeedCallSystem(ceed,
365*1a8516d0SJames Wright                    ("$(find $(rustup run " + std::string(rust_toolchain) +
366*1a8516d0SJames Wright                     " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" + std::to_string(build_id) +
367*1a8516d0SJames Wright                     "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
3682027fb9dSSirAlienTheGreat                        .c_str(),
3692027fb9dSSirAlienTheGreat                    "optimize linked C and Rust source");
37033cc410dSJeremy L Thompson     CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_2_linked.ll").c_str(), "update JiT file permissions");
3712027fb9dSSirAlienTheGreat     CeedCallSystem(ceed,
3722027fb9dSSirAlienTheGreat                    ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
3739b5f41c8SJeremy L Thompson                     std::to_string(prop.major) + std::to_string(prop.minor) + " temp/kernel_" + std::to_string(build_id) +
3749b5f41c8SJeremy L Thompson                     "_3_opt.bc -o temp/kernel_" + std::to_string(build_id) + "_4_final.ptx")
3752027fb9dSSirAlienTheGreat                        .c_str(),
3762027fb9dSSirAlienTheGreat                    "compile final CUDA kernel");
37733cc410dSJeremy L Thompson     CeedCallSystem(ceed, ("chmod 0777 temp/kernel_" + std::to_string(build_id) + "_4_final.ptx").c_str(), "update JiT file permissions");
3782027fb9dSSirAlienTheGreat 
379f03c7eeeSJeremy L Thompson     // Load module from final PTX
3809b5f41c8SJeremy L Thompson     ifstream      ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx");
3812027fb9dSSirAlienTheGreat     ostringstream sstr;
3822027fb9dSSirAlienTheGreat 
3832027fb9dSSirAlienTheGreat     sstr << ptxfile.rdbuf();
3842027fb9dSSirAlienTheGreat 
3852027fb9dSSirAlienTheGreat     auto ptx_data = sstr.str();
3862027fb9dSSirAlienTheGreat     ptx_size      = ptx_data.length();
3872027fb9dSSirAlienTheGreat 
3882027fb9dSSirAlienTheGreat     int result = cuModuleLoadData(module, ptx_data.c_str());
3892027fb9dSSirAlienTheGreat 
3902027fb9dSSirAlienTheGreat     *is_compile_good = result == 0;
3912027fb9dSSirAlienTheGreat     if (!*is_compile_good) {
3922027fb9dSSirAlienTheGreat       if (throw_error) {
3932027fb9dSSirAlienTheGreat         return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to load module data");
3942027fb9dSSirAlienTheGreat       } else {
3952027fb9dSSirAlienTheGreat         // LCOV_EXCL_START
3962027fb9dSSirAlienTheGreat         CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
3972027fb9dSSirAlienTheGreat         CeedDebug(ceed, "Error: Failed to load module data");
3982027fb9dSSirAlienTheGreat         CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
3992027fb9dSSirAlienTheGreat         return CEED_ERROR_SUCCESS;
4002027fb9dSSirAlienTheGreat         // LCOV_EXCL_STOP
4012027fb9dSSirAlienTheGreat       }
4022027fb9dSSirAlienTheGreat     }
4032027fb9dSSirAlienTheGreat   }
4042027fb9dSSirAlienTheGreat   return CEED_ERROR_SUCCESS;
405c9c2c079SJeremy L Thompson }
406c9c2c079SJeremy L Thompson 
407ddae5012SJeremy L Thompson int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
408ddae5012SJeremy L Thompson   bool    is_compile_good = true;
409ddae5012SJeremy L Thompson   va_list args;
410ddae5012SJeremy L Thompson 
411ddae5012SJeremy L Thompson   va_start(args, num_defines);
41218c38aeeSJeremy L Thompson   const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args);
41318c38aeeSJeremy L Thompson 
414ddae5012SJeremy L Thompson   va_end(args);
41518c38aeeSJeremy L Thompson   CeedCallBackend(ierr);
416ddae5012SJeremy L Thompson   return CEED_ERROR_SUCCESS;
417ddae5012SJeremy L Thompson }
418ddae5012SJeremy L Thompson 
419ddae5012SJeremy L Thompson int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) {
420ddae5012SJeremy L Thompson   va_list args;
421ddae5012SJeremy L Thompson 
422ddae5012SJeremy L Thompson   va_start(args, num_defines);
42318c38aeeSJeremy L Thompson   const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args);
42418c38aeeSJeremy L Thompson 
425ddae5012SJeremy L Thompson   va_end(args);
42618c38aeeSJeremy L Thompson   CeedCallBackend(ierr);
427ddae5012SJeremy L Thompson   return CEED_ERROR_SUCCESS;
428ddae5012SJeremy L Thompson }
429ddae5012SJeremy L Thompson 
430c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
431c9c2c079SJeremy L Thompson // Get CUDA kernel
432c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
433eb7e6cafSJeremy L Thompson int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel) {
4342b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleGetFunction(kernel, module, name));
435c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
436c9c2c079SJeremy L Thompson }
437c9c2c079SJeremy L Thompson 
438b2165e7aSSebastian Grimberg //------------------------------------------------------------------------------
439b2165e7aSSebastian Grimberg // Run CUDA kernel with block size selected automatically based on the kernel
440b2165e7aSSebastian Grimberg //     (which may use enough registers to require a smaller block size than the
441b2165e7aSSebastian Grimberg //      hardware is capable)
442b2165e7aSSebastian Grimberg //------------------------------------------------------------------------------
4432b730f8bSJeremy L Thompson int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void **args) {
444c9c2c079SJeremy L Thompson   int min_grid_size, max_block_size;
445ca735530SJeremy L Thompson 
4462b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, kernel, NULL, 0, 0x10000));
447eb7e6cafSJeremy L Thompson   CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args));
448ca735530SJeremy L Thompson   return CEED_ERROR_SUCCESS;
449c9c2c079SJeremy L Thompson }
450c9c2c079SJeremy L Thompson 
451c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
452c9c2c079SJeremy L Thompson // Run CUDA kernel
453c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
454eb7e6cafSJeremy L Thompson int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) {
455e9c76bddSJeremy L Thompson   CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args));
456c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
457c9c2c079SJeremy L Thompson }
458c9c2c079SJeremy L Thompson 
459c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
460c9c2c079SJeremy L Thompson // Run CUDA kernel for spatial dimension
461c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
462eb7e6cafSJeremy L Thompson int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z,
4632b730f8bSJeremy L Thompson                           void **args) {
464e9c76bddSJeremy L Thompson   CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
465c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
466c9c2c079SJeremy L Thompson }
467c9c2c079SJeremy L Thompson 
468c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
469ea61e9acSJeremy L Thompson // Run CUDA kernel for spatial dimension with shared memory
470c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
471e9c76bddSJeremy L Thompson static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x,
472e9c76bddSJeremy L Thompson                                            const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
473e9c76bddSJeremy L Thompson                                            bool *is_good_run, void **args) {
474023b8a51Sabdelfattah83 #if CUDA_VERSION >= 9000
475023b8a51Sabdelfattah83   cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
476023b8a51Sabdelfattah83 #endif
477e9c76bddSJeremy L Thompson   CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
478ca735530SJeremy L Thompson 
479c9c2c079SJeremy L Thompson   if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
480c9c2c079SJeremy L Thompson     int max_threads_per_block, shared_size_bytes, num_regs;
481ca735530SJeremy L Thompson 
4822b730f8bSJeremy L Thompson     cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
4832b730f8bSJeremy L Thompson     cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
484c9c2c079SJeremy L Thompson     cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
485c49dc7a7SJeremy L Thompson     if (throw_error) {
486c9c2c079SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
487c9c2c079SJeremy L Thompson                        "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
4882b730f8bSJeremy L Thompson                        max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
489c49dc7a7SJeremy L Thompson     } else {
490c49dc7a7SJeremy L Thompson       // LCOV_EXCL_START
491c49dc7a7SJeremy L Thompson       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
492c49dc7a7SJeremy L Thompson       CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
493c49dc7a7SJeremy L Thompson                 max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
494c21e34e2SJeremy L Thompson       CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
495c49dc7a7SJeremy L Thompson       // LCOV_EXCL_STOP
496ddae5012SJeremy L Thompson     }
497c49dc7a7SJeremy L Thompson     *is_good_run = false;
498c9c2c079SJeremy L Thompson   } else CeedChk_Cu(ceed, result);
499c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
500c9c2c079SJeremy L Thompson }
5012a86cc9dSSebastian Grimberg 
502e9c76bddSJeremy L Thompson int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
503ddae5012SJeremy L Thompson                                 const int block_size_z, const int shared_mem_size, void **args) {
504ddae5012SJeremy L Thompson   bool is_good_run = true;
505ddae5012SJeremy L Thompson 
506e9c76bddSJeremy L Thompson   CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
507e9c76bddSJeremy L Thompson                                                   &is_good_run, args));
508ddae5012SJeremy L Thompson   return CEED_ERROR_SUCCESS;
509ddae5012SJeremy L Thompson }
510ddae5012SJeremy L Thompson 
511e9c76bddSJeremy L Thompson int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
512ddae5012SJeremy L Thompson                                    const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
513e9c76bddSJeremy L Thompson   CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
514e9c76bddSJeremy L Thompson                                                   is_good_run, args));
515ddae5012SJeremy L Thompson   return CEED_ERROR_SUCCESS;
516ddae5012SJeremy L Thompson }
517ddae5012SJeremy L Thompson 
5182a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------
519