xref: /libCEED/rust/libceed-sys/c-src/backends/cuda/ceed-cuda-compile.cpp (revision 9b5f41c81b637db3e5453a22df59a4f47deed499)
1d275d636SJeremy L Thompson // Copyright (c) 2017-2025, Lawrence Livermore National Security, LLC and other CEED contributors.
2c9c2c079SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3c9c2c079SJeremy L Thompson //
4c9c2c079SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5c9c2c079SJeremy L Thompson //
6c9c2c079SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7c9c2c079SJeremy L Thompson 
82b730f8bSJeremy L Thompson #include "ceed-cuda-compile.h"
92b730f8bSJeremy L Thompson 
1049aac155SJeremy L Thompson #include <ceed.h>
11c9c2c079SJeremy L Thompson #include <ceed/backend.h>
12c9c2c079SJeremy L Thompson #include <ceed/jit-tools.h>
13c9c2c079SJeremy L Thompson #include <cuda_runtime.h>
142027fb9dSSirAlienTheGreat #include <dirent.h>
15c9c2c079SJeremy L Thompson #include <nvrtc.h>
16c9c2c079SJeremy L Thompson #include <stdarg.h>
17c9c2c079SJeremy L Thompson #include <string.h>
18*9b5f41c8SJeremy L Thompson #include <sys/stat.h>
192027fb9dSSirAlienTheGreat #include <sys/types.h>
202b730f8bSJeremy L Thompson 
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
262b730f8bSJeremy L Thompson 
27c9c2c079SJeremy L Thompson #include "ceed-cuda-common.h"
28c9c2c079SJeremy L Thompson 
// Check an NVRTC status code.
//   ceed: Ceed context used for error reporting
//   x:    expression convertible to nvrtcResult
// If the status is not NVRTC_SUCCESS, the enclosing function returns a CEED_ERROR_BACKEND error whose text is the NVRTC error string.
// The error string is passed through "%s" so it is never interpreted as a printf-style format.
#define CeedChk_Nvrtc(ceed, x)                                                                                    \
  do {                                                                                                            \
    nvrtcResult result = static_cast<nvrtcResult>(x);                                                             \
    if (result != NVRTC_SUCCESS) return CeedError((ceed), CEED_ERROR_BACKEND, "%s", nvrtcGetErrorString(result)); \
  } while (0)
34c9c2c079SJeremy L Thompson 
// Run an NVRTC call and check its status.
// The call expression is evaluated exactly once into a local, which is then handed to CeedChk_Nvrtc;
//   on failure the enclosing function returns the resulting backend error.
#define CeedCallNvrtc(ceed, ...)           \
  do {                                     \
    const int nvrtc_ierr_ = __VA_ARGS__;   \
    CeedChk_Nvrtc(ceed, nvrtc_ierr_);      \
  } while (0)
402b730f8bSJeremy L Thompson 
412027fb9dSSirAlienTheGreat #define CeedCallSystem(ceed, command, message) CeedCallBackend(CeedCallSystem_Core(ceed, command, message))
422027fb9dSSirAlienTheGreat 
432027fb9dSSirAlienTheGreat //------------------------------------------------------------------------------
442027fb9dSSirAlienTheGreat // Call system command and capture stdout + stderr
452027fb9dSSirAlienTheGreat //------------------------------------------------------------------------------
462027fb9dSSirAlienTheGreat static int CeedCallSystem_Core(Ceed ceed, const char *command, const char *message) {
472027fb9dSSirAlienTheGreat   CeedDebug(ceed, "Running command:\n$ %s\n", command);
482027fb9dSSirAlienTheGreat   FILE *output_stream = popen((command + std::string(" 2>&1")).c_str(), "r");
492027fb9dSSirAlienTheGreat 
502027fb9dSSirAlienTheGreat   CeedCheck(output_stream != nullptr, ceed, CEED_ERROR_BACKEND, "Failed to %s with command: %s", message, command);
512027fb9dSSirAlienTheGreat 
522027fb9dSSirAlienTheGreat   char output[4 * CEED_MAX_RESOURCE_LEN];
532027fb9dSSirAlienTheGreat 
542027fb9dSSirAlienTheGreat   while (fgets(output, sizeof(output), output_stream) != nullptr) {
552027fb9dSSirAlienTheGreat   }
562027fb9dSSirAlienTheGreat   CeedDebug(ceed, "Command output:\n%s\n", output);
572027fb9dSSirAlienTheGreat 
58*9b5f41c8SJeremy L Thompson   CeedCheck(pclose(output_stream) == 0, ceed, CEED_ERROR_BACKEND, "Failed to %s with command: %s\nand error: %s", message, command, output);
592027fb9dSSirAlienTheGreat   return CEED_ERROR_SUCCESS;
602027fb9dSSirAlienTheGreat }
612027fb9dSSirAlienTheGreat 
62c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
63c9c2c079SJeremy L Thompson // Compile CUDA kernel
64c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
652027fb9dSSirAlienTheGreat using std::ifstream;
662027fb9dSSirAlienTheGreat using std::ofstream;
672027fb9dSSirAlienTheGreat using std::ostringstream;
682027fb9dSSirAlienTheGreat 
69ddae5012SJeremy L Thompson static int CeedCompileCore_Cuda(Ceed ceed, const char *source, const bool throw_error, bool *is_compile_good, CUmodule *module,
70ddae5012SJeremy L Thompson                                 const CeedInt num_defines, va_list args) {
71ca735530SJeremy L Thompson   size_t                ptx_size;
7222070f95SJeremy L Thompson   char                 *ptx;
73a491a57eSJeremy L Thompson   const int             num_opts            = 4;
744753b775SJeremy L Thompson   CeedInt               num_jit_source_dirs = 0, num_jit_defines = 0;
75b13efd58SJeremy L Thompson   const char          **opts;
76c9c2c079SJeremy L Thompson   nvrtcProgram          prog;
77ca735530SJeremy L Thompson   struct cudaDeviceProp prop;
78ca735530SJeremy L Thompson   Ceed_Cuda            *ceed_data;
79ca735530SJeremy L Thompson 
80ca735530SJeremy L Thompson   cudaFree(0);  // Make sure a Context exists for nvrtc
81c9c2c079SJeremy L Thompson 
82c9c2c079SJeremy L Thompson   std::ostringstream code;
832027fb9dSSirAlienTheGreat   bool               using_clang;
842027fb9dSSirAlienTheGreat 
852027fb9dSSirAlienTheGreat   CeedCallBackend(CeedGetIsClang(ceed, &using_clang));
862027fb9dSSirAlienTheGreat 
872027fb9dSSirAlienTheGreat   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS,
882027fb9dSSirAlienTheGreat                using_clang ? "Compiling CUDA with Clang backend (with Rust QFunction support)"
892027fb9dSSirAlienTheGreat                            : "Compiling CUDA with NVRTC backend (without Rust QFunction support).\nTo use the Clang backend, set the environment "
902027fb9dSSirAlienTheGreat                              "variable GPU_CLANG=1");
91c9c2c079SJeremy L Thompson 
92c9c2c079SJeremy L Thompson   // Get kernel specific options, such as kernel constants
93c9c2c079SJeremy L Thompson   if (num_defines > 0) {
94c9c2c079SJeremy L Thompson     char *name;
95c9c2c079SJeremy L Thompson     int   val;
96ca735530SJeremy L Thompson 
97c9c2c079SJeremy L Thompson     for (int i = 0; i < num_defines; i++) {
98c9c2c079SJeremy L Thompson       name = va_arg(args, char *);
99c9c2c079SJeremy L Thompson       val  = va_arg(args, int);
100c9c2c079SJeremy L Thompson       code << "#define " << name << " " << val << "\n";
101c9c2c079SJeremy L Thompson     }
102c9c2c079SJeremy L Thompson   }
103c9c2c079SJeremy L Thompson 
104c9c2c079SJeremy L Thompson   // Standard libCEED definitions for CUDA backends
10591adc9c8SJeremy L Thompson   code << "#include <ceed/jit-source/cuda/cuda-jit.h>\n\n";
106c9c2c079SJeremy L Thompson 
107c9c2c079SJeremy L Thompson   // Non-macro options
108b13efd58SJeremy L Thompson   CeedCallBackend(CeedCalloc(num_opts, &opts));
109c9c2c079SJeremy L Thompson   opts[0] = "-default-device";
1102b730f8bSJeremy L Thompson   CeedCallBackend(CeedGetData(ceed, &ceed_data));
1112b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cudaGetDeviceProperties(&prop, ceed_data->device_id));
11229ec485eSJed Brown   std::string arch_arg =
11329ec485eSJed Brown #if CUDA_VERSION >= 11010
11429ec485eSJed Brown       // NVRTC used to support only virtual architectures through the option
11529ec485eSJed Brown       // -arch, since it was only emitting PTX. It will now support actual
11629ec485eSJed Brown       // architectures as well to emit SASS.
11729ec485eSJed Brown       // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#dynamic-code-generation
11829ec485eSJed Brown       "-arch=sm_"
11929ec485eSJed Brown #else
12029ec485eSJed Brown       "-arch=compute_"
12129ec485eSJed Brown #endif
12229ec485eSJed Brown       + std::to_string(prop.major) + std::to_string(prop.minor);
123c9c2c079SJeremy L Thompson   opts[1] = arch_arg.c_str();
124c9c2c079SJeremy L Thompson   opts[2] = "-Dint32_t=int";
125a491a57eSJeremy L Thompson   opts[3] = "-DCEED_RUNNING_JIT_PASS=1";
1264753b775SJeremy L Thompson   // Additional include dirs
127b13efd58SJeremy L Thompson   {
128b13efd58SJeremy L Thompson     const char **jit_source_dirs;
129b13efd58SJeremy L Thompson 
130b13efd58SJeremy L Thompson     CeedCallBackend(CeedGetJitSourceRoots(ceed, &num_jit_source_dirs, &jit_source_dirs));
131b13efd58SJeremy L Thompson     CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs, &opts));
132b13efd58SJeremy L Thompson     for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
1334753b775SJeremy L Thompson       std::ostringstream include_dir_arg;
134b13efd58SJeremy L Thompson 
1354753b775SJeremy L Thompson       include_dir_arg << "-I" << jit_source_dirs[i];
1364753b775SJeremy L Thompson       CeedCallBackend(CeedStringAllocCopy(include_dir_arg.str().c_str(), (char **)&opts[num_opts + i]));
137b13efd58SJeremy L Thompson     }
138b13efd58SJeremy L Thompson     CeedCallBackend(CeedRestoreJitSourceRoots(ceed, &jit_source_dirs));
139b13efd58SJeremy L Thompson   }
1404753b775SJeremy L Thompson   // User defines
1414753b775SJeremy L Thompson   {
1424753b775SJeremy L Thompson     const char **jit_defines;
1434753b775SJeremy L Thompson 
1444753b775SJeremy L Thompson     CeedCallBackend(CeedGetJitDefines(ceed, &num_jit_defines, &jit_defines));
1454753b775SJeremy L Thompson     CeedCallBackend(CeedRealloc(num_opts + num_jit_source_dirs + num_jit_defines, &opts));
1464753b775SJeremy L Thompson     for (CeedInt i = 0; i < num_jit_defines; i++) {
1474753b775SJeremy L Thompson       std::ostringstream define_arg;
1484753b775SJeremy L Thompson 
1494753b775SJeremy L Thompson       define_arg << "-D" << jit_defines[i];
1504753b775SJeremy L Thompson       CeedCallBackend(CeedStringAllocCopy(define_arg.str().c_str(), (char **)&opts[num_opts + num_jit_source_dirs + i]));
1514753b775SJeremy L Thompson     }
1524753b775SJeremy L Thompson     CeedCallBackend(CeedRestoreJitDefines(ceed, &jit_defines));
1534753b775SJeremy L Thompson   }
154c9c2c079SJeremy L Thompson 
155c9c2c079SJeremy L Thompson   // Add string source argument provided in call
156c9c2c079SJeremy L Thompson   code << source;
157c9c2c079SJeremy L Thompson 
158c9c2c079SJeremy L Thompson   // Compile kernel
159c21e34e2SJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- ATTEMPTING TO COMPILE JIT SOURCE ----------\n");
16026ef7cdaSJeremy L Thompson   CeedDebug(ceed, "Source:\n%s\n", code.str().c_str());
161c21e34e2SJeremy L Thompson   CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JIT SOURCE ----------\n");
1622027fb9dSSirAlienTheGreat 
1632027fb9dSSirAlienTheGreat   if (!using_clang) {
1642027fb9dSSirAlienTheGreat     CeedCallNvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
1652027fb9dSSirAlienTheGreat 
166bdcc2728SJeremy L Thompson     if (CeedDebugFlag(ceed)) {
167bdcc2728SJeremy L Thompson       // LCOV_EXCL_START
168c21e34e2SJeremy L Thompson       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- JiT COMPILER OPTIONS ----------\n");
169bdcc2728SJeremy L Thompson       for (CeedInt i = 0; i < num_opts + num_jit_source_dirs + num_jit_defines; i++) {
170bdcc2728SJeremy L Thompson         CeedDebug(ceed, "Option %d: %s", i, opts[i]);
171bdcc2728SJeremy L Thompson       }
172bdcc2728SJeremy L Thompson       CeedDebug(ceed, "");
173c21e34e2SJeremy L Thompson       CeedDebug256(ceed, CEED_DEBUG_COLOR_SUCCESS, "---------- END OF JiT COMPILER OPTIONS ----------\n");
174bdcc2728SJeremy L Thompson       // LCOV_EXCL_STOP
175bdcc2728SJeremy L Thompson     }
1762027fb9dSSirAlienTheGreat 
1774753b775SJeremy L Thompson     nvrtcResult result = nvrtcCompileProgram(prog, num_opts + num_jit_source_dirs + num_jit_defines, opts);
178ca735530SJeremy L Thompson 
179b13efd58SJeremy L Thompson     for (CeedInt i = 0; i < num_jit_source_dirs; i++) {
180b13efd58SJeremy L Thompson       CeedCallBackend(CeedFree(&opts[num_opts + i]));
181b13efd58SJeremy L Thompson     }
1824753b775SJeremy L Thompson     for (CeedInt i = 0; i < num_jit_defines; i++) {
1834753b775SJeremy L Thompson       CeedCallBackend(CeedFree(&opts[num_opts + num_jit_source_dirs + i]));
1844753b775SJeremy L Thompson     }
185b13efd58SJeremy L Thompson     CeedCallBackend(CeedFree(&opts));
186ddae5012SJeremy L Thompson     *is_compile_good = result == NVRTC_SUCCESS;
18728c1f747SJeremy L Thompson     if (!*is_compile_good) {
188c9c2c079SJeremy L Thompson       char  *log;
189ca735530SJeremy L Thompson       size_t log_size;
190ca735530SJeremy L Thompson 
191ca735530SJeremy L Thompson       CeedCallNvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
1922b730f8bSJeremy L Thompson       CeedCallBackend(CeedMalloc(log_size, &log));
1932b730f8bSJeremy L Thompson       CeedCallNvrtc(ceed, nvrtcGetProgramLog(prog, log));
19428c1f747SJeremy L Thompson       if (throw_error) {
1952b730f8bSJeremy L Thompson         return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s", nvrtcGetErrorString(result), log);
19628c1f747SJeremy L Thompson       } else {
197c49dc7a7SJeremy L Thompson         // LCOV_EXCL_START
19828c1f747SJeremy L Thompson         CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
19928c1f747SJeremy L Thompson         CeedDebug(ceed, "Error: %s\nCompile log:\n%s\n", nvrtcGetErrorString(result), log);
2002027fb9dSSirAlienTheGreat         CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
20128c1f747SJeremy L Thompson         CeedCallBackend(CeedFree(&log));
20228c1f747SJeremy L Thompson         CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
20328c1f747SJeremy L Thompson         return CEED_ERROR_SUCCESS;
204c49dc7a7SJeremy L Thompson         // LCOV_EXCL_STOP
20528c1f747SJeremy L Thompson       }
206c9c2c079SJeremy L Thompson     }
207c9c2c079SJeremy L Thompson 
20829ec485eSJed Brown #if CUDA_VERSION >= 11010
20929ec485eSJed Brown     CeedCallNvrtc(ceed, nvrtcGetCUBINSize(prog, &ptx_size));
21029ec485eSJed Brown     CeedCallBackend(CeedMalloc(ptx_size, &ptx));
21129ec485eSJed Brown     CeedCallNvrtc(ceed, nvrtcGetCUBIN(prog, ptx));
21229ec485eSJed Brown #else
2132b730f8bSJeremy L Thompson     CeedCallNvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
2142b730f8bSJeremy L Thompson     CeedCallBackend(CeedMalloc(ptx_size, &ptx));
2152b730f8bSJeremy L Thompson     CeedCallNvrtc(ceed, nvrtcGetPTX(prog, ptx));
21629ec485eSJed Brown #endif
2172b730f8bSJeremy L Thompson     CeedCallNvrtc(ceed, nvrtcDestroyProgram(&prog));
218c9c2c079SJeremy L Thompson 
2192b730f8bSJeremy L Thompson     CeedCallCuda(ceed, cuModuleLoadData(module, ptx));
2202b730f8bSJeremy L Thompson     CeedCallBackend(CeedFree(&ptx));
221c9c2c079SJeremy L Thompson     return CEED_ERROR_SUCCESS;
2222027fb9dSSirAlienTheGreat   } else {
223*9b5f41c8SJeremy L Thompson     srand(time(NULL));
224*9b5f41c8SJeremy L Thompson     const int build_id = rand();
2252027fb9dSSirAlienTheGreat 
226*9b5f41c8SJeremy L Thompson     // Create temp dir if needed
227*9b5f41c8SJeremy L Thompson     {
228*9b5f41c8SJeremy L Thompson       DIR *dir = opendir("temp");
229*9b5f41c8SJeremy L Thompson 
230*9b5f41c8SJeremy L Thompson       if (dir) {
231*9b5f41c8SJeremy L Thompson         closedir(dir);
232*9b5f41c8SJeremy L Thompson       } else {
233*9b5f41c8SJeremy L Thompson         mkdir("temp", 0777);
234*9b5f41c8SJeremy L Thompson         chmod("temp", 0777);
235*9b5f41c8SJeremy L Thompson       }
236*9b5f41c8SJeremy L Thompson     }
237*9b5f41c8SJeremy L Thompson     // Write code to temp file
238*9b5f41c8SJeremy L Thompson     {
239*9b5f41c8SJeremy L Thompson       std::string filename = std::string("temp/kernel_") + std::to_string(build_id) + std::string("_0_source.cu");
240*9b5f41c8SJeremy L Thompson       FILE       *file     = fopen(filename.c_str(), "w");
241*9b5f41c8SJeremy L Thompson 
242*9b5f41c8SJeremy L Thompson       CeedCheck(file, ceed, CEED_ERROR_BACKEND, "Failed to create file. Write access is required for cuda-clang");
2432027fb9dSSirAlienTheGreat       fputs(code.str().c_str(), file);
2442027fb9dSSirAlienTheGreat       fclose(file);
245*9b5f41c8SJeremy L Thompson     }
2462027fb9dSSirAlienTheGreat 
2472027fb9dSSirAlienTheGreat     // Get rust crate directories
2482027fb9dSSirAlienTheGreat     const char **rust_source_dirs     = nullptr;
2492027fb9dSSirAlienTheGreat     int          num_rust_source_dirs = 0;
2502027fb9dSSirAlienTheGreat 
2512027fb9dSSirAlienTheGreat     CeedCallBackend(CeedGetRustSourceRoots(ceed, &num_rust_source_dirs, &rust_source_dirs));
2522027fb9dSSirAlienTheGreat 
2532027fb9dSSirAlienTheGreat     std::string rust_dirs[10];
2542027fb9dSSirAlienTheGreat 
2552027fb9dSSirAlienTheGreat     if (num_rust_source_dirs > 0) {
2562027fb9dSSirAlienTheGreat       CeedDebug(ceed, "There are %d source dirs, including %s\n", num_rust_source_dirs, rust_source_dirs[0]);
2572027fb9dSSirAlienTheGreat     }
2582027fb9dSSirAlienTheGreat 
2592027fb9dSSirAlienTheGreat     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
2602027fb9dSSirAlienTheGreat       rust_dirs[i] = std::string(rust_source_dirs[i]);
2612027fb9dSSirAlienTheGreat     }
2622027fb9dSSirAlienTheGreat 
2632027fb9dSSirAlienTheGreat     CeedCallBackend(CeedRestoreRustSourceRoots(ceed, &rust_source_dirs));
2642027fb9dSSirAlienTheGreat 
2652027fb9dSSirAlienTheGreat     char *rust_toolchain = std::getenv("RUST_TOOLCHAIN");
2662027fb9dSSirAlienTheGreat 
2672027fb9dSSirAlienTheGreat     if (rust_toolchain == nullptr) {
2682027fb9dSSirAlienTheGreat       rust_toolchain = (char *)"nightly";
2692027fb9dSSirAlienTheGreat       setenv("RUST_TOOLCHAIN", "nightly", 0);
2702027fb9dSSirAlienTheGreat     }
2712027fb9dSSirAlienTheGreat 
2722027fb9dSSirAlienTheGreat     // Compile Rust crate(s) needed
2732027fb9dSSirAlienTheGreat     std::string command;
2742027fb9dSSirAlienTheGreat 
2752027fb9dSSirAlienTheGreat     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
2762027fb9dSSirAlienTheGreat       command = "cargo +" + std::string(rust_toolchain) + " build --release --target nvptx64-nvidia-cuda --config " + rust_dirs[i] +
2772027fb9dSSirAlienTheGreat                 "/.cargo/config.toml --manifest-path " + rust_dirs[i] + "/Cargo.toml";
2782027fb9dSSirAlienTheGreat       CeedCallSystem(ceed, command.c_str(), "build Rust crate");
2792027fb9dSSirAlienTheGreat     }
2802027fb9dSSirAlienTheGreat 
2812027fb9dSSirAlienTheGreat     // Compile wrapper kernel
2822027fb9dSSirAlienTheGreat     command = "clang++ -flto=thin --cuda-gpu-arch=sm_" + std::to_string(prop.major) + std::to_string(prop.minor) +
283*9b5f41c8SJeremy L Thompson               " --cuda-device-only -emit-llvm -S temp/kernel_" + std::to_string(build_id) + "_0_source.cu -o temp/kernel_" +
284*9b5f41c8SJeremy L Thompson               std::to_string(build_id) + "_1_wrapped.ll ";
2852027fb9dSSirAlienTheGreat     command += opts[4];
2862027fb9dSSirAlienTheGreat     CeedCallSystem(ceed, command.c_str(), "JiT kernel source");
2872027fb9dSSirAlienTheGreat 
2882027fb9dSSirAlienTheGreat     // the find command finds the rust-installed llvm-link tool and runs it
289*9b5f41c8SJeremy L Thompson     command = "$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llvm-link) temp/kernel_" +
290*9b5f41c8SJeremy L Thompson               std::to_string(build_id) +
291*9b5f41c8SJeremy L Thompson               "_1_wrapped.ll --ignore-non-bitcode --internalize --only-needed -S -o "
292*9b5f41c8SJeremy L Thompson               "temp/kernel_" +
293*9b5f41c8SJeremy L Thompson               std::to_string(build_id) + "_2_linked.ll ";
2942027fb9dSSirAlienTheGreat 
2952027fb9dSSirAlienTheGreat     // Searches for .a files in rust directoy
2962027fb9dSSirAlienTheGreat     // Note: this is necessary because rust crate names may not match the folder they are in
2972027fb9dSSirAlienTheGreat     for (CeedInt i = 0; i < num_rust_source_dirs; i++) {
2982027fb9dSSirAlienTheGreat       std::string dir = rust_dirs[i] + "/target/nvptx64-nvidia-cuda/release";
2992027fb9dSSirAlienTheGreat       DIR        *dp  = opendir(dir.c_str());
3002027fb9dSSirAlienTheGreat 
3012027fb9dSSirAlienTheGreat       CeedCheck(dp != nullptr, ceed, CEED_ERROR_BACKEND, "Could not open directory: %s", dir.c_str());
3022027fb9dSSirAlienTheGreat       struct dirent *entry;
3032027fb9dSSirAlienTheGreat 
3042027fb9dSSirAlienTheGreat       // finds files ending in .a
3052027fb9dSSirAlienTheGreat       while ((entry = readdir(dp)) != nullptr) {
3062027fb9dSSirAlienTheGreat         std::string filename(entry->d_name);
3072027fb9dSSirAlienTheGreat 
3082027fb9dSSirAlienTheGreat         if (filename.size() >= 2 && filename.substr(filename.size() - 2) == ".a") {
3092027fb9dSSirAlienTheGreat           command += dir + "/" + filename + " ";
3102027fb9dSSirAlienTheGreat         }
3112027fb9dSSirAlienTheGreat       }
3122027fb9dSSirAlienTheGreat       closedir(dp);
3132027fb9dSSirAlienTheGreat       // TODO: when libCEED switches to c++17, switch to std::filesystem for the loop above
3142027fb9dSSirAlienTheGreat     }
3152027fb9dSSirAlienTheGreat 
3162027fb9dSSirAlienTheGreat     // Link, optimize, and compile final CUDA kernel
3172027fb9dSSirAlienTheGreat     // note that the find command is used to find the rust-installed llvm tool
3182027fb9dSSirAlienTheGreat     CeedCallSystem(ceed, command.c_str(), "link C and Rust source");
319*9b5f41c8SJeremy L Thompson     CeedCallSystem(
320*9b5f41c8SJeremy L Thompson         ceed,
321*9b5f41c8SJeremy L Thompson         ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name opt) --passes internalize,inline temp/kernel_" +
322*9b5f41c8SJeremy L Thompson          std::to_string(build_id) + "_2_linked.ll -o temp/kernel_" + std::to_string(build_id) + "_3_opt.bc")
3232027fb9dSSirAlienTheGreat             .c_str(),
3242027fb9dSSirAlienTheGreat         "optimize linked C and Rust source");
3252027fb9dSSirAlienTheGreat     CeedCallSystem(ceed,
3262027fb9dSSirAlienTheGreat                    ("$(find $(rustup run " + std::string(rust_toolchain) + " rustc --print sysroot) -name llc) -O3 -mcpu=sm_" +
327*9b5f41c8SJeremy L Thompson                     std::to_string(prop.major) + std::to_string(prop.minor) + " temp/kernel_" + std::to_string(build_id) +
328*9b5f41c8SJeremy L Thompson                     "_3_opt.bc -o temp/kernel_" + std::to_string(build_id) + "_4_final.ptx")
3292027fb9dSSirAlienTheGreat                        .c_str(),
3302027fb9dSSirAlienTheGreat                    "compile final CUDA kernel");
3312027fb9dSSirAlienTheGreat 
332*9b5f41c8SJeremy L Thompson     ifstream      ptxfile("temp/kernel_" + std::to_string(build_id) + "_4_final.ptx");
3332027fb9dSSirAlienTheGreat     ostringstream sstr;
3342027fb9dSSirAlienTheGreat 
3352027fb9dSSirAlienTheGreat     sstr << ptxfile.rdbuf();
3362027fb9dSSirAlienTheGreat 
3372027fb9dSSirAlienTheGreat     auto ptx_data = sstr.str();
3382027fb9dSSirAlienTheGreat     ptx_size      = ptx_data.length();
3392027fb9dSSirAlienTheGreat 
3402027fb9dSSirAlienTheGreat     int result = cuModuleLoadData(module, ptx_data.c_str());
3412027fb9dSSirAlienTheGreat 
3422027fb9dSSirAlienTheGreat     *is_compile_good = result == 0;
3432027fb9dSSirAlienTheGreat     if (!*is_compile_good) {
3442027fb9dSSirAlienTheGreat       if (throw_error) {
3452027fb9dSSirAlienTheGreat         return CeedError(ceed, CEED_ERROR_BACKEND, "Failed to load module data");
3462027fb9dSSirAlienTheGreat       } else {
3472027fb9dSSirAlienTheGreat         // LCOV_EXCL_START
3482027fb9dSSirAlienTheGreat         CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- COMPILE ERROR DETECTED ----------\n");
3492027fb9dSSirAlienTheGreat         CeedDebug(ceed, "Error: Failed to load module data");
3502027fb9dSSirAlienTheGreat         CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- BACKEND MAY FALLBACK ----------\n");
3512027fb9dSSirAlienTheGreat         return CEED_ERROR_SUCCESS;
3522027fb9dSSirAlienTheGreat         // LCOV_EXCL_STOP
3532027fb9dSSirAlienTheGreat       }
3542027fb9dSSirAlienTheGreat     }
3552027fb9dSSirAlienTheGreat   }
3562027fb9dSSirAlienTheGreat   return CEED_ERROR_SUCCESS;
357c9c2c079SJeremy L Thompson }
358c9c2c079SJeremy L Thompson 
359ddae5012SJeremy L Thompson int CeedCompile_Cuda(Ceed ceed, const char *source, CUmodule *module, const CeedInt num_defines, ...) {
360ddae5012SJeremy L Thompson   bool    is_compile_good = true;
361ddae5012SJeremy L Thompson   va_list args;
362ddae5012SJeremy L Thompson 
363ddae5012SJeremy L Thompson   va_start(args, num_defines);
36418c38aeeSJeremy L Thompson   const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, true, &is_compile_good, module, num_defines, args);
36518c38aeeSJeremy L Thompson 
366ddae5012SJeremy L Thompson   va_end(args);
36718c38aeeSJeremy L Thompson   CeedCallBackend(ierr);
368ddae5012SJeremy L Thompson   return CEED_ERROR_SUCCESS;
369ddae5012SJeremy L Thompson }
370ddae5012SJeremy L Thompson 
371ddae5012SJeremy L Thompson int CeedTryCompile_Cuda(Ceed ceed, const char *source, bool *is_compile_good, CUmodule *module, const CeedInt num_defines, ...) {
372ddae5012SJeremy L Thompson   va_list args;
373ddae5012SJeremy L Thompson 
374ddae5012SJeremy L Thompson   va_start(args, num_defines);
37518c38aeeSJeremy L Thompson   const CeedInt ierr = CeedCompileCore_Cuda(ceed, source, false, is_compile_good, module, num_defines, args);
37618c38aeeSJeremy L Thompson 
377ddae5012SJeremy L Thompson   va_end(args);
37818c38aeeSJeremy L Thompson   CeedCallBackend(ierr);
379ddae5012SJeremy L Thompson   return CEED_ERROR_SUCCESS;
380ddae5012SJeremy L Thompson }
381ddae5012SJeremy L Thompson 
382c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
383c9c2c079SJeremy L Thompson // Get CUDA kernel
384c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
385eb7e6cafSJeremy L Thompson int CeedGetKernel_Cuda(Ceed ceed, CUmodule module, const char *name, CUfunction *kernel) {
3862b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuModuleGetFunction(kernel, module, name));
387c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
388c9c2c079SJeremy L Thompson }
389c9c2c079SJeremy L Thompson 
390b2165e7aSSebastian Grimberg //------------------------------------------------------------------------------
391b2165e7aSSebastian Grimberg // Run CUDA kernel with block size selected automatically based on the kernel
392b2165e7aSSebastian Grimberg //     (which may use enough registers to require a smaller block size than the
393b2165e7aSSebastian Grimberg //      hardware is capable)
394b2165e7aSSebastian Grimberg //------------------------------------------------------------------------------
3952b730f8bSJeremy L Thompson int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points, void **args) {
396c9c2c079SJeremy L Thompson   int min_grid_size, max_block_size;
397ca735530SJeremy L Thompson 
3982b730f8bSJeremy L Thompson   CeedCallCuda(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size, &max_block_size, kernel, NULL, 0, 0x10000));
399eb7e6cafSJeremy L Thompson   CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args));
400ca735530SJeremy L Thompson   return CEED_ERROR_SUCCESS;
401c9c2c079SJeremy L Thompson }
402c9c2c079SJeremy L Thompson 
403c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
404c9c2c079SJeremy L Thompson // Run CUDA kernel
405c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
406eb7e6cafSJeremy L Thompson int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) {
407e9c76bddSJeremy L Thompson   CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args));
408c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
409c9c2c079SJeremy L Thompson }
410c9c2c079SJeremy L Thompson 
411c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
412c9c2c079SJeremy L Thompson // Run CUDA kernel for spatial dimension
413c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
414eb7e6cafSJeremy L Thompson int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z,
4152b730f8bSJeremy L Thompson                           void **args) {
416e9c76bddSJeremy L Thompson   CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
417c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
418c9c2c079SJeremy L Thompson }
419c9c2c079SJeremy L Thompson 
420c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
421ea61e9acSJeremy L Thompson // Run CUDA kernel for spatial dimension with shared memory
422c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
423e9c76bddSJeremy L Thompson static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x,
424e9c76bddSJeremy L Thompson                                            const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
425e9c76bddSJeremy L Thompson                                            bool *is_good_run, void **args) {
426023b8a51Sabdelfattah83 #if CUDA_VERSION >= 9000
427023b8a51Sabdelfattah83   cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
428023b8a51Sabdelfattah83 #endif
429e9c76bddSJeremy L Thompson   CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
430ca735530SJeremy L Thompson 
431c9c2c079SJeremy L Thompson   if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
432c9c2c079SJeremy L Thompson     int max_threads_per_block, shared_size_bytes, num_regs;
433ca735530SJeremy L Thompson 
4342b730f8bSJeremy L Thompson     cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
4352b730f8bSJeremy L Thompson     cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
436c9c2c079SJeremy L Thompson     cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
437c49dc7a7SJeremy L Thompson     if (throw_error) {
438c9c2c079SJeremy L Thompson       return CeedError(ceed, CEED_ERROR_BACKEND,
439c9c2c079SJeremy L Thompson                        "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
4402b730f8bSJeremy L Thompson                        max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
441c49dc7a7SJeremy L Thompson     } else {
442c49dc7a7SJeremy L Thompson       // LCOV_EXCL_START
443c49dc7a7SJeremy L Thompson       CeedDebug256(ceed, CEED_DEBUG_COLOR_ERROR, "---------- LAUNCH ERROR DETECTED ----------\n");
444c49dc7a7SJeremy L Thompson       CeedDebug(ceed, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d\n",
445c49dc7a7SJeremy L Thompson                 max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
446c21e34e2SJeremy L Thompson       CeedDebug256(ceed, CEED_DEBUG_COLOR_WARNING, "---------- BACKEND MAY FALLBACK ----------\n");
447c49dc7a7SJeremy L Thompson       // LCOV_EXCL_STOP
448ddae5012SJeremy L Thompson     }
449c49dc7a7SJeremy L Thompson     *is_good_run = false;
450c9c2c079SJeremy L Thompson   } else CeedChk_Cu(ceed, result);
451c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
452c9c2c079SJeremy L Thompson }
4532a86cc9dSSebastian Grimberg 
454e9c76bddSJeremy L Thompson int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
455ddae5012SJeremy L Thompson                                 const int block_size_z, const int shared_mem_size, void **args) {
456ddae5012SJeremy L Thompson   bool is_good_run = true;
457ddae5012SJeremy L Thompson 
458e9c76bddSJeremy L Thompson   CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
459e9c76bddSJeremy L Thompson                                                   &is_good_run, args));
460ddae5012SJeremy L Thompson   return CEED_ERROR_SUCCESS;
461ddae5012SJeremy L Thompson }
462ddae5012SJeremy L Thompson 
463e9c76bddSJeremy L Thompson int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
464ddae5012SJeremy L Thompson                                    const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
465e9c76bddSJeremy L Thompson   CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
466e9c76bddSJeremy L Thompson                                                   is_good_run, args));
467ddae5012SJeremy L Thompson   return CEED_ERROR_SUCCESS;
468ddae5012SJeremy L Thompson }
469ddae5012SJeremy L Thompson 
4702a86cc9dSSebastian Grimberg //------------------------------------------------------------------------------
471