xref: /libCEED/rust/libceed-sys/c-src/backends/cuda/ceed-cuda-compile.cpp (revision c9c2c07970382857cc7b4a28d359710237b91a3e)
1*c9c2c079SJeremy L Thompson // Copyright (c) 2017-2022, Lawrence Livermore National Security, LLC and other CEED contributors.
2*c9c2c079SJeremy L Thompson // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3*c9c2c079SJeremy L Thompson //
4*c9c2c079SJeremy L Thompson // SPDX-License-Identifier: BSD-2-Clause
5*c9c2c079SJeremy L Thompson //
6*c9c2c079SJeremy L Thompson // This file is part of CEED:  http://github.com/ceed
7*c9c2c079SJeremy L Thompson 
8*c9c2c079SJeremy L Thompson #include <ceed/ceed.h>
9*c9c2c079SJeremy L Thompson #include <ceed/backend.h>
10*c9c2c079SJeremy L Thompson #include <ceed/jit-tools.h>
11*c9c2c079SJeremy L Thompson #include <cuda.h>
12*c9c2c079SJeremy L Thompson #include <cuda_runtime.h>
13*c9c2c079SJeremy L Thompson #include <nvrtc.h>
14*c9c2c079SJeremy L Thompson #include <sstream>
15*c9c2c079SJeremy L Thompson #include <stdarg.h>
16*c9c2c079SJeremy L Thompson #include <string.h>
17*c9c2c079SJeremy L Thompson #include "ceed-cuda-common.h"
18*c9c2c079SJeremy L Thompson #include "ceed-cuda-compile.h"
19*c9c2c079SJeremy L Thompson 
// Evaluate the NVRTC call `x` once; if it did not return NVRTC_SUCCESS, return
// from the enclosing function with a CEED_ERROR_BACKEND error carrying the
// NVRTC error string. The string is passed as a "%s" argument, never as the
// format itself, and the temporary has a macro-private name so that an
// argument expression may safely mention a caller variable named `result`.
#define CeedChk_Nvrtc(ceed, x) \
do { \
  const nvrtcResult ceed_nvrtc_result_ = static_cast<nvrtcResult>(x); \
  if (ceed_nvrtc_result_ != NVRTC_SUCCESS) \
    return CeedError((ceed), CEED_ERROR_BACKEND, "%s", \
                     nvrtcGetErrorString(ceed_nvrtc_result_)); \
} while (0)
26*c9c2c079SJeremy L Thompson 
27*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
28*c9c2c079SJeremy L Thompson // Compile CUDA kernel
29*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
30*c9c2c079SJeremy L Thompson int CeedCompileCuda(Ceed ceed, const char *source, CUmodule *module,
31*c9c2c079SJeremy L Thompson                     const CeedInt num_defines, ...) {
32*c9c2c079SJeremy L Thompson   int ierr;
33*c9c2c079SJeremy L Thompson   cudaFree(0); // Make sure a Context exists for nvrtc
34*c9c2c079SJeremy L Thompson   nvrtcProgram prog;
35*c9c2c079SJeremy L Thompson 
36*c9c2c079SJeremy L Thompson   std::ostringstream code;
37*c9c2c079SJeremy L Thompson 
38*c9c2c079SJeremy L Thompson   // Get kernel specific options, such as kernel constants
39*c9c2c079SJeremy L Thompson   if (num_defines > 0) {
40*c9c2c079SJeremy L Thompson     va_list args;
41*c9c2c079SJeremy L Thompson     va_start(args, num_defines);
42*c9c2c079SJeremy L Thompson     char *name;
43*c9c2c079SJeremy L Thompson     int val;
44*c9c2c079SJeremy L Thompson     for (int i = 0; i < num_defines; i++) {
45*c9c2c079SJeremy L Thompson       name = va_arg(args, char *);
46*c9c2c079SJeremy L Thompson       val = va_arg(args, int);
47*c9c2c079SJeremy L Thompson       code << "#define " << name << " " << val << "\n";
48*c9c2c079SJeremy L Thompson     }
49*c9c2c079SJeremy L Thompson     va_end(args);
50*c9c2c079SJeremy L Thompson   }
51*c9c2c079SJeremy L Thompson 
52*c9c2c079SJeremy L Thompson   // Standard libCEED definitions for CUDA backends
53*c9c2c079SJeremy L Thompson   char *jit_defs_path, *jit_defs_source;
54*c9c2c079SJeremy L Thompson   ierr = CeedGetJitAbsolutePath(ceed,
55*c9c2c079SJeremy L Thompson                                 "ceed/jit-source/cuda/cuda-jit.h",
56*c9c2c079SJeremy L Thompson                                 &jit_defs_path); CeedChkBackend(ierr);
57*c9c2c079SJeremy L Thompson   ierr = CeedLoadSourceToBuffer(ceed, jit_defs_path, &jit_defs_source);
58*c9c2c079SJeremy L Thompson   CeedChkBackend(ierr);
59*c9c2c079SJeremy L Thompson   code << jit_defs_source;
60*c9c2c079SJeremy L Thompson   code << "\n\n";
61*c9c2c079SJeremy L Thompson   ierr = CeedFree(&jit_defs_path); CeedChkBackend(ierr);
62*c9c2c079SJeremy L Thompson   ierr = CeedFree(&jit_defs_source); CeedChkBackend(ierr);
63*c9c2c079SJeremy L Thompson 
64*c9c2c079SJeremy L Thompson   // Non-macro options
65*c9c2c079SJeremy L Thompson   const int num_opts = 3;
66*c9c2c079SJeremy L Thompson   const char *opts[num_opts];
67*c9c2c079SJeremy L Thompson   opts[0] = "-default-device";
68*c9c2c079SJeremy L Thompson   struct cudaDeviceProp prop;
69*c9c2c079SJeremy L Thompson   Ceed_Cuda *ceed_data;
70*c9c2c079SJeremy L Thompson   ierr = CeedGetData(ceed, &ceed_data); CeedChkBackend(ierr);
71*c9c2c079SJeremy L Thompson   ierr = cudaGetDeviceProperties(&prop, ceed_data->device_id);
72*c9c2c079SJeremy L Thompson   CeedChk_Cu(ceed, ierr);
73*c9c2c079SJeremy L Thompson   std::string arch_arg = "-arch=compute_"  + std::to_string(prop.major) + std::to_string(prop.minor);
74*c9c2c079SJeremy L Thompson   opts[1] = arch_arg.c_str();
75*c9c2c079SJeremy L Thompson   opts[2] = "-Dint32_t=int";
76*c9c2c079SJeremy L Thompson 
77*c9c2c079SJeremy L Thompson   // Add string source argument provided in call
78*c9c2c079SJeremy L Thompson   code << source;
79*c9c2c079SJeremy L Thompson 
80*c9c2c079SJeremy L Thompson   // Create Program
81*c9c2c079SJeremy L Thompson   CeedChk_Nvrtc(ceed, nvrtcCreateProgram(&prog, code.str().c_str(), NULL, 0, NULL, NULL));
82*c9c2c079SJeremy L Thompson 
83*c9c2c079SJeremy L Thompson   // Compile kernel
84*c9c2c079SJeremy L Thompson   nvrtcResult result = nvrtcCompileProgram(prog, num_opts, opts);
85*c9c2c079SJeremy L Thompson   if (result != NVRTC_SUCCESS) {
86*c9c2c079SJeremy L Thompson     size_t log_size;
87*c9c2c079SJeremy L Thompson     CeedChk_Nvrtc(ceed, nvrtcGetProgramLogSize(prog, &log_size));
88*c9c2c079SJeremy L Thompson     char *log;
89*c9c2c079SJeremy L Thompson     ierr = CeedMalloc(log_size, &log); CeedChkBackend(ierr);
90*c9c2c079SJeremy L Thompson     CeedChk_Nvrtc(ceed, nvrtcGetProgramLog(prog, log));
91*c9c2c079SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND, "%s\n%s",
92*c9c2c079SJeremy L Thompson                      nvrtcGetErrorString(result), log);
93*c9c2c079SJeremy L Thompson   }
94*c9c2c079SJeremy L Thompson 
95*c9c2c079SJeremy L Thompson   size_t ptx_size;
96*c9c2c079SJeremy L Thompson   CeedChk_Nvrtc(ceed, nvrtcGetPTXSize(prog, &ptx_size));
97*c9c2c079SJeremy L Thompson   char *ptx;
98*c9c2c079SJeremy L Thompson   ierr = CeedMalloc(ptx_size, &ptx); CeedChkBackend(ierr);
99*c9c2c079SJeremy L Thompson   CeedChk_Nvrtc(ceed, nvrtcGetPTX(prog, ptx));
100*c9c2c079SJeremy L Thompson   CeedChk_Nvrtc(ceed, nvrtcDestroyProgram(&prog));
101*c9c2c079SJeremy L Thompson 
102*c9c2c079SJeremy L Thompson   CeedChk_Cu(ceed, cuModuleLoadData(module, ptx));
103*c9c2c079SJeremy L Thompson   ierr = CeedFree(&ptx); CeedChkBackend(ierr);
104*c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
105*c9c2c079SJeremy L Thompson }
106*c9c2c079SJeremy L Thompson 
107*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
108*c9c2c079SJeremy L Thompson // Get CUDA kernel
109*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
110*c9c2c079SJeremy L Thompson int CeedGetKernelCuda(Ceed ceed, CUmodule module, const char *name,
111*c9c2c079SJeremy L Thompson                       CUfunction *kernel) {
112*c9c2c079SJeremy L Thompson   CeedChk_Cu(ceed, cuModuleGetFunction(kernel, module, name));
113*c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
114*c9c2c079SJeremy L Thompson }
115*c9c2c079SJeremy L Thompson 
116*c9c2c079SJeremy L Thompson // Run kernel with block size selected automatically based on the kernel (which
117*c9c2c079SJeremy L Thompson // may use enough registers to require a smaller block size than the hardware is
118*c9c2c079SJeremy L Thompson // capable).
119*c9c2c079SJeremy L Thompson int CeedRunKernelAutoblockCuda(Ceed ceed, CUfunction kernel, size_t points,
120*c9c2c079SJeremy L Thompson                                void **args) {
121*c9c2c079SJeremy L Thompson   int min_grid_size, max_block_size;
122*c9c2c079SJeremy L Thompson   CeedChk_Cu(ceed, cuOccupancyMaxPotentialBlockSize(&min_grid_size,
123*c9c2c079SJeremy L Thompson              &max_block_size, kernel, NULL, 0, 0x10000));
124*c9c2c079SJeremy L Thompson   CeedChkBackend(CeedRunKernelCuda(ceed, kernel, CeedDivUpInt(points,
125*c9c2c079SJeremy L Thompson                                    max_block_size), max_block_size, args));
126*c9c2c079SJeremy L Thompson   return 0;
127*c9c2c079SJeremy L Thompson }
128*c9c2c079SJeremy L Thompson 
129*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
130*c9c2c079SJeremy L Thompson // Run CUDA kernel
131*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
132*c9c2c079SJeremy L Thompson int CeedRunKernelCuda(Ceed ceed, CUfunction kernel, const int grid_size,
133*c9c2c079SJeremy L Thompson                       const int block_size, void **args) {
134*c9c2c079SJeremy L Thompson   CeedChkBackend(CeedRunKernelDimSharedCuda(ceed, kernel, grid_size,
135*c9c2c079SJeremy L Thompson                  block_size, 1, 1, 0, args));
136*c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
137*c9c2c079SJeremy L Thompson }
138*c9c2c079SJeremy L Thompson 
139*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
140*c9c2c079SJeremy L Thompson // Run CUDA kernel for spatial dimension
141*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
142*c9c2c079SJeremy L Thompson int CeedRunKernelDimCuda(Ceed ceed, CUfunction kernel, const int grid_size,
143*c9c2c079SJeremy L Thompson                          const int block_size_x, const int block_size_y,
144*c9c2c079SJeremy L Thompson                          const int block_size_z, void **args) {
145*c9c2c079SJeremy L Thompson   CeedChkBackend(CeedRunKernelDimSharedCuda(ceed, kernel, grid_size,
146*c9c2c079SJeremy L Thompson                  block_size_x, block_size_y, block_size_z,
147*c9c2c079SJeremy L Thompson                  0, args));
148*c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
149*c9c2c079SJeremy L Thompson }
150*c9c2c079SJeremy L Thompson 
151*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
152*c9c2c079SJeremy L Thompson // Run CUDA kernel for spatial dimension with sharde memory
153*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
154*c9c2c079SJeremy L Thompson int CeedRunKernelDimSharedCuda(Ceed ceed, CUfunction kernel,
155*c9c2c079SJeremy L Thompson                                const int grid_size, const int block_size_x,
156*c9c2c079SJeremy L Thompson                                const int block_size_y, const int block_size_z,
157*c9c2c079SJeremy L Thompson                                const int shared_mem_size, void **args) {
158*c9c2c079SJeremy L Thompson   CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1,
159*c9c2c079SJeremy L Thompson                                    block_size_x, block_size_y, block_size_z,
160*c9c2c079SJeremy L Thompson                                    shared_mem_size, NULL, args, NULL);
161*c9c2c079SJeremy L Thompson   if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
162*c9c2c079SJeremy L Thompson     int max_threads_per_block, shared_size_bytes, num_regs;
163*c9c2c079SJeremy L Thompson     cuFuncGetAttribute(&max_threads_per_block,
164*c9c2c079SJeremy L Thompson                        CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
165*c9c2c079SJeremy L Thompson     cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES,
166*c9c2c079SJeremy L Thompson                        kernel);
167*c9c2c079SJeremy L Thompson     cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
168*c9c2c079SJeremy L Thompson     return CeedError(ceed, CEED_ERROR_BACKEND,
169*c9c2c079SJeremy L Thompson                      "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
170*c9c2c079SJeremy L Thompson                      max_threads_per_block, block_size_x, block_size_y, block_size_z,
171*c9c2c079SJeremy L Thompson                      shared_size_bytes, num_regs);
172*c9c2c079SJeremy L Thompson   } else CeedChk_Cu(ceed, result);
173*c9c2c079SJeremy L Thompson   return CEED_ERROR_SUCCESS;
174*c9c2c079SJeremy L Thompson }
175*c9c2c079SJeremy L Thompson 
176*c9c2c079SJeremy L Thompson //------------------------------------------------------------------------------
177