ceed-cuda-compile.cpp (d7c593293e00f7bc51f26f45435d6890398f4ce7) ceed-cuda-compile.cpp (e9c76bddc0f2a44f522e0176ed6b7e0c0aa1df73)
1// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3//
4// SPDX-License-Identifier: BSD-2-Clause
5//
6// This file is part of CEED: http://github.com/ceed
7
8#include "ceed-cuda-compile.h"

--- 192 unchanged lines hidden (view full) ---

201 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args));
202 return CEED_ERROR_SUCCESS;
203}
204
205//------------------------------------------------------------------------------
206// Run CUDA kernel
207//------------------------------------------------------------------------------
208int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) {
1// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors.
2// All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
3//
4// SPDX-License-Identifier: BSD-2-Clause
5//
6// This file is part of CEED: http://github.com/ceed
7
8#include "ceed-cuda-compile.h"

--- 192 unchanged lines hidden (view full) ---

201 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args));
202 return CEED_ERROR_SUCCESS;
203}
204
205//------------------------------------------------------------------------------
206// Run CUDA kernel
207//------------------------------------------------------------------------------
208int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) {
209 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size, 1, 1, 0, args));
209 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args));
210 return CEED_ERROR_SUCCESS;
211}
212
213//------------------------------------------------------------------------------
214// Run CUDA kernel for spatial dimension
215//------------------------------------------------------------------------------
216int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z,
217 void **args) {
210 return CEED_ERROR_SUCCESS;
211}
212
213//------------------------------------------------------------------------------
214// Run CUDA kernel for spatial dimension
215//------------------------------------------------------------------------------
216int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z,
217 void **args) {
218 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
218 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args));
219 return CEED_ERROR_SUCCESS;
220}
221
222//------------------------------------------------------------------------------
223// Run CUDA kernel for spatial dimension with shared memory
224//------------------------------------------------------------------------------
219 return CEED_ERROR_SUCCESS;
220}
221
222//------------------------------------------------------------------------------
223// Run CUDA kernel for spatial dimension with shared memory
224//------------------------------------------------------------------------------
225static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
226 const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run,
227 void **args) {
225static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x,
226 const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error,
227 bool *is_good_run, void **args) {
228#if CUDA_VERSION >= 9000
229 cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
230#endif
228#if CUDA_VERSION >= 9000
229 cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size);
230#endif
231 CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL);
231 CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL);
232
233 if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
234 *is_good_run = false;
235 if (throw_error) {
236 int max_threads_per_block, shared_size_bytes, num_regs;
237
238 cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
239 cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
240 cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
241 return CeedError(ceed, CEED_ERROR_BACKEND,
242 "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
243 max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
244 }
245 } else CeedChk_Cu(ceed, result);
246 return CEED_ERROR_SUCCESS;
247}
248
232
233 if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) {
234 *is_good_run = false;
235 if (throw_error) {
236 int max_threads_per_block, shared_size_bytes, num_regs;
237
238 cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
239 cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel);
240 cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel);
241 return CeedError(ceed, CEED_ERROR_BACKEND,
242 "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d",
243 max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs);
244 }
245 } else CeedChk_Cu(ceed, result);
246 return CEED_ERROR_SUCCESS;
247}
248
249int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
249int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
250 const int block_size_z, const int shared_mem_size, void **args) {
251 bool is_good_run = true;
252
250 const int block_size_z, const int shared_mem_size, void **args) {
251 bool is_good_run = true;
252
253 CeedCallBackend(
254 CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args));
253 CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true,
254 &is_good_run, args));
255 return CEED_ERROR_SUCCESS;
256}
257
255 return CEED_ERROR_SUCCESS;
256}
257
258int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y,
258int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y,
259 const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
259 const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) {
260 CeedCallBackend(
261 CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args));
260 CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false,
261 is_good_run, args));
262 return CEED_ERROR_SUCCESS;
263}
264
265//------------------------------------------------------------------------------
262 return CEED_ERROR_SUCCESS;
263}
264
265//------------------------------------------------------------------------------