| ceed-cuda-compile.cpp (d7c593293e00f7bc51f26f45435d6890398f4ce7) | ceed-cuda-compile.cpp (e9c76bddc0f2a44f522e0176ed6b7e0c0aa1df73) |
|---|---|
| 1// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3// 4// SPDX-License-Identifier: BSD-2-Clause 5// 6// This file is part of CEED: http://github.com/ceed 7 8#include "ceed-cuda-compile.h" --- 192 unchanged lines hidden (view full) --- 201 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args)); 202 return CEED_ERROR_SUCCESS; 203} 204 205//------------------------------------------------------------------------------ 206// Run CUDA kernel 207//------------------------------------------------------------------------------ 208int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) { | 1// Copyright (c) 2017-2024, Lawrence Livermore National Security, LLC and other CEED contributors. 2// All Rights Reserved. See the top-level LICENSE and NOTICE files for details. 3// 4// SPDX-License-Identifier: BSD-2-Clause 5// 6// This file is part of CEED: http://github.com/ceed 7 8#include "ceed-cuda-compile.h" --- 192 unchanged lines hidden (view full) --- 201 CeedCallBackend(CeedRunKernel_Cuda(ceed, kernel, CeedDivUpInt(points, max_block_size), max_block_size, args)); 202 return CEED_ERROR_SUCCESS; 203} 204 205//------------------------------------------------------------------------------ 206// Run CUDA kernel 207//------------------------------------------------------------------------------ 208int CeedRunKernel_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size, void **args) { |
| 209 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size, 1, 1, 0, args)); | 209 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size, 1, 1, 0, args)); |
| 210 return CEED_ERROR_SUCCESS; 211} 212 213//------------------------------------------------------------------------------ 214// Run CUDA kernel for spatial dimension 215//------------------------------------------------------------------------------ 216int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, 217 void **args) { | 210 return CEED_ERROR_SUCCESS; 211} 212 213//------------------------------------------------------------------------------ 214// Run CUDA kernel for spatial dimension 215//------------------------------------------------------------------------------ 216int CeedRunKernelDim_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, const int block_size_z, 217 void **args) { |
| 218 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, 0, args)); | 218 CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, kernel, NULL, grid_size, block_size_x, block_size_y, block_size_z, 0, args)); |
| 219 return CEED_ERROR_SUCCESS; 220} 221 222//------------------------------------------------------------------------------ 223// Run CUDA kernel for spatial dimension with shared memory 224//------------------------------------------------------------------------------ | 219 return CEED_ERROR_SUCCESS; 220} 221 222//------------------------------------------------------------------------------ 223// Run CUDA kernel for spatial dimension with shared memory 224//------------------------------------------------------------------------------ |
| 225static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, 226 const int block_size_z, const int shared_mem_size, const bool throw_error, bool *is_good_run, 227 void **args) { | 225static int CeedRunKernelDimSharedCore_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, 226 const int block_size_y, const int block_size_z, const int shared_mem_size, const bool throw_error, 227 bool *is_good_run, void **args) { |
| 228#if CUDA_VERSION >= 9000 229 cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size); 230#endif | 228#if CUDA_VERSION >= 9000 229 cuFuncSetAttribute(kernel, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_mem_size); 230#endif |
| 231 CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, NULL, args, NULL); | 231 CUresult result = cuLaunchKernel(kernel, grid_size, 1, 1, block_size_x, block_size_y, block_size_z, shared_mem_size, stream, args, NULL); |
| 232 233 if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) { 234 *is_good_run = false; 235 if (throw_error) { 236 int max_threads_per_block, shared_size_bytes, num_regs; 237 238 cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); 239 cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); 240 cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel); 241 return CeedError(ceed, CEED_ERROR_BACKEND, 242 "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d", 243 max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); 244 } 245 } else CeedChk_Cu(ceed, result); 246 return CEED_ERROR_SUCCESS; 247} 248 | 232 233 if (result == CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES) { 234 *is_good_run = false; 235 if (throw_error) { 236 int max_threads_per_block, shared_size_bytes, num_regs; 237 238 cuFuncGetAttribute(&max_threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); 239 cuFuncGetAttribute(&shared_size_bytes, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); 240 cuFuncGetAttribute(&num_regs, CU_FUNC_ATTRIBUTE_NUM_REGS, kernel); 241 return CeedError(ceed, CEED_ERROR_BACKEND, 242 "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: max_threads_per_block %d on block size (%d,%d,%d), shared_size %d, num_regs %d", 243 max_threads_per_block, block_size_x, block_size_y, block_size_z, shared_size_bytes, num_regs); 244 } 245 } else CeedChk_Cu(ceed, result); 246 return CEED_ERROR_SUCCESS; 247} 248 |
| 249int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, | 249int CeedRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y, |
| 250 const int block_size_z, const int shared_mem_size, void **args) { 251 bool is_good_run = true; 252 | 250 const int block_size_z, const int shared_mem_size, void **args) { 251 bool is_good_run = true; 252 |
| 253 CeedCallBackend( 254 CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, &is_good_run, args)); | 253 CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, true, 254 &is_good_run, args)); |
| 255 return CEED_ERROR_SUCCESS; 256} 257 | 255 return CEED_ERROR_SUCCESS; 256} 257 |
| 258int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, const int grid_size, const int block_size_x, const int block_size_y, | 258int CeedTryRunKernelDimShared_Cuda(Ceed ceed, CUfunction kernel, CUstream stream, const int grid_size, const int block_size_x, const int block_size_y, |
| 259 const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) { | 259 const int block_size_z, const int shared_mem_size, bool *is_good_run, void **args) { |
| 260 CeedCallBackend( 261 CeedRunKernelDimSharedCore_Cuda(ceed, kernel, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, is_good_run, args)); | 260 CeedCallBackend(CeedRunKernelDimSharedCore_Cuda(ceed, kernel, stream, grid_size, block_size_x, block_size_y, block_size_z, shared_mem_size, false, 261 is_good_run, args)); |
| 262 return CEED_ERROR_SUCCESS; 263} 264 265//------------------------------------------------------------------------------ | 262 return CEED_ERROR_SUCCESS; 263} 264 265//------------------------------------------------------------------------------ |