thread (reference) in projects: libCEED

Searched refs:thread (Results 1 – 5 of 5) sorted by relevance

/libCEED/backends/hip-shared/
H A D	ceed-hip-shared-basis.c	543 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Hip_shared() local 547 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; in CeedBasisApplyNonTensorCore_Hip_shared() 550 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Hip_shared() 553 …ared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Hip_shared() 556 …CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread, 1, elems_per_bl… in CeedBasisApplyNonTensorCore_Hip_shared() 566 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Hip_shared() local 570 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; in CeedBasisApplyNonTensorCore_Hip_shared() 573 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Hip_shared() 576 …imShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Hip_shared() 579 …CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread, 1, elems_per_bloc… in CeedBasisApplyNonTensorCore_Hip_shared() [all …]
/libCEED/backends/cuda-shared/
H A D	ceed-cuda-shared-basis.c	479 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Cuda_shared() local 485 …elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); in CeedBasisApplyNonTensorCore_Cuda_shared() 487 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared() 490 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Cuda_shared() 493 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_b… in CeedBasisApplyNonTensorCore_Cuda_shared() 503 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Cuda_shared() local 509 …elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); in CeedBasisApplyNonTensorCore_Cuda_shared() 511 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared() 514 …mShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Cuda_shared() 517 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_blo… in CeedBasisApplyNonTensorCore_Cuda_shared() [all …]
/libCEED/doc/sphinx/source/
H A D	intro.md	`27 …esired specialized implementation at run time. Moreover, each process or thread can instantiate an…`
H A D	releasenotes.md	424 \| `/gpu/cuda/reg` \| Pure CUDA kernels using one thread per element \| 473 parallelization approach, where each thread treats a finite element. Using just in time 504 \| `/gpu/cuda/reg` \| Pure CUDA kernels using one thread per element \|
/libCEED/doc/papers/joss/
H A D	paper.md	`142 …esired specialized implementation at run time. Moreover, each process or thread can instantiate an…`

Project(s)

Full Search
Definition
Symbol
File Path
History
Type