Home
last modified time | relevance | path

Searched refs:thread (Results 1 – 5 of 5) sorted by relevance

/libCEED/backends/hip-shared/
H A D ceed-hip-shared-basis.c 543 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Hip_shared() local
547 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; in CeedBasisApplyNonTensorCore_Hip_shared()
550 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Hip_shared()
553 …ared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Hip_shared()
556 …CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread, 1, elems_per_bl… in CeedBasisApplyNonTensorCore_Hip_shared()
566 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Hip_shared() local
570 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; in CeedBasisApplyNonTensorCore_Hip_shared()
573 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Hip_shared()
576 …imShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Hip_shared()
579 …CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread, 1, elems_per_bloc… in CeedBasisApplyNonTensorCore_Hip_shared()
[all …]
/libCEED/backends/cuda-shared/
H A D ceed-cuda-shared-basis.c 479 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Cuda_shared() local
485 …elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); in CeedBasisApplyNonTensorCore_Cuda_shared()
487 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared()
490 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Cuda_shared()
493 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_b… in CeedBasisApplyNonTensorCore_Cuda_shared()
503 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Cuda_shared() local
509 …elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); in CeedBasisApplyNonTensorCore_Cuda_shared()
511 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared()
514 …mShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Cuda_shared()
517 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_blo… in CeedBasisApplyNonTensorCore_Cuda_shared()
[all …]
/libCEED/doc/sphinx/source/
H A D intro.md 27 …esired specialized implementation at run time. Moreover, each process or thread can instantiate an…
H A D releasenotes.md 424 | `/gpu/cuda/reg` | Pure CUDA kernels using one thread per element |
473 parallelization approach, where each thread treats a finite element. Using just in time
504 | `/gpu/cuda/reg` | Pure CUDA kernels using one thread per element |
/libCEED/doc/papers/joss/
H A D paper.md 142 …esired specialized implementation at run time. Moreover, each process or thread can instantiate an…