Searched refs:thread (Results 1 – 5 of 5) sorted by relevance
| /libCEED/backends/hip-shared/ |
| H A D | ceed-hip-shared-basis.c | 543 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Hip_shared() local 547 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; in CeedBasisApplyNonTensorCore_Hip_shared() 550 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Hip_shared() 553 …ared_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Hip_shared() 556 …CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread, 1, elems_per_bl… in CeedBasisApplyNonTensorCore_Hip_shared() 566 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Hip_shared() local 570 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; in CeedBasisApplyNonTensorCore_Hip_shared() 573 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Hip_shared() 576 …imShared_Hip(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Hip_shared() 579 …CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread, 1, elems_per_bloc… in CeedBasisApplyNonTensorCore_Hip_shared() [all …]
|
| /libCEED/backends/cuda-shared/ |
| H A D | ceed-cuda-shared-basis.c | 479 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Cuda_shared() local 485 …elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); in CeedBasisApplyNonTensorCore_Cuda_shared() 487 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared() 490 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Cuda_shared() 493 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_b… in CeedBasisApplyNonTensorCore_Cuda_shared() 503 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Cuda_shared() local 509 …elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); in CeedBasisApplyNonTensorCore_Cuda_shared() 511 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared() 514 …mShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Cuda_shared() 517 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_blo… in CeedBasisApplyNonTensorCore_Cuda_shared() [all …]
|
| /libCEED/doc/sphinx/source/ |
| H A D | intro.md | 27 …esired specialized implementation at run time. Moreover, each process or thread can instantiate an…
|
| H A D | releasenotes.md | 424 | `/gpu/cuda/reg` | Pure CUDA kernels using one thread per element | 473 parallelization approach, where each thread treats a finite element. Using just in time 504 | `/gpu/cuda/reg` | Pure CUDA kernels using one thread per element |
|
| /libCEED/doc/papers/joss/ |
| H A D | paper.md | 142 …esired specialized implementation at run time. Moreover, each process or thread can instantiate an…
|