Home
last modified time | relevance | path

Searched defs:elems_per_block (Results 1 – 14 of 14) sorted by relevance

/libCEED/backends/cuda-shared/
H A Dceed-cuda-shared-basis.c62 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared() local
75 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared() local
87 CeedInt elems_per_block = 1; in CeedBasisApplyTensorCore_Cuda_shared() local
116 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared() local
129 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared() local
140 CeedInt elems_per_block = 1; in CeedBasisApplyTensorCore_Cuda_shared() local
160 const CeedInt elems_per_block = block_size / Q_1d; in CeedBasisApplyTensorCore_Cuda_shared() local
166 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Cuda_shared() local
172 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Cuda_shared() local
326 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyAtPointsCore_Cuda_shared() local
[all …]
/libCEED/backends/hip-shared/
H A Dceed-hip-shared-basis.c128 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared() local
141 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local
152 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local
180 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared() local
193 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local
204 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local
226 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Hip_shared() local
232 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Hip_shared() local
238 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Hip_shared() local
391 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyAtPointsCore_Hip_shared() local
[all …]
/libCEED/backends/sycl-gen/
H A Dceed-sycl-gen-operator-build.sycl.cpp34 CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; in BlockGridCalculate_Sycl_gen() local
41 const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; in BlockGridCalculate_Sycl_gen() local
47 const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); in BlockGridCalculate_Sycl_gen() local
/libCEED/backends/cuda-gen/
H A Dceed-cuda-gen-operator.c40 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) { in Waste()
79 int elems_per_block = 1; in BlockGridCalculate() local
215 …CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / dat… in CeedOperatorApplyAddCore_Cuda_gen() local
470 …CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / dat… in CeedOperatorLinearAssembleQFunctionCore_Cuda_gen() local
/libCEED/backends/hip-gen/
H A Dceed-hip-gen-operator.c157 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorApplyAddCore_Hip_gen() local
435 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() local
H A Dceed-hip-gen-operator-build.cpp37 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in BlockGridCalculate_Hip_gen() local
44 const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; in BlockGridCalculate_Hip_gen() local
50 const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1); in BlockGridCalculate_Hip_gen() local
/libCEED/backends/sycl-ref/
H A Dceed-sycl-ref.hpp97 CeedInt num_elem, block_size_x, block_size_y, elems_per_block; member
H A Dceed-sycl-ref-operator.sycl.cpp1111 int elems_per_block = 1; in CeedOperatorAssembleSingleSetup_Sycl() local
/libCEED/backends/hip-ref/
H A Dceed-hip-ref-basis.c259 const int elems_per_block = 1; in CeedBasisApplyNonTensorCore_Hip() local
H A Dceed-hip-ref-operator.c1383 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalSetupCompile_Hip() local
1462 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalCore_Hip() local
H A Dceed-hip-ref.h135 CeedInt block_size_x, block_size_y, elems_per_block; member
/libCEED/backends/cuda-ref/
H A Dceed-cuda-ref-basis.c260 const int elems_per_block = 1; in CeedBasisApplyNonTensorCore_Cuda() local
H A Dceed-cuda-ref-operator.c1386 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalSetupCompile_Cuda() local
1465 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalCore_Cuda() local
H A Dceed-cuda-ref.h130 CeedInt block_size_x, block_size_y, elems_per_block; member