| /libCEED/backends/cuda-shared/ |
| H A D | ceed-cuda-shared-basis.c | 62 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared() local 75 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared() local 87 CeedInt elems_per_block = 1; in CeedBasisApplyTensorCore_Cuda_shared() local 116 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared() local 129 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared() local 140 CeedInt elems_per_block = 1; in CeedBasisApplyTensorCore_Cuda_shared() local 160 const CeedInt elems_per_block = block_size / Q_1d; in CeedBasisApplyTensorCore_Cuda_shared() local 166 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Cuda_shared() local 172 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Cuda_shared() local 326 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyAtPointsCore_Cuda_shared() local [all …]
|
| /libCEED/backends/hip-shared/ |
| H A D | ceed-hip-shared-basis.c | 128 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared() local 141 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local 152 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local 180 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared() local 193 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local 204 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local 226 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Hip_shared() local 232 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Hip_shared() local 238 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Hip_shared() local 391 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyAtPointsCore_Hip_shared() local [all …]
|
| /libCEED/backends/sycl-gen/ |
| H A D | ceed-sycl-gen-operator-build.sycl.cpp | 34 CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; in BlockGridCalculate_Sycl_gen() local 41 const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; in BlockGridCalculate_Sycl_gen() local 47 const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); in BlockGridCalculate_Sycl_gen() local
|
| /libCEED/backends/cuda-gen/ |
| H A D | ceed-cuda-gen-operator.c | 40 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) { in Waste() 79 int elems_per_block = 1; in BlockGridCalculate() local 215 …CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / dat… in CeedOperatorApplyAddCore_Cuda_gen() local 470 …CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / dat… in CeedOperatorLinearAssembleQFunctionCore_Cuda_gen() local
|
| /libCEED/backends/hip-gen/ |
| H A D | ceed-hip-gen-operator.c | 157 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorApplyAddCore_Hip_gen() local 435 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() local
|
| H A D | ceed-hip-gen-operator-build.cpp | 37 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in BlockGridCalculate_Hip_gen() local 44 const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; in BlockGridCalculate_Hip_gen() local 50 const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1); in BlockGridCalculate_Hip_gen() local
|
| /libCEED/backends/sycl-ref/ |
| H A D | ceed-sycl-ref.hpp | 97 CeedInt num_elem, block_size_x, block_size_y, elems_per_block; member
|
| H A D | ceed-sycl-ref-operator.sycl.cpp | 1111 int elems_per_block = 1; in CeedOperatorAssembleSingleSetup_Sycl() local
|
| /libCEED/backends/hip-ref/ |
| H A D | ceed-hip-ref-basis.c | 259 const int elems_per_block = 1; in CeedBasisApplyNonTensorCore_Hip() local
|
| H A D | ceed-hip-ref-operator.c | 1383 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalSetupCompile_Hip() local 1462 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalCore_Hip() local
|
| H A D | ceed-hip-ref.h | 135 CeedInt block_size_x, block_size_y, elems_per_block; member
|
| /libCEED/backends/cuda-ref/ |
| H A D | ceed-cuda-ref-basis.c | 260 const int elems_per_block = 1; in CeedBasisApplyNonTensorCore_Cuda() local
|
| H A D | ceed-cuda-ref-operator.c | 1386 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalSetupCompile_Cuda() local 1465 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalCore_Cuda() local
|
| H A D | ceed-cuda-ref.h | 130 CeedInt block_size_x, block_size_y, elems_per_block; member
|