Home
last modified time | relevance | path

Searched refs:thread_1d (Results 1 – 9 of 9) sorted by relevance

/libCEED/backends/hip-shared/
H A Dceed-hip-shared-basis.c44 const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d); in ComputeBasisThreadBlockSizes() local
59 CeedInt required = thread_1d * thread_1d; in ComputeBasisThreadBlockSizes()
73 CeedInt required = thread_1d * thread_1d; in ComputeBasisThreadBlockSizes()
124 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisApplyTensorCore_Hip_shared() local
128 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared()
131 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
134 …d_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1, in CeedBasisApplyTensorCore_Hip_shared()
137 …CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per… in CeedBasisApplyTensorCore_Hip_shared()
141 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared()
143 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
[all …]
/libCEED/backends/cuda-shared/
H A Dceed-cuda-shared-basis.c56 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisApplyTensorCore_Cuda_shared() local
62 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); in CeedBasisApplyTensorCore_Cuda_shared()
64 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
67 …_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1, in CeedBasisApplyTensorCore_Cuda_shared()
70 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_pe… in CeedBasisApplyTensorCore_Cuda_shared()
75 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared()
77 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
80 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, in CeedBasisApplyTensorCore_Cuda_shared()
81thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared()
83 …ckend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyTensorCore_Cuda_shared()
[all …]
/libCEED/backends/hip-gen/
H A Dceed-hip-gen-operator.c152 …CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d),… in CeedOperatorApplyAddCore_Hip_gen()
157 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorApplyAddCore_Hip_gen()
164 CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar); in CeedOperatorApplyAddCore_Hip_gen()
170 CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); in CeedOperatorApplyAddCore_Hip_gen()
176 CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); in CeedOperatorApplyAddCore_Hip_gen()
430 …CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d),… in CeedOperatorLinearAssembleQFunctionCore_Hip_gen()
435 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen()
442 CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar); in CeedOperatorLinearAssembleQFunctionCore_Hip_gen()
448 CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); in CeedOperatorLinearAssembleQFunctionCore_Hip_gen()
454 CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); in CeedOperatorLinearAssembleQFunctionCore_Hip_gen()
[all …]
H A Dceed-hip-gen.h19 CeedInt thread_1d; member
H A Dceed-hip-gen-operator-build.cpp35 const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in BlockGridCalculate_Hip_gen() local
37 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in BlockGridCalculate_Hip_gen()
40 block_sizes[0] = thread_1d; in BlockGridCalculate_Hip_gen()
44 const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; in BlockGridCalculate_Hip_gen()
46 block_sizes[0] = thread_1d; in BlockGridCalculate_Hip_gen()
47 block_sizes[1] = thread_1d; in BlockGridCalculate_Hip_gen()
50 const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1); in BlockGridCalculate_Hip_gen()
52 block_sizes[0] = thread_1d; in BlockGridCalculate_Hip_gen()
53 block_sizes[1] = thread_1d; in BlockGridCalculate_Hip_gen()
1635 data->thread_1d = block_sizes[0]; in CeedOperatorBuildKernel_Hip_gen()
[all …]
/libCEED/backends/sycl-shared/
H A Dceed-sycl-shared-basis.sycl.cpp22 static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt *local_range, CeedI… in ComputeLocalRange() argument
23 local_range[0] = thread_1d; in ComputeLocalRange()
24 local_range[1] = (dim > 1) ? thread_1d : 1; in ComputeLocalRange()
182 const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisCreateTensorH1_Sycl_shared() local
188 CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, interp_lrange)); in CeedBasisCreateTensorH1_Sycl_shared()
193 CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, grad_lrange)); in CeedBasisCreateTensorH1_Sycl_shared()
242 jit_constants["T_1D"] = thread_1d; in CeedBasisCreateTensorH1_Sycl_shared()
/libCEED/backends/cuda-gen/
H A Dceed-cuda-gen.h19 CeedInt thread_1d; member
H A Dceed-cuda-gen-operator.c209 int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; in CeedOperatorApplyAddCore_Cuda_gen()
215 …_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); in CeedOperatorApplyAddCore_Cuda_gen()
464 int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; in CeedOperatorLinearAssembleQFunctionCore_Cuda_gen()
470 …_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); in CeedOperatorLinearAssembleQFunctionCore_Cuda_gen()
639 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; in CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen()
804 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; in CeedOperatorAssembleSingleAtPoints_Cuda_gen()
H A Dceed-cuda-gen-operator-build.cpp1618 data->thread_1d = T_1d; in CeedOperatorBuildKernel_Cuda_gen()
2089 data->thread_1d = T_1d; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen()
2680 data->thread_1d = T_1d; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()