| /libCEED/backends/hip-shared/ |
| H A D | ceed-hip-shared-basis.c | 44 const CeedInt thread_1d = CeedIntMax(P_1d, Q_1d); in ComputeBasisThreadBlockSizes() local 59 CeedInt required = thread_1d * thread_1d; in ComputeBasisThreadBlockSizes() 73 CeedInt required = thread_1d * thread_1d; in ComputeBasisThreadBlockSizes() 124 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisApplyTensorCore_Hip_shared() local 128 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared() 131 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared() 134 …d_Hip(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1, in CeedBasisApplyTensorCore_Hip_shared() 137 …CeedCallBackend(CeedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per… in CeedBasisApplyTensorCore_Hip_shared() 141 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() 143 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared() [all …]
|
| /libCEED/backends/cuda-shared/ |
| H A D | ceed-cuda-shared-basis.c | 56 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisApplyTensorCore_Cuda_shared() local 62 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); in CeedBasisApplyTensorCore_Cuda_shared() 64 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared() 67 …_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1, in CeedBasisApplyTensorCore_Cuda_shared() 70 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_pe… in CeedBasisApplyTensorCore_Cuda_shared() 75 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared() 77 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared() 80 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, in CeedBasisApplyTensorCore_Cuda_shared() 81 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared() 83 …ckend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyTensorCore_Cuda_shared() [all …]
|
| /libCEED/backends/hip-gen/ |
| H A D | ceed-hip-gen-operator.c | 152 …CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d),… in CeedOperatorApplyAddCore_Hip_gen() 157 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorApplyAddCore_Hip_gen() 164 CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar); in CeedOperatorApplyAddCore_Hip_gen() 170 CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); in CeedOperatorApplyAddCore_Hip_gen() 176 CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); in CeedOperatorApplyAddCore_Hip_gen() 430 …CeedInt block_sizes[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d),… in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() 435 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() 442 CeedInt sharedMem = block_sizes[2] * data->thread_1d * sizeof(CeedScalar); in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() 448 CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() 454 CeedInt sharedMem = block_sizes[2] * data->thread_1d * data->thread_1d * sizeof(CeedScalar); in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() [all …]
|
| H A D | ceed-hip-gen.h | 19 CeedInt thread_1d; member
|
| H A D | ceed-hip-gen-operator-build.cpp | 35 const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in BlockGridCalculate_Hip_gen() local 37 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in BlockGridCalculate_Hip_gen() 40 block_sizes[0] = thread_1d; in BlockGridCalculate_Hip_gen() 44 const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; in BlockGridCalculate_Hip_gen() 46 block_sizes[0] = thread_1d; in BlockGridCalculate_Hip_gen() 47 block_sizes[1] = thread_1d; in BlockGridCalculate_Hip_gen() 50 const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1); in BlockGridCalculate_Hip_gen() 52 block_sizes[0] = thread_1d; in BlockGridCalculate_Hip_gen() 53 block_sizes[1] = thread_1d; in BlockGridCalculate_Hip_gen() 1635 data->thread_1d = block_sizes[0]; in CeedOperatorBuildKernel_Hip_gen() [all …]
|
| /libCEED/backends/sycl-shared/ |
| H A D | ceed-sycl-shared-basis.sycl.cpp | 22 static int ComputeLocalRange(Ceed ceed, CeedInt dim, CeedInt thread_1d, CeedInt *local_range, CeedI… in ComputeLocalRange() argument 23 local_range[0] = thread_1d; in ComputeLocalRange() 24 local_range[1] = (dim > 1) ? thread_1d : 1; in ComputeLocalRange() 182 const CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisCreateTensorH1_Sycl_shared() local 188 CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, interp_lrange)); in CeedBasisCreateTensorH1_Sycl_shared() 193 CeedCallBackend(ComputeLocalRange(ceed, dim, thread_1d, grad_lrange)); in CeedBasisCreateTensorH1_Sycl_shared() 242 jit_constants["T_1D"] = thread_1d; in CeedBasisCreateTensorH1_Sycl_shared()
|
| /libCEED/backends/cuda-gen/ |
| H A D | ceed-cuda-gen.h | 19 CeedInt thread_1d; member
|
| H A D | ceed-cuda-gen-operator.c | 209 int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; in CeedOperatorApplyAddCore_Cuda_gen() 215 …_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); in CeedOperatorApplyAddCore_Cuda_gen() 464 int block[3] = {data->thread_1d, ((!is_tensor || data->dim == 1) ? 1 : data->thread_1d), -1}; in CeedOperatorLinearAssembleQFunctionCore_Cuda_gen() 470 …_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / data->thread_1d, 1)); in CeedOperatorLinearAssembleQFunctionCore_Cuda_gen() 639 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; in CeedOperatorLinearAssembleAddDiagonalAtPoints_Cuda_gen() 804 int block[3] = {data->thread_1d, (data->dim == 1 ? 1 : data->thread_1d), -1}; in CeedOperatorAssembleSingleAtPoints_Cuda_gen()
|
| H A D | ceed-cuda-gen-operator-build.cpp | 1618 data->thread_1d = T_1d; in CeedOperatorBuildKernel_Cuda_gen() 2089 data->thread_1d = T_1d; in CeedOperatorBuildKernelAssemblyAtPoints_Cuda_gen() 2680 data->thread_1d = T_1d; in CeedOperatorBuildKernelLinearAssembleQFunction_Cuda_gen()
|