Home
last modified time | relevance | path

Searched refs:elems_per_block (Results 1 – 14 of 14) sorted by relevance

/libCEED/backends/hip-shared/
H A Dceed-hip-shared-basis.c128 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared() local
129 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedBasisApplyTensorCore_Hip_shared()
130 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
131 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
135 elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Hip_shared()
137 …RunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Hip_shared()
141 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local
142 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
143 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
147 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Hip_shared()
[all …]
/libCEED/backends/cuda-shared/
H A Dceed-cuda-shared-basis.c62 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared() local
63 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
64 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
68 elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared()
70 …unKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Cuda_shared()
75 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared() local
76 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
77 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
81 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared()
83 …elDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyTensorCore_Cuda_shared()
[all …]
/libCEED/backends/cuda-gen/
H A Dceed-cuda-gen-operator.c40 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) { in Waste() argument
41 int useful_threads_per_block = threads_per_elem * elems_per_block; in Waste()
79 int elems_per_block = 1; in BlockGridCalculate() local
88 elems_per_block = i; in BlockGridCalculate()
94 block[2] = CeedIntMin(elems_per_block, max_threads_z); in BlockGridCalculate()
95 *grid = CeedDivUpInt(num_elem, elems_per_block); in BlockGridCalculate()
215 …CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / dat… in CeedOperatorApplyAddCore_Cuda_gen() local
217 grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedOperatorApplyAddCore_Cuda_gen()
218 block[2] = elems_per_block; in CeedOperatorApplyAddCore_Cuda_gen()
470 …CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / dat… in CeedOperatorLinearAssembleQFunctionCore_Cuda_gen() local
[all …]
/libCEED/backends/hip-ref/
H A Dceed-hip-ref-basis.c259 const int elems_per_block = 1; in CeedBasisApplyNonTensorCore_Hip() local
260 const int grid = CeedDivUpInt(num_elem, elems_per_block); in CeedBasisApplyNonTensorCore_Hip()
288 …RunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)… in CeedBasisApplyNonTensorCore_Hip()
290 …kend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)… in CeedBasisApplyNonTensorCore_Hip()
298 …edRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); in CeedBasisApplyNonTensorCore_Hip()
300 …ackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); in CeedBasisApplyNonTensorCore_Hip()
308 …eedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); in CeedBasisApplyNonTensorCore_Hip()
310 …Backend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); in CeedBasisApplyNonTensorCore_Hip()
318 …edRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); in CeedBasisApplyNonTensorCore_Hip()
320 …ackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); in CeedBasisApplyNonTensorCore_Hip()
[all …]
H A Dceed-hip-ref-operator.c1383 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalSetupCompile_Hip() local
1391 …edsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block)); in CeedOperatorAssembleDiagonalSetupCompile_Hip()
1462 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalCore_Hip() local
1463 CeedInt grid = CeedDivUpInt(num_elem, elems_per_block); in CeedOperatorAssembleDiagonalCore_Hip()
1469 …end(CeedRunKernelDim_Hip(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args)); in CeedOperatorAssembleDiagonalCore_Hip()
1471 …ckend(CeedRunKernelDim_Hip(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args)); in CeedOperatorAssembleDiagonalCore_Hip()
1603 asmb->elems_per_block = 1; in CeedOperatorAssembleSingleSetup_Hip()
1608 …bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > hip_data->device… in CeedOperatorAssembleSingleSetup_Hip()
1623 …asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_… in CeedOperatorAssembleSingleSetup_Hip()
1781 CeedInt grid = CeedDivUpInt(num_elem_in, asmb->elems_per_block); in CeedOperatorAssembleSingle_Hip()
[all …]
H A Dceed-hip-ref.h135 CeedInt block_size_x, block_size_y, elems_per_block; member
/libCEED/backends/cuda-ref/
H A Dceed-cuda-ref-basis.c260 const int elems_per_block = 1; in CeedBasisApplyNonTensorCore_Cuda() local
261 const int grid = CeedDivUpInt(num_elem, elems_per_block); in CeedBasisApplyNonTensorCore_Cuda()
289 …unKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)… in CeedBasisApplyNonTensorCore_Cuda()
291 …end(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)… in CeedBasisApplyNonTensorCore_Cuda()
299 …dRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); in CeedBasisApplyNonTensorCore_Cuda()
301 …ckend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); in CeedBasisApplyNonTensorCore_Cuda()
309 …edRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); in CeedBasisApplyNonTensorCore_Cuda()
311 …ackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); in CeedBasisApplyNonTensorCore_Cuda()
319 …dRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); in CeedBasisApplyNonTensorCore_Cuda()
321 …ckend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); in CeedBasisApplyNonTensorCore_Cuda()
[all …]
H A Dceed-cuda-ref-operator.c1386 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalSetupCompile_Cuda() local
1394 …edsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block)); in CeedOperatorAssembleDiagonalSetupCompile_Cuda()
1465 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalCore_Cuda() local
1466 CeedInt grid = CeedDivUpInt(num_elem, elems_per_block); in CeedOperatorAssembleDiagonalCore_Cuda()
1472 …nd(CeedRunKernelDim_Cuda(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args)); in CeedOperatorAssembleDiagonalCore_Cuda()
1474 …kend(CeedRunKernelDim_Cuda(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args)); in CeedOperatorAssembleDiagonalCore_Cuda()
1606 asmb->elems_per_block = 1; in CeedOperatorAssembleSingleSetup_Cuda()
1611 …bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > cuda_data->devic… in CeedOperatorAssembleSingleSetup_Cuda()
1626 …asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_… in CeedOperatorAssembleSingleSetup_Cuda()
1784 CeedInt grid = CeedDivUpInt(num_elem_in, asmb->elems_per_block); in CeedOperatorAssembleSingle_Cuda()
[all …]
H A Dceed-cuda-ref.h130 CeedInt block_size_x, block_size_y, elems_per_block; member
/libCEED/backends/sycl-gen/
H A Dceed-sycl-gen-operator-build.sycl.cpp34 CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; in BlockGridCalculate_Sycl_gen() local
36 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in BlockGridCalculate_Sycl_gen()
39 block_sizes[2] = elems_per_block; in BlockGridCalculate_Sycl_gen()
41 const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; in BlockGridCalculate_Sycl_gen() local
45 block_sizes[2] = elems_per_block; in BlockGridCalculate_Sycl_gen()
47 const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); in BlockGridCalculate_Sycl_gen() local
51 block_sizes[2] = elems_per_block; in BlockGridCalculate_Sycl_gen()
/libCEED/backends/hip-gen/
H A Dceed-hip-gen-operator.c157 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorApplyAddCore_Hip_gen() local
159 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedOperatorApplyAddCore_Hip_gen()
160 block_sizes[2] = elems_per_block; in CeedOperatorApplyAddCore_Hip_gen()
435 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() local
437 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen()
438 block_sizes[2] = elems_per_block; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen()
H A Dceed-hip-gen-operator-build.cpp37 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in BlockGridCalculate_Hip_gen() local
39 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in BlockGridCalculate_Hip_gen()
42 block_sizes[2] = elems_per_block; in BlockGridCalculate_Hip_gen()
44 const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; in BlockGridCalculate_Hip_gen() local
48 block_sizes[2] = elems_per_block; in BlockGridCalculate_Hip_gen()
50 const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1); in BlockGridCalculate_Hip_gen() local
54 block_sizes[2] = elems_per_block; in BlockGridCalculate_Hip_gen()
/libCEED/backends/sycl-ref/
H A Dceed-sycl-ref.hpp97 CeedInt num_elem, block_size_x, block_size_y, elems_per_block; member
H A Dceed-sycl-ref-operator.sycl.cpp1111 int elems_per_block = 1; in CeedOperatorAssembleSingleSetup_Sycl() local
1112 asmb->elems_per_block = elems_per_block; in CeedOperatorAssembleSingleSetup_Sycl()
1119 asmb->block_size = elem_size * elem_size * elems_per_block; in CeedOperatorAssembleSingleSetup_Sycl()