| /libCEED/backends/hip-shared/ |
| H A D | ceed-hip-shared-basis.c | 128 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared() local 129 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedBasisApplyTensorCore_Hip_shared() 130 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared() 131 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared() 135 elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Hip_shared() 137 …RunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Hip_shared() 141 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local 142 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared() 143 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared() 147 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Hip_shared() [all …]
|
| /libCEED/backends/cuda-shared/ |
| H A D | ceed-cuda-shared-basis.c | 62 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared() local 63 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared() 64 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared() 68 elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared() 70 …unKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Cuda_shared() 75 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared() local 76 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared() 77 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared() 81 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared() 83 …elDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyTensorCore_Cuda_shared() [all …]
|
| /libCEED/backends/cuda-gen/ |
| H A D | ceed-cuda-gen-operator.c | 40 static int Waste(int threads_per_sm, int warp_size, int threads_per_elem, int elems_per_block) { in Waste() argument 41 int useful_threads_per_block = threads_per_elem * elems_per_block; in Waste() 79 int elems_per_block = 1; in BlockGridCalculate() local 88 elems_per_block = i; in BlockGridCalculate() 94 block[2] = CeedIntMin(elems_per_block, max_threads_z); in BlockGridCalculate() 95 *grid = CeedDivUpInt(num_elem, elems_per_block); in BlockGridCalculate() 215 …CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / dat… in CeedOperatorApplyAddCore_Cuda_gen() local 217 grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedOperatorApplyAddCore_Cuda_gen() 218 block[2] = elems_per_block; in CeedOperatorApplyAddCore_Cuda_gen() 470 …CeedInt elems_per_block = CeedIntMin(cuda_data->device_prop.maxThreadsDim[2], CeedIntMax(512 / dat… in CeedOperatorLinearAssembleQFunctionCore_Cuda_gen() local [all …]
|
| /libCEED/backends/hip-ref/ |
| H A D | ceed-hip-ref-basis.c | 259 const int elems_per_block = 1; in CeedBasisApplyNonTensorCore_Hip() local 260 const int grid = CeedDivUpInt(num_elem, elems_per_block); in CeedBasisApplyNonTensorCore_Hip() 288 …RunKernelDim_Hip(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)… in CeedBasisApplyNonTensorCore_Hip() 290 …kend(CeedRunKernelDim_Hip(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)… in CeedBasisApplyNonTensorCore_Hip() 298 …edRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); in CeedBasisApplyNonTensorCore_Hip() 300 …ackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); in CeedBasisApplyNonTensorCore_Hip() 308 …eedRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); in CeedBasisApplyNonTensorCore_Hip() 310 …Backend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); in CeedBasisApplyNonTensorCore_Hip() 318 …edRunKernelDim_Hip(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); in CeedBasisApplyNonTensorCore_Hip() 320 …ackend(CeedRunKernelDim_Hip(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); in CeedBasisApplyNonTensorCore_Hip() [all …]
|
| H A D | ceed-hip-ref-operator.c | 1383 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalSetupCompile_Hip() local 1391 …edsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block)); in CeedOperatorAssembleDiagonalSetupCompile_Hip() 1462 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalCore_Hip() local 1463 CeedInt grid = CeedDivUpInt(num_elem, elems_per_block); in CeedOperatorAssembleDiagonalCore_Hip() 1469 …end(CeedRunKernelDim_Hip(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args)); in CeedOperatorAssembleDiagonalCore_Hip() 1471 …ckend(CeedRunKernelDim_Hip(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args)); in CeedOperatorAssembleDiagonalCore_Hip() 1603 asmb->elems_per_block = 1; in CeedOperatorAssembleSingleSetup_Hip() 1608 …bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > hip_data->device… in CeedOperatorAssembleSingleSetup_Hip() 1623 …asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_… in CeedOperatorAssembleSingleSetup_Hip() 1781 CeedInt grid = CeedDivUpInt(num_elem_in, asmb->elems_per_block); in CeedOperatorAssembleSingle_Hip() [all …]
|
| H A D | ceed-hip-ref.h | 135 CeedInt block_size_x, block_size_y, elems_per_block; member
|
| /libCEED/backends/cuda-ref/ |
| H A D | ceed-cuda-ref-basis.c | 260 const int elems_per_block = 1; in CeedBasisApplyNonTensorCore_Cuda() local 261 const int grid = CeedDivUpInt(num_elem, elems_per_block); in CeedBasisApplyNonTensorCore_Cuda() 289 …unKernelDim_Cuda(ceed, data->InterpTranspose, grid, block_size_x, 1, elems_per_block, interp_args)… in CeedBasisApplyNonTensorCore_Cuda() 291 …end(CeedRunKernelDim_Cuda(ceed, data->Interp, grid, block_size_x, 1, elems_per_block, interp_args)… in CeedBasisApplyNonTensorCore_Cuda() 299 …dRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, grad_args)); in CeedBasisApplyNonTensorCore_Cuda() 301 …ckend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, grad_args)); in CeedBasisApplyNonTensorCore_Cuda() 309 …edRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, div_args)); in CeedBasisApplyNonTensorCore_Cuda() 311 …ackend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, div_args)); in CeedBasisApplyNonTensorCore_Cuda() 319 …dRunKernelDim_Cuda(ceed, data->DerivTranspose, grid, block_size_x, 1, elems_per_block, curl_args)); in CeedBasisApplyNonTensorCore_Cuda() 321 …ckend(CeedRunKernelDim_Cuda(ceed, data->Deriv, grid, block_size_x, 1, elems_per_block, curl_args)); in CeedBasisApplyNonTensorCore_Cuda() [all …]
|
| H A D | ceed-cuda-ref-operator.c | 1386 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalSetupCompile_Cuda() local 1394 …edsize_idx, "USE_POINT_BLOCK", is_point_block ? 1 : 0, "BLOCK_SIZE", num_nodes * elems_per_block)); in CeedOperatorAssembleDiagonalSetupCompile_Cuda() 1465 CeedInt elems_per_block = 1; in CeedOperatorAssembleDiagonalCore_Cuda() local 1466 CeedInt grid = CeedDivUpInt(num_elem, elems_per_block); in CeedOperatorAssembleDiagonalCore_Cuda() 1472 …nd(CeedRunKernelDim_Cuda(ceed, diag->LinearPointBlock, grid, num_nodes, 1, elems_per_block, args)); in CeedOperatorAssembleDiagonalCore_Cuda() 1474 …kend(CeedRunKernelDim_Cuda(ceed, diag->LinearDiagonal, grid, num_nodes, 1, elems_per_block, args)); in CeedOperatorAssembleDiagonalCore_Cuda() 1606 asmb->elems_per_block = 1; in CeedOperatorAssembleSingleSetup_Cuda() 1611 …bool fallback = asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block > cuda_data->devic… in CeedOperatorAssembleSingleSetup_Cuda() 1626 …asmb->block_size_x * asmb->block_size_y * asmb->elems_per_block, "BLOCK_SIZE_Y", asmb->block_size_… in CeedOperatorAssembleSingleSetup_Cuda() 1784 CeedInt grid = CeedDivUpInt(num_elem_in, asmb->elems_per_block); in CeedOperatorAssembleSingle_Cuda() [all …]
|
| H A D | ceed-cuda-ref.h | 130 CeedInt block_size_x, block_size_y, elems_per_block; member
|
| /libCEED/backends/sycl-gen/ |
| H A D | ceed-sycl-gen-operator-build.sycl.cpp | 34 CeedInt elems_per_block = 64 * thread1d > 256 ? 256 / thread1d : 64; in BlockGridCalculate_Sycl_gen() local 36 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in BlockGridCalculate_Sycl_gen() 39 block_sizes[2] = elems_per_block; in BlockGridCalculate_Sycl_gen() 41 const CeedInt elems_per_block = thread1d < 4 ? 16 : 2; in BlockGridCalculate_Sycl_gen() local 45 block_sizes[2] = elems_per_block; in BlockGridCalculate_Sycl_gen() 47 const CeedInt elems_per_block = thread1d < 6 ? 4 : (thread1d < 8 ? 2 : 1); in BlockGridCalculate_Sycl_gen() local 51 block_sizes[2] = elems_per_block; in BlockGridCalculate_Sycl_gen()
|
| /libCEED/backends/hip-gen/ |
| H A D | ceed-hip-gen-operator.c | 157 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorApplyAddCore_Hip_gen() local 159 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedOperatorApplyAddCore_Hip_gen() 160 block_sizes[2] = elems_per_block; in CeedOperatorApplyAddCore_Hip_gen() 435 CeedInt elems_per_block = 64 * data->thread_1d > 256 ? 256 / data->thread_1d : 64; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() local 437 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen() 438 block_sizes[2] = elems_per_block; in CeedOperatorLinearAssembleQFunctionCore_Hip_gen()
|
| H A D | ceed-hip-gen-operator-build.cpp | 37 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in BlockGridCalculate_Hip_gen() local 39 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in BlockGridCalculate_Hip_gen() 42 block_sizes[2] = elems_per_block; in BlockGridCalculate_Hip_gen() 44 const CeedInt elems_per_block = thread_1d < 4 ? 16 : 2; in BlockGridCalculate_Hip_gen() local 48 block_sizes[2] = elems_per_block; in BlockGridCalculate_Hip_gen() 50 const CeedInt elems_per_block = thread_1d < 6 ? 4 : (thread_1d < 8 ? 2 : 1); in BlockGridCalculate_Hip_gen() local 54 block_sizes[2] = elems_per_block; in BlockGridCalculate_Hip_gen()
|
| /libCEED/backends/sycl-ref/ |
| H A D | ceed-sycl-ref.hpp | 97 CeedInt num_elem, block_size_x, block_size_y, elems_per_block; member
|
| H A D | ceed-sycl-ref-operator.sycl.cpp | 1111 int elems_per_block = 1; in CeedOperatorAssembleSingleSetup_Sycl() local 1112 asmb->elems_per_block = elems_per_block; in CeedOperatorAssembleSingleSetup_Sycl() 1119 asmb->block_size = elem_size * elem_size * elems_per_block; in CeedOperatorAssembleSingleSetup_Sycl()
|