Lines Matching refs:thread
479 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Cuda_shared() local
485 …elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); in CeedBasisApplyNonTensorCore_Cuda_shared()
487 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared()
490 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Cuda_shared()
493 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_b… in CeedBasisApplyNonTensorCore_Cuda_shared()
503 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Cuda_shared() local
509 …elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); in CeedBasisApplyNonTensorCore_Cuda_shared()
511 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared()
514 …mShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread, 1, in CeedBasisApplyNonTensorCore_Cuda_shared()
517 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_blo… in CeedBasisApplyNonTensorCore_Cuda_shared()
527 CeedInt thread = CeedIntMax(Q, P); in CeedBasisApplyNonTensorCore_Cuda_shared() local
533 …elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread, 1)); in CeedBasisApplyNonTensorCore_Cuda_shared()
536 …CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight… in CeedBasisApplyNonTensorCore_Cuda_shared()