Lines Matching refs:thread_1d
56 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisApplyTensorCore_Cuda_shared() local
62 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); in CeedBasisApplyTensorCore_Cuda_shared()
64 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
67 …_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1, in CeedBasisApplyTensorCore_Cuda_shared()
70 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_pe… in CeedBasisApplyTensorCore_Cuda_shared()
75 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared()
77 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
80 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, in CeedBasisApplyTensorCore_Cuda_shared()
81 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared()
83 …ckend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyTensorCore_Cuda_shared()
89 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
92 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, in CeedBasisApplyTensorCore_Cuda_shared()
93 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared()
95 …ckend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyTensorCore_Cuda_shared()
106 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisApplyTensorCore_Cuda_shared() local
116 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); in CeedBasisApplyTensorCore_Cuda_shared()
118 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
121 …ared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1, in CeedBasisApplyTensorCore_Cuda_shared()
124 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_… in CeedBasisApplyTensorCore_Cuda_shared()
129 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared()
131 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
134 …mShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, in CeedBasisApplyTensorCore_Cuda_shared()
135 … thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyTensorCore_Cuda_shared()
137 …Backend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyTensorCore_Cuda_shared()
142 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
145 …mShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, in CeedBasisApplyTensorCore_Cuda_shared()
146 … thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyTensorCore_Cuda_shared()
148 …Backend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyTensorCore_Cuda_shared()
320 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisApplyAtPointsCore_Cuda_shared() local
326 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); in CeedBasisApplyAtPointsCore_Cuda_shared()
328 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
332 … thread_1d, 1, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
334 …end(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_bl… in CeedBasisApplyAtPointsCore_Cuda_shared()
340 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyAtPointsCore_Cuda_shared()
342 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
346 … thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
348 …edRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyAtPointsCore_Cuda_shared()
354 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
358 … thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
360 …edRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyAtPointsCore_Cuda_shared()
370 CeedInt thread_1d = CeedIntMax(Q_1d, P_1d); in CeedBasisApplyAtPointsCore_Cuda_shared() local
376 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1)); in CeedBasisApplyAtPointsCore_Cuda_shared()
378 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
382 … thread_1d, 1, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
384 …ckend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_bl… in CeedBasisApplyAtPointsCore_Cuda_shared()
389 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyAtPointsCore_Cuda_shared()
391 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
395 … thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
397 …CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyAtPointsCore_Cuda_shared()
403 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
407 … thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
409 …CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_… in CeedBasisApplyAtPointsCore_Cuda_shared()