ceed-cuda-shared-basis.c - OpenGrok cross reference for /libCEED/backends/cuda-shared/ceed-cuda-shared-basis.c

Lines Matching refs:thread_1d
56       CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);  in CeedBasisApplyTensorCore_Cuda_shared()  local
62 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));  in CeedBasisApplyTensorCore_Cuda_shared()
64         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
67 …_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d, 1,  in CeedBasisApplyTensorCore_Cuda_shared()
70 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_pe…  in CeedBasisApplyTensorCore_Cuda_shared()
75 …      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);  in CeedBasisApplyTensorCore_Cuda_shared()
77         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
80 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,  in CeedBasisApplyTensorCore_Cuda_shared()
81 …                                             thread_1d, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyTensorCore_Cuda_shared()
83 …ckend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_…  in CeedBasisApplyTensorCore_Cuda_shared()
89         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
92 …red_Cuda(ceed, apply_add ? data->InterpTransposeAdd : data->InterpTranspose, NULL, grid, thread_1d,  in CeedBasisApplyTensorCore_Cuda_shared()
93 …                                             thread_1d, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyTensorCore_Cuda_shared()
95 …ckend(CeedRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_…  in CeedBasisApplyTensorCore_Cuda_shared()
106       CeedInt     thread_1d = CeedIntMax(Q_1d, P_1d);  in CeedBasisApplyTensorCore_Cuda_shared()  local
116 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));  in CeedBasisApplyTensorCore_Cuda_shared()
118         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
121 …ared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d, 1,  in CeedBasisApplyTensorCore_Cuda_shared()
124 …CeedCallBackend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_…  in CeedBasisApplyTensorCore_Cuda_shared()
129 …      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);  in CeedBasisApplyTensorCore_Cuda_shared()
131         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
134 …mShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d,  in CeedBasisApplyTensorCore_Cuda_shared()
135 …                                               thread_1d, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyTensorCore_Cuda_shared()
137 …Backend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_…  in CeedBasisApplyTensorCore_Cuda_shared()
142         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
145 …mShared_Cuda(ceed, apply_add ? data->GradTransposeAdd : data->GradTranspose, NULL, grid, thread_1d,  in CeedBasisApplyTensorCore_Cuda_shared()
146 …                                               thread_1d, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyTensorCore_Cuda_shared()
148 …Backend(CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_…  in CeedBasisApplyTensorCore_Cuda_shared()
320       CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);  in CeedBasisApplyAtPointsCore_Cuda_shared()  local
326 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));  in CeedBasisApplyAtPointsCore_Cuda_shared()
328         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
332 …                                          thread_1d, 1, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
334 …end(CeedRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_bl…  in CeedBasisApplyAtPointsCore_Cuda_shared()
340 …      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);  in CeedBasisApplyAtPointsCore_Cuda_shared()
342         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
346 …                                  thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
348 …edRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_…  in CeedBasisApplyAtPointsCore_Cuda_shared()
354         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
358 …                                  thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
360 …edRunKernelDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_…  in CeedBasisApplyAtPointsCore_Cuda_shared()
370       CeedInt thread_1d = CeedIntMax(Q_1d, P_1d);  in CeedBasisApplyAtPointsCore_Cuda_shared()  local
376 …ms_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thread_1d, 1));  in CeedBasisApplyAtPointsCore_Cuda_shared()
378         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
382 …                                            thread_1d, 1, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
384 …ckend(CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_bl…  in CeedBasisApplyAtPointsCore_Cuda_shared()
389 …      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);  in CeedBasisApplyAtPointsCore_Cuda_shared()
391         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
395 …                                    thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
397 …CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_…  in CeedBasisApplyAtPointsCore_Cuda_shared()
403         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
407 …                                    thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
409 …CeedRunKernelDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_…  in CeedBasisApplyAtPointsCore_Cuda_shared()