ceed-cuda-shared-basis.c - OpenGrok cross reference for /libCEED/backends/cuda-shared/ceed-cuda-shared-basis.c

Lines Matching refs:elems_per_block
62 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr…  in CeedBasisApplyTensorCore_Cuda_shared()  local
63         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyTensorCore_Cuda_shared()
64         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
68                                                       elems_per_block, shared_mem, interp_args));  in CeedBasisApplyTensorCore_Cuda_shared()
70 …unKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, …  in CeedBasisApplyTensorCore_Cuda_shared()
75 …      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);  in CeedBasisApplyTensorCore_Cuda_shared()  local
76         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyTensorCore_Cuda_shared()
77         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
81 …                                             thread_1d, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyTensorCore_Cuda_shared()
83 …elDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,  in CeedBasisApplyTensorCore_Cuda_shared()
87         CeedInt elems_per_block = 1;  in CeedBasisApplyTensorCore_Cuda_shared()  local
88         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyTensorCore_Cuda_shared()
89         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
93 …                                             thread_1d, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyTensorCore_Cuda_shared()
95 …elDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,  in CeedBasisApplyTensorCore_Cuda_shared()
116 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr…  in CeedBasisApplyTensorCore_Cuda_shared()  local
117         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyTensorCore_Cuda_shared()
118         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
122                                                       elems_per_block, shared_mem, grad_args));  in CeedBasisApplyTensorCore_Cuda_shared()
124 …dRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, …  in CeedBasisApplyTensorCore_Cuda_shared()
129 …      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);  in CeedBasisApplyTensorCore_Cuda_shared()  local
130         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyTensorCore_Cuda_shared()
131         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
135 …                                               thread_1d, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyTensorCore_Cuda_shared()
137 …elDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, …  in CeedBasisApplyTensorCore_Cuda_shared()
140         CeedInt elems_per_block = 1;  in CeedBasisApplyTensorCore_Cuda_shared()  local
141         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyTensorCore_Cuda_shared()
142         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyTensorCore_Cuda_shared()
146 …                                               thread_1d, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyTensorCore_Cuda_shared()
148 …elDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, …  in CeedBasisApplyTensorCore_Cuda_shared()
160         const CeedInt elems_per_block = block_size / Q_1d;  in CeedBasisApplyTensorCore_Cuda_shared()  local
161 …     const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyTensorCore_Cuda_shared()
163 …CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, wei…  in CeedBasisApplyTensorCore_Cuda_shared()
166         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;  in CeedBasisApplyTensorCore_Cuda_shared()  local
167 …     const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyTensorCore_Cuda_shared()
169 …end(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)…  in CeedBasisApplyTensorCore_Cuda_shared()
172         const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1;  in CeedBasisApplyTensorCore_Cuda_shared()  local
173 …     const CeedInt grid_size       = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyTensorCore_Cuda_shared()
175 …end(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)…  in CeedBasisApplyTensorCore_Cuda_shared()
326 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr…  in CeedBasisApplyAtPointsCore_Cuda_shared()  local
327         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyAtPointsCore_Cuda_shared()
328         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
332 …                                          thread_1d, 1, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
334 …elDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem,  in CeedBasisApplyAtPointsCore_Cuda_shared()
340 …      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);  in CeedBasisApplyAtPointsCore_Cuda_shared()  local
341         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyAtPointsCore_Cuda_shared()
342         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
346 …                                  thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
348 …red_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,  in CeedBasisApplyAtPointsCore_Cuda_shared()
352         CeedInt elems_per_block = 1;  in CeedBasisApplyAtPointsCore_Cuda_shared()  local
353         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyAtPointsCore_Cuda_shared()
354         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
358 …                                  thread_1d, thread_1d, elems_per_block, shared_mem, interp_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
360 …red_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,  in CeedBasisApplyAtPointsCore_Cuda_shared()
376 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr…  in CeedBasisApplyAtPointsCore_Cuda_shared()  local
377         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyAtPointsCore_Cuda_shared()
378         CeedInt shared_mem      = elems_per_block * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
382 …                                            thread_1d, 1, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
384 …elDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, …  in CeedBasisApplyAtPointsCore_Cuda_shared()
389 …      CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1);  in CeedBasisApplyAtPointsCore_Cuda_shared()  local
390         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyAtPointsCore_Cuda_shared()
391         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
395 …                                    thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
397 …hared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,  in CeedBasisApplyAtPointsCore_Cuda_shared()
401         CeedInt elems_per_block = 1;  in CeedBasisApplyAtPointsCore_Cuda_shared()  local
402         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyAtPointsCore_Cuda_shared()
403         CeedInt shared_mem      = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar);  in CeedBasisApplyAtPointsCore_Cuda_shared()
407 …                                    thread_1d, thread_1d, elems_per_block, shared_mem, grad_args));  in CeedBasisApplyAtPointsCore_Cuda_shared()
409 …hared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem,  in CeedBasisApplyAtPointsCore_Cuda_shared()
485 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr…  in CeedBasisApplyNonTensorCore_Cuda_shared()  local
486         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyNonTensorCore_Cuda_shared()
487         CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);  in CeedBasisApplyNonTensorCore_Cuda_shared()
491                                                       elems_per_block, shared_mem, interp_args));  in CeedBasisApplyNonTensorCore_Cuda_shared()
493 …edRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, …  in CeedBasisApplyNonTensorCore_Cuda_shared()
509 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr…  in CeedBasisApplyNonTensorCore_Cuda_shared()  local
510         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyNonTensorCore_Cuda_shared()
511         CeedInt shared_mem      = elems_per_block * thread * sizeof(CeedScalar);  in CeedBasisApplyNonTensorCore_Cuda_shared()
515                                                       elems_per_block, shared_mem, grad_args));  in CeedBasisApplyNonTensorCore_Cuda_shared()
517 …CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, …  in CeedBasisApplyNonTensorCore_Cuda_shared()
533 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr…  in CeedBasisApplyNonTensorCore_Cuda_shared()  local
534         CeedInt grid            = num_elem / elems_per_block + (num_elem % elems_per_block > 0);  in CeedBasisApplyNonTensorCore_Cuda_shared()
536 …CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight…  in CeedBasisApplyNonTensorCore_Cuda_shared()