Lines Matching refs:elems_per_block
62 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared() local
63 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
64 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
68 elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared()
70 …unKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Cuda_shared()
75 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared() local
76 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
77 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
81 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared()
83 …elDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyTensorCore_Cuda_shared()
87 CeedInt elems_per_block = 1; in CeedBasisApplyTensorCore_Cuda_shared() local
88 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
89 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
93 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Cuda_shared()
95 …elDimShared_Cuda(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyTensorCore_Cuda_shared()
116 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyTensorCore_Cuda_shared() local
117 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
118 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
122 elems_per_block, shared_mem, grad_args)); in CeedBasisApplyTensorCore_Cuda_shared()
124 …dRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Cuda_shared()
129 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyTensorCore_Cuda_shared() local
130 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
131 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
135 … thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyTensorCore_Cuda_shared()
137 …elDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Cuda_shared()
140 CeedInt elems_per_block = 1; in CeedBasisApplyTensorCore_Cuda_shared() local
141 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
142 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Cuda_shared()
146 … thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyTensorCore_Cuda_shared()
148 …elDimShared_Cuda(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Cuda_shared()
160 const CeedInt elems_per_block = block_size / Q_1d; in CeedBasisApplyTensorCore_Cuda_shared() local
161 … const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
163 …CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, wei… in CeedBasisApplyTensorCore_Cuda_shared()
166 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Cuda_shared() local
167 … const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
169 …end(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)… in CeedBasisApplyTensorCore_Cuda_shared()
172 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Cuda_shared() local
173 … const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Cuda_shared()
175 …end(CeedRunKernelDim_Cuda(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)… in CeedBasisApplyTensorCore_Cuda_shared()
326 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyAtPointsCore_Cuda_shared() local
327 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Cuda_shared()
328 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
332 … thread_1d, 1, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
334 …elDimShared_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, in CeedBasisApplyAtPointsCore_Cuda_shared()
340 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyAtPointsCore_Cuda_shared() local
341 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Cuda_shared()
342 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
346 … thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
348 …red_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyAtPointsCore_Cuda_shared()
352 CeedInt elems_per_block = 1; in CeedBasisApplyAtPointsCore_Cuda_shared() local
353 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Cuda_shared()
354 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
358 … thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
360 …red_Cuda(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyAtPointsCore_Cuda_shared()
376 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyAtPointsCore_Cuda_shared() local
377 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Cuda_shared()
378 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
382 … thread_1d, 1, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
384 …elDimShared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyAtPointsCore_Cuda_shared()
389 … CeedInt elems_per_block = CeedIntMax(thread_1d < 7 ? opt_elems[thread_1d] / num_comp : 1, 1); in CeedBasisApplyAtPointsCore_Cuda_shared() local
390 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Cuda_shared()
391 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
395 … thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
397 …hared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyAtPointsCore_Cuda_shared()
401 CeedInt elems_per_block = 1; in CeedBasisApplyAtPointsCore_Cuda_shared() local
402 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Cuda_shared()
403 CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Cuda_shared()
407 … thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyAtPointsCore_Cuda_shared()
409 …hared_Cuda(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyAtPointsCore_Cuda_shared()
485 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyNonTensorCore_Cuda_shared() local
486 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyNonTensorCore_Cuda_shared()
487 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared()
491 elems_per_block, shared_mem, interp_args)); in CeedBasisApplyNonTensorCore_Cuda_shared()
493 …edRunKernelDimShared_Cuda(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, … in CeedBasisApplyNonTensorCore_Cuda_shared()
509 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyNonTensorCore_Cuda_shared() local
510 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyNonTensorCore_Cuda_shared()
511 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Cuda_shared()
515 elems_per_block, shared_mem, grad_args)); in CeedBasisApplyNonTensorCore_Cuda_shared()
517 …CeedRunKernelDimShared_Cuda(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, … in CeedBasisApplyNonTensorCore_Cuda_shared()
533 …CeedInt elems_per_block = CeedIntMin(ceed_Cuda->device_prop.maxThreadsDim[2], CeedIntMax(512 / thr… in CeedBasisApplyNonTensorCore_Cuda_shared() local
534 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyNonTensorCore_Cuda_shared()
536 …CeedCallBackend(CeedRunKernelDim_Cuda(ceed, data->Weight, grid, thread, elems_per_block, 1, weight… in CeedBasisApplyNonTensorCore_Cuda_shared()