Lines Matching refs:elems_per_block
128 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared() local
129 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedBasisApplyTensorCore_Hip_shared()
130 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
131 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
135 elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Hip_shared()
137 …RunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Hip_shared()
141 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local
142 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
143 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
147 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Hip_shared()
149 …lDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Hip_shared()
152 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local
153 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
154 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
158 … thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyTensorCore_Hip_shared()
160 …lDimShared_Hip(ceed, data->Interp, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Hip_shared()
180 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyTensorCore_Hip_shared() local
181 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedBasisApplyTensorCore_Hip_shared()
182 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
183 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
187 elems_per_block, shared_mem, grad_args)); in CeedBasisApplyTensorCore_Hip_shared()
189 …edRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Hip_shared()
193 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local
194 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
195 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
199 elems_per_block, shared_mem, grad_args)); in CeedBasisApplyTensorCore_Hip_shared()
201 …nelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Hip_shared()
204 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyTensorCore_Hip_shared() local
205 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
206 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyTensorCore_Hip_shared()
210 elems_per_block, shared_mem, grad_args)); in CeedBasisApplyTensorCore_Hip_shared()
212 …nelDimShared_Hip(ceed, data->Grad, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, … in CeedBasisApplyTensorCore_Hip_shared()
226 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Hip_shared() local
227 … const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
229 …CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, elems_per_block, 1, weig… in CeedBasisApplyTensorCore_Hip_shared()
232 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Hip_shared() local
233 … const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
235 …kend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)… in CeedBasisApplyTensorCore_Hip_shared()
238 const CeedInt elems_per_block = opt_elems > 0 ? opt_elems : 1; in CeedBasisApplyTensorCore_Hip_shared() local
239 … const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyTensorCore_Hip_shared()
241 …kend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, Q_1d, Q_1d, elems_per_block, weight_args)… in CeedBasisApplyTensorCore_Hip_shared()
391 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyAtPointsCore_Hip_shared() local
392 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedBasisApplyAtPointsCore_Hip_shared()
393 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Hip_shared()
394 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Hip_shared()
398 … thread_1d, 1, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyAtPointsCore_Hip_shared()
400 …lDimShared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyAtPointsCore_Hip_shared()
404 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyAtPointsCore_Hip_shared() local
405 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Hip_shared()
406 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Hip_shared()
410 … thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyAtPointsCore_Hip_shared()
412 …ared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyAtPointsCore_Hip_shared()
416 const CeedInt elems_per_block = 1; in CeedBasisApplyAtPointsCore_Hip_shared() local
417 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Hip_shared()
418 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Hip_shared()
422 … thread_1d, thread_1d, elems_per_block, shared_mem, interp_args)); in CeedBasisApplyAtPointsCore_Hip_shared()
424 …ared_Hip(ceed, data->InterpAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyAtPointsCore_Hip_shared()
439 CeedInt elems_per_block = 64 * thread_1d > 256 ? 256 / thread_1d : 64; in CeedBasisApplyAtPointsCore_Hip_shared() local
440 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedBasisApplyAtPointsCore_Hip_shared()
441 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Hip_shared()
442 CeedInt shared_mem = elems_per_block * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Hip_shared()
446 … thread_1d, 1, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyAtPointsCore_Hip_shared()
448 …nelDimShared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, 1, elems_per_block, shared_mem, … in CeedBasisApplyAtPointsCore_Hip_shared()
452 const CeedInt elems_per_block = CeedIntMax(block_size / (thread_1d * thread_1d), 1); in CeedBasisApplyAtPointsCore_Hip_shared() local
453 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Hip_shared()
454 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Hip_shared()
458 … thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyAtPointsCore_Hip_shared()
460 …Shared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyAtPointsCore_Hip_shared()
464 const CeedInt elems_per_block = 1; in CeedBasisApplyAtPointsCore_Hip_shared() local
465 … CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyAtPointsCore_Hip_shared()
466 … CeedInt shared_mem = elems_per_block * thread_1d * thread_1d * sizeof(CeedScalar); in CeedBasisApplyAtPointsCore_Hip_shared()
470 … thread_1d, thread_1d, elems_per_block, shared_mem, grad_args)); in CeedBasisApplyAtPointsCore_Hip_shared()
472 …Shared_Hip(ceed, data->GradAtPoints, NULL, grid, thread_1d, thread_1d, elems_per_block, shared_mem, in CeedBasisApplyAtPointsCore_Hip_shared()
547 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; in CeedBasisApplyNonTensorCore_Hip_shared() local
548 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedBasisApplyNonTensorCore_Hip_shared()
549 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyNonTensorCore_Hip_shared()
550 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Hip_shared()
554 elems_per_block, shared_mem, interp_args)); in CeedBasisApplyNonTensorCore_Hip_shared()
556 …eedRunKernelDimShared_Hip(ceed, data->Interp, NULL, grid, thread, 1, elems_per_block, shared_mem, … in CeedBasisApplyNonTensorCore_Hip_shared()
570 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; in CeedBasisApplyNonTensorCore_Hip_shared() local
571 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedBasisApplyNonTensorCore_Hip_shared()
572 CeedInt grid = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyNonTensorCore_Hip_shared()
573 CeedInt shared_mem = elems_per_block * thread * sizeof(CeedScalar); in CeedBasisApplyNonTensorCore_Hip_shared()
577 elems_per_block, shared_mem, grad_args)); in CeedBasisApplyNonTensorCore_Hip_shared()
579 …(CeedRunKernelDimShared_Hip(ceed, data->Grad, NULL, grid, thread, 1, elems_per_block, shared_mem, … in CeedBasisApplyNonTensorCore_Hip_shared()
593 CeedInt elems_per_block = 64 * thread > 256 ? 256 / thread : 64; in CeedBasisApplyNonTensorCore_Hip_shared() local
594 elems_per_block = elems_per_block > 0 ? elems_per_block : 1; in CeedBasisApplyNonTensorCore_Hip_shared()
595 const CeedInt grid_size = num_elem / elems_per_block + (num_elem % elems_per_block > 0); in CeedBasisApplyNonTensorCore_Hip_shared()
597 …CeedCallBackend(CeedRunKernelDim_Hip(ceed, data->Weight, grid_size, thread, elems_per_block, 1, we… in CeedBasisApplyNonTensorCore_Hip_shared()