Lines Matching refs:data
22 SharedData_Hip data; in __launch_bounds__() local
23 data.t_id_x = threadIdx.x; in __launch_bounds__()
24 data.t_id_y = threadIdx.y; in __launch_bounds__()
25 data.t_id_z = threadIdx.z; in __launch_bounds__()
26 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
27 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
34 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); in __launch_bounds__()
40 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D,… in __launch_bounds__()
41 Interp1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V); in __launch_bounds__()
42 …WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D… in __launch_bounds__()
44 …ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem… in __launch_bounds__()
45 InterpTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V); in __launch_bounds__()
46 …WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_ele… in __launch_bounds__()
48 …ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_… in __launch_bounds__()
50 InterpTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V); in __launch_bounds__()
51 …WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q… in __launch_bounds__()
61 SharedData_Hip data; in __launch_bounds__() local
62 data.t_id_x = threadIdx.x; in __launch_bounds__()
63 data.t_id_y = threadIdx.y; in __launch_bounds__()
64 data.t_id_z = threadIdx.z; in __launch_bounds__()
65 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
66 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
73 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D,… in __launch_bounds__()
74 …WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D… in __launch_bounds__()
76 …ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem… in __launch_bounds__()
77 …WriteElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_ele… in __launch_bounds__()
79 …ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_… in __launch_bounds__()
81 …WriteElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q… in __launch_bounds__()
91 SharedData_Hip data; in __launch_bounds__() local
92 data.t_id_x = threadIdx.x; in __launch_bounds__()
93 data.t_id_y = threadIdx.y; in __launch_bounds__()
94 data.t_id_z = threadIdx.z; in __launch_bounds__()
95 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
96 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
103 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); in __launch_bounds__()
109 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D,… in __launch_bounds__()
110 InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V); in __launch_bounds__()
111 …WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D… in __launch_bounds__()
113 …ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem… in __launch_bounds__()
114 … InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V); in __launch_bounds__()
115 …WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_ele… in __launch_bounds__()
117 …ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_… in __launch_bounds__()
119 … InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V); in __launch_bounds__()
120 …WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P… in __launch_bounds__()
130 SharedData_Hip data; in __launch_bounds__() local
131 data.t_id_x = threadIdx.x; in __launch_bounds__()
132 data.t_id_y = threadIdx.y; in __launch_bounds__()
133 data.t_id_z = threadIdx.z; in __launch_bounds__()
134 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
135 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
142 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D,… in __launch_bounds__()
143 …WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D… in __launch_bounds__()
145 …ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem… in __launch_bounds__()
146 …WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_ele… in __launch_bounds__()
148 …ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_… in __launch_bounds__()
150 …WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P… in __launch_bounds__()
160 SharedData_Hip data; in __launch_bounds__() local
161 data.t_id_x = threadIdx.x; in __launch_bounds__()
162 data.t_id_y = threadIdx.y; in __launch_bounds__()
163 data.t_id_z = threadIdx.z; in __launch_bounds__()
164 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
165 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
172 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); in __launch_bounds__()
178 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D,… in __launch_bounds__()
179 InterpTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V); in __launch_bounds__()
180 …SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, … in __launch_bounds__()
182 …ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem… in __launch_bounds__()
183 … InterpTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V); in __launch_bounds__()
184 …SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem,… in __launch_bounds__()
186 …ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_… in __launch_bounds__()
188 … InterpTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, r_V); in __launch_bounds__()
189 …SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1… in __launch_bounds__()
200 SharedData_Hip data; in __launch_bounds__() local
201 data.t_id_x = threadIdx.x; in __launch_bounds__()
202 data.t_id_y = threadIdx.y; in __launch_bounds__()
203 data.t_id_z = threadIdx.z; in __launch_bounds__()
204 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
205 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
212 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D,… in __launch_bounds__()
213 …SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, … in __launch_bounds__()
215 …ReadElementStrided2d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem… in __launch_bounds__()
216 …SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem,… in __launch_bounds__()
218 …ReadElementStrided3d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_… in __launch_bounds__()
220 …SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1… in __launch_bounds__()
233 SharedData_Hip data; in __launch_bounds__() local
234 data.t_id_x = threadIdx.x; in __launch_bounds__()
235 data.t_id_y = threadIdx.y; in __launch_bounds__()
236 data.t_id_z = threadIdx.z; in __launch_bounds__()
237 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
238 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
245 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); in __launch_bounds__()
247 LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G); in __launch_bounds__()
253 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D,… in __launch_bounds__()
254 Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V); in __launch_bounds__()
255 …WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D… in __launch_bounds__()
257 …ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem… in __launch_bounds__()
258 GradTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V); in __launch_bounds__()
259 …WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_… in __launch_bounds__()
262 …ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_… in __launch_bounds__()
264 …radTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V… in __launch_bounds__()
265 … else GradTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V); in __launch_bounds__()
266 …WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_… in __launch_bounds__()
277 SharedData_Hip data; in __launch_bounds__() local
278 data.t_id_x = threadIdx.x; in __launch_bounds__()
279 data.t_id_y = threadIdx.y; in __launch_bounds__()
280 data.t_id_z = threadIdx.z; in __launch_bounds__()
281 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
282 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
289 LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G); in __launch_bounds__()
295 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D,… in __launch_bounds__()
296 Grad1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V); in __launch_bounds__()
297 …WriteElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D… in __launch_bounds__()
299 …ReadElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem… in __launch_bounds__()
300 …GradTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s… in __launch_bounds__()
301 …WriteElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_… in __launch_bounds__()
304 …ReadElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_… in __launch_bounds__()
306 …GradTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s… in __launch_bounds__()
307 …WriteElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_… in __launch_bounds__()
318 SharedData_Hip data; in __launch_bounds__() local
319 data.t_id_x = threadIdx.x; in __launch_bounds__()
320 data.t_id_y = threadIdx.y; in __launch_bounds__()
321 data.t_id_z = threadIdx.z; in __launch_bounds__()
322 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
323 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
330 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); in __launch_bounds__()
332 LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G); in __launch_bounds__()
338 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D,… in __launch_bounds__()
339 GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V); in __launch_bounds__()
340 …WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D… in __launch_bounds__()
342 …ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1… in __launch_bounds__()
344 …GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V… in __launch_bounds__()
345 …WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_ele… in __launch_bounds__()
347 …ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1… in __launch_bounds__()
349 …oseTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V… in __launch_bounds__()
350 …else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G… in __launch_bounds__()
351 …WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P… in __launch_bounds__()
362 SharedData_Hip data; in __launch_bounds__() local
363 data.t_id_x = threadIdx.x; in __launch_bounds__()
364 data.t_id_y = threadIdx.y; in __launch_bounds__()
365 data.t_id_z = threadIdx.z; in __launch_bounds__()
366 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
367 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
374 LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G); in __launch_bounds__()
380 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D,… in __launch_bounds__()
381 … GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V); in __launch_bounds__()
382 …WriteElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D… in __launch_bounds__()
384 …ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1… in __launch_bounds__()
386 …GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U… in __launch_bounds__()
387 …WriteElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_ele… in __launch_bounds__()
389 …ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1… in __launch_bounds__()
391 …GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U… in __launch_bounds__()
392 …WriteElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P… in __launch_bounds__()
403 SharedData_Hip data; in __launch_bounds__() local
404 data.t_id_x = threadIdx.x; in __launch_bounds__()
405 data.t_id_y = threadIdx.y; in __launch_bounds__()
406 data.t_id_z = threadIdx.z; in __launch_bounds__()
407 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
408 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
415 LoadMatrix<BASIS_P_1D, BASIS_Q_1D>(data, c_B, s_B); in __launch_bounds__()
417 LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G); in __launch_bounds__()
423 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D,… in __launch_bounds__()
424 GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V); in __launch_bounds__()
425 …SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, … in __launch_bounds__()
427 …ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1… in __launch_bounds__()
429 …GradTransposeTensor2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V… in __launch_bounds__()
430 …SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem,… in __launch_bounds__()
432 …ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1… in __launch_bounds__()
434 …oseTensorCollocated3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G, r_V… in __launch_bounds__()
435 …else GradTransposeTensor3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, s_B, s_G… in __launch_bounds__()
436 …SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1… in __launch_bounds__()
447 SharedData_Hip data; in __launch_bounds__() local
448 data.t_id_x = threadIdx.x; in __launch_bounds__()
449 data.t_id_y = threadIdx.y; in __launch_bounds__()
450 data.t_id_z = threadIdx.z; in __launch_bounds__()
451 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
452 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
459 LoadMatrix<BASIS_Q_1D, BASIS_HAS_COLLOCATED_GRAD ? BASIS_Q_1D : BASIS_P_1D>(data, c_G, s_G); in __launch_bounds__()
465 …ReadElementStrided1d<BASIS_NUM_COMP, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D,… in __launch_bounds__()
466 … GradTranspose1d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U, NULL, s_G, r_V); in __launch_bounds__()
467 …SumElementStrided1d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * num_elem, BASIS_P_1D, … in __launch_bounds__()
469 …ReadElementStrided2d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1… in __launch_bounds__()
471 …GradTransposeTensorCollocatedNodes2d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U… in __launch_bounds__()
472 …SumElementStrided2d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * num_elem,… in __launch_bounds__()
474 …ReadElementStrided3d<BASIS_NUM_COMP * BASIS_DIM, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1… in __launch_bounds__()
476 …GradTransposeTensorCollocatedNodes3d<BASIS_NUM_COMP, BASIS_P_1D, BASIS_Q_1D, BASIS_T_1D>(data, r_U… in __launch_bounds__()
477 …SumElementStrided3d<BASIS_NUM_COMP, BASIS_P_1D>(data, elem, 1, BASIS_P_1D * BASIS_P_1D * BASIS_P_1… in __launch_bounds__()
490 SharedData_Hip data; in __launch_bounds__() local
491 data.t_id_x = threadIdx.x; in __launch_bounds__()
492 data.t_id_y = threadIdx.y; in __launch_bounds__()
493 data.t_id_z = threadIdx.z; in __launch_bounds__()
494 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
495 data.slice = slice + data.t_id_z * BASIS_T_1D * (BASIS_DIM > 1 ? BASIS_T_1D : 1); in __launch_bounds__()
501 Weight1d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W); in __launch_bounds__()
502 … WriteElementStrided1d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * num_elem, BASIS_Q_1D, r_W, d_W); in __launch_bounds__()
504 WeightTensor2d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W); in __launch_bounds__()
505 …WriteElementStrided2d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * num_elem, BASIS_Q_1D… in __launch_bounds__()
507 WeightTensor3d<BASIS_P_1D, BASIS_Q_1D>(data, q_weight_1d, r_W); in __launch_bounds__()
508 …WriteElementStrided3d<1, BASIS_Q_1D>(data, elem, 1, BASIS_Q_1D * BASIS_Q_1D * BASIS_Q_1D * num_ele… in __launch_bounds__()