| /libCEED/include/ceed/jit-source/hip/ |
| H A D | hip-ref-basis-nontensor-templates.h | 19 const CeedInt t_id = threadIdx.x; in Contract() local 31 for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] += d_B[i + t_id * P + d * P * Q] * val; in Contract() 34 d_V[elem * strides_elem_V + comp * strides_comp_V + d * strides_q_comp_V + t_id] = r_V[d]; in Contract() 46 const CeedInt t_id = threadIdx.x; in ContractTranspose() local 56 for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; in ContractTranspose() 58 d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V; in ContractTranspose()
|
| H A D | hip-ref-basis-nontensor.h | 60 const CeedInt t_id = threadIdx.x; in Weight() local 64 d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; in Weight()
|
| H A D | hip-shared-basis-nontensor.h | 26 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 53 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 80 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 110 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 137 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 164 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 194 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
|
| H A D | hip-types.h | 37 CeedInt t_id; member
|
| H A D | hip-shared-basis-tensor.h | 26 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 65 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 95 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 134 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 164 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 204 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 237 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 281 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 322 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 366 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() [all …]
|
| H A D | hip-shared-basis-tensor-at-points.h | 32 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 88 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 158 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 220 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 276 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__() 347 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in __launch_bounds__()
|
| H A D | hip-shared-basis-read-write-templates.h | 17 for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; in LoadMatrix()
|
| H A D | hip-gen-templates.h | 17 for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; in LoadMatrix()
|
| /libCEED/include/ceed/jit-source/cuda/ |
| H A D | cuda-ref-basis-nontensor-templates.h | 19 const CeedInt t_id = threadIdx.x; in Contract() local 31 for (CeedInt d = 0; d < Q_COMP; d++) r_V[d] += d_B[i + t_id * P + d * P * Q] * val; in Contract() 34 d_V[elem * strides_elem_V + comp * strides_comp_V + d * strides_q_comp_V + t_id] = r_V[d]; in Contract() 46 const CeedInt t_id = threadIdx.x; in ContractTranspose() local 56 for (CeedInt i = 0; i < Q; i++) r_V += d_B[t_id + i * P + d * P * Q] * U[i]; in ContractTranspose() 58 d_V[elem * strides_elem_V + comp * strides_comp_V + t_id] += r_V; in ContractTranspose()
|
| H A D | cuda-ref-basis-nontensor.h | 60 const CeedInt t_id = threadIdx.x; in Weight() local 64 d_V[elem * BASIS_Q + t_id] = q_weight[t_id]; in Weight()
|
| H A D | cuda-shared-basis-nontensor.h | 25 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in Interp() 52 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpTranspose() 79 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpTransposeAdd() 108 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in Grad() 135 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in GradTranspose() 162 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in GradTransposeAdd() 191 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in Weight()
|
| H A D | cuda-types.h | 37 CeedInt t_id; member
|
| H A D | cuda-shared-basis-tensor.h | 25 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in Interp() 64 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpCollocated() 94 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpTranspose() 133 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpCollocatedTranspose() 163 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpTransposeAdd() 202 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpCollocatedTransposeAdd() 235 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in Grad() 278 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in GradCollocated() 318 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in GradTranspose() 361 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in GradCollocatedTranspose() [all …]
|
| H A D | cuda-shared-basis-tensor-at-points.h | 31 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpAtPoints() 87 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpTransposeAtPoints() 157 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in InterpTransposeAddAtPoints() 218 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in GradAtPoints() 274 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in GradTransposeAtPoints() 345 data.t_id = threadIdx.x + threadIdx.y * blockDim.x + threadIdx.z * blockDim.y * blockDim.x; in GradTransposeAddAtPoints()
|
| H A D | cuda-shared-basis-read-write-templates.h | 17 for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; in LoadMatrix()
|
| H A D | cuda-gen-templates.h | 17 for (CeedInt i = data.t_id; i < P * Q; i += blockDim.x * blockDim.y * blockDim.z) B[i] = d_B[i]; in LoadMatrix()
|