Lines Matching refs:CeedInt
42 …TensorContract_Avx_Blocked(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, in CeedTensorContract_Avx_Blocked()
43 … const CeedScalar *restrict t, CeedTransposeMode t_mode, const CeedInt add, in CeedTensorContract_Avx_Blocked()
44 … const CeedScalar *restrict u, CeedScalar *restrict v, const CeedInt JJ, const CeedInt CC) { in CeedTensorContract_Avx_Blocked()
45 CeedInt t_stride_0 = B, t_stride_1 = 1; in CeedTensorContract_Avx_Blocked()
52 for (CeedInt a = 0; a < A; a++) { in CeedTensorContract_Avx_Blocked()
54 for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { in CeedTensorContract_Avx_Blocked()
55 for (CeedInt c = 0; c < (C / CC) * CC; c += CC) { in CeedTensorContract_Avx_Blocked()
57 for (CeedInt jj = 0; jj < JJ; jj++) { in CeedTensorContract_Avx_Blocked()
58 … for (CeedInt cc = 0; cc < CC / 4; cc++) vv[jj][cc] = loadu(&v[(a * J + j + jj) * C + c + cc * 4]); in CeedTensorContract_Avx_Blocked()
60 for (CeedInt b = 0; b < B; b++) { in CeedTensorContract_Avx_Blocked()
61 for (CeedInt jj = 0; jj < JJ; jj++) { // unroll in CeedTensorContract_Avx_Blocked()
63 for (CeedInt cc = 0; cc < CC / 4; cc++) { // unroll in CeedTensorContract_Avx_Blocked()
68 for (CeedInt jj = 0; jj < JJ; jj++) { in CeedTensorContract_Avx_Blocked()
69 … for (CeedInt cc = 0; cc < CC / 4; cc++) storeu(&v[(a * J + j + jj) * C + c + cc * 4], vv[jj][cc]); in CeedTensorContract_Avx_Blocked()
74 const CeedInt j = (J / JJ) * JJ; in CeedTensorContract_Avx_Blocked()
77 for (CeedInt c = 0; c < (C / CC) * CC; c += CC) { in CeedTensorContract_Avx_Blocked()
80 for (CeedInt jj = 0; jj < J - j; jj++) { in CeedTensorContract_Avx_Blocked()
81 … for (CeedInt cc = 0; cc < CC / 4; cc++) vv[jj][cc] = loadu(&v[(a * J + j + jj) * C + c + cc * 4]); in CeedTensorContract_Avx_Blocked()
83 for (CeedInt b = 0; b < B; b++) { in CeedTensorContract_Avx_Blocked()
84 for (CeedInt jj = 0; jj < J - j; jj++) { // doesn't unroll in CeedTensorContract_Avx_Blocked()
87 for (CeedInt cc = 0; cc < CC / 4; cc++) { // unroll in CeedTensorContract_Avx_Blocked()
92 for (CeedInt jj = 0; jj < J - j; jj++) { in CeedTensorContract_Avx_Blocked()
93 … for (CeedInt cc = 0; cc < CC / 4; cc++) storeu(&v[(a * J + j + jj) * C + c + cc * 4], vv[jj][cc]); in CeedTensorContract_Avx_Blocked()
104 …nsorContract_Avx_Remainder(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J, in CeedTensorContract_Avx_Remainder()
105 … const CeedScalar *restrict t, CeedTransposeMode t_mode, const CeedInt add, in CeedTensorContract_Avx_Remainder()
106 … const CeedScalar *restrict u, CeedScalar *restrict v, const CeedInt JJ, const CeedInt CC) { in CeedTensorContract_Avx_Remainder()
107 CeedInt t_stride_0 = B, t_stride_1 = 1; in CeedTensorContract_Avx_Remainder()
114 const CeedInt J_break = J % JJ ? (J / JJ) * JJ : (J / JJ - 1) * JJ; in CeedTensorContract_Avx_Remainder()
116 for (CeedInt a = 0; a < A; a++) { in CeedTensorContract_Avx_Remainder()
118 for (CeedInt c = (C / CC) * CC; c < C; c += 4) { in CeedTensorContract_Avx_Remainder()
120 for (CeedInt j = 0; j < J_break; j += JJ) { in CeedTensorContract_Avx_Remainder()
123 for (CeedInt jj = 0; jj < JJ; jj++) vv[jj] = loadu(&v[(a * J + j + jj) * C + c]); in CeedTensorContract_Avx_Remainder()
124 for (CeedInt b = 0; b < B; b++) { in CeedTensorContract_Avx_Remainder()
131 for (CeedInt jj = 0; jj < JJ; jj++) { // unroll in CeedTensorContract_Avx_Remainder()
135 for (CeedInt jj = 0; jj < JJ; jj++) storeu(&v[(a * J + j + jj) * C + c], vv[jj]); in CeedTensorContract_Avx_Remainder()
139 for (CeedInt j = J_break; j < J; j++) { in CeedTensorContract_Avx_Remainder()
140 for (CeedInt b = 0; b < B; b++) { in CeedTensorContract_Avx_Remainder()
143 … for (CeedInt c = (C / CC) * CC; c < C; c++) v[(a * J + j) * C + c] += tq * u[(a * B + b) * C + c]; in CeedTensorContract_Avx_Remainder()
153 …dTensorContract_Avx_Single(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J… in CeedTensorContract_Avx_Single()
154 … CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v, in CeedTensorContract_Avx_Single()
155 const CeedInt AA, const CeedInt JJ) { in CeedTensorContract_Avx_Single()
156 CeedInt t_stride_0 = B, t_stride_1 = 1; in CeedTensorContract_Avx_Single()
164 for (CeedInt a = 0; a < (A / AA) * AA; a += AA) { in CeedTensorContract_Avx_Single()
165 for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { in CeedTensorContract_Avx_Single()
168 for (CeedInt aa = 0; aa < AA; aa++) { in CeedTensorContract_Avx_Single()
169 for (CeedInt jj = 0; jj < JJ / 4; jj++) vv[aa][jj] = loadu(&v[(a + aa) * J + j + jj * 4]); in CeedTensorContract_Avx_Single()
171 for (CeedInt b = 0; b < B; b++) { in CeedTensorContract_Avx_Single()
172 for (CeedInt jj = 0; jj < JJ / 4; jj++) { // unroll in CeedTensorContract_Avx_Single()
176 for (CeedInt aa = 0; aa < AA; aa++) { // unroll in CeedTensorContract_Avx_Single()
181 for (CeedInt aa = 0; aa < AA; aa++) { in CeedTensorContract_Avx_Single()
182 for (CeedInt jj = 0; jj < JJ / 4; jj++) storeu(&v[(a + aa) * J + j + jj * 4], vv[aa][jj]); in CeedTensorContract_Avx_Single()
187 const CeedInt a = (A / AA) * AA; in CeedTensorContract_Avx_Single()
189 for (CeedInt j = 0; j < (J / JJ) * JJ; j += JJ) { in CeedTensorContract_Avx_Single()
192 for (CeedInt aa = 0; aa < A - a; aa++) { in CeedTensorContract_Avx_Single()
193 for (CeedInt jj = 0; jj < JJ / 4; jj++) vv[aa][jj] = loadu(&v[(a + aa) * J + j + jj * 4]); in CeedTensorContract_Avx_Single()
195 for (CeedInt b = 0; b < B; b++) { in CeedTensorContract_Avx_Single()
196 for (CeedInt jj = 0; jj < JJ / 4; jj++) { // unroll in CeedTensorContract_Avx_Single()
200 for (CeedInt aa = 0; aa < A - a; aa++) { // unroll in CeedTensorContract_Avx_Single()
205 for (CeedInt aa = 0; aa < A - a; aa++) { in CeedTensorContract_Avx_Single()
206 for (CeedInt jj = 0; jj < JJ / 4; jj++) storeu(&v[(a + aa) * J + j + jj * 4], vv[aa][jj]); in CeedTensorContract_Avx_Single()
210 const CeedInt A_break = A % AA ? (A / AA) * AA : (A / AA - 1) * AA; in CeedTensorContract_Avx_Single()
213 for (CeedInt j = (J / JJ) * JJ; j < J; j += 4) { in CeedTensorContract_Avx_Single()
215 for (CeedInt a = 0; a < A_break; a += AA) { in CeedTensorContract_Avx_Single()
218 for (CeedInt aa = 0; aa < AA; aa++) vv[aa] = loadu(&v[(a + aa) * J + j]); in CeedTensorContract_Avx_Single()
219 for (CeedInt b = 0; b < B; b++) { in CeedTensorContract_Avx_Single()
233 for (CeedInt aa = 0; aa < AA; aa++) { // unroll in CeedTensorContract_Avx_Single()
237 for (CeedInt aa = 0; aa < AA; aa++) storeu(&v[(a + aa) * J + j], vv[aa]); in CeedTensorContract_Avx_Single()
241 for (CeedInt b = 0; b < B; b++) { in CeedTensorContract_Avx_Single()
242 for (CeedInt j = (J / JJ) * JJ; j < J; j++) { in CeedTensorContract_Avx_Single()
245 for (CeedInt a = A_break; a < A; a++) v[a * J + j] += tq * u[a * B + b]; in CeedTensorContract_Avx_Single()
254 …orContract_Avx_Blocked_4_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J… in CeedTensorContract_Avx_Blocked_4_8()
255 …CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v)… in CeedTensorContract_Avx_Blocked_4_8()
258 …Contract_Avx_Remainder_8_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J… in CeedTensorContract_Avx_Remainder_8_8()
259 …CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v)… in CeedTensorContract_Avx_Remainder_8_8()
262 …sorContract_Avx_Single_4_8(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J… in CeedTensorContract_Avx_Single_4_8()
263 …CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v)… in CeedTensorContract_Avx_Single_4_8()
270 …eedTensorContractApply_Avx(CeedTensorContract contract, CeedInt A, CeedInt B, CeedInt C, CeedInt J… in CeedTensorContractApply_Avx()
271 …CeedTransposeMode t_mode, const CeedInt add, const CeedScalar *restrict u, CeedScalar *restrict v)… in CeedTensorContractApply_Avx()
272 const CeedInt blk_size = 8; in CeedTensorContractApply_Avx()
275 for (CeedInt q = 0; q < A * J * C; q++) v[q] = (CeedScalar)0.0; in CeedTensorContractApply_Avx()