Lines Matching +full:- +full:vv
1 // Copyright (c) 2017-2026, Lawrence Livermore National Security, LLC and other CEED contributors.
2 // All Rights Reserved. See the top-level LICENSE and NOTICE files for details.
4 // SPDX-License-Identifier: BSD-2-Clause
39 //------------------------------------------------------------------------------
41 //------------------------------------------------------------------------------
56 rtype vv[JJ][CC / 4]; // Output tile to be held in registers in CeedTensorContract_Avx_Blocked() local
58 … for (CeedInt cc = 0; cc < CC / 4; cc++) vv[jj][cc] = loadu(&v[(a * J + j + jj) * C + c + cc * 4]); in CeedTensorContract_Avx_Blocked()
64 fmadd(vv[jj][cc], tqv, loadu(&u[(a * B + b) * C + c + cc * 4])); in CeedTensorContract_Avx_Blocked()
69 … for (CeedInt cc = 0; cc < CC / 4; cc++) storeu(&v[(a * J + j + jj) * C + c + cc * 4], vv[jj][cc]); in CeedTensorContract_Avx_Blocked()
78 rtype vv[JJ][CC / 4]; // Output tile to be held in registers in CeedTensorContract_Avx_Blocked() local
80 for (CeedInt jj = 0; jj < J - j; jj++) { in CeedTensorContract_Avx_Blocked()
81 … for (CeedInt cc = 0; cc < CC / 4; cc++) vv[jj][cc] = loadu(&v[(a * J + j + jj) * C + c + cc * 4]); in CeedTensorContract_Avx_Blocked()
84 for (CeedInt jj = 0; jj < J - j; jj++) { // doesn't unroll in CeedTensorContract_Avx_Blocked()
88 fmadd(vv[jj][cc], tqv, loadu(&u[(a * B + b) * C + c + cc * 4])); in CeedTensorContract_Avx_Blocked()
92 for (CeedInt jj = 0; jj < J - j; jj++) { in CeedTensorContract_Avx_Blocked()
93 … for (CeedInt cc = 0; cc < CC / 4; cc++) storeu(&v[(a * J + j + jj) * C + c + cc * 4], vv[jj][cc]); in CeedTensorContract_Avx_Blocked()
101 //------------------------------------------------------------------------------
103 //------------------------------------------------------------------------------
114 const CeedInt J_break = J % JJ ? (J / JJ) * JJ : (J / JJ - 1) * JJ; in CeedTensorContract_Avx_Remainder()
121 rtype vv[JJ]; // Output tile to be held in registers in CeedTensorContract_Avx_Remainder() local
123 for (CeedInt jj = 0; jj < JJ; jj++) vv[jj] = loadu(&v[(a * J + j + jj) * C + c]); in CeedTensorContract_Avx_Remainder()
127 if (C - c == 1) tqu = set(0.0, 0.0, 0.0, u[(a * B + b) * C + c + 0]); in CeedTensorContract_Avx_Remainder()
128 … else if (C - c == 2) tqu = set(0.0, 0.0, u[(a * B + b) * C + c + 1], u[(a * B + b) * C + c + 0]); in CeedTensorContract_Avx_Remainder()
129 …else if (C - c == 3) tqu = set(0.0, u[(a * B + b) * C + c + 2], u[(a * B + b) * C + c + 1], u[(a *… in CeedTensorContract_Avx_Remainder()
132 fmadd(vv[jj], tqu, set1(t[(j + jj) * t_stride_0 + b * t_stride_1])); in CeedTensorContract_Avx_Remainder()
135 for (CeedInt jj = 0; jj < JJ; jj++) storeu(&v[(a * J + j + jj) * C + c], vv[jj]); in CeedTensorContract_Avx_Remainder()
150 //------------------------------------------------------------------------------
152 //------------------------------------------------------------------------------
166 rtype vv[AA][JJ / 4]; // Output tile to be held in registers in CeedTensorContract_Avx_Single() local
169 for (CeedInt jj = 0; jj < JJ / 4; jj++) vv[aa][jj] = loadu(&v[(a + aa) * J + j + jj * 4]); in CeedTensorContract_Avx_Single()
177 fmadd(vv[aa][jj], tqv, set1(u[(a + aa) * B + b])); in CeedTensorContract_Avx_Single()
182 for (CeedInt jj = 0; jj < JJ / 4; jj++) storeu(&v[(a + aa) * J + j + jj * 4], vv[aa][jj]); in CeedTensorContract_Avx_Single()
190 rtype vv[AA][JJ / 4]; // Output tile to be held in registers in CeedTensorContract_Avx_Single() local
192 for (CeedInt aa = 0; aa < A - a; aa++) { in CeedTensorContract_Avx_Single()
193 for (CeedInt jj = 0; jj < JJ / 4; jj++) vv[aa][jj] = loadu(&v[(a + aa) * J + j + jj * 4]); in CeedTensorContract_Avx_Single()
200 for (CeedInt aa = 0; aa < A - a; aa++) { // unroll in CeedTensorContract_Avx_Single()
201 fmadd(vv[aa][jj], tqv, set1(u[(a + aa) * B + b])); in CeedTensorContract_Avx_Single()
205 for (CeedInt aa = 0; aa < A - a; aa++) { in CeedTensorContract_Avx_Single()
206 for (CeedInt jj = 0; jj < JJ / 4; jj++) storeu(&v[(a + aa) * J + j + jj * 4], vv[aa][jj]); in CeedTensorContract_Avx_Single()
210 const CeedInt A_break = A % AA ? (A / AA) * AA : (A / AA - 1) * AA; in CeedTensorContract_Avx_Single()
216 rtype vv[AA]; // Output tile to be held in registers in CeedTensorContract_Avx_Single() local
218 for (CeedInt aa = 0; aa < AA; aa++) vv[aa] = loadu(&v[(a + aa) * J + j]); in CeedTensorContract_Avx_Single()
222 if (J - j == 1) { in CeedTensorContract_Avx_Single()
224 } else if (J - j == 2) { in CeedTensorContract_Avx_Single()
226 } else if (J - 3 == j) { in CeedTensorContract_Avx_Single()
234 fmadd(vv[aa], tqv, set1(u[(a + aa) * B + b])); in CeedTensorContract_Avx_Single()
237 for (CeedInt aa = 0; aa < AA; aa++) storeu(&v[(a + aa) * J + j], vv[aa]); in CeedTensorContract_Avx_Single()
251 //------------------------------------------------------------------------------
252 // Tensor Contract - Common Sizes
253 //------------------------------------------------------------------------------
267 //------------------------------------------------------------------------------
269 //------------------------------------------------------------------------------
290 //------------------------------------------------------------------------------
292 //------------------------------------------------------------------------------
298 //------------------------------------------------------------------------------