#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/*
  Special case where the matrix was ILU(0) factored in the natural
  ordering. This eliminates the need for the column and row permutation.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt           n = a->mbs;
  const PetscInt    *ai = a->i, *aj = a->j;
  const PetscInt    *diag = a->diag;
  const MatScalar   *aa = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
#else
  {
    PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    x[0] = b[0];
    x[1] = b[1];
    x[2] = b[2];
    x[3] = b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;
      s1 = b[idx];
      s2 = b[1 + idx];
      s3 = b[2 + idx];
      s4 = b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = x[jdx];
        x2  = x[1 + jdx];
        x3  = x[2 + jdx];
        x4  = x[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      x[idx]     = s1;
      x[1 + idx] = s2;
      x[2 + idx] = s3;
      x[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      s1   = x[idt];
      s2   = x[1 + idt];
      s3   = x[2 + idt];
      s4   = x[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = x[idx];
        x2  = x[1 + idx];
        x3  = x[2 + idx];
        x4  = x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      v          = aa + ai16;
      x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
      x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
      x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
      x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
      idt -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, k, nz, idx, jdx, idt;
  const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  /* forward solve the lower triangular */
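  /*
     Block forward substitution: for each block row i accumulate
         s = b_i - sum_{j < i} L_ij * x_j
     over the nonzero 4x4 blocks L_ij of row i. Each block is stored
     column-major (v[0..3] is the first column, v[4..7] the second, and so
     on), and the diagonal blocks of L are implicitly the identity, so the
     accumulated s is x_i.
  */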
  idx  = 0;
  x[0] = b[idx];
  x[1] = b[1 + idx];
  x[2] = b[2 + idx];
  x[3] = b[3 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + bs2 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i];
    idx = bs * i;
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    for (k = 0; k < nz; k++) {
      jdx = bs * vi[k];
      x1  = x[jdx];
      x2  = x[1 + jdx];
      x3  = x[2 + jdx];
      x4  = x[3 + jdx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }

    x[idx]     = s1;
    x[1 + idx] = s2;
    x[2 + idx] = s3;
    x[3 + idx] = s4;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    v   = aa + bs2 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = bs * i;
    s1  = x[idt];
    s2  = x[1 + idt];
    s3  = x[2 + idt];
    s4  = x[3 + idt];

    for (k = 0; k < nz; k++) {
      idx = bs * vi[k];
      x1  = x[idx];
      x2  = x[1 + idx];
      x3  = x[2 + idx];
      x4  = x[3 + idx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }
    /* x = inv_diagonal*x */
    x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
    x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
    x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
    x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
  unsigned short *aj = (unsigned short *)a->j;
  int            *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar      *aa = a->a;
  PetscScalar    *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v, *t = (MatScalar *)x;
    int             nz, i, idt, ai16;
    unsigned int    jdx, idx;
    unsigned short *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ((unsigned int)ai[1]);

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
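      /* t aliases the storage of x reinterpreted as float, so the demoted
         right-hand side is staged inside the (twice as large) x buffer and
         promoted back to double in a final pass below. */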
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
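      /* x and t alias the same buffer, so each block must be widened from its
         highest index downward: writing the double at index 0 first would
         overwrite the still-unread floats at indices 1..3. */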
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx)
{
  Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
  int         *aj = a->j;
  int         *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar   *aa = a->a;
  PetscScalar *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v, *t = (MatScalar *)x;
    int        nz, i, idt, ai16;
    int        jdx, idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ai[1];

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * (*vi++);
        /* jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * (*vi++);
        /* idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
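        /* XMM0 now holds element 0 of x broadcast and scaled by column 0 of
           the 4x4 block; it is folded into the accumulator XMM7 by the
           subtraction below, and columns 1-3 repeat the same
           shuffle/multiply/subtract pattern. */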
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#endif
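/*
  A minimal usage sketch (illustrative, not part of this file) of how these
  kernels are reached through PETSc's public API: ILU(0)-factor a bs = 4
  SeqBAIJ matrix with the natural ordering and call MatSolve() on the factor,
  which should dispatch to one of the MatSolve_SeqBAIJ_4_NaturalOrdering*
  routines above. The helper name SolveWithNaturalILU0 is hypothetical.
*/
#if 0
#include <petscmat.h>

static PetscErrorCode SolveWithNaturalILU0(Mat A, Vec b, Vec x)
{
  Mat           F;
  IS            row, col;
  MatFactorInfo info;

  PetscFunctionBegin;
  /* Identity row/column permutations trigger the natural-ordering kernels */
  PetscCall(MatGetOrdering(A, MATORDERINGNATURAL, &row, &col));
  PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_ILU, &F));
  PetscCall(MatFactorInfoInitialize(&info));
  info.levels = 0; /* ILU(0): no fill beyond the sparsity pattern of A */
  info.fill   = 1.0;
  PetscCall(MatILUFactorSymbolic(F, A, row, col, &info));
  PetscCall(MatLUFactorNumeric(F, A, &info));
  PetscCall(MatSolve(F, b, x)); /* forward/backward block triangular solves */
  PetscCall(ISDestroy(&row));
  PetscCall(ISDestroy(&col));
  PetscCall(MatDestroy(&F));
  PetscFunctionReturn(PETSC_SUCCESS);
}
#endif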