#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/*
   Special case where the matrix was ILU(0) factored in the natural
   ordering. This eliminates the need for the column and row permutation.

   All kernels below solve A*x = b for a factored SeqBAIJ matrix with
   block size 4.  Each 4x4 block of 16 scalars is stored column-major:
   v[0..3] is the first column, v[4..7] the second, etc., which is why a
   block-vector product reads v[0],v[4],v[8],v[12] for the first output
   component.
*/

/*
   Triangular solve for the "inplace" factorization layout: a->diag[i]
   points at the diagonal block of row i inside the single merged L/U
   structure (L entries in ai[i]..diag[i]-1, U entries in
   diag[i]+1..ai[i+1]-1).  The diagonal blocks hold the INVERTED 4x4
   diagonal, so the back solve multiplies rather than divides.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
  Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ *)A->data;
  PetscInt           n    = a->mbs; /* number of block rows */
  const PetscInt    *ai   = a->i, *aj = a->j;
  const PetscInt    *diag = a->diag;
  const MatScalar   *aa   = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    /* NOTE(review): fixed-size static workspace -- overflows for large n
       and is not thread-safe; original comment kept below. */
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n, x, ai, aj, diag, aa, b, w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n, x, ai, aj, diag, aa, b);
#else
  {
    PetscScalar      s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    /* First block row of L is the identity, so x_0 = b_0. */
    idx  = 0;
    x[0] = b[0];
    x[1] = b[1];
    x[2] = b[2];
    x[3] = b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];     /* first L block of row i */
      vi = aj + ai[i];          /* its column indices */
      nz = diag[i] - ai[i];     /* number of L blocks in row i */
      idx += 4;                 /* scalar offset of block row i */
      s1 = b[idx];
      s2 = b[1 + idx];
      s3 = b[2 + idx];
      s4 = b[3 + idx];
      /* s -= L(i,j) * x_j for each L block, accumulated column-by-column */
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = x[jdx];
        x2  = x[1 + jdx];
        x3  = x[2 + jdx];
        x4  = x[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      x[idx]     = s1;
      x[1 + idx] = s2;
      x[2 + idx] = s3;
      x[3 + idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];        /* scalar offset of the (inverted) diagonal block */
      v    = aa + ai16 + 16;      /* first U block strictly right of the diagonal */
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1; /* number of U blocks in row i */
      s1   = x[idt];
      s2   = x[1 + idt];
      s3   = x[2 + idt];
      s4   = x[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        x1  = x[idx];
        x2  = x[1 + idx];
        x3  = x[2 + idx];
        x4  = x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      /* x_i = inv(D_i) * s : diagonal block was inverted during factorization */
      v          = aa + ai16;
      x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
      x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
      x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
      x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
      idt -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(0);
}

/*
   Same solve for the newer (non-inplace) factorization layout: L is
   stored in ai[i]..ai[i+1]-1 and U is stored backwards, indexed through
   adiag -- row i's U blocks occupy adiag[i+1]+1..adiag[i]-1 with the
   inverted diagonal block at adiag[i], which is why the back-solve loop
   bounds differ from the _inplace variant above.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
  PetscInt           i, k, nz, idx, jdx, idt;
  const PetscInt     bs = A->rmap->bs, bs2 = a->bs2; /* bs == 4, bs2 == 16 here */
  const MatScalar   *aa = a->a, *v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar        s1, s2, s3, s4, x1, x2, x3, x4;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx];
  x[1] = b[1 + idx];
  x[2] = b[2 + idx];
  x[3] = b[3 + idx];
  for (i = 1; i < n; i++) {
    v   = aa + bs2 * ai[i];
    vi  = aj + ai[i];
    nz  = ai[i + 1] - ai[i]; /* all blocks in this row range belong to L */
    idx = bs * i;
    s1  = b[idx];
    s2  = b[1 + idx];
    s3  = b[2 + idx];
    s4  = b[3 + idx];
    for (k = 0; k < nz; k++) {
      jdx = bs * vi[k];
      x1  = x[jdx];
      x2  = x[1 + jdx];
      x3  = x[2 + jdx];
      x4  = x[3 + jdx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }

    x[idx]     = s1;
    x[1 + idx] = s2;
    x[2 + idx] = s3;
    x[3 + idx] = s4;
  }

  /* backward solve the upper triangular */
  for (i = n - 1; i >= 0; i--) {
    /* U of row i lives at adiag[i+1]+1 .. adiag[i]-1; adiag[i] is inv(D_i) */
    v   = aa + bs2 * (adiag[i + 1] + 1);
    vi  = aj + adiag[i + 1] + 1;
    nz  = adiag[i] - adiag[i + 1] - 1;
    idt = bs * i;
    s1  = x[idt];
    s2  = x[1 + idt];
    s3  = x[2 + idt];
    s4  = x[3 + idt];

    for (k = 0; k < nz; k++) {
      idx = bs * vi[k];
      x1  = x[idx];
      x2  = x[1 + idx];
      x3  = x[2 + idx];
      x4  = x[3 + idx];
      s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
      s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
      s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
      s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;

      v += bs2;
    }
    /* x = inv_diagonal*x ; after the k-loop v points at adiag[i]'s block */
    x[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
    x[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
    x[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
    x[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
  PetscFunctionReturn(0);
}

/*
   Mixed-precision variant of the _inplace solve: the matrix entries
   (MatScalar) are single precision while the vectors (PetscScalar) are
   double.  The intermediate solution is accumulated in MatScalar
   precision through `t`, a float view ALIASING the bytes of x.

   Correctness of the aliasing depends on traversal order:
   - forward solve writes floats t[0..4n-1] in increasing index order;
   - backward solve processes rows in DECREASING order, writing the final
     double result to x[idt]; the inner loop reads x[idx] (already-promoted
     doubles, since idx > idt there) while s is seeded from t[idt] (still a
     float).  Writing the 8-byte x[idt] clobbers t entries only at indices
     2*idt.. which belong to rows already finished.  Do not reorder.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A, Vec bb, Vec xx) {
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *diag = a->diag;
  const MatScalar   *aa = a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb, &b));
  PetscCall(VecGetArray(xx, &x));

  {
    MatScalar        s1, s2, s3, s4, x1, x2, x3, x4;
    const MatScalar *v;
    MatScalar       *t = (MatScalar *)x; /* single-precision view of x's storage */
    PetscInt         jdx, idt, idx, nz, i, ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular (demoting b to MatScalar on the fly) */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i = 1; i < n; i++) {
      v  = aa + 16 * ai[i];
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;
      s1 = (MatScalar)b[idx];
      s2 = (MatScalar)b[1 + idx];
      s3 = (MatScalar)b[2 + idx];
      s4 = (MatScalar)b[3 + idx];
      while (nz--) {
        jdx = 4 * (*vi++);
        x1  = t[jdx];
        x2  = t[1 + jdx];
        x3  = t[2 + jdx];
        x4  = t[3 + jdx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      t[idx]     = s1;
      t[1 + idx] = s2;
      t[2 + idx] = s3;
      t[3 + idx] = s4;
    }
    /* backward solve the upper triangular, promoting each row to double as it finishes */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      ai16 = 16 * diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i + 1] - diag[i] - 1;
      s1   = t[idt]; /* row i itself is still stored as float */
      s2   = t[1 + idt];
      s3   = t[2 + idt];
      s4   = t[3 + idt];
      while (nz--) {
        idx = 4 * (*vi++);
        /* rows > i were already written back as doubles -- read via x, not t */
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1 + idx];
        x3  = (MatScalar)x[2 + idx];
        x4  = (MatScalar)x[3 + idx];
        s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
        s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
        s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
        s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
        v += 16;
      }
      /* apply inverted diagonal block and store the final double result */
      v          = aa + ai16;
      x[idt]     = (PetscScalar)(v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4);
      x[1 + idt] = (PetscScalar)(v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4);
      x[2 + idt] = (PetscScalar)(v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4);
      x[3 + idt] = (PetscScalar)(v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4);
      idt -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/*
   SSE mixed-precision solve, "usj" variant: the column-index array a->j
   is reinterpreted as unsigned short, i.e. this kernel assumes the
   factorization stored 16-bit column indices -- TODO confirm against the
   matching factorization routine before enabling.

   NOTE(review): bb is the read-only RHS but is accessed with
   VecGetArray (write access) -- presumably should be VecGetArrayRead;
   left as-is since b's type would need to change too.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A, Vec bb, Vec xx) {
  Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
  unsigned short *aj = (unsigned short *)a->j;
  int            *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar      *aa = a->a;
  PetscScalar    *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v, *t = (MatScalar *)x;
    int             nz, i, idt, ai16;
    unsigned int    jdx, idx;
    unsigned short *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ((unsigned int)ai[1]);

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7); /* XMM7 accumulates s for block row i */

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation:
           broadcast each component of x_j (XMM6) and subtract
           component * matrix-column from the accumulator XMM7. */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * ((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      /* NOTE(review): when i == 0 this reads diag[-1] (out of bounds);
         the value is never used since the loop then exits, but the read
         itself is undefined behavior -- TODO confirm/guard. */
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace).
       Must run in DECREASING index order: x[idt] (8 bytes) overlaps t
       entries at higher indices, which have already been consumed. */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

/*
   Same SSE mixed-precision solve with ordinary int column indices.
   Structure is identical to the _usj variant above (including the
   NOTE(review) items: write access to bb and the diag[-1] read at the
   bottom of the backward loop).
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A, Vec bb, Vec xx) {
  Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
  int         *aj = a->j;
  int         *ai = a->i, n = a->mbs, *diag = a->diag;
  MatScalar   *aa = a->a;
  PetscScalar *x, *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa + 16 * ai[1]);

  PetscCall(VecGetArray(bb, &b));
  PetscCall(VecGetArray(xx, &x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v, *t = (MatScalar *)x;
    int        nz, i, idt, ai16;
    int        jdx, idx;
    int       *vi;
    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t, b);
    v = aa + 16 * ai[1];

    for (i = 1; i < n;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + ai[i];
      nz = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx], &b[idx]);
      LOAD_PS(&t[idx], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4 * (*vi++);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16 * ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx], XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4 * (n - 1);
    ai16 = 16 * diag[n - 1];
    v    = aa + ai16 + 16;
    for (i = n - 1; i >= 0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i + 1] - diag[i] - 1;

      LOAD_PS(&t[idt], XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4 * (*vi++);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx], v)
        SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0, XMM6)
        SSE_SHUFFLE(XMM0, XMM0, 0x00)
        SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
        SSE_SUB_PS(XMM7, XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1, XMM6)
        SSE_SHUFFLE(XMM1, XMM1, 0x55)
        SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
        SSE_SUB_PS(XMM7, XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2, XMM6)
        SSE_SHUFFLE(XMM2, XMM2, 0xAA)
        SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
        SSE_SUB_PS(XMM7, XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3, XMM6)
        SSE_SHUFFLE(XMM3, XMM3, 0xFF)
        SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
        SSE_SUB_PS(XMM7, XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v    = aa + ai16;
      /* NOTE(review): reads diag[-1] when i == 0 -- see _usj variant. */
      ai16 = 16 * diag[--i];
      PREFETCH_NTA(aa + ai16 + 16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v, &t[idt], aa + ai16)
      /* First Column */
      SSE_COPY_PS(XMM0, XMM7)
      SSE_SHUFFLE(XMM0, XMM0, 0x00)
      SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1, XMM7)
      SSE_SHUFFLE(XMM1, XMM1, 0x55)
      SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
      SSE_ADD_PS(XMM0, XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2, XMM7)
      SSE_SHUFFLE(XMM2, XMM2, 0xAA)
      SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
      SSE_ADD_PS(XMM0, XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3, XMM7)
      SSE_SHUFFLE(XMM3, XMM3, 0xFF)
      SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
      SSE_ADD_PS(XMM0, XMM3)

      SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
      SSE_INLINE_END_3

      v = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace).
       Must run in DECREASING index order -- see _usj variant. */
    idt = 4 * (n - 1);
    for (i = n - 1; i >= 0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  PetscCall(VecRestoreArray(bb, &b));
  PetscCall(VecRestoreArray(xx, &x));
  PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif