#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/*
    Special case where the matrix was ILU(0) factored in the natural
  ordering. This eliminates the need for the column and row permutation.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  PetscInt          n  = a->mbs;
  const PetscInt    *ai = a->i,*aj = a->j;
  const PetscInt    *diag = a->diag;
  const MatScalar   *aa = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

#if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
  {
    static PetscScalar w[2000]; /* very BAD need to fix */
    fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
  }
#elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
  fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
#else
  {
    PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
    for (i=1; i<n; i++) {
      v    = aa + 16*ai[i];
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;
      s1   = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; x4 = x[3+jdx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      x[idx]   = s1;
      x[1+idx] = s2;
      x[2+idx] = s3;
      x[3+idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      s1   = x[idt];   s2 = x[1+idt];
      s3   = x[2+idt]; s4 = x[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        x1  = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      v        = aa + ai16;
      x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
      x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
      x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
      x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
      idt     -= 4;
    }
  }
#endif

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n = a->mbs,*vi,*ai = a->i,*aj = a->j,*adiag = a->diag;
  PetscInt          i,k,nz,idx,jdx,idt;
  const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa = a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  /* forward solve the lower triangular */
  idx  = 0;
  x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    for (k=0; k<nz; k++) {
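      /* Each 4x4 block of v is stored column-major: entry (r,c) is v[r+4*c],
         so s_r below accumulates row r of the block times the four x values. */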
      jdx = bs*vi[k];
      x1  = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; x4 = x[3+jdx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

      v += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + bs2*(adiag[i+1]+1);
    vi  = aj + adiag[i+1] + 1;
    nz  = adiag[i] - adiag[i+1] - 1;
    idt = bs*i;
    s1  = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt];

    for (k=0; k<nz; k++) {
      idx = bs*vi[k];
      x1  = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
      s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
      s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
      s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
      s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;

      v += bs2;
    }
    /* x = inv_diagonal*x */
    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
  }

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n));
  PetscFunctionReturn(0);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*diag = a->diag;
  const MatScalar   *aa = a->a;
  const PetscScalar *b;
  PetscScalar       *x;

  PetscFunctionBegin;
  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));

  {
    MatScalar       s1,s2,s3,s4,x1,x2,x3,x4;
    const MatScalar *v;
    MatScalar       *t = (MatScalar*)x;
    PetscInt        jdx,idt,idx,nz,i,ai16;
    const PetscInt  *vi;

    /* forward solve the lower triangular */
    idx  = 0;
    t[0] = (MatScalar)b[0];
    t[1] = (MatScalar)b[1];
    t[2] = (MatScalar)b[2];
    t[3] = (MatScalar)b[3];
    for (i=1; i<n; i++) {
      v    = aa + 16*ai[i];
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;
      s1   = (MatScalar)b[idx];
      s2   = (MatScalar)b[1+idx];
      s3   = (MatScalar)b[2+idx];
      s4   = (MatScalar)b[3+idx];
      while (nz--) {
        jdx = 4*(*vi++);
        x1  = t[jdx];
        x2  = t[1+jdx];
        x3  = t[2+jdx];
        x4  = t[3+jdx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      t[idx]   = s1;
      t[1+idx] = s2;
      t[2+idx] = s3;
      t[3+idx] = s4;
    }
    /* backward solve the upper triangular */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      ai16 = 16*diag[i];
      v    = aa + ai16 + 16;
      vi   = aj + diag[i] + 1;
      nz   = ai[i+1] - diag[i] - 1;
      s1   = t[idt];
      s2   = t[1+idt];
      s3   = t[2+idt];
      s4   = t[3+idt];
      while (nz--) {
        idx = 4*(*vi++);
        /* blocks to the right belong to rows already solved, whose results
           were stored back into x in full precision, so read x, not t */
        x1  = (MatScalar)x[idx];
        x2  = (MatScalar)x[1+idx];
        x3  = (MatScalar)x[2+idx];
        x4  = (MatScalar)x[3+idx];
        s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
        s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
        s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
        s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
        v  += 16;
      }
      v = aa + ai16;
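      /* x = inv_diagonal*x: the diagonal 4x4 block was inverted during
         factorization, so the back-substitution "division" is a block
         multiply, promoted back to full precision as it is stored */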
      x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
      x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
      x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
      x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
      idt     -= 4;
    }
  }

  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ*)A->data;
  unsigned short    *aj = (unsigned short*)a->j;
  int               *ai = a->i,n = a->mbs,*diag = a->diag;
  MatScalar         *aa = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar      *v,*t = (MatScalar*)x;
    int            nz,i,idt,ai16;
    unsigned int   jdx,idx;
    unsigned short *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v = aa + 16*((unsigned int)ai[1]);

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*((unsigned int)(*vi++));

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
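        /* The prefetch above reaches past the current 16-float block
           (FLOAT_24 is presumably a 24-float offset) into the next block
           while the last two columns here are still being accumulated. */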

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v = aa + ai16;
      /* Advance to the next row's inverted diagonal block; guard the
         final iteration so diag[] is not read out of bounds. */
      if (--i >= 0) {
        ai16 = 16*diag[i];
        PREFETCH_NTA(aa+ai16+16);
      }
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
      /* Copy in descending order so the widened doubles never overwrite
         floats that have not yet been read. */
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }
  } /* End of artificial scope. */
  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ*)A->data;
  int               *aj = a->j;
  int               *ai = a->i,n = a->mbs,*diag = a->diag;
  MatScalar         *aa = a->a;
  PetscScalar       *x;
  const PetscScalar *b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /*
     Note: This code currently uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  PREFETCH_NTA(aa+16*ai[1]);

  PetscCall(VecGetArrayRead(bb,&b));
  PetscCall(VecGetArray(xx,&x));
  {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v,*t = (MatScalar*)x;
    int       nz,i,idt,ai16;
    int       jdx,idx;
    int       *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v = aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   = aj + ai[i];
      nz   = diag[i] - ai[i];
      idx += 4;

      /* Demote RHS from double to float. */
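      /* t aliases x as float, so the demoted RHS and the forward-sweep
         results occupy only the low half of the double-precision xx
         storage; they are promoted back in place after the backward sweep. */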
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*(*vi++);
        /* jdx = *vi++; */

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor. */

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);
        /* idx = *vi++; */

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

        /* First Column */
        SSE_COPY_PS(XMM0,XMM6)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
        SSE_SUB_PS(XMM7,XMM0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM6)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
        SSE_SUB_PS(XMM7,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM6)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
        SSE_SUB_PS(XMM7,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM6)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
        SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v = aa + ai16;
      /* Advance to the next row's inverted diagonal block; guard the
         final iteration so diag[] is not read out of bounds. */
      if (--i >= 0) {
        ai16 = 16*diag[i];
        PREFETCH_NTA(aa+ai16+16);
      }
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
      /* First Column */
      SSE_COPY_PS(XMM0,XMM7)
      SSE_SHUFFLE(XMM0,XMM0,0x00)
      SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

      /* Second Column */
      SSE_COPY_PS(XMM1,XMM7)
      SSE_SHUFFLE(XMM1,XMM1,0x55)
      SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
      SSE_ADD_PS(XMM0,XMM1)

      SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

      /* Third Column */
      SSE_COPY_PS(XMM2,XMM7)
      SSE_SHUFFLE(XMM2,XMM2,0xAA)
      SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
      SSE_ADD_PS(XMM0,XMM2)

      /* Fourth Column */
      SSE_COPY_PS(XMM3,XMM7)
      SSE_SHUFFLE(XMM3,XMM3,0xFF)
      SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
      SSE_ADD_PS(XMM0,XMM3)

      SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace) */
    idt = 4*(n-1);
    for (i=n-1; i>=0; i--) {
      /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
      /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
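      /* Promote in descending index order: each double takes the space of
         two floats, so counting up would overwrite floats not yet read. */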
      PetscScalar *xtemp = &x[idt];
      MatScalar   *ttemp = &t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }
  } /* End of artificial scope. */
  PetscCall(VecRestoreArrayRead(bb,&b));
  PetscCall(VecRestoreArray(xx,&x));
  PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif