1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 /* 5 Special case where the matrix was ILU(0) factored in the natural 6 ordering. This eliminates the need for the column and row permutation. 7 */ 8 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 9 { 10 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 11 PetscInt n =a->mbs; 12 const PetscInt *ai=a->i,*aj=a->j; 13 PetscErrorCode ierr; 14 const PetscInt *diag = a->diag; 15 const MatScalar *aa =a->a; 16 PetscScalar *x; 17 const PetscScalar *b; 18 19 PetscFunctionBegin; 20 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22 23 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 24 { 25 static PetscScalar w[2000]; /* very BAD need to fix */ 26 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 27 } 28 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 29 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 30 #else 31 { 32 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 33 const MatScalar *v; 34 PetscInt jdx,idt,idx,nz,i,ai16; 35 const PetscInt *vi; 36 37 /* forward solve the lower triangular */ 38 idx = 0; 39 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 40 for (i=1; i<n; i++) { 41 v = aa + 16*ai[i]; 42 vi = aj + ai[i]; 43 nz = diag[i] - ai[i]; 44 idx += 4; 45 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 46 while (nz--) { 47 jdx = 4*(*vi++); 48 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 49 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 50 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 51 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 52 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 53 v += 16; 54 } 55 x[idx] = s1; 56 x[1+idx] = s2; 57 x[2+idx] = s3; 58 x[3+idx] = s4; 59 } 60 /* backward solve the upper triangular */ 61 idt = 4*(n-1); 62 for (i=n-1; i>=0; i--) { 63 ai16 = 16*diag[i]; 64 v = aa + ai16 + 16; 65 vi = aj + diag[i] + 1; 66 nz = ai[i+1] - diag[i] - 1; 67 s1 = x[idt]; s2 = x[1+idt]; 68 s3 = x[2+idt];s4 = x[3+idt]; 69 while (nz--) { 70 idx = 4*(*vi++); 71 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 72 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 73 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 74 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 75 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 76 v += 16; 77 } 78 v = aa + ai16; 79 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 80 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 81 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 82 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 83 idt -= 4; 84 } 85 } 86 #endif 87 88 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 89 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 90 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 91 PetscFunctionReturn(0); 92 } 93 94 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 95 { 96 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 97 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 98 PetscInt i,k,nz,idx,jdx,idt; 99 PetscErrorCode ierr; 100 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 101 const MatScalar *aa=a->a,*v; 102 PetscScalar *x; 103 const PetscScalar *b; 104 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 105 106 PetscFunctionBegin; 107 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 108 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 109 /* forward solve the lower triangular */ 110 idx = 0; 111 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 112 for (i=1; i<n; i++) { 113 v = aa + bs2*ai[i]; 114 vi = aj + ai[i]; 115 nz = ai[i+1] - ai[i]; 116 idx = bs*i; 117 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 118 for (k=0; k<nz; k++) { 119 jdx = bs*vi[k]; 120 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 121 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 122 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 123 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 124 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 125 126 v += bs2; 127 } 128 129 x[idx] = s1; 130 x[1+idx] = s2; 131 x[2+idx] = s3; 132 x[3+idx] = s4; 133 } 134 135 /* backward solve the upper triangular */ 136 for (i=n-1; i>=0; i--) { 137 v = aa + bs2*(adiag[i+1]+1); 138 vi = aj + adiag[i+1]+1; 139 nz = adiag[i] - adiag[i+1]-1; 140 idt = bs*i; 141 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 142 143 for (k=0; k<nz; k++) { 144 idx = bs*vi[k]; 145 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 146 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 147 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 148 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 149 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 150 151 v += bs2; 152 } 153 /* x = inv_diagonal*x */ 154 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 155 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 156 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 157 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 158 159 } 160 161 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 162 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 163 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 164 PetscFunctionReturn(0); 165 } 166 167 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 168 { 169 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 170 const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 171 PetscErrorCode ierr; 172 const MatScalar *aa=a->a; 173 const PetscScalar *b; 174 PetscScalar *x; 175 176 PetscFunctionBegin; 177 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 178 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 179 180 { 181 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 182 const MatScalar *v; 183 MatScalar *t=(MatScalar*)x; 184 PetscInt jdx,idt,idx,nz,i,ai16; 185 const PetscInt *vi; 186 187 /* forward solve the lower triangular */ 188 idx = 0; 189 t[0] = (MatScalar)b[0]; 190 t[1] = (MatScalar)b[1]; 191 t[2] = (MatScalar)b[2]; 192 t[3] = (MatScalar)b[3]; 193 for (i=1; i<n; i++) { 194 v = aa + 16*ai[i]; 195 vi = aj + ai[i]; 196 nz = diag[i] - ai[i]; 197 idx += 4; 198 s1 = (MatScalar)b[idx]; 199 s2 = (MatScalar)b[1+idx]; 200 s3 = (MatScalar)b[2+idx]; 201 s4 = (MatScalar)b[3+idx]; 202 while (nz--) { 203 jdx = 4*(*vi++); 204 x1 = t[jdx]; 205 x2 = t[1+jdx]; 206 x3 = t[2+jdx]; 207 x4 = t[3+jdx]; 208 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 209 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 210 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 211 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 212 v += 16; 213 } 214 t[idx] = s1; 215 t[1+idx] = s2; 216 t[2+idx] = s3; 217 t[3+idx] = s4; 218 } 219 /* backward solve the upper triangular */ 220 idt = 4*(n-1); 221 for (i=n-1; i>=0; i--) { 222 ai16 = 16*diag[i]; 223 v = aa + ai16 + 16; 224 vi = aj + diag[i] + 1; 225 nz = ai[i+1] - diag[i] - 1; 226 s1 = t[idt]; 227 s2 = t[1+idt]; 228 s3 = t[2+idt]; 229 s4 = t[3+idt]; 230 while (nz--) { 231 idx = 4*(*vi++); 232 x1 = (MatScalar)x[idx]; 233 x2 = (MatScalar)x[1+idx]; 234 x3 = (MatScalar)x[2+idx]; 235 x4 = (MatScalar)x[3+idx]; 236 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 237 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 238 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 239 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 240 v += 16; 241 } 242 v = aa + ai16; 243 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 244 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 245 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 246 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 247 idt -= 4; 248 } 249 } 250 251 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 252 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 253 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 254 PetscFunctionReturn(0); 255 } 256 257 #if defined(PETSC_HAVE_SSE) 258 259 #include PETSC_HAVE_SSE 260 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 261 { 262 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 263 unsigned short *aj=(unsigned short*)a->j; 264 PetscErrorCode ierr; 265 int *ai=a->i,n=a->mbs,*diag = a->diag; 266 MatScalar *aa=a->a; 267 PetscScalar *x,*b; 268 269 PetscFunctionBegin; 270 SSE_SCOPE_BEGIN; 271 /* 272 Note: This code currently uses demotion of double 273 to float when performing the mixed-mode computation. 274 This may not be numerically reasonable for all applications. 275 */ 276 PREFETCH_NTA(aa+16*ai[1]); 277 278 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 279 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 280 { 281 /* x will first be computed in single precision then promoted inplace to double */ 282 MatScalar *v,*t=(MatScalar*)x; 283 int nz,i,idt,ai16; 284 unsigned int jdx,idx; 285 unsigned short *vi; 286 /* Forward solve the lower triangular factor. */ 287 288 /* First block is the identity. */ 289 idx = 0; 290 CONVERT_DOUBLE4_FLOAT4(t,b); 291 v = aa + 16*((unsigned int)ai[1]); 292 293 for (i=1; i<n;) { 294 PREFETCH_NTA(&v[8]); 295 vi = aj + ai[i]; 296 nz = diag[i] - ai[i]; 297 idx += 4; 298 299 /* Demote RHS from double to float. */ 300 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 301 LOAD_PS(&t[idx],XMM7); 302 303 while (nz--) { 304 PREFETCH_NTA(&v[16]); 305 jdx = 4*((unsigned int)(*vi++)); 306 307 /* 4x4 Matrix-Vector product with negative accumulation: */ 308 SSE_INLINE_BEGIN_2(&t[jdx],v) 309 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 310 311 /* First Column */ 312 SSE_COPY_PS(XMM0,XMM6) 313 SSE_SHUFFLE(XMM0,XMM0,0x00) 314 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 315 SSE_SUB_PS(XMM7,XMM0) 316 317 /* Second Column */ 318 SSE_COPY_PS(XMM1,XMM6) 319 SSE_SHUFFLE(XMM1,XMM1,0x55) 320 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 321 SSE_SUB_PS(XMM7,XMM1) 322 323 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 324 325 /* Third Column */ 326 SSE_COPY_PS(XMM2,XMM6) 327 SSE_SHUFFLE(XMM2,XMM2,0xAA) 328 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 329 SSE_SUB_PS(XMM7,XMM2) 330 331 /* Fourth Column */ 332 SSE_COPY_PS(XMM3,XMM6) 333 SSE_SHUFFLE(XMM3,XMM3,0xFF) 334 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 335 SSE_SUB_PS(XMM7,XMM3) 336 SSE_INLINE_END_2 337 338 v += 16; 339 } 340 v = aa + 16*ai[++i]; 341 PREFETCH_NTA(v); 342 STORE_PS(&t[idx],XMM7); 343 } 344 345 /* Backward solve the upper triangular factor.*/ 346 347 idt = 4*(n-1); 348 ai16 = 16*diag[n-1]; 349 v = aa + ai16 + 16; 350 for (i=n-1; i>=0;) { 351 PREFETCH_NTA(&v[8]); 352 vi = aj + diag[i] + 1; 353 nz = ai[i+1] - diag[i] - 1; 354 355 LOAD_PS(&t[idt],XMM7); 356 357 while (nz--) { 358 PREFETCH_NTA(&v[16]); 359 idx = 4*((unsigned int)(*vi++)); 360 361 /* 4x4 Matrix-Vector Product with negative accumulation: */ 362 SSE_INLINE_BEGIN_2(&t[idx],v) 363 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 364 365 /* First Column */ 366 SSE_COPY_PS(XMM0,XMM6) 367 SSE_SHUFFLE(XMM0,XMM0,0x00) 368 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 369 SSE_SUB_PS(XMM7,XMM0) 370 371 /* Second Column */ 372 SSE_COPY_PS(XMM1,XMM6) 373 SSE_SHUFFLE(XMM1,XMM1,0x55) 374 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 375 SSE_SUB_PS(XMM7,XMM1) 376 377 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 378 379 /* Third Column */ 380 SSE_COPY_PS(XMM2,XMM6) 381 SSE_SHUFFLE(XMM2,XMM2,0xAA) 382 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 383 SSE_SUB_PS(XMM7,XMM2) 384 385 /* Fourth Column */ 386 SSE_COPY_PS(XMM3,XMM6) 387 SSE_SHUFFLE(XMM3,XMM3,0xFF) 388 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 389 SSE_SUB_PS(XMM7,XMM3) 390 SSE_INLINE_END_2 391 v += 16; 392 } 393 v = aa + ai16; 394 ai16 = 16*diag[--i]; 395 PREFETCH_NTA(aa+ai16+16); 396 /* 397 Scale the result by the diagonal 4x4 block, 398 which was inverted as part of the factorization 399 */ 400 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 401 /* First Column */ 402 SSE_COPY_PS(XMM0,XMM7) 403 SSE_SHUFFLE(XMM0,XMM0,0x00) 404 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 405 406 /* Second Column */ 407 SSE_COPY_PS(XMM1,XMM7) 408 SSE_SHUFFLE(XMM1,XMM1,0x55) 409 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 410 SSE_ADD_PS(XMM0,XMM1) 411 412 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 413 414 /* Third Column */ 415 SSE_COPY_PS(XMM2,XMM7) 416 SSE_SHUFFLE(XMM2,XMM2,0xAA) 417 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 418 SSE_ADD_PS(XMM0,XMM2) 419 420 /* Fourth Column */ 421 SSE_COPY_PS(XMM3,XMM7) 422 SSE_SHUFFLE(XMM3,XMM3,0xFF) 423 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 424 SSE_ADD_PS(XMM0,XMM3) 425 426 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 427 SSE_INLINE_END_3 428 429 v = aa + ai16 + 16; 430 idt -= 4; 431 } 432 433 /* Convert t from single precision back to double precision (inplace)*/ 434 idt = 4*(n-1); 435 for (i=n-1; i>=0; i--) { 436 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 437 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 438 PetscScalar *xtemp=&x[idt]; 439 MatScalar *ttemp=&t[idt]; 440 xtemp[3] = (PetscScalar)ttemp[3]; 441 xtemp[2] = (PetscScalar)ttemp[2]; 442 xtemp[1] = (PetscScalar)ttemp[1]; 443 xtemp[0] = (PetscScalar)ttemp[0]; 444 idt -= 4; 445 } 446 447 } /* End of artificial scope. */ 448 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 449 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 450 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 451 SSE_SCOPE_END; 452 PetscFunctionReturn(0); 453 } 454 455 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 456 { 457 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 458 int *aj=a->j; 459 PetscErrorCode ierr; 460 int *ai=a->i,n=a->mbs,*diag = a->diag; 461 MatScalar *aa=a->a; 462 PetscScalar *x,*b; 463 464 PetscFunctionBegin; 465 SSE_SCOPE_BEGIN; 466 /* 467 Note: This code currently uses demotion of double 468 to float when performing the mixed-mode computation. 469 This may not be numerically reasonable for all applications. 470 */ 471 PREFETCH_NTA(aa+16*ai[1]); 472 473 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 474 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 475 { 476 /* x will first be computed in single precision then promoted inplace to double */ 477 MatScalar *v,*t=(MatScalar*)x; 478 int nz,i,idt,ai16; 479 int jdx,idx; 480 int *vi; 481 /* Forward solve the lower triangular factor. */ 482 483 /* First block is the identity. */ 484 idx = 0; 485 CONVERT_DOUBLE4_FLOAT4(t,b); 486 v = aa + 16*ai[1]; 487 488 for (i=1; i<n;) { 489 PREFETCH_NTA(&v[8]); 490 vi = aj + ai[i]; 491 nz = diag[i] - ai[i]; 492 idx += 4; 493 494 /* Demote RHS from double to float. */ 495 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 496 LOAD_PS(&t[idx],XMM7); 497 498 while (nz--) { 499 PREFETCH_NTA(&v[16]); 500 jdx = 4*(*vi++); 501 /* jdx = *vi++; */ 502 503 /* 4x4 Matrix-Vector product with negative accumulation: */ 504 SSE_INLINE_BEGIN_2(&t[jdx],v) 505 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 506 507 /* First Column */ 508 SSE_COPY_PS(XMM0,XMM6) 509 SSE_SHUFFLE(XMM0,XMM0,0x00) 510 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 511 SSE_SUB_PS(XMM7,XMM0) 512 513 /* Second Column */ 514 SSE_COPY_PS(XMM1,XMM6) 515 SSE_SHUFFLE(XMM1,XMM1,0x55) 516 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 517 SSE_SUB_PS(XMM7,XMM1) 518 519 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 520 521 /* Third Column */ 522 SSE_COPY_PS(XMM2,XMM6) 523 SSE_SHUFFLE(XMM2,XMM2,0xAA) 524 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 525 SSE_SUB_PS(XMM7,XMM2) 526 527 /* Fourth Column */ 528 SSE_COPY_PS(XMM3,XMM6) 529 SSE_SHUFFLE(XMM3,XMM3,0xFF) 530 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 531 SSE_SUB_PS(XMM7,XMM3) 532 SSE_INLINE_END_2 533 534 v += 16; 535 } 536 v = aa + 16*ai[++i]; 537 PREFETCH_NTA(v); 538 STORE_PS(&t[idx],XMM7); 539 } 540 541 /* Backward solve the upper triangular factor.*/ 542 543 idt = 4*(n-1); 544 ai16 = 16*diag[n-1]; 545 v = aa + ai16 + 16; 546 for (i=n-1; i>=0;) { 547 PREFETCH_NTA(&v[8]); 548 vi = aj + diag[i] + 1; 549 nz = ai[i+1] - diag[i] - 1; 550 551 LOAD_PS(&t[idt],XMM7); 552 553 while (nz--) { 554 PREFETCH_NTA(&v[16]); 555 idx = 4*(*vi++); 556 /* idx = *vi++; */ 557 558 /* 4x4 Matrix-Vector Product with negative accumulation: */ 559 SSE_INLINE_BEGIN_2(&t[idx],v) 560 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 561 562 /* First Column */ 563 SSE_COPY_PS(XMM0,XMM6) 564 SSE_SHUFFLE(XMM0,XMM0,0x00) 565 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 566 SSE_SUB_PS(XMM7,XMM0) 567 568 /* Second Column */ 569 SSE_COPY_PS(XMM1,XMM6) 570 SSE_SHUFFLE(XMM1,XMM1,0x55) 571 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 572 SSE_SUB_PS(XMM7,XMM1) 573 574 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 575 576 /* Third Column */ 577 SSE_COPY_PS(XMM2,XMM6) 578 SSE_SHUFFLE(XMM2,XMM2,0xAA) 579 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 580 SSE_SUB_PS(XMM7,XMM2) 581 582 /* Fourth Column */ 583 SSE_COPY_PS(XMM3,XMM6) 584 SSE_SHUFFLE(XMM3,XMM3,0xFF) 585 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 586 SSE_SUB_PS(XMM7,XMM3) 587 SSE_INLINE_END_2 588 v += 16; 589 } 590 v = aa + ai16; 591 ai16 = 16*diag[--i]; 592 PREFETCH_NTA(aa+ai16+16); 593 /* 594 Scale the result by the diagonal 4x4 block, 595 which was inverted as part of the factorization 596 */ 597 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 598 /* First Column */ 599 SSE_COPY_PS(XMM0,XMM7) 600 SSE_SHUFFLE(XMM0,XMM0,0x00) 601 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 602 603 /* Second Column */ 604 SSE_COPY_PS(XMM1,XMM7) 605 SSE_SHUFFLE(XMM1,XMM1,0x55) 606 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 607 SSE_ADD_PS(XMM0,XMM1) 608 609 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 610 611 /* Third Column */ 612 SSE_COPY_PS(XMM2,XMM7) 613 SSE_SHUFFLE(XMM2,XMM2,0xAA) 614 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 615 SSE_ADD_PS(XMM0,XMM2) 616 617 /* Fourth Column */ 618 SSE_COPY_PS(XMM3,XMM7) 619 SSE_SHUFFLE(XMM3,XMM3,0xFF) 620 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 621 SSE_ADD_PS(XMM0,XMM3) 622 623 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 624 SSE_INLINE_END_3 625 626 v = aa + ai16 + 16; 627 idt -= 4; 628 } 629 630 /* Convert t from single precision back to double precision (inplace)*/ 631 idt = 4*(n-1); 632 for (i=n-1; i>=0; i--) { 633 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 634 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 635 PetscScalar *xtemp=&x[idt]; 636 MatScalar *ttemp=&t[idt]; 637 xtemp[3] = (PetscScalar)ttemp[3]; 638 xtemp[2] = (PetscScalar)ttemp[2]; 639 xtemp[1] = (PetscScalar)ttemp[1]; 640 xtemp[0] = (PetscScalar)ttemp[0]; 641 idt -= 4; 642 } 643 644 } /* End of artificial scope. */ 645 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 646 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 647 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 648 SSE_SCOPE_END; 649 PetscFunctionReturn(0); 650 } 651 652 #endif 653