/*$Id: dgefa4.c,v 1.21 2001/06/22 19:50:38 buschelm Exp $*/
/*
     Inverts 4 by 4 matrix using partial pivoting.

     Used by the sparse factorization routines in
     src/mat/impls/baij/seq and src/mat/impls/bdiag/seq

     See also src/inline/ilu.h

     This is a combination of the Linpack routines
     dgefa() and dgedi() specialized for a size of 4.

*/
#include "petsc.h"

#undef __FUNCT__
#define __FUNCT__ "Kernel_A_gets_inverse_A_4"
int Kernel_A_gets_inverse_A_4(MatScalar *a)
{
  int       i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
  int       k4,j3;
  MatScalar *aa,*ax,*ay,work[16],stmp;
  MatReal   tmp,max;

  /* gaussian elimination with partial pivoting */

  PetscFunctionBegin;
  /* Parameter adjustments */
  a -= 5;

  for (k = 1; k <= 3; ++k) {
    kp1 = k + 1;
    k3  = 4*k;
    k4  = k3 + k;

    /* find l = pivot index */
    i__2 = 4 - k;
    aa   = &a[k4];
    max  = PetscAbsScalar(aa[0]);
    l    = 1;
    for (ll=1; ll<i__2; ll++) {
      tmp = PetscAbsScalar(aa[ll]);
      if (tmp > max) { max = tmp; l = ll+1; }
    }
    l        += k - 1;
    ipvt[k-1] = l;

    if (a[l + k3] == 0.) {
      SETERRQ(k,"Zero pivot");
    }

    /* interchange if necessary */
    if (l != k) {
      stmp      = a[l + k3];
      a[l + k3] = a[k4];
      a[k4]     = stmp;
    }

    /* compute multipliers */
    stmp = -1. / a[k4];
    i__2 = 4 - k;
    aa   = &a[1 + k4];
    for (ll=0; ll<i__2; ll++) {
      aa[ll] *= stmp;
    }

    /* row elimination with column indexing */
    ax = &a[k4+1];
    for (j = kp1; j <= 4; ++j) {
      j3   = 4*j;
      stmp = a[l + j3];
      if (l != k) {
        a[l + j3] = a[k + j3];
        a[k + j3] = stmp;
      }

      i__3 = 4 - k;
      ay   = &a[1+k+j3];
      for (ll=0; ll<i__3; ll++) {
        ay[ll] += stmp*ax[ll];
      }
    }
  }
  ipvt[3] = 4;
  if (a[20] == 0.) {
    SETERRQ(3,"Zero pivot,final row");
  }

  /*
     Now form the inverse
  */

  /* compute inverse(u) */
  for (k = 1; k <= 4; ++k) {
    k3    = 4*k;
    k4    = k3 + k;
    a[k4] = 1.0 / a[k4];
    stmp  = -a[k4];
    i__2  = k - 1;
    aa    = &a[k3 + 1];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
    kp1 = k + 1;
    if (4 < kp1) continue;
    ax = aa;
    for (j = kp1; j <= 4; ++j) {
      j3        = 4*j;
      stmp      = a[k + j3];
      a[k + j3] = 0.0;
      ay        = &a[j3 + 1];
      for (ll=0; ll<k; ll++) {
        ay[ll] += stmp*ax[ll];
      }
    }
  }

  /* form inverse(u)*inverse(l) */
  for (kb = 1; kb <= 3; ++kb) {
    k   = 4 - kb;
    k3  = 4*k;
    kp1 = k + 1;
    aa  = a + k3;
    for (i = kp1; i <= 4; ++i) {
      work[i-1] = aa[i];
      aa[i]     = 0.0;
    }
    for (j = kp1; j <= 4; ++j) {
      stmp   = work[j-1];
      ax     = &a[4*j + 1];
      ay     = &a[k3 + 1];
      ay[0] += stmp*ax[0];
      ay[1] += stmp*ax[1];
      ay[2] += stmp*ax[2];
      ay[3] += stmp*ax[3];
    }
    l = ipvt[k-1];
    if (l != k) {
      ax = &a[k3 + 1];
      ay = &a[4*l + 1];
      stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
      stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
      stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
      stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
    }
  }
  PetscFunctionReturn(0);
}
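/*
   A minimal usage sketch (illustrative only; the guard macro below is
   hypothetical and never defined by PETSc).  The kernel inverts the 4x4
   block in place and returns nonzero on a zero pivot; the inverse comes
   back in the same layout as the input, so the identity here must be
   returned unchanged.
*/
#if defined(KERNEL_A_GETS_INVERSE_A_4_EXAMPLE)
static int example_invert_identity(void)
{
  MatScalar a[16] = {1.0,0.0,0.0,0.0,
                     0.0,1.0,0.0,0.0,
                     0.0,0.0,1.0,0.0,
                     0.0,0.0,0.0,1.0};
  return Kernel_A_gets_inverse_A_4(a); /* 0 on success; a[] now holds the inverse */
}
#endif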
#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "Kernel_A_gets_inverse_A_4_SSE"
int Kernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
     Order Number: 245043-001
     March 1999
     http://www.intel.com

     Inverse of a 4x4 matrix via Cramer's Rule:
     bool Invert4x4(SMLXMatrix &);
  */
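  /*
     Orientation (added note): Cramer's rule computes A^{-1} = adj(A)/det(A),
     where adj(A) is the transpose of the matrix of cofactors.  The code
     below gathers the matrix into four registers, forms the 16 cofactors
     with packed multiplies and shuffles, obtains det(A) as the dot product
     of a row of A with its cofactors, and finally scales everything by
     1/det(A).  See the Intel application note cited above for the full
     derivation.
  */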
  PetscFunctionBegin;

  SSE_SCOPE_BEGIN;
    SSE_INLINE_BEGIN_1(a)

      /* ----------------------------------------------- */

      /* gather the matrix into four registers: XMM3,XMM5,XMM4,XMM6 */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

      SSE_COPY_PS(XMM3,XMM0)
      SSE_SHUFFLE(XMM3,XMM5,0x88)

      SSE_SHUFFLE(XMM5,XMM0,0xDD)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

      SSE_COPY_PS(XMM4,XMM0)
      SSE_SHUFFLE(XMM4,XMM6,0x88)

      SSE_SHUFFLE(XMM6,XMM0,0xDD)

      /* ----------------------------------------------- */

      /* cofactor accumulation via packed multiplies and shuffles */
      SSE_COPY_PS(XMM7,XMM4)
      SSE_MULT_PS(XMM7,XMM6)

      SSE_SHUFFLE(XMM7,XMM7,0xB1)

      SSE_COPY_PS(XMM0,XMM5)
      SSE_MULT_PS(XMM0,XMM7)

      SSE_COPY_PS(XMM2,XMM3)
      SSE_MULT_PS(XMM2,XMM7)

      SSE_SHUFFLE(XMM7,XMM7,0x4E)

      SSE_COPY_PS(XMM1,XMM5)
      SSE_MULT_PS(XMM1,XMM7)
      SSE_SUB_PS(XMM1,XMM0)

      SSE_MULT_PS(XMM7,XMM3)
      SSE_SUB_PS(XMM7,XMM2)

      SSE_SHUFFLE(XMM7,XMM7,0x4E)
      SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

      /* ----------------------------------------------- */

      SSE_COPY_PS(XMM0,XMM5)
      SSE_MULT_PS(XMM0,XMM4)

      SSE_SHUFFLE(XMM0,XMM0,0xB1)

      SSE_COPY_PS(XMM2,XMM6)
      SSE_MULT_PS(XMM2,XMM0)
      SSE_ADD_PS(XMM2,XMM1)

      SSE_COPY_PS(XMM7,XMM3)
      SSE_MULT_PS(XMM7,XMM0)

      SSE_SHUFFLE(XMM0,XMM0,0x4E)

      SSE_COPY_PS(XMM1,XMM6)
      SSE_MULT_PS(XMM1,XMM0)
      SSE_SUB_PS(XMM2,XMM1)

      SSE_MULT_PS(XMM0,XMM3)
      SSE_SUB_PS(XMM0,XMM7)

      SSE_SHUFFLE(XMM0,XMM0,0x4E)
      SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

      /* ----------------------------------------------- */

      SSE_COPY_PS(XMM7,XMM5)
      SSE_SHUFFLE(XMM7,XMM5,0x4E)
      SSE_MULT_PS(XMM7,XMM6)

      SSE_SHUFFLE(XMM7,XMM7,0xB1)

      SSE_SHUFFLE(XMM4,XMM4,0x4E)

      SSE_COPY_PS(XMM0,XMM4)
      SSE_MULT_PS(XMM0,XMM7)
      SSE_ADD_PS(XMM0,XMM2)

      SSE_COPY_PS(XMM2,XMM3)
      SSE_MULT_PS(XMM2,XMM7)

      SSE_SHUFFLE(XMM7,XMM7,0x4E)

      SSE_COPY_PS(XMM1,XMM4)
      SSE_MULT_PS(XMM1,XMM7)
      SSE_SUB_PS(XMM0,XMM1)
      SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

      SSE_MULT_PS(XMM7,XMM3)
      SSE_SUB_PS(XMM7,XMM2)

      SSE_SHUFFLE(XMM7,XMM7,0x4E)

      /* ----------------------------------------------- */

      SSE_COPY_PS(XMM1,XMM3)
      SSE_MULT_PS(XMM1,XMM5)

      SSE_SHUFFLE(XMM1,XMM1,0xB1)

      SSE_COPY_PS(XMM0,XMM6)
      SSE_MULT_PS(XMM0,XMM1)
      SSE_ADD_PS(XMM0,XMM7)

      SSE_COPY_PS(XMM2,XMM4)
      SSE_MULT_PS(XMM2,XMM1)
      SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

      SSE_SHUFFLE(XMM1,XMM1,0x4E)

      SSE_COPY_PS(XMM7,XMM6)
      SSE_MULT_PS(XMM7,XMM1)
      SSE_SUB_PS(XMM7,XMM0)

      SSE_MULT_PS(XMM1,XMM4)
      SSE_SUB_PS(XMM2,XMM1)
      SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

      /* ----------------------------------------------- */

      SSE_COPY_PS(XMM1,XMM3)
      SSE_MULT_PS(XMM1,XMM6)

      SSE_SHUFFLE(XMM1,XMM1,0xB1)

      SSE_COPY_PS(XMM2,XMM4)
      SSE_MULT_PS(XMM2,XMM1)
      SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
      SSE_SUB_PS(XMM0,XMM2)

      SSE_COPY_PS(XMM2,XMM5)
      SSE_MULT_PS(XMM2,XMM1)
      SSE_ADD_PS(XMM2,XMM7)

      SSE_SHUFFLE(XMM1,XMM1,0x4E)

      SSE_COPY_PS(XMM7,XMM4)
      SSE_MULT_PS(XMM7,XMM1)
      SSE_ADD_PS(XMM7,XMM0)

      SSE_MULT_PS(XMM1,XMM5)
      SSE_SUB_PS(XMM2,XMM1)

      /* ----------------------------------------------- */

      SSE_MULT_PS(XMM4,XMM3)

      SSE_SHUFFLE(XMM4,XMM4,0xB1)

      SSE_COPY_PS(XMM1,XMM6)
      SSE_MULT_PS(XMM1,XMM4)
      SSE_ADD_PS(XMM1,XMM7)

      SSE_COPY_PS(XMM0,XMM5)
      SSE_MULT_PS(XMM0,XMM4)
      SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_SUB_PS(XMM7,XMM0)

      SSE_SHUFFLE(XMM4,XMM4,0x4E)

      SSE_MULT_PS(XMM6,XMM4)
      SSE_SUB_PS(XMM1,XMM6)

      SSE_MULT_PS(XMM5,XMM4)
      SSE_ADD_PS(XMM5,XMM7)

      /* ----------------------------------------------- */

      /* determinant: dot product of a row of the matrix with its cofactors */
      SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_MULT_PS(XMM3,XMM0)

      /* horizontal add: sum the four products into the low scalar */
      SSE_COPY_PS(XMM4,XMM3)
      SSE_SHUFFLE(XMM4,XMM3,0x4E)
      SSE_ADD_PS(XMM4,XMM3)

      SSE_COPY_PS(XMM6,XMM4)
      SSE_SHUFFLE(XMM6,XMM4,0xB1)
      SSE_ADD_SS(XMM6,XMM4)

      /* 1/det: rcpss estimate x0 refined by one Newton-Raphson step,
         x1 = 2*x0 - det*x0*x0 */
      SSE_COPY_PS(XMM3,XMM6)
      SSE_RECIP_SS(XMM3,XMM6)
      SSE_COPY_SS(XMM4,XMM3)
      SSE_ADD_SS(XMM4,XMM3)
      SSE_MULT_SS(XMM3,XMM3)
      SSE_MULT_SS(XMM6,XMM3)
      SSE_SUB_SS(XMM4,XMM6)

      /* broadcast 1/det to all four lanes */
      SSE_SHUFFLE(XMM4,XMM4,0x00)

      /* scale the cofactors by 1/det and store the inverse */
      SSE_MULT_PS(XMM0,XMM4)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_MULT_PS(XMM1,XMM4)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_MULT_PS(XMM2,XMM4)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_MULT_PS(XMM4,XMM5)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

      /* ----------------------------------------------- */

    SSE_INLINE_END_1;
  SSE_SCOPE_END;

  PetscFunctionReturn(0);
}

#endif
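/*
   Reference sketch (illustrative only; the guard macro is hypothetical and
   never defined by PETSc): the scalar form of the reciprocal refinement
   used in the SSE kernel above.  SSE_RECIP_SS yields an estimate x0 of
   1/det accurate to roughly 12 bits; one Newton-Raphson step on
   f(x) = 1/x - det, namely x1 = x0*(2 - det*x0) = 2*x0 - det*x0*x0,
   roughly doubles the number of accurate bits.
*/
#if defined(KERNEL_RECIP_REFINE_EXAMPLE)
static float recip_refine(float det,float x0)
{
  /* x0 is a crude estimate of 1/det (e.g. from rcpss); return the refined value */
  return 2.0f*x0 - det*x0*x0;
}
#endif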