#define PETSCMAT_DLL

/*
    Inverts 4 by 4 matrix using partial pivoting.

    Used by the sparse factorization routines in
    src/mat/impls/baij/seq

    See also src/inline/ilu.h

    This is a combination of the Linpack routines
    dgefa() and dgedi() specialized for a size of 4.
*/
#include "petsc.h"

#undef __FUNCT__
#define __FUNCT__ "Kernel_A_gets_inverse_A_4"
PetscErrorCode Kernel_A_gets_inverse_A_4(MatScalar *a,PetscReal shift)
{
  PetscInt  i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
  PetscInt  k4,j3;
  MatScalar *aa,*ax,*ay,work[16],stmp;
  MatReal   tmp,max;

  /* gaussian elimination with partial pivoting */

  PetscFunctionBegin;
  shift = .25*shift*(PetscAbsScalar(a[0]) + PetscAbsScalar(a[5]) + PetscAbsScalar(a[10]) + PetscAbsScalar(a[15]));
  /* Parameter adjustments */
  a -= 5;

  for (k = 1; k <= 3; ++k) {
    kp1 = k + 1;
    k3  = 4*k;
    k4  = k3 + k;

    /* find l = pivot index */
    i__2 = 5 - k;
    aa   = &a[k4];
    max  = PetscAbsScalar(aa[0]);
    l    = 1;
    for (ll=1; ll<i__2; ll++) {
      tmp = PetscAbsScalar(aa[ll]);
      if (tmp > max) { max = tmp; l = ll+1; }
    }
    l        += k - 1;
    ipvt[k-1] = l;

    if (a[l + k3] == 0.0) {
      if (shift == 0.0) {
        SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
      } else {
        /* SHIFT is applied to SINGLE diagonal entry; does this make any sense? */
        a[l + k3] = shift;
      }
    }

    /* interchange if necessary */
    if (l != k) {
      stmp      = a[l + k3];
      a[l + k3] = a[k4];
      a[k4]     = stmp;
    }

    /* compute multipliers */
    stmp = -1. / a[k4];
    i__2 = 4 - k;
    aa   = &a[1 + k4];
    for (ll=0; ll<i__2; ll++) {
      aa[ll] *= stmp;
    }

    /* row elimination with column indexing */
    ax = &a[k4+1];
    for (j = kp1; j <= 4; ++j) {
      j3   = 4*j;
      stmp = a[l + j3];
      if (l != k) {
        a[l + j3] = a[k + j3];
        a[k + j3] = stmp;
      }

      i__3 = 4 - k;
      ay   = &a[1+k+j3];
      for (ll=0; ll<i__3; ll++) {
        ay[ll] += stmp*ax[ll];
      }
    }
  }
  ipvt[3] = 4;
  if (a[20] == 0.0) {
    if (shift == 0.0) {
      SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);
    } else {
      /* shift the last pivot the same way as the earlier ones */
      a[20] = shift;
    }
  }

  /*
     Now form the inverse
  */

  /* compute inverse(u) */
  for (k = 1; k <= 4; ++k) {
    k3    = 4*k;
    k4    = k3 + k;
    a[k4] = 1.0 / a[k4];
    stmp  = -a[k4];
    i__2  = k - 1;
    aa    = &a[k3 + 1];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
    kp1 = k + 1;
    if (4 < kp1) continue;
    ax = aa;
    for (j = kp1; j <= 4; ++j) {
      j3        = 4*j;
      stmp      = a[k + j3];
      a[k + j3] = 0.0;
      ay        = &a[j3 + 1];
      for (ll=0; ll<k; ll++) {
        ay[ll] += stmp*ax[ll];
      }
    }
  }

  /* form inverse(u)*inverse(l) */
  for (kb = 1; kb <= 3; ++kb) {
    k   = 4 - kb;
    k3  = 4*k;
    kp1 = k + 1;
    aa  = a + k3;
    for (i = kp1; i <= 4; ++i) {
      work[i-1] = aa[i];
      aa[i]     = 0.0;
    }
    for (j = kp1; j <= 4; ++j) {
      stmp   = work[j-1];
      ax     = &a[4*j + 1];
      ay     = &a[k3 + 1];
      ay[0] += stmp*ax[0];
      ay[1] += stmp*ax[1];
      ay[2] += stmp*ax[2];
      ay[3] += stmp*ax[3];
    }
    l = ipvt[k-1];
    if (l != k) {
      ax = &a[k3 + 1];
      ay = &a[4*l + 1];
      stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
      stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
      stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
      stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
    }
  }
  PetscFunctionReturn(0);
}
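/*
   Illustrative usage sketch (added for documentation and compiled out):
   inverting one 4x4 block in place with the kernel above.  The diagonal
   test matrix, the zero shift, and the name ExampleInvert4 are assumptions
   made only for this example; they are not part of PETSc.
*/
#if 0
static PetscErrorCode ExampleInvert4(void)
{
  MatScalar      block[16] = {4,0,0,0, 0,4,0,0, 0,0,4,0, 0,0,0,4};
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = Kernel_A_gets_inverse_A_4(block,0.0);CHKERRQ(ierr);
  /* block now holds the inverse: 0.25 on the diagonal, zeros elsewhere */
  PetscFunctionReturn(0);
}
#endif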

#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "Kernel_A_gets_inverse_A_4_SSE"
PetscErrorCode Kernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
     Order Number: 245043-001
     March 1999
     http://www.intel.com

     Inverse of a 4x4 matrix via Cramer's Rule:
     bool Invert4x4(SMLXMatrix &);
  */
  PetscFunctionBegin;

  SSE_SCOPE_BEGIN;
  SSE_INLINE_BEGIN_1(a)

  /* ----------------------------------------------- */
  /* gather the entries into stride-4 slices (a transposed load):
     XMM3 = a[0,4,8,12], XMM4 = a[2,6,10,14], while XMM5 and XMM6 hold
     a[1,5,9,13] and a[3,7,11,15] rotated by two positions */

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

  SSE_COPY_PS(XMM3,XMM0)
  SSE_SHUFFLE(XMM3,XMM5,0x88)

  SSE_SHUFFLE(XMM5,XMM0,0xDD)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

  SSE_COPY_PS(XMM4,XMM0)
  SSE_SHUFFLE(XMM4,XMM6,0x88)

  SSE_SHUFFLE(XMM6,XMM0,0xDD)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM5)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM1,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)

  SSE_SHUFFLE(XMM0,XMM0,0xB1)

  SSE_COPY_PS(XMM2,XMM6)
  SSE_MULT_PS(XMM2,XMM0)
  SSE_ADD_PS(XMM2,XMM1)

  SSE_COPY_PS(XMM7,XMM3)
  SSE_MULT_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM0)
  SSE_SUB_PS(XMM2,XMM1)

  SSE_MULT_PS(XMM0,XMM3)
  SSE_SUB_PS(XMM0,XMM7)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM5)
  SSE_SHUFFLE(XMM7,XMM5,0x4E)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_COPY_PS(XMM0,XMM4)
  SSE_MULT_PS(XMM0,XMM7)
  SSE_ADD_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM4)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM0,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM5)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM0,XMM6)
  SSE_MULT_PS(XMM0,XMM1)
  SSE_ADD_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM6)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_SUB_PS(XMM2,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

  /* ----------------------------------------------- */
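  /* Each dashed block above and below follows one pattern: multiply two
     of the stride-4 slices, pair the products with SHUFFLE 0xB1 (swap
     within pairs) and 0x4E (swap halves), and accumulate the signed 2x2
     sub-determinant products into the four cofactor rows that Cramer's
     rule requires. */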
  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM6)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
  SSE_SUB_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM5)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_ADD_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_ADD_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM5)
  SSE_SUB_PS(XMM2,XMM1)

  /* ----------------------------------------------- */

  SSE_MULT_PS(XMM4,XMM3)

  SSE_SHUFFLE(XMM4,XMM4,0xB1)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM4)
  SSE_ADD_PS(XMM1,XMM7)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_MULT_PS(XMM6,XMM4)
  SSE_SUB_PS(XMM1,XMM6)

  SSE_MULT_PS(XMM5,XMM4)
  SSE_ADD_PS(XMM5,XMM7)

  /* ----------------------------------------------- */
  /* determinant: dot product of the first slice with its cofactor row,
     then an approximate reciprocal (rcpss) refined by one Newton-Raphson
     iteration, y = 2*y0 - det*y0*y0 */

  SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_MULT_PS(XMM3,XMM0)

  SSE_COPY_PS(XMM4,XMM3)
  SSE_SHUFFLE(XMM4,XMM3,0x4E)
  SSE_ADD_PS(XMM4,XMM3)

  SSE_COPY_PS(XMM6,XMM4)
  SSE_SHUFFLE(XMM6,XMM4,0xB1)
  SSE_ADD_SS(XMM6,XMM4)

  SSE_COPY_PS(XMM3,XMM6)
  SSE_RECIP_SS(XMM3,XMM6)
  SSE_COPY_SS(XMM4,XMM3)
  SSE_ADD_SS(XMM4,XMM3)
  SSE_MULT_SS(XMM3,XMM3)
  SSE_MULT_SS(XMM6,XMM3)
  SSE_SUB_SS(XMM4,XMM6)

  /* broadcast 1/det and scale the four cofactor rows to form the inverse */

  SSE_SHUFFLE(XMM4,XMM4,0x00)

  SSE_MULT_PS(XMM0,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

  SSE_MULT_PS(XMM2,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

  SSE_MULT_PS(XMM4,XMM5)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

  /* ----------------------------------------------- */

  SSE_INLINE_END_1;
  SSE_SCOPE_END;

  PetscFunctionReturn(0);
}

#endif
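/*
   Reference sketch (added for illustration, compiled out): a plain C
   version of the Cramer's-rule inversion that the SSE kernel implements.
   Minor3 and Invert4x4_Cramer are hypothetical names, not part of PETSc;
   entries are addressed with stride 4 exactly as in the kernels above.
*/
#if 0
/* 3x3 determinant of the submatrix obtained by deleting row r and column c */
static float Minor3(const float m[16],int r,int c)
{
  float s[9];
  int   i,j,k = 0;
  for (j=0; j<4; j++) {
    if (j == c) continue;
    for (i=0; i<4; i++) {
      if (i == r) continue;
      s[k++] = m[4*j + i];
    }
  }
  return s[0]*(s[4]*s[8] - s[5]*s[7])
       - s[3]*(s[1]*s[8] - s[2]*s[7])
       + s[6]*(s[1]*s[5] - s[2]*s[4]);
}

/* returns 0 on success, -1 for a singular matrix; inv receives a^{-1} */
static int Invert4x4_Cramer(const float a[16],float inv[16])
{
  float det = 0.0f;
  int   i,j;
  /* expand the determinant along the first stride-4 slice */
  for (i=0; i<4; i++) det += ((i%2) ? -1.0f : 1.0f)*a[i]*Minor3(a,i,0);
  if (det == 0.0f) return -1;
  for (i=0; i<4; i++) {
    for (j=0; j<4; j++) {
      /* adjugate entry: signed (j,i) minor, i.e. the transposed cofactor */
      inv[4*j + i] = (((i+j)%2) ? -1.0f : 1.0f)*Minor3(a,j,i)/det;
    }
  }
  return 0;
}
#endif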