/*
    Inverts a 4 by 4 matrix using partial pivoting.

    Used by the sparse factorization routines in
    src/mat/impls/baij/seq

    This is a combination of the LINPACK routines
    dgefa() and dgedi() specialized for a size of 4.
*/
#include <petscsys.h>

#undef __FUNCT__
#define __FUNCT__ "PetscKernel_A_gets_inverse_A_4"
PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar *a,PetscReal shift)
{
  PetscInt  i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
  PetscInt  k4,j3;
  MatScalar *aa,*ax,*ay,work[16],stmp;
  MatReal   tmp,max;

  /* gaussian elimination with partial pivoting */

  PetscFunctionBegin;
  shift = .25*shift*(1.e-12 + PetscAbsScalar(a[0]) + PetscAbsScalar(a[5]) + PetscAbsScalar(a[10]) + PetscAbsScalar(a[15]));
  /* Parameter adjustments */
  a -= 5;

  for (k = 1; k <= 3; ++k) {
    kp1 = k + 1;
    k3  = 4*k;
    k4  = k3 + k;

    /* find l = pivot index */
    i__2 = 5 - k;
    aa   = &a[k4];
    max  = PetscAbsScalar(aa[0]);
    l    = 1;
    for (ll=1; ll<i__2; ll++) {
      tmp = PetscAbsScalar(aa[ll]);
      if (tmp > max) { max = tmp; l = ll+1; }
    }
    l        += k - 1;
    ipvt[k-1] = l;

    if (a[l + k3] == 0.0) {
      if (shift == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
      else {
        /* SHIFT is applied to SINGLE diagonal entry; does this make any sense? */
        a[l + k3] = shift;
      }
    }

    /* interchange if necessary */
    if (l != k) {
      stmp      = a[l + k3];
      a[l + k3] = a[k4];
      a[k4]     = stmp;
    }

    /* compute multipliers */
    stmp = -1. / a[k4];
    i__2 = 4 - k;
    aa   = &a[1 + k4];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;

    /* row elimination with column indexing */
    ax = &a[k4+1];
    for (j = kp1; j <= 4; ++j) {
      j3   = 4*j;
      stmp = a[l + j3];
      if (l != k) {
        a[l + j3] = a[k + j3];
        a[k + j3] = stmp;
      }

      i__3 = 4 - k;
      ay   = &a[1+k+j3];
      for (ll=0; ll<i__3; ll++) ay[ll] += stmp*ax[ll];
    }
  }
  ipvt[3] = 4;
  if (a[20] == 0.0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);

  /*
     Now form the inverse
  */

  /* compute inverse(u) */
  for (k = 1; k <= 4; ++k) {
    k3    = 4*k;
    k4    = k3 + k;
    a[k4] = 1.0 / a[k4];
    stmp  = -a[k4];
    i__2  = k - 1;
    aa    = &a[k3 + 1];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
    kp1 = k + 1;
    if (4 < kp1) continue;
    ax = aa;
    for (j = kp1; j <= 4; ++j) {
      j3        = 4*j;
      stmp      = a[k + j3];
      a[k + j3] = 0.0;
      ay        = &a[j3 + 1];
      for (ll=0; ll<k; ll++) ay[ll] += stmp*ax[ll];
    }
  }

  /* form inverse(u)*inverse(l) */
  for (kb = 1; kb <= 3; ++kb) {
    k   = 4 - kb;
    k3  = 4*k;
    kp1 = k + 1;
    aa  = a + k3;
    for (i = kp1; i <= 4; ++i) {
      work[i-1] = aa[i];
      aa[i]     = 0.0;
    }
    for (j = kp1; j <= 4; ++j) {
      stmp   = work[j-1];
      ax     = &a[4*j + 1];
      ay     = &a[k3 + 1];
      ay[0] += stmp*ax[0];
      ay[1] += stmp*ax[1];
      ay[2] += stmp*ax[2];
      ay[3] += stmp*ax[3];
    }
    l = ipvt[k-1];
    if (l != k) {
      ax   = &a[k3 + 1];
      ay   = &a[4*l + 1];
      stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
      stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
      stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
      stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
    }
  }
  PetscFunctionReturn(0);
}
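
/*
   Usage sketch (illustrative only, not part of the library).  The kernel
   inverts a dense 4x4 block in place; a diagonal block makes an easy
   sanity check, since inverting 2*I must yield 0.5*I:

      MatScalar      block[16] = {2.,0.,0.,0.,
                                  0.,2.,0.,0.,
                                  0.,0.,2.,0.,
                                  0.,0.,0.,2.};
      PetscErrorCode ierr;

      ierr = PetscKernel_A_gets_inverse_A_4(block,0.0);CHKERRQ(ierr);

   On return block[] holds the inverse.  A nonzero shift asks the kernel
   to replace an exactly-zero pivot with a small perturbation (scaled by
   the absolute diagonal sum, see above) instead of raising
   PETSC_ERR_MAT_LU_ZRPVT.
*/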

#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "PetscKernel_A_gets_inverse_A_4_SSE"
PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
          Order Number: 245043-001
          March 1999
          http://www.intel.com

     Inverse of a 4x4 matrix via Cramer's Rule:
     bool Invert4x4(SMLXMatrix &);
  */
  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  SSE_INLINE_BEGIN_1(a)

  /* ----------------------------------------------- */
  /* load the 16 entries and transpose them into four registers;
     rows 1 and 3 come out rotated by two elements, as in the
     Intel reference code */

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

  SSE_COPY_PS(XMM3,XMM0)
  SSE_SHUFFLE(XMM3,XMM5,0x88)

  SSE_SHUFFLE(XMM5,XMM0,0xDD)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

  SSE_COPY_PS(XMM4,XMM0)
  SSE_SHUFFLE(XMM4,XMM6,0x88)

  SSE_SHUFFLE(XMM6,XMM0,0xDD)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM5)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM1,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)

  SSE_SHUFFLE(XMM0,XMM0,0xB1)

  SSE_COPY_PS(XMM2,XMM6)
  SSE_MULT_PS(XMM2,XMM0)
  SSE_ADD_PS(XMM2,XMM1)

  SSE_COPY_PS(XMM7,XMM3)
  SSE_MULT_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM0)
  SSE_SUB_PS(XMM2,XMM1)

  SSE_MULT_PS(XMM0,XMM3)
  SSE_SUB_PS(XMM0,XMM7)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM5)
  SSE_SHUFFLE(XMM7,XMM5,0x4E)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_COPY_PS(XMM0,XMM4)
  SSE_MULT_PS(XMM0,XMM7)
  SSE_ADD_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM4)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM0,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM5)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM0,XMM6)
  SSE_MULT_PS(XMM0,XMM1)
  SSE_ADD_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM6)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_SUB_PS(XMM2,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

  /* ----------------------------------------------- */

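  /*
     Cramer's rule: inv(A) = adj(A)/det(A).  The shuffle/multiply blocks
     above and below accumulate the rows of the adjugate from pairwise
     products of the transposed rows; partially finished rows are parked
     in a[0..3], a[4..7] and a[12..15] to free registers and reloaded
     later.  The last block forms det(A) as the dot product of a row
     with its cofactor row and rescales everything by 1/det(A).
  */
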
  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM6)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
  SSE_SUB_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM5)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_ADD_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_ADD_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM5)
  SSE_SUB_PS(XMM2,XMM1)

  /* ----------------------------------------------- */

  SSE_MULT_PS(XMM4,XMM3)

  SSE_SHUFFLE(XMM4,XMM4,0xB1)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM4)
  SSE_ADD_PS(XMM1,XMM7)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_MULT_PS(XMM6,XMM4)
  SSE_SUB_PS(XMM1,XMM6)

  SSE_MULT_PS(XMM5,XMM4)
  SSE_ADD_PS(XMM5,XMM7)

  /* ----------------------------------------------- */
  /* horizontal sum of row * cofactor-row products gives det(A);
     SSE_RECIP_SS is the approximate reciprocal (rcpss), refined
     with one Newton-Raphson step x1 = x0*(2 - det*x0) below before
     being broadcast to all four lanes */

  SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_MULT_PS(XMM3,XMM0)

  SSE_COPY_PS(XMM4,XMM3)
  SSE_SHUFFLE(XMM4,XMM3,0x4E)
  SSE_ADD_PS(XMM4,XMM3)

  SSE_COPY_PS(XMM6,XMM4)
  SSE_SHUFFLE(XMM6,XMM4,0xB1)
  SSE_ADD_SS(XMM6,XMM4)

  SSE_COPY_PS(XMM3,XMM6)
  SSE_RECIP_SS(XMM3,XMM6)
  SSE_COPY_SS(XMM4,XMM3)
  SSE_ADD_SS(XMM4,XMM3)
  SSE_MULT_SS(XMM3,XMM3)
  SSE_MULT_SS(XMM6,XMM3)
  SSE_SUB_SS(XMM4,XMM6)

  SSE_SHUFFLE(XMM4,XMM4,0x00)

  /* scale the four adjugate rows by 1/det(A) and store the inverse */

  SSE_MULT_PS(XMM0,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

  SSE_MULT_PS(XMM2,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

  SSE_MULT_PS(XMM4,XMM5)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

  /* ----------------------------------------------- */

  SSE_INLINE_END_1;
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
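
/*
   Usage sketch for the SSE path (illustrative only; assumes a build
   configured with PETSC_HAVE_SSE).  Unlike the scalar kernel above, this
   variant is single precision and does no pivoting, so it should only be
   applied to blocks whose determinant is safely nonzero:

      float          block[16] = {4.f,0.f,0.f,0.f,
                                  0.f,4.f,0.f,0.f,
                                  0.f,0.f,4.f,0.f,
                                  0.f,0.f,0.f,4.f};
      PetscErrorCode ierr;

      ierr = PetscKernel_A_gets_inverse_A_4_SSE(block);CHKERRQ(ierr);

   On return block[] holds 0.25*I.  The SSE_LOAD_PS/SSE_STORE_PS macros
   may map to aligned instructions, so block should be 16-byte aligned.
*/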