/*
   Inverts a 4 by 4 matrix using Gaussian elimination with partial pivoting.

   Used by the sparse factorization routines in
   src/mat/impls/baij/seq

   This is a combination of the LINPACK routines
   dgefa() and dgedi(), specialized for a size of 4.
*/
#include <petscsys.h>

#undef __FUNCT__
#define __FUNCT__ "PetscKernel_A_gets_inverse_A_4"
PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar *a,PetscReal shift,PetscBool allowzeropivot,PetscBool *zeropivotdetected)
{
  PetscInt  i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
  PetscInt  k4,j3;
  MatScalar *aa,*ax,*ay,work[16],stmp;
  MatReal   tmp,max;

  PetscFunctionBegin;
  if (zeropivotdetected) *zeropivotdetected = PETSC_FALSE;
  shift = .25*shift*(1.e-12 + PetscAbsScalar(a[0]) + PetscAbsScalar(a[5]) + PetscAbsScalar(a[10]) + PetscAbsScalar(a[15]));

  /* Parameter adjustments */
  a -= 5;

  for (k = 1; k <= 3; ++k) {
    kp1 = k + 1;
    k3  = 4*k;
    k4  = k3 + k;

    /* find l = pivot index */
    i__2 = 5 - k;
    aa   = &a[k4];
    max  = PetscAbsScalar(aa[0]);
    l    = 1;
    for (ll=1; ll<i__2; ll++) {
      tmp = PetscAbsScalar(aa[ll]);
      if (tmp > max) { max = tmp; l = ll+1;}
    }
    l        += k - 1;
    ipvt[k-1] = l;

    if (a[l + k3] == 0.0) {
      if (shift == 0.0) {
        if (allowzeropivot) {
          PetscErrorCode ierr;
          ierr = PetscInfo1(NULL,"Zero pivot, row %D\n",k-1);CHKERRQ(ierr);
          if (zeropivotdetected) *zeropivotdetected = PETSC_TRUE;
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
      } else {
        /* SHIFT is applied to a SINGLE diagonal entry; does this make any sense? */
        a[l + k3] = shift;
      }
    }

    /* interchange if necessary */
    if (l != k) {
      stmp      = a[l + k3];
      a[l + k3] = a[k4];
      a[k4]     = stmp;
    }

    /* compute multipliers */
    stmp = -1./a[k4];
    i__2 = 4 - k;
    aa   = &a[1 + k4];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;

    /* row elimination with column indexing */
    ax = &a[k4+1];
    for (j = kp1; j <= 4; ++j) {
      j3   = 4*j;
      stmp = a[l + j3];
      if (l != k) {
        a[l + j3] = a[k + j3];
        a[k + j3] = stmp;
      }

      i__3 = 4 - k;
      ay   = &a[1+k+j3];
      for (ll=0; ll<i__3; ll++) ay[ll] += stmp*ax[ll];
    }
  }
  ipvt[3] = 4;
  if (a[20] == 0.0) {
    if (allowzeropivot) {
      PetscErrorCode ierr;
      ierr = PetscInfo1(NULL,"Zero pivot, row %D\n",3);CHKERRQ(ierr);
      if (zeropivotdetected) *zeropivotdetected = PETSC_TRUE;
    } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);
  }

  /* Now form the inverse */
  /* compute inverse(u) */
  for (k = 1; k <= 4; ++k) {
    k3    = 4*k;
    k4    = k3 + k;
    a[k4] = 1.0 / a[k4];
    stmp  = -a[k4];
    i__2  = k - 1;
    aa    = &a[k3 + 1];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
    kp1 = k + 1;
    if (4 < kp1) continue;
    ax = aa;
    for (j = kp1; j <= 4; ++j) {
      j3        = 4*j;
      stmp      = a[k + j3];
      a[k + j3] = 0.0;
      ay        = &a[j3 + 1];
      for (ll=0; ll<k; ll++) ay[ll] += stmp*ax[ll];
    }
  }

  /* form inverse(u)*inverse(l) */
  for (kb = 1; kb <= 3; ++kb) {
    k   = 4 - kb;
    k3  = 4*k;
    kp1 = k + 1;
    aa  = a + k3;
    for (i = kp1; i <= 4; ++i) {
      work[i-1] = aa[i];
      aa[i]     = 0.0;
    }
    for (j = kp1; j <= 4; ++j) {
      stmp   = work[j-1];
      ax     = &a[4*j + 1];
      ay     = &a[k3 + 1];
      ay[0] += stmp*ax[0];
      ay[1] += stmp*ax[1];
      ay[2] += stmp*ax[2];
      ay[3] += stmp*ax[3];
    }
    l = ipvt[k-1];
    if (l != k) {
      ax   = &a[k3 + 1];
      ay   = &a[4*l + 1];
      stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
      stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
      stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
      stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
    }
  }
  PetscFunctionReturn(0);
}
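
/*
   Usage sketch (illustrative only, not part of the PETSc sources; the variable
   names below are hypothetical).  The kernel overwrites the 16 contiguous
   entries of a 4x4 block with its inverse:

     MatScalar      block[16];        .. filled with a nonsingular 4x4 block ..
     PetscBool      zeropivot;        .. NULL may be passed instead ..
     PetscErrorCode ierr;

     ierr = PetscKernel_A_gets_inverse_A_4(block,0.0,PETSC_TRUE,&zeropivot);CHKERRQ(ierr);
     if (zeropivot) { .. a zero pivot was encountered; treat the block as singular .. }

   With shift = 0, a zero pivot either sets *zeropivotdetected (when
   allowzeropivot is PETSC_TRUE) or raises PETSC_ERR_MAT_LU_ZRPVT; with a
   nonzero shift the offending diagonal entry is replaced by a small
   perturbation scaled from the block's diagonal.
*/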

#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "PetscKernel_A_gets_inverse_A_4_SSE"
PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
          Order Number: 245043-001
          March 1999
          http://www.intel.com

     Inverse of a 4x4 matrix via Cramer's Rule:
       bool Invert4x4(SMLXMatrix &);
  */
  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  SSE_INLINE_BEGIN_1(a)

  /* ----------------------------------------------- */
  /* gather the 16 entries of a into four registers, shuffled into the
     pairings used by the cofactor products below */

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

  SSE_COPY_PS(XMM3,XMM0)
  SSE_SHUFFLE(XMM3,XMM5,0x88)

  SSE_SHUFFLE(XMM5,XMM0,0xDD)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

  SSE_COPY_PS(XMM4,XMM0)
  SSE_SHUFFLE(XMM4,XMM6,0x88)

  SSE_SHUFFLE(XMM6,XMM0,0xDD)

  /* ----------------------------------------------- */
  /* the blocks below accumulate the cofactor (adjugate) rows from
     pairwise products of the shuffled entries */

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM5)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM1,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)

  SSE_SHUFFLE(XMM0,XMM0,0xB1)

  SSE_COPY_PS(XMM2,XMM6)
  SSE_MULT_PS(XMM2,XMM0)
  SSE_ADD_PS(XMM2,XMM1)

  SSE_COPY_PS(XMM7,XMM3)
  SSE_MULT_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM0)
  SSE_SUB_PS(XMM2,XMM1)

  SSE_MULT_PS(XMM0,XMM3)
  SSE_SUB_PS(XMM0,XMM7)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM5)
  SSE_SHUFFLE(XMM7,XMM5,0x4E)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_COPY_PS(XMM0,XMM4)
  SSE_MULT_PS(XMM0,XMM7)
  SSE_ADD_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM4)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM0,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM5)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM0,XMM6)
  SSE_MULT_PS(XMM0,XMM1)
  SSE_ADD_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM6)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_SUB_PS(XMM2,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM6)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
  SSE_SUB_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM5)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_ADD_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_ADD_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM5)
  SSE_SUB_PS(XMM2,XMM1)

  /* ----------------------------------------------- */

  SSE_MULT_PS(XMM4,XMM3)

  SSE_SHUFFLE(XMM4,XMM4,0xB1)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM4)
  SSE_ADD_PS(XMM1,XMM7)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_MULT_PS(XMM6,XMM4)
  SSE_SUB_PS(XMM1,XMM6)

  SSE_MULT_PS(XMM5,XMM4)
  SSE_ADD_PS(XMM5,XMM7)

  /* ----------------------------------------------- */
  /* determinant via a dot product of a matrix row with its cofactors;
     its reciprocal is taken with rcpss and refined by one Newton-Raphson
     step, then broadcast to scale the adjugate into the inverse */

  SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_MULT_PS(XMM3,XMM0)

  SSE_COPY_PS(XMM4,XMM3)
  SSE_SHUFFLE(XMM4,XMM3,0x4E)
  SSE_ADD_PS(XMM4,XMM3)

  SSE_COPY_PS(XMM6,XMM4)
  SSE_SHUFFLE(XMM6,XMM4,0xB1)
  SSE_ADD_SS(XMM6,XMM4)

  SSE_COPY_PS(XMM3,XMM6)
  SSE_RECIP_SS(XMM3,XMM6)
  SSE_COPY_SS(XMM4,XMM3)
  SSE_ADD_SS(XMM4,XMM3)
  SSE_MULT_SS(XMM3,XMM3)
  SSE_MULT_SS(XMM6,XMM3)
  SSE_SUB_SS(XMM4,XMM6)

  SSE_SHUFFLE(XMM4,XMM4,0x00)

  SSE_MULT_PS(XMM0,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

  SSE_MULT_PS(XMM2,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

  SSE_MULT_PS(XMM4,XMM5)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

  /* ----------------------------------------------- */

  SSE_INLINE_END_1;
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
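
/*
   Note on the SSE variant (illustrative only, not part of the PETSc sources):
   when PETSC_HAVE_SSE is defined, PetscKernel_A_gets_inverse_A_4_SSE() inverts
   a 4x4 block of single-precision floats in place via Cramer's rule (the
   adjugate scaled by 1/det).  Unlike the scalar kernel above, it performs no
   pivoting and no zero-pivot detection, so it assumes a well-conditioned,
   nonsingular block.  A sketch of a hypothetical caller:

     #if defined(PETSC_HAVE_SSE)
       float          block[16];      .. 16 floats of a nonsingular block ..
       PetscErrorCode ierr;
       ierr = PetscKernel_A_gets_inverse_A_4_SSE(block);CHKERRQ(ierr);
     #endif
*/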