/*
     Inverts a 4 by 4 matrix using Gaussian elimination with partial pivoting.

     Used by the sparse factorization routines in
     src/mat/impls/baij/seq

     This is a combination of the Linpack routines
     dgefa() and dgedi() specialized for a size of 4.

*/
#include <petscsys.h>

PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar *a,PetscReal shift,PetscBool allowzeropivot,PetscBool *zeropivotdetected)
{
  PetscInt  i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
  PetscInt  k4,j3;
  MatScalar *aa,*ax,*ay,work[16],stmp;
  MatReal   tmp,max;

  PetscFunctionBegin;
  if (zeropivotdetected) *zeropivotdetected = PETSC_FALSE;
  shift = .25*shift*(1.e-12 + PetscAbsScalar(a[0]) + PetscAbsScalar(a[5]) + PetscAbsScalar(a[10]) + PetscAbsScalar(a[15]));

  /* Parameter adjustments */
  a -= 5;

  for (k = 1; k <= 3; ++k) {
    kp1 = k + 1;
    k3  = 4*k;
    k4  = k3 + k;

    /* find l = pivot index */
    i__2 = 5 - k;
    aa   = &a[k4];
    max  = PetscAbsScalar(aa[0]);
    l    = 1;
    for (ll=1; ll<i__2; ll++) {
      tmp = PetscAbsScalar(aa[ll]);
      if (tmp > max) { max = tmp; l = ll+1;}
    }
    l        += k - 1;
    ipvt[k-1] = l;

    if (a[l + k3] == 0.0) {
      if (shift == 0.0) {
        if (allowzeropivot) {
          PetscErrorCode ierr;
          ierr = PetscInfo1(NULL,"Zero pivot, row %D\n",k-1);CHKERRQ(ierr);
          if (zeropivotdetected) *zeropivotdetected = PETSC_TRUE;
        } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",k-1);
      } else {
        /* SHIFT is applied to SINGLE diagonal entry; does this make any sense? */
        a[l + k3] = shift;
      }
    }

    /* interchange if necessary */
    if (l != k) {
      stmp      = a[l + k3];
      a[l + k3] = a[k4];
      a[k4]     = stmp;
    }

    /* compute multipliers */
    stmp = -1. / a[k4];
    i__2 = 4 - k;
    aa   = &a[1 + k4];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;

    /* row elimination with column indexing */
    ax = &a[k4+1];
    for (j = kp1; j <= 4; ++j) {
      j3   = 4*j;
      stmp = a[l + j3];
      if (l != k) {
        a[l + j3] = a[k + j3];
        a[k + j3] = stmp;
      }

      i__3 = 4 - k;
      ay   = &a[1+k+j3];
      for (ll=0; ll<i__3; ll++) ay[ll] += stmp*ax[ll];
    }
  }
  ipvt[3] = 4;
  if (a[20] == 0.0) {
    if (allowzeropivot) {
      PetscErrorCode ierr;
      ierr = PetscInfo1(NULL,"Zero pivot, row %D\n",3);CHKERRQ(ierr);
      if (zeropivotdetected) *zeropivotdetected = PETSC_TRUE;
    } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %D",3);
  }

  /* Now form the inverse */
  /* compute inverse(u) */
  for (k = 1; k <= 4; ++k) {
    k3    = 4*k;
    k4    = k3 + k;
    a[k4] = 1.0 / a[k4];
    stmp  = -a[k4];
    i__2  = k - 1;
    aa    = &a[k3 + 1];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
    kp1 = k + 1;
    if (4 < kp1) continue;
    ax = aa;
    for (j = kp1; j <= 4; ++j) {
      j3        = 4*j;
      stmp      = a[k + j3];
      a[k + j3] = 0.0;
      ay        = &a[j3 + 1];
      for (ll=0; ll<k; ll++) ay[ll] += stmp*ax[ll];
    }
  }

  /* form inverse(u)*inverse(l) */
  for (kb = 1; kb <= 3; ++kb) {
    k   = 4 - kb;
    k3  = 4*k;
    kp1 = k + 1;
    aa  = a + k3;
    for (i = kp1; i <= 4; ++i) {
      work[i-1] = aa[i];
      aa[i]     = 0.0;
    }
    for (j = kp1; j <= 4; ++j) {
      stmp   = work[j-1];
      ax     = &a[4*j + 1];
      ay     = &a[k3 + 1];
      ay[0] += stmp*ax[0];
      ay[1] += stmp*ax[1];
      ay[2] += stmp*ax[2];
      ay[3] += stmp*ax[3];
    }
    l = ipvt[k-1];
    if (l != k) {
      ax   = &a[k3 + 1];
      ay   = &a[4*l + 1];
      stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
      stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
      stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
      stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
    }
  }
  PetscFunctionReturn(0);
}
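
/*
   Minimal usage sketch (editorial illustration, not part of PETSc): the
   kernel overwrites a dense 4x4 block with its inverse in place.  The
   guard macro and function name below are hypothetical and exist only
   for this example.
*/
#if defined(PETSC_KERNEL_INVERSE_4_EXAMPLE)
static PetscErrorCode ExampleInvert4(void)
{
  /* diagonal test matrix diag(4,3,2,1); its inverse is diag(1/4,1/3,1/2,1) */
  MatScalar      a[16] = {4.,0.,0.,0., 0.,3.,0.,0., 0.,0.,2.,0., 0.,0.,0.,1.};
  PetscBool      zeropivot;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = PetscKernel_A_gets_inverse_A_4(a,0.0,PETSC_TRUE,&zeropivot);CHKERRQ(ierr);
  /* a now holds the inverse; zeropivot reports a structurally singular block */
  PetscFunctionReturn(0);
}
#endif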

#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
     Order Number: 245043-001
     March 1999
     http://www.intel.com

     Inverse of a 4x4 matrix via Cramer's Rule:
     bool Invert4x4(SMLXMatrix &);
  */
  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  SSE_INLINE_BEGIN_1(a)

  /* ----------------------------------------------- */

  /* load a[] as two-float halves and shuffle the pieces into XMM3..XMM6 */
  SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

  SSE_COPY_PS(XMM3,XMM0)
  SSE_SHUFFLE(XMM3,XMM5,0x88)

  SSE_SHUFFLE(XMM5,XMM0,0xDD)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

  SSE_COPY_PS(XMM4,XMM0)
  SSE_SHUFFLE(XMM4,XMM6,0x88)

  SSE_SHUFFLE(XMM6,XMM0,0xDD)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM5)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM1,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)

  SSE_SHUFFLE(XMM0,XMM0,0xB1)

  SSE_COPY_PS(XMM2,XMM6)
  SSE_MULT_PS(XMM2,XMM0)
  SSE_ADD_PS(XMM2,XMM1)

  SSE_COPY_PS(XMM7,XMM3)
  SSE_MULT_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM0)
  SSE_SUB_PS(XMM2,XMM1)

  SSE_MULT_PS(XMM0,XMM3)
  SSE_SUB_PS(XMM0,XMM7)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM5)
  SSE_SHUFFLE(XMM7,XMM5,0x4E)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_COPY_PS(XMM0,XMM4)
  SSE_MULT_PS(XMM0,XMM7)
  SSE_ADD_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM4)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM0,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM5)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM0,XMM6)
  SSE_MULT_PS(XMM0,XMM1)
  SSE_ADD_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM6)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_SUB_PS(XMM2,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

  /* ----------------------------------------------- */
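
  /*
     Editorial note: each delimited block computes a batch of cofactors as
     products of 2x2 sub-determinants and accumulates them into the
     adjugate, following the Intel application note cited above; partial
     results are parked in the output array (FLOAT_0..FLOAT_12) and
     reloaded as registers run out.
  */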

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM6)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
  SSE_SUB_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM5)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_ADD_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_ADD_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM5)
  SSE_SUB_PS(XMM2,XMM1)

  /* ----------------------------------------------- */

  SSE_MULT_PS(XMM4,XMM3)

  SSE_SHUFFLE(XMM4,XMM4,0xB1)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM4)
  SSE_ADD_PS(XMM1,XMM7)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_MULT_PS(XMM6,XMM4)
  SSE_SUB_PS(XMM1,XMM6)

  SSE_MULT_PS(XMM5,XMM4)
  SSE_ADD_PS(XMM5,XMM7)

  /* ----------------------------------------------- */

  SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_MULT_PS(XMM3,XMM0)

  SSE_COPY_PS(XMM4,XMM3)
  SSE_SHUFFLE(XMM4,XMM3,0x4E)
  SSE_ADD_PS(XMM4,XMM3)

  SSE_COPY_PS(XMM6,XMM4)
  SSE_SHUFFLE(XMM6,XMM4,0xB1)
  SSE_ADD_SS(XMM6,XMM4)

  SSE_COPY_PS(XMM3,XMM6)
  SSE_RECIP_SS(XMM3,XMM6)
  SSE_COPY_SS(XMM4,XMM3)
  SSE_ADD_SS(XMM4,XMM3)
  SSE_MULT_SS(XMM3,XMM3)
  SSE_MULT_SS(XMM6,XMM3)
  SSE_SUB_SS(XMM4,XMM6)

  SSE_SHUFFLE(XMM4,XMM4,0x00)

  SSE_MULT_PS(XMM0,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

  SSE_MULT_PS(XMM2,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

  SSE_MULT_PS(XMM4,XMM5)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

  /* ----------------------------------------------- */

  SSE_INLINE_END_1;
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
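
/*
   Editorial sketch (not part of PETSc): the tail of the SSE routine above
   sums the products of one shuffled row with the matching adjugate column
   to get the determinant, approximates its reciprocal with RCPSS (about
   12 bits of accuracy), and refines that with one Newton-Raphson step
   before broadcasting and scaling.  The guard macro and helper below are
   hypothetical, for illustration only; x1 = x0*(2 - d*x0) is the same
   update the SSE code writes as 2*x0 - d*x0*x0.
*/
#if defined(PETSC_SSE_INVERSE_EXAMPLE)
static float RecipNewtonStep(float d,float x0)
{
  /* one Newton-Raphson iteration for f(x) = 1/x - d */
  return x0*(2.0f - d*x0);
}
#endif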