/*
   Inverts a 4 by 4 matrix using Gaussian elimination with partial pivoting.

   Used by the sparse factorization routines in
   src/mat/impls/baij/seq

   This is a combination of the LINPACK routines
   dgefa() and dgedi() specialized for a size of 4.
*/
#include <petscsys.h>

PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4(MatScalar *a,PetscReal shift,PetscBool allowzeropivot,PetscBool *zeropivotdetected)
{
  PetscInt  i__2,i__3,kp1,j,k,l,ll,i,ipvt[4],kb,k3;
  PetscInt  k4,j3;
  MatScalar *aa,*ax,*ay,work[16],stmp;
  MatReal   tmp,max;

  PetscFunctionBegin;
  if (zeropivotdetected) *zeropivotdetected = PETSC_FALSE;
  shift = .25*shift*(1.e-12 + PetscAbsScalar(a[0]) + PetscAbsScalar(a[5]) + PetscAbsScalar(a[10]) + PetscAbsScalar(a[15]));

  /* Parameter adjustments */
  a -= 5;

  for (k = 1; k <= 3; ++k) {
    kp1 = k + 1;
    k3  = 4*k;
    k4  = k3 + k;

    /* find l = pivot index */
    i__2 = 5 - k;
    aa   = &a[k4];
    max  = PetscAbsScalar(aa[0]);
    l    = 1;
    for (ll=1; ll<i__2; ll++) {
      tmp = PetscAbsScalar(aa[ll]);
      if (tmp > max) { max = tmp; l = ll+1; }
    }
    l += k - 1;
    ipvt[k-1] = l;

    if (a[l + k3] == 0.0) {
      if (shift == 0.0) {
        if (allowzeropivot) {
          PetscCall(PetscInfo(NULL,"Zero pivot, row %" PetscInt_FMT "\n",k-1));
          if (zeropivotdetected) *zeropivotdetected = PETSC_TRUE;
        } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row %" PetscInt_FMT,k-1);
      } else {
        /* SHIFT is applied to SINGLE diagonal entry; does this make any sense? */
        a[l + k3] = shift;
      }
    }

    /* interchange if necessary */
    if (l != k) {
      stmp      = a[l + k3];
      a[l + k3] = a[k4];
      a[k4]     = stmp;
    }

    /* compute multipliers */
    stmp = -1. / a[k4];
    i__2 = 4 - k;
    aa   = &a[1 + k4];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;

    /* row elimination with column indexing */
    ax = &a[k4+1];
    for (j = kp1; j <= 4; ++j) {
      j3   = 4*j;
      stmp = a[l + j3];
      if (l != k) {
        a[l + j3] = a[k + j3];
        a[k + j3] = stmp;
      }

      i__3 = 4 - k;
      ay   = &a[1+k+j3];
      for (ll=0; ll<i__3; ll++) ay[ll] += stmp*ax[ll];
    }
  }
  ipvt[3] = 4;
  if (a[20] == 0.0) {
    if (PetscLikely(allowzeropivot)) {
      PetscCall(PetscInfo(NULL,"Zero pivot, row 3\n"));
      if (zeropivotdetected) *zeropivotdetected = PETSC_TRUE;
    } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot, row 3");
  }

  /* Now form the inverse */
  /* compute inverse(u) */
  for (k = 1; k <= 4; ++k) {
    k3    = 4*k;
    k4    = k3 + k;
    a[k4] = 1.0 / a[k4];
    stmp  = -a[k4];
    i__2  = k - 1;
    aa    = &a[k3 + 1];
    for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
    kp1 = k + 1;
    if (4 < kp1) continue;
    ax = aa;
    for (j = kp1; j <= 4; ++j) {
      j3        = 4*j;
      stmp      = a[k + j3];
      a[k + j3] = 0.0;
      ay        = &a[j3 + 1];
      for (ll=0; ll<k; ll++) ay[ll] += stmp*ax[ll];
    }
  }

  /* form inverse(u)*inverse(l) */
  for (kb = 1; kb <= 3; ++kb) {
    k   = 4 - kb;
    k3  = 4*k;
    kp1 = k + 1;
    aa  = a + k3;
    for (i = kp1; i <= 4; ++i) {
      work[i-1] = aa[i];
      aa[i]     = 0.0;
    }
    for (j = kp1; j <= 4; ++j) {
      stmp   = work[j-1];
      ax     = &a[4*j + 1];
      ay     = &a[k3 + 1];
      ay[0] += stmp*ax[0];
      ay[1] += stmp*ax[1];
      ay[2] += stmp*ax[2];
      ay[3] += stmp*ax[3];
    }
    l = ipvt[k-1];
    if (l != k) {
      ax   = &a[k3 + 1];
      ay   = &a[4*l + 1];
      stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
      stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
      stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
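/*
   Usage sketch (annotation, not part of the original file): the kernel
   overwrites a dense 4x4 block with its inverse in place. A hypothetical
   caller, after PETSc has been initialized:

     MatScalar A[16] = {4,0,0,0, 0,3,0,0, 0,0,2,0, 0,0,0,1};
     PetscBool zeropivot;

     PetscCall(PetscKernel_A_gets_inverse_A_4(A,0.0,PETSC_TRUE,&zeropivot));

   On return A holds diag(0.25,1./3,0.5,1.). With shift == 0.0 a zero pivot
   is either reported through zeropivotdetected (when allowzeropivot is true)
   or raised as PETSC_ERR_MAT_LU_ZRPVT; a nonzero shift instead replaces the
   zero pivot with a small value scaled by the magnitude of the diagonal.
*/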
      stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
    }
  }
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)
#include PETSC_HAVE_SSE

PETSC_EXTERN PetscErrorCode PetscKernel_A_gets_inverse_A_4_SSE(float *a)
{
  /*
     This routine is converted from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
     Order Number: 245043-001
     March 1999
     https://www.intel.com/content/www/us/en/homepage.html

     Inverse of a 4x4 matrix via Cramer's Rule:
     bool Invert4x4(SMLXMatrix &);
  */
  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  SSE_INLINE_BEGIN_1(a)

  /* ----------------------------------------------- */

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_4,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_12,XMM5)

  SSE_COPY_PS(XMM3,XMM0)
  SSE_SHUFFLE(XMM3,XMM5,0x88)

  SSE_SHUFFLE(XMM5,XMM0,0xDD)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_2,XMM0)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM0)

  SSE_LOADL_PS(SSE_ARG_1,FLOAT_10,XMM6)
  SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM6)

  SSE_COPY_PS(XMM4,XMM0)
  SSE_SHUFFLE(XMM4,XMM6,0x88)

  SSE_SHUFFLE(XMM6,XMM0,0xDD)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM5)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM1,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_4,XMM7)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)

  SSE_SHUFFLE(XMM0,XMM0,0xB1)

  SSE_COPY_PS(XMM2,XMM6)
  SSE_MULT_PS(XMM2,XMM0)
  SSE_ADD_PS(XMM2,XMM1)

  SSE_COPY_PS(XMM7,XMM3)
  SSE_MULT_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM0)
  SSE_SUB_PS(XMM2,XMM1)

  SSE_MULT_PS(XMM0,XMM3)
  SSE_SUB_PS(XMM0,XMM7)

  SSE_SHUFFLE(XMM0,XMM0,0x4E)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM0)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM7,XMM5)
  SSE_SHUFFLE(XMM7,XMM5,0x4E)
  SSE_MULT_PS(XMM7,XMM6)

  SSE_SHUFFLE(XMM7,XMM7,0xB1)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_COPY_PS(XMM0,XMM4)
  SSE_MULT_PS(XMM0,XMM7)
  SSE_ADD_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM3)
  SSE_MULT_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  SSE_COPY_PS(XMM1,XMM4)
  SSE_MULT_PS(XMM1,XMM7)
  SSE_SUB_PS(XMM0,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_0,XMM0)

  SSE_MULT_PS(XMM7,XMM3)
  SSE_SUB_PS(XMM7,XMM2)

  SSE_SHUFFLE(XMM7,XMM7,0x4E)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM5)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM0,XMM6)
  SSE_MULT_PS(XMM0,XMM1)
  SSE_ADD_PS(XMM0,XMM7)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_SUB_PS_M(XMM2,SSE_ARG_1,FLOAT_12)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM6)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_SUB_PS(XMM2,XMM1)
  SSE_STORE_PS(SSE_ARG_1,FLOAT_12,XMM2)

  /* ----------------------------------------------- */

  SSE_COPY_PS(XMM1,XMM3)
  SSE_MULT_PS(XMM1,XMM6)

  SSE_SHUFFLE(XMM1,XMM1,0xB1)

  SSE_COPY_PS(XMM2,XMM4)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM0)
  SSE_SUB_PS(XMM0,XMM2)

  SSE_COPY_PS(XMM2,XMM5)
  SSE_MULT_PS(XMM2,XMM1)
  SSE_ADD_PS(XMM2,XMM7)

  SSE_SHUFFLE(XMM1,XMM1,0x4E)

  SSE_COPY_PS(XMM7,XMM4)
  SSE_MULT_PS(XMM7,XMM1)
  SSE_ADD_PS(XMM7,XMM0)

  SSE_MULT_PS(XMM1,XMM5)
  SSE_SUB_PS(XMM2,XMM1)

  /* ----------------------------------------------- */

  SSE_MULT_PS(XMM4,XMM3)

  SSE_SHUFFLE(XMM4,XMM4,0xB1)

  SSE_COPY_PS(XMM1,XMM6)
  SSE_MULT_PS(XMM1,XMM4)
  SSE_ADD_PS(XMM1,XMM7)

  SSE_COPY_PS(XMM0,XMM5)
  SSE_MULT_PS(XMM0,XMM4)
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_12,XMM7)
  SSE_SUB_PS(XMM7,XMM0)

  SSE_SHUFFLE(XMM4,XMM4,0x4E)

  SSE_MULT_PS(XMM6,XMM4)
  SSE_SUB_PS(XMM1,XMM6)

  SSE_MULT_PS(XMM5,XMM4)
  SSE_ADD_PS(XMM5,XMM7)

  /* ----------------------------------------------- */
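  /*
     Annotation (not in the Intel source): the block below forms the
     determinant as the dot product of the row gathered in XMM3 by the
     initial shuffles with the corresponding cofactors reloaded from
     FLOAT_0; two shuffle-and-add steps reduce the four products into the
     low slot of XMM6. SSE_RECIP_SS then gives a low-precision reciprocal
     approximation x0 of 1/det, which one Newton-Raphson step refines to
     x1 = 2*x0 - det*x0*x0 before it is broadcast and used to scale the
     four cofactor rows into the inverse.
  */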
  SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_MULT_PS(XMM3,XMM0)

  SSE_COPY_PS(XMM4,XMM3)
  SSE_SHUFFLE(XMM4,XMM3,0x4E)
  SSE_ADD_PS(XMM4,XMM3)

  SSE_COPY_PS(XMM6,XMM4)
  SSE_SHUFFLE(XMM6,XMM4,0xB1)
  SSE_ADD_SS(XMM6,XMM4)

  SSE_COPY_PS(XMM3,XMM6)
  SSE_RECIP_SS(XMM3,XMM6)
  SSE_COPY_SS(XMM4,XMM3)
  SSE_ADD_SS(XMM4,XMM3)
  SSE_MULT_SS(XMM3,XMM3)
  SSE_MULT_SS(XMM6,XMM3)
  SSE_SUB_SS(XMM4,XMM6)

  SSE_SHUFFLE(XMM4,XMM4,0x00)

  SSE_MULT_PS(XMM0,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM0)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM0)

  SSE_MULT_PS(XMM1,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM1)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM1)

  SSE_MULT_PS(XMM2,XMM4)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM2)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM2)

  SSE_MULT_PS(XMM4,XMM5)
  SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
  SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)

  /* ----------------------------------------------- */

  SSE_INLINE_END_1;
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
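/*
   Annotation (not part of the original file): unlike the scalar kernel
   above, the SSE variant performs no pivoting and no zero-pivot detection;
   it assumes the 4x4 block of floats is nonsingular and reasonably
   conditioned. A hypothetical call:

     float b[16] = {2,0,0,0, 0,2,0,0, 0,0,2,0, 0,0,0,2};

     PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(b));

   after which b holds the inverse in place (here diag(0.5,0.5,0.5,0.5)).
*/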