/*$Id: dgefa4.c,v 1.15 2001/03/23 23:22:07 balay Exp buschelm $*/
/*
     Inverts 4 by 4 matrix using partial pivoting.

       Used by the sparse factorization routines in
     src/mat/impls/baij/seq and src/mat/impls/bdiag/seq

       See also src/inline/ilu.h

       This is a combination of the Linpack routines
     dgefa() and dgedi() specialized for a size of 4.

*/
#include "petsc.h"

#undef __FUNCT__
#define __FUNCT__ "Kernel_A_gets_inverse_A_4"
/*
   Kernel_A_gets_inverse_A_4 - Replaces the 4x4 matrix stored (column-major,
   16 contiguous entries) at a[] with its inverse, in place.

   Method: LU factorization with partial pivoting (Linpack dgefa) followed by
   explicit inversion of U and back-multiplication by inverse(L) (Linpack
   dgedi, job = inverse only), fully unrolled/specialized for n = 4.

   Input/Output parameter:
.  a - the 16 entries of the matrix; overwritten with the inverse

   Returns 0 on success; raises a PETSc error on a zero pivot (singular or
   unstably-pivoted matrix).
*/
int Kernel_A_gets_inverse_A_4(MatScalar *a)
{
    int       i__2,i__3,kp1,j,k,l,ll,i,ipvt_l[4],*ipvt = ipvt_l-1,kb,k3;
    int       k4,j3;
    MatScalar *aa,*ax,*ay,work_l[16],*work = work_l-1,stmp;
    MatReal   tmp,max;

/*     gaussian elimination with partial pivoting */

    PetscFunctionBegin;
    /* Parameter adjustments: shift the base pointer so the Fortran-style
       1-based indexing a[i + 4*j] (i,j = 1..4) maps exactly onto the
       original 0..15 storage.  ipvt and work above are shifted the same
       way (ipvt = ipvt_l-1, work = work_l-1) for 1-based access. */
    a -= 5;

    /* Factor columns 1..3 (the 4th pivot needs no elimination below it). */
    for (k = 1; k <= 3; ++k) {
        kp1 = k + 1;
        k3  = 4*k;          /* offset of column k (1-based scheme) */
        k4  = k3 + k;       /* index of diagonal entry a(k,k) */
/*        find l = pivot index: largest |entry| on/below the diagonal in column k */

        i__2 = 4 - k;       /* number of candidate rows k..4 */
        aa   = &a[k4];
        max  = PetscAbsScalar(aa[0]);
        l    = 1;           /* pivot position relative to row k (1-based) */
        for (ll=1; ll<i__2; ll++) {
          tmp = PetscAbsScalar(aa[ll]);
          if (tmp > max) { max = tmp; l = ll+1;}
        }
        l      += k - 1;    /* convert to absolute row index */
        ipvt[k] = l;        /* record the row swap for the dgedi phase below */

        if (a[l + k3] == 0.) {
          /* NOTE(review): the elimination step k doubles as the error code
             passed to SETERRQ here - confirm against this PETSc version's
             SETERRQ(code,message) convention. */
          SETERRQ(k,"Zero pivot");
        }

/*           interchange if necessary: swap only the diagonal-column entries;
             the trailing columns are swapped inside the elimination loop */

        if (l != k) {
          stmp      = a[l + k3];
          a[l + k3] = a[k4];
          a[k4]     = stmp;
        }

/*           compute multipliers: scale the subdiagonal of column k by -1/pivot */

        stmp = -1. / a[k4];
        i__2 = 4 - k;
        aa   = &a[1 + k4];
        for (ll=0; ll<i__2; ll++) {
          aa[ll] *= stmp;
        }

/*        row elimination with column indexing: apply the multipliers to each
          trailing column j, performing the deferred row interchange first */

        ax = &a[k4+1];      /* the multipliers just computed */
        for (j = kp1; j <= 4; ++j) {
            j3   = 4*j;
            stmp = a[l + j3];   /* pivot-row entry of column j (post-swap value) */
            if (l != k) {
              a[l + j3] = a[k + j3];
              a[k + j3] = stmp;
            }

            i__3 = 4 - k;
            ay   = &a[1+k+j3];  /* sub-pivot part of column j */
            for (ll=0; ll<i__3; ll++) {
              ay[ll] += stmp*ax[ll];
            }
        }
    }
    ipvt[4] = 4;            /* last pivot is always row 4 */
    if (a[20] == 0.) {      /* a[20] == a(4,4): final pivot */
      SETERRQ(3,"Zero pivot,final row");
    }

    /*
         Now form the inverse
    */

   /*     compute inverse(u): invert the upper-triangular factor in place */

    for (k = 1; k <= 4; ++k) {
        k3    = 4*k;
        k4    = k3 + k;
        a[k4] = 1.0 / a[k4];    /* reciprocal of the diagonal */
        stmp  = -a[k4];
        i__2  = k - 1;
        aa    = &a[k3 + 1];
        /* scale the strictly-upper part of column k by -1/u(k,k) */
        for (ll=0; ll<i__2; ll++) aa[ll] *= stmp;
        kp1 = k + 1;
        if (4 < kp1) continue;  /* column 4 has no trailing columns to update */
        ax = aa;
        for (j = kp1; j <= 4; ++j) {
            j3        = 4*j;
            stmp      = a[k + j3];
            a[k + j3] = 0.0;
            ay        = &a[j3 + 1];
            for (ll=0; ll<k; ll++) {
              ay[ll] += stmp*ax[ll];
            }
        }
    }

   /*    form inverse(u)*inverse(l): sweep columns 3..1 (reverse pivot order),
         applying the stored multipliers and undoing the recorded row swaps
         as column swaps of the inverse */

    for (kb = 1; kb <= 3; ++kb) {
        k   = 4 - kb;
        k3  = 4*k;
        kp1 = k + 1;
        aa  = a + k3;
        /* stash the multiplier part of column k and zero it out */
        for (i = kp1; i <= 4; ++i) {
            work_l[i-1] = aa[i];
            /* work[i] = aa[i]; Fix for -O3 error on Origin 2000 */
            aa[i]       = 0.0;
        }
        /* column k += sum_j work[j] * column j  (fully unrolled over 4 rows) */
        for (j = kp1; j <= 4; ++j) {
            stmp  = work[j];
            ax    = &a[4*j + 1];
            ay    = &a[k3 + 1];
            ay[0] += stmp*ax[0];
            ay[1] += stmp*ax[1];
            ay[2] += stmp*ax[2];
            ay[3] += stmp*ax[3];
        }
        /* undo pivot: swap columns k and ipvt[k] of the inverse */
        l = ipvt[k];
        if (l != k) {
            ax = &a[k3 + 1];
            ay = &a[4*l + 1];
            stmp = ax[0]; ax[0] = ay[0]; ay[0] = stmp;
            stmp = ax[1]; ax[1] = ay[1]; ay[1] = stmp;
            stmp = ax[2]; ax[2] = ay[2]; ay[2] = stmp;
            stmp = ax[3]; ax[3] = ay[3]; ay[3] = stmp;
        }
    }
    PetscFunctionReturn(0);
}

#ifdef PETSC_HAVE_ICL_SSE
#include "xmmintrin.h"

#undef __FUNCT__
#define __FUNCT__ "Kernel_A_gets_inverse_A_4SSE"
/*
   Kernel_A_gets_inverse_A_4SSE - Single-precision SSE variant: replaces the
   16 floats at a[] with the inverse of the 4x4 matrix, in place.

   NOTE(review): unlike the scalar routine above, this path performs no
   zero-pivot / singularity check; a singular input yields non-finite output
   rather than a PETSc error.
*/
int Kernel_A_gets_inverse_A_4SSE(float *a)
{
  /*
     This routine is taken from Intel's Small Matrix Library.
     See: Streaming SIMD Extensions -- Inverse of 4x4 Matrix
     Order Number: 245043-001
     March 1999
     http://www.intel.com

     Note: Intel's SML uses row-wise storage for these small matrices,
     and PETSc uses column-wise storage. However since inv(A')=(inv(A))'
     the same code can be used here.

     Inverse of a 4x4 matrix via Cramer's Rule:
     bool Invert4x4(SMLXMatrix &);
  */
  __m128 minor0, minor1, minor2, minor3;
  __m128 row0, row1, row2, row3;
  __m128 det, tmp1;

  PetscFunctionBegin;
  /* Gather the matrix into four transposed SSE rows via paired 64-bit
     loads plus shuffles.
     NOTE(review): tmp1, row1 and row3 are passed to _mm_loadl_pi before
     being written - Intel's original idiom, but formally a read of an
     uninitialized variable; compilers may warn.  Both halves are
     overwritten before any dependent use. */
  tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(a)), (__m64*)(a+ 4));
  row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(a+8)), (__m64*)(a+12));
  row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
  row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
  tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(a+ 2)), (__m64*)(a+ 6));
  row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(a+10)), (__m64*)(a+14));
  row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
  row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
  /* ----------------------------------------------- */
  /* Accumulate the cofactors (minors) four at a time; each dashed section
     folds one product pair of rows into the running minors.  The shuffle
     immediates 0xB1 and 0x4E permute lanes to line up the 2x2 sub-
     determinant terms. */
  tmp1 = _mm_mul_ps(row2, row3);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  minor0 = _mm_mul_ps(row1, tmp1);
  minor1 = _mm_mul_ps(row0, tmp1);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
  minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
  minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
  /* ----------------------------------------------- */
  tmp1 = _mm_mul_ps(row1, row2);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
  minor3 = _mm_mul_ps(row0, tmp1);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
  minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
  minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
  /* ----------------------------------------------- */
  tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  row2 = _mm_shuffle_ps(row2, row2, 0x4E);
  minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
  minor2 = _mm_mul_ps(row0, tmp1);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
  minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
  minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
  /* ----------------------------------------------- */
  tmp1 = _mm_mul_ps(row0, row1);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
  minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
  minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
  /* ----------------------------------------------- */
  tmp1 = _mm_mul_ps(row0, row3);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
  minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
  minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
  /* ----------------------------------------------- */
  tmp1 = _mm_mul_ps(row0, row2);
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
  minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
  tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
  minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
  /* ----------------------------------------------- */
  /* det = dot(row0, minor0), horizontally summed into lane 0. */
  det = _mm_mul_ps(row0, minor0);
  det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
  det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
  /* 1/det: fast reciprocal estimate refined by one Newton-Raphson step
     (x1 = 2*x0 - det*x0*x0), then broadcast to all four lanes. */
  tmp1 = _mm_rcp_ss(det);
  det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
  det = _mm_shuffle_ps(det, det, 0x00);
  /* Scale the cofactors by 1/det and store the inverse back into a[]. */
  minor0 = _mm_mul_ps(det, minor0);
  _mm_storel_pi((__m64*)(a), minor0);
  _mm_storeh_pi((__m64*)(a+2), minor0);
  minor1 = _mm_mul_ps(det, minor1);
  _mm_storel_pi((__m64*)(a+4), minor1);
  _mm_storeh_pi((__m64*)(a+6), minor1);
  minor2 = _mm_mul_ps(det, minor2);
  _mm_storel_pi((__m64*)(a+ 8), minor2);
  _mm_storeh_pi((__m64*)(a+10), minor2);
  minor3 = _mm_mul_ps(det, minor3);
  _mm_storel_pi((__m64*)(a+12), minor3);
  _mm_storeh_pi((__m64*)(a+14), minor3);
  PetscFunctionReturn(0);
}

#endif