1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 /* ------------------------------------------------------------*/ 9 /* 10 Version for when blocks are 4 by 4 11 */ 12 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info) { 13 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 14 IS isrow = b->row, isicol = b->icol; 15 const PetscInt *r, *ic; 16 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 17 PetscInt *ajtmpold, *ajtmp, nz, row; 18 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 19 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 20 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 21 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 22 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 23 MatScalar m13, m14, m15, m16; 24 MatScalar *ba = b->a, *aa = a->a; 25 PetscBool pivotinblocks = b->pivotinblocks; 26 PetscReal shift = info->shiftamount; 27 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 28 29 PetscFunctionBegin; 30 PetscCall(ISGetIndices(isrow, &r)); 31 PetscCall(ISGetIndices(isicol, &ic)); 32 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 33 allowzeropivot = PetscNot(A->erroriffailure); 34 35 for (i = 0; i < n; i++) { 36 nz = bi[i + 1] - bi[i]; 37 ajtmp = bj + bi[i]; 38 for (j = 0; j < nz; j++) { 39 x = rtmp + 16 * ajtmp[j]; 40 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 41 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 42 } 43 /* load in initial (unfactored row) */ 44 idx = r[i]; 45 nz = ai[idx + 1] - ai[idx]; 46 ajtmpold = aj + ai[idx]; 47 v = aa + 16 * ai[idx]; 48 for (j = 0; j < nz; j++) { 49 x = rtmp + 16 * ic[ajtmpold[j]]; 50 x[0] = v[0]; 51 x[1] = v[1]; 52 x[2] = v[2]; 53 x[3] = v[3]; 54 x[4] = v[4]; 55 x[5] = v[5]; 56 x[6] = v[6]; 57 x[7] = v[7]; 58 x[8] = v[8]; 59 x[9] = v[9]; 60 x[10] = v[10]; 61 x[11] = v[11]; 62 x[12] = v[12]; 63 x[13] = v[13]; 64 x[14] = v[14]; 65 x[15] = v[15]; 66 v += 16; 67 } 68 row = *ajtmp++; 69 while (row < i) { 70 pc = rtmp + 16 * row; 71 p1 = pc[0]; 72 p2 = pc[1]; 73 p3 = pc[2]; 74 p4 = pc[3]; 75 p5 = pc[4]; 76 p6 = pc[5]; 77 p7 = pc[6]; 78 p8 = pc[7]; 79 p9 = pc[8]; 80 p10 = pc[9]; 81 p11 = pc[10]; 82 p12 = pc[11]; 83 p13 = pc[12]; 84 p14 = pc[13]; 85 p15 = pc[14]; 86 p16 = pc[15]; 87 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 88 pv = ba + 16 * diag_offset[row]; 89 pj = bj + diag_offset[row] + 1; 90 x1 = pv[0]; 91 x2 = pv[1]; 92 x3 = pv[2]; 93 x4 = pv[3]; 94 x5 = pv[4]; 95 x6 = pv[5]; 96 x7 = pv[6]; 97 x8 = pv[7]; 98 x9 = pv[8]; 99 x10 = pv[9]; 100 x11 = pv[10]; 101 x12 = pv[11]; 102 x13 = pv[12]; 103 x14 = pv[13]; 104 x15 = pv[14]; 105 x16 = pv[15]; 106 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 107 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 108 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 109 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 110 111 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 112 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 113 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 114 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 115 116 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 117 pc[9] = m10 = 
p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 118 pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 119 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 120 121 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 122 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 123 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 124 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 125 126 nz = bi[row + 1] - diag_offset[row] - 1; 127 pv += 16; 128 for (j = 0; j < nz; j++) { 129 x1 = pv[0]; 130 x2 = pv[1]; 131 x3 = pv[2]; 132 x4 = pv[3]; 133 x5 = pv[4]; 134 x6 = pv[5]; 135 x7 = pv[6]; 136 x8 = pv[7]; 137 x9 = pv[8]; 138 x10 = pv[9]; 139 x11 = pv[10]; 140 x12 = pv[11]; 141 x13 = pv[12]; 142 x14 = pv[13]; 143 x15 = pv[14]; 144 x16 = pv[15]; 145 x = rtmp + 16 * pj[j]; 146 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 147 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 148 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 149 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 150 151 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 152 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 153 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 154 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 155 156 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 157 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 158 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 159 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 160 161 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 162 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 163 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 164 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 165 166 pv += 16; 167 } 168 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 169 } 170 row = *ajtmp++; 171 } 172 /* finished row so stick it into b->a */ 173 pv = ba + 16 * bi[i]; 174 pj = bj + bi[i]; 175 nz = bi[i + 1] - bi[i]; 176 for (j = 0; j < nz; j++) { 177 x = rtmp + 16 * pj[j]; 178 pv[0] = x[0]; 179 pv[1] = x[1]; 180 pv[2] = x[2]; 181 pv[3] = x[3]; 182 pv[4] = x[4]; 183 pv[5] = x[5]; 184 pv[6] = x[6]; 185 pv[7] = x[7]; 186 pv[8] = x[8]; 187 pv[9] = x[9]; 188 pv[10] = x[10]; 189 pv[11] = x[11]; 190 pv[12] = x[12]; 191 pv[13] = x[13]; 192 pv[14] = x[14]; 193 pv[15] = x[15]; 194 pv += 16; 195 } 196 /* invert diagonal block */ 197 w = ba + 16 * diag_offset[i]; 198 if (pivotinblocks) { 199 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 200 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 201 } else { 202 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 203 } 204 } 205 206 PetscCall(PetscFree(rtmp)); 207 PetscCall(ISRestoreIndices(isicol, &ic)); 208 PetscCall(ISRestoreIndices(isrow, &r)); 209 210 C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 211 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 212 C->assembled = PETSC_TRUE; 213 214 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 215 PetscFunctionReturn(0); 216 } 217 218 /* MatLUFactorNumeric_SeqBAIJ_4 - 219 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 220 PetscKernel_A_gets_A_times_B() 221 PetscKernel_A_gets_A_minus_B_times_C() 222 PetscKernel_A_gets_inverse_A() 223 */ 224 225 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info) { 226 Mat C = B; 227 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 228 IS isrow = b->row, isicol = b->icol; 229 const 
PetscInt *r, *ic; 230 PetscInt i, j, k, nz, nzL, row; 231 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 232 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 233 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 234 PetscInt flg; 235 PetscReal shift; 236 PetscBool allowzeropivot, zeropivotdetected; 237 238 PetscFunctionBegin; 239 allowzeropivot = PetscNot(A->erroriffailure); 240 PetscCall(ISGetIndices(isrow, &r)); 241 PetscCall(ISGetIndices(isicol, &ic)); 242 243 if (info->shifttype == MAT_SHIFT_NONE) { 244 shift = 0; 245 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 246 shift = info->shiftamount; 247 } 248 249 /* generate work space needed by the factorization */ 250 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 251 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 252 253 for (i = 0; i < n; i++) { 254 /* zero rtmp */ 255 /* L part */ 256 nz = bi[i + 1] - bi[i]; 257 bjtmp = bj + bi[i]; 258 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 259 260 /* U part */ 261 nz = bdiag[i] - bdiag[i + 1]; 262 bjtmp = bj + bdiag[i + 1] + 1; 263 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 264 265 /* load in initial (unfactored row) */ 266 nz = ai[r[i] + 1] - ai[r[i]]; 267 ajtmp = aj + ai[r[i]]; 268 v = aa + bs2 * ai[r[i]]; 269 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); } 270 271 /* elimination */ 272 bjtmp = bj + bi[i]; 273 nzL = bi[i + 1] - bi[i]; 274 for (k = 0; k < nzL; k++) { 275 row = bjtmp[k]; 276 pc = rtmp + bs2 * row; 277 for (flg = 0, j = 0; j < bs2; j++) { 278 if (pc[j] != 0.0) { 279 flg = 1; 280 break; 281 } 282 } 283 if (flg) { 284 pv = b->a + bs2 * bdiag[row]; 285 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 286 PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 287 288 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 289 pv = b->a + bs2 * (bdiag[row + 1] + 1); 290 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 291 for (j = 0; j < nz; j++) { 292 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 293 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 294 v = rtmp + bs2 * pj[j]; 295 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 296 pv += bs2; 297 } 298 PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 299 } 300 } 301 302 /* finished row so stick it into b->a */ 303 /* L part */ 304 pv = b->a + bs2 * bi[i]; 305 pj = b->j + bi[i]; 306 nz = bi[i + 1] - bi[i]; 307 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 308 309 /* Mark diagonal and invert diagonal for simpler triangular solves */ 310 pv = b->a + bs2 * bdiag[i]; 311 pj = b->j + bdiag[i]; 312 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 313 PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 314 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 315 316 /* U part */ 317 pv = b->a + bs2 * (bdiag[i + 1] + 1); 318 pj = b->j + bdiag[i + 1] + 1; 319 nz = bdiag[i] - bdiag[i + 1] - 1; 320 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 321 } 322 323 PetscCall(PetscFree2(rtmp, mwork)); 324 PetscCall(ISRestoreIndices(isicol, &ic)); 325 PetscCall(ISRestoreIndices(isrow, &r)); 326 327 C->ops->solve = MatSolve_SeqBAIJ_4; 328 C->ops->solvetranspose = 
MatSolveTranspose_SeqBAIJ_4; 329 C->assembled = PETSC_TRUE; 330 331 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 332 PetscFunctionReturn(0); 333 } 334 335 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) { 336 /* 337 Default Version for when blocks are 4 by 4 Using natural ordering 338 */ 339 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 340 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 341 PetscInt *ajtmpold, *ajtmp, nz, row; 342 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 343 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 344 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 345 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 346 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 347 MatScalar m13, m14, m15, m16; 348 MatScalar *ba = b->a, *aa = a->a; 349 PetscBool pivotinblocks = b->pivotinblocks; 350 PetscReal shift = info->shiftamount; 351 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 352 353 PetscFunctionBegin; 354 allowzeropivot = PetscNot(A->erroriffailure); 355 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 356 357 for (i = 0; i < n; i++) { 358 nz = bi[i + 1] - bi[i]; 359 ajtmp = bj + bi[i]; 360 for (j = 0; j < nz; j++) { 361 x = rtmp + 16 * ajtmp[j]; 362 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 363 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 364 } 365 /* load in initial (unfactored row) */ 366 nz = ai[i + 1] - ai[i]; 367 ajtmpold = aj + ai[i]; 368 v = aa + 16 * ai[i]; 369 for (j = 0; j < nz; j++) { 370 x = rtmp + 16 * ajtmpold[j]; 371 x[0] = v[0]; 372 x[1] = v[1]; 373 x[2] = v[2]; 374 x[3] = v[3]; 375 x[4] = v[4]; 376 x[5] = v[5]; 377 x[6] = v[6]; 378 x[7] = v[7]; 379 x[8] = v[8]; 380 x[9] = v[9]; 381 x[10] = v[10]; 382 x[11] = v[11]; 383 x[12] = v[12]; 384 x[13] = v[13]; 385 x[14] = v[14]; 386 x[15] = v[15]; 387 v += 16; 388 } 389 row = *ajtmp++; 390 while (row < i) { 391 pc = rtmp + 16 * row; 392 p1 = pc[0]; 393 p2 = pc[1]; 394 p3 = pc[2]; 395 p4 = pc[3]; 396 p5 = pc[4]; 397 p6 = pc[5]; 398 p7 = pc[6]; 399 p8 = pc[7]; 400 p9 = pc[8]; 401 p10 = pc[9]; 402 p11 = pc[10]; 403 p12 = pc[11]; 404 p13 = pc[12]; 405 p14 = pc[13]; 406 p15 = pc[14]; 407 p16 = pc[15]; 408 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 409 pv = ba + 16 * diag_offset[row]; 410 pj = bj + diag_offset[row] + 1; 411 x1 = pv[0]; 412 x2 = pv[1]; 413 x3 = pv[2]; 414 x4 = pv[3]; 415 x5 = pv[4]; 416 x6 = pv[5]; 417 x7 = pv[6]; 418 x8 = pv[7]; 419 x9 = pv[8]; 420 x10 = pv[9]; 421 x11 = pv[10]; 422 x12 = pv[11]; 423 x13 = pv[12]; 424 x14 = pv[13]; 425 x15 = pv[14]; 426 x16 = pv[15]; 427 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 428 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 429 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 430 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 431 432 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 433 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 434 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 435 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 436 437 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 438 pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 439 pc[10] = 
m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 440 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 441 442 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 443 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 444 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 445 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 446 nz = bi[row + 1] - diag_offset[row] - 1; 447 pv += 16; 448 for (j = 0; j < nz; j++) { 449 x1 = pv[0]; 450 x2 = pv[1]; 451 x3 = pv[2]; 452 x4 = pv[3]; 453 x5 = pv[4]; 454 x6 = pv[5]; 455 x7 = pv[6]; 456 x8 = pv[7]; 457 x9 = pv[8]; 458 x10 = pv[9]; 459 x11 = pv[10]; 460 x12 = pv[11]; 461 x13 = pv[12]; 462 x14 = pv[13]; 463 x15 = pv[14]; 464 x16 = pv[15]; 465 x = rtmp + 16 * pj[j]; 466 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 467 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 468 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 469 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 470 471 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 472 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 473 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 474 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 475 476 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 477 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 478 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 479 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 480 481 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 482 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 483 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 484 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 485 486 pv += 16; 487 } 488 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 489 } 490 row = *ajtmp++; 491 } 492 /* finished row so stick it into b->a */ 493 pv = ba + 16 * bi[i]; 494 pj = bj + bi[i]; 495 nz = bi[i + 1] - bi[i]; 496 for (j = 0; j < nz; j++) { 497 x = rtmp + 16 * pj[j]; 498 pv[0] = x[0]; 499 pv[1] = x[1]; 500 pv[2] = x[2]; 501 pv[3] = x[3]; 502 pv[4] = x[4]; 503 pv[5] = x[5]; 504 pv[6] = x[6]; 505 pv[7] = x[7]; 506 pv[8] = x[8]; 507 pv[9] = x[9]; 508 pv[10] = x[10]; 509 pv[11] = x[11]; 510 pv[12] = x[12]; 511 pv[13] = x[13]; 512 pv[14] = x[14]; 513 pv[15] = x[15]; 514 pv += 16; 515 } 516 /* invert diagonal block */ 517 w = ba + 16 * diag_offset[i]; 518 if (pivotinblocks) { 519 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 520 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 521 } else { 522 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 523 } 524 } 525 526 PetscCall(PetscFree(rtmp)); 527 528 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 529 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 530 C->assembled = PETSC_TRUE; 531 532 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 533 PetscFunctionReturn(0); 534 } 535 536 /* 537 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 538 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 539 */ 540 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) { 541 Mat C = B; 542 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 543 PetscInt i, j, k, nz, nzL, row; 544 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 545 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 546 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 547 PetscInt flg; 
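  /*
     Layout used by every 4x4 kernel in this file: a block is bs2 = 16 contiguous
     MatScalars stored column-major, so entry (r,c) of a block at address x is
     x[4*c + r] (this can be read off the unrolled products in the routines above).
     Each pass of the factorization loop below zeroes the sparse work row rtmp,
     loads block row i of A into it, eliminates against the already-factored rows
     (the diagonal blocks of U are kept pre-inverted, so forming the multiplier is
     a block-block multiply rather than a triangular solve), copies the finished L
     and U parts back into b->a, and finally inverts and stores the diagonal block.
  */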
548 PetscReal shift; 549 PetscBool allowzeropivot, zeropivotdetected; 550 551 PetscFunctionBegin; 552 allowzeropivot = PetscNot(A->erroriffailure); 553 554 /* generate work space needed by the factorization */ 555 PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 556 PetscCall(PetscArrayzero(rtmp, bs2 * n)); 557 558 if (info->shifttype == MAT_SHIFT_NONE) { 559 shift = 0; 560 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 561 shift = info->shiftamount; 562 } 563 564 for (i = 0; i < n; i++) { 565 /* zero rtmp */ 566 /* L part */ 567 nz = bi[i + 1] - bi[i]; 568 bjtmp = bj + bi[i]; 569 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 570 571 /* U part */ 572 nz = bdiag[i] - bdiag[i + 1]; 573 bjtmp = bj + bdiag[i + 1] + 1; 574 for (j = 0; j < nz; j++) { PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); } 575 576 /* load in initial (unfactored row) */ 577 nz = ai[i + 1] - ai[i]; 578 ajtmp = aj + ai[i]; 579 v = aa + bs2 * ai[i]; 580 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); } 581 582 /* elimination */ 583 bjtmp = bj + bi[i]; 584 nzL = bi[i + 1] - bi[i]; 585 for (k = 0; k < nzL; k++) { 586 row = bjtmp[k]; 587 pc = rtmp + bs2 * row; 588 for (flg = 0, j = 0; j < bs2; j++) { 589 if (pc[j] != 0.0) { 590 flg = 1; 591 break; 592 } 593 } 594 if (flg) { 595 pv = b->a + bs2 * bdiag[row]; 596 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 597 PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork)); 598 599 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 600 pv = b->a + bs2 * (bdiag[row + 1] + 1); 601 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 602 for (j = 0; j < nz; j++) { 603 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 604 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 605 v = rtmp + bs2 * pj[j]; 606 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv)); 607 pv += bs2; 608 } 609 PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 610 } 611 } 612 613 /* finished row so stick it into b->a */ 614 /* L part */ 615 pv = b->a + bs2 * bi[i]; 616 pj = b->j + bi[i]; 617 nz = bi[i + 1] - bi[i]; 618 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 619 620 /* Mark diagonal and invert diagonal for simpler triangular solves */ 621 pv = b->a + bs2 * bdiag[i]; 622 pj = b->j + bdiag[i]; 623 PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 624 PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected)); 625 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 626 627 /* U part */ 628 pv = b->a + bs2 * (bdiag[i + 1] + 1); 629 pj = b->j + bdiag[i + 1] + 1; 630 nz = bdiag[i] - bdiag[i + 1] - 1; 631 for (j = 0; j < nz; j++) { PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); } 632 } 633 PetscCall(PetscFree2(rtmp, mwork)); 634 635 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 636 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 637 C->assembled = PETSC_TRUE; 638 639 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 640 PetscFunctionReturn(0); 641 } 642 643 #if defined(PETSC_HAVE_SSE) 644 645 #include PETSC_HAVE_SSE 646 647 /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 648 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const 
MatFactorInfo *info) {
  Mat C = B; /* the numeric factor is computed into B, referred to as C below, as in MatLUFactorNumeric_SeqBAIJ_4() */
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int i, j, n = a->mbs;
  int *bj = b->j, *bjtmp, *pj;
  int row;
  int *ajtmpold, nz, *bi = b->i;
  int *diag_offset = b->diag, *ai = a->i, *aj = a->j;
  MatScalar *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar *ba = b->a, *aa = a->a;
  int nonzero = 0;
  PetscBool pivotinblocks = b->pivotinblocks;
  PetscReal shift = info->shiftamount;
  PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7, XMM7);
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * bjtmp[j];
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i + 1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v, x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16 * row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1)
743 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 744 745 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 746 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 747 748 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 749 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 750 751 /* Compare block to zero block */ 752 753 SSE_COPY_PS(XMM4, XMM7) 754 SSE_CMPNEQ_PS(XMM4, XMM0) 755 756 SSE_COPY_PS(XMM5, XMM7) 757 SSE_CMPNEQ_PS(XMM5, XMM1) 758 759 SSE_COPY_PS(XMM6, XMM7) 760 SSE_CMPNEQ_PS(XMM6, XMM2) 761 762 SSE_CMPNEQ_PS(XMM7, XMM3) 763 764 /* Reduce the comparisons to one SSE register */ 765 SSE_OR_PS(XMM6, XMM7) 766 SSE_OR_PS(XMM5, XMM4) 767 SSE_OR_PS(XMM5, XMM6) 768 SSE_INLINE_END_1; 769 770 /* Reduce the one SSE register to an integer register for branching */ 771 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 772 MOVEMASK(nonzero, XMM5); 773 774 /* If block is nonzero ... */ 775 if (nonzero) { 776 pv = ba + 16 * diag_offset[row]; 777 PREFETCH_L1(&pv[16]); 778 pj = bj + diag_offset[row] + 1; 779 780 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 781 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 782 /* but the diagonal was inverted already */ 783 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 784 785 SSE_INLINE_BEGIN_2(pv, pc) 786 /* Column 0, product is accumulated in XMM4 */ 787 SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 788 SSE_SHUFFLE(XMM4, XMM4, 0x00) 789 SSE_MULT_PS(XMM4, XMM0) 790 791 SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 792 SSE_SHUFFLE(XMM5, XMM5, 0x00) 793 SSE_MULT_PS(XMM5, XMM1) 794 SSE_ADD_PS(XMM4, XMM5) 795 796 SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 797 SSE_SHUFFLE(XMM6, XMM6, 0x00) 798 SSE_MULT_PS(XMM6, XMM2) 799 SSE_ADD_PS(XMM4, XMM6) 800 801 SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 802 SSE_SHUFFLE(XMM7, XMM7, 0x00) 803 SSE_MULT_PS(XMM7, XMM3) 804 SSE_ADD_PS(XMM4, XMM7) 805 806 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 807 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 808 809 /* Column 1, product is accumulated in XMM5 */ 810 SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 811 SSE_SHUFFLE(XMM5, XMM5, 0x00) 812 SSE_MULT_PS(XMM5, XMM0) 813 814 SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 815 SSE_SHUFFLE(XMM6, XMM6, 0x00) 816 SSE_MULT_PS(XMM6, XMM1) 817 SSE_ADD_PS(XMM5, XMM6) 818 819 SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 820 SSE_SHUFFLE(XMM7, XMM7, 0x00) 821 SSE_MULT_PS(XMM7, XMM2) 822 SSE_ADD_PS(XMM5, XMM7) 823 824 SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 825 SSE_SHUFFLE(XMM6, XMM6, 0x00) 826 SSE_MULT_PS(XMM6, XMM3) 827 SSE_ADD_PS(XMM5, XMM6) 828 829 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 830 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 831 832 SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 833 834 /* Column 2, product is accumulated in XMM6 */ 835 SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 836 SSE_SHUFFLE(XMM6, XMM6, 0x00) 837 SSE_MULT_PS(XMM6, XMM0) 838 839 SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 840 SSE_SHUFFLE(XMM7, XMM7, 0x00) 841 SSE_MULT_PS(XMM7, XMM1) 842 SSE_ADD_PS(XMM6, XMM7) 843 844 SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 845 SSE_SHUFFLE(XMM7, XMM7, 0x00) 846 SSE_MULT_PS(XMM7, XMM2) 847 SSE_ADD_PS(XMM6, XMM7) 848 849 SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 850 SSE_SHUFFLE(XMM7, XMM7, 0x00) 851 SSE_MULT_PS(XMM7, XMM3) 852 SSE_ADD_PS(XMM6, XMM7) 853 854 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 855 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 856 857 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 858 /* Column 3, product is accumulated in XMM0 */ 859 SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 860 SSE_SHUFFLE(XMM7, XMM7, 0x00) 861 SSE_MULT_PS(XMM0, XMM7) 862 863 
SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 864 SSE_SHUFFLE(XMM7, XMM7, 0x00) 865 SSE_MULT_PS(XMM1, XMM7) 866 SSE_ADD_PS(XMM0, XMM1) 867 868 SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 869 SSE_SHUFFLE(XMM1, XMM1, 0x00) 870 SSE_MULT_PS(XMM1, XMM2) 871 SSE_ADD_PS(XMM0, XMM1) 872 873 SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 874 SSE_SHUFFLE(XMM7, XMM7, 0x00) 875 SSE_MULT_PS(XMM3, XMM7) 876 SSE_ADD_PS(XMM0, XMM3) 877 878 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 879 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 880 881 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 882 /* This is code to be maintained and read by humans after all. */ 883 /* Copy Multiplier Col 3 into XMM3 */ 884 SSE_COPY_PS(XMM3, XMM0) 885 /* Copy Multiplier Col 2 into XMM2 */ 886 SSE_COPY_PS(XMM2, XMM6) 887 /* Copy Multiplier Col 1 into XMM1 */ 888 SSE_COPY_PS(XMM1, XMM5) 889 /* Copy Multiplier Col 0 into XMM0 */ 890 SSE_COPY_PS(XMM0, XMM4) 891 SSE_INLINE_END_2; 892 893 /* Update the row: */ 894 nz = bi[row + 1] - diag_offset[row] - 1; 895 pv += 16; 896 for (j = 0; j < nz; j++) { 897 PREFETCH_L1(&pv[16]); 898 x = rtmp + 16 * pj[j]; 899 /* x = rtmp + 4*pj[j]; */ 900 901 /* X:=X-M*PV, One column at a time */ 902 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 903 SSE_INLINE_BEGIN_2(x, pv) 904 /* Load First Column of X*/ 905 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 906 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 907 908 /* Matrix-Vector Product: */ 909 SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 910 SSE_SHUFFLE(XMM5, XMM5, 0x00) 911 SSE_MULT_PS(XMM5, XMM0) 912 SSE_SUB_PS(XMM4, XMM5) 913 914 SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 915 SSE_SHUFFLE(XMM6, XMM6, 0x00) 916 SSE_MULT_PS(XMM6, XMM1) 917 SSE_SUB_PS(XMM4, XMM6) 918 919 SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 920 SSE_SHUFFLE(XMM7, XMM7, 0x00) 921 SSE_MULT_PS(XMM7, XMM2) 922 SSE_SUB_PS(XMM4, XMM7) 923 924 SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 925 SSE_SHUFFLE(XMM5, XMM5, 0x00) 926 SSE_MULT_PS(XMM5, XMM3) 927 SSE_SUB_PS(XMM4, XMM5) 928 929 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 930 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 931 932 /* Second Column */ 933 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 934 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 935 936 /* Matrix-Vector Product: */ 937 SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 938 SSE_SHUFFLE(XMM6, XMM6, 0x00) 939 SSE_MULT_PS(XMM6, XMM0) 940 SSE_SUB_PS(XMM5, XMM6) 941 942 SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 943 SSE_SHUFFLE(XMM7, XMM7, 0x00) 944 SSE_MULT_PS(XMM7, XMM1) 945 SSE_SUB_PS(XMM5, XMM7) 946 947 SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 948 SSE_SHUFFLE(XMM4, XMM4, 0x00) 949 SSE_MULT_PS(XMM4, XMM2) 950 SSE_SUB_PS(XMM5, XMM4) 951 952 SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 953 SSE_SHUFFLE(XMM6, XMM6, 0x00) 954 SSE_MULT_PS(XMM6, XMM3) 955 SSE_SUB_PS(XMM5, XMM6) 956 957 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 958 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 959 960 SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 961 962 /* Third Column */ 963 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 964 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 965 966 /* Matrix-Vector Product: */ 967 SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 968 SSE_SHUFFLE(XMM7, XMM7, 0x00) 969 SSE_MULT_PS(XMM7, XMM0) 970 SSE_SUB_PS(XMM6, XMM7) 971 972 SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 973 SSE_SHUFFLE(XMM4, XMM4, 0x00) 974 SSE_MULT_PS(XMM4, XMM1) 975 SSE_SUB_PS(XMM6, XMM4) 976 977 SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 978 SSE_SHUFFLE(XMM5, XMM5, 0x00) 979 SSE_MULT_PS(XMM5, XMM2) 980 SSE_SUB_PS(XMM6, XMM5) 981 982 SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 983 SSE_SHUFFLE(XMM7, XMM7, 0x00) 984 SSE_MULT_PS(XMM7, 
XMM3) 985 SSE_SUB_PS(XMM6, XMM7) 986 987 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 988 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 989 990 /* Fourth Column */ 991 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 992 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 993 994 /* Matrix-Vector Product: */ 995 SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 996 SSE_SHUFFLE(XMM5, XMM5, 0x00) 997 SSE_MULT_PS(XMM5, XMM0) 998 SSE_SUB_PS(XMM4, XMM5) 999 1000 SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1001 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1002 SSE_MULT_PS(XMM6, XMM1) 1003 SSE_SUB_PS(XMM4, XMM6) 1004 1005 SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1006 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1007 SSE_MULT_PS(XMM7, XMM2) 1008 SSE_SUB_PS(XMM4, XMM7) 1009 1010 SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1011 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1012 SSE_MULT_PS(XMM5, XMM3) 1013 SSE_SUB_PS(XMM4, XMM5) 1014 1015 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1016 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1017 SSE_INLINE_END_2; 1018 pv += 16; 1019 } 1020 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1021 } 1022 row = *bjtmp++; 1023 /* row = (*bjtmp++)/4; */ 1024 } 1025 /* finished row so stick it into b->a */ 1026 pv = ba + 16 * bi[i]; 1027 pj = bj + bi[i]; 1028 nz = bi[i + 1] - bi[i]; 1029 1030 /* Copy x block back into pv block */ 1031 for (j = 0; j < nz; j++) { 1032 x = rtmp + 16 * pj[j]; 1033 /* x = rtmp+4*pj[j]; */ 1034 1035 SSE_INLINE_BEGIN_2(x, pv) 1036 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1037 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1038 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1039 1040 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1041 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1042 1043 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1044 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1045 1046 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1047 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1048 1049 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1050 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1051 1052 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1053 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1054 1055 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1056 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1057 1058 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1059 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1060 SSE_INLINE_END_2; 1061 pv += 16; 1062 } 1063 /* invert diagonal block */ 1064 w = ba + 16 * diag_offset[i]; 1065 if (pivotinblocks) { 1066 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1067 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1068 } else { 1069 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1070 } 1071 /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 1072 /* Note: Using Kramer's rule, flop count below might be infairly high or low? 
*/ 1073 } 1074 1075 PetscCall(PetscFree(rtmp)); 1076 1077 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1078 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1079 C->assembled = PETSC_TRUE; 1080 1081 PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs)); 1082 /* Flop Count from inverting diagonal blocks */ 1083 SSE_SCOPE_END; 1084 PetscFunctionReturn(0); 1085 } 1086 1087 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) { 1088 Mat A = C; 1089 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 1090 int i, j, n = a->mbs; 1091 unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj; 1092 unsigned short *aj = (unsigned short *)(a->j), *ajtmp; 1093 unsigned int row; 1094 int nz, *bi = b->i; 1095 int *diag_offset = b->diag, *ai = a->i; 1096 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 1097 MatScalar *ba = b->a, *aa = a->a; 1098 int nonzero = 0; 1099 PetscBool pivotinblocks = b->pivotinblocks; 1100 PetscReal shift = info->shiftamount; 1101 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 1102 1103 PetscFunctionBegin; 1104 allowzeropivot = PetscNot(A->erroriffailure); 1105 SSE_SCOPE_BEGIN; 1106 1107 PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 1108 PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 1109 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 1110 PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 1111 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1112 /* colscale = 4; */ 1113 /* } */ 1114 1115 for (i = 0; i < n; i++) { 1116 nz = bi[i + 1] - bi[i]; 1117 bjtmp = bj + bi[i]; 1118 /* zero out the 4x4 block accumulators */ 1119 /* zero out one register */ 1120 XOR_PS(XMM7, XMM7); 1121 for (j = 0; j < nz; j++) { 1122 x = rtmp + 16 * ((unsigned int)bjtmp[j]); 1123 /* x = rtmp+4*bjtmp[j]; */ 1124 SSE_INLINE_BEGIN_1(x) 1125 /* Copy zero register to memory locations */ 1126 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1127 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 1128 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 1129 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 1130 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 1131 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 1132 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 1133 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1134 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 1135 SSE_INLINE_END_1; 1136 } 1137 /* load in initial (unfactored row) */ 1138 nz = ai[i + 1] - ai[i]; 1139 ajtmp = aj + ai[i]; 1140 v = aa + 16 * ai[i]; 1141 for (j = 0; j < nz; j++) { 1142 x = rtmp + 16 * ((unsigned int)ajtmp[j]); 1143 /* x = rtmp+colscale*ajtmp[j]; */ 1144 /* Copy v block into x block */ 1145 SSE_INLINE_BEGIN_2(v, x) 1146 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1147 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1148 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 1149 1150 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 1151 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 1152 1153 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 1154 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 1155 1156 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 1157 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 1158 1159 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 1160 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 1161 1162 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, 
XMM5) 1163 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1164 1165 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1166 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1167 1168 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1169 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1170 SSE_INLINE_END_2; 1171 1172 v += 16; 1173 } 1174 /* row = (*bjtmp++)/4; */ 1175 row = (unsigned int)(*bjtmp++); 1176 while (row < i) { 1177 pc = rtmp + 16 * row; 1178 SSE_INLINE_BEGIN_1(pc) 1179 /* Load block from lower triangle */ 1180 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1181 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1182 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1183 1184 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1185 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1186 1187 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1188 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1189 1190 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1191 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1192 1193 /* Compare block to zero block */ 1194 1195 SSE_COPY_PS(XMM4, XMM7) 1196 SSE_CMPNEQ_PS(XMM4, XMM0) 1197 1198 SSE_COPY_PS(XMM5, XMM7) 1199 SSE_CMPNEQ_PS(XMM5, XMM1) 1200 1201 SSE_COPY_PS(XMM6, XMM7) 1202 SSE_CMPNEQ_PS(XMM6, XMM2) 1203 1204 SSE_CMPNEQ_PS(XMM7, XMM3) 1205 1206 /* Reduce the comparisons to one SSE register */ 1207 SSE_OR_PS(XMM6, XMM7) 1208 SSE_OR_PS(XMM5, XMM4) 1209 SSE_OR_PS(XMM5, XMM6) 1210 SSE_INLINE_END_1; 1211 1212 /* Reduce the one SSE register to an integer register for branching */ 1213 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1214 MOVEMASK(nonzero, XMM5); 1215 1216 /* If block is nonzero ... */ 1217 if (nonzero) { 1218 pv = ba + 16 * diag_offset[row]; 1219 PREFETCH_L1(&pv[16]); 1220 pj = bj + diag_offset[row] + 1; 1221 1222 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1223 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1224 /* but the diagonal was inverted already */ 1225 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1226 1227 SSE_INLINE_BEGIN_2(pv, pc) 1228 /* Column 0, product is accumulated in XMM4 */ 1229 SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1230 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1231 SSE_MULT_PS(XMM4, XMM0) 1232 1233 SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1234 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1235 SSE_MULT_PS(XMM5, XMM1) 1236 SSE_ADD_PS(XMM4, XMM5) 1237 1238 SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1239 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1240 SSE_MULT_PS(XMM6, XMM2) 1241 SSE_ADD_PS(XMM4, XMM6) 1242 1243 SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1244 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1245 SSE_MULT_PS(XMM7, XMM3) 1246 SSE_ADD_PS(XMM4, XMM7) 1247 1248 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1249 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1250 1251 /* Column 1, product is accumulated in XMM5 */ 1252 SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1253 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1254 SSE_MULT_PS(XMM5, XMM0) 1255 1256 SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1257 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1258 SSE_MULT_PS(XMM6, XMM1) 1259 SSE_ADD_PS(XMM5, XMM6) 1260 1261 SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1262 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1263 SSE_MULT_PS(XMM7, XMM2) 1264 SSE_ADD_PS(XMM5, XMM7) 1265 1266 SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1267 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1268 SSE_MULT_PS(XMM6, XMM3) 1269 SSE_ADD_PS(XMM5, XMM6) 1270 1271 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1272 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1273 1274 SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1275 1276 /* Column 2, product is accumulated in XMM6 */ 1277 SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 1278 SSE_SHUFFLE(XMM6, 
XMM6, 0x00) 1279 SSE_MULT_PS(XMM6, XMM0) 1280 1281 SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1282 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1283 SSE_MULT_PS(XMM7, XMM1) 1284 SSE_ADD_PS(XMM6, XMM7) 1285 1286 SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1287 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1288 SSE_MULT_PS(XMM7, XMM2) 1289 SSE_ADD_PS(XMM6, XMM7) 1290 1291 SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1292 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1293 SSE_MULT_PS(XMM7, XMM3) 1294 SSE_ADD_PS(XMM6, XMM7) 1295 1296 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1297 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1298 1299 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1300 /* Column 3, product is accumulated in XMM0 */ 1301 SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1302 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1303 SSE_MULT_PS(XMM0, XMM7) 1304 1305 SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1306 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1307 SSE_MULT_PS(XMM1, XMM7) 1308 SSE_ADD_PS(XMM0, XMM1) 1309 1310 SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1311 SSE_SHUFFLE(XMM1, XMM1, 0x00) 1312 SSE_MULT_PS(XMM1, XMM2) 1313 SSE_ADD_PS(XMM0, XMM1) 1314 1315 SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1316 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1317 SSE_MULT_PS(XMM3, XMM7) 1318 SSE_ADD_PS(XMM0, XMM3) 1319 1320 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1321 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1322 1323 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1324 /* This is code to be maintained and read by humans after all. */ 1325 /* Copy Multiplier Col 3 into XMM3 */ 1326 SSE_COPY_PS(XMM3, XMM0) 1327 /* Copy Multiplier Col 2 into XMM2 */ 1328 SSE_COPY_PS(XMM2, XMM6) 1329 /* Copy Multiplier Col 1 into XMM1 */ 1330 SSE_COPY_PS(XMM1, XMM5) 1331 /* Copy Multiplier Col 0 into XMM0 */ 1332 SSE_COPY_PS(XMM0, XMM4) 1333 SSE_INLINE_END_2; 1334 1335 /* Update the row: */ 1336 nz = bi[row + 1] - diag_offset[row] - 1; 1337 pv += 16; 1338 for (j = 0; j < nz; j++) { 1339 PREFETCH_L1(&pv[16]); 1340 x = rtmp + 16 * ((unsigned int)pj[j]); 1341 /* x = rtmp + 4*pj[j]; */ 1342 1343 /* X:=X-M*PV, One column at a time */ 1344 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1345 SSE_INLINE_BEGIN_2(x, pv) 1346 /* Load First Column of X*/ 1347 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1348 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1349 1350 /* Matrix-Vector Product: */ 1351 SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 1352 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1353 SSE_MULT_PS(XMM5, XMM0) 1354 SSE_SUB_PS(XMM4, XMM5) 1355 1356 SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 1357 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1358 SSE_MULT_PS(XMM6, XMM1) 1359 SSE_SUB_PS(XMM4, XMM6) 1360 1361 SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 1362 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1363 SSE_MULT_PS(XMM7, XMM2) 1364 SSE_SUB_PS(XMM4, XMM7) 1365 1366 SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 1367 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1368 SSE_MULT_PS(XMM5, XMM3) 1369 SSE_SUB_PS(XMM4, XMM5) 1370 1371 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1372 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1373 1374 /* Second Column */ 1375 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1376 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1377 1378 /* Matrix-Vector Product: */ 1379 SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 1380 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1381 SSE_MULT_PS(XMM6, XMM0) 1382 SSE_SUB_PS(XMM5, XMM6) 1383 1384 SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 1385 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1386 SSE_MULT_PS(XMM7, XMM1) 1387 SSE_SUB_PS(XMM5, XMM7) 1388 1389 SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 1390 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1391 SSE_MULT_PS(XMM4, XMM2) 1392 SSE_SUB_PS(XMM5, XMM4) 1393 1394 
SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 1395 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1396 SSE_MULT_PS(XMM6, XMM3) 1397 SSE_SUB_PS(XMM5, XMM6) 1398 1399 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1400 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1401 1402 SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 1403 1404 /* Third Column */ 1405 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1406 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1407 1408 /* Matrix-Vector Product: */ 1409 SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 1410 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1411 SSE_MULT_PS(XMM7, XMM0) 1412 SSE_SUB_PS(XMM6, XMM7) 1413 1414 SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 1415 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1416 SSE_MULT_PS(XMM4, XMM1) 1417 SSE_SUB_PS(XMM6, XMM4) 1418 1419 SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 1420 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1421 SSE_MULT_PS(XMM5, XMM2) 1422 SSE_SUB_PS(XMM6, XMM5) 1423 1424 SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 1425 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1426 SSE_MULT_PS(XMM7, XMM3) 1427 SSE_SUB_PS(XMM6, XMM7) 1428 1429 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1430 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1431 1432 /* Fourth Column */ 1433 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1434 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1435 1436 /* Matrix-Vector Product: */ 1437 SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1438 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1439 SSE_MULT_PS(XMM5, XMM0) 1440 SSE_SUB_PS(XMM4, XMM5) 1441 1442 SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1443 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1444 SSE_MULT_PS(XMM6, XMM1) 1445 SSE_SUB_PS(XMM4, XMM6) 1446 1447 SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1448 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1449 SSE_MULT_PS(XMM7, XMM2) 1450 SSE_SUB_PS(XMM4, XMM7) 1451 1452 SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1453 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1454 SSE_MULT_PS(XMM5, XMM3) 1455 SSE_SUB_PS(XMM4, XMM5) 1456 1457 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1458 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1459 SSE_INLINE_END_2; 1460 pv += 16; 1461 } 1462 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1463 } 1464 row = (unsigned int)(*bjtmp++); 1465 /* row = (*bjtmp++)/4; */ 1466 /* bjtmp++; */ 1467 } 1468 /* finished row so stick it into b->a */ 1469 pv = ba + 16 * bi[i]; 1470 pj = bj + bi[i]; 1471 nz = bi[i + 1] - bi[i]; 1472 1473 /* Copy x block back into pv block */ 1474 for (j = 0; j < nz; j++) { 1475 x = rtmp + 16 * ((unsigned int)pj[j]); 1476 /* x = rtmp+4*pj[j]; */ 1477 1478 SSE_INLINE_BEGIN_2(x, pv) 1479 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1480 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1481 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1482 1483 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1484 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1485 1486 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1487 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1488 1489 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1490 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1491 1492 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1493 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1494 1495 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1496 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1497 1498 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1499 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1500 1501 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1502 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1503 SSE_INLINE_END_2; 1504 pv += 16; 1505 } 1506 /* invert diagonal block */ 1507 w = ba + 16 * diag_offset[i]; 1508 if (pivotinblocks) { 1509 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1510 if (zeropivotdetected) C->factorerrortype = 
MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
  /* Flop count from inverting the diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info) {
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int i, j, n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j), *bjtmp, *pj;
  unsigned int row;
  int *ajtmpold, nz, *bi = b->i;
  int *diag_offset = b->diag, *ai = a->i, *aj = a->j;
  MatScalar *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar *ba = b->a, *aa = a->a;
  int nonzero = 0;
  PetscBool pivotinblocks = b->pivotinblocks;
  PetscReal shift = info->shiftamount;
  PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.
SSE will not work."); 1551 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1552 /* colscale = 4; */ 1553 /* } */ 1554 if ((unsigned long)bj == (unsigned long)aj) { return (MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); } 1555 1556 for (i = 0; i < n; i++) { 1557 nz = bi[i + 1] - bi[i]; 1558 bjtmp = bj + bi[i]; 1559 /* zero out the 4x4 block accumulators */ 1560 /* zero out one register */ 1561 XOR_PS(XMM7, XMM7); 1562 for (j = 0; j < nz; j++) { 1563 x = rtmp + 16 * ((unsigned int)bjtmp[j]); 1564 /* x = rtmp+4*bjtmp[j]; */ 1565 SSE_INLINE_BEGIN_1(x) 1566 /* Copy zero register to memory locations */ 1567 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1568 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 1569 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 1570 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 1571 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 1572 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 1573 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 1574 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1575 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 1576 SSE_INLINE_END_1; 1577 } 1578 /* load in initial (unfactored row) */ 1579 nz = ai[i + 1] - ai[i]; 1580 ajtmpold = aj + ai[i]; 1581 v = aa + 16 * ai[i]; 1582 for (j = 0; j < nz; j++) { 1583 x = rtmp + 16 * ajtmpold[j]; 1584 /* x = rtmp+colscale*ajtmpold[j]; */ 1585 /* Copy v block into x block */ 1586 SSE_INLINE_BEGIN_2(v, x) 1587 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1588 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1589 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 1590 1591 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 1592 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 1593 1594 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 1595 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 1596 1597 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 1598 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 1599 1600 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 1601 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 1602 1603 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 1604 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1605 1606 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1607 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1608 1609 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1610 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1611 SSE_INLINE_END_2; 1612 1613 v += 16; 1614 } 1615 /* row = (*bjtmp++)/4; */ 1616 row = (unsigned int)(*bjtmp++); 1617 while (row < i) { 1618 pc = rtmp + 16 * row; 1619 SSE_INLINE_BEGIN_1(pc) 1620 /* Load block from lower triangle */ 1621 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1622 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1623 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1624 1625 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1626 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1627 1628 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1629 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1630 1631 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1632 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1633 1634 /* Compare block to zero block */ 1635 1636 SSE_COPY_PS(XMM4, XMM7) 1637 SSE_CMPNEQ_PS(XMM4, XMM0) 1638 1639 SSE_COPY_PS(XMM5, XMM7) 1640 SSE_CMPNEQ_PS(XMM5, XMM1) 1641 1642 SSE_COPY_PS(XMM6, XMM7) 1643 SSE_CMPNEQ_PS(XMM6, XMM2) 1644 1645 SSE_CMPNEQ_PS(XMM7, XMM3) 1646 1647 /* Reduce the comparisons to one SSE register */ 1648 SSE_OR_PS(XMM6, XMM7) 1649 SSE_OR_PS(XMM5, XMM4) 1650 SSE_OR_PS(XMM5, XMM6) 1651 SSE_INLINE_END_1; 1652 1653 /* Reduce the one SSE register to an integer register for branching */ 1654 /* Note: Since nonzero is an int, there is no INLINE block version of this call 
*/ 1655 MOVEMASK(nonzero, XMM5); 1656 1657 /* If block is nonzero ... */ 1658 if (nonzero) { 1659 pv = ba + 16 * diag_offset[row]; 1660 PREFETCH_L1(&pv[16]); 1661 pj = bj + diag_offset[row] + 1; 1662 1663 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1664 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1665 /* but the diagonal was inverted already */ 1666 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1667 1668 SSE_INLINE_BEGIN_2(pv, pc) 1669 /* Column 0, product is accumulated in XMM4 */ 1670 SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1671 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1672 SSE_MULT_PS(XMM4, XMM0) 1673 1674 SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1675 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1676 SSE_MULT_PS(XMM5, XMM1) 1677 SSE_ADD_PS(XMM4, XMM5) 1678 1679 SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1680 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1681 SSE_MULT_PS(XMM6, XMM2) 1682 SSE_ADD_PS(XMM4, XMM6) 1683 1684 SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1685 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1686 SSE_MULT_PS(XMM7, XMM3) 1687 SSE_ADD_PS(XMM4, XMM7) 1688 1689 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1690 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1691 1692 /* Column 1, product is accumulated in XMM5 */ 1693 SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1694 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1695 SSE_MULT_PS(XMM5, XMM0) 1696 1697 SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1698 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1699 SSE_MULT_PS(XMM6, XMM1) 1700 SSE_ADD_PS(XMM5, XMM6) 1701 1702 SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1703 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1704 SSE_MULT_PS(XMM7, XMM2) 1705 SSE_ADD_PS(XMM5, XMM7) 1706 1707 SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1708 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1709 SSE_MULT_PS(XMM6, XMM3) 1710 SSE_ADD_PS(XMM5, XMM6) 1711 1712 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1713 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1714 1715 SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1716 1717 /* Column 2, product is accumulated in XMM6 */ 1718 SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 1719 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1720 SSE_MULT_PS(XMM6, XMM0) 1721 1722 SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1723 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1724 SSE_MULT_PS(XMM7, XMM1) 1725 SSE_ADD_PS(XMM6, XMM7) 1726 1727 SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1728 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1729 SSE_MULT_PS(XMM7, XMM2) 1730 SSE_ADD_PS(XMM6, XMM7) 1731 1732 SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1733 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1734 SSE_MULT_PS(XMM7, XMM3) 1735 SSE_ADD_PS(XMM6, XMM7) 1736 1737 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1738 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1739 1740 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1741 /* Column 3, product is accumulated in XMM0 */ 1742 SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1743 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1744 SSE_MULT_PS(XMM0, XMM7) 1745 1746 SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1747 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1748 SSE_MULT_PS(XMM1, XMM7) 1749 SSE_ADD_PS(XMM0, XMM1) 1750 1751 SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1752 SSE_SHUFFLE(XMM1, XMM1, 0x00) 1753 SSE_MULT_PS(XMM1, XMM2) 1754 SSE_ADD_PS(XMM0, XMM1) 1755 1756 SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1757 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1758 SSE_MULT_PS(XMM3, XMM7) 1759 SSE_ADD_PS(XMM0, XMM3) 1760 1761 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1762 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1763 1764 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1765 /* This is code to be maintained and read by humans after all. 
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3, XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2, XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1, XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0, XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16 * ((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x, pv)
          /* Load First Column of X */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM0)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM1)
          SSE_SUB_PS(XMM5, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM2)
          SSE_SUB_PS(XMM5, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM3)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM0)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM1)
          SSE_SUB_PS(XMM6, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM2)
          SSE_SUB_PS(XMM6, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM3)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];

    /* Copy x block back into pv block */
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x, pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
    /* Note: Using Cramer's rule, the flop count below may be unfairly high or low */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
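/*
  Context (illustrative sketch, not part of the original file): applications never call the
  numeric kernel above directly; it is reached through the generic PETSc factorization
  interface once a SeqBAIJ matrix with block size 4 has gone through the symbolic phase.
  The sketch below shows that call sequence under the assumption of a preassembled MATSEQBAIJ
  matrix A with bs = 4. The function name FactorExample is hypothetical; MatGetOrdering,
  MatGetFactor, MatLUFactorSymbolic, and MatLUFactorNumeric are the standard public entry points.
*/
#include <petscmat.h>

static PetscErrorCode FactorExample(Mat A)
{
  Mat           F;
  IS            row, col;
  MatFactorInfo info;

  PetscFunctionBegin;
  /* A natural ordering keeps the block-size-4 natural-ordering kernels eligible */
  PetscCall(MatGetOrdering(A, MATORDERINGNATURAL, &row, &col));
  PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &F));
  PetscCall(MatFactorInfoInitialize(&info));
  PetscCall(MatLUFactorSymbolic(F, A, row, col, &info));
  /* Dispatches to a block-size-specific numeric routine such as the one above */
  PetscCall(MatLUFactorNumeric(F, A, &info));
  PetscCall(ISDestroy(&row));
  PetscCall(ISDestroy(&col));
  PetscCall(MatDestroy(&F));
  PetscFunctionReturn(0);
}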