1 /* 2 Factorization code for BAIJ format. 3 */ 4 #include <../src/mat/impls/baij/seq/baij.h> 5 #include <petsc/private/kernels/blockinvert.h> 6 7 /* 8 Version for when blocks are 4 by 4 9 */ 10 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C, Mat A, const MatFactorInfo *info) 11 { 12 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 13 IS isrow = b->row, isicol = b->icol; 14 const PetscInt *r, *ic; 15 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 16 PetscInt *ajtmpold, *ajtmp, nz, row; 17 PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 18 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 19 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 20 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 21 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 22 MatScalar m13, m14, m15, m16; 23 MatScalar *ba = b->a, *aa = a->a; 24 PetscBool pivotinblocks = b->pivotinblocks; 25 PetscReal shift = info->shiftamount; 26 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 27 28 PetscFunctionBegin; 29 PetscCall(ISGetIndices(isrow, &r)); 30 PetscCall(ISGetIndices(isicol, &ic)); 31 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 32 allowzeropivot = PetscNot(A->erroriffailure); 33 34 for (i = 0; i < n; i++) { 35 nz = bi[i + 1] - bi[i]; 36 ajtmp = bj + bi[i]; 37 for (j = 0; j < nz; j++) { 38 x = rtmp + 16 * ajtmp[j]; 39 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 40 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 41 } 42 /* load in initial (unfactored row) */ 43 idx = r[i]; 44 nz = ai[idx + 1] - ai[idx]; 45 ajtmpold = aj + ai[idx]; 46 v = aa + 16 * ai[idx]; 47 for (j = 0; j < nz; j++) { 48 x = rtmp + 16 * ic[ajtmpold[j]]; 49 x[0] = v[0]; 50 x[1] = v[1]; 51 x[2] = v[2]; 52 x[3] = v[3]; 53 x[4] = v[4]; 54 x[5] = v[5]; 55 x[6] = v[6]; 56 x[7] = v[7]; 57 x[8] = v[8]; 58 x[9] = v[9]; 59 x[10] = v[10]; 60 x[11] = v[11]; 61 x[12] = v[12]; 62 x[13] = v[13]; 63 x[14] = v[14]; 64 x[15] = v[15]; 65 v += 16; 66 } 67 row = *ajtmp++; 68 while (row < i) { 69 pc = rtmp + 16 * row; 70 p1 = pc[0]; 71 p2 = pc[1]; 72 p3 = pc[2]; 73 p4 = pc[3]; 74 p5 = pc[4]; 75 p6 = pc[5]; 76 p7 = pc[6]; 77 p8 = pc[7]; 78 p9 = pc[8]; 79 p10 = pc[9]; 80 p11 = pc[10]; 81 p12 = pc[11]; 82 p13 = pc[12]; 83 p14 = pc[13]; 84 p15 = pc[14]; 85 p16 = pc[15]; 86 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 87 pv = ba + 16 * diag_offset[row]; 88 pj = bj + diag_offset[row] + 1; 89 x1 = pv[0]; 90 x2 = pv[1]; 91 x3 = pv[2]; 92 x4 = pv[3]; 93 x5 = pv[4]; 94 x6 = pv[5]; 95 x7 = pv[6]; 96 x8 = pv[7]; 97 x9 = pv[8]; 98 x10 = pv[9]; 99 x11 = pv[10]; 100 x12 = pv[11]; 101 x13 = pv[12]; 102 x14 = pv[13]; 103 x15 = pv[14]; 104 x16 = pv[15]; 105 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 106 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 107 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 108 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 109 110 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 111 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 112 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 113 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 114 115 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 116 pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 117 pc[10] = m11 = p3 * x9 
+ p7 * x10 + p11 * x11 + p15 * x12; 118 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 119 120 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 121 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 122 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 123 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 124 125 nz = bi[row + 1] - diag_offset[row] - 1; 126 pv += 16; 127 for (j = 0; j < nz; j++) { 128 x1 = pv[0]; 129 x2 = pv[1]; 130 x3 = pv[2]; 131 x4 = pv[3]; 132 x5 = pv[4]; 133 x6 = pv[5]; 134 x7 = pv[6]; 135 x8 = pv[7]; 136 x9 = pv[8]; 137 x10 = pv[9]; 138 x11 = pv[10]; 139 x12 = pv[11]; 140 x13 = pv[12]; 141 x14 = pv[13]; 142 x15 = pv[14]; 143 x16 = pv[15]; 144 x = rtmp + 16 * pj[j]; 145 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 146 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 147 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 148 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 149 150 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 151 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 152 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 153 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 154 155 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 156 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 157 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 158 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 159 160 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 161 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 162 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 163 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 164 165 pv += 16; 166 } 167 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 168 } 169 row = *ajtmp++; 170 } 171 /* finished row so stick it into b->a */ 172 pv = ba + 16 * bi[i]; 173 pj = bj + bi[i]; 174 nz = bi[i + 1] - bi[i]; 175 for (j = 0; j < nz; j++) { 176 x = rtmp + 16 * pj[j]; 177 pv[0] = x[0]; 178 pv[1] = x[1]; 179 pv[2] = x[2]; 180 pv[3] = x[3]; 181 pv[4] = x[4]; 182 pv[5] = x[5]; 183 pv[6] = x[6]; 184 pv[7] = x[7]; 185 pv[8] = x[8]; 186 pv[9] = x[9]; 187 pv[10] = x[10]; 188 pv[11] = x[11]; 189 pv[12] = x[12]; 190 pv[13] = x[13]; 191 pv[14] = x[14]; 192 pv[15] = x[15]; 193 pv += 16; 194 } 195 /* invert diagonal block */ 196 w = ba + 16 * diag_offset[i]; 197 if (pivotinblocks) { 198 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 199 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 200 } else { 201 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 202 } 203 } 204 205 PetscCall(PetscFree(rtmp)); 206 PetscCall(ISRestoreIndices(isicol, &ic)); 207 PetscCall(ISRestoreIndices(isrow, &r)); 208 209 C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 210 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 211 C->assembled = PETSC_TRUE; 212 213 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 214 PetscFunctionReturn(PETSC_SUCCESS); 215 } 216 217 /* MatLUFactorNumeric_SeqBAIJ_4 - 218 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 219 PetscKernel_A_gets_A_times_B() 220 PetscKernel_A_gets_A_minus_B_times_C() 221 PetscKernel_A_gets_inverse_A() 222 */ 223 224 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B, Mat A, const MatFactorInfo *info) 225 { 226 Mat C = B; 227 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 228 IS isrow = b->row, isicol = b->icol; 229 const PetscInt *r, *ic; 230 PetscInt i, j, k, nz, nzL, row; 
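  /*
     For reference (a minimal sketch, not the actual kernels from petsc/private/kernels/blockinvert.h):
     the PetscKernel_*_4() calls used in the elimination loop below act on dense 4x4 blocks stored
     column oriented, i.e. entry (r,c) of a block lives at index c*4+r and bs2 = 16.  In plain C the
     two update kernels amount to

       PetscKernel_A_gets_A_times_B_4(pc, pv, mwork):   pc = pc * pv   (mwork holds a copy of the old pc)
         for (c = 0; c < 4; c++)
           for (r = 0; r < 4; r++)
             pc[c*4+r] = mwork[0*4+r]*pv[c*4+0] + mwork[1*4+r]*pv[c*4+1]
                       + mwork[2*4+r]*pv[c*4+2] + mwork[3*4+r]*pv[c*4+3];

       PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv):   v = v - pc * pv
         for (c = 0; c < 4; c++)
           for (r = 0; r < 4; r++)
             v[c*4+r] -= pc[0*4+r]*pv[c*4+0] + pc[1*4+r]*pv[c*4+1]
                       + pc[2*4+r]*pv[c*4+2] + pc[3*4+r]*pv[c*4+3];

     This is the same pattern that is written out by hand (p1..p16, x1..x16, m1..m16) in the
     *_inplace versions above.
  */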
  const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
  const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
  MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
  PetscInt        flg;
  PetscReal       shift;
  PetscBool       allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  PetscCall(ISGetIndices(isrow, &r));
  PetscCall(ISGetIndices(isicol, &ic));

  if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  for (i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* load in initial (unfactored row) */
    nz    = ai[r[i] + 1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2 * ai[r[i]];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (k = 0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2 * row;
      for (flg = 0, j = 0; j < bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
          pv += bs2;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
  }

  PetscCall(PetscFree2(rtmp, mwork));
  PetscCall(ISRestoreIndices(isicol, &ic));
  PetscCall(ISRestoreIndices(isrow, &r));

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;
331 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */ 332 PetscFunctionReturn(PETSC_SUCCESS); 333 } 334 335 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 336 { 337 /* 338 Default Version for when blocks are 4 by 4 Using natural ordering 339 */ 340 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 341 PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 342 PetscInt *ajtmpold, *ajtmp, nz, row; 343 PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 344 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 345 MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4; 346 MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, x16; 347 MatScalar p10, p11, p12, p13, p14, p15, p16, m10, m11, m12; 348 MatScalar m13, m14, m15, m16; 349 MatScalar *ba = b->a, *aa = a->a; 350 PetscBool pivotinblocks = b->pivotinblocks; 351 PetscReal shift = info->shiftamount; 352 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 353 354 PetscFunctionBegin; 355 allowzeropivot = PetscNot(A->erroriffailure); 356 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 357 358 for (i = 0; i < n; i++) { 359 nz = bi[i + 1] - bi[i]; 360 ajtmp = bj + bi[i]; 361 for (j = 0; j < nz; j++) { 362 x = rtmp + 16 * ajtmp[j]; 363 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 364 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 365 } 366 /* load in initial (unfactored row) */ 367 nz = ai[i + 1] - ai[i]; 368 ajtmpold = aj + ai[i]; 369 v = aa + 16 * ai[i]; 370 for (j = 0; j < nz; j++) { 371 x = rtmp + 16 * ajtmpold[j]; 372 x[0] = v[0]; 373 x[1] = v[1]; 374 x[2] = v[2]; 375 x[3] = v[3]; 376 x[4] = v[4]; 377 x[5] = v[5]; 378 x[6] = v[6]; 379 x[7] = v[7]; 380 x[8] = v[8]; 381 x[9] = v[9]; 382 x[10] = v[10]; 383 x[11] = v[11]; 384 x[12] = v[12]; 385 x[13] = v[13]; 386 x[14] = v[14]; 387 x[15] = v[15]; 388 v += 16; 389 } 390 row = *ajtmp++; 391 while (row < i) { 392 pc = rtmp + 16 * row; 393 p1 = pc[0]; 394 p2 = pc[1]; 395 p3 = pc[2]; 396 p4 = pc[3]; 397 p5 = pc[4]; 398 p6 = pc[5]; 399 p7 = pc[6]; 400 p8 = pc[7]; 401 p9 = pc[8]; 402 p10 = pc[9]; 403 p11 = pc[10]; 404 p12 = pc[11]; 405 p13 = pc[12]; 406 p14 = pc[13]; 407 p15 = pc[14]; 408 p16 = pc[15]; 409 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 || p16 != 0.0) { 410 pv = ba + 16 * diag_offset[row]; 411 pj = bj + diag_offset[row] + 1; 412 x1 = pv[0]; 413 x2 = pv[1]; 414 x3 = pv[2]; 415 x4 = pv[3]; 416 x5 = pv[4]; 417 x6 = pv[5]; 418 x7 = pv[6]; 419 x8 = pv[7]; 420 x9 = pv[8]; 421 x10 = pv[9]; 422 x11 = pv[10]; 423 x12 = pv[11]; 424 x13 = pv[12]; 425 x14 = pv[13]; 426 x15 = pv[14]; 427 x16 = pv[15]; 428 pc[0] = m1 = p1 * x1 + p5 * x2 + p9 * x3 + p13 * x4; 429 pc[1] = m2 = p2 * x1 + p6 * x2 + p10 * x3 + p14 * x4; 430 pc[2] = m3 = p3 * x1 + p7 * x2 + p11 * x3 + p15 * x4; 431 pc[3] = m4 = p4 * x1 + p8 * x2 + p12 * x3 + p16 * x4; 432 433 pc[4] = m5 = p1 * x5 + p5 * x6 + p9 * x7 + p13 * x8; 434 pc[5] = m6 = p2 * x5 + p6 * x6 + p10 * x7 + p14 * x8; 435 pc[6] = m7 = p3 * x5 + p7 * x6 + p11 * x7 + p15 * x8; 436 pc[7] = m8 = p4 * x5 + p8 * x6 + p12 * x7 + p16 * x8; 437 438 pc[8] = m9 = p1 * x9 + p5 * x10 + p9 * x11 + p13 * x12; 439 pc[9] = m10 = p2 * x9 + p6 * x10 + p10 * x11 + p14 * x12; 440 pc[10] = m11 = p3 * x9 + p7 * x10 + p11 * x11 + p15 * x12; 
441 pc[11] = m12 = p4 * x9 + p8 * x10 + p12 * x11 + p16 * x12; 442 443 pc[12] = m13 = p1 * x13 + p5 * x14 + p9 * x15 + p13 * x16; 444 pc[13] = m14 = p2 * x13 + p6 * x14 + p10 * x15 + p14 * x16; 445 pc[14] = m15 = p3 * x13 + p7 * x14 + p11 * x15 + p15 * x16; 446 pc[15] = m16 = p4 * x13 + p8 * x14 + p12 * x15 + p16 * x16; 447 nz = bi[row + 1] - diag_offset[row] - 1; 448 pv += 16; 449 for (j = 0; j < nz; j++) { 450 x1 = pv[0]; 451 x2 = pv[1]; 452 x3 = pv[2]; 453 x4 = pv[3]; 454 x5 = pv[4]; 455 x6 = pv[5]; 456 x7 = pv[6]; 457 x8 = pv[7]; 458 x9 = pv[8]; 459 x10 = pv[9]; 460 x11 = pv[10]; 461 x12 = pv[11]; 462 x13 = pv[12]; 463 x14 = pv[13]; 464 x15 = pv[14]; 465 x16 = pv[15]; 466 x = rtmp + 16 * pj[j]; 467 x[0] -= m1 * x1 + m5 * x2 + m9 * x3 + m13 * x4; 468 x[1] -= m2 * x1 + m6 * x2 + m10 * x3 + m14 * x4; 469 x[2] -= m3 * x1 + m7 * x2 + m11 * x3 + m15 * x4; 470 x[3] -= m4 * x1 + m8 * x2 + m12 * x3 + m16 * x4; 471 472 x[4] -= m1 * x5 + m5 * x6 + m9 * x7 + m13 * x8; 473 x[5] -= m2 * x5 + m6 * x6 + m10 * x7 + m14 * x8; 474 x[6] -= m3 * x5 + m7 * x6 + m11 * x7 + m15 * x8; 475 x[7] -= m4 * x5 + m8 * x6 + m12 * x7 + m16 * x8; 476 477 x[8] -= m1 * x9 + m5 * x10 + m9 * x11 + m13 * x12; 478 x[9] -= m2 * x9 + m6 * x10 + m10 * x11 + m14 * x12; 479 x[10] -= m3 * x9 + m7 * x10 + m11 * x11 + m15 * x12; 480 x[11] -= m4 * x9 + m8 * x10 + m12 * x11 + m16 * x12; 481 482 x[12] -= m1 * x13 + m5 * x14 + m9 * x15 + m13 * x16; 483 x[13] -= m2 * x13 + m6 * x14 + m10 * x15 + m14 * x16; 484 x[14] -= m3 * x13 + m7 * x14 + m11 * x15 + m15 * x16; 485 x[15] -= m4 * x13 + m8 * x14 + m12 * x15 + m16 * x16; 486 487 pv += 16; 488 } 489 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 490 } 491 row = *ajtmp++; 492 } 493 /* finished row so stick it into b->a */ 494 pv = ba + 16 * bi[i]; 495 pj = bj + bi[i]; 496 nz = bi[i + 1] - bi[i]; 497 for (j = 0; j < nz; j++) { 498 x = rtmp + 16 * pj[j]; 499 pv[0] = x[0]; 500 pv[1] = x[1]; 501 pv[2] = x[2]; 502 pv[3] = x[3]; 503 pv[4] = x[4]; 504 pv[5] = x[5]; 505 pv[6] = x[6]; 506 pv[7] = x[7]; 507 pv[8] = x[8]; 508 pv[9] = x[9]; 509 pv[10] = x[10]; 510 pv[11] = x[11]; 511 pv[12] = x[12]; 512 pv[13] = x[13]; 513 pv[14] = x[14]; 514 pv[15] = x[15]; 515 pv += 16; 516 } 517 /* invert diagonal block */ 518 w = ba + 16 * diag_offset[i]; 519 if (pivotinblocks) { 520 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 521 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 522 } else { 523 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 524 } 525 } 526 527 PetscCall(PetscFree(rtmp)); 528 529 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 530 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 531 C->assembled = PETSC_TRUE; 532 533 PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs)); /* from inverting diagonal blocks */ 534 PetscFunctionReturn(PETSC_SUCCESS); 535 } 536 537 /* 538 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 539 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 540 */ 541 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 542 { 543 Mat C = B; 544 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 545 PetscInt i, j, k, nz, nzL, row; 546 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 547 const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2; 548 MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a; 549 PetscInt flg; 550 PetscReal shift; 551 PetscBool 
allowzeropivot, zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);

  /* generate work space needed by the factorization */
  PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
  PetscCall(PetscArrayzero(rtmp, bs2 * n));

  if (info->shifttype == (PetscReal)MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i = 0; i < n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* U part */
    nz    = bdiag[i] - bdiag[i + 1];
    bjtmp = bj + bdiag[i + 1] + 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));

    /* load in initial (unfactored row) */
    nz    = ai[i + 1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2 * ai[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i + 1] - bi[i];
    for (k = 0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2 * row;
      for (flg = 0, j = 0; j < bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2 * bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        PetscCall(PetscKernel_A_gets_A_times_B_4(pc, pv, mwork));

        pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
        pv = b->a + bs2 * (bdiag[row + 1] + 1);
        nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j = 0; j < nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2 * pj[j];
          PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v, pc, pv));
          pv += bs2;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2 * bi[i];
    pj = b->j + bi[i];
    nz = bi[i + 1] - bi[i];
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv = b->a + bs2 * bdiag[i];
    pj = b->j + bdiag[i];
    PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
    PetscCall(PetscKernel_A_gets_inverse_A_4(pv, shift, allowzeropivot, &zeropivotdetected));
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2 * (bdiag[i + 1] + 1);
    pj = b->j + bdiag[i + 1] + 1;
    nz = bdiag[i] - bdiag[i + 1] - 1;
    for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
  }
  PetscCall(PetscFree2(rtmp, mwork));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * n)); /* from inverting diagonal blocks */
  PetscFunctionReturn(PETSC_SUCCESS);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE version for when blocks are 4 by 4, using natural ordering */
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B, Mat A, const MatFactorInfo *info)
{
  Mat          C = B;
  Mat_SeqBAIJ
*a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 653 int i, j, n = a->mbs; 654 int *bj = b->j, *bjtmp, *pj; 655 int row; 656 int *ajtmpold, nz, *bi = b->i; 657 int *diag_offset = b->diag, *ai = a->i, *aj = a->j; 658 MatScalar *pv, *v, *rtmp, *pc, *w, *x; 659 MatScalar *ba = b->a, *aa = a->a; 660 int nonzero = 0; 661 PetscBool pivotinblocks = b->pivotinblocks; 662 PetscReal shift = info->shiftamount; 663 PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 664 665 PetscFunctionBegin; 666 allowzeropivot = PetscNot(A->erroriffailure); 667 SSE_SCOPE_BEGIN; 668 669 PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work."); 670 PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work."); 671 PetscCall(PetscMalloc1(16 * (n + 1), &rtmp)); 672 PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work."); 673 /* if ((unsigned long)bj==(unsigned long)aj) { */ 674 /* colscale = 4; */ 675 /* } */ 676 for (i = 0; i < n; i++) { 677 nz = bi[i + 1] - bi[i]; 678 bjtmp = bj + bi[i]; 679 /* zero out the 4x4 block accumulators */ 680 /* zero out one register */ 681 XOR_PS(XMM7, XMM7); 682 for (j = 0; j < nz; j++) { 683 x = rtmp + 16 * bjtmp[j]; 684 /* x = rtmp+4*bjtmp[j]; */ 685 SSE_INLINE_BEGIN_1(x) 686 /* Copy zero register to memory locations */ 687 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 688 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 689 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 690 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 691 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 692 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 693 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 694 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 695 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 696 SSE_INLINE_END_1; 697 } 698 /* load in initial (unfactored row) */ 699 nz = ai[i + 1] - ai[i]; 700 ajtmpold = aj + ai[i]; 701 v = aa + 16 * ai[i]; 702 for (j = 0; j < nz; j++) { 703 x = rtmp + 16 * ajtmpold[j]; 704 /* x = rtmp+colscale*ajtmpold[j]; */ 705 /* Copy v block into x block */ 706 SSE_INLINE_BEGIN_2(v, x) 707 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 708 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 709 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 710 711 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 712 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 713 714 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 715 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 716 717 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 718 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 719 720 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 721 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 722 723 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 724 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 725 726 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 727 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 728 729 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 730 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 731 SSE_INLINE_END_2; 732 733 v += 16; 734 } 735 /* row = (*bjtmp++)/4; */ 736 row = *bjtmp++; 737 while (row < i) { 738 pc = rtmp + 16 * row; 739 SSE_INLINE_BEGIN_1(pc) 740 /* Load block from lower triangle */ 741 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 742 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 743 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 744 745 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 746 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, 
XMM1) 747 748 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 749 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 750 751 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 752 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 753 754 /* Compare block to zero block */ 755 756 SSE_COPY_PS(XMM4, XMM7) 757 SSE_CMPNEQ_PS(XMM4, XMM0) 758 759 SSE_COPY_PS(XMM5, XMM7) 760 SSE_CMPNEQ_PS(XMM5, XMM1) 761 762 SSE_COPY_PS(XMM6, XMM7) 763 SSE_CMPNEQ_PS(XMM6, XMM2) 764 765 SSE_CMPNEQ_PS(XMM7, XMM3) 766 767 /* Reduce the comparisons to one SSE register */ 768 SSE_OR_PS(XMM6, XMM7) 769 SSE_OR_PS(XMM5, XMM4) 770 SSE_OR_PS(XMM5, XMM6) 771 SSE_INLINE_END_1; 772 773 /* Reduce the one SSE register to an integer register for branching */ 774 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 775 MOVEMASK(nonzero, XMM5); 776 777 /* If block is nonzero ... */ 778 if (nonzero) { 779 pv = ba + 16 * diag_offset[row]; 780 PREFETCH_L1(&pv[16]); 781 pj = bj + diag_offset[row] + 1; 782 783 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 784 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 785 /* but the diagonal was inverted already */ 786 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 787 788 SSE_INLINE_BEGIN_2(pv, pc) 789 /* Column 0, product is accumulated in XMM4 */ 790 SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 791 SSE_SHUFFLE(XMM4, XMM4, 0x00) 792 SSE_MULT_PS(XMM4, XMM0) 793 794 SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 795 SSE_SHUFFLE(XMM5, XMM5, 0x00) 796 SSE_MULT_PS(XMM5, XMM1) 797 SSE_ADD_PS(XMM4, XMM5) 798 799 SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 800 SSE_SHUFFLE(XMM6, XMM6, 0x00) 801 SSE_MULT_PS(XMM6, XMM2) 802 SSE_ADD_PS(XMM4, XMM6) 803 804 SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 805 SSE_SHUFFLE(XMM7, XMM7, 0x00) 806 SSE_MULT_PS(XMM7, XMM3) 807 SSE_ADD_PS(XMM4, XMM7) 808 809 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 810 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 811 812 /* Column 1, product is accumulated in XMM5 */ 813 SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 814 SSE_SHUFFLE(XMM5, XMM5, 0x00) 815 SSE_MULT_PS(XMM5, XMM0) 816 817 SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 818 SSE_SHUFFLE(XMM6, XMM6, 0x00) 819 SSE_MULT_PS(XMM6, XMM1) 820 SSE_ADD_PS(XMM5, XMM6) 821 822 SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 823 SSE_SHUFFLE(XMM7, XMM7, 0x00) 824 SSE_MULT_PS(XMM7, XMM2) 825 SSE_ADD_PS(XMM5, XMM7) 826 827 SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 828 SSE_SHUFFLE(XMM6, XMM6, 0x00) 829 SSE_MULT_PS(XMM6, XMM3) 830 SSE_ADD_PS(XMM5, XMM6) 831 832 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 833 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 834 835 SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 836 837 /* Column 2, product is accumulated in XMM6 */ 838 SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 839 SSE_SHUFFLE(XMM6, XMM6, 0x00) 840 SSE_MULT_PS(XMM6, XMM0) 841 842 SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 843 SSE_SHUFFLE(XMM7, XMM7, 0x00) 844 SSE_MULT_PS(XMM7, XMM1) 845 SSE_ADD_PS(XMM6, XMM7) 846 847 SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 848 SSE_SHUFFLE(XMM7, XMM7, 0x00) 849 SSE_MULT_PS(XMM7, XMM2) 850 SSE_ADD_PS(XMM6, XMM7) 851 852 SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 853 SSE_SHUFFLE(XMM7, XMM7, 0x00) 854 SSE_MULT_PS(XMM7, XMM3) 855 SSE_ADD_PS(XMM6, XMM7) 856 857 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 858 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 859 860 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 861 /* Column 3, product is accumulated in XMM0 */ 862 SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 863 SSE_SHUFFLE(XMM7, XMM7, 0x00) 864 SSE_MULT_PS(XMM0, XMM7) 865 866 SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 867 
SSE_SHUFFLE(XMM7, XMM7, 0x00) 868 SSE_MULT_PS(XMM1, XMM7) 869 SSE_ADD_PS(XMM0, XMM1) 870 871 SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 872 SSE_SHUFFLE(XMM1, XMM1, 0x00) 873 SSE_MULT_PS(XMM1, XMM2) 874 SSE_ADD_PS(XMM0, XMM1) 875 876 SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 877 SSE_SHUFFLE(XMM7, XMM7, 0x00) 878 SSE_MULT_PS(XMM3, XMM7) 879 SSE_ADD_PS(XMM0, XMM3) 880 881 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 882 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 883 884 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 885 /* This is code to be maintained and read by humans after all. */ 886 /* Copy Multiplier Col 3 into XMM3 */ 887 SSE_COPY_PS(XMM3, XMM0) 888 /* Copy Multiplier Col 2 into XMM2 */ 889 SSE_COPY_PS(XMM2, XMM6) 890 /* Copy Multiplier Col 1 into XMM1 */ 891 SSE_COPY_PS(XMM1, XMM5) 892 /* Copy Multiplier Col 0 into XMM0 */ 893 SSE_COPY_PS(XMM0, XMM4) 894 SSE_INLINE_END_2; 895 896 /* Update the row: */ 897 nz = bi[row + 1] - diag_offset[row] - 1; 898 pv += 16; 899 for (j = 0; j < nz; j++) { 900 PREFETCH_L1(&pv[16]); 901 x = rtmp + 16 * pj[j]; 902 /* x = rtmp + 4*pj[j]; */ 903 904 /* X:=X-M*PV, One column at a time */ 905 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 906 SSE_INLINE_BEGIN_2(x, pv) 907 /* Load First Column of X*/ 908 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 909 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 910 911 /* Matrix-Vector Product: */ 912 SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 913 SSE_SHUFFLE(XMM5, XMM5, 0x00) 914 SSE_MULT_PS(XMM5, XMM0) 915 SSE_SUB_PS(XMM4, XMM5) 916 917 SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 918 SSE_SHUFFLE(XMM6, XMM6, 0x00) 919 SSE_MULT_PS(XMM6, XMM1) 920 SSE_SUB_PS(XMM4, XMM6) 921 922 SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 923 SSE_SHUFFLE(XMM7, XMM7, 0x00) 924 SSE_MULT_PS(XMM7, XMM2) 925 SSE_SUB_PS(XMM4, XMM7) 926 927 SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 928 SSE_SHUFFLE(XMM5, XMM5, 0x00) 929 SSE_MULT_PS(XMM5, XMM3) 930 SSE_SUB_PS(XMM4, XMM5) 931 932 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 933 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 934 935 /* Second Column */ 936 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 937 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 938 939 /* Matrix-Vector Product: */ 940 SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 941 SSE_SHUFFLE(XMM6, XMM6, 0x00) 942 SSE_MULT_PS(XMM6, XMM0) 943 SSE_SUB_PS(XMM5, XMM6) 944 945 SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 946 SSE_SHUFFLE(XMM7, XMM7, 0x00) 947 SSE_MULT_PS(XMM7, XMM1) 948 SSE_SUB_PS(XMM5, XMM7) 949 950 SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 951 SSE_SHUFFLE(XMM4, XMM4, 0x00) 952 SSE_MULT_PS(XMM4, XMM2) 953 SSE_SUB_PS(XMM5, XMM4) 954 955 SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 956 SSE_SHUFFLE(XMM6, XMM6, 0x00) 957 SSE_MULT_PS(XMM6, XMM3) 958 SSE_SUB_PS(XMM5, XMM6) 959 960 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 961 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 962 963 SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 964 965 /* Third Column */ 966 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 967 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 968 969 /* Matrix-Vector Product: */ 970 SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 971 SSE_SHUFFLE(XMM7, XMM7, 0x00) 972 SSE_MULT_PS(XMM7, XMM0) 973 SSE_SUB_PS(XMM6, XMM7) 974 975 SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 976 SSE_SHUFFLE(XMM4, XMM4, 0x00) 977 SSE_MULT_PS(XMM4, XMM1) 978 SSE_SUB_PS(XMM6, XMM4) 979 980 SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 981 SSE_SHUFFLE(XMM5, XMM5, 0x00) 982 SSE_MULT_PS(XMM5, XMM2) 983 SSE_SUB_PS(XMM6, XMM5) 984 985 SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 986 SSE_SHUFFLE(XMM7, XMM7, 0x00) 987 SSE_MULT_PS(XMM7, XMM3) 988 SSE_SUB_PS(XMM6, XMM7) 989 990 
SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 991 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 992 993 /* Fourth Column */ 994 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 995 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 996 997 /* Matrix-Vector Product: */ 998 SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 999 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1000 SSE_MULT_PS(XMM5, XMM0) 1001 SSE_SUB_PS(XMM4, XMM5) 1002 1003 SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1004 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1005 SSE_MULT_PS(XMM6, XMM1) 1006 SSE_SUB_PS(XMM4, XMM6) 1007 1008 SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1009 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1010 SSE_MULT_PS(XMM7, XMM2) 1011 SSE_SUB_PS(XMM4, XMM7) 1012 1013 SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1014 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1015 SSE_MULT_PS(XMM5, XMM3) 1016 SSE_SUB_PS(XMM4, XMM5) 1017 1018 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1019 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1020 SSE_INLINE_END_2; 1021 pv += 16; 1022 } 1023 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1024 } 1025 row = *bjtmp++; 1026 /* row = (*bjtmp++)/4; */ 1027 } 1028 /* finished row so stick it into b->a */ 1029 pv = ba + 16 * bi[i]; 1030 pj = bj + bi[i]; 1031 nz = bi[i + 1] - bi[i]; 1032 1033 /* Copy x block back into pv block */ 1034 for (j = 0; j < nz; j++) { 1035 x = rtmp + 16 * pj[j]; 1036 /* x = rtmp+4*pj[j]; */ 1037 1038 SSE_INLINE_BEGIN_2(x, pv) 1039 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1040 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1041 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1042 1043 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1044 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1045 1046 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1047 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1048 1049 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1050 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1051 1052 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1053 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1054 1055 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1056 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1057 1058 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1059 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1060 1061 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1062 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1063 SSE_INLINE_END_2; 1064 pv += 16; 1065 } 1066 /* invert diagonal block */ 1067 w = ba + 16 * diag_offset[i]; 1068 if (pivotinblocks) { 1069 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1070 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1071 } else { 1072 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1073 } 1074 /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 1075 /* Note: Using Kramer's rule, flop count below might be infairly high or low? 
*/
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
{
  Mat             A = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int             i, j, n = a->mbs;
  unsigned short *bj = (unsigned short *)b->j, *bjtmp, *pj;
  unsigned short *aj = (unsigned short *)a->j, *ajtmp;
  unsigned int    row;
  int             nz, *bi = b->i;
  int            *diag_offset = b->diag, *ai = a->i;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar      *ba = b->a, *aa = a->a;
  int             nonzero       = 0;
  PetscBool       pivotinblocks = b->pivotinblocks;
  PetscReal       shift         = 0.0; /* no MatFactorInfo is passed to this in-place variant; assume no shift (the original text referenced an undeclared 'info') */
  PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */

  for (i = 0; i < n; i++) {
    nz    = bi[i + 1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7, XMM7);
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7)
      SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i + 1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16 * ai[i];
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)ajtmp[j]);
      /* x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v, x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4)

SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 1167 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1168 1169 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1170 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1171 1172 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1173 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1174 SSE_INLINE_END_2; 1175 1176 v += 16; 1177 } 1178 /* row = (*bjtmp++)/4; */ 1179 row = (unsigned int)(*bjtmp++); 1180 while (row < i) { 1181 pc = rtmp + 16 * row; 1182 SSE_INLINE_BEGIN_1(pc) 1183 /* Load block from lower triangle */ 1184 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1185 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1186 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1187 1188 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1189 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1190 1191 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1192 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1193 1194 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1195 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1196 1197 /* Compare block to zero block */ 1198 1199 SSE_COPY_PS(XMM4, XMM7) 1200 SSE_CMPNEQ_PS(XMM4, XMM0) 1201 1202 SSE_COPY_PS(XMM5, XMM7) 1203 SSE_CMPNEQ_PS(XMM5, XMM1) 1204 1205 SSE_COPY_PS(XMM6, XMM7) 1206 SSE_CMPNEQ_PS(XMM6, XMM2) 1207 1208 SSE_CMPNEQ_PS(XMM7, XMM3) 1209 1210 /* Reduce the comparisons to one SSE register */ 1211 SSE_OR_PS(XMM6, XMM7) 1212 SSE_OR_PS(XMM5, XMM4) 1213 SSE_OR_PS(XMM5, XMM6) 1214 SSE_INLINE_END_1; 1215 1216 /* Reduce the one SSE register to an integer register for branching */ 1217 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1218 MOVEMASK(nonzero, XMM5); 1219 1220 /* If block is nonzero ... */ 1221 if (nonzero) { 1222 pv = ba + 16 * diag_offset[row]; 1223 PREFETCH_L1(&pv[16]); 1224 pj = bj + diag_offset[row] + 1; 1225 1226 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1227 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1228 /* but the diagonal was inverted already */ 1229 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1230 1231 SSE_INLINE_BEGIN_2(pv, pc) 1232 /* Column 0, product is accumulated in XMM4 */ 1233 SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1234 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1235 SSE_MULT_PS(XMM4, XMM0) 1236 1237 SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1238 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1239 SSE_MULT_PS(XMM5, XMM1) 1240 SSE_ADD_PS(XMM4, XMM5) 1241 1242 SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1243 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1244 SSE_MULT_PS(XMM6, XMM2) 1245 SSE_ADD_PS(XMM4, XMM6) 1246 1247 SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1248 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1249 SSE_MULT_PS(XMM7, XMM3) 1250 SSE_ADD_PS(XMM4, XMM7) 1251 1252 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1253 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1254 1255 /* Column 1, product is accumulated in XMM5 */ 1256 SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1257 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1258 SSE_MULT_PS(XMM5, XMM0) 1259 1260 SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1261 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1262 SSE_MULT_PS(XMM6, XMM1) 1263 SSE_ADD_PS(XMM5, XMM6) 1264 1265 SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1266 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1267 SSE_MULT_PS(XMM7, XMM2) 1268 SSE_ADD_PS(XMM5, XMM7) 1269 1270 SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1271 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1272 SSE_MULT_PS(XMM6, XMM3) 1273 SSE_ADD_PS(XMM5, XMM6) 1274 1275 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1276 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1277 1278 SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1279 1280 /* Column 2, product is accumulated in XMM6 */ 1281 SSE_LOAD_SS(SSE_ARG_1, 
FLOAT_8, XMM6) 1282 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1283 SSE_MULT_PS(XMM6, XMM0) 1284 1285 SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1286 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1287 SSE_MULT_PS(XMM7, XMM1) 1288 SSE_ADD_PS(XMM6, XMM7) 1289 1290 SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1291 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1292 SSE_MULT_PS(XMM7, XMM2) 1293 SSE_ADD_PS(XMM6, XMM7) 1294 1295 SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1296 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1297 SSE_MULT_PS(XMM7, XMM3) 1298 SSE_ADD_PS(XMM6, XMM7) 1299 1300 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1301 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1302 1303 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1304 /* Column 3, product is accumulated in XMM0 */ 1305 SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1306 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1307 SSE_MULT_PS(XMM0, XMM7) 1308 1309 SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1310 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1311 SSE_MULT_PS(XMM1, XMM7) 1312 SSE_ADD_PS(XMM0, XMM1) 1313 1314 SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1315 SSE_SHUFFLE(XMM1, XMM1, 0x00) 1316 SSE_MULT_PS(XMM1, XMM2) 1317 SSE_ADD_PS(XMM0, XMM1) 1318 1319 SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1320 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1321 SSE_MULT_PS(XMM3, XMM7) 1322 SSE_ADD_PS(XMM0, XMM3) 1323 1324 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1325 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1326 1327 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1328 /* This is code to be maintained and read by humans after all. */ 1329 /* Copy Multiplier Col 3 into XMM3 */ 1330 SSE_COPY_PS(XMM3, XMM0) 1331 /* Copy Multiplier Col 2 into XMM2 */ 1332 SSE_COPY_PS(XMM2, XMM6) 1333 /* Copy Multiplier Col 1 into XMM1 */ 1334 SSE_COPY_PS(XMM1, XMM5) 1335 /* Copy Multiplier Col 0 into XMM0 */ 1336 SSE_COPY_PS(XMM0, XMM4) 1337 SSE_INLINE_END_2; 1338 1339 /* Update the row: */ 1340 nz = bi[row + 1] - diag_offset[row] - 1; 1341 pv += 16; 1342 for (j = 0; j < nz; j++) { 1343 PREFETCH_L1(&pv[16]); 1344 x = rtmp + 16 * ((unsigned int)pj[j]); 1345 /* x = rtmp + 4*pj[j]; */ 1346 1347 /* X:=X-M*PV, One column at a time */ 1348 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1349 SSE_INLINE_BEGIN_2(x, pv) 1350 /* Load First Column of X*/ 1351 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1352 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1353 1354 /* Matrix-Vector Product: */ 1355 SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5) 1356 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1357 SSE_MULT_PS(XMM5, XMM0) 1358 SSE_SUB_PS(XMM4, XMM5) 1359 1360 SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6) 1361 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1362 SSE_MULT_PS(XMM6, XMM1) 1363 SSE_SUB_PS(XMM4, XMM6) 1364 1365 SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7) 1366 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1367 SSE_MULT_PS(XMM7, XMM2) 1368 SSE_SUB_PS(XMM4, XMM7) 1369 1370 SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5) 1371 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1372 SSE_MULT_PS(XMM5, XMM3) 1373 SSE_SUB_PS(XMM4, XMM5) 1374 1375 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4) 1376 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4) 1377 1378 /* Second Column */ 1379 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1380 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1381 1382 /* Matrix-Vector Product: */ 1383 SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6) 1384 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1385 SSE_MULT_PS(XMM6, XMM0) 1386 SSE_SUB_PS(XMM5, XMM6) 1387 1388 SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7) 1389 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1390 SSE_MULT_PS(XMM7, XMM1) 1391 SSE_SUB_PS(XMM5, XMM7) 1392 1393 SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4) 1394 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1395 SSE_MULT_PS(XMM4, XMM2) 
1396 SSE_SUB_PS(XMM5, XMM4) 1397 1398 SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6) 1399 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1400 SSE_MULT_PS(XMM6, XMM3) 1401 SSE_SUB_PS(XMM5, XMM6) 1402 1403 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5) 1404 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5) 1405 1406 SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24) 1407 1408 /* Third Column */ 1409 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1410 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1411 1412 /* Matrix-Vector Product: */ 1413 SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7) 1414 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1415 SSE_MULT_PS(XMM7, XMM0) 1416 SSE_SUB_PS(XMM6, XMM7) 1417 1418 SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4) 1419 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1420 SSE_MULT_PS(XMM4, XMM1) 1421 SSE_SUB_PS(XMM6, XMM4) 1422 1423 SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5) 1424 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1425 SSE_MULT_PS(XMM5, XMM2) 1426 SSE_SUB_PS(XMM6, XMM5) 1427 1428 SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7) 1429 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1430 SSE_MULT_PS(XMM7, XMM3) 1431 SSE_SUB_PS(XMM6, XMM7) 1432 1433 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6) 1434 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1435 1436 /* Fourth Column */ 1437 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1438 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1439 1440 /* Matrix-Vector Product: */ 1441 SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5) 1442 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1443 SSE_MULT_PS(XMM5, XMM0) 1444 SSE_SUB_PS(XMM4, XMM5) 1445 1446 SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6) 1447 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1448 SSE_MULT_PS(XMM6, XMM1) 1449 SSE_SUB_PS(XMM4, XMM6) 1450 1451 SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7) 1452 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1453 SSE_MULT_PS(XMM7, XMM2) 1454 SSE_SUB_PS(XMM4, XMM7) 1455 1456 SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5) 1457 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1458 SSE_MULT_PS(XMM5, XMM3) 1459 SSE_SUB_PS(XMM4, XMM5) 1460 1461 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4) 1462 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4) 1463 SSE_INLINE_END_2; 1464 pv += 16; 1465 } 1466 PetscCall(PetscLogFlops(128.0 * nz + 112.0)); 1467 } 1468 row = (unsigned int)(*bjtmp++); 1469 /* row = (*bjtmp++)/4; */ 1470 /* bjtmp++; */ 1471 } 1472 /* finished row so stick it into b->a */ 1473 pv = ba + 16 * bi[i]; 1474 pj = bj + bi[i]; 1475 nz = bi[i + 1] - bi[i]; 1476 1477 /* Copy x block back into pv block */ 1478 for (j = 0; j < nz; j++) { 1479 x = rtmp + 16 * ((unsigned int)pj[j]); 1480 /* x = rtmp+4*pj[j]; */ 1481 1482 SSE_INLINE_BEGIN_2(x, pv) 1483 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1484 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1) 1485 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1) 1486 1487 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2) 1488 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2) 1489 1490 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3) 1491 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3) 1492 1493 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4) 1494 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4) 1495 1496 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5) 1497 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5) 1498 1499 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6) 1500 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1501 1502 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1503 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7) 1504 1505 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1506 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1507 SSE_INLINE_END_2; 1508 pv += 16; 1509 } 1510 /* invert diagonal block */ 1511 w = ba + 16 * diag_offset[i]; 1512 if (pivotinblocks) { 1513 PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected)); 1514 if (zeropivotdetected) 
C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * 4 * 4 * 4 * b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C, Mat A, const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
  int             i, j, n = a->mbs;
  unsigned short *bj = (unsigned short *)b->j, *bjtmp, *pj;
  unsigned int    row;
  int            *ajtmpold, nz, *bi = b->i;
  int            *diag_offset = b->diag, *ai = a->i, *aj = a->j;
  MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
  MatScalar      *ba = b->a, *aa = a->a;
  int             nonzero       = 0;
  PetscBool       pivotinblocks = b->pivotinblocks;
  PetscReal       shift         = info->shiftamount;
  PetscBool       allowzeropivot, zeropivotdetected = PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  PetscCheck((unsigned long)aa % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer aa is not 16 byte aligned. SSE will not work.");
  PetscCheck((unsigned long)ba % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer ba is not 16 byte aligned. SSE will not work.");
  PetscCall(PetscMalloc1(16 * (n + 1), &rtmp));
  PetscCheck((unsigned long)rtmp % 16 == 0, PETSC_COMM_SELF, PETSC_ERR_ARG_BADPTR, "Pointer rtmp is not 16 byte aligned.
SSE will not work."); 1556 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1557 /* colscale = 4; */ 1558 /* } */ 1559 if ((unsigned long)bj == (unsigned long)aj) return MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C); 1560 1561 for (i = 0; i < n; i++) { 1562 nz = bi[i + 1] - bi[i]; 1563 bjtmp = bj + bi[i]; 1564 /* zero out the 4x4 block accumulators */ 1565 /* zero out one register */ 1566 XOR_PS(XMM7, XMM7); 1567 for (j = 0; j < nz; j++) { 1568 x = rtmp + 16 * ((unsigned int)bjtmp[j]); 1569 /* x = rtmp+4*bjtmp[j]; */ 1570 SSE_INLINE_BEGIN_1(x) 1571 /* Copy zero register to memory locations */ 1572 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1573 SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM7) 1574 SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM7) 1575 SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM7) 1576 SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM7) 1577 SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM7) 1578 SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM7) 1579 SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM7) 1580 SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM7) 1581 SSE_INLINE_END_1; 1582 } 1583 /* load in initial (unfactored row) */ 1584 nz = ai[i + 1] - ai[i]; 1585 ajtmpold = aj + ai[i]; 1586 v = aa + 16 * ai[i]; 1587 for (j = 0; j < nz; j++) { 1588 x = rtmp + 16 * ajtmpold[j]; 1589 /* x = rtmp+colscale*ajtmpold[j]; */ 1590 /* Copy v block into x block */ 1591 SSE_INLINE_BEGIN_2(v, x) 1592 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1593 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1594 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM0) 1595 1596 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM1) 1597 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM1) 1598 1599 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM2) 1600 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM2) 1601 1602 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM3) 1603 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM3) 1604 1605 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM4) 1606 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM4) 1607 1608 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM5) 1609 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM5) 1610 1611 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM6) 1612 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM6) 1613 1614 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0) 1615 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1616 SSE_INLINE_END_2; 1617 1618 v += 16; 1619 } 1620 /* row = (*bjtmp++)/4; */ 1621 row = (unsigned int)(*bjtmp++); 1622 while (row < i) { 1623 pc = rtmp + 16 * row; 1624 SSE_INLINE_BEGIN_1(pc) 1625 /* Load block from lower triangle */ 1626 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1627 SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM0) 1628 SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM0) 1629 1630 SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM1) 1631 SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM1) 1632 1633 SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM2) 1634 SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM2) 1635 1636 SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM3) 1637 SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM3) 1638 1639 /* Compare block to zero block */ 1640 1641 SSE_COPY_PS(XMM4, XMM7) 1642 SSE_CMPNEQ_PS(XMM4, XMM0) 1643 1644 SSE_COPY_PS(XMM5, XMM7) 1645 SSE_CMPNEQ_PS(XMM5, XMM1) 1646 1647 SSE_COPY_PS(XMM6, XMM7) 1648 SSE_CMPNEQ_PS(XMM6, XMM2) 1649 1650 SSE_CMPNEQ_PS(XMM7, XMM3) 1651 1652 /* Reduce the comparisons to one SSE register */ 1653 SSE_OR_PS(XMM6, XMM7) 1654 SSE_OR_PS(XMM5, XMM4) 1655 SSE_OR_PS(XMM5, XMM6) 1656 SSE_INLINE_END_1; 1657 1658 /* Reduce the one SSE register to an integer register for branching */ 1659 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 
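      /* MOVEMASK packs the sign bit of each of the four lanes of XMM5 (the OR of the four
         CMPNEQ masks above) into the low bits of the integer 'nonzero', so nonzero == 0
         exactly when all 16 entries of this block are zero. */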
1660 MOVEMASK(nonzero, XMM5); 1661 1662 /* If block is nonzero ... */ 1663 if (nonzero) { 1664 pv = ba + 16 * diag_offset[row]; 1665 PREFETCH_L1(&pv[16]); 1666 pj = bj + diag_offset[row] + 1; 1667 1668 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1669 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1670 /* but the diagonal was inverted already */ 1671 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1672 1673 SSE_INLINE_BEGIN_2(pv, pc) 1674 /* Column 0, product is accumulated in XMM4 */ 1675 SSE_LOAD_SS(SSE_ARG_1, FLOAT_0, XMM4) 1676 SSE_SHUFFLE(XMM4, XMM4, 0x00) 1677 SSE_MULT_PS(XMM4, XMM0) 1678 1679 SSE_LOAD_SS(SSE_ARG_1, FLOAT_1, XMM5) 1680 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1681 SSE_MULT_PS(XMM5, XMM1) 1682 SSE_ADD_PS(XMM4, XMM5) 1683 1684 SSE_LOAD_SS(SSE_ARG_1, FLOAT_2, XMM6) 1685 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1686 SSE_MULT_PS(XMM6, XMM2) 1687 SSE_ADD_PS(XMM4, XMM6) 1688 1689 SSE_LOAD_SS(SSE_ARG_1, FLOAT_3, XMM7) 1690 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1691 SSE_MULT_PS(XMM7, XMM3) 1692 SSE_ADD_PS(XMM4, XMM7) 1693 1694 SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM4) 1695 SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM4) 1696 1697 /* Column 1, product is accumulated in XMM5 */ 1698 SSE_LOAD_SS(SSE_ARG_1, FLOAT_4, XMM5) 1699 SSE_SHUFFLE(XMM5, XMM5, 0x00) 1700 SSE_MULT_PS(XMM5, XMM0) 1701 1702 SSE_LOAD_SS(SSE_ARG_1, FLOAT_5, XMM6) 1703 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1704 SSE_MULT_PS(XMM6, XMM1) 1705 SSE_ADD_PS(XMM5, XMM6) 1706 1707 SSE_LOAD_SS(SSE_ARG_1, FLOAT_6, XMM7) 1708 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1709 SSE_MULT_PS(XMM7, XMM2) 1710 SSE_ADD_PS(XMM5, XMM7) 1711 1712 SSE_LOAD_SS(SSE_ARG_1, FLOAT_7, XMM6) 1713 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1714 SSE_MULT_PS(XMM6, XMM3) 1715 SSE_ADD_PS(XMM5, XMM6) 1716 1717 SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM5) 1718 SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM5) 1719 1720 SSE_PREFETCH_L1(SSE_ARG_1, FLOAT_24) 1721 1722 /* Column 2, product is accumulated in XMM6 */ 1723 SSE_LOAD_SS(SSE_ARG_1, FLOAT_8, XMM6) 1724 SSE_SHUFFLE(XMM6, XMM6, 0x00) 1725 SSE_MULT_PS(XMM6, XMM0) 1726 1727 SSE_LOAD_SS(SSE_ARG_1, FLOAT_9, XMM7) 1728 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1729 SSE_MULT_PS(XMM7, XMM1) 1730 SSE_ADD_PS(XMM6, XMM7) 1731 1732 SSE_LOAD_SS(SSE_ARG_1, FLOAT_10, XMM7) 1733 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1734 SSE_MULT_PS(XMM7, XMM2) 1735 SSE_ADD_PS(XMM6, XMM7) 1736 1737 SSE_LOAD_SS(SSE_ARG_1, FLOAT_11, XMM7) 1738 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1739 SSE_MULT_PS(XMM7, XMM3) 1740 SSE_ADD_PS(XMM6, XMM7) 1741 1742 SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM6) 1743 SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6) 1744 1745 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1746 /* Column 3, product is accumulated in XMM0 */ 1747 SSE_LOAD_SS(SSE_ARG_1, FLOAT_12, XMM7) 1748 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1749 SSE_MULT_PS(XMM0, XMM7) 1750 1751 SSE_LOAD_SS(SSE_ARG_1, FLOAT_13, XMM7) 1752 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1753 SSE_MULT_PS(XMM1, XMM7) 1754 SSE_ADD_PS(XMM0, XMM1) 1755 1756 SSE_LOAD_SS(SSE_ARG_1, FLOAT_14, XMM1) 1757 SSE_SHUFFLE(XMM1, XMM1, 0x00) 1758 SSE_MULT_PS(XMM1, XMM2) 1759 SSE_ADD_PS(XMM0, XMM1) 1760 1761 SSE_LOAD_SS(SSE_ARG_1, FLOAT_15, XMM7) 1762 SSE_SHUFFLE(XMM7, XMM7, 0x00) 1763 SSE_MULT_PS(XMM3, XMM7) 1764 SSE_ADD_PS(XMM0, XMM3) 1765 1766 SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM0) 1767 SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0) 1768 1769 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1770 /* This is code to be maintained and read by humans after all. 
        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3, XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2, XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1, XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0, XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz = bi[row + 1] - diag_offset[row] - 1;
        pv += 16;
        for (j = 0; j < nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16 * ((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x, pv)
          /* Load First Column of X */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_0, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_1, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_2, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_3, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_0, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_2, XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_4, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM0)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_5, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM1)
          SSE_SUB_PS(XMM5, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_6, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM2)
          SSE_SUB_PS(XMM5, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_7, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM3)
          SSE_SUB_PS(XMM5, XMM6)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_4, XMM5)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_6, XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2, FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_8, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM0)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_9, XMM4)
          SSE_SHUFFLE(XMM4, XMM4, 0x00)
          SSE_MULT_PS(XMM4, XMM1)
          SSE_SUB_PS(XMM6, XMM4)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_10, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM2)
          SSE_SUB_PS(XMM6, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_11, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM3)
          SSE_SUB_PS(XMM6, XMM7)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_8, XMM6)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_10, XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2, FLOAT_12, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM0)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_13, XMM6)
          SSE_SHUFFLE(XMM6, XMM6, 0x00)
          SSE_MULT_PS(XMM6, XMM1)
          SSE_SUB_PS(XMM4, XMM6)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_14, XMM7)
          SSE_SHUFFLE(XMM7, XMM7, 0x00)
          SSE_MULT_PS(XMM7, XMM2)
          SSE_SUB_PS(XMM4, XMM7)

          SSE_LOAD_SS(SSE_ARG_2, FLOAT_15, XMM5)
          SSE_SHUFFLE(XMM5, XMM5, 0x00)
          SSE_MULT_PS(XMM5, XMM3)
          SSE_SUB_PS(XMM4, XMM5)

          SSE_STOREL_PS(SSE_ARG_1, FLOAT_12, XMM4)
          SSE_STOREH_PS(SSE_ARG_1, FLOAT_14, XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        PetscCall(PetscLogFlops(128.0 * nz + 112.0));
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16 * bi[i];
    pj = bj + bi[i];
    nz = bi[i + 1] - bi[i];

    /* Copy x block back into pv block */
    for (j = 0; j < nz; j++) {
      x = rtmp + 16 * ((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x, pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1, FLOAT_0, XMM1)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_0, XMM1)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_2, XMM2)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_2, XMM2)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_4, XMM3)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_4, XMM3)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_6, XMM4)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_6, XMM4)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_8, XMM5)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_8, XMM5)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_10, XMM6)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_10, XMM6)

      SSE_LOADL_PS(SSE_ARG_1, FLOAT_12, XMM7)
      SSE_STOREL_PS(SSE_ARG_2, FLOAT_12, XMM7)

      SSE_LOADH_PS(SSE_ARG_1, FLOAT_14, XMM0)
      SSE_STOREH_PS(SSE_ARG_2, FLOAT_14, XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16 * diag_offset[i];
    if (pivotinblocks) {
      PetscCall(PetscKernel_A_gets_inverse_A_4(w, shift, allowzeropivot, &zeropivotdetected));
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w));
    }
    /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
  }

  PetscCall(PetscFree(rtmp));

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  PetscCall(PetscLogFlops(1.333333333333 * bs * bs2 * b->mbs));
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(PETSC_SUCCESS);
}

#endif