1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 /* ------------------------------------------------------------*/ 9 /* 10 Version for when blocks are 4 by 4 11 */ 12 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 15 IS isrow = b->row,isicol = b->icol; 16 const PetscInt *r,*ic; 17 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 18 PetscInt *ajtmpold,*ajtmp,nz,row; 19 PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 20 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 21 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 22 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 23 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 24 MatScalar m13,m14,m15,m16; 25 MatScalar *ba = b->a,*aa = a->a; 26 PetscBool pivotinblocks = b->pivotinblocks; 27 PetscReal shift = info->shiftamount; 28 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 29 30 PetscFunctionBegin; 31 PetscCall(ISGetIndices(isrow,&r)); 32 PetscCall(ISGetIndices(isicol,&ic)); 33 PetscCall(PetscMalloc1(16*(n+1),&rtmp)); 34 allowzeropivot = PetscNot(A->erroriffailure); 35 36 for (i=0; i<n; i++) { 37 nz = bi[i+1] - bi[i]; 38 ajtmp = bj + bi[i]; 39 for (j=0; j<nz; j++) { 40 x = rtmp+16*ajtmp[j]; 41 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 42 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 43 } 44 /* load in initial (unfactored row) */ 45 idx = r[i]; 46 nz = ai[idx+1] - ai[idx]; 47 ajtmpold = aj + ai[idx]; 48 v = aa + 16*ai[idx]; 49 for (j=0; j<nz; j++) { 50 x = rtmp+16*ic[ajtmpold[j]]; 51 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 52 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 53 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 54 x[14] = v[14]; x[15] = v[15]; 55 v += 16; 56 } 57 row = *ajtmp++; 58 while (row < i) { 59 pc = rtmp + 16*row; 60 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 61 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 62 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 63 p15 = pc[14]; p16 = pc[15]; 64 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 65 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 66 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 67 || p16 != 0.0) { 68 pv = ba + 16*diag_offset[row]; 69 pj = bj + diag_offset[row] + 1; 70 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 71 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 72 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 73 x15 = pv[14]; x16 = pv[15]; 74 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 75 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 76 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 77 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 78 79 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 80 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 81 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 82 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 83 84 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 85 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 86 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 87 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 88 89 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 90 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 91 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 92 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 93 94 nz = bi[row+1] - diag_offset[row] - 1; 95 pv += 16; 96 for (j=0; j<nz; j++) { 97 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 98 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 99 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 100 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 101 x = rtmp + 16*pj[j]; 102 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 103 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 104 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 105 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 106 107 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 108 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 109 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 110 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 111 112 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 113 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 114 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 115 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 116 117 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 118 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 119 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 120 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 121 122 pv += 16; 123 } 124 PetscCall(PetscLogFlops(128.0*nz+112.0)); 125 } 126 row = *ajtmp++; 127 } 128 /* finished row so stick it into b->a */ 129 pv = ba + 16*bi[i]; 130 pj = bj + bi[i]; 131 nz = bi[i+1] - bi[i]; 132 for (j=0; j<nz; j++) { 133 x = rtmp+16*pj[j]; 134 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 135 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 136 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 137 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 138 pv += 16; 139 } 140 /* invert diagonal block */ 141 w = ba + 16*diag_offset[i]; 142 if (pivotinblocks) { 143 PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected)); 144 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 145 } else { 146 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 147 } 148 } 149 150 PetscCall(PetscFree(rtmp)); 151 PetscCall(ISRestoreIndices(isicol,&ic)); 152 PetscCall(ISRestoreIndices(isrow,&r)); 153 154 C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 155 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 156 C->assembled = PETSC_TRUE; 157 158 PetscCall(PetscLogFlops(1.333333333333*4*4*4*b->mbs)); /* from inverting diagonal blocks */ 159 PetscFunctionReturn(0); 160 } 161 162 /* MatLUFactorNumeric_SeqBAIJ_4 - 163 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 164 PetscKernel_A_gets_A_times_B() 165 PetscKernel_A_gets_A_minus_B_times_C() 166 PetscKernel_A_gets_inverse_A() 167 */ 168 169 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info) 170 { 171 Mat C = B; 172 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 173 IS isrow = b->row,isicol = b->icol; 174 const PetscInt *r,*ic; 175 PetscInt i,j,k,nz,nzL,row; 176 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 177 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 178 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 179 PetscInt flg; 180 PetscReal shift; 181 PetscBool allowzeropivot,zeropivotdetected; 182 183 PetscFunctionBegin; 184 allowzeropivot = PetscNot(A->erroriffailure); 185 PetscCall(ISGetIndices(isrow,&r)); 186 PetscCall(ISGetIndices(isicol,&ic)); 187 188 if (info->shifttype == MAT_SHIFT_NONE) { 189 shift = 0; 190 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 191 shift = info->shiftamount; 192 } 193 194 /* generate work space needed by the factorization */ 195 PetscCall(PetscMalloc2(bs2*n,&rtmp,bs2,&mwork)); 196 PetscCall(PetscArrayzero(rtmp,bs2*n)); 197 198 for (i=0; i<n; i++) { 199 /* zero rtmp */ 200 /* L part */ 201 nz = bi[i+1] - bi[i]; 202 bjtmp = bj + bi[i]; 203 for (j=0; j<nz; j++) { 204 PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2)); 205 } 206 207 /* U part */ 208 nz = bdiag[i]-bdiag[i+1]; 209 bjtmp = bj + bdiag[i+1]+1; 210 for (j=0; j<nz; j++) { 211 PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2)); 212 } 213 214 /* load in initial (unfactored row) */ 215 nz = ai[r[i]+1] - ai[r[i]]; 216 ajtmp = aj + ai[r[i]]; 217 v = aa + bs2*ai[r[i]]; 218 for (j=0; j<nz; j++) { 219 PetscCall(PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2)); 220 } 221 222 /* elimination */ 223 bjtmp = bj + bi[i]; 224 nzL = bi[i+1] - bi[i]; 225 for (k=0; k < nzL; k++) { 226 row = bjtmp[k]; 227 pc = rtmp + bs2*row; 228 for (flg=0,j=0; j<bs2; j++) { 229 if (pc[j]!=0.0) { 230 flg = 1; 231 break; 232 } 233 } 234 if (flg) { 235 pv = b->a + bs2*bdiag[row]; 236 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 237 PetscCall(PetscKernel_A_gets_A_times_B_4(pc,pv,mwork)); 238 239 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 240 pv = b->a + bs2*(bdiag[row+1]+1); 241 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 242 for (j=0; j<nz; j++) { 243 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 244 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 245 v = rtmp + bs2*pj[j]; 246 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv)); 247 pv += bs2; 248 } 249 PetscCall(PetscLogFlops(128.0*nz+112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 250 } 251 } 252 253 /* finished row so stick it into b->a */ 254 /* L part */ 255 pv = b->a + bs2*bi[i]; 256 pj = b->j + bi[i]; 257 nz = bi[i+1] - bi[i]; 258 for (j=0; j<nz; j++) { 259 PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2)); 260 } 261 262 /* Mark diagonal and invert diagonal for simpler triangular solves */ 263 pv = b->a + bs2*bdiag[i]; 264 pj = b->j + bdiag[i]; 265 PetscCall(PetscArraycpy(pv,rtmp+bs2*pj[0],bs2)); 266 PetscCall(PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected)); 267 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 268 269 /* U part */ 270 pv = b->a + bs2*(bdiag[i+1]+1); 271 pj = b->j + bdiag[i+1]+1; 272 nz = bdiag[i] - bdiag[i+1] - 1; 273 for (j=0; j<nz; j++) { 274 PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2)); 275 } 276 } 277 278 PetscCall(PetscFree2(rtmp,mwork)); 279 PetscCall(ISRestoreIndices(isicol,&ic)); 280 PetscCall(ISRestoreIndices(isrow,&r)); 281 282 C->ops->solve = MatSolve_SeqBAIJ_4; 283 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 284 C->assembled = PETSC_TRUE; 285 286 PetscCall(PetscLogFlops(1.333333333333*4*4*4*n)); /* from inverting diagonal blocks */ 287 PetscFunctionReturn(0); 288 } 289 290 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info) 291 { 292 /* 293 Default Version for when blocks are 4 by 4 Using natural ordering 294 */ 295 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 296 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 297 PetscInt *ajtmpold,*ajtmp,nz,row; 298 PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 299 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 300 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 301 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 302 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 303 MatScalar m13,m14,m15,m16; 304 MatScalar *ba = b->a,*aa = a->a; 305 PetscBool pivotinblocks = b->pivotinblocks; 306 PetscReal shift = info->shiftamount; 307 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 308 309 PetscFunctionBegin; 310 allowzeropivot = PetscNot(A->erroriffailure); 311 PetscCall(PetscMalloc1(16*(n+1),&rtmp)); 312 313 for (i=0; i<n; i++) { 314 nz = bi[i+1] - bi[i]; 315 ajtmp = bj + bi[i]; 316 for (j=0; j<nz; j++) { 317 x = rtmp+16*ajtmp[j]; 318 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 319 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 320 } 321 /* load in initial (unfactored row) */ 322 nz = ai[i+1] - ai[i]; 323 ajtmpold = aj + ai[i]; 324 v = aa + 16*ai[i]; 325 for (j=0; j<nz; j++) { 326 x = rtmp+16*ajtmpold[j]; 327 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 328 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 329 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 330 x[14] = v[14]; x[15] = v[15]; 331 v += 16; 332 } 333 row = *ajtmp++; 334 while (row < i) { 335 pc = rtmp + 16*row; 336 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 337 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 338 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 339 p15 = pc[14]; p16 = pc[15]; 340 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 341 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 342 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 343 || p16 != 0.0) { 344 pv = ba + 16*diag_offset[row]; 345 pj = bj + diag_offset[row] + 1; 346 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 347 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 348 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 349 x15 = pv[14]; x16 = pv[15]; 350 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 351 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 352 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 353 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 354 355 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 356 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 357 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 358 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 359 360 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 361 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 362 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 363 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 364 365 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 366 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 367 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 368 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 369 nz = bi[row+1] - diag_offset[row] - 1; 370 pv += 16; 371 for (j=0; j<nz; j++) { 372 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 373 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 374 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 375 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 376 x = rtmp + 16*pj[j]; 377 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 378 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 379 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 380 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 381 382 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 383 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 384 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 385 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 386 387 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 388 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 389 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 390 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 391 392 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 393 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 394 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 395 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 396 397 pv += 16; 398 } 399 PetscCall(PetscLogFlops(128.0*nz+112.0)); 400 } 401 row = *ajtmp++; 402 } 403 /* finished row so stick it into b->a */ 404 pv = ba + 16*bi[i]; 405 pj = bj + bi[i]; 406 nz = bi[i+1] - bi[i]; 407 for (j=0; j<nz; j++) { 408 x = rtmp+16*pj[j]; 409 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 410 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 411 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 412 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 413 pv += 16; 414 } 415 /* invert diagonal block */ 416 w = ba + 16*diag_offset[i]; 417 if (pivotinblocks) { 418 PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected)); 419 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 420 } else { 421 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 422 } 423 } 424 425 PetscCall(PetscFree(rtmp)); 426 427 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 428 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 429 C->assembled = PETSC_TRUE; 430 431 PetscCall(PetscLogFlops(1.333333333333*4*4*4*b->mbs)); /* from inverting diagonal blocks */ 432 PetscFunctionReturn(0); 433 } 434 435 /* 436 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 437 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 438 */ 439 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 440 { 441 Mat C =B; 442 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 443 PetscInt i,j,k,nz,nzL,row; 444 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 445 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 446 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 447 PetscInt flg; 448 PetscReal shift; 449 PetscBool allowzeropivot,zeropivotdetected; 450 451 PetscFunctionBegin; 452 allowzeropivot = PetscNot(A->erroriffailure); 453 454 /* generate work space needed by the factorization */ 455 PetscCall(PetscMalloc2(bs2*n,&rtmp,bs2,&mwork)); 456 PetscCall(PetscArrayzero(rtmp,bs2*n)); 457 458 if (info->shifttype == MAT_SHIFT_NONE) { 459 shift = 0; 460 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 461 shift = info->shiftamount; 462 } 463 464 for (i=0; i<n; i++) { 465 /* zero rtmp */ 466 /* L part */ 467 nz = bi[i+1] - bi[i]; 468 bjtmp = bj + bi[i]; 469 for (j=0; j<nz; j++) { 470 PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2)); 471 } 472 473 /* U part */ 474 nz = bdiag[i] - bdiag[i+1]; 475 bjtmp = bj + bdiag[i+1]+1; 476 for (j=0; j<nz; j++) { 477 PetscCall(PetscArrayzero(rtmp+bs2*bjtmp[j],bs2)); 478 } 479 480 /* load in initial (unfactored row) */ 481 nz = ai[i+1] - ai[i]; 482 ajtmp = aj + ai[i]; 483 v = aa + bs2*ai[i]; 484 for (j=0; j<nz; j++) { 485 PetscCall(PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2)); 486 } 487 488 /* elimination */ 489 bjtmp = bj + bi[i]; 490 nzL = bi[i+1] - bi[i]; 491 for (k=0; k < nzL; k++) { 492 row = bjtmp[k]; 493 pc = rtmp + bs2*row; 494 for (flg=0,j=0; j<bs2; j++) { 495 if (pc[j]!=0.0) { 496 flg = 1; 497 break; 498 } 499 } 500 if (flg) { 501 pv = b->a + bs2*bdiag[row]; 502 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 503 PetscCall(PetscKernel_A_gets_A_times_B_4(pc,pv,mwork)); 504 505 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 506 pv = b->a + bs2*(bdiag[row+1]+1); 507 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 508 for (j=0; j<nz; j++) { 509 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 510 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 511 v = rtmp + bs2*pj[j]; 512 PetscCall(PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv)); 513 pv += bs2; 514 } 515 PetscCall(PetscLogFlops(128.0*nz+112)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 516 } 517 } 518 519 /* finished row so stick it into b->a */ 520 /* L part */ 521 pv = b->a + bs2*bi[i]; 522 pj = b->j + bi[i]; 523 nz = bi[i+1] - bi[i]; 524 for (j=0; j<nz; j++) { 525 PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2)); 526 } 527 528 /* Mark diagonal and invert diagonal for simpler triangular solves */ 529 pv = b->a + bs2*bdiag[i]; 530 pj = b->j + bdiag[i]; 531 PetscCall(PetscArraycpy(pv,rtmp+bs2*pj[0],bs2)); 532 PetscCall(PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected)); 533 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 534 535 /* U part */ 536 pv = b->a + bs2*(bdiag[i+1]+1); 537 pj = b->j + bdiag[i+1]+1; 538 nz = bdiag[i] - bdiag[i+1] - 1; 539 for (j=0; j<nz; j++) { 540 PetscCall(PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2)); 541 } 542 } 543 PetscCall(PetscFree2(rtmp,mwork)); 544 545 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 546 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 547 C->assembled = PETSC_TRUE; 548 549 PetscCall(PetscLogFlops(1.333333333333*4*4*4*n)); /* from inverting diagonal blocks */ 550 PetscFunctionReturn(0); 551 } 552 553 #if defined(PETSC_HAVE_SSE) 554 555 #include PETSC_HAVE_SSE 556 557 /* SSE Version for when blocks are 4 by 4 Using natural ordering */ 558 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info) 559 { 560 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 561 int i,j,n = a->mbs; 562 int *bj = b->j,*bjtmp,*pj; 563 int row; 564 int *ajtmpold,nz,*bi=b->i; 565 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 566 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 567 MatScalar *ba = b->a,*aa = a->a; 568 int nonzero=0; 569 PetscBool pivotinblocks = b->pivotinblocks; 570 PetscReal shift = info->shiftamount; 571 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 572 573 PetscFunctionBegin; 574 allowzeropivot = PetscNot(A->erroriffailure); 575 SSE_SCOPE_BEGIN; 576 577 PetscCheck((unsigned long)aa%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 578 PetscCheck((unsigned long)ba%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 579 PetscCall(PetscMalloc1(16*(n+1),&rtmp)); 580 PetscCheck((unsigned long)rtmp%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 581 /* if ((unsigned long)bj==(unsigned long)aj) { */ 582 /* colscale = 4; */ 583 /* } */ 584 for (i=0; i<n; i++) { 585 nz = bi[i+1] - bi[i]; 586 bjtmp = bj + bi[i]; 587 /* zero out the 4x4 block accumulators */ 588 /* zero out one register */ 589 XOR_PS(XMM7,XMM7); 590 for (j=0; j<nz; j++) { 591 x = rtmp+16*bjtmp[j]; 592 /* x = rtmp+4*bjtmp[j]; */ 593 SSE_INLINE_BEGIN_1(x) 594 /* Copy zero register to memory locations */ 595 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 596 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 597 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 598 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 599 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 600 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 601 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 602 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 603 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 604 SSE_INLINE_END_1; 605 } 606 /* load in initial (unfactored row) */ 607 nz = ai[i+1] - ai[i]; 608 ajtmpold = aj + ai[i]; 609 v = aa + 16*ai[i]; 610 for (j=0; j<nz; j++) { 611 x = rtmp+16*ajtmpold[j]; 612 /* x = rtmp+colscale*ajtmpold[j]; */ 613 /* Copy v block into x block */ 614 SSE_INLINE_BEGIN_2(v,x) 615 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 616 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 617 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 618 619 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 620 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 621 622 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 623 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 624 625 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 626 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 627 628 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 629 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 630 631 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 632 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 633 634 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 635 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 636 637 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 638 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 639 SSE_INLINE_END_2; 640 641 v += 16; 642 } 643 /* row = (*bjtmp++)/4; */ 644 row = *bjtmp++; 645 while (row < i) { 646 pc = rtmp + 16*row; 647 SSE_INLINE_BEGIN_1(pc) 648 /* Load block from lower triangle */ 649 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 650 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 651 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 652 653 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 654 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 655 656 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 657 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 658 659 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 660 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 661 662 /* Compare block to zero block */ 663 664 SSE_COPY_PS(XMM4,XMM7) 665 SSE_CMPNEQ_PS(XMM4,XMM0) 666 667 SSE_COPY_PS(XMM5,XMM7) 668 SSE_CMPNEQ_PS(XMM5,XMM1) 669 670 SSE_COPY_PS(XMM6,XMM7) 671 SSE_CMPNEQ_PS(XMM6,XMM2) 672 673 SSE_CMPNEQ_PS(XMM7,XMM3) 674 675 /* Reduce the comparisons to one SSE register */ 676 SSE_OR_PS(XMM6,XMM7) 677 SSE_OR_PS(XMM5,XMM4) 678 SSE_OR_PS(XMM5,XMM6) 679 SSE_INLINE_END_1; 680 681 /* Reduce the one SSE register to an integer register for branching */ 682 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 683 MOVEMASK(nonzero,XMM5); 684 685 /* If block is nonzero ... */ 686 if (nonzero) { 687 pv = ba + 16*diag_offset[row]; 688 PREFETCH_L1(&pv[16]); 689 pj = bj + diag_offset[row] + 1; 690 691 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 692 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 693 /* but the diagonal was inverted already */ 694 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 695 696 SSE_INLINE_BEGIN_2(pv,pc) 697 /* Column 0, product is accumulated in XMM4 */ 698 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 699 SSE_SHUFFLE(XMM4,XMM4,0x00) 700 SSE_MULT_PS(XMM4,XMM0) 701 702 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 703 SSE_SHUFFLE(XMM5,XMM5,0x00) 704 SSE_MULT_PS(XMM5,XMM1) 705 SSE_ADD_PS(XMM4,XMM5) 706 707 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 708 SSE_SHUFFLE(XMM6,XMM6,0x00) 709 SSE_MULT_PS(XMM6,XMM2) 710 SSE_ADD_PS(XMM4,XMM6) 711 712 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 713 SSE_SHUFFLE(XMM7,XMM7,0x00) 714 SSE_MULT_PS(XMM7,XMM3) 715 SSE_ADD_PS(XMM4,XMM7) 716 717 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 718 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 719 720 /* Column 1, product is accumulated in XMM5 */ 721 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 722 SSE_SHUFFLE(XMM5,XMM5,0x00) 723 SSE_MULT_PS(XMM5,XMM0) 724 725 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 726 SSE_SHUFFLE(XMM6,XMM6,0x00) 727 SSE_MULT_PS(XMM6,XMM1) 728 SSE_ADD_PS(XMM5,XMM6) 729 730 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 731 SSE_SHUFFLE(XMM7,XMM7,0x00) 732 SSE_MULT_PS(XMM7,XMM2) 733 SSE_ADD_PS(XMM5,XMM7) 734 735 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 736 SSE_SHUFFLE(XMM6,XMM6,0x00) 737 SSE_MULT_PS(XMM6,XMM3) 738 SSE_ADD_PS(XMM5,XMM6) 739 740 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 741 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 742 743 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 744 745 /* Column 2, product is accumulated in XMM6 */ 746 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 747 SSE_SHUFFLE(XMM6,XMM6,0x00) 748 SSE_MULT_PS(XMM6,XMM0) 749 750 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 751 SSE_SHUFFLE(XMM7,XMM7,0x00) 752 SSE_MULT_PS(XMM7,XMM1) 753 SSE_ADD_PS(XMM6,XMM7) 754 755 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 756 SSE_SHUFFLE(XMM7,XMM7,0x00) 757 SSE_MULT_PS(XMM7,XMM2) 758 SSE_ADD_PS(XMM6,XMM7) 759 760 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 761 SSE_SHUFFLE(XMM7,XMM7,0x00) 762 SSE_MULT_PS(XMM7,XMM3) 763 SSE_ADD_PS(XMM6,XMM7) 764 765 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 766 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 767 768 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 769 /* Column 3, product is accumulated in XMM0 */ 770 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 771 SSE_SHUFFLE(XMM7,XMM7,0x00) 772 SSE_MULT_PS(XMM0,XMM7) 773 774 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 775 SSE_SHUFFLE(XMM7,XMM7,0x00) 776 SSE_MULT_PS(XMM1,XMM7) 777 SSE_ADD_PS(XMM0,XMM1) 778 779 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 780 SSE_SHUFFLE(XMM1,XMM1,0x00) 781 SSE_MULT_PS(XMM1,XMM2) 782 SSE_ADD_PS(XMM0,XMM1) 783 784 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 785 SSE_SHUFFLE(XMM7,XMM7,0x00) 786 SSE_MULT_PS(XMM3,XMM7) 787 SSE_ADD_PS(XMM0,XMM3) 788 789 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 790 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 791 792 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 793 /* This is code to be maintained and read by humans after all. */ 794 /* Copy Multiplier Col 3 into XMM3 */ 795 SSE_COPY_PS(XMM3,XMM0) 796 /* Copy Multiplier Col 2 into XMM2 */ 797 SSE_COPY_PS(XMM2,XMM6) 798 /* Copy Multiplier Col 1 into XMM1 */ 799 SSE_COPY_PS(XMM1,XMM5) 800 /* Copy Multiplier Col 0 into XMM0 */ 801 SSE_COPY_PS(XMM0,XMM4) 802 SSE_INLINE_END_2; 803 804 /* Update the row: */ 805 nz = bi[row+1] - diag_offset[row] - 1; 806 pv += 16; 807 for (j=0; j<nz; j++) { 808 PREFETCH_L1(&pv[16]); 809 x = rtmp + 16*pj[j]; 810 /* x = rtmp + 4*pj[j]; */ 811 812 /* X:=X-M*PV, One column at a time */ 813 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 814 SSE_INLINE_BEGIN_2(x,pv) 815 /* Load First Column of X*/ 816 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 817 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 818 819 /* Matrix-Vector Product: */ 820 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 821 SSE_SHUFFLE(XMM5,XMM5,0x00) 822 SSE_MULT_PS(XMM5,XMM0) 823 SSE_SUB_PS(XMM4,XMM5) 824 825 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 826 SSE_SHUFFLE(XMM6,XMM6,0x00) 827 SSE_MULT_PS(XMM6,XMM1) 828 SSE_SUB_PS(XMM4,XMM6) 829 830 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 831 SSE_SHUFFLE(XMM7,XMM7,0x00) 832 SSE_MULT_PS(XMM7,XMM2) 833 SSE_SUB_PS(XMM4,XMM7) 834 835 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 836 SSE_SHUFFLE(XMM5,XMM5,0x00) 837 SSE_MULT_PS(XMM5,XMM3) 838 SSE_SUB_PS(XMM4,XMM5) 839 840 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 841 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 842 843 /* Second Column */ 844 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 845 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 846 847 /* Matrix-Vector Product: */ 848 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 849 SSE_SHUFFLE(XMM6,XMM6,0x00) 850 SSE_MULT_PS(XMM6,XMM0) 851 SSE_SUB_PS(XMM5,XMM6) 852 853 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 854 SSE_SHUFFLE(XMM7,XMM7,0x00) 855 SSE_MULT_PS(XMM7,XMM1) 856 SSE_SUB_PS(XMM5,XMM7) 857 858 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 859 SSE_SHUFFLE(XMM4,XMM4,0x00) 860 SSE_MULT_PS(XMM4,XMM2) 861 SSE_SUB_PS(XMM5,XMM4) 862 863 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 864 SSE_SHUFFLE(XMM6,XMM6,0x00) 865 SSE_MULT_PS(XMM6,XMM3) 866 SSE_SUB_PS(XMM5,XMM6) 867 868 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 869 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 870 871 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 872 873 /* Third Column */ 874 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 875 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 876 877 /* Matrix-Vector Product: */ 878 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 879 SSE_SHUFFLE(XMM7,XMM7,0x00) 880 SSE_MULT_PS(XMM7,XMM0) 881 SSE_SUB_PS(XMM6,XMM7) 882 883 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 884 SSE_SHUFFLE(XMM4,XMM4,0x00) 885 SSE_MULT_PS(XMM4,XMM1) 886 SSE_SUB_PS(XMM6,XMM4) 887 888 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 889 SSE_SHUFFLE(XMM5,XMM5,0x00) 890 SSE_MULT_PS(XMM5,XMM2) 891 SSE_SUB_PS(XMM6,XMM5) 892 893 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 894 SSE_SHUFFLE(XMM7,XMM7,0x00) 895 SSE_MULT_PS(XMM7,XMM3) 896 SSE_SUB_PS(XMM6,XMM7) 897 898 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 899 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 900 901 /* Fourth Column */ 902 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 903 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 904 905 /* Matrix-Vector Product: */ 906 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 907 SSE_SHUFFLE(XMM5,XMM5,0x00) 908 SSE_MULT_PS(XMM5,XMM0) 909 SSE_SUB_PS(XMM4,XMM5) 910 911 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 912 SSE_SHUFFLE(XMM6,XMM6,0x00) 913 SSE_MULT_PS(XMM6,XMM1) 914 SSE_SUB_PS(XMM4,XMM6) 915 916 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 917 SSE_SHUFFLE(XMM7,XMM7,0x00) 918 SSE_MULT_PS(XMM7,XMM2) 919 SSE_SUB_PS(XMM4,XMM7) 920 921 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 922 SSE_SHUFFLE(XMM5,XMM5,0x00) 923 SSE_MULT_PS(XMM5,XMM3) 924 SSE_SUB_PS(XMM4,XMM5) 925 926 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 927 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 928 SSE_INLINE_END_2; 929 pv += 16; 930 } 931 PetscCall(PetscLogFlops(128.0*nz+112.0)); 932 } 933 row = *bjtmp++; 934 /* row = (*bjtmp++)/4; */ 935 } 936 /* finished row so stick it into b->a */ 937 pv = ba + 16*bi[i]; 938 pj = bj + bi[i]; 939 nz = bi[i+1] - bi[i]; 940 941 /* Copy x block back into pv block */ 942 for (j=0; j<nz; j++) { 943 x = rtmp+16*pj[j]; 944 /* x = rtmp+4*pj[j]; */ 945 946 SSE_INLINE_BEGIN_2(x,pv) 947 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 948 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 949 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 950 951 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 952 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 953 954 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 955 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 956 957 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 958 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 959 960 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 961 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 962 963 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 964 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 965 966 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 967 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 968 969 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 970 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 971 SSE_INLINE_END_2; 972 pv += 16; 973 } 974 /* invert diagonal block */ 975 w = ba + 16*diag_offset[i]; 976 if (pivotinblocks) { 977 PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected)); 978 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 979 } else { 980 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 981 } 982 /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 983 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 984 } 985 986 PetscCall(PetscFree(rtmp)); 987 988 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 989 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 990 C->assembled = PETSC_TRUE; 991 992 PetscCall(PetscLogFlops(1.333333333333*bs*bs2*b->mbs)); 993 /* Flop Count from inverting diagonal blocks */ 994 SSE_SCOPE_END; 995 PetscFunctionReturn(0); 996 } 997 998 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C) 999 { 1000 Mat A =C; 1001 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1002 int i,j,n = a->mbs; 1003 unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; 1004 unsigned short *aj = (unsigned short*)(a->j),*ajtmp; 1005 unsigned int row; 1006 int nz,*bi=b->i; 1007 int *diag_offset = b->diag,*ai=a->i; 1008 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1009 MatScalar *ba = b->a,*aa = a->a; 1010 int nonzero=0; 1011 PetscBool pivotinblocks = b->pivotinblocks; 1012 PetscReal shift = info->shiftamount; 1013 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 1014 1015 PetscFunctionBegin; 1016 allowzeropivot = PetscNot(A->erroriffailure); 1017 SSE_SCOPE_BEGIN; 1018 1019 PetscCheck((unsigned long)aa%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1020 PetscCheck((unsigned long)ba%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1021 PetscCall(PetscMalloc1(16*(n+1),&rtmp)); 1022 PetscCheck((unsigned long)rtmp%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1023 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1024 /* colscale = 4; */ 1025 /* } */ 1026 1027 for (i=0; i<n; i++) { 1028 nz = bi[i+1] - bi[i]; 1029 bjtmp = bj + bi[i]; 1030 /* zero out the 4x4 block accumulators */ 1031 /* zero out one register */ 1032 XOR_PS(XMM7,XMM7); 1033 for (j=0; j<nz; j++) { 1034 x = rtmp+16*((unsigned int)bjtmp[j]); 1035 /* x = rtmp+4*bjtmp[j]; */ 1036 SSE_INLINE_BEGIN_1(x) 1037 /* Copy zero register to memory locations */ 1038 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1039 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1040 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1041 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1042 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1043 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1044 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1045 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1046 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1047 SSE_INLINE_END_1; 1048 } 1049 /* load in initial (unfactored row) */ 1050 nz = ai[i+1] - ai[i]; 1051 ajtmp = aj + ai[i]; 1052 v = aa + 16*ai[i]; 1053 for (j=0; j<nz; j++) { 1054 x = rtmp+16*((unsigned int)ajtmp[j]); 1055 /* x = rtmp+colscale*ajtmp[j]; */ 1056 /* Copy v block into x block */ 1057 SSE_INLINE_BEGIN_2(v,x) 1058 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1059 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1060 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1061 1062 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1063 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1064 1065 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1066 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1067 1068 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1069 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1070 1071 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1072 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1073 1074 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1075 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1076 1077 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1078 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1079 1080 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1081 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1082 SSE_INLINE_END_2; 1083 1084 v += 16; 1085 } 1086 /* row = (*bjtmp++)/4; */ 1087 row = (unsigned int)(*bjtmp++); 1088 while (row < i) { 1089 pc = rtmp + 16*row; 1090 SSE_INLINE_BEGIN_1(pc) 1091 /* Load block from lower triangle */ 1092 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1093 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1094 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1095 1096 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1097 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1098 1099 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1100 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1101 1102 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1103 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1104 1105 /* Compare block to zero block */ 1106 1107 SSE_COPY_PS(XMM4,XMM7) 1108 SSE_CMPNEQ_PS(XMM4,XMM0) 1109 1110 SSE_COPY_PS(XMM5,XMM7) 1111 SSE_CMPNEQ_PS(XMM5,XMM1) 1112 1113 SSE_COPY_PS(XMM6,XMM7) 1114 SSE_CMPNEQ_PS(XMM6,XMM2) 1115 1116 SSE_CMPNEQ_PS(XMM7,XMM3) 1117 1118 /* Reduce the comparisons to one SSE register */ 1119 SSE_OR_PS(XMM6,XMM7) 1120 SSE_OR_PS(XMM5,XMM4) 1121 SSE_OR_PS(XMM5,XMM6) 1122 SSE_INLINE_END_1; 1123 1124 /* Reduce the one SSE register to an integer register for branching */ 1125 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1126 MOVEMASK(nonzero,XMM5); 1127 1128 /* If block is nonzero ... */ 1129 if (nonzero) { 1130 pv = ba + 16*diag_offset[row]; 1131 PREFETCH_L1(&pv[16]); 1132 pj = bj + diag_offset[row] + 1; 1133 1134 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1135 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1136 /* but the diagonal was inverted already */ 1137 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1138 1139 SSE_INLINE_BEGIN_2(pv,pc) 1140 /* Column 0, product is accumulated in XMM4 */ 1141 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1142 SSE_SHUFFLE(XMM4,XMM4,0x00) 1143 SSE_MULT_PS(XMM4,XMM0) 1144 1145 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1146 SSE_SHUFFLE(XMM5,XMM5,0x00) 1147 SSE_MULT_PS(XMM5,XMM1) 1148 SSE_ADD_PS(XMM4,XMM5) 1149 1150 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1151 SSE_SHUFFLE(XMM6,XMM6,0x00) 1152 SSE_MULT_PS(XMM6,XMM2) 1153 SSE_ADD_PS(XMM4,XMM6) 1154 1155 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1156 SSE_SHUFFLE(XMM7,XMM7,0x00) 1157 SSE_MULT_PS(XMM7,XMM3) 1158 SSE_ADD_PS(XMM4,XMM7) 1159 1160 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1161 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1162 1163 /* Column 1, product is accumulated in XMM5 */ 1164 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1165 SSE_SHUFFLE(XMM5,XMM5,0x00) 1166 SSE_MULT_PS(XMM5,XMM0) 1167 1168 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1169 SSE_SHUFFLE(XMM6,XMM6,0x00) 1170 SSE_MULT_PS(XMM6,XMM1) 1171 SSE_ADD_PS(XMM5,XMM6) 1172 1173 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1174 SSE_SHUFFLE(XMM7,XMM7,0x00) 1175 SSE_MULT_PS(XMM7,XMM2) 1176 SSE_ADD_PS(XMM5,XMM7) 1177 1178 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1179 SSE_SHUFFLE(XMM6,XMM6,0x00) 1180 SSE_MULT_PS(XMM6,XMM3) 1181 SSE_ADD_PS(XMM5,XMM6) 1182 1183 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1184 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1185 1186 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1187 1188 /* Column 2, product is accumulated in XMM6 */ 1189 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1190 SSE_SHUFFLE(XMM6,XMM6,0x00) 1191 SSE_MULT_PS(XMM6,XMM0) 1192 1193 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1194 SSE_SHUFFLE(XMM7,XMM7,0x00) 1195 SSE_MULT_PS(XMM7,XMM1) 1196 SSE_ADD_PS(XMM6,XMM7) 1197 1198 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1199 SSE_SHUFFLE(XMM7,XMM7,0x00) 1200 SSE_MULT_PS(XMM7,XMM2) 1201 SSE_ADD_PS(XMM6,XMM7) 1202 1203 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1204 SSE_SHUFFLE(XMM7,XMM7,0x00) 1205 SSE_MULT_PS(XMM7,XMM3) 1206 SSE_ADD_PS(XMM6,XMM7) 1207 1208 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1209 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1210 1211 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1212 /* Column 3, product is accumulated in XMM0 */ 1213 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1214 SSE_SHUFFLE(XMM7,XMM7,0x00) 1215 SSE_MULT_PS(XMM0,XMM7) 1216 1217 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1218 SSE_SHUFFLE(XMM7,XMM7,0x00) 1219 SSE_MULT_PS(XMM1,XMM7) 1220 SSE_ADD_PS(XMM0,XMM1) 1221 1222 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1223 SSE_SHUFFLE(XMM1,XMM1,0x00) 1224 SSE_MULT_PS(XMM1,XMM2) 1225 SSE_ADD_PS(XMM0,XMM1) 1226 1227 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1228 SSE_SHUFFLE(XMM7,XMM7,0x00) 1229 SSE_MULT_PS(XMM3,XMM7) 1230 SSE_ADD_PS(XMM0,XMM3) 1231 1232 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1233 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1234 1235 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1236 /* This is code to be maintained and read by humans after all. */ 1237 /* Copy Multiplier Col 3 into XMM3 */ 1238 SSE_COPY_PS(XMM3,XMM0) 1239 /* Copy Multiplier Col 2 into XMM2 */ 1240 SSE_COPY_PS(XMM2,XMM6) 1241 /* Copy Multiplier Col 1 into XMM1 */ 1242 SSE_COPY_PS(XMM1,XMM5) 1243 /* Copy Multiplier Col 0 into XMM0 */ 1244 SSE_COPY_PS(XMM0,XMM4) 1245 SSE_INLINE_END_2; 1246 1247 /* Update the row: */ 1248 nz = bi[row+1] - diag_offset[row] - 1; 1249 pv += 16; 1250 for (j=0; j<nz; j++) { 1251 PREFETCH_L1(&pv[16]); 1252 x = rtmp + 16*((unsigned int)pj[j]); 1253 /* x = rtmp + 4*pj[j]; */ 1254 1255 /* X:=X-M*PV, One column at a time */ 1256 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1257 SSE_INLINE_BEGIN_2(x,pv) 1258 /* Load First Column of X*/ 1259 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1260 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1261 1262 /* Matrix-Vector Product: */ 1263 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1264 SSE_SHUFFLE(XMM5,XMM5,0x00) 1265 SSE_MULT_PS(XMM5,XMM0) 1266 SSE_SUB_PS(XMM4,XMM5) 1267 1268 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1269 SSE_SHUFFLE(XMM6,XMM6,0x00) 1270 SSE_MULT_PS(XMM6,XMM1) 1271 SSE_SUB_PS(XMM4,XMM6) 1272 1273 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1274 SSE_SHUFFLE(XMM7,XMM7,0x00) 1275 SSE_MULT_PS(XMM7,XMM2) 1276 SSE_SUB_PS(XMM4,XMM7) 1277 1278 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1279 SSE_SHUFFLE(XMM5,XMM5,0x00) 1280 SSE_MULT_PS(XMM5,XMM3) 1281 SSE_SUB_PS(XMM4,XMM5) 1282 1283 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1284 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1285 1286 /* Second Column */ 1287 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1288 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1289 1290 /* Matrix-Vector Product: */ 1291 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1292 SSE_SHUFFLE(XMM6,XMM6,0x00) 1293 SSE_MULT_PS(XMM6,XMM0) 1294 SSE_SUB_PS(XMM5,XMM6) 1295 1296 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1297 SSE_SHUFFLE(XMM7,XMM7,0x00) 1298 SSE_MULT_PS(XMM7,XMM1) 1299 SSE_SUB_PS(XMM5,XMM7) 1300 1301 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1302 SSE_SHUFFLE(XMM4,XMM4,0x00) 1303 SSE_MULT_PS(XMM4,XMM2) 1304 SSE_SUB_PS(XMM5,XMM4) 1305 1306 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1307 SSE_SHUFFLE(XMM6,XMM6,0x00) 1308 SSE_MULT_PS(XMM6,XMM3) 1309 SSE_SUB_PS(XMM5,XMM6) 1310 1311 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1312 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1313 1314 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1315 1316 /* Third Column */ 1317 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1318 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1319 1320 /* Matrix-Vector Product: */ 1321 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1322 SSE_SHUFFLE(XMM7,XMM7,0x00) 1323 SSE_MULT_PS(XMM7,XMM0) 1324 SSE_SUB_PS(XMM6,XMM7) 1325 1326 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1327 SSE_SHUFFLE(XMM4,XMM4,0x00) 1328 SSE_MULT_PS(XMM4,XMM1) 1329 SSE_SUB_PS(XMM6,XMM4) 1330 1331 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1332 SSE_SHUFFLE(XMM5,XMM5,0x00) 1333 SSE_MULT_PS(XMM5,XMM2) 1334 SSE_SUB_PS(XMM6,XMM5) 1335 1336 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1337 SSE_SHUFFLE(XMM7,XMM7,0x00) 1338 SSE_MULT_PS(XMM7,XMM3) 1339 SSE_SUB_PS(XMM6,XMM7) 1340 1341 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1342 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1343 1344 /* Fourth Column */ 1345 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1346 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1347 1348 /* Matrix-Vector Product: */ 1349 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1350 SSE_SHUFFLE(XMM5,XMM5,0x00) 1351 SSE_MULT_PS(XMM5,XMM0) 1352 SSE_SUB_PS(XMM4,XMM5) 1353 1354 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1355 SSE_SHUFFLE(XMM6,XMM6,0x00) 1356 SSE_MULT_PS(XMM6,XMM1) 1357 SSE_SUB_PS(XMM4,XMM6) 1358 1359 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1360 SSE_SHUFFLE(XMM7,XMM7,0x00) 1361 SSE_MULT_PS(XMM7,XMM2) 1362 SSE_SUB_PS(XMM4,XMM7) 1363 1364 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1365 SSE_SHUFFLE(XMM5,XMM5,0x00) 1366 SSE_MULT_PS(XMM5,XMM3) 1367 SSE_SUB_PS(XMM4,XMM5) 1368 1369 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1370 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1371 SSE_INLINE_END_2; 1372 pv += 16; 1373 } 1374 PetscCall(PetscLogFlops(128.0*nz+112.0)); 1375 } 1376 row = (unsigned int)(*bjtmp++); 1377 /* row = (*bjtmp++)/4; */ 1378 /* bjtmp++; */ 1379 } 1380 /* finished row so stick it into b->a */ 1381 pv = ba + 16*bi[i]; 1382 pj = bj + bi[i]; 1383 nz = bi[i+1] - bi[i]; 1384 1385 /* Copy x block back into pv block */ 1386 for (j=0; j<nz; j++) { 1387 x = rtmp+16*((unsigned int)pj[j]); 1388 /* x = rtmp+4*pj[j]; */ 1389 1390 SSE_INLINE_BEGIN_2(x,pv) 1391 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1392 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1393 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1394 1395 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1396 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1397 1398 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1399 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1400 1401 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1402 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1403 1404 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1405 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1406 1407 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1408 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1409 1410 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1411 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1412 1413 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1414 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1415 SSE_INLINE_END_2; 1416 pv += 16; 1417 } 1418 /* invert diagonal block */ 1419 w = ba + 16*diag_offset[i]; 1420 if (pivotinblocks) { 1421 PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected)); 1422 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1423 } else { 1424 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1425 } 1426 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1427 } 1428 1429 PetscCall(PetscFree(rtmp)); 1430 1431 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1432 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1433 C->assembled = PETSC_TRUE; 1434 1435 PetscCall(PetscLogFlops(1.333333333333*bs*bs2*b->mbs)); 1436 /* Flop Count from inverting diagonal blocks */ 1437 SSE_SCOPE_END; 1438 PetscFunctionReturn(0); 1439 } 1440 1441 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1442 { 1443 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1444 int i,j,n = a->mbs; 1445 unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; 1446 unsigned int row; 1447 int *ajtmpold,nz,*bi=b->i; 1448 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1449 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1450 MatScalar *ba = b->a,*aa = a->a; 1451 int nonzero=0; 1452 PetscBool pivotinblocks = b->pivotinblocks; 1453 PetscReal shift = info->shiftamount; 1454 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 1455 1456 PetscFunctionBegin; 1457 allowzeropivot = PetscNot(A->erroriffailure); 1458 SSE_SCOPE_BEGIN; 1459 1460 PetscCheck((unsigned long)aa%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1461 PetscCheck((unsigned long)ba%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1462 PetscCall(PetscMalloc1(16*(n+1),&rtmp)); 1463 PetscCheck((unsigned long)rtmp%16 == 0,PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1464 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1465 /* colscale = 4; */ 1466 /* } */ 1467 if ((unsigned long)bj==(unsigned long)aj) { 1468 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1469 } 1470 1471 for (i=0; i<n; i++) { 1472 nz = bi[i+1] - bi[i]; 1473 bjtmp = bj + bi[i]; 1474 /* zero out the 4x4 block accumulators */ 1475 /* zero out one register */ 1476 XOR_PS(XMM7,XMM7); 1477 for (j=0; j<nz; j++) { 1478 x = rtmp+16*((unsigned int)bjtmp[j]); 1479 /* x = rtmp+4*bjtmp[j]; */ 1480 SSE_INLINE_BEGIN_1(x) 1481 /* Copy zero register to memory locations */ 1482 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1483 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1484 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1485 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1486 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1487 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1488 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1489 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1490 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1491 SSE_INLINE_END_1; 1492 } 1493 /* load in initial (unfactored row) */ 1494 nz = ai[i+1] - ai[i]; 1495 ajtmpold = aj + ai[i]; 1496 v = aa + 16*ai[i]; 1497 for (j=0; j<nz; j++) { 1498 x = rtmp+16*ajtmpold[j]; 1499 /* x = rtmp+colscale*ajtmpold[j]; */ 1500 /* Copy v block into x block */ 1501 SSE_INLINE_BEGIN_2(v,x) 1502 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1503 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1504 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1505 1506 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1507 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1508 1509 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1510 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1511 1512 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1513 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1514 1515 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1516 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1517 1518 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1519 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1520 1521 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1522 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1523 1524 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1525 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1526 SSE_INLINE_END_2; 1527 1528 v += 16; 1529 } 1530 /* row = (*bjtmp++)/4; */ 1531 row = (unsigned int)(*bjtmp++); 1532 while (row < i) { 1533 pc = rtmp + 16*row; 1534 SSE_INLINE_BEGIN_1(pc) 1535 /* Load block from lower triangle */ 1536 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1537 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1538 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1539 1540 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1541 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1542 1543 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1544 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1545 1546 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1547 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1548 1549 /* Compare block to zero block */ 1550 1551 SSE_COPY_PS(XMM4,XMM7) 1552 SSE_CMPNEQ_PS(XMM4,XMM0) 1553 1554 SSE_COPY_PS(XMM5,XMM7) 1555 SSE_CMPNEQ_PS(XMM5,XMM1) 1556 1557 SSE_COPY_PS(XMM6,XMM7) 1558 SSE_CMPNEQ_PS(XMM6,XMM2) 1559 1560 SSE_CMPNEQ_PS(XMM7,XMM3) 1561 1562 /* Reduce the comparisons to one SSE register */ 1563 SSE_OR_PS(XMM6,XMM7) 1564 SSE_OR_PS(XMM5,XMM4) 1565 SSE_OR_PS(XMM5,XMM6) 1566 SSE_INLINE_END_1; 1567 1568 /* Reduce the one SSE register to an integer register for branching */ 1569 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1570 MOVEMASK(nonzero,XMM5); 1571 1572 /* If block is nonzero ... */ 1573 if (nonzero) { 1574 pv = ba + 16*diag_offset[row]; 1575 PREFETCH_L1(&pv[16]); 1576 pj = bj + diag_offset[row] + 1; 1577 1578 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1579 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1580 /* but the diagonal was inverted already */ 1581 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1582 1583 SSE_INLINE_BEGIN_2(pv,pc) 1584 /* Column 0, product is accumulated in XMM4 */ 1585 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1586 SSE_SHUFFLE(XMM4,XMM4,0x00) 1587 SSE_MULT_PS(XMM4,XMM0) 1588 1589 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1590 SSE_SHUFFLE(XMM5,XMM5,0x00) 1591 SSE_MULT_PS(XMM5,XMM1) 1592 SSE_ADD_PS(XMM4,XMM5) 1593 1594 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1595 SSE_SHUFFLE(XMM6,XMM6,0x00) 1596 SSE_MULT_PS(XMM6,XMM2) 1597 SSE_ADD_PS(XMM4,XMM6) 1598 1599 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1600 SSE_SHUFFLE(XMM7,XMM7,0x00) 1601 SSE_MULT_PS(XMM7,XMM3) 1602 SSE_ADD_PS(XMM4,XMM7) 1603 1604 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1605 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1606 1607 /* Column 1, product is accumulated in XMM5 */ 1608 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1609 SSE_SHUFFLE(XMM5,XMM5,0x00) 1610 SSE_MULT_PS(XMM5,XMM0) 1611 1612 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1613 SSE_SHUFFLE(XMM6,XMM6,0x00) 1614 SSE_MULT_PS(XMM6,XMM1) 1615 SSE_ADD_PS(XMM5,XMM6) 1616 1617 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1618 SSE_SHUFFLE(XMM7,XMM7,0x00) 1619 SSE_MULT_PS(XMM7,XMM2) 1620 SSE_ADD_PS(XMM5,XMM7) 1621 1622 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1623 SSE_SHUFFLE(XMM6,XMM6,0x00) 1624 SSE_MULT_PS(XMM6,XMM3) 1625 SSE_ADD_PS(XMM5,XMM6) 1626 1627 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1628 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1629 1630 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1631 1632 /* Column 2, product is accumulated in XMM6 */ 1633 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1634 SSE_SHUFFLE(XMM6,XMM6,0x00) 1635 SSE_MULT_PS(XMM6,XMM0) 1636 1637 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1638 SSE_SHUFFLE(XMM7,XMM7,0x00) 1639 SSE_MULT_PS(XMM7,XMM1) 1640 SSE_ADD_PS(XMM6,XMM7) 1641 1642 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1643 SSE_SHUFFLE(XMM7,XMM7,0x00) 1644 SSE_MULT_PS(XMM7,XMM2) 1645 SSE_ADD_PS(XMM6,XMM7) 1646 1647 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1648 SSE_SHUFFLE(XMM7,XMM7,0x00) 1649 SSE_MULT_PS(XMM7,XMM3) 1650 SSE_ADD_PS(XMM6,XMM7) 1651 1652 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1653 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1654 1655 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1656 /* Column 3, product is accumulated in XMM0 */ 1657 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1658 SSE_SHUFFLE(XMM7,XMM7,0x00) 1659 SSE_MULT_PS(XMM0,XMM7) 1660 1661 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1662 SSE_SHUFFLE(XMM7,XMM7,0x00) 1663 SSE_MULT_PS(XMM1,XMM7) 1664 SSE_ADD_PS(XMM0,XMM1) 1665 1666 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1667 SSE_SHUFFLE(XMM1,XMM1,0x00) 1668 SSE_MULT_PS(XMM1,XMM2) 1669 SSE_ADD_PS(XMM0,XMM1) 1670 1671 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1672 SSE_SHUFFLE(XMM7,XMM7,0x00) 1673 SSE_MULT_PS(XMM3,XMM7) 1674 SSE_ADD_PS(XMM0,XMM3) 1675 1676 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1677 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1678 1679 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1680 /* This is code to be maintained and read by humans after all. */ 1681 /* Copy Multiplier Col 3 into XMM3 */ 1682 SSE_COPY_PS(XMM3,XMM0) 1683 /* Copy Multiplier Col 2 into XMM2 */ 1684 SSE_COPY_PS(XMM2,XMM6) 1685 /* Copy Multiplier Col 1 into XMM1 */ 1686 SSE_COPY_PS(XMM1,XMM5) 1687 /* Copy Multiplier Col 0 into XMM0 */ 1688 SSE_COPY_PS(XMM0,XMM4) 1689 SSE_INLINE_END_2; 1690 1691 /* Update the row: */ 1692 nz = bi[row+1] - diag_offset[row] - 1; 1693 pv += 16; 1694 for (j=0; j<nz; j++) { 1695 PREFETCH_L1(&pv[16]); 1696 x = rtmp + 16*((unsigned int)pj[j]); 1697 /* x = rtmp + 4*pj[j]; */ 1698 1699 /* X:=X-M*PV, One column at a time */ 1700 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1701 SSE_INLINE_BEGIN_2(x,pv) 1702 /* Load First Column of X*/ 1703 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1704 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1705 1706 /* Matrix-Vector Product: */ 1707 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1708 SSE_SHUFFLE(XMM5,XMM5,0x00) 1709 SSE_MULT_PS(XMM5,XMM0) 1710 SSE_SUB_PS(XMM4,XMM5) 1711 1712 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1713 SSE_SHUFFLE(XMM6,XMM6,0x00) 1714 SSE_MULT_PS(XMM6,XMM1) 1715 SSE_SUB_PS(XMM4,XMM6) 1716 1717 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1718 SSE_SHUFFLE(XMM7,XMM7,0x00) 1719 SSE_MULT_PS(XMM7,XMM2) 1720 SSE_SUB_PS(XMM4,XMM7) 1721 1722 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1723 SSE_SHUFFLE(XMM5,XMM5,0x00) 1724 SSE_MULT_PS(XMM5,XMM3) 1725 SSE_SUB_PS(XMM4,XMM5) 1726 1727 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1728 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1729 1730 /* Second Column */ 1731 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1732 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1733 1734 /* Matrix-Vector Product: */ 1735 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1736 SSE_SHUFFLE(XMM6,XMM6,0x00) 1737 SSE_MULT_PS(XMM6,XMM0) 1738 SSE_SUB_PS(XMM5,XMM6) 1739 1740 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1741 SSE_SHUFFLE(XMM7,XMM7,0x00) 1742 SSE_MULT_PS(XMM7,XMM1) 1743 SSE_SUB_PS(XMM5,XMM7) 1744 1745 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1746 SSE_SHUFFLE(XMM4,XMM4,0x00) 1747 SSE_MULT_PS(XMM4,XMM2) 1748 SSE_SUB_PS(XMM5,XMM4) 1749 1750 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1751 SSE_SHUFFLE(XMM6,XMM6,0x00) 1752 SSE_MULT_PS(XMM6,XMM3) 1753 SSE_SUB_PS(XMM5,XMM6) 1754 1755 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1756 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1757 1758 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1759 1760 /* Third Column */ 1761 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1762 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1763 1764 /* Matrix-Vector Product: */ 1765 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1766 SSE_SHUFFLE(XMM7,XMM7,0x00) 1767 SSE_MULT_PS(XMM7,XMM0) 1768 SSE_SUB_PS(XMM6,XMM7) 1769 1770 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1771 SSE_SHUFFLE(XMM4,XMM4,0x00) 1772 SSE_MULT_PS(XMM4,XMM1) 1773 SSE_SUB_PS(XMM6,XMM4) 1774 1775 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1776 SSE_SHUFFLE(XMM5,XMM5,0x00) 1777 SSE_MULT_PS(XMM5,XMM2) 1778 SSE_SUB_PS(XMM6,XMM5) 1779 1780 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1781 SSE_SHUFFLE(XMM7,XMM7,0x00) 1782 SSE_MULT_PS(XMM7,XMM3) 1783 SSE_SUB_PS(XMM6,XMM7) 1784 1785 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1786 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1787 1788 /* Fourth Column */ 1789 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1790 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1791 1792 /* Matrix-Vector Product: */ 1793 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1794 SSE_SHUFFLE(XMM5,XMM5,0x00) 1795 SSE_MULT_PS(XMM5,XMM0) 1796 SSE_SUB_PS(XMM4,XMM5) 1797 1798 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1799 SSE_SHUFFLE(XMM6,XMM6,0x00) 1800 SSE_MULT_PS(XMM6,XMM1) 1801 SSE_SUB_PS(XMM4,XMM6) 1802 1803 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1804 SSE_SHUFFLE(XMM7,XMM7,0x00) 1805 SSE_MULT_PS(XMM7,XMM2) 1806 SSE_SUB_PS(XMM4,XMM7) 1807 1808 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1809 SSE_SHUFFLE(XMM5,XMM5,0x00) 1810 SSE_MULT_PS(XMM5,XMM3) 1811 SSE_SUB_PS(XMM4,XMM5) 1812 1813 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1814 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1815 SSE_INLINE_END_2; 1816 pv += 16; 1817 } 1818 PetscCall(PetscLogFlops(128.0*nz+112.0)); 1819 } 1820 row = (unsigned int)(*bjtmp++); 1821 /* row = (*bjtmp++)/4; */ 1822 /* bjtmp++; */ 1823 } 1824 /* finished row so stick it into b->a */ 1825 pv = ba + 16*bi[i]; 1826 pj = bj + bi[i]; 1827 nz = bi[i+1] - bi[i]; 1828 1829 /* Copy x block back into pv block */ 1830 for (j=0; j<nz; j++) { 1831 x = rtmp+16*((unsigned int)pj[j]); 1832 /* x = rtmp+4*pj[j]; */ 1833 1834 SSE_INLINE_BEGIN_2(x,pv) 1835 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1836 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1837 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1838 1839 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1840 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1841 1842 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1843 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1844 1845 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1846 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1847 1848 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1849 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1850 1851 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1852 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1853 1854 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1855 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1856 1857 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1858 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1859 SSE_INLINE_END_2; 1860 pv += 16; 1861 } 1862 /* invert diagonal block */ 1863 w = ba + 16*diag_offset[i]; 1864 if (pivotinblocks) { 1865 PetscCall(PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,,&zeropivotdetected)); 1866 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1867 } else { 1868 PetscCall(PetscKernel_A_gets_inverse_A_4_nopivot(w)); 1869 } 1870 /* PetscCall(PetscKernel_A_gets_inverse_A_4_SSE(w)); */ 1871 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1872 } 1873 1874 PetscCall(PetscFree(rtmp)); 1875 1876 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1877 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1878 C->assembled = PETSC_TRUE; 1879 1880 PetscCall(PetscLogFlops(1.333333333333*bs*bs2*b->mbs)); 1881 /* Flop Count from inverting diagonal blocks */ 1882 SSE_SCOPE_END; 1883 PetscFunctionReturn(0); 1884 } 1885 1886 #endif 1887