1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <petsc/private/kernels/blockinvert.h> 7 8 /* ------------------------------------------------------------*/ 9 /* 10 Version for when blocks are 4 by 4 11 */ 12 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info) 13 { 14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 15 IS isrow = b->row,isicol = b->icol; 16 PetscErrorCode ierr; 17 const PetscInt *r,*ic; 18 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 19 PetscInt *ajtmpold,*ajtmp,nz,row; 20 PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 21 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 22 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 23 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 24 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 25 MatScalar m13,m14,m15,m16; 26 MatScalar *ba = b->a,*aa = a->a; 27 PetscBool pivotinblocks = b->pivotinblocks; 28 PetscReal shift = info->shiftamount; 29 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 30 31 PetscFunctionBegin; 32 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 33 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 34 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 35 allowzeropivot = PetscNot(A->erroriffailure); 36 37 for (i=0; i<n; i++) { 38 nz = bi[i+1] - bi[i]; 39 ajtmp = bj + bi[i]; 40 for (j=0; j<nz; j++) { 41 x = rtmp+16*ajtmp[j]; 42 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 43 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 44 } 45 /* load in initial (unfactored row) */ 46 idx = r[i]; 47 nz = ai[idx+1] - ai[idx]; 48 ajtmpold = aj + ai[idx]; 49 v = aa + 16*ai[idx]; 50 for (j=0; j<nz; j++) { 51 x = rtmp+16*ic[ajtmpold[j]]; 52 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 53 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 54 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 55 x[14] = v[14]; x[15] = v[15]; 56 v += 16; 57 } 58 row = *ajtmp++; 59 while (row < i) { 60 pc = rtmp + 16*row; 61 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 62 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 63 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 64 p15 = pc[14]; p16 = pc[15]; 65 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 66 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 67 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 68 || p16 != 0.0) { 69 pv = ba + 16*diag_offset[row]; 70 pj = bj + diag_offset[row] + 1; 71 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 72 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 73 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 74 x15 = pv[14]; x16 = pv[15]; 75 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 76 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 77 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 78 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 79 80 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 81 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 82 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 83 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 84 85 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 86 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 87 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 88 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 89 90 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 91 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 
+ p14*x16; 92 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 93 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 94 95 nz = bi[row+1] - diag_offset[row] - 1; 96 pv += 16; 97 for (j=0; j<nz; j++) { 98 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 99 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 100 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 101 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 102 x = rtmp + 16*pj[j]; 103 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 104 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 105 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 106 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 107 108 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 109 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 110 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 111 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 112 113 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 114 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 115 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 116 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 117 118 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 119 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 120 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 121 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 122 123 pv += 16; 124 } 125 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 126 } 127 row = *ajtmp++; 128 } 129 /* finished row so stick it into b->a */ 130 pv = ba + 16*bi[i]; 131 pj = bj + bi[i]; 132 nz = bi[i+1] - bi[i]; 133 for (j=0; j<nz; j++) { 134 x = rtmp+16*pj[j]; 135 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 136 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 137 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 138 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 139 pv += 16; 140 } 141 /* invert diagonal block */ 142 w = ba + 16*diag_offset[i]; 143 if (pivotinblocks) { 144 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 145 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 146 } else { 147 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 148 } 149 } 150 151 ierr = PetscFree(rtmp);CHKERRQ(ierr); 152 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 153 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 154 155 C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 156 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 157 C->assembled = PETSC_TRUE; 158 159 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 160 PetscFunctionReturn(0); 161 } 162 163 /* MatLUFactorNumeric_SeqBAIJ_4 - 164 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 165 PetscKernel_A_gets_A_times_B() 166 PetscKernel_A_gets_A_minus_B_times_C() 167 PetscKernel_A_gets_inverse_A() 168 */ 169 170 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info) 171 { 172 Mat C = B; 173 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 174 IS isrow = b->row,isicol = b->icol; 175 PetscErrorCode ierr; 176 const PetscInt *r,*ic; 177 PetscInt i,j,k,nz,nzL,row; 178 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 179 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 180 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 181 PetscInt flg; 182 PetscReal shift; 183 PetscBool allowzeropivot,zeropivotdetected; 184 185 PetscFunctionBegin; 186 allowzeropivot = PetscNot(A->erroriffailure); 187 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 188 ierr = 
ISGetIndices(isicol,&ic);CHKERRQ(ierr); 189 190 if (info->shifttype == MAT_SHIFT_NONE) { 191 shift = 0; 192 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 193 shift = info->shiftamount; 194 } 195 196 /* generate work space needed by the factorization */ 197 ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); 198 ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); 199 200 for (i=0; i<n; i++) { 201 /* zero rtmp */ 202 /* L part */ 203 nz = bi[i+1] - bi[i]; 204 bjtmp = bj + bi[i]; 205 for (j=0; j<nz; j++) { 206 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 207 } 208 209 /* U part */ 210 nz = bdiag[i]-bdiag[i+1]; 211 bjtmp = bj + bdiag[i+1]+1; 212 for (j=0; j<nz; j++) { 213 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 214 } 215 216 /* load in initial (unfactored row) */ 217 nz = ai[r[i]+1] - ai[r[i]]; 218 ajtmp = aj + ai[r[i]]; 219 v = aa + bs2*ai[r[i]]; 220 for (j=0; j<nz; j++) { 221 ierr = PetscArraycpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2);CHKERRQ(ierr); 222 } 223 224 /* elimination */ 225 bjtmp = bj + bi[i]; 226 nzL = bi[i+1] - bi[i]; 227 for (k=0; k < nzL; k++) { 228 row = bjtmp[k]; 229 pc = rtmp + bs2*row; 230 for (flg=0,j=0; j<bs2; j++) { 231 if (pc[j]!=0.0) { 232 flg = 1; 233 break; 234 } 235 } 236 if (flg) { 237 pv = b->a + bs2*bdiag[row]; 238 /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 239 ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr); 240 241 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 242 pv = b->a + bs2*(bdiag[row+1]+1); 243 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 244 for (j=0; j<nz; j++) { 245 /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 246 /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 247 v = rtmp + bs2*pj[j]; 248 ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr); 249 pv += bs2; 250 } 251 ierr = PetscLogFlops(128.0*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 252 } 253 } 254 255 /* finished row so stick it into b->a */ 256 /* L part */ 257 pv = b->a + bs2*bi[i]; 258 pj = b->j + bi[i]; 259 nz = bi[i+1] - bi[i]; 260 for (j=0; j<nz; j++) { 261 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 262 } 263 264 /* Mark diagonal and invert diagonal for simpler triangular solves */ 265 pv = b->a + bs2*bdiag[i]; 266 pj = b->j + bdiag[i]; 267 ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr); 268 ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 269 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 270 271 /* U part */ 272 pv = b->a + bs2*(bdiag[i+1]+1); 273 pj = b->j + bdiag[i+1]+1; 274 nz = bdiag[i] - bdiag[i+1] - 1; 275 for (j=0; j<nz; j++) { 276 ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr); 277 } 278 } 279 280 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 281 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 282 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 283 284 C->ops->solve = MatSolve_SeqBAIJ_4; 285 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 286 C->assembled = PETSC_TRUE; 287 288 ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 289 PetscFunctionReturn(0); 290 } 291 292 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info) 293 { 294 /* 295 Default Version for when blocks are 4 by 4 Using natural ordering 296 */ 297 Mat_SeqBAIJ *a = 
(Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 298 PetscErrorCode ierr; 299 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 300 PetscInt *ajtmpold,*ajtmp,nz,row; 301 PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 302 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 303 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 304 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 305 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 306 MatScalar m13,m14,m15,m16; 307 MatScalar *ba = b->a,*aa = a->a; 308 PetscBool pivotinblocks = b->pivotinblocks; 309 PetscReal shift = info->shiftamount; 310 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 311 312 PetscFunctionBegin; 313 allowzeropivot = PetscNot(A->erroriffailure); 314 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 315 316 for (i=0; i<n; i++) { 317 nz = bi[i+1] - bi[i]; 318 ajtmp = bj + bi[i]; 319 for (j=0; j<nz; j++) { 320 x = rtmp+16*ajtmp[j]; 321 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 322 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 323 } 324 /* load in initial (unfactored row) */ 325 nz = ai[i+1] - ai[i]; 326 ajtmpold = aj + ai[i]; 327 v = aa + 16*ai[i]; 328 for (j=0; j<nz; j++) { 329 x = rtmp+16*ajtmpold[j]; 330 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 331 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 332 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 333 x[14] = v[14]; x[15] = v[15]; 334 v += 16; 335 } 336 row = *ajtmp++; 337 while (row < i) { 338 pc = rtmp + 16*row; 339 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 340 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 341 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 342 p15 = pc[14]; p16 = pc[15]; 343 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 344 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 345 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 346 || p16 != 0.0) { 347 pv = ba + 16*diag_offset[row]; 348 pj = bj + diag_offset[row] + 1; 349 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 350 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 351 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 352 x15 = pv[14]; x16 = pv[15]; 353 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 354 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 355 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 356 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 357 358 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 359 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 360 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 361 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 362 363 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 364 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 365 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 366 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 367 368 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 369 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 370 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 371 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 372 nz = bi[row+1] - diag_offset[row] - 1; 373 pv += 16; 374 for (j=0; j<nz; j++) { 375 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 376 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 377 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 378 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 379 x = rtmp + 16*pj[j]; 380 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 381 
x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 382 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 383 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 384 385 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 386 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 387 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 388 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 389 390 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 391 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 392 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 393 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 394 395 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 396 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 397 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 398 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 399 400 pv += 16; 401 } 402 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 403 } 404 row = *ajtmp++; 405 } 406 /* finished row so stick it into b->a */ 407 pv = ba + 16*bi[i]; 408 pj = bj + bi[i]; 409 nz = bi[i+1] - bi[i]; 410 for (j=0; j<nz; j++) { 411 x = rtmp+16*pj[j]; 412 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 413 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 414 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 415 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 416 pv += 16; 417 } 418 /* invert diagonal block */ 419 w = ba + 16*diag_offset[i]; 420 if (pivotinblocks) { 421 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 422 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 423 } else { 424 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 425 } 426 } 427 428 ierr = PetscFree(rtmp);CHKERRQ(ierr); 429 430 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 431 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 432 C->assembled = PETSC_TRUE; 433 434 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 435 PetscFunctionReturn(0); 436 } 437 438 /* 439 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 440 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 441 */ 442 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 443 { 444 Mat C =B; 445 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 446 PetscErrorCode ierr; 447 PetscInt i,j,k,nz,nzL,row; 448 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 449 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 450 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 451 PetscInt flg; 452 PetscReal shift; 453 PetscBool allowzeropivot,zeropivotdetected; 454 455 PetscFunctionBegin; 456 allowzeropivot = PetscNot(A->erroriffailure); 457 458 /* generate work space needed by the factorization */ 459 ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr); 460 ierr = PetscArrayzero(rtmp,bs2*n);CHKERRQ(ierr); 461 462 if (info->shifttype == MAT_SHIFT_NONE) { 463 shift = 0; 464 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 465 shift = info->shiftamount; 466 } 467 468 for (i=0; i<n; i++) { 469 /* zero rtmp */ 470 /* L part */ 471 nz = bi[i+1] - bi[i]; 472 bjtmp = bj + bi[i]; 473 for (j=0; j<nz; j++) { 474 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 475 } 476 477 /* U part */ 478 nz = bdiag[i] - bdiag[i+1]; 479 bjtmp = bj + bdiag[i+1]+1; 480 for (j=0; j<nz; j++) { 481 ierr = PetscArrayzero(rtmp+bs2*bjtmp[j],bs2);CHKERRQ(ierr); 482 } 483 484 /* load in initial (unfactored row) */ 485 nz = ai[i+1] - ai[i]; 486 ajtmp = aj + ai[i]; 487 
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscArraycpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2);CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscArraycpy(pv,rtmp+bs2*pj[0],bs2);CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscArraycpy(pv+bs2*j,rtmp+bs2*pj[j],bs2);CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.
SSE will not work."); 584 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 585 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 586 /* if ((unsigned long)bj==(unsigned long)aj) { */ 587 /* colscale = 4; */ 588 /* } */ 589 for (i=0; i<n; i++) { 590 nz = bi[i+1] - bi[i]; 591 bjtmp = bj + bi[i]; 592 /* zero out the 4x4 block accumulators */ 593 /* zero out one register */ 594 XOR_PS(XMM7,XMM7); 595 for (j=0; j<nz; j++) { 596 x = rtmp+16*bjtmp[j]; 597 /* x = rtmp+4*bjtmp[j]; */ 598 SSE_INLINE_BEGIN_1(x) 599 /* Copy zero register to memory locations */ 600 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 601 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 602 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 603 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 604 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 605 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 606 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 607 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 608 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 609 SSE_INLINE_END_1; 610 } 611 /* load in initial (unfactored row) */ 612 nz = ai[i+1] - ai[i]; 613 ajtmpold = aj + ai[i]; 614 v = aa + 16*ai[i]; 615 for (j=0; j<nz; j++) { 616 x = rtmp+16*ajtmpold[j]; 617 /* x = rtmp+colscale*ajtmpold[j]; */ 618 /* Copy v block into x block */ 619 SSE_INLINE_BEGIN_2(v,x) 620 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 621 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 622 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 623 624 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 625 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 626 627 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 628 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 629 630 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 631 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 632 633 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 634 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 635 636 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 637 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 638 639 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 640 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 641 642 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 643 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 644 SSE_INLINE_END_2; 645 646 v += 16; 647 } 648 /* row = (*bjtmp++)/4; */ 649 row = *bjtmp++; 650 while (row < i) { 651 pc = rtmp + 16*row; 652 SSE_INLINE_BEGIN_1(pc) 653 /* Load block from lower triangle */ 654 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 655 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 656 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 657 658 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 659 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 660 661 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 662 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 663 664 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 665 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 666 667 /* Compare block to zero block */ 668 669 SSE_COPY_PS(XMM4,XMM7) 670 SSE_CMPNEQ_PS(XMM4,XMM0) 671 672 SSE_COPY_PS(XMM5,XMM7) 673 SSE_CMPNEQ_PS(XMM5,XMM1) 674 675 SSE_COPY_PS(XMM6,XMM7) 676 SSE_CMPNEQ_PS(XMM6,XMM2) 677 678 SSE_CMPNEQ_PS(XMM7,XMM3) 679 680 /* Reduce the comparisons to one SSE register */ 681 SSE_OR_PS(XMM6,XMM7) 682 SSE_OR_PS(XMM5,XMM4) 683 SSE_OR_PS(XMM5,XMM6) 684 SSE_INLINE_END_1; 685 686 /* Reduce the one SSE register to an integer register for branching */ 687 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 688 MOVEMASK(nonzero,XMM5); 689 690 /* If block is nonzero ... 
*/ 691 if (nonzero) { 692 pv = ba + 16*diag_offset[row]; 693 PREFETCH_L1(&pv[16]); 694 pj = bj + diag_offset[row] + 1; 695 696 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 697 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 698 /* but the diagonal was inverted already */ 699 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 700 701 SSE_INLINE_BEGIN_2(pv,pc) 702 /* Column 0, product is accumulated in XMM4 */ 703 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 704 SSE_SHUFFLE(XMM4,XMM4,0x00) 705 SSE_MULT_PS(XMM4,XMM0) 706 707 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 708 SSE_SHUFFLE(XMM5,XMM5,0x00) 709 SSE_MULT_PS(XMM5,XMM1) 710 SSE_ADD_PS(XMM4,XMM5) 711 712 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 713 SSE_SHUFFLE(XMM6,XMM6,0x00) 714 SSE_MULT_PS(XMM6,XMM2) 715 SSE_ADD_PS(XMM4,XMM6) 716 717 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 718 SSE_SHUFFLE(XMM7,XMM7,0x00) 719 SSE_MULT_PS(XMM7,XMM3) 720 SSE_ADD_PS(XMM4,XMM7) 721 722 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 723 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 724 725 /* Column 1, product is accumulated in XMM5 */ 726 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 727 SSE_SHUFFLE(XMM5,XMM5,0x00) 728 SSE_MULT_PS(XMM5,XMM0) 729 730 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 731 SSE_SHUFFLE(XMM6,XMM6,0x00) 732 SSE_MULT_PS(XMM6,XMM1) 733 SSE_ADD_PS(XMM5,XMM6) 734 735 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 736 SSE_SHUFFLE(XMM7,XMM7,0x00) 737 SSE_MULT_PS(XMM7,XMM2) 738 SSE_ADD_PS(XMM5,XMM7) 739 740 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 741 SSE_SHUFFLE(XMM6,XMM6,0x00) 742 SSE_MULT_PS(XMM6,XMM3) 743 SSE_ADD_PS(XMM5,XMM6) 744 745 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 746 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 747 748 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 749 750 /* Column 2, product is accumulated in XMM6 */ 751 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 752 SSE_SHUFFLE(XMM6,XMM6,0x00) 753 SSE_MULT_PS(XMM6,XMM0) 754 755 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 756 SSE_SHUFFLE(XMM7,XMM7,0x00) 757 SSE_MULT_PS(XMM7,XMM1) 758 SSE_ADD_PS(XMM6,XMM7) 759 760 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 761 SSE_SHUFFLE(XMM7,XMM7,0x00) 762 SSE_MULT_PS(XMM7,XMM2) 763 SSE_ADD_PS(XMM6,XMM7) 764 765 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 766 SSE_SHUFFLE(XMM7,XMM7,0x00) 767 SSE_MULT_PS(XMM7,XMM3) 768 SSE_ADD_PS(XMM6,XMM7) 769 770 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 771 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 772 773 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 774 /* Column 3, product is accumulated in XMM0 */ 775 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 776 SSE_SHUFFLE(XMM7,XMM7,0x00) 777 SSE_MULT_PS(XMM0,XMM7) 778 779 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 780 SSE_SHUFFLE(XMM7,XMM7,0x00) 781 SSE_MULT_PS(XMM1,XMM7) 782 SSE_ADD_PS(XMM0,XMM1) 783 784 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 785 SSE_SHUFFLE(XMM1,XMM1,0x00) 786 SSE_MULT_PS(XMM1,XMM2) 787 SSE_ADD_PS(XMM0,XMM1) 788 789 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 790 SSE_SHUFFLE(XMM7,XMM7,0x00) 791 SSE_MULT_PS(XMM3,XMM7) 792 SSE_ADD_PS(XMM0,XMM3) 793 794 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 795 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 796 797 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 798 /* This is code to be maintained and read by humans afterall. 
*/ 799 /* Copy Multiplier Col 3 into XMM3 */ 800 SSE_COPY_PS(XMM3,XMM0) 801 /* Copy Multiplier Col 2 into XMM2 */ 802 SSE_COPY_PS(XMM2,XMM6) 803 /* Copy Multiplier Col 1 into XMM1 */ 804 SSE_COPY_PS(XMM1,XMM5) 805 /* Copy Multiplier Col 0 into XMM0 */ 806 SSE_COPY_PS(XMM0,XMM4) 807 SSE_INLINE_END_2; 808 809 /* Update the row: */ 810 nz = bi[row+1] - diag_offset[row] - 1; 811 pv += 16; 812 for (j=0; j<nz; j++) { 813 PREFETCH_L1(&pv[16]); 814 x = rtmp + 16*pj[j]; 815 /* x = rtmp + 4*pj[j]; */ 816 817 /* X:=X-M*PV, One column at a time */ 818 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 819 SSE_INLINE_BEGIN_2(x,pv) 820 /* Load First Column of X*/ 821 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 822 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 823 824 /* Matrix-Vector Product: */ 825 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 826 SSE_SHUFFLE(XMM5,XMM5,0x00) 827 SSE_MULT_PS(XMM5,XMM0) 828 SSE_SUB_PS(XMM4,XMM5) 829 830 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 831 SSE_SHUFFLE(XMM6,XMM6,0x00) 832 SSE_MULT_PS(XMM6,XMM1) 833 SSE_SUB_PS(XMM4,XMM6) 834 835 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 836 SSE_SHUFFLE(XMM7,XMM7,0x00) 837 SSE_MULT_PS(XMM7,XMM2) 838 SSE_SUB_PS(XMM4,XMM7) 839 840 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 841 SSE_SHUFFLE(XMM5,XMM5,0x00) 842 SSE_MULT_PS(XMM5,XMM3) 843 SSE_SUB_PS(XMM4,XMM5) 844 845 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 846 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 847 848 /* Second Column */ 849 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 850 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 851 852 /* Matrix-Vector Product: */ 853 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 854 SSE_SHUFFLE(XMM6,XMM6,0x00) 855 SSE_MULT_PS(XMM6,XMM0) 856 SSE_SUB_PS(XMM5,XMM6) 857 858 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 859 SSE_SHUFFLE(XMM7,XMM7,0x00) 860 SSE_MULT_PS(XMM7,XMM1) 861 SSE_SUB_PS(XMM5,XMM7) 862 863 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 864 SSE_SHUFFLE(XMM4,XMM4,0x00) 865 SSE_MULT_PS(XMM4,XMM2) 866 SSE_SUB_PS(XMM5,XMM4) 867 868 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 869 SSE_SHUFFLE(XMM6,XMM6,0x00) 870 SSE_MULT_PS(XMM6,XMM3) 871 SSE_SUB_PS(XMM5,XMM6) 872 873 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 874 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 875 876 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 877 878 /* Third Column */ 879 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 880 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 881 882 /* Matrix-Vector Product: */ 883 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 884 SSE_SHUFFLE(XMM7,XMM7,0x00) 885 SSE_MULT_PS(XMM7,XMM0) 886 SSE_SUB_PS(XMM6,XMM7) 887 888 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 889 SSE_SHUFFLE(XMM4,XMM4,0x00) 890 SSE_MULT_PS(XMM4,XMM1) 891 SSE_SUB_PS(XMM6,XMM4) 892 893 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 894 SSE_SHUFFLE(XMM5,XMM5,0x00) 895 SSE_MULT_PS(XMM5,XMM2) 896 SSE_SUB_PS(XMM6,XMM5) 897 898 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 899 SSE_SHUFFLE(XMM7,XMM7,0x00) 900 SSE_MULT_PS(XMM7,XMM3) 901 SSE_SUB_PS(XMM6,XMM7) 902 903 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 904 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 905 906 /* Fourth Column */ 907 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 908 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 909 910 /* Matrix-Vector Product: */ 911 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 912 SSE_SHUFFLE(XMM5,XMM5,0x00) 913 SSE_MULT_PS(XMM5,XMM0) 914 SSE_SUB_PS(XMM4,XMM5) 915 916 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 917 SSE_SHUFFLE(XMM6,XMM6,0x00) 918 SSE_MULT_PS(XMM6,XMM1) 919 SSE_SUB_PS(XMM4,XMM6) 920 921 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 922 SSE_SHUFFLE(XMM7,XMM7,0x00) 923 SSE_MULT_PS(XMM7,XMM2) 924 SSE_SUB_PS(XMM4,XMM7) 925 926 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 927 SSE_SHUFFLE(XMM5,XMM5,0x00) 928 
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
      /* row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
{
  Mat            A = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = 0.0; /* no MatFactorInfo is passed to this in-place variant, so no shift is applied */
  PetscBool      allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned.
SSE will not work."); 1027 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 1028 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1029 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1030 /* colscale = 4; */ 1031 /* } */ 1032 1033 for (i=0; i<n; i++) { 1034 nz = bi[i+1] - bi[i]; 1035 bjtmp = bj + bi[i]; 1036 /* zero out the 4x4 block accumulators */ 1037 /* zero out one register */ 1038 XOR_PS(XMM7,XMM7); 1039 for (j=0; j<nz; j++) { 1040 x = rtmp+16*((unsigned int)bjtmp[j]); 1041 /* x = rtmp+4*bjtmp[j]; */ 1042 SSE_INLINE_BEGIN_1(x) 1043 /* Copy zero register to memory locations */ 1044 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1045 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1046 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1047 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1048 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1049 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1050 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1051 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1052 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1053 SSE_INLINE_END_1; 1054 } 1055 /* load in initial (unfactored row) */ 1056 nz = ai[i+1] - ai[i]; 1057 ajtmp = aj + ai[i]; 1058 v = aa + 16*ai[i]; 1059 for (j=0; j<nz; j++) { 1060 x = rtmp+16*((unsigned int)ajtmp[j]); 1061 /* x = rtmp+colscale*ajtmp[j]; */ 1062 /* Copy v block into x block */ 1063 SSE_INLINE_BEGIN_2(v,x) 1064 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1065 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1066 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1067 1068 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1069 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1070 1071 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1072 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1073 1074 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1075 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1076 1077 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1078 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1079 1080 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1081 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1082 1083 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1084 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1085 1086 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1087 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1088 SSE_INLINE_END_2; 1089 1090 v += 16; 1091 } 1092 /* row = (*bjtmp++)/4; */ 1093 row = (unsigned int)(*bjtmp++); 1094 while (row < i) { 1095 pc = rtmp + 16*row; 1096 SSE_INLINE_BEGIN_1(pc) 1097 /* Load block from lower triangle */ 1098 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1099 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1100 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1101 1102 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1103 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1104 1105 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1106 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1107 1108 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1109 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1110 1111 /* Compare block to zero block */ 1112 1113 SSE_COPY_PS(XMM4,XMM7) 1114 SSE_CMPNEQ_PS(XMM4,XMM0) 1115 1116 SSE_COPY_PS(XMM5,XMM7) 1117 SSE_CMPNEQ_PS(XMM5,XMM1) 1118 1119 SSE_COPY_PS(XMM6,XMM7) 1120 SSE_CMPNEQ_PS(XMM6,XMM2) 1121 1122 SSE_CMPNEQ_PS(XMM7,XMM3) 1123 1124 /* Reduce the comparisons to one SSE register */ 1125 SSE_OR_PS(XMM6,XMM7) 1126 SSE_OR_PS(XMM5,XMM4) 1127 SSE_OR_PS(XMM5,XMM6) 1128 SSE_INLINE_END_1; 1129 1130 /* Reduce the one SSE register to an integer register for branching */ 1131 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1132 
MOVEMASK(nonzero,XMM5); 1133 1134 /* If block is nonzero ... */ 1135 if (nonzero) { 1136 pv = ba + 16*diag_offset[row]; 1137 PREFETCH_L1(&pv[16]); 1138 pj = bj + diag_offset[row] + 1; 1139 1140 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1141 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1142 /* but the diagonal was inverted already */ 1143 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1144 1145 SSE_INLINE_BEGIN_2(pv,pc) 1146 /* Column 0, product is accumulated in XMM4 */ 1147 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1148 SSE_SHUFFLE(XMM4,XMM4,0x00) 1149 SSE_MULT_PS(XMM4,XMM0) 1150 1151 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1152 SSE_SHUFFLE(XMM5,XMM5,0x00) 1153 SSE_MULT_PS(XMM5,XMM1) 1154 SSE_ADD_PS(XMM4,XMM5) 1155 1156 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1157 SSE_SHUFFLE(XMM6,XMM6,0x00) 1158 SSE_MULT_PS(XMM6,XMM2) 1159 SSE_ADD_PS(XMM4,XMM6) 1160 1161 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1162 SSE_SHUFFLE(XMM7,XMM7,0x00) 1163 SSE_MULT_PS(XMM7,XMM3) 1164 SSE_ADD_PS(XMM4,XMM7) 1165 1166 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1167 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1168 1169 /* Column 1, product is accumulated in XMM5 */ 1170 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1171 SSE_SHUFFLE(XMM5,XMM5,0x00) 1172 SSE_MULT_PS(XMM5,XMM0) 1173 1174 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1175 SSE_SHUFFLE(XMM6,XMM6,0x00) 1176 SSE_MULT_PS(XMM6,XMM1) 1177 SSE_ADD_PS(XMM5,XMM6) 1178 1179 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1180 SSE_SHUFFLE(XMM7,XMM7,0x00) 1181 SSE_MULT_PS(XMM7,XMM2) 1182 SSE_ADD_PS(XMM5,XMM7) 1183 1184 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1185 SSE_SHUFFLE(XMM6,XMM6,0x00) 1186 SSE_MULT_PS(XMM6,XMM3) 1187 SSE_ADD_PS(XMM5,XMM6) 1188 1189 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1190 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1191 1192 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1193 1194 /* Column 2, product is accumulated in XMM6 */ 1195 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1196 SSE_SHUFFLE(XMM6,XMM6,0x00) 1197 SSE_MULT_PS(XMM6,XMM0) 1198 1199 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1200 SSE_SHUFFLE(XMM7,XMM7,0x00) 1201 SSE_MULT_PS(XMM7,XMM1) 1202 SSE_ADD_PS(XMM6,XMM7) 1203 1204 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1205 SSE_SHUFFLE(XMM7,XMM7,0x00) 1206 SSE_MULT_PS(XMM7,XMM2) 1207 SSE_ADD_PS(XMM6,XMM7) 1208 1209 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1210 SSE_SHUFFLE(XMM7,XMM7,0x00) 1211 SSE_MULT_PS(XMM7,XMM3) 1212 SSE_ADD_PS(XMM6,XMM7) 1213 1214 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1215 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1216 1217 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1218 /* Column 3, product is accumulated in XMM0 */ 1219 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1220 SSE_SHUFFLE(XMM7,XMM7,0x00) 1221 SSE_MULT_PS(XMM0,XMM7) 1222 1223 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1224 SSE_SHUFFLE(XMM7,XMM7,0x00) 1225 SSE_MULT_PS(XMM1,XMM7) 1226 SSE_ADD_PS(XMM0,XMM1) 1227 1228 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1229 SSE_SHUFFLE(XMM1,XMM1,0x00) 1230 SSE_MULT_PS(XMM1,XMM2) 1231 SSE_ADD_PS(XMM0,XMM1) 1232 1233 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1234 SSE_SHUFFLE(XMM7,XMM7,0x00) 1235 SSE_MULT_PS(XMM3,XMM7) 1236 SSE_ADD_PS(XMM0,XMM3) 1237 1238 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1239 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1240 1241 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1242 /* This is code to be maintained and read by humans afterall. 
*/ 1243 /* Copy Multiplier Col 3 into XMM3 */ 1244 SSE_COPY_PS(XMM3,XMM0) 1245 /* Copy Multiplier Col 2 into XMM2 */ 1246 SSE_COPY_PS(XMM2,XMM6) 1247 /* Copy Multiplier Col 1 into XMM1 */ 1248 SSE_COPY_PS(XMM1,XMM5) 1249 /* Copy Multiplier Col 0 into XMM0 */ 1250 SSE_COPY_PS(XMM0,XMM4) 1251 SSE_INLINE_END_2; 1252 1253 /* Update the row: */ 1254 nz = bi[row+1] - diag_offset[row] - 1; 1255 pv += 16; 1256 for (j=0; j<nz; j++) { 1257 PREFETCH_L1(&pv[16]); 1258 x = rtmp + 16*((unsigned int)pj[j]); 1259 /* x = rtmp + 4*pj[j]; */ 1260 1261 /* X:=X-M*PV, One column at a time */ 1262 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1263 SSE_INLINE_BEGIN_2(x,pv) 1264 /* Load First Column of X*/ 1265 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1266 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1267 1268 /* Matrix-Vector Product: */ 1269 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1270 SSE_SHUFFLE(XMM5,XMM5,0x00) 1271 SSE_MULT_PS(XMM5,XMM0) 1272 SSE_SUB_PS(XMM4,XMM5) 1273 1274 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1275 SSE_SHUFFLE(XMM6,XMM6,0x00) 1276 SSE_MULT_PS(XMM6,XMM1) 1277 SSE_SUB_PS(XMM4,XMM6) 1278 1279 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1280 SSE_SHUFFLE(XMM7,XMM7,0x00) 1281 SSE_MULT_PS(XMM7,XMM2) 1282 SSE_SUB_PS(XMM4,XMM7) 1283 1284 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1285 SSE_SHUFFLE(XMM5,XMM5,0x00) 1286 SSE_MULT_PS(XMM5,XMM3) 1287 SSE_SUB_PS(XMM4,XMM5) 1288 1289 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1290 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1291 1292 /* Second Column */ 1293 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1294 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1295 1296 /* Matrix-Vector Product: */ 1297 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1298 SSE_SHUFFLE(XMM6,XMM6,0x00) 1299 SSE_MULT_PS(XMM6,XMM0) 1300 SSE_SUB_PS(XMM5,XMM6) 1301 1302 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1303 SSE_SHUFFLE(XMM7,XMM7,0x00) 1304 SSE_MULT_PS(XMM7,XMM1) 1305 SSE_SUB_PS(XMM5,XMM7) 1306 1307 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1308 SSE_SHUFFLE(XMM4,XMM4,0x00) 1309 SSE_MULT_PS(XMM4,XMM2) 1310 SSE_SUB_PS(XMM5,XMM4) 1311 1312 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1313 SSE_SHUFFLE(XMM6,XMM6,0x00) 1314 SSE_MULT_PS(XMM6,XMM3) 1315 SSE_SUB_PS(XMM5,XMM6) 1316 1317 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1318 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1319 1320 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1321 1322 /* Third Column */ 1323 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1324 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1325 1326 /* Matrix-Vector Product: */ 1327 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1328 SSE_SHUFFLE(XMM7,XMM7,0x00) 1329 SSE_MULT_PS(XMM7,XMM0) 1330 SSE_SUB_PS(XMM6,XMM7) 1331 1332 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1333 SSE_SHUFFLE(XMM4,XMM4,0x00) 1334 SSE_MULT_PS(XMM4,XMM1) 1335 SSE_SUB_PS(XMM6,XMM4) 1336 1337 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1338 SSE_SHUFFLE(XMM5,XMM5,0x00) 1339 SSE_MULT_PS(XMM5,XMM2) 1340 SSE_SUB_PS(XMM6,XMM5) 1341 1342 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1343 SSE_SHUFFLE(XMM7,XMM7,0x00) 1344 SSE_MULT_PS(XMM7,XMM3) 1345 SSE_SUB_PS(XMM6,XMM7) 1346 1347 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1348 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1349 1350 /* Fourth Column */ 1351 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1352 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1353 1354 /* Matrix-Vector Product: */ 1355 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1356 SSE_SHUFFLE(XMM5,XMM5,0x00) 1357 SSE_MULT_PS(XMM5,XMM0) 1358 SSE_SUB_PS(XMM4,XMM5) 1359 1360 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1361 SSE_SHUFFLE(XMM6,XMM6,0x00) 1362 SSE_MULT_PS(XMM6,XMM1) 1363 SSE_SUB_PS(XMM4,XMM6) 1364 1365 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1366 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1367 SSE_MULT_PS(XMM7,XMM2) 1368 SSE_SUB_PS(XMM4,XMM7) 1369 1370 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1371 SSE_SHUFFLE(XMM5,XMM5,0x00) 1372 SSE_MULT_PS(XMM5,XMM3) 1373 SSE_SUB_PS(XMM4,XMM5) 1374 1375 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1376 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1377 SSE_INLINE_END_2; 1378 pv += 16; 1379 } 1380 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1381 } 1382 row = (unsigned int)(*bjtmp++); 1383 /* row = (*bjtmp++)/4; */ 1384 /* bjtmp++; */ 1385 } 1386 /* finished row so stick it into b->a */ 1387 pv = ba + 16*bi[i]; 1388 pj = bj + bi[i]; 1389 nz = bi[i+1] - bi[i]; 1390 1391 /* Copy x block back into pv block */ 1392 for (j=0; j<nz; j++) { 1393 x = rtmp+16*((unsigned int)pj[j]); 1394 /* x = rtmp+4*pj[j]; */ 1395 1396 SSE_INLINE_BEGIN_2(x,pv) 1397 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1398 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1399 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1400 1401 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1402 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1403 1404 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1405 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1406 1407 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1408 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1409 1410 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1411 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1412 1413 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1414 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1415 1416 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1417 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1418 1419 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1420 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1421 SSE_INLINE_END_2; 1422 pv += 16; 1423 } 1424 /* invert diagonal block */ 1425 w = ba + 16*diag_offset[i]; 1426 if (pivotinblocks) { 1427 ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 1428 if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 1429 } else { 1430 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1431 } 1432 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1433 } 1434 1435 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1436 1437 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1438 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1439 C->assembled = PETSC_TRUE; 1440 1441 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1442 /* Flop Count from inverting diagonal blocks */ 1443 SSE_SCOPE_END; 1444 PetscFunctionReturn(0); 1445 } 1446 1447 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1448 { 1449 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1450 PetscErrorCode ierr; 1451 int i,j,n = a->mbs; 1452 unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj; 1453 unsigned int row; 1454 int *ajtmpold,nz,*bi=b->i; 1455 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1456 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1457 MatScalar *ba = b->a,*aa = a->a; 1458 int nonzero=0; 1459 PetscBool pivotinblocks = b->pivotinblocks; 1460 PetscReal shift = info->shiftamount; 1461 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 1462 1463 PetscFunctionBegin; 1464 allowzeropivot = PetscNot(A->erroriffailure); 1465 SSE_SCOPE_BEGIN; 1466 1467 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. 
SSE will not work."); 1468 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work."); 1469 ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr); 1470 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1471 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1472 /* colscale = 4; */ 1473 /* } */ 1474 if ((unsigned long)bj==(unsigned long)aj) { 1475 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1476 } 1477 1478 for (i=0; i<n; i++) { 1479 nz = bi[i+1] - bi[i]; 1480 bjtmp = bj + bi[i]; 1481 /* zero out the 4x4 block accumulators */ 1482 /* zero out one register */ 1483 XOR_PS(XMM7,XMM7); 1484 for (j=0; j<nz; j++) { 1485 x = rtmp+16*((unsigned int)bjtmp[j]); 1486 /* x = rtmp+4*bjtmp[j]; */ 1487 SSE_INLINE_BEGIN_1(x) 1488 /* Copy zero register to memory locations */ 1489 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1490 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1491 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1492 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1493 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1494 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1495 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1496 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1497 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1498 SSE_INLINE_END_1; 1499 } 1500 /* load in initial (unfactored row) */ 1501 nz = ai[i+1] - ai[i]; 1502 ajtmpold = aj + ai[i]; 1503 v = aa + 16*ai[i]; 1504 for (j=0; j<nz; j++) { 1505 x = rtmp+16*ajtmpold[j]; 1506 /* x = rtmp+colscale*ajtmpold[j]; */ 1507 /* Copy v block into x block */ 1508 SSE_INLINE_BEGIN_2(v,x) 1509 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1510 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1511 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1512 1513 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1514 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1515 1516 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1517 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1518 1519 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1520 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1521 1522 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1523 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1524 1525 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1526 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1527 1528 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1529 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1530 1531 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1532 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1533 SSE_INLINE_END_2; 1534 1535 v += 16; 1536 } 1537 /* row = (*bjtmp++)/4; */ 1538 row = (unsigned int)(*bjtmp++); 1539 while (row < i) { 1540 pc = rtmp + 16*row; 1541 SSE_INLINE_BEGIN_1(pc) 1542 /* Load block from lower triangle */ 1543 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1544 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1545 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1546 1547 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1548 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1549 1550 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1551 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1552 1553 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1554 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1555 1556 /* Compare block to zero block */ 1557 1558 SSE_COPY_PS(XMM4,XMM7) 1559 SSE_CMPNEQ_PS(XMM4,XMM0) 1560 1561 SSE_COPY_PS(XMM5,XMM7) 1562 SSE_CMPNEQ_PS(XMM5,XMM1) 1563 1564 SSE_COPY_PS(XMM6,XMM7) 1565 SSE_CMPNEQ_PS(XMM6,XMM2) 1566 1567 SSE_CMPNEQ_PS(XMM7,XMM3) 1568 1569 /* Reduce the comparisons to one SSE register */ 1570 SSE_OR_PS(XMM6,XMM7) 1571 
SSE_OR_PS(XMM5,XMM4) 1572 SSE_OR_PS(XMM5,XMM6) 1573 SSE_INLINE_END_1; 1574 1575 /* Reduce the one SSE register to an integer register for branching */ 1576 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1577 MOVEMASK(nonzero,XMM5); 1578 1579 /* If block is nonzero ... */ 1580 if (nonzero) { 1581 pv = ba + 16*diag_offset[row]; 1582 PREFETCH_L1(&pv[16]); 1583 pj = bj + diag_offset[row] + 1; 1584 1585 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1586 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1587 /* but the diagonal was inverted already */ 1588 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1589 1590 SSE_INLINE_BEGIN_2(pv,pc) 1591 /* Column 0, product is accumulated in XMM4 */ 1592 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1593 SSE_SHUFFLE(XMM4,XMM4,0x00) 1594 SSE_MULT_PS(XMM4,XMM0) 1595 1596 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1597 SSE_SHUFFLE(XMM5,XMM5,0x00) 1598 SSE_MULT_PS(XMM5,XMM1) 1599 SSE_ADD_PS(XMM4,XMM5) 1600 1601 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1602 SSE_SHUFFLE(XMM6,XMM6,0x00) 1603 SSE_MULT_PS(XMM6,XMM2) 1604 SSE_ADD_PS(XMM4,XMM6) 1605 1606 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1607 SSE_SHUFFLE(XMM7,XMM7,0x00) 1608 SSE_MULT_PS(XMM7,XMM3) 1609 SSE_ADD_PS(XMM4,XMM7) 1610 1611 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1612 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1613 1614 /* Column 1, product is accumulated in XMM5 */ 1615 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1616 SSE_SHUFFLE(XMM5,XMM5,0x00) 1617 SSE_MULT_PS(XMM5,XMM0) 1618 1619 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1620 SSE_SHUFFLE(XMM6,XMM6,0x00) 1621 SSE_MULT_PS(XMM6,XMM1) 1622 SSE_ADD_PS(XMM5,XMM6) 1623 1624 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1625 SSE_SHUFFLE(XMM7,XMM7,0x00) 1626 SSE_MULT_PS(XMM7,XMM2) 1627 SSE_ADD_PS(XMM5,XMM7) 1628 1629 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1630 SSE_SHUFFLE(XMM6,XMM6,0x00) 1631 SSE_MULT_PS(XMM6,XMM3) 1632 SSE_ADD_PS(XMM5,XMM6) 1633 1634 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1635 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1636 1637 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1638 1639 /* Column 2, product is accumulated in XMM6 */ 1640 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1641 SSE_SHUFFLE(XMM6,XMM6,0x00) 1642 SSE_MULT_PS(XMM6,XMM0) 1643 1644 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1645 SSE_SHUFFLE(XMM7,XMM7,0x00) 1646 SSE_MULT_PS(XMM7,XMM1) 1647 SSE_ADD_PS(XMM6,XMM7) 1648 1649 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1650 SSE_SHUFFLE(XMM7,XMM7,0x00) 1651 SSE_MULT_PS(XMM7,XMM2) 1652 SSE_ADD_PS(XMM6,XMM7) 1653 1654 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1655 SSE_SHUFFLE(XMM7,XMM7,0x00) 1656 SSE_MULT_PS(XMM7,XMM3) 1657 SSE_ADD_PS(XMM6,XMM7) 1658 1659 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1660 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1661 1662 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1663 /* Column 3, product is accumulated in XMM0 */ 1664 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1665 SSE_SHUFFLE(XMM7,XMM7,0x00) 1666 SSE_MULT_PS(XMM0,XMM7) 1667 1668 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1669 SSE_SHUFFLE(XMM7,XMM7,0x00) 1670 SSE_MULT_PS(XMM1,XMM7) 1671 SSE_ADD_PS(XMM0,XMM1) 1672 1673 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1674 SSE_SHUFFLE(XMM1,XMM1,0x00) 1675 SSE_MULT_PS(XMM1,XMM2) 1676 SSE_ADD_PS(XMM0,XMM1) 1677 1678 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1679 SSE_SHUFFLE(XMM7,XMM7,0x00) 1680 SSE_MULT_PS(XMM3,XMM7) 1681 SSE_ADD_PS(XMM0,XMM3) 1682 1683 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1684 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1685 1686 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1687 /* This is code to 
be maintained and read by humans afterall. */ 1688 /* Copy Multiplier Col 3 into XMM3 */ 1689 SSE_COPY_PS(XMM3,XMM0) 1690 /* Copy Multiplier Col 2 into XMM2 */ 1691 SSE_COPY_PS(XMM2,XMM6) 1692 /* Copy Multiplier Col 1 into XMM1 */ 1693 SSE_COPY_PS(XMM1,XMM5) 1694 /* Copy Multiplier Col 0 into XMM0 */ 1695 SSE_COPY_PS(XMM0,XMM4) 1696 SSE_INLINE_END_2; 1697 1698 /* Update the row: */ 1699 nz = bi[row+1] - diag_offset[row] - 1; 1700 pv += 16; 1701 for (j=0; j<nz; j++) { 1702 PREFETCH_L1(&pv[16]); 1703 x = rtmp + 16*((unsigned int)pj[j]); 1704 /* x = rtmp + 4*pj[j]; */ 1705 1706 /* X:=X-M*PV, One column at a time */ 1707 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1708 SSE_INLINE_BEGIN_2(x,pv) 1709 /* Load First Column of X*/ 1710 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1711 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1712 1713 /* Matrix-Vector Product: */ 1714 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1715 SSE_SHUFFLE(XMM5,XMM5,0x00) 1716 SSE_MULT_PS(XMM5,XMM0) 1717 SSE_SUB_PS(XMM4,XMM5) 1718 1719 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1720 SSE_SHUFFLE(XMM6,XMM6,0x00) 1721 SSE_MULT_PS(XMM6,XMM1) 1722 SSE_SUB_PS(XMM4,XMM6) 1723 1724 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1725 SSE_SHUFFLE(XMM7,XMM7,0x00) 1726 SSE_MULT_PS(XMM7,XMM2) 1727 SSE_SUB_PS(XMM4,XMM7) 1728 1729 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1730 SSE_SHUFFLE(XMM5,XMM5,0x00) 1731 SSE_MULT_PS(XMM5,XMM3) 1732 SSE_SUB_PS(XMM4,XMM5) 1733 1734 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1735 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1736 1737 /* Second Column */ 1738 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1739 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1740 1741 /* Matrix-Vector Product: */ 1742 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1743 SSE_SHUFFLE(XMM6,XMM6,0x00) 1744 SSE_MULT_PS(XMM6,XMM0) 1745 SSE_SUB_PS(XMM5,XMM6) 1746 1747 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1748 SSE_SHUFFLE(XMM7,XMM7,0x00) 1749 SSE_MULT_PS(XMM7,XMM1) 1750 SSE_SUB_PS(XMM5,XMM7) 1751 1752 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1753 SSE_SHUFFLE(XMM4,XMM4,0x00) 1754 SSE_MULT_PS(XMM4,XMM2) 1755 SSE_SUB_PS(XMM5,XMM4) 1756 1757 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1758 SSE_SHUFFLE(XMM6,XMM6,0x00) 1759 SSE_MULT_PS(XMM6,XMM3) 1760 SSE_SUB_PS(XMM5,XMM6) 1761 1762 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1763 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1764 1765 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1766 1767 /* Third Column */ 1768 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1769 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1770 1771 /* Matrix-Vector Product: */ 1772 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1773 SSE_SHUFFLE(XMM7,XMM7,0x00) 1774 SSE_MULT_PS(XMM7,XMM0) 1775 SSE_SUB_PS(XMM6,XMM7) 1776 1777 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1778 SSE_SHUFFLE(XMM4,XMM4,0x00) 1779 SSE_MULT_PS(XMM4,XMM1) 1780 SSE_SUB_PS(XMM6,XMM4) 1781 1782 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1783 SSE_SHUFFLE(XMM5,XMM5,0x00) 1784 SSE_MULT_PS(XMM5,XMM2) 1785 SSE_SUB_PS(XMM6,XMM5) 1786 1787 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1788 SSE_SHUFFLE(XMM7,XMM7,0x00) 1789 SSE_MULT_PS(XMM7,XMM3) 1790 SSE_SUB_PS(XMM6,XMM7) 1791 1792 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1793 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1794 1795 /* Fourth Column */ 1796 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1797 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1798 1799 /* Matrix-Vector Product: */ 1800 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1801 SSE_SHUFFLE(XMM5,XMM5,0x00) 1802 SSE_MULT_PS(XMM5,XMM0) 1803 SSE_SUB_PS(XMM4,XMM5) 1804 1805 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1806 SSE_SHUFFLE(XMM6,XMM6,0x00) 1807 SSE_MULT_PS(XMM6,XMM1) 1808 SSE_SUB_PS(XMM4,XMM6) 1809 1810 
            SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
            SSE_SHUFFLE(XMM7,XMM7,0x00)
            SSE_MULT_PS(XMM7,XMM2)
            SSE_SUB_PS(XMM4,XMM7)

            SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
            SSE_SHUFFLE(XMM5,XMM5,0x00)
            SSE_MULT_PS(XMM5,XMM3)
            SSE_SUB_PS(XMM4,XMM5)

            SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
            SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
        /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
        SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

        SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
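
/*
   The kernels in this file are not called directly; they are installed on the factor matrix and
   reached through the generic factorization interface. Below is a minimal sketch of that path,
   assuming a SeqBAIJ matrix with block size 4 and the natural ordering, for which
   MatLUFactorNumeric() is expected to select MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering() above.
   The matrix size, fill estimate, and block entries are illustrative only (each 4x4 block is made
   diagonally dominant so no zero pivot occurs).

     #include <petscmat.h>

     int main(int argc,char **argv)
     {
       Mat            A,F;
       Vec            b,x;
       IS             rperm,cperm;
       MatFactorInfo  info;
       PetscInt       bs = 4,mbs = 10,i;
       PetscScalar    vals[16];
       PetscErrorCode ierr;

       ierr = PetscInitialize(&argc,&argv,NULL,NULL);if (ierr) return ierr;
       ierr = MatCreateSeqBAIJ(PETSC_COMM_SELF,bs,bs*mbs,bs*mbs,1,NULL,&A);CHKERRQ(ierr);
       for (i=0; i<16; i++) vals[i] = (i % 5) ? 1.0 : 5.0;
       for (i=0; i<mbs; i++) {
         ierr = MatSetValuesBlocked(A,1,&i,1,&i,vals,INSERT_VALUES);CHKERRQ(ierr);
       }
       ierr = MatAssemblyBegin(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);
       ierr = MatAssemblyEnd(A,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr);

       ierr = MatGetOrdering(A,MATORDERINGNATURAL,&rperm,&cperm);CHKERRQ(ierr);
       ierr = MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
       ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
       info.fill = 2.0;
       ierr = MatLUFactorSymbolic(F,A,rperm,cperm,&info);CHKERRQ(ierr);
       ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);

       ierr = MatCreateVecs(A,&x,&b);CHKERRQ(ierr);
       ierr = VecSet(b,1.0);CHKERRQ(ierr);
       ierr = MatSolve(F,b,x);CHKERRQ(ierr);

       ierr = ISDestroy(&rperm);CHKERRQ(ierr);
       ierr = ISDestroy(&cperm);CHKERRQ(ierr);
       ierr = VecDestroy(&x);CHKERRQ(ierr);
       ierr = VecDestroy(&b);CHKERRQ(ierr);
       ierr = MatDestroy(&F);CHKERRQ(ierr);
       ierr = MatDestroy(&A);CHKERRQ(ierr);
       ierr = PetscFinalize();
       return ierr;
     }
*/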