/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>

/* ------------------------------------------------------------*/
/*
     Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt *ajtmpold,*ajtmp,nz,row;
  PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj;
  MatScalar *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar m13,m14,m15,m16;
  MatScalar *ba = b->a,*aa = a->a;
  PetscBool pivotinblocks = b->pivotinblocks;
  PetscReal shift = info->shiftamount;
  PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  allowzeropivot = PetscNot(A->erroriffailure);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ic[ajtmpold[j]];
      x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
      x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
      x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
      p5  = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
      p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
        x5  = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
        x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11 + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
          x5  = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
          x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
      pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
      pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
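
/*
   The unrolled arithmetic in the elimination loop above is a dense 4x4 block
   multiply in column-major block storage (entry (r,c) of a block is at index
   r+4*c): the multiplier m1..m16 is the current block pc times the already
   inverted pivot block pv, and that multiplier then updates the remaining
   blocks of the active row.  The loop-based sketch below is illustrative
   only; it is not compiled, and the helper names are invented for this sketch.
*/
#if 0
static void SketchBlockMultiplier_4(MatScalar pc[16],const MatScalar pv[16])
{
  MatScalar m[16];
  PetscInt  r,c,k;
  for (c=0; c<4; c++) {                /* M = pc * pv, with pv = inv(diagonal block) */
    for (r=0; r<4; r++) {
      m[r+4*c] = 0.0;
      for (k=0; k<4; k++) m[r+4*c] += pc[r+4*k]*pv[k+4*c];
    }
  }
  for (k=0; k<16; k++) pc[k] = m[k];   /* the multiplier overwrites the L block in place */
}

static void SketchBlockRowUpdate_4(MatScalar x[16],const MatScalar m[16],const MatScalar pv[16])
{
  PetscInt r,c,k;
  for (c=0; c<4; c++) {                /* X = X - M * PV for one block of U(row,:) */
    for (r=0; r<4; r++) {
      for (k=0; k<4; k++) x[r+4*c] -= m[r+4*k]*pv[k+4*c];
    }
  }
}
#endif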

/* MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       PetscKernel_A_gets_A_times_B()
       PetscKernel_A_gets_A_minus_B_times_C()
       PetscKernel_A_gets_inverse_A()
*/

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat C = B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  IS isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt flg;
  PetscReal shift;
  PetscBool allowzeropivot,zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i]-bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
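
/*
   Flop accounting for the 4x4 elimination kernels above (bs = 4, bs2 = 16):
     forming one multiplier block costs 2*bs^3 - bs^2 = 2*64 - 16 = 112 flops
       (16 entries, each 4 multiplies and 3 adds), and
     updating one block of U(row,:) costs 2*bs^3 = 128 flops
       (16 entries, each 4 multiplies, 3 adds and 1 subtraction),
   which is why both the unrolled and the kernel-based routines in this file
   log PetscLogFlops(128*nz+112) per eliminated block-row entry.
*/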

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
/*
     Default Version for when blocks are 4 by 4 Using natural ordering
*/
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt *ajtmpold,*ajtmp,nz,row;
  PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj;
  MatScalar *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar m13,m14,m15,m16;
  MatScalar *ba = b->a,*aa = a->a;
  PetscBool pivotinblocks = b->pivotinblocks;
  PetscReal shift = info->shiftamount;
  PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmp[j];
      x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3];
      x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8];
      x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3];
      p5  = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8];
      p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0
          || p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
        x5  = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
        x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11 + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3];
          x5  = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8];
          x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11 + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3];
      pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8];
      pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
  MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
    copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat C = B;
  Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt i,j,k,nz,nzL,row;
  const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2;
  MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
  PetscInt flg;
  PetscReal shift;
  PetscBool allowzeropivot,zeropivotdetected;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k < nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j]!=0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
    if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
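
/*
   The routines above are not called directly; they are reached through the
   generic factorization interface.  The sketch below shows one plausible
   calling sequence for a block-size-4 SeqBAIJ matrix (illustrative only, not
   compiled; matrix assembly, error handling of the returned objects and
   cleanup are omitted).  With a natural ordering such a solve typically ends
   up in MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering().
*/
#if 0
static PetscErrorCode SketchFactorAndSolve(Mat A,Vec b,Vec x)
{
  Mat            F;
  IS             isrow,iscol;
  MatFactorInfo  info;
  PetscErrorCode ierr;

  PetscFunctionBegin;
  ierr = MatGetOrdering(A,MATORDERINGNATURAL,&isrow,&iscol);CHKERRQ(ierr);
  ierr = MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
  ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
  ierr = MatLUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);
  ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr); /* dispatches to a kernel in this file for bs = 4 */
  ierr = MatSolve(F,b,x);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif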

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat C = B;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int i,j,n = a->mbs;
  int *bj = b->j,*bjtmp,*pj;
  int row;
  int *ajtmpold,nz,*bi=b->i;
  int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar *ba = b->a,*aa = a->a;
  int nonzero=0;
  PetscBool pivotinblocks = b->pivotinblocks;
  PetscReal shift = info->shiftamount;
  PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
      /* row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
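
/*
   The XMM4-XMM7 compare/OR sequence followed by MOVEMASK above is a
   vectorized form of the scalar "is this 4x4 block entirely zero?" test used
   by the non-SSE routines earlier in this file.  A sketch of the equivalent
   scalar test (illustrative only, not compiled):
*/
#if 0
static int SketchBlock4IsNonzero(const MatScalar *pc)
{
  int k;
  for (k=0; k<16; k++) {
    if (pc[k] != 0.0) return 1; /* any nonzero entry means the block must be eliminated */
  }
  return 0;
}
#endif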

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
{
  Mat A = C;
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
  unsigned int row;
  int nz,*bi=b->i;
  int *diag_offset = b->diag,*ai=a->i;
  MatScalar *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar *ba = b->a,*aa = a->a;
  int nonzero=0;
  PetscBool pivotinblocks = b->pivotinblocks;
  PetscReal shift = 0.0; /* no MatFactorInfo is passed to this in-place variant, so no shift is applied */
  PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
      /* x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
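
/*
   The _usj variants above and below read the block-column index arrays
   through unsigned short pointers, which assumes the column indices have been
   stored as 16-bit values and is therefore only usable when the number of
   block rows fits in 16 bits (mbs < 65536).  Halving the index width reduces
   the memory traffic of the factorization sweep; the numerical work is
   identical to the non-usj SSE routine above.
*/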

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned int row;
  int *ajtmpold,nz,*bi=b->i;
  int *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar *ba = b->a,*aa = a->a;
  int nonzero=0;
  PetscBool pivotinblocks = b->pivotinblocks;
  PetscReal shift = info->shiftamount;
  PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE;

  PetscFunctionBegin;
  allowzeropivot = PetscNot(A->erroriffailure);
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  if ((unsigned long)bj==(unsigned long)aj) {
    return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
  }

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X*/
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr);
      if (zeropivotdetected) C->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: Using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif