1 2 /* 3 Factorization code for BAIJ format. 4 */ 5 #include <../src/mat/impls/baij/seq/baij.h> 6 #include <../src/mat/blockinvert.h> 7 8 /* ------------------------------------------------------------*/ 9 /* 10 Version for when blocks are 4 by 4 11 */ 12 #undef __FUNCT__ 13 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace" 14 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info) 15 { 16 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 17 IS isrow = b->row,isicol = b->icol; 18 PetscErrorCode ierr; 19 const PetscInt *r,*ic; 20 PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 21 PetscInt *ajtmpold,*ajtmp,nz,row; 22 PetscInt *diag_offset = b->diag,idx,*ai=a->i,*aj=a->j,*pj; 23 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 24 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 25 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 26 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 27 MatScalar m13,m14,m15,m16; 28 MatScalar *ba = b->a,*aa = a->a; 29 PetscBool pivotinblocks = b->pivotinblocks; 30 PetscReal shift = info->shiftamount; 31 32 PetscFunctionBegin; 33 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 34 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 35 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 36 37 for (i=0; i<n; i++) { 38 nz = bi[i+1] - bi[i]; 39 ajtmp = bj + bi[i]; 40 for (j=0; j<nz; j++) { 41 x = rtmp+16*ajtmp[j]; 42 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 43 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 44 } 45 /* load in initial (unfactored row) */ 46 idx = r[i]; 47 nz = ai[idx+1] - ai[idx]; 48 ajtmpold = aj + ai[idx]; 49 v = aa + 16*ai[idx]; 50 for (j=0; j<nz; j++) { 51 x = rtmp+16*ic[ajtmpold[j]]; 52 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 53 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 54 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 55 x[14] = v[14]; x[15] = v[15]; 56 v += 16; 57 } 58 row = *ajtmp++; 59 while (row < i) { 60 pc = rtmp + 16*row; 61 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 62 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 63 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 64 p15 = pc[14]; p16 = pc[15]; 65 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 66 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 67 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 68 || p16 != 0.0) { 69 pv = ba + 16*diag_offset[row]; 70 pj = bj + diag_offset[row] + 1; 71 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 72 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 73 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 74 x15 = pv[14]; x16 = pv[15]; 75 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 76 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 77 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 78 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 79 80 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 81 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 82 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 83 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 84 85 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 86 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 87 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 88 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 89 90 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 91 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 92 pc[14] 
= m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 93 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 94 95 nz = bi[row+1] - diag_offset[row] - 1; 96 pv += 16; 97 for (j=0; j<nz; j++) { 98 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 99 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 100 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 101 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 102 x = rtmp + 16*pj[j]; 103 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 104 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 105 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 106 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 107 108 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 109 x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8; 110 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 111 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 112 113 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 114 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 115 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 116 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 117 118 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 119 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 120 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 121 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 122 123 pv += 16; 124 } 125 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 126 } 127 row = *ajtmp++; 128 } 129 /* finished row so stick it into b->a */ 130 pv = ba + 16*bi[i]; 131 pj = bj + bi[i]; 132 nz = bi[i+1] - bi[i]; 133 for (j=0; j<nz; j++) { 134 x = rtmp+16*pj[j]; 135 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 136 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 137 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 138 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 139 pv += 16; 140 } 141 /* invert diagonal block */ 142 w = ba + 16*diag_offset[i]; 143 if (pivotinblocks) { 144 ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 145 } else { 146 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 147 } 148 } 149 150 ierr = PetscFree(rtmp);CHKERRQ(ierr); 151 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 152 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 153 C->ops->solve = MatSolve_SeqBAIJ_4_inplace; 154 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace; 155 C->assembled = PETSC_TRUE; 156 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 157 PetscFunctionReturn(0); 158 } 159 160 /* MatLUFactorNumeric_SeqBAIJ_4 - 161 copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented 162 PetscKernel_A_gets_A_times_B() 163 PetscKernel_A_gets_A_minus_B_times_C() 164 PetscKernel_A_gets_inverse_A() 165 */ 166 167 #undef __FUNCT__ 168 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4" 169 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info) 170 { 171 Mat C=B; 172 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 173 IS isrow = b->row,isicol = b->icol; 174 PetscErrorCode ierr; 175 const PetscInt *r,*ic; 176 PetscInt i,j,k,nz,nzL,row; 177 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 178 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 179 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 180 PetscInt flg; 181 PetscReal shift; 182 183 PetscFunctionBegin; 184 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 185 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 186 187 if (info->shifttype == MAT_SHIFT_NONE) { 188 shift = 0; 189 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 190 shift = info->shiftamount; 191 } 192 
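  /*
     Workspace layout: rtmp holds one full block row of the factor as n dense bs2 = 16 entry
     (4x4, column-major) blocks, addressed by block column as rtmp + bs2*col; mwork is a single
     4x4 scratch block used by the PetscKernel_A_gets_A_times_B_4() multiply below.
  */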
  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
/*
     Default Version for when blocks are 4 by 4 Using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi =
b->i,*bj = b->j; 296 PetscInt *ajtmpold,*ajtmp,nz,row; 297 PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 298 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 299 MatScalar p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4; 300 MatScalar p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16; 301 MatScalar p10,p11,p12,p13,p14,p15,p16,m10,m11,m12; 302 MatScalar m13,m14,m15,m16; 303 MatScalar *ba = b->a,*aa = a->a; 304 PetscBool pivotinblocks = b->pivotinblocks; 305 PetscReal shift = info->shiftamount; 306 307 PetscFunctionBegin; 308 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 309 310 for (i=0; i<n; i++) { 311 nz = bi[i+1] - bi[i]; 312 ajtmp = bj + bi[i]; 313 for (j=0; j<nz; j++) { 314 x = rtmp+16*ajtmp[j]; 315 x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0; 316 x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0; 317 } 318 /* load in initial (unfactored row) */ 319 nz = ai[i+1] - ai[i]; 320 ajtmpold = aj + ai[i]; 321 v = aa + 16*ai[i]; 322 for (j=0; j<nz; j++) { 323 x = rtmp+16*ajtmpold[j]; 324 x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 325 x[4] = v[4]; x[5] = v[5]; x[6] = v[6]; x[7] = v[7]; x[8] = v[8]; 326 x[9] = v[9]; x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13]; 327 x[14] = v[14]; x[15] = v[15]; 328 v += 16; 329 } 330 row = *ajtmp++; 331 while (row < i) { 332 pc = rtmp + 16*row; 333 p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 334 p5 = pc[4]; p6 = pc[5]; p7 = pc[6]; p8 = pc[7]; p9 = pc[8]; 335 p10 = pc[9]; p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13]; 336 p15 = pc[14]; p16 = pc[15]; 337 if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || 338 p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 || 339 p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 340 || p16 != 0.0) { 341 pv = ba + 16*diag_offset[row]; 342 pj = bj + diag_offset[row] + 1; 343 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 344 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 345 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13]; 346 x15 = pv[14]; x16 = pv[15]; 347 pc[0] = m1 = p1*x1 + p5*x2 + p9*x3 + p13*x4; 348 pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4; 349 pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4; 350 pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4; 351 352 pc[4] = m5 = p1*x5 + p5*x6 + p9*x7 + p13*x8; 353 pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8; 354 pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8; 355 pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8; 356 357 pc[8] = m9 = p1*x9 + p5*x10 + p9*x11 + p13*x12; 358 pc[9] = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12; 359 pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12; 360 pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12; 361 362 pc[12] = m13 = p1*x13 + p5*x14 + p9*x15 + p13*x16; 363 pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16; 364 pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16; 365 pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16; 366 nz = bi[row+1] - diag_offset[row] - 1; 367 pv += 16; 368 for (j=0; j<nz; j++) { 369 x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 370 x5 = pv[4]; x6 = pv[5]; x7 = pv[6]; x8 = pv[7]; x9 = pv[8]; 371 x10 = pv[9]; x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; 372 x14 = pv[13]; x15 = pv[14]; x16 = pv[15]; 373 x = rtmp + 16*pj[j]; 374 x[0] -= m1*x1 + m5*x2 + m9*x3 + m13*x4; 375 x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4; 376 x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4; 377 x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4; 378 379 x[4] -= m1*x5 + m5*x6 + m9*x7 + m13*x8; 380 x[5] -= m2*x5 + m6*x6 
+ m10*x7 + m14*x8; 381 x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8; 382 x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8; 383 384 x[8] -= m1*x9 + m5*x10 + m9*x11 + m13*x12; 385 x[9] -= m2*x9 + m6*x10 + m10*x11 + m14*x12; 386 x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12; 387 x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12; 388 389 x[12] -= m1*x13 + m5*x14 + m9*x15 + m13*x16; 390 x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16; 391 x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16; 392 x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16; 393 394 pv += 16; 395 } 396 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 397 } 398 row = *ajtmp++; 399 } 400 /* finished row so stick it into b->a */ 401 pv = ba + 16*bi[i]; 402 pj = bj + bi[i]; 403 nz = bi[i+1] - bi[i]; 404 for (j=0; j<nz; j++) { 405 x = rtmp+16*pj[j]; 406 pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 407 pv[4] = x[4]; pv[5] = x[5]; pv[6] = x[6]; pv[7] = x[7]; pv[8] = x[8]; 408 pv[9] = x[9]; pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12]; 409 pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15]; 410 pv += 16; 411 } 412 /* invert diagonal block */ 413 w = ba + 16*diag_offset[i]; 414 if (pivotinblocks) { 415 ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 416 } else { 417 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 418 } 419 } 420 421 ierr = PetscFree(rtmp);CHKERRQ(ierr); 422 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace; 423 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace; 424 C->assembled = PETSC_TRUE; 425 ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 426 PetscFunctionReturn(0); 427 } 428 429 /* 430 MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering - 431 copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace() 432 */ 433 #undef __FUNCT__ 434 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering" 435 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 436 { 437 Mat C=B; 438 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 439 PetscErrorCode ierr; 440 PetscInt i,j,k,nz,nzL,row; 441 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 442 const PetscInt *ajtmp,*bjtmp,*bdiag=b->diag,*pj,bs2=a->bs2; 443 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 444 PetscInt flg; 445 PetscReal shift; 446 447 PetscFunctionBegin; 448 /* generate work space needed by the factorization */ 449 ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 450 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 451 452 if (info->shifttype == MAT_SHIFT_NONE) { 453 shift = 0; 454 } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */ 455 shift = info->shiftamount; 456 } 457 458 for (i=0; i<n; i++) { 459 /* zero rtmp */ 460 /* L part */ 461 nz = bi[i+1] - bi[i]; 462 bjtmp = bj + bi[i]; 463 for (j=0; j<nz; j++) { 464 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 465 } 466 467 /* U part */ 468 nz = bdiag[i] - bdiag[i+1]; 469 bjtmp = bj + bdiag[i+1]+1; 470 for (j=0; j<nz; j++) { 471 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 472 } 473 474 /* load in initial (unfactored row) */ 475 nz = ai[i+1] - ai[i]; 476 ajtmp = aj + ai[i]; 477 v = aa + bs2*ai[i]; 478 for (j=0; j<nz; j++) { 479 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 480 } 481 482 /* elimination */ 483 bjtmp = bj + bi[i]; 484 nzL = bi[i+1] - bi[i]; 485 for (k=0;k < nzL;k++) { 486 row = bjtmp[k]; 487 
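      /* pc is the working copy of block (i,row); if it is entirely zero this pivot row
         contributes nothing and the multiply/update below is skipped */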
      pc = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* number of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128*nz+112);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs^2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

/* SSE Version for when blocks are 4 by 4 Using natural ordering */
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B; /* the factor matrix; the body below refers to it as C, as in the non-SSE versions */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i,*aj=a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int         nonzero=0,colscale = 16; */
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.
SSE will not work."); 574 /* if ((unsigned long)bj==(unsigned long)aj) { */ 575 /* colscale = 4; */ 576 /* } */ 577 for (i=0; i<n; i++) { 578 nz = bi[i+1] - bi[i]; 579 bjtmp = bj + bi[i]; 580 /* zero out the 4x4 block accumulators */ 581 /* zero out one register */ 582 XOR_PS(XMM7,XMM7); 583 for (j=0; j<nz; j++) { 584 x = rtmp+16*bjtmp[j]; 585 /* x = rtmp+4*bjtmp[j]; */ 586 SSE_INLINE_BEGIN_1(x) 587 /* Copy zero register to memory locations */ 588 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 589 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 590 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 591 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 592 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 593 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 594 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 595 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 596 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 597 SSE_INLINE_END_1; 598 } 599 /* load in initial (unfactored row) */ 600 nz = ai[i+1] - ai[i]; 601 ajtmpold = aj + ai[i]; 602 v = aa + 16*ai[i]; 603 for (j=0; j<nz; j++) { 604 x = rtmp+16*ajtmpold[j]; 605 /* x = rtmp+colscale*ajtmpold[j]; */ 606 /* Copy v block into x block */ 607 SSE_INLINE_BEGIN_2(v,x) 608 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 609 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 610 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 611 612 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 613 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 614 615 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 616 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 617 618 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 619 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 620 621 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 622 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 623 624 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 625 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 626 627 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 628 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 629 630 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 631 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 632 SSE_INLINE_END_2; 633 634 v += 16; 635 } 636 /* row = (*bjtmp++)/4; */ 637 row = *bjtmp++; 638 while (row < i) { 639 pc = rtmp + 16*row; 640 SSE_INLINE_BEGIN_1(pc) 641 /* Load block from lower triangle */ 642 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 643 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 644 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 645 646 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 647 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 648 649 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 650 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 651 652 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 653 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 654 655 /* Compare block to zero block */ 656 657 SSE_COPY_PS(XMM4,XMM7) 658 SSE_CMPNEQ_PS(XMM4,XMM0) 659 660 SSE_COPY_PS(XMM5,XMM7) 661 SSE_CMPNEQ_PS(XMM5,XMM1) 662 663 SSE_COPY_PS(XMM6,XMM7) 664 SSE_CMPNEQ_PS(XMM6,XMM2) 665 666 SSE_CMPNEQ_PS(XMM7,XMM3) 667 668 /* Reduce the comparisons to one SSE register */ 669 SSE_OR_PS(XMM6,XMM7) 670 SSE_OR_PS(XMM5,XMM4) 671 SSE_OR_PS(XMM5,XMM6) 672 SSE_INLINE_END_1; 673 674 /* Reduce the one SSE register to an integer register for branching */ 675 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 676 MOVEMASK(nonzero,XMM5); 677 678 /* If block is nonzero ... 
*/ 679 if (nonzero) { 680 pv = ba + 16*diag_offset[row]; 681 PREFETCH_L1(&pv[16]); 682 pj = bj + diag_offset[row] + 1; 683 684 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 685 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 686 /* but the diagonal was inverted already */ 687 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 688 689 SSE_INLINE_BEGIN_2(pv,pc) 690 /* Column 0, product is accumulated in XMM4 */ 691 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 692 SSE_SHUFFLE(XMM4,XMM4,0x00) 693 SSE_MULT_PS(XMM4,XMM0) 694 695 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 696 SSE_SHUFFLE(XMM5,XMM5,0x00) 697 SSE_MULT_PS(XMM5,XMM1) 698 SSE_ADD_PS(XMM4,XMM5) 699 700 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 701 SSE_SHUFFLE(XMM6,XMM6,0x00) 702 SSE_MULT_PS(XMM6,XMM2) 703 SSE_ADD_PS(XMM4,XMM6) 704 705 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 706 SSE_SHUFFLE(XMM7,XMM7,0x00) 707 SSE_MULT_PS(XMM7,XMM3) 708 SSE_ADD_PS(XMM4,XMM7) 709 710 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 711 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 712 713 /* Column 1, product is accumulated in XMM5 */ 714 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 715 SSE_SHUFFLE(XMM5,XMM5,0x00) 716 SSE_MULT_PS(XMM5,XMM0) 717 718 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 719 SSE_SHUFFLE(XMM6,XMM6,0x00) 720 SSE_MULT_PS(XMM6,XMM1) 721 SSE_ADD_PS(XMM5,XMM6) 722 723 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 724 SSE_SHUFFLE(XMM7,XMM7,0x00) 725 SSE_MULT_PS(XMM7,XMM2) 726 SSE_ADD_PS(XMM5,XMM7) 727 728 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 729 SSE_SHUFFLE(XMM6,XMM6,0x00) 730 SSE_MULT_PS(XMM6,XMM3) 731 SSE_ADD_PS(XMM5,XMM6) 732 733 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 734 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 735 736 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 737 738 /* Column 2, product is accumulated in XMM6 */ 739 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 740 SSE_SHUFFLE(XMM6,XMM6,0x00) 741 SSE_MULT_PS(XMM6,XMM0) 742 743 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 744 SSE_SHUFFLE(XMM7,XMM7,0x00) 745 SSE_MULT_PS(XMM7,XMM1) 746 SSE_ADD_PS(XMM6,XMM7) 747 748 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 749 SSE_SHUFFLE(XMM7,XMM7,0x00) 750 SSE_MULT_PS(XMM7,XMM2) 751 SSE_ADD_PS(XMM6,XMM7) 752 753 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 754 SSE_SHUFFLE(XMM7,XMM7,0x00) 755 SSE_MULT_PS(XMM7,XMM3) 756 SSE_ADD_PS(XMM6,XMM7) 757 758 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 759 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 760 761 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 762 /* Column 3, product is accumulated in XMM0 */ 763 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 764 SSE_SHUFFLE(XMM7,XMM7,0x00) 765 SSE_MULT_PS(XMM0,XMM7) 766 767 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 768 SSE_SHUFFLE(XMM7,XMM7,0x00) 769 SSE_MULT_PS(XMM1,XMM7) 770 SSE_ADD_PS(XMM0,XMM1) 771 772 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 773 SSE_SHUFFLE(XMM1,XMM1,0x00) 774 SSE_MULT_PS(XMM1,XMM2) 775 SSE_ADD_PS(XMM0,XMM1) 776 777 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 778 SSE_SHUFFLE(XMM7,XMM7,0x00) 779 SSE_MULT_PS(XMM3,XMM7) 780 SSE_ADD_PS(XMM0,XMM3) 781 782 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 783 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 784 785 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 786 /* This is code to be maintained and read by humans afterall. 
*/ 787 /* Copy Multiplier Col 3 into XMM3 */ 788 SSE_COPY_PS(XMM3,XMM0) 789 /* Copy Multiplier Col 2 into XMM2 */ 790 SSE_COPY_PS(XMM2,XMM6) 791 /* Copy Multiplier Col 1 into XMM1 */ 792 SSE_COPY_PS(XMM1,XMM5) 793 /* Copy Multiplier Col 0 into XMM0 */ 794 SSE_COPY_PS(XMM0,XMM4) 795 SSE_INLINE_END_2; 796 797 /* Update the row: */ 798 nz = bi[row+1] - diag_offset[row] - 1; 799 pv += 16; 800 for (j=0; j<nz; j++) { 801 PREFETCH_L1(&pv[16]); 802 x = rtmp + 16*pj[j]; 803 /* x = rtmp + 4*pj[j]; */ 804 805 /* X:=X-M*PV, One column at a time */ 806 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 807 SSE_INLINE_BEGIN_2(x,pv) 808 /* Load First Column of X*/ 809 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 810 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 811 812 /* Matrix-Vector Product: */ 813 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 814 SSE_SHUFFLE(XMM5,XMM5,0x00) 815 SSE_MULT_PS(XMM5,XMM0) 816 SSE_SUB_PS(XMM4,XMM5) 817 818 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 819 SSE_SHUFFLE(XMM6,XMM6,0x00) 820 SSE_MULT_PS(XMM6,XMM1) 821 SSE_SUB_PS(XMM4,XMM6) 822 823 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 824 SSE_SHUFFLE(XMM7,XMM7,0x00) 825 SSE_MULT_PS(XMM7,XMM2) 826 SSE_SUB_PS(XMM4,XMM7) 827 828 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 829 SSE_SHUFFLE(XMM5,XMM5,0x00) 830 SSE_MULT_PS(XMM5,XMM3) 831 SSE_SUB_PS(XMM4,XMM5) 832 833 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 834 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 835 836 /* Second Column */ 837 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 838 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 839 840 /* Matrix-Vector Product: */ 841 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 842 SSE_SHUFFLE(XMM6,XMM6,0x00) 843 SSE_MULT_PS(XMM6,XMM0) 844 SSE_SUB_PS(XMM5,XMM6) 845 846 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 847 SSE_SHUFFLE(XMM7,XMM7,0x00) 848 SSE_MULT_PS(XMM7,XMM1) 849 SSE_SUB_PS(XMM5,XMM7) 850 851 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 852 SSE_SHUFFLE(XMM4,XMM4,0x00) 853 SSE_MULT_PS(XMM4,XMM2) 854 SSE_SUB_PS(XMM5,XMM4) 855 856 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 857 SSE_SHUFFLE(XMM6,XMM6,0x00) 858 SSE_MULT_PS(XMM6,XMM3) 859 SSE_SUB_PS(XMM5,XMM6) 860 861 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 862 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 863 864 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 865 866 /* Third Column */ 867 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 868 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 869 870 /* Matrix-Vector Product: */ 871 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 872 SSE_SHUFFLE(XMM7,XMM7,0x00) 873 SSE_MULT_PS(XMM7,XMM0) 874 SSE_SUB_PS(XMM6,XMM7) 875 876 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 877 SSE_SHUFFLE(XMM4,XMM4,0x00) 878 SSE_MULT_PS(XMM4,XMM1) 879 SSE_SUB_PS(XMM6,XMM4) 880 881 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 882 SSE_SHUFFLE(XMM5,XMM5,0x00) 883 SSE_MULT_PS(XMM5,XMM2) 884 SSE_SUB_PS(XMM6,XMM5) 885 886 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 887 SSE_SHUFFLE(XMM7,XMM7,0x00) 888 SSE_MULT_PS(XMM7,XMM3) 889 SSE_SUB_PS(XMM6,XMM7) 890 891 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 892 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 893 894 /* Fourth Column */ 895 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 896 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 897 898 /* Matrix-Vector Product: */ 899 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 900 SSE_SHUFFLE(XMM5,XMM5,0x00) 901 SSE_MULT_PS(XMM5,XMM0) 902 SSE_SUB_PS(XMM4,XMM5) 903 904 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 905 SSE_SHUFFLE(XMM6,XMM6,0x00) 906 SSE_MULT_PS(XMM6,XMM1) 907 SSE_SUB_PS(XMM4,XMM6) 908 909 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 910 SSE_SHUFFLE(XMM7,XMM7,0x00) 911 SSE_MULT_PS(XMM7,XMM2) 912 SSE_SUB_PS(XMM4,XMM7) 913 914 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 915 SSE_SHUFFLE(XMM5,XMM5,0x00) 916 
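          /* final term of the fourth column: XMM5 holds pv[15] broadcast to all lanes;
             scale multiplier column 3 (XMM3) by it and subtract from X column 3 (XMM4) */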
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
      /* row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: since Cramer's rule is used to invert the blocks, the flop count below may be unfairly high or low */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;
  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
{
  Mat            A = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short *)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi=b->i;
  int            *diag_offset = b->diag,*ai=a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero=0;
  /* int         nonzero=0,colscale = 16; */
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = 0.0; /* this in-place variant receives no MatFactorInfo, so no diagonal shift is applied */

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned.
SSE will not work."); 1015 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1016 /* colscale = 4; */ 1017 /* } */ 1018 1019 for (i=0; i<n; i++) { 1020 nz = bi[i+1] - bi[i]; 1021 bjtmp = bj + bi[i]; 1022 /* zero out the 4x4 block accumulators */ 1023 /* zero out one register */ 1024 XOR_PS(XMM7,XMM7); 1025 for (j=0; j<nz; j++) { 1026 x = rtmp+16*((unsigned int)bjtmp[j]); 1027 /* x = rtmp+4*bjtmp[j]; */ 1028 SSE_INLINE_BEGIN_1(x) 1029 /* Copy zero register to memory locations */ 1030 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1031 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1032 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1033 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1034 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1035 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1036 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1037 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1038 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1039 SSE_INLINE_END_1; 1040 } 1041 /* load in initial (unfactored row) */ 1042 nz = ai[i+1] - ai[i]; 1043 ajtmp = aj + ai[i]; 1044 v = aa + 16*ai[i]; 1045 for (j=0; j<nz; j++) { 1046 x = rtmp+16*((unsigned int)ajtmp[j]); 1047 /* x = rtmp+colscale*ajtmp[j]; */ 1048 /* Copy v block into x block */ 1049 SSE_INLINE_BEGIN_2(v,x) 1050 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1051 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1052 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1053 1054 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1055 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1056 1057 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1058 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1059 1060 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1061 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1062 1063 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1064 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1065 1066 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1067 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1068 1069 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1070 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1071 1072 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1073 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1074 SSE_INLINE_END_2; 1075 1076 v += 16; 1077 } 1078 /* row = (*bjtmp++)/4; */ 1079 row = (unsigned int)(*bjtmp++); 1080 while (row < i) { 1081 pc = rtmp + 16*row; 1082 SSE_INLINE_BEGIN_1(pc) 1083 /* Load block from lower triangle */ 1084 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1085 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1086 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1087 1088 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1089 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1090 1091 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1092 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1093 1094 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1095 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1096 1097 /* Compare block to zero block */ 1098 1099 SSE_COPY_PS(XMM4,XMM7) 1100 SSE_CMPNEQ_PS(XMM4,XMM0) 1101 1102 SSE_COPY_PS(XMM5,XMM7) 1103 SSE_CMPNEQ_PS(XMM5,XMM1) 1104 1105 SSE_COPY_PS(XMM6,XMM7) 1106 SSE_CMPNEQ_PS(XMM6,XMM2) 1107 1108 SSE_CMPNEQ_PS(XMM7,XMM3) 1109 1110 /* Reduce the comparisons to one SSE register */ 1111 SSE_OR_PS(XMM6,XMM7) 1112 SSE_OR_PS(XMM5,XMM4) 1113 SSE_OR_PS(XMM5,XMM6) 1114 SSE_INLINE_END_1; 1115 1116 /* Reduce the one SSE register to an integer register for branching */ 1117 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1118 MOVEMASK(nonzero,XMM5); 1119 1120 /* If block is nonzero ... 
*/ 1121 if (nonzero) { 1122 pv = ba + 16*diag_offset[row]; 1123 PREFETCH_L1(&pv[16]); 1124 pj = bj + diag_offset[row] + 1; 1125 1126 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1127 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1128 /* but the diagonal was inverted already */ 1129 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1130 1131 SSE_INLINE_BEGIN_2(pv,pc) 1132 /* Column 0, product is accumulated in XMM4 */ 1133 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1134 SSE_SHUFFLE(XMM4,XMM4,0x00) 1135 SSE_MULT_PS(XMM4,XMM0) 1136 1137 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1138 SSE_SHUFFLE(XMM5,XMM5,0x00) 1139 SSE_MULT_PS(XMM5,XMM1) 1140 SSE_ADD_PS(XMM4,XMM5) 1141 1142 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1143 SSE_SHUFFLE(XMM6,XMM6,0x00) 1144 SSE_MULT_PS(XMM6,XMM2) 1145 SSE_ADD_PS(XMM4,XMM6) 1146 1147 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1148 SSE_SHUFFLE(XMM7,XMM7,0x00) 1149 SSE_MULT_PS(XMM7,XMM3) 1150 SSE_ADD_PS(XMM4,XMM7) 1151 1152 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1153 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1154 1155 /* Column 1, product is accumulated in XMM5 */ 1156 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1157 SSE_SHUFFLE(XMM5,XMM5,0x00) 1158 SSE_MULT_PS(XMM5,XMM0) 1159 1160 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1161 SSE_SHUFFLE(XMM6,XMM6,0x00) 1162 SSE_MULT_PS(XMM6,XMM1) 1163 SSE_ADD_PS(XMM5,XMM6) 1164 1165 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1166 SSE_SHUFFLE(XMM7,XMM7,0x00) 1167 SSE_MULT_PS(XMM7,XMM2) 1168 SSE_ADD_PS(XMM5,XMM7) 1169 1170 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1171 SSE_SHUFFLE(XMM6,XMM6,0x00) 1172 SSE_MULT_PS(XMM6,XMM3) 1173 SSE_ADD_PS(XMM5,XMM6) 1174 1175 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1176 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1177 1178 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1179 1180 /* Column 2, product is accumulated in XMM6 */ 1181 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1182 SSE_SHUFFLE(XMM6,XMM6,0x00) 1183 SSE_MULT_PS(XMM6,XMM0) 1184 1185 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1186 SSE_SHUFFLE(XMM7,XMM7,0x00) 1187 SSE_MULT_PS(XMM7,XMM1) 1188 SSE_ADD_PS(XMM6,XMM7) 1189 1190 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1191 SSE_SHUFFLE(XMM7,XMM7,0x00) 1192 SSE_MULT_PS(XMM7,XMM2) 1193 SSE_ADD_PS(XMM6,XMM7) 1194 1195 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1196 SSE_SHUFFLE(XMM7,XMM7,0x00) 1197 SSE_MULT_PS(XMM7,XMM3) 1198 SSE_ADD_PS(XMM6,XMM7) 1199 1200 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1201 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1202 1203 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1204 /* Column 3, product is accumulated in XMM0 */ 1205 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1206 SSE_SHUFFLE(XMM7,XMM7,0x00) 1207 SSE_MULT_PS(XMM0,XMM7) 1208 1209 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1210 SSE_SHUFFLE(XMM7,XMM7,0x00) 1211 SSE_MULT_PS(XMM1,XMM7) 1212 SSE_ADD_PS(XMM0,XMM1) 1213 1214 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1215 SSE_SHUFFLE(XMM1,XMM1,0x00) 1216 SSE_MULT_PS(XMM1,XMM2) 1217 SSE_ADD_PS(XMM0,XMM1) 1218 1219 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1220 SSE_SHUFFLE(XMM7,XMM7,0x00) 1221 SSE_MULT_PS(XMM3,XMM7) 1222 SSE_ADD_PS(XMM0,XMM3) 1223 1224 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1225 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1226 1227 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1228 /* This is code to be maintained and read by humans afterall. 
*/ 1229 /* Copy Multiplier Col 3 into XMM3 */ 1230 SSE_COPY_PS(XMM3,XMM0) 1231 /* Copy Multiplier Col 2 into XMM2 */ 1232 SSE_COPY_PS(XMM2,XMM6) 1233 /* Copy Multiplier Col 1 into XMM1 */ 1234 SSE_COPY_PS(XMM1,XMM5) 1235 /* Copy Multiplier Col 0 into XMM0 */ 1236 SSE_COPY_PS(XMM0,XMM4) 1237 SSE_INLINE_END_2; 1238 1239 /* Update the row: */ 1240 nz = bi[row+1] - diag_offset[row] - 1; 1241 pv += 16; 1242 for (j=0; j<nz; j++) { 1243 PREFETCH_L1(&pv[16]); 1244 x = rtmp + 16*((unsigned int)pj[j]); 1245 /* x = rtmp + 4*pj[j]; */ 1246 1247 /* X:=X-M*PV, One column at a time */ 1248 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1249 SSE_INLINE_BEGIN_2(x,pv) 1250 /* Load First Column of X*/ 1251 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1252 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1253 1254 /* Matrix-Vector Product: */ 1255 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1256 SSE_SHUFFLE(XMM5,XMM5,0x00) 1257 SSE_MULT_PS(XMM5,XMM0) 1258 SSE_SUB_PS(XMM4,XMM5) 1259 1260 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1261 SSE_SHUFFLE(XMM6,XMM6,0x00) 1262 SSE_MULT_PS(XMM6,XMM1) 1263 SSE_SUB_PS(XMM4,XMM6) 1264 1265 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1266 SSE_SHUFFLE(XMM7,XMM7,0x00) 1267 SSE_MULT_PS(XMM7,XMM2) 1268 SSE_SUB_PS(XMM4,XMM7) 1269 1270 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1271 SSE_SHUFFLE(XMM5,XMM5,0x00) 1272 SSE_MULT_PS(XMM5,XMM3) 1273 SSE_SUB_PS(XMM4,XMM5) 1274 1275 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1276 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1277 1278 /* Second Column */ 1279 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1280 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1281 1282 /* Matrix-Vector Product: */ 1283 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1284 SSE_SHUFFLE(XMM6,XMM6,0x00) 1285 SSE_MULT_PS(XMM6,XMM0) 1286 SSE_SUB_PS(XMM5,XMM6) 1287 1288 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1289 SSE_SHUFFLE(XMM7,XMM7,0x00) 1290 SSE_MULT_PS(XMM7,XMM1) 1291 SSE_SUB_PS(XMM5,XMM7) 1292 1293 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1294 SSE_SHUFFLE(XMM4,XMM4,0x00) 1295 SSE_MULT_PS(XMM4,XMM2) 1296 SSE_SUB_PS(XMM5,XMM4) 1297 1298 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1299 SSE_SHUFFLE(XMM6,XMM6,0x00) 1300 SSE_MULT_PS(XMM6,XMM3) 1301 SSE_SUB_PS(XMM5,XMM6) 1302 1303 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1304 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1305 1306 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1307 1308 /* Third Column */ 1309 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1310 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1311 1312 /* Matrix-Vector Product: */ 1313 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1314 SSE_SHUFFLE(XMM7,XMM7,0x00) 1315 SSE_MULT_PS(XMM7,XMM0) 1316 SSE_SUB_PS(XMM6,XMM7) 1317 1318 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1319 SSE_SHUFFLE(XMM4,XMM4,0x00) 1320 SSE_MULT_PS(XMM4,XMM1) 1321 SSE_SUB_PS(XMM6,XMM4) 1322 1323 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1324 SSE_SHUFFLE(XMM5,XMM5,0x00) 1325 SSE_MULT_PS(XMM5,XMM2) 1326 SSE_SUB_PS(XMM6,XMM5) 1327 1328 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1329 SSE_SHUFFLE(XMM7,XMM7,0x00) 1330 SSE_MULT_PS(XMM7,XMM3) 1331 SSE_SUB_PS(XMM6,XMM7) 1332 1333 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1334 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1335 1336 /* Fourth Column */ 1337 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1338 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1339 1340 /* Matrix-Vector Product: */ 1341 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1342 SSE_SHUFFLE(XMM5,XMM5,0x00) 1343 SSE_MULT_PS(XMM5,XMM0) 1344 SSE_SUB_PS(XMM4,XMM5) 1345 1346 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1347 SSE_SHUFFLE(XMM6,XMM6,0x00) 1348 SSE_MULT_PS(XMM6,XMM1) 1349 SSE_SUB_PS(XMM4,XMM6) 1350 1351 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1352 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1353 SSE_MULT_PS(XMM7,XMM2) 1354 SSE_SUB_PS(XMM4,XMM7) 1355 1356 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1357 SSE_SHUFFLE(XMM5,XMM5,0x00) 1358 SSE_MULT_PS(XMM5,XMM3) 1359 SSE_SUB_PS(XMM4,XMM5) 1360 1361 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1362 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1363 SSE_INLINE_END_2; 1364 pv += 16; 1365 } 1366 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1367 } 1368 row = (unsigned int)(*bjtmp++); 1369 /* row = (*bjtmp++)/4; */ 1370 /* bjtmp++; */ 1371 } 1372 /* finished row so stick it into b->a */ 1373 pv = ba + 16*bi[i]; 1374 pj = bj + bi[i]; 1375 nz = bi[i+1] - bi[i]; 1376 1377 /* Copy x block back into pv block */ 1378 for (j=0; j<nz; j++) { 1379 x = rtmp+16*((unsigned int)pj[j]); 1380 /* x = rtmp+4*pj[j]; */ 1381 1382 SSE_INLINE_BEGIN_2(x,pv) 1383 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1384 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1385 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1386 1387 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1388 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1389 1390 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1391 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1392 1393 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1394 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1395 1396 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1397 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1398 1399 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1400 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1401 1402 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1403 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1404 1405 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1406 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1407 SSE_INLINE_END_2; 1408 pv += 16; 1409 } 1410 /* invert diagonal block */ 1411 w = ba + 16*diag_offset[i]; 1412 if (pivotinblocks) { 1413 ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1414 } else { 1415 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1416 } 1417 /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1418 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1419 } 1420 1421 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1422 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1423 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1424 C->assembled = PETSC_TRUE; 1425 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1426 /* Flop Count from inverting diagonal blocks */ 1427 SSE_SCOPE_END; 1428 PetscFunctionReturn(0); 1429 } 1430 1431 #undef __FUNCT__ 1432 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj" 1433 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info) 1434 { 1435 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data; 1436 PetscErrorCode ierr; 1437 int i,j,n = a->mbs; 1438 unsigned short *bj = (unsigned short *)(b->j),*bjtmp,*pj; 1439 unsigned int row; 1440 int *ajtmpold,nz,*bi=b->i; 1441 int *diag_offset = b->diag,*ai=a->i,*aj=a->j; 1442 MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1443 MatScalar *ba = b->a,*aa = a->a; 1444 int nonzero=0; 1445 /* int nonzero=0,colscale = 16; */ 1446 PetscBool pivotinblocks = b->pivotinblocks; 1447 PetscReal shift = info->shiftamount; 1448 1449 PetscFunctionBegin; 1450 SSE_SCOPE_BEGIN; 1451 1452 if ((unsigned long)aa%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work."); 1453 if ((unsigned long)ba%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. 
SSE will not work."); 1454 ierr = PetscMalloc(16*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1455 if ((unsigned long)rtmp%16!=0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work."); 1456 /* if ((unsigned long)bj==(unsigned long)aj) { */ 1457 /* colscale = 4; */ 1458 /* } */ 1459 if ((unsigned long)bj==(unsigned long)aj) { 1460 return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C)); 1461 } 1462 1463 for (i=0; i<n; i++) { 1464 nz = bi[i+1] - bi[i]; 1465 bjtmp = bj + bi[i]; 1466 /* zero out the 4x4 block accumulators */ 1467 /* zero out one register */ 1468 XOR_PS(XMM7,XMM7); 1469 for (j=0; j<nz; j++) { 1470 x = rtmp+16*((unsigned int)bjtmp[j]); 1471 /* x = rtmp+4*bjtmp[j]; */ 1472 SSE_INLINE_BEGIN_1(x) 1473 /* Copy zero register to memory locations */ 1474 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1475 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7) 1476 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7) 1477 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7) 1478 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7) 1479 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7) 1480 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7) 1481 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1482 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7) 1483 SSE_INLINE_END_1; 1484 } 1485 /* load in initial (unfactored row) */ 1486 nz = ai[i+1] - ai[i]; 1487 ajtmpold = aj + ai[i]; 1488 v = aa + 16*ai[i]; 1489 for (j=0; j<nz; j++) { 1490 x = rtmp+16*ajtmpold[j]; 1491 /* x = rtmp+colscale*ajtmpold[j]; */ 1492 /* Copy v block into x block */ 1493 SSE_INLINE_BEGIN_2(v,x) 1494 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1495 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1496 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0) 1497 1498 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1) 1499 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1) 1500 1501 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2) 1502 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2) 1503 1504 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3) 1505 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3) 1506 1507 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4) 1508 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4) 1509 1510 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5) 1511 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5) 1512 1513 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6) 1514 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6) 1515 1516 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1517 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1518 SSE_INLINE_END_2; 1519 1520 v += 16; 1521 } 1522 /* row = (*bjtmp++)/4; */ 1523 row = (unsigned int)(*bjtmp++); 1524 while (row < i) { 1525 pc = rtmp + 16*row; 1526 SSE_INLINE_BEGIN_1(pc) 1527 /* Load block from lower triangle */ 1528 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1529 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0) 1530 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0) 1531 1532 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1) 1533 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1) 1534 1535 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2) 1536 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2) 1537 1538 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3) 1539 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3) 1540 1541 /* Compare block to zero block */ 1542 1543 SSE_COPY_PS(XMM4,XMM7) 1544 SSE_CMPNEQ_PS(XMM4,XMM0) 1545 1546 SSE_COPY_PS(XMM5,XMM7) 1547 SSE_CMPNEQ_PS(XMM5,XMM1) 1548 1549 SSE_COPY_PS(XMM6,XMM7) 1550 SSE_CMPNEQ_PS(XMM6,XMM2) 1551 1552 SSE_CMPNEQ_PS(XMM7,XMM3) 1553 1554 /* Reduce the comparisons to one SSE register */ 1555 SSE_OR_PS(XMM6,XMM7) 1556 SSE_OR_PS(XMM5,XMM4) 1557 SSE_OR_PS(XMM5,XMM6) 1558 SSE_INLINE_END_1; 1559 1560 /* Reduce the one SSE register to an 
integer register for branching */ 1561 /* Note: Since nonzero is an int, there is no INLINE block version of this call */ 1562 MOVEMASK(nonzero,XMM5); 1563 1564 /* If block is nonzero ... */ 1565 if (nonzero) { 1566 pv = ba + 16*diag_offset[row]; 1567 PREFETCH_L1(&pv[16]); 1568 pj = bj + diag_offset[row] + 1; 1569 1570 /* Form Multiplier, one column at a time (Matrix-Matrix Product) */ 1571 /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */ 1572 /* but the diagonal was inverted already */ 1573 /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */ 1574 1575 SSE_INLINE_BEGIN_2(pv,pc) 1576 /* Column 0, product is accumulated in XMM4 */ 1577 SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4) 1578 SSE_SHUFFLE(XMM4,XMM4,0x00) 1579 SSE_MULT_PS(XMM4,XMM0) 1580 1581 SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5) 1582 SSE_SHUFFLE(XMM5,XMM5,0x00) 1583 SSE_MULT_PS(XMM5,XMM1) 1584 SSE_ADD_PS(XMM4,XMM5) 1585 1586 SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6) 1587 SSE_SHUFFLE(XMM6,XMM6,0x00) 1588 SSE_MULT_PS(XMM6,XMM2) 1589 SSE_ADD_PS(XMM4,XMM6) 1590 1591 SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7) 1592 SSE_SHUFFLE(XMM7,XMM7,0x00) 1593 SSE_MULT_PS(XMM7,XMM3) 1594 SSE_ADD_PS(XMM4,XMM7) 1595 1596 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4) 1597 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4) 1598 1599 /* Column 1, product is accumulated in XMM5 */ 1600 SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5) 1601 SSE_SHUFFLE(XMM5,XMM5,0x00) 1602 SSE_MULT_PS(XMM5,XMM0) 1603 1604 SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6) 1605 SSE_SHUFFLE(XMM6,XMM6,0x00) 1606 SSE_MULT_PS(XMM6,XMM1) 1607 SSE_ADD_PS(XMM5,XMM6) 1608 1609 SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7) 1610 SSE_SHUFFLE(XMM7,XMM7,0x00) 1611 SSE_MULT_PS(XMM7,XMM2) 1612 SSE_ADD_PS(XMM5,XMM7) 1613 1614 SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6) 1615 SSE_SHUFFLE(XMM6,XMM6,0x00) 1616 SSE_MULT_PS(XMM6,XMM3) 1617 SSE_ADD_PS(XMM5,XMM6) 1618 1619 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5) 1620 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5) 1621 1622 SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24) 1623 1624 /* Column 2, product is accumulated in XMM6 */ 1625 SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6) 1626 SSE_SHUFFLE(XMM6,XMM6,0x00) 1627 SSE_MULT_PS(XMM6,XMM0) 1628 1629 SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7) 1630 SSE_SHUFFLE(XMM7,XMM7,0x00) 1631 SSE_MULT_PS(XMM7,XMM1) 1632 SSE_ADD_PS(XMM6,XMM7) 1633 1634 SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7) 1635 SSE_SHUFFLE(XMM7,XMM7,0x00) 1636 SSE_MULT_PS(XMM7,XMM2) 1637 SSE_ADD_PS(XMM6,XMM7) 1638 1639 SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7) 1640 SSE_SHUFFLE(XMM7,XMM7,0x00) 1641 SSE_MULT_PS(XMM7,XMM3) 1642 SSE_ADD_PS(XMM6,XMM7) 1643 1644 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6) 1645 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1646 1647 /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */ 1648 /* Column 3, product is accumulated in XMM0 */ 1649 SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7) 1650 SSE_SHUFFLE(XMM7,XMM7,0x00) 1651 SSE_MULT_PS(XMM0,XMM7) 1652 1653 SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7) 1654 SSE_SHUFFLE(XMM7,XMM7,0x00) 1655 SSE_MULT_PS(XMM1,XMM7) 1656 SSE_ADD_PS(XMM0,XMM1) 1657 1658 SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1) 1659 SSE_SHUFFLE(XMM1,XMM1,0x00) 1660 SSE_MULT_PS(XMM1,XMM2) 1661 SSE_ADD_PS(XMM0,XMM1) 1662 1663 SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7) 1664 SSE_SHUFFLE(XMM7,XMM7,0x00) 1665 SSE_MULT_PS(XMM3,XMM7) 1666 SSE_ADD_PS(XMM0,XMM3) 1667 1668 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0) 1669 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1670 1671 /* Simplify Bookkeeping -- Completely Unnecessary Instructions */ 1672 /* This is code to be maintained and read by humans afterall. 
*/ 1673 /* Copy Multiplier Col 3 into XMM3 */ 1674 SSE_COPY_PS(XMM3,XMM0) 1675 /* Copy Multiplier Col 2 into XMM2 */ 1676 SSE_COPY_PS(XMM2,XMM6) 1677 /* Copy Multiplier Col 1 into XMM1 */ 1678 SSE_COPY_PS(XMM1,XMM5) 1679 /* Copy Multiplier Col 0 into XMM0 */ 1680 SSE_COPY_PS(XMM0,XMM4) 1681 SSE_INLINE_END_2; 1682 1683 /* Update the row: */ 1684 nz = bi[row+1] - diag_offset[row] - 1; 1685 pv += 16; 1686 for (j=0; j<nz; j++) { 1687 PREFETCH_L1(&pv[16]); 1688 x = rtmp + 16*((unsigned int)pj[j]); 1689 /* x = rtmp + 4*pj[j]; */ 1690 1691 /* X:=X-M*PV, One column at a time */ 1692 /* Note: M is already loaded columnwise into registers XMM0-XMM3 */ 1693 SSE_INLINE_BEGIN_2(x,pv) 1694 /* Load First Column of X*/ 1695 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1696 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1697 1698 /* Matrix-Vector Product: */ 1699 SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5) 1700 SSE_SHUFFLE(XMM5,XMM5,0x00) 1701 SSE_MULT_PS(XMM5,XMM0) 1702 SSE_SUB_PS(XMM4,XMM5) 1703 1704 SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6) 1705 SSE_SHUFFLE(XMM6,XMM6,0x00) 1706 SSE_MULT_PS(XMM6,XMM1) 1707 SSE_SUB_PS(XMM4,XMM6) 1708 1709 SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7) 1710 SSE_SHUFFLE(XMM7,XMM7,0x00) 1711 SSE_MULT_PS(XMM7,XMM2) 1712 SSE_SUB_PS(XMM4,XMM7) 1713 1714 SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5) 1715 SSE_SHUFFLE(XMM5,XMM5,0x00) 1716 SSE_MULT_PS(XMM5,XMM3) 1717 SSE_SUB_PS(XMM4,XMM5) 1718 1719 SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4) 1720 SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4) 1721 1722 /* Second Column */ 1723 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1724 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1725 1726 /* Matrix-Vector Product: */ 1727 SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6) 1728 SSE_SHUFFLE(XMM6,XMM6,0x00) 1729 SSE_MULT_PS(XMM6,XMM0) 1730 SSE_SUB_PS(XMM5,XMM6) 1731 1732 SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7) 1733 SSE_SHUFFLE(XMM7,XMM7,0x00) 1734 SSE_MULT_PS(XMM7,XMM1) 1735 SSE_SUB_PS(XMM5,XMM7) 1736 1737 SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4) 1738 SSE_SHUFFLE(XMM4,XMM4,0x00) 1739 SSE_MULT_PS(XMM4,XMM2) 1740 SSE_SUB_PS(XMM5,XMM4) 1741 1742 SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6) 1743 SSE_SHUFFLE(XMM6,XMM6,0x00) 1744 SSE_MULT_PS(XMM6,XMM3) 1745 SSE_SUB_PS(XMM5,XMM6) 1746 1747 SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5) 1748 SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5) 1749 1750 SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24) 1751 1752 /* Third Column */ 1753 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1754 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1755 1756 /* Matrix-Vector Product: */ 1757 SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7) 1758 SSE_SHUFFLE(XMM7,XMM7,0x00) 1759 SSE_MULT_PS(XMM7,XMM0) 1760 SSE_SUB_PS(XMM6,XMM7) 1761 1762 SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4) 1763 SSE_SHUFFLE(XMM4,XMM4,0x00) 1764 SSE_MULT_PS(XMM4,XMM1) 1765 SSE_SUB_PS(XMM6,XMM4) 1766 1767 SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5) 1768 SSE_SHUFFLE(XMM5,XMM5,0x00) 1769 SSE_MULT_PS(XMM5,XMM2) 1770 SSE_SUB_PS(XMM6,XMM5) 1771 1772 SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7) 1773 SSE_SHUFFLE(XMM7,XMM7,0x00) 1774 SSE_MULT_PS(XMM7,XMM3) 1775 SSE_SUB_PS(XMM6,XMM7) 1776 1777 SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6) 1778 SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1779 1780 /* Fourth Column */ 1781 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1782 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1783 1784 /* Matrix-Vector Product: */ 1785 SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5) 1786 SSE_SHUFFLE(XMM5,XMM5,0x00) 1787 SSE_MULT_PS(XMM5,XMM0) 1788 SSE_SUB_PS(XMM4,XMM5) 1789 1790 SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6) 1791 SSE_SHUFFLE(XMM6,XMM6,0x00) 1792 SSE_MULT_PS(XMM6,XMM1) 1793 SSE_SUB_PS(XMM4,XMM6) 1794 1795 SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7) 1796 
SSE_SHUFFLE(XMM7,XMM7,0x00) 1797 SSE_MULT_PS(XMM7,XMM2) 1798 SSE_SUB_PS(XMM4,XMM7) 1799 1800 SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5) 1801 SSE_SHUFFLE(XMM5,XMM5,0x00) 1802 SSE_MULT_PS(XMM5,XMM3) 1803 SSE_SUB_PS(XMM4,XMM5) 1804 1805 SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4) 1806 SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4) 1807 SSE_INLINE_END_2; 1808 pv += 16; 1809 } 1810 ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); 1811 } 1812 row = (unsigned int)(*bjtmp++); 1813 /* row = (*bjtmp++)/4; */ 1814 /* bjtmp++; */ 1815 } 1816 /* finished row so stick it into b->a */ 1817 pv = ba + 16*bi[i]; 1818 pj = bj + bi[i]; 1819 nz = bi[i+1] - bi[i]; 1820 1821 /* Copy x block back into pv block */ 1822 for (j=0; j<nz; j++) { 1823 x = rtmp+16*((unsigned int)pj[j]); 1824 /* x = rtmp+4*pj[j]; */ 1825 1826 SSE_INLINE_BEGIN_2(x,pv) 1827 /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */ 1828 SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1) 1829 SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1) 1830 1831 SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2) 1832 SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2) 1833 1834 SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3) 1835 SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3) 1836 1837 SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4) 1838 SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4) 1839 1840 SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5) 1841 SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5) 1842 1843 SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6) 1844 SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6) 1845 1846 SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7) 1847 SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7) 1848 1849 SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0) 1850 SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0) 1851 SSE_INLINE_END_2; 1852 pv += 16; 1853 } 1854 /* invert diagonal block */ 1855 w = ba + 16*diag_offset[i]; 1856 if (pivotinblocks) { 1857 ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr); 1858 } else { 1859 ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr); 1860 } 1861 /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */ 1862 /* Note: Using Kramer's rule, flop count below might be infairly high or low? */ 1863 } 1864 1865 ierr = PetscFree(rtmp);CHKERRQ(ierr); 1866 C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE; 1867 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE; 1868 C->assembled = PETSC_TRUE; 1869 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); 1870 /* Flop Count from inverting diagonal blocks */ 1871 SSE_SCOPE_END; 1872 PetscFunctionReturn(0); 1873 } 1874 1875 #endif 1876