/*
    Factorization code for BAIJ format.
*/
#include <../src/mat/impls/baij/seq/baij.h>
#include <petsc/private/kernels/blockinvert.h>
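
/*
   These kernels are dispatched through the generic PETSc factorization
   interface; they are not called directly.  As a minimal sketch (not taken
   from this file), a block-size-4 SeqBAIJ matrix A would typically reach
   them via:

     Mat           F;
     IS            isrow,iscol;
     MatFactorInfo info;

     ierr = MatGetOrdering(A,MATORDERINGND,&isrow,&iscol);CHKERRQ(ierr);
     ierr = MatGetFactor(A,MATSOLVERPETSC,MAT_FACTOR_LU,&F);CHKERRQ(ierr);
     ierr = MatFactorInfoInitialize(&info);CHKERRQ(ierr);
     ierr = MatLUFactorSymbolic(F,A,isrow,iscol,&info);CHKERRQ(ierr);
     ierr = MatLUFactorNumeric(F,A,&info);CHKERRQ(ierr);

   The symbolic phase selects one of the numeric routines below based on the
   block size and the ordering.
*/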

/* ------------------------------------------------------------*/
/*
   Version for when blocks are 4 by 4
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,idx,*ai = a->i,*aj = a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    idx      = r[i];
    nz       = ai[idx+1] - ai[idx];
    ajtmpold = aj + ai[idx];
    v        = aa + 16*ai[idx];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ic[ajtmpold[j]];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8] = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9 = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 ||
          p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;

        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x   = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
   MatLUFactorNumeric_SeqBAIJ_4 -
     copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
       PetscKernel_A_gets_A_times_B()
       PetscKernel_A_gets_A_minus_B_times_C()
       PetscKernel_A_gets_inverse_A()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  IS             isrow = b->row,isicol = b->icol;
  PetscErrorCode ierr;
  const PetscInt *r,*ic;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*bi = b->i,*bj = b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag = b->diag,*pj,bs2 = a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa = a->a;
  PetscInt       flg;
  PetscReal      shift;

  PetscFunctionBegin;
  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }
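
  /* With MAT_SHIFT_INBLOCKS, shift is handed to PetscKernel_A_gets_inverse_A_4()
     below, which is intended to perturb an otherwise (near-)zero pivot while
     inverting a 4x4 diagonal block so the factorization can proceed. */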

  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[r[i]+1] - ai[r[i]];
    ajtmp = aj + ai[r[i]];
    v     = aa + bs2*ai[r[i]];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }

  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}
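
/*
   For reference, a plain-C sketch of what the unrolled 4x4 kernels used above
   compute on column-major blocks (W, r, c, and t are illustrative names only;
   the actual kernels live in petsc/private/kernels/blockinvert.h):

     PetscKernel_A_gets_A_times_B_4(A,B,W):          A <- A*B, W is scratch
       for (c=0; c<4; c++)
         for (r=0; r<4; r++) {
           W[4*c+r] = 0.0;
           for (t=0; t<4; t++) W[4*c+r] += A[4*t+r]*B[4*c+t];
         }
       then copy W back into A

     PetscKernel_A_gets_A_minus_B_times_C_4(A,B,C):  A <- A - B*C
       for (c=0; c<4; c++)
         for (r=0; r<4; r++)
           for (t=0; t<4; t++) A[4*c+r] -= B[4*t+r]*C[4*c+t];
*/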

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo *info)
{
/*
     Default version for when blocks are 4 by 4, using natural ordering
*/
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,n = a->mbs,*bi = b->i,*bj = b->j;
  PetscInt       *ajtmpold,*ajtmp,nz,row;
  PetscInt       *diag_offset = b->diag,*ai = a->i,*aj = a->j,*pj;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      p1,p2,p3,p4,m1,m2,m3,m4,m5,m6,m7,m8,m9,x1,x2,x3,x4;
  MatScalar      p5,p6,p7,p8,p9,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,x16;
  MatScalar      p10,p11,p12,p13,p14,p15,p16,m10,m11,m12;
  MatScalar      m13,m14,m15,m16;
  MatScalar      *ba = b->a,*aa = a->a;
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    ajtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmp[j];
      x[0]  = x[1]  = x[2]  = x[3]  = x[4]  = x[5] = x[6] = x[7] = x[8] = x[9] = 0.0;
      x[10] = x[11] = x[12] = x[13] = x[14] = x[15] = 0.0;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x     = rtmp+16*ajtmpold[j];
      x[0]  = v[0];  x[1]  = v[1];  x[2]  = v[2];  x[3]  = v[3];
      x[4]  = v[4];  x[5]  = v[5];  x[6]  = v[6];  x[7]  = v[7];  x[8] = v[8];
      x[9]  = v[9];  x[10] = v[10]; x[11] = v[11]; x[12] = v[12]; x[13] = v[13];
      x[14] = v[14]; x[15] = v[15];
      v    += 16;
    }
    row = *ajtmp++;
    while (row < i) {
      pc  = rtmp + 16*row;
      p1  = pc[0];  p2  = pc[1];  p3  = pc[2];  p4  = pc[3];
      p5  = pc[4];  p6  = pc[5];  p7  = pc[6];  p8  = pc[7];  p9 = pc[8];
      p10 = pc[9];  p11 = pc[10]; p12 = pc[11]; p13 = pc[12]; p14 = pc[13];
      p15 = pc[14]; p16 = pc[15];
      if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 ||
          p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0 || p10 != 0.0 ||
          p11 != 0.0 || p12 != 0.0 || p13 != 0.0 || p14 != 0.0 || p15 != 0.0 ||
          p16 != 0.0) {
        pv  = ba + 16*diag_offset[row];
        pj  = bj + diag_offset[row] + 1;
        x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
        x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
        x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12]; x14 = pv[13];
        x15 = pv[14]; x16 = pv[15];
        pc[0] = m1 = p1*x1 + p5*x2 + p9*x3  + p13*x4;
        pc[1] = m2 = p2*x1 + p6*x2 + p10*x3 + p14*x4;
        pc[2] = m3 = p3*x1 + p7*x2 + p11*x3 + p15*x4;
        pc[3] = m4 = p4*x1 + p8*x2 + p12*x3 + p16*x4;

        pc[4] = m5 = p1*x5 + p5*x6 + p9*x7  + p13*x8;
        pc[5] = m6 = p2*x5 + p6*x6 + p10*x7 + p14*x8;
        pc[6] = m7 = p3*x5 + p7*x6 + p11*x7 + p15*x8;
        pc[7] = m8 = p4*x5 + p8*x6 + p12*x7 + p16*x8;

        pc[8]  = m9  = p1*x9 + p5*x10 + p9*x11  + p13*x12;
        pc[9]  = m10 = p2*x9 + p6*x10 + p10*x11 + p14*x12;
        pc[10] = m11 = p3*x9 + p7*x10 + p11*x11 + p15*x12;
        pc[11] = m12 = p4*x9 + p8*x10 + p12*x11 + p16*x12;

        pc[12] = m13 = p1*x13 + p5*x14 + p9*x15  + p13*x16;
        pc[13] = m14 = p2*x13 + p6*x14 + p10*x15 + p14*x16;
        pc[14] = m15 = p3*x13 + p7*x14 + p11*x15 + p15*x16;
        pc[15] = m16 = p4*x13 + p8*x14 + p12*x15 + p16*x16;
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          x1  = pv[0];  x2  = pv[1];  x3  = pv[2];  x4  = pv[3];
          x5  = pv[4];  x6  = pv[5];  x7  = pv[6];  x8  = pv[7];  x9 = pv[8];
          x10 = pv[9];  x11 = pv[10]; x12 = pv[11]; x13 = pv[12];
          x14 = pv[13]; x15 = pv[14]; x16 = pv[15];
          x   = rtmp + 16*pj[j];
          x[0] -= m1*x1 + m5*x2 + m9*x3  + m13*x4;
          x[1] -= m2*x1 + m6*x2 + m10*x3 + m14*x4;
          x[2] -= m3*x1 + m7*x2 + m11*x3 + m15*x4;
          x[3] -= m4*x1 + m8*x2 + m12*x3 + m16*x4;

          x[4] -= m1*x5 + m5*x6 + m9*x7  + m13*x8;
          x[5] -= m2*x5 + m6*x6 + m10*x7 + m14*x8;
          x[6] -= m3*x5 + m7*x6 + m11*x7 + m15*x8;
          x[7] -= m4*x5 + m8*x6 + m12*x7 + m16*x8;

          x[8]  -= m1*x9 + m5*x10 + m9*x11  + m13*x12;
          x[9]  -= m2*x9 + m6*x10 + m10*x11 + m14*x12;
          x[10] -= m3*x9 + m7*x10 + m11*x11 + m15*x12;
          x[11] -= m4*x9 + m8*x10 + m12*x11 + m16*x12;

          x[12] -= m1*x13 + m5*x14 + m9*x15  + m13*x16;
          x[13] -= m2*x13 + m6*x14 + m10*x15 + m14*x16;
          x[14] -= m3*x13 + m7*x14 + m11*x15 + m15*x16;
          x[15] -= m4*x13 + m8*x14 + m12*x15 + m16*x16;

          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *ajtmp++;
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      x      = rtmp+16*pj[j];
      pv[0]  = x[0];  pv[1]  = x[1];  pv[2]  = x[2];  pv[3]  = x[3];
      pv[4]  = x[4];  pv[5]  = x[5];  pv[6]  = x[6];  pv[7]  = x[7];  pv[8] = x[8];
      pv[9]  = x[9];  pv[10] = x[10]; pv[11] = x[11]; pv[12] = x[12];
      pv[13] = x[13]; pv[14] = x[14]; pv[15] = x[15];
      pv    += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_inplace;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

/*
   MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering -
     copied from MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace()
*/
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  PetscInt       i,j,k,nz,nzL,row;
  const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*bi = b->i,*bj = b->j;
  const PetscInt *ajtmp,*bjtmp,*bdiag = b->diag,*pj,bs2 = a->bs2;
  MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa = a->a;
  PetscInt       flg;
  PetscReal      shift;

  PetscFunctionBegin;
  /* generate work space needed by the factorization */
  ierr = PetscMalloc2(bs2*n,&rtmp,bs2,&mwork);CHKERRQ(ierr);
  ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);

  if (info->shifttype == MAT_SHIFT_NONE) {
    shift = 0;
  } else { /* info->shifttype == MAT_SHIFT_INBLOCKS */
    shift = info->shiftamount;
  }

  for (i=0; i<n; i++) {
    /* zero rtmp */
    /* L part */
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* U part */
    nz    = bdiag[i] - bdiag[i+1];
    bjtmp = bj + bdiag[i+1]+1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + bs2*ai[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* elimination */
    bjtmp = bj + bi[i];
    nzL   = bi[i+1] - bi[i];
    for (k=0; k<nzL; k++) {
      row = bjtmp[k];
      pc  = rtmp + bs2*row;
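      /* scan the 4x4 multiplier block; if it is entirely zero, the update of
         this row by pivot row "row" can be skipped */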
      for (flg=0,j=0; j<bs2; j++) {
        if (pc[j] != 0.0) {
          flg = 1;
          break;
        }
      }
      if (flg) {
        pv = b->a + bs2*bdiag[row];
        /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
        ierr = PetscKernel_A_gets_A_times_B_4(pc,pv,mwork);CHKERRQ(ierr);

        pj = b->j + bdiag[row+1]+1;         /* beginning of U(row,:) */
        pv = b->a + bs2*(bdiag[row+1]+1);
        nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
        for (j=0; j<nz; j++) {
          /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
          /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
          v    = rtmp + bs2*pj[j];
          ierr = PetscKernel_A_gets_A_minus_B_times_C_4(v,pc,pv);CHKERRQ(ierr);
          pv  += bs2;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2 */
      }
    }

    /* finished row so stick it into b->a */
    /* L part */
    pv = b->a + bs2*bi[i];
    pj = b->j + bi[i];
    nz = bi[i+1] - bi[i];
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }

    /* Mark diagonal and invert diagonal for simpler triangular solves */
    pv   = b->a + bs2*bdiag[i];
    pj   = b->j + bdiag[i];
    ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    ierr = PetscKernel_A_gets_inverse_A_4(pv,shift);CHKERRQ(ierr);

    /* U part */
    pv = b->a + bs2*(bdiag[i+1]+1);
    pj = b->j + bdiag[i+1]+1;
    nz = bdiag[i] - bdiag[i+1] - 1;
    for (j=0; j<nz; j++) {
      ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
    }
  }
  ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE
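
/*
   The kernels below use packed single-precision SSE operations (the _PS
   macros), so they assume a PETSc build in which MatScalar is a 4-byte float;
   all block pointers must be 16-byte aligned, which is checked on entry.
   PETSC_HAVE_SSE expands to the header that defines the SSE macro layer.
*/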

/* SSE version for when blocks are 4 by 4, using natural ordering */
#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE(Mat B,Mat A,const MatFactorInfo *info)
{
  Mat            C = B;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  int            *bj = b->j,*bjtmp,*pj;
  int            row;
  int            *ajtmpold,nz,*bi = b->i;
  int            *diag_offset = b->diag,*ai = a->i,*aj = a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero = 0;
  /* int            nonzero=0,colscale = 16; */
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16 != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16 != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16 != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*bjtmp[j];
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = *bjtmp++;
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*pj[j];
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = *bjtmp++;
      /* row = (*bjtmp++)/4; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*pj[j];
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
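
/*
   The "usj" variants below reinterpret the column-index arrays as unsigned
   short (hence the name), halving index storage traffic during the
   factorization; this is only valid while the number of block rows/columns
   fits in 16 bits.
*/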

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(Mat C)
{
  Mat            A = C;
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned short *aj = (unsigned short*)(a->j),*ajtmp;
  unsigned int   row;
  int            nz,*bi = b->i;
  int            *diag_offset = b->diag,*ai = a->i;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero = 0;
  /* int            nonzero=0,colscale = 16; */
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = 0.0; /* no MatFactorInfo is passed to this in-place variant, so no shift is applied */

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16 != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16 != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16 != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz    = ai[i+1] - ai[i];
    ajtmp = aj + ai[i];
    v     = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)ajtmp[j]);
      /* x = rtmp+colscale*ajtmp[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}
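
/*
   Out-of-place unsigned-short-index variant; when the factor shares its
   column-index array with the original matrix (bj == aj), it defers to the
   in-place routine above.
*/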

#undef __FUNCT__
#define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj"
PetscErrorCode MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat C,Mat A,const MatFactorInfo *info)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)C->data;
  PetscErrorCode ierr;
  int            i,j,n = a->mbs;
  unsigned short *bj = (unsigned short*)(b->j),*bjtmp,*pj;
  unsigned int   row;
  int            *ajtmpold,nz,*bi = b->i;
  int            *diag_offset = b->diag,*ai = a->i,*aj = a->j;
  MatScalar      *pv,*v,*rtmp,*pc,*w,*x;
  MatScalar      *ba = b->a,*aa = a->a;
  int            nonzero = 0;
  /* int            nonzero=0,colscale = 16; */
  PetscBool      pivotinblocks = b->pivotinblocks;
  PetscReal      shift = info->shiftamount;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

  if ((unsigned long)aa%16 != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer aa is not 16 byte aligned. SSE will not work.");
  if ((unsigned long)ba%16 != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer ba is not 16 byte aligned. SSE will not work.");
  ierr = PetscMalloc1(16*(n+1),&rtmp);CHKERRQ(ierr);
  if ((unsigned long)rtmp%16 != 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_BADPTR,"Pointer rtmp is not 16 byte aligned. SSE will not work.");
  /* if ((unsigned long)bj==(unsigned long)aj) { */
  /*   colscale = 4; */
  /* } */
  if ((unsigned long)bj == (unsigned long)aj) {
    return(MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj_Inplace(C));
  }

  for (i=0; i<n; i++) {
    nz    = bi[i+1] - bi[i];
    bjtmp = bj + bi[i];
    /* zero out the 4x4 block accumulators */
    /* zero out one register */
    XOR_PS(XMM7,XMM7);
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)bjtmp[j]);
      /* x = rtmp+4*bjtmp[j]; */
      SSE_INLINE_BEGIN_1(x)
      /* Copy zero register to memory locations */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM7)
      SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM7)
      SSE_INLINE_END_1;
    }
    /* load in initial (unfactored row) */
    nz       = ai[i+1] - ai[i];
    ajtmpold = aj + ai[i];
    v        = aa + 16*ai[i];
    for (j=0; j<nz; j++) {
      x = rtmp+16*ajtmpold[j];
      /* x = rtmp+colscale*ajtmpold[j]; */
      /* Copy v block into x block */
      SSE_INLINE_BEGIN_2(v,x)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM0)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM1)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM2)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM2)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM3)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM3)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM4)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM4)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM5)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM5)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM6)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM6)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;

      v += 16;
    }
    /* row = (*bjtmp++)/4; */
    row = (unsigned int)(*bjtmp++);
    while (row < i) {
      pc = rtmp + 16*row;
      SSE_INLINE_BEGIN_1(pc)
      /* Load block from lower triangle */
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM0)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM0)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM1)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM1)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM2)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM3)
      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM3)

      /* Compare block to zero block */

      SSE_COPY_PS(XMM4,XMM7)
      SSE_CMPNEQ_PS(XMM4,XMM0)

      SSE_COPY_PS(XMM5,XMM7)
      SSE_CMPNEQ_PS(XMM5,XMM1)

      SSE_COPY_PS(XMM6,XMM7)
      SSE_CMPNEQ_PS(XMM6,XMM2)

      SSE_CMPNEQ_PS(XMM7,XMM3)

      /* Reduce the comparisons to one SSE register */
      SSE_OR_PS(XMM6,XMM7)
      SSE_OR_PS(XMM5,XMM4)
      SSE_OR_PS(XMM5,XMM6)
      SSE_INLINE_END_1;

      /* Reduce the one SSE register to an integer register for branching */
      /* Note: Since nonzero is an int, there is no INLINE block version of this call */
      MOVEMASK(nonzero,XMM5);

      /* If block is nonzero ... */
      if (nonzero) {
        pv = ba + 16*diag_offset[row];
        PREFETCH_L1(&pv[16]);
        pj = bj + diag_offset[row] + 1;

        /* Form Multiplier, one column at a time (Matrix-Matrix Product) */
        /* L_ij^(k+1) = L_ij^(k)*inv(L_jj^(k)) */
        /* but the diagonal was inverted already */
        /* and, L_ij^(k) is already loaded into registers XMM0-XMM3 columnwise */

        SSE_INLINE_BEGIN_2(pv,pc)
        /* Column 0, product is accumulated in XMM4 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_0,XMM4)
        SSE_SHUFFLE(XMM4,XMM4,0x00)
        SSE_MULT_PS(XMM4,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_1,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM1)
        SSE_ADD_PS(XMM4,XMM5)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_2,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM2)
        SSE_ADD_PS(XMM4,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_3,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM4,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM4)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM4)

        /* Column 1, product is accumulated in XMM5 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_4,XMM5)
        SSE_SHUFFLE(XMM5,XMM5,0x00)
        SSE_MULT_PS(XMM5,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_5,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM1)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_6,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM5,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_7,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM3)
        SSE_ADD_PS(XMM5,XMM6)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM5)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM5)

        SSE_PREFETCH_L1(SSE_ARG_1,FLOAT_24)

        /* Column 2, product is accumulated in XMM6 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_8,XMM6)
        SSE_SHUFFLE(XMM6,XMM6,0x00)
        SSE_MULT_PS(XMM6,XMM0)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_9,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM1)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_10,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM2)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_11,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM7,XMM3)
        SSE_ADD_PS(XMM6,XMM7)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM6)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

        /* Note: For the last column, we no longer need to preserve XMM0->XMM3 */
        /* Column 3, product is accumulated in XMM0 */
        SSE_LOAD_SS(SSE_ARG_1,FLOAT_12,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM0,XMM7)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_13,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM1,XMM7)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_14,XMM1)
        SSE_SHUFFLE(XMM1,XMM1,0x00)
        SSE_MULT_PS(XMM1,XMM2)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_LOAD_SS(SSE_ARG_1,FLOAT_15,XMM7)
        SSE_SHUFFLE(XMM7,XMM7,0x00)
        SSE_MULT_PS(XMM3,XMM7)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM0)
        SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)

        /* Simplify Bookkeeping -- Completely Unnecessary Instructions */
        /* This is code to be maintained and read by humans after all. */
        /* Copy Multiplier Col 3 into XMM3 */
        SSE_COPY_PS(XMM3,XMM0)
        /* Copy Multiplier Col 2 into XMM2 */
        SSE_COPY_PS(XMM2,XMM6)
        /* Copy Multiplier Col 1 into XMM1 */
        SSE_COPY_PS(XMM1,XMM5)
        /* Copy Multiplier Col 0 into XMM0 */
        SSE_COPY_PS(XMM0,XMM4)
        SSE_INLINE_END_2;

        /* Update the row: */
        nz  = bi[row+1] - diag_offset[row] - 1;
        pv += 16;
        for (j=0; j<nz; j++) {
          PREFETCH_L1(&pv[16]);
          x = rtmp + 16*((unsigned int)pj[j]);
          /* x = rtmp + 4*pj[j]; */

          /* X:=X-M*PV, One column at a time */
          /* Note: M is already loaded columnwise into registers XMM0-XMM3 */
          SSE_INLINE_BEGIN_2(x,pv)
          /* Load First Column of X */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_0,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_1,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_2,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_3,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_0,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_2,XMM4)

          /* Second Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_4,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM0)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_5,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM1)
          SSE_SUB_PS(XMM5,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_6,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM2)
          SSE_SUB_PS(XMM5,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_7,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM3)
          SSE_SUB_PS(XMM5,XMM6)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_4,XMM5)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_6,XMM5)

          SSE_PREFETCH_L1(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_8,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM0)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_9,XMM4)
          SSE_SHUFFLE(XMM4,XMM4,0x00)
          SSE_MULT_PS(XMM4,XMM1)
          SSE_SUB_PS(XMM6,XMM4)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_10,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM2)
          SSE_SUB_PS(XMM6,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_11,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM3)
          SSE_SUB_PS(XMM6,XMM7)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_8,XMM6)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_10,XMM6)

          /* Fourth Column */
          SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM4)

          /* Matrix-Vector Product: */
          SSE_LOAD_SS(SSE_ARG_2,FLOAT_12,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM0)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_13,XMM6)
          SSE_SHUFFLE(XMM6,XMM6,0x00)
          SSE_MULT_PS(XMM6,XMM1)
          SSE_SUB_PS(XMM4,XMM6)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_14,XMM7)
          SSE_SHUFFLE(XMM7,XMM7,0x00)
          SSE_MULT_PS(XMM7,XMM2)
          SSE_SUB_PS(XMM4,XMM7)

          SSE_LOAD_SS(SSE_ARG_2,FLOAT_15,XMM5)
          SSE_SHUFFLE(XMM5,XMM5,0x00)
          SSE_MULT_PS(XMM5,XMM3)
          SSE_SUB_PS(XMM4,XMM5)

          SSE_STOREL_PS(SSE_ARG_1,FLOAT_12,XMM4)
          SSE_STOREH_PS(SSE_ARG_1,FLOAT_14,XMM4)
          SSE_INLINE_END_2;
          pv += 16;
        }
        ierr = PetscLogFlops(128.0*nz+112.0);CHKERRQ(ierr);
      }
      row = (unsigned int)(*bjtmp++);
      /* row = (*bjtmp++)/4; */
      /* bjtmp++; */
    }
    /* finished row so stick it into b->a */
    pv = ba + 16*bi[i];
    pj = bj + bi[i];
    nz = bi[i+1] - bi[i];

    /* Copy x block back into pv block */
    for (j=0; j<nz; j++) {
      x = rtmp+16*((unsigned int)pj[j]);
      /* x = rtmp+4*pj[j]; */

      SSE_INLINE_BEGIN_2(x,pv)
      /* Note: on future SSE architectures, STORE might be more efficient than STOREL/H */
      SSE_LOADL_PS(SSE_ARG_1,FLOAT_0,XMM1)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_0,XMM1)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_2,XMM2)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_2,XMM2)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_4,XMM3)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_4,XMM3)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_6,XMM4)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_6,XMM4)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_8,XMM5)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_8,XMM5)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_10,XMM6)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_10,XMM6)

      SSE_LOADL_PS(SSE_ARG_1,FLOAT_12,XMM7)
      SSE_STOREL_PS(SSE_ARG_2,FLOAT_12,XMM7)

      SSE_LOADH_PS(SSE_ARG_1,FLOAT_14,XMM0)
      SSE_STOREH_PS(SSE_ARG_2,FLOAT_14,XMM0)
      SSE_INLINE_END_2;
      pv += 16;
    }
    /* invert diagonal block */
    w = ba + 16*diag_offset[i];
    if (pivotinblocks) {
      ierr = PetscKernel_A_gets_inverse_A_4(w,shift);CHKERRQ(ierr);
    } else {
      ierr = PetscKernel_A_gets_inverse_A_4_nopivot(w);CHKERRQ(ierr);
    }
    /* ierr = PetscKernel_A_gets_inverse_A_4_SSE(w);CHKERRQ(ierr); */
    /* Note: using Cramer's rule, the flop count below might be unfairly high or low? */
  }

  ierr = PetscFree(rtmp);CHKERRQ(ierr);

  C->ops->solve          = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE;
  C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_SSE;
  C->assembled           = PETSC_TRUE;

  ierr = PetscLogFlops(1.333333333333*4*4*4*b->mbs);CHKERRQ(ierr);
  /* Flop Count from inverting diagonal blocks */
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif