1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 #undef __FUNCT__ 5 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 6 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 7 { 8 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 9 IS iscol=a->col,isrow=a->row; 10 PetscErrorCode ierr; 11 const PetscInt *r,*c,*rout,*cout; 12 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 13 PetscInt i,nz; 14 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 15 const MatScalar *aa=a->a,*v; 16 PetscScalar *x,*s,*t,*ls; 17 const PetscScalar *b; 18 19 PetscFunctionBegin; 20 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22 t = a->solve_work; 23 24 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 25 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 26 27 /* forward solve the lower triangular */ 28 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 29 for (i=1; i<n; i++) { 30 v = aa + bs2*ai[i]; 31 vi = aj + ai[i]; 32 nz = a->diag[i] - ai[i]; 33 s = t + bs*i; 34 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 35 while (nz--) { 36 PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 37 v += bs2; 38 } 39 } 40 /* backward solve the upper triangular */ 41 ls = a->solve_work + A->cmap->n; 42 for (i=n-1; i>=0; i--) { 43 v = aa + bs2*(a->diag[i] + 1); 44 vi = aj + a->diag[i] + 1; 45 nz = ai[i+1] - a->diag[i] - 1; 46 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 47 while (nz--) { 48 PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 49 v += bs2; 50 } 51 PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 52 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 53 } 54 55 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 56 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 57 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 58 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 59 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 60 PetscFunctionReturn(0); 61 } 62 63 #undef __FUNCT__ 64 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 65 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 66 { 67 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 68 IS iscol=a->col,isrow=a->row; 69 PetscErrorCode ierr; 70 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 71 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 72 PetscInt i,nz,idx,idt,idc; 73 const MatScalar *aa=a->a,*v; 74 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 75 const PetscScalar *b; 76 77 PetscFunctionBegin; 78 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 79 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 80 t = a->solve_work; 81 82 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 83 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 84 85 /* forward solve the lower triangular */ 86 idx = 7*(*r++); 87 t[0] = b[idx]; t[1] = b[1+idx]; 88 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 89 t[5] = b[5+idx]; t[6] = b[6+idx]; 90 91 for (i=1; i<n; i++) { 92 v = aa + 49*ai[i]; 93 vi = aj + ai[i]; 94 nz = diag[i] - ai[i]; 95 idx = 7*(*r++); 96 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 97 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 98 while (nz--) { 99 idx = 7*(*vi++); 100 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 101 x4 = t[3+idx];x5 = t[4+idx]; 102 x6 = t[5+idx];x7 = t[6+idx]; 103 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 104 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 105 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 106 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 107 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 108 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 109 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 110 v += 49; 111 } 112 idx = 7*i; 113 t[idx] = s1;t[1+idx] = s2; 114 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 115 t[5+idx] = s6;t[6+idx] = s7; 116 } 117 /* backward solve the upper triangular */ 118 for (i=n-1; i>=0; i--) { 119 v = aa + 49*diag[i] + 49; 120 vi = aj + diag[i] + 1; 121 nz = ai[i+1] - diag[i] - 1; 122 idt = 7*i; 123 s1 = t[idt]; s2 = t[1+idt]; 124 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 125 s6 = t[5+idt];s7 = t[6+idt]; 126 while (nz--) { 127 idx = 7*(*vi++); 128 x1 = t[idx]; x2 = t[1+idx]; 129 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 130 x6 = t[5+idx]; x7 = t[6+idx]; 131 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 132 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 133 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 134 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 135 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 136 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 137 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 138 v += 49; 139 } 140 idc = 7*(*c--); 141 v = aa + 49*diag[i]; 142 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 143 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 144 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 145 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 146 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 147 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 148 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 149 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 150 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 151 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 152 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 153 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 154 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 155 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 156 } 157 158 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 159 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 160 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 161 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 162 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 163 PetscFunctionReturn(0); 164 } 165 166 #undef __FUNCT__ 167 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 168 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 169 { 170 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 171 IS iscol=a->col,isrow=a->row; 172 PetscErrorCode ierr; 173 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 174 const PetscInt n=a->mbs,*rout,*cout,*vi; 175 PetscInt i,nz,idx,idt,idc,m; 176 const MatScalar *aa=a->a,*v; 177 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 178 const PetscScalar *b; 179 180 PetscFunctionBegin; 181 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 182 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 183 t = a->solve_work; 184 185 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 186 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 187 188 /* forward solve the lower triangular */ 189 idx = 7*r[0]; 190 t[0] = b[idx]; t[1] = b[1+idx]; 191 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 192 t[5] = b[5+idx]; t[6] = b[6+idx]; 193 194 for (i=1; i<n; i++) { 195 v = aa + 49*ai[i]; 196 vi = aj + ai[i]; 197 nz = ai[i+1] - ai[i]; 198 idx = 7*r[i]; 199 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 200 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 201 for (m=0; m<nz; m++) { 202 idx = 7*vi[m]; 203 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 204 x4 = t[3+idx];x5 = t[4+idx]; 205 x6 = t[5+idx];x7 = t[6+idx]; 206 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 207 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 208 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 209 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 210 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 211 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 212 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 213 v += 49; 214 } 215 idx = 7*i; 216 t[idx] = s1;t[1+idx] = s2; 217 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 218 t[5+idx] = s6;t[6+idx] = s7; 219 } 220 /* backward solve the upper triangular */ 221 for (i=n-1; i>=0; i--) { 222 v = aa + 49*(adiag[i+1]+1); 223 vi = aj + adiag[i+1]+1; 224 nz = adiag[i] - adiag[i+1] - 1; 225 idt = 7*i; 226 s1 = t[idt]; s2 = t[1+idt]; 227 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 228 s6 = t[5+idt];s7 = t[6+idt]; 229 for (m=0; m<nz; m++) { 230 idx = 7*vi[m]; 231 x1 = t[idx]; x2 = t[1+idx]; 232 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 233 x6 = t[5+idx]; x7 = t[6+idx]; 234 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 235 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 236 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 237 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 238 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 239 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 240 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 241 v += 49; 242 } 243 idc = 7*c[i]; 244 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 245 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 246 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 247 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 248 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 249 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 250 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 251 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 252 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 253 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 254 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 255 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 256 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 257 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 258 } 259 260 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 261 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 262 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 263 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 264 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 265 PetscFunctionReturn(0); 266 } 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 270 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 271 { 272 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 273 IS iscol=a->col,isrow=a->row; 274 PetscErrorCode ierr; 275 const PetscInt *r,*c,*rout,*cout; 276 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 277 PetscInt i,nz,idx,idt,idc; 278 const MatScalar *aa=a->a,*v; 279 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 280 const PetscScalar *b; 281 282 PetscFunctionBegin; 283 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 284 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 285 t = a->solve_work; 286 287 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 288 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 289 290 /* forward solve the lower triangular */ 291 idx = 6*(*r++); 292 t[0] = b[idx]; t[1] = b[1+idx]; 293 t[2] = b[2+idx]; t[3] = b[3+idx]; 294 t[4] = b[4+idx]; t[5] = b[5+idx]; 295 for (i=1; i<n; i++) { 296 v = aa + 36*ai[i]; 297 vi = aj + ai[i]; 298 nz = diag[i] - ai[i]; 299 idx = 6*(*r++); 300 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 301 s5 = b[4+idx]; s6 = b[5+idx]; 302 while (nz--) { 303 idx = 6*(*vi++); 304 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 305 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 306 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 307 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 308 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 309 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 310 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 311 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 312 v += 36; 313 } 314 idx = 6*i; 315 t[idx] = s1;t[1+idx] = s2; 316 t[2+idx] = s3;t[3+idx] = s4; 317 t[4+idx] = s5;t[5+idx] = s6; 318 } 319 /* backward solve the upper triangular */ 320 for (i=n-1; i>=0; i--) { 321 v = aa + 36*diag[i] + 36; 322 vi = aj + diag[i] + 1; 323 nz = ai[i+1] - diag[i] - 1; 324 idt = 6*i; 325 s1 = t[idt]; s2 = t[1+idt]; 326 s3 = t[2+idt];s4 = t[3+idt]; 327 s5 = t[4+idt];s6 = t[5+idt]; 328 while (nz--) { 329 idx = 6*(*vi++); 330 x1 = t[idx]; x2 = t[1+idx]; 331 x3 = t[2+idx]; x4 = t[3+idx]; 332 x5 = t[4+idx]; x6 = t[5+idx]; 333 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 334 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 335 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 336 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 337 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 338 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 339 v += 36; 340 } 341 idc = 6*(*c--); 342 v = aa + 36*diag[i]; 343 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 344 v[18]*s4+v[24]*s5+v[30]*s6; 345 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 346 v[19]*s4+v[25]*s5+v[31]*s6; 347 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 348 v[20]*s4+v[26]*s5+v[32]*s6; 349 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 350 v[21]*s4+v[27]*s5+v[33]*s6; 351 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 352 v[22]*s4+v[28]*s5+v[34]*s6; 353 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 354 v[23]*s4+v[29]*s5+v[35]*s6; 355 } 356 357 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 358 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 359 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 362 PetscFunctionReturn(0); 363 } 364 365 #undef __FUNCT__ 366 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 367 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 368 { 369 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 370 IS iscol=a->col,isrow=a->row; 371 PetscErrorCode ierr; 372 const PetscInt *r,*c,*rout,*cout; 373 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 374 PetscInt i,nz,idx,idt,idc,m; 375 const MatScalar *aa=a->a,*v; 376 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 377 const PetscScalar *b; 378 379 PetscFunctionBegin; 380 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 381 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 382 t = a->solve_work; 383 384 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 385 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 386 387 /* forward solve the lower triangular */ 388 idx = 6*r[0]; 389 t[0] = b[idx]; t[1] = b[1+idx]; 390 t[2] = b[2+idx]; t[3] = b[3+idx]; 391 t[4] = b[4+idx]; t[5] = b[5+idx]; 392 for (i=1; i<n; i++) { 393 v = aa + 36*ai[i]; 394 vi = aj + ai[i]; 395 nz = ai[i+1] - ai[i]; 396 idx = 6*r[i]; 397 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 398 s5 = b[4+idx]; s6 = b[5+idx]; 399 for (m=0; m<nz; m++) { 400 idx = 6*vi[m]; 401 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 402 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 403 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 404 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 405 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 406 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 407 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 408 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 409 v += 36; 410 } 411 idx = 6*i; 412 t[idx] = s1;t[1+idx] = s2; 413 t[2+idx] = s3;t[3+idx] = s4; 414 t[4+idx] = s5;t[5+idx] = s6; 415 } 416 /* backward solve the upper triangular */ 417 for (i=n-1; i>=0; i--) { 418 v = aa + 36*(adiag[i+1]+1); 419 vi = aj + adiag[i+1]+1; 420 nz = adiag[i] - adiag[i+1] - 1; 421 idt = 6*i; 422 s1 = t[idt]; s2 = t[1+idt]; 423 s3 = t[2+idt];s4 = t[3+idt]; 424 s5 = t[4+idt];s6 = t[5+idt]; 425 for (m=0; m<nz; m++) { 426 idx = 6*vi[m]; 427 x1 = t[idx]; x2 = t[1+idx]; 428 x3 = t[2+idx]; x4 = t[3+idx]; 429 x5 = t[4+idx]; x6 = t[5+idx]; 430 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 431 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 432 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 433 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 434 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 435 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 436 v += 36; 437 } 438 idc = 6*c[i]; 439 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 440 v[18]*s4+v[24]*s5+v[30]*s6; 441 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 442 v[19]*s4+v[25]*s5+v[31]*s6; 443 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 444 v[20]*s4+v[26]*s5+v[32]*s6; 445 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 446 v[21]*s4+v[27]*s5+v[33]*s6; 447 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 448 v[22]*s4+v[28]*s5+v[34]*s6; 449 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 450 v[23]*s4+v[29]*s5+v[35]*s6; 451 } 452 453 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 454 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 455 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 458 PetscFunctionReturn(0); 459 } 460 461 #undef __FUNCT__ 462 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 463 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 464 { 465 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 466 IS iscol=a->col,isrow=a->row; 467 PetscErrorCode ierr; 468 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 469 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 470 PetscInt i,nz,idx,idt,idc; 471 const MatScalar *aa=a->a,*v; 472 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 473 const PetscScalar *b; 474 475 PetscFunctionBegin; 476 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 477 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 478 t = a->solve_work; 479 480 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 481 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 482 483 /* forward solve the lower triangular */ 484 idx = 5*(*r++); 485 t[0] = b[idx]; t[1] = b[1+idx]; 486 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 487 for (i=1; i<n; i++) { 488 v = aa + 25*ai[i]; 489 vi = aj + ai[i]; 490 nz = diag[i] - ai[i]; 491 idx = 5*(*r++); 492 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 493 s5 = b[4+idx]; 494 while (nz--) { 495 idx = 5*(*vi++); 496 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 497 x4 = t[3+idx];x5 = t[4+idx]; 498 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 499 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 500 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 501 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 502 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 503 v += 25; 504 } 505 idx = 5*i; 506 t[idx] = s1;t[1+idx] = s2; 507 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 508 } 509 /* backward solve the upper triangular */ 510 for (i=n-1; i>=0; i--) { 511 v = aa + 25*diag[i] + 25; 512 vi = aj + diag[i] + 1; 513 nz = ai[i+1] - diag[i] - 1; 514 idt = 5*i; 515 s1 = t[idt]; s2 = t[1+idt]; 516 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 517 while (nz--) { 518 idx = 5*(*vi++); 519 x1 = t[idx]; x2 = t[1+idx]; 520 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 521 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 522 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 523 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 524 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 525 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 526 v += 25; 527 } 528 idc = 5*(*c--); 529 v = aa + 25*diag[i]; 530 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 531 v[15]*s4+v[20]*s5; 532 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 533 v[16]*s4+v[21]*s5; 534 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 535 v[17]*s4+v[22]*s5; 536 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 537 v[18]*s4+v[23]*s5; 538 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 539 v[19]*s4+v[24]*s5; 540 } 541 542 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 543 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 544 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 545 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 546 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 547 PetscFunctionReturn(0); 548 } 549 550 #undef __FUNCT__ 551 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 552 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 553 { 554 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 555 IS iscol=a->col,isrow=a->row; 556 PetscErrorCode ierr; 557 const PetscInt *r,*c,*rout,*cout; 558 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 559 PetscInt i,nz,idx,idt,idc,m; 560 const MatScalar *aa=a->a,*v; 561 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 562 const PetscScalar *b; 563 564 PetscFunctionBegin; 565 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 566 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 567 t = a->solve_work; 568 569 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 570 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 571 572 /* forward solve the lower triangular */ 573 idx = 5*r[0]; 574 t[0] = b[idx]; t[1] = b[1+idx]; 575 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 576 for (i=1; i<n; i++) { 577 v = aa + 25*ai[i]; 578 vi = aj + ai[i]; 579 nz = ai[i+1] - ai[i]; 580 idx = 5*r[i]; 581 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 582 s5 = b[4+idx]; 583 for (m=0; m<nz; m++) { 584 idx = 5*vi[m]; 585 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 586 x4 = t[3+idx];x5 = t[4+idx]; 587 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 588 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 589 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 590 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 591 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 592 v += 25; 593 } 594 idx = 5*i; 595 t[idx] = s1;t[1+idx] = s2; 596 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 597 } 598 /* backward solve the upper triangular */ 599 for (i=n-1; i>=0; i--) { 600 v = aa + 25*(adiag[i+1]+1); 601 vi = aj + adiag[i+1]+1; 602 nz = adiag[i] - adiag[i+1] - 1; 603 idt = 5*i; 604 s1 = t[idt]; s2 = t[1+idt]; 605 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 606 for (m=0; m<nz; m++) { 607 idx = 5*vi[m]; 608 x1 = t[idx]; x2 = t[1+idx]; 609 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 610 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 611 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 612 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 613 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 614 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 615 v += 25; 616 } 617 idc = 5*c[i]; 618 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 619 v[15]*s4+v[20]*s5; 620 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 621 v[16]*s4+v[21]*s5; 622 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 623 v[17]*s4+v[22]*s5; 624 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 625 v[18]*s4+v[23]*s5; 626 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 627 v[19]*s4+v[24]*s5; 628 } 629 630 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 631 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 632 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 633 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 634 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 635 PetscFunctionReturn(0); 636 } 637 638 #undef __FUNCT__ 639 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 640 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 641 { 642 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 643 IS iscol=a->col,isrow=a->row; 644 PetscErrorCode ierr; 645 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 646 PetscInt i,nz,idx,idt,idc; 647 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 648 const MatScalar *aa=a->a,*v; 649 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 650 const PetscScalar *b; 651 652 PetscFunctionBegin; 653 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 654 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 655 t = a->solve_work; 656 657 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 658 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 659 660 /* forward solve the lower triangular */ 661 idx = 4*(*r++); 662 t[0] = b[idx]; t[1] = b[1+idx]; 663 t[2] = b[2+idx]; t[3] = b[3+idx]; 664 for (i=1; i<n; i++) { 665 v = aa + 16*ai[i]; 666 vi = aj + ai[i]; 667 nz = diag[i] - ai[i]; 668 idx = 4*(*r++); 669 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 670 while (nz--) { 671 idx = 4*(*vi++); 672 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 673 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 674 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 675 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 676 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 677 v += 16; 678 } 679 idx = 4*i; 680 t[idx] = s1;t[1+idx] = s2; 681 t[2+idx] = s3;t[3+idx] = s4; 682 } 683 /* backward solve the upper triangular */ 684 for (i=n-1; i>=0; i--) { 685 v = aa + 16*diag[i] + 16; 686 vi = aj + diag[i] + 1; 687 nz = ai[i+1] - diag[i] - 1; 688 idt = 4*i; 689 s1 = t[idt]; s2 = t[1+idt]; 690 s3 = t[2+idt];s4 = t[3+idt]; 691 while (nz--) { 692 idx = 4*(*vi++); 693 x1 = t[idx]; x2 = t[1+idx]; 694 x3 = t[2+idx]; x4 = t[3+idx]; 695 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 696 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 697 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 698 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 699 v += 16; 700 } 701 idc = 4*(*c--); 702 v = aa + 16*diag[i]; 703 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 704 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 705 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 706 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 707 } 708 709 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 710 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 711 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 712 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 713 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 714 PetscFunctionReturn(0); 715 } 716 717 #undef __FUNCT__ 718 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 719 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 720 { 721 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 722 IS iscol=a->col,isrow=a->row; 723 PetscErrorCode ierr; 724 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 725 PetscInt i,nz,idx,idt,idc,m; 726 const PetscInt *r,*c,*rout,*cout; 727 const MatScalar *aa=a->a,*v; 728 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 729 const PetscScalar *b; 730 731 PetscFunctionBegin; 732 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 733 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 734 t = a->solve_work; 735 736 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 737 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 738 739 /* forward solve the lower triangular */ 740 idx = 4*r[0]; 741 t[0] = b[idx]; t[1] = b[1+idx]; 742 t[2] = b[2+idx]; t[3] = b[3+idx]; 743 for (i=1; i<n; i++) { 744 v = aa + 16*ai[i]; 745 vi = aj + ai[i]; 746 nz = ai[i+1] - ai[i]; 747 idx = 4*r[i]; 748 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 749 for (m=0; m<nz; m++) { 750 idx = 4*vi[m]; 751 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 752 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 753 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 754 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 755 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 756 v += 16; 757 } 758 idx = 4*i; 759 t[idx] = s1;t[1+idx] = s2; 760 t[2+idx] = s3;t[3+idx] = s4; 761 } 762 /* backward solve the upper triangular */ 763 for (i=n-1; i>=0; i--) { 764 v = aa + 16*(adiag[i+1]+1); 765 vi = aj + adiag[i+1]+1; 766 nz = adiag[i] - adiag[i+1] - 1; 767 idt = 4*i; 768 s1 = t[idt]; s2 = t[1+idt]; 769 s3 = t[2+idt];s4 = t[3+idt]; 770 for (m=0; m<nz; m++) { 771 idx = 4*vi[m]; 772 x1 = t[idx]; x2 = t[1+idx]; 773 x3 = t[2+idx]; x4 = t[3+idx]; 774 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 775 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 776 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 777 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 778 v += 16; 779 } 780 idc = 4*c[i]; 781 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 782 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 783 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 784 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 785 } 786 787 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 788 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 789 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 790 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 791 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 792 PetscFunctionReturn(0); 793 } 794 795 #undef __FUNCT__ 796 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 797 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 798 { 799 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 800 IS iscol=a->col,isrow=a->row; 801 PetscErrorCode ierr; 802 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 803 PetscInt i,nz,idx,idt,idc; 804 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 805 const MatScalar *aa=a->a,*v; 806 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 807 PetscScalar *x; 808 const PetscScalar *b; 809 810 PetscFunctionBegin; 811 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 812 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 813 t = (MatScalar*)a->solve_work; 814 815 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 816 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 817 818 /* forward solve the lower triangular */ 819 idx = 4*(*r++); 820 t[0] = (MatScalar)b[idx]; 821 t[1] = (MatScalar)b[1+idx]; 822 t[2] = (MatScalar)b[2+idx]; 823 t[3] = (MatScalar)b[3+idx]; 824 for (i=1; i<n; i++) { 825 v = aa + 16*ai[i]; 826 vi = aj + ai[i]; 827 nz = diag[i] - ai[i]; 828 idx = 4*(*r++); 829 s1 = (MatScalar)b[idx]; 830 s2 = (MatScalar)b[1+idx]; 831 s3 = (MatScalar)b[2+idx]; 832 s4 = (MatScalar)b[3+idx]; 833 while (nz--) { 834 idx = 4*(*vi++); 835 x1 = t[idx]; 836 x2 = t[1+idx]; 837 x3 = t[2+idx]; 838 x4 = t[3+idx]; 839 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 840 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 841 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 842 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 843 v += 16; 844 } 845 idx = 4*i; 846 t[idx] = s1; 847 t[1+idx] = s2; 848 t[2+idx] = s3; 849 t[3+idx] = s4; 850 } 851 /* backward solve the upper triangular */ 852 for (i=n-1; i>=0; i--) { 853 v = aa + 16*diag[i] + 16; 854 vi = aj + diag[i] + 1; 855 nz = ai[i+1] - diag[i] - 1; 856 idt = 4*i; 857 s1 = t[idt]; 858 s2 = t[1+idt]; 859 s3 = t[2+idt]; 860 s4 = t[3+idt]; 861 while (nz--) { 862 idx = 4*(*vi++); 863 x1 = t[idx]; 864 x2 = t[1+idx]; 865 x3 = t[2+idx]; 866 x4 = t[3+idx]; 867 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 868 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 869 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 870 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 871 v += 16; 872 } 873 idc = 4*(*c--); 874 v = aa + 16*diag[i]; 875 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 876 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 877 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 878 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 879 x[idc] = (PetscScalar)t[idt]; 880 x[1+idc] = (PetscScalar)t[1+idt]; 881 x[2+idc] = (PetscScalar)t[2+idt]; 882 x[3+idc] = (PetscScalar)t[3+idt]; 883 } 884 885 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 886 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 887 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 888 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 889 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 890 PetscFunctionReturn(0); 891 } 892 893 #if defined(PETSC_HAVE_SSE) 894 895 #include PETSC_HAVE_SSE 896 897 #undef __FUNCT__ 898 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 899 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 900 { 901 /* 902 Note: This code uses demotion of double 903 to float when performing the mixed-mode computation. 904 This may not be numerically reasonable for all applications. 905 */ 906 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 907 IS iscol=a->col,isrow=a->row; 908 PetscErrorCode ierr; 909 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 910 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 911 MatScalar *aa=a->a,*v; 912 PetscScalar *x,*b,*t; 913 914 /* Make space in temp stack for 16 Byte Aligned arrays */ 915 float ssealignedspace[11],*tmps,*tmpx; 916 unsigned long offset; 917 918 PetscFunctionBegin; 919 SSE_SCOPE_BEGIN; 920 921 offset = (unsigned long)ssealignedspace % 16; 922 if (offset) offset = (16 - offset)/4; 923 tmps = &ssealignedspace[offset]; 924 tmpx = &ssealignedspace[offset+4]; 925 PREFETCH_NTA(aa+16*ai[1]); 926 927 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 928 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 929 t = a->solve_work; 930 931 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 932 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 933 934 /* forward solve the lower triangular */ 935 idx = 4*(*r++); 936 t[0] = b[idx]; t[1] = b[1+idx]; 937 t[2] = b[2+idx]; t[3] = b[3+idx]; 938 v = aa + 16*ai[1]; 939 940 for (i=1; i<n; ) { 941 PREFETCH_NTA(&v[8]); 942 vi = aj + ai[i]; 943 nz = diag[i] - ai[i]; 944 idx = 4*(*r++); 945 946 /* Demote sum from double to float */ 947 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 948 LOAD_PS(tmps,XMM7); 949 950 while (nz--) { 951 PREFETCH_NTA(&v[16]); 952 idx = 4*(*vi++); 953 954 /* Demote solution (so far) from double to float */ 955 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 956 957 /* 4x4 Matrix-Vector product with negative accumulation: */ 958 SSE_INLINE_BEGIN_2(tmpx,v) 959 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 960 961 /* First Column */ 962 SSE_COPY_PS(XMM0,XMM6) 963 SSE_SHUFFLE(XMM0,XMM0,0x00) 964 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 965 SSE_SUB_PS(XMM7,XMM0) 966 967 /* Second Column */ 968 SSE_COPY_PS(XMM1,XMM6) 969 SSE_SHUFFLE(XMM1,XMM1,0x55) 970 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 971 SSE_SUB_PS(XMM7,XMM1) 972 973 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 974 975 /* Third Column */ 976 SSE_COPY_PS(XMM2,XMM6) 977 SSE_SHUFFLE(XMM2,XMM2,0xAA) 978 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 979 SSE_SUB_PS(XMM7,XMM2) 980 981 /* Fourth Column */ 982 SSE_COPY_PS(XMM3,XMM6) 983 SSE_SHUFFLE(XMM3,XMM3,0xFF) 984 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 985 SSE_SUB_PS(XMM7,XMM3) 986 SSE_INLINE_END_2 987 988 v += 16; 989 } 990 idx = 4*i; 991 v = aa + 16*ai[++i]; 992 PREFETCH_NTA(v); 993 STORE_PS(tmps,XMM7); 994 995 /* Promote result from float to double */ 996 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 997 } 998 /* backward solve the upper triangular */ 999 idt = 4*(n-1); 1000 ai16 = 16*diag[n-1]; 1001 v = aa + ai16 + 16; 1002 for (i=n-1; i>=0; ) { 1003 PREFETCH_NTA(&v[8]); 1004 vi = aj + diag[i] + 1; 1005 nz = ai[i+1] - diag[i] - 1; 1006 1007 /* Demote accumulator from double to float */ 1008 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1009 LOAD_PS(tmps,XMM7); 1010 1011 while (nz--) { 1012 PREFETCH_NTA(&v[16]); 1013 idx = 4*(*vi++); 1014 1015 /* Demote solution (so far) from double to float */ 1016 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1017 1018 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1019 SSE_INLINE_BEGIN_2(tmpx,v) 1020 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1021 1022 /* First Column */ 1023 SSE_COPY_PS(XMM0,XMM6) 1024 SSE_SHUFFLE(XMM0,XMM0,0x00) 1025 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1026 SSE_SUB_PS(XMM7,XMM0) 1027 1028 /* Second Column */ 1029 SSE_COPY_PS(XMM1,XMM6) 1030 SSE_SHUFFLE(XMM1,XMM1,0x55) 1031 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1032 SSE_SUB_PS(XMM7,XMM1) 1033 1034 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1035 1036 /* Third Column */ 1037 SSE_COPY_PS(XMM2,XMM6) 1038 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1039 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1040 SSE_SUB_PS(XMM7,XMM2) 1041 1042 /* Fourth Column */ 1043 SSE_COPY_PS(XMM3,XMM6) 1044 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1045 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1046 SSE_SUB_PS(XMM7,XMM3) 1047 SSE_INLINE_END_2 1048 v += 16; 1049 } 1050 v = aa + ai16; 1051 ai16 = 16*diag[--i]; 1052 PREFETCH_NTA(aa+ai16+16); 1053 /* 1054 Scale the result by the diagonal 4x4 block, 1055 which was inverted as part of the factorization 1056 */ 1057 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 1058 /* First Column */ 1059 SSE_COPY_PS(XMM0,XMM7) 1060 SSE_SHUFFLE(XMM0,XMM0,0x00) 1061 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 1062 1063 /* Second Column */ 1064 SSE_COPY_PS(XMM1,XMM7) 1065 SSE_SHUFFLE(XMM1,XMM1,0x55) 1066 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 1067 SSE_ADD_PS(XMM0,XMM1) 1068 1069 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 1070 1071 /* Third Column */ 1072 SSE_COPY_PS(XMM2,XMM7) 1073 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1074 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 1075 SSE_ADD_PS(XMM0,XMM2) 1076 1077 /* Fourth Column */ 1078 SSE_COPY_PS(XMM3,XMM7) 1079 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1080 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 1081 SSE_ADD_PS(XMM0,XMM3) 1082 1083 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 1084 SSE_INLINE_END_3 1085 1086 /* Promote solution from float to double */ 1087 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 1088 1089 /* Apply reordering to t and stream into x. */ 1090 /* This way, x doesn't pollute the cache. */ 1091 /* Be careful with size: 2 doubles = 4 floats! */ 1092 idc = 4*(*c--); 1093 SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc]) 1094 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1095 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 1096 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 1097 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1098 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 1099 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 1100 SSE_INLINE_END_2 1101 v = aa + ai16 + 16; 1102 idt -= 4; 1103 } 1104 1105 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1106 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1107 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1109 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1110 SSE_SCOPE_END; 1111 PetscFunctionReturn(0); 1112 } 1113 1114 #endif 1115 1116 #undef __FUNCT__ 1117 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 1118 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1119 { 1120 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1121 IS iscol=a->col,isrow=a->row; 1122 PetscErrorCode ierr; 1123 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1124 PetscInt i,nz,idx,idt,idc; 1125 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1126 const MatScalar *aa=a->a,*v; 1127 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1128 const PetscScalar *b; 1129 1130 PetscFunctionBegin; 1131 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1133 t = a->solve_work; 1134 1135 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1136 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1137 1138 /* forward solve the lower triangular */ 1139 idx = 3*(*r++); 1140 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1141 for (i=1; i<n; i++) { 1142 v = aa + 9*ai[i]; 1143 vi = aj + ai[i]; 1144 nz = diag[i] - ai[i]; 1145 idx = 3*(*r++); 1146 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1147 while (nz--) { 1148 idx = 3*(*vi++); 1149 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1150 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1151 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1152 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1153 v += 9; 1154 } 1155 idx = 3*i; 1156 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1157 } 1158 /* backward solve the upper triangular */ 1159 for (i=n-1; i>=0; i--) { 1160 v = aa + 9*diag[i] + 9; 1161 vi = aj + diag[i] + 1; 1162 nz = ai[i+1] - diag[i] - 1; 1163 idt = 3*i; 1164 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1165 while (nz--) { 1166 idx = 3*(*vi++); 1167 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1168 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1169 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1170 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1171 v += 9; 1172 } 1173 idc = 3*(*c--); 1174 v = aa + 9*diag[i]; 1175 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1176 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1177 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1178 } 1179 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1180 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1181 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1182 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1183 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1184 PetscFunctionReturn(0); 1185 } 1186 1187 #undef __FUNCT__ 1188 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 1189 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1190 { 1191 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1192 IS iscol=a->col,isrow=a->row; 1193 PetscErrorCode ierr; 1194 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1195 PetscInt i,nz,idx,idt,idc,m; 1196 const PetscInt *r,*c,*rout,*cout; 1197 const MatScalar *aa=a->a,*v; 1198 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1199 const PetscScalar *b; 1200 1201 PetscFunctionBegin; 1202 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1203 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1204 t = a->solve_work; 1205 1206 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1207 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1208 1209 /* forward solve the lower triangular */ 1210 idx = 3*r[0]; 1211 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1212 for (i=1; i<n; i++) { 1213 v = aa + 9*ai[i]; 1214 vi = aj + ai[i]; 1215 nz = ai[i+1] - ai[i]; 1216 idx = 3*r[i]; 1217 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1218 for (m=0; m<nz; m++) { 1219 idx = 3*vi[m]; 1220 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1221 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1222 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1223 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1224 v += 9; 1225 } 1226 idx = 3*i; 1227 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1228 } 1229 /* backward solve the upper triangular */ 1230 for (i=n-1; i>=0; i--) { 1231 v = aa + 9*(adiag[i+1]+1); 1232 vi = aj + adiag[i+1]+1; 1233 nz = adiag[i] - adiag[i+1] - 1; 1234 idt = 3*i; 1235 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1236 for (m=0; m<nz; m++) { 1237 idx = 3*vi[m]; 1238 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1239 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1240 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1241 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1242 v += 9; 1243 } 1244 idc = 3*c[i]; 1245 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1246 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1247 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1248 } 1249 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1250 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1251 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1252 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1253 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1254 PetscFunctionReturn(0); 1255 } 1256 1257 #undef __FUNCT__ 1258 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 1259 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1260 { 1261 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1262 IS iscol=a->col,isrow=a->row; 1263 PetscErrorCode ierr; 1264 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1265 PetscInt i,nz,idx,idt,idc; 1266 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1267 const MatScalar *aa=a->a,*v; 1268 PetscScalar *x,s1,s2,x1,x2,*t; 1269 const PetscScalar *b; 1270 1271 PetscFunctionBegin; 1272 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1273 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1274 t = a->solve_work; 1275 1276 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1277 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1278 1279 /* forward solve the lower triangular */ 1280 idx = 2*(*r++); 1281 t[0] = b[idx]; t[1] = b[1+idx]; 1282 for (i=1; i<n; i++) { 1283 v = aa + 4*ai[i]; 1284 vi = aj + ai[i]; 1285 nz = diag[i] - ai[i]; 1286 idx = 2*(*r++); 1287 s1 = b[idx]; s2 = b[1+idx]; 1288 while (nz--) { 1289 idx = 2*(*vi++); 1290 x1 = t[idx]; x2 = t[1+idx]; 1291 s1 -= v[0]*x1 + v[2]*x2; 1292 s2 -= v[1]*x1 + v[3]*x2; 1293 v += 4; 1294 } 1295 idx = 2*i; 1296 t[idx] = s1; t[1+idx] = s2; 1297 } 1298 /* backward solve the upper triangular */ 1299 for (i=n-1; i>=0; i--) { 1300 v = aa + 4*diag[i] + 4; 1301 vi = aj + diag[i] + 1; 1302 nz = ai[i+1] - diag[i] - 1; 1303 idt = 2*i; 1304 s1 = t[idt]; s2 = t[1+idt]; 1305 while (nz--) { 1306 idx = 2*(*vi++); 1307 x1 = t[idx]; x2 = t[1+idx]; 1308 s1 -= v[0]*x1 + v[2]*x2; 1309 s2 -= v[1]*x1 + v[3]*x2; 1310 v += 4; 1311 } 1312 idc = 2*(*c--); 1313 v = aa + 4*diag[i]; 1314 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1315 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1316 } 1317 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1318 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1319 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1320 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1321 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1322 PetscFunctionReturn(0); 1323 } 1324 1325 #undef __FUNCT__ 1326 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 1327 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1328 { 1329 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1330 IS iscol=a->col,isrow=a->row; 1331 PetscErrorCode ierr; 1332 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1333 PetscInt i,nz,idx,jdx,idt,idc,m; 1334 const PetscInt *r,*c,*rout,*cout; 1335 const MatScalar *aa=a->a,*v; 1336 PetscScalar *x,s1,s2,x1,x2,*t; 1337 const PetscScalar *b; 1338 1339 PetscFunctionBegin; 1340 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1341 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1342 t = a->solve_work; 1343 1344 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1345 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1346 1347 /* forward solve the lower triangular */ 1348 idx = 2*r[0]; 1349 t[0] = b[idx]; t[1] = b[1+idx]; 1350 for (i=1; i<n; i++) { 1351 v = aa + 4*ai[i]; 1352 vi = aj + ai[i]; 1353 nz = ai[i+1] - ai[i]; 1354 idx = 2*r[i]; 1355 s1 = b[idx]; s2 = b[1+idx]; 1356 for (m=0; m<nz; m++) { 1357 jdx = 2*vi[m]; 1358 x1 = t[jdx]; x2 = t[1+jdx]; 1359 s1 -= v[0]*x1 + v[2]*x2; 1360 s2 -= v[1]*x1 + v[3]*x2; 1361 v += 4; 1362 } 1363 idx = 2*i; 1364 t[idx] = s1; t[1+idx] = s2; 1365 } 1366 /* backward solve the upper triangular */ 1367 for (i=n-1; i>=0; i--) { 1368 v = aa + 4*(adiag[i+1]+1); 1369 vi = aj + adiag[i+1]+1; 1370 nz = adiag[i] - adiag[i+1] - 1; 1371 idt = 2*i; 1372 s1 = t[idt]; s2 = t[1+idt]; 1373 for (m=0; m<nz; m++) { 1374 idx = 2*vi[m]; 1375 x1 = t[idx]; x2 = t[1+idx]; 1376 s1 -= v[0]*x1 + v[2]*x2; 1377 s2 -= v[1]*x1 + v[3]*x2; 1378 v += 4; 1379 } 1380 idc = 2*c[i]; 1381 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1382 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1383 } 1384 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1385 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1386 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1387 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1388 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1389 PetscFunctionReturn(0); 1390 } 1391 1392 #undef __FUNCT__ 1393 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 1394 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1395 { 1396 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1397 IS iscol=a->col,isrow=a->row; 1398 PetscErrorCode ierr; 1399 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1400 PetscInt i,nz; 1401 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1402 const MatScalar *aa=a->a,*v; 1403 PetscScalar *x,s1,*t; 1404 const PetscScalar *b; 1405 1406 PetscFunctionBegin; 1407 if (!n) PetscFunctionReturn(0); 1408 1409 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1410 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1411 t = a->solve_work; 1412 1413 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1414 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1415 1416 /* forward solve the lower triangular */ 1417 t[0] = b[*r++]; 1418 for (i=1; i<n; i++) { 1419 v = aa + ai[i]; 1420 vi = aj + ai[i]; 1421 nz = diag[i] - ai[i]; 1422 s1 = b[*r++]; 1423 while (nz--) { 1424 s1 -= (*v++)*t[*vi++]; 1425 } 1426 t[i] = s1; 1427 } 1428 /* backward solve the upper triangular */ 1429 for (i=n-1; i>=0; i--) { 1430 v = aa + diag[i] + 1; 1431 vi = aj + diag[i] + 1; 1432 nz = ai[i+1] - diag[i] - 1; 1433 s1 = t[i]; 1434 while (nz--) { 1435 s1 -= (*v++)*t[*vi++]; 1436 } 1437 x[*c--] = t[i] = aa[diag[i]]*s1; 1438 } 1439 1440 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1441 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1442 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1443 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1444 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 1445 PetscFunctionReturn(0); 1446 } 1447 1448 #undef __FUNCT__ 1449 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 1450 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 1451 { 1452 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1453 IS iscol = a->col,isrow = a->row; 1454 PetscErrorCode ierr; 1455 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 1456 const PetscInt *rout,*cout,*r,*c; 1457 PetscScalar *x,*tmp,sum; 1458 const PetscScalar *b; 1459 const MatScalar *aa = a->a,*v; 1460 1461 PetscFunctionBegin; 1462 if (!n) PetscFunctionReturn(0); 1463 1464 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1465 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1466 tmp = a->solve_work; 1467 1468 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1469 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1470 1471 /* forward solve the lower triangular */ 1472 tmp[0] = b[r[0]]; 1473 v = aa; 1474 vi = aj; 1475 for (i=1; i<n; i++) { 1476 nz = ai[i+1] - ai[i]; 1477 sum = b[r[i]]; 1478 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1479 tmp[i] = sum; 1480 v += nz; vi += nz; 1481 } 1482 1483 /* backward solve the upper triangular */ 1484 for (i=n-1; i>=0; i--) { 1485 v = aa + adiag[i+1]+1; 1486 vi = aj + adiag[i+1]+1; 1487 nz = adiag[i]-adiag[i+1]-1; 1488 sum = tmp[i]; 1489 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1490 x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 1491 } 1492 1493 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1494 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1495 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1496 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1497 ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 1498 PetscFunctionReturn(0); 1499 } 1500 1501