1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 5 { 6 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 7 IS iscol=a->col,isrow=a->row; 8 const PetscInt *r,*c,*rout,*cout; 9 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 10 PetscInt i,nz; 11 const PetscInt bs =A->rmap->bs,bs2=a->bs2; 12 const MatScalar *aa=a->a,*v; 13 PetscScalar *x,*s,*t,*ls; 14 const PetscScalar *b; 15 16 PetscFunctionBegin; 17 PetscCall(VecGetArrayRead(bb,&b)); 18 PetscCall(VecGetArray(xx,&x)); 19 t = a->solve_work; 20 21 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 22 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 23 24 /* forward solve the lower triangular */ 25 PetscCall(PetscArraycpy(t,b+bs*(*r++),bs)); 26 for (i=1; i<n; i++) { 27 v = aa + bs2*ai[i]; 28 vi = aj + ai[i]; 29 nz = a->diag[i] - ai[i]; 30 s = t + bs*i; 31 PetscCall(PetscArraycpy(s,b+bs*(*r++),bs)); 32 while (nz--) { 33 PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 34 v += bs2; 35 } 36 } 37 /* backward solve the upper triangular */ 38 ls = a->solve_work + A->cmap->n; 39 for (i=n-1; i>=0; i--) { 40 v = aa + bs2*(a->diag[i] + 1); 41 vi = aj + a->diag[i] + 1; 42 nz = ai[i+1] - a->diag[i] - 1; 43 PetscCall(PetscArraycpy(ls,t+i*bs,bs)); 44 while (nz--) { 45 PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 46 v += bs2; 47 } 48 PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 49 PetscCall(PetscArraycpy(x + bs*(*c--),t+i*bs,bs)); 50 } 51 52 PetscCall(ISRestoreIndices(isrow,&rout)); 53 PetscCall(ISRestoreIndices(iscol,&cout)); 54 PetscCall(VecRestoreArrayRead(bb,&b)); 55 PetscCall(VecRestoreArray(xx,&x)); 56 PetscCall(PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n)); 57 PetscFunctionReturn(0); 58 } 59 60 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 63 IS iscol=a->col,isrow=a->row; 64 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 65 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 66 PetscInt i,nz,idx,idt,idc; 67 const MatScalar *aa=a->a,*v; 68 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 69 const PetscScalar *b; 70 71 PetscFunctionBegin; 72 PetscCall(VecGetArrayRead(bb,&b)); 73 PetscCall(VecGetArray(xx,&x)); 74 t = a->solve_work; 75 76 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 77 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 78 79 /* forward solve the lower triangular */ 80 idx = 7*(*r++); 81 t[0] = b[idx]; t[1] = b[1+idx]; 82 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 83 t[5] = b[5+idx]; t[6] = b[6+idx]; 84 85 for (i=1; i<n; i++) { 86 v = aa + 49*ai[i]; 87 vi = aj + ai[i]; 88 nz = diag[i] - ai[i]; 89 idx = 7*(*r++); 90 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 91 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 92 while (nz--) { 93 idx = 7*(*vi++); 94 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 95 x4 = t[3+idx];x5 = t[4+idx]; 96 x6 = t[5+idx];x7 = t[6+idx]; 97 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 98 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 99 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 100 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 101 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 102 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 103 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 104 v += 49; 105 } 106 idx = 7*i; 107 t[idx] = s1;t[1+idx] = s2; 108 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 109 t[5+idx] = s6;t[6+idx] = s7; 110 } 111 /* backward solve the upper triangular */ 112 for (i=n-1; i>=0; i--) { 113 v = aa + 49*diag[i] + 49; 114 vi = aj + diag[i] + 1; 115 nz = ai[i+1] - diag[i] - 1; 116 idt = 7*i; 117 s1 = t[idt]; s2 = t[1+idt]; 118 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 119 s6 = t[5+idt];s7 = t[6+idt]; 120 while (nz--) { 121 idx = 7*(*vi++); 122 x1 = t[idx]; x2 = t[1+idx]; 123 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 124 x6 = t[5+idx]; x7 = t[6+idx]; 125 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 126 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 127 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 128 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 129 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 130 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 131 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 132 v += 49; 133 } 134 idc = 7*(*c--); 135 v = aa + 49*diag[i]; 136 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 137 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 138 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 139 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 140 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 141 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 142 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 143 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 144 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 145 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 146 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 147 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 148 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 149 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 150 } 151 152 PetscCall(ISRestoreIndices(isrow,&rout)); 153 PetscCall(ISRestoreIndices(iscol,&cout)); 154 PetscCall(VecRestoreArrayRead(bb,&b)); 155 PetscCall(VecRestoreArray(xx,&x)); 156 PetscCall(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n)); 157 PetscFunctionReturn(0); 158 } 159 160 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 161 { 162 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 163 IS iscol=a->col,isrow=a->row; 164 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 165 const PetscInt n=a->mbs,*rout,*cout,*vi; 166 PetscInt i,nz,idx,idt,idc,m; 167 const MatScalar *aa=a->a,*v; 168 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 169 const PetscScalar *b; 170 171 PetscFunctionBegin; 172 PetscCall(VecGetArrayRead(bb,&b)); 173 PetscCall(VecGetArray(xx,&x)); 174 t = a->solve_work; 175 176 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 177 PetscCall(ISGetIndices(iscol,&cout)); c = cout; 178 179 /* forward solve the lower triangular */ 180 idx = 7*r[0]; 181 t[0] = b[idx]; t[1] = b[1+idx]; 182 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 183 t[5] = b[5+idx]; t[6] = b[6+idx]; 184 185 for (i=1; i<n; i++) { 186 v = aa + 49*ai[i]; 187 vi = aj + ai[i]; 188 nz = ai[i+1] - ai[i]; 189 idx = 7*r[i]; 190 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 191 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 192 for (m=0; m<nz; m++) { 193 idx = 7*vi[m]; 194 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 195 x4 = t[3+idx];x5 = t[4+idx]; 196 x6 = t[5+idx];x7 = t[6+idx]; 197 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 198 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 199 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 200 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 201 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 202 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 203 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 204 v += 49; 205 } 206 idx = 7*i; 207 t[idx] = s1;t[1+idx] = s2; 208 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 209 t[5+idx] = s6;t[6+idx] = s7; 210 } 211 /* backward solve the upper triangular */ 212 for (i=n-1; i>=0; i--) { 213 v = aa + 49*(adiag[i+1]+1); 214 vi = aj + adiag[i+1]+1; 215 nz = adiag[i] - adiag[i+1] - 1; 216 idt = 7*i; 217 s1 = t[idt]; s2 = t[1+idt]; 218 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 219 s6 = t[5+idt];s7 = t[6+idt]; 220 for (m=0; m<nz; m++) { 221 idx = 7*vi[m]; 222 x1 = t[idx]; x2 = t[1+idx]; 223 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 224 x6 = t[5+idx]; x7 = t[6+idx]; 225 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 226 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 227 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 228 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 229 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 230 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 231 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 232 v += 49; 233 } 234 idc = 7*c[i]; 235 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 236 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 237 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 238 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 239 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 240 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 241 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 242 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 243 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 244 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 245 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 246 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 247 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 248 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 249 } 250 251 PetscCall(ISRestoreIndices(isrow,&rout)); 252 PetscCall(ISRestoreIndices(iscol,&cout)); 253 PetscCall(VecRestoreArrayRead(bb,&b)); 254 PetscCall(VecRestoreArray(xx,&x)); 255 PetscCall(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n)); 256 PetscFunctionReturn(0); 257 } 258 259 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 260 { 261 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 262 IS iscol=a->col,isrow=a->row; 263 const PetscInt *r,*c,*rout,*cout; 264 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 265 PetscInt i,nz,idx,idt,idc; 266 const MatScalar *aa=a->a,*v; 267 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 268 const PetscScalar *b; 269 270 PetscFunctionBegin; 271 PetscCall(VecGetArrayRead(bb,&b)); 272 PetscCall(VecGetArray(xx,&x)); 273 t = a->solve_work; 274 275 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 276 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 277 278 /* forward solve the lower triangular */ 279 idx = 6*(*r++); 280 t[0] = b[idx]; t[1] = b[1+idx]; 281 t[2] = b[2+idx]; t[3] = b[3+idx]; 282 t[4] = b[4+idx]; t[5] = b[5+idx]; 283 for (i=1; i<n; i++) { 284 v = aa + 36*ai[i]; 285 vi = aj + ai[i]; 286 nz = diag[i] - ai[i]; 287 idx = 6*(*r++); 288 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 289 s5 = b[4+idx]; s6 = b[5+idx]; 290 while (nz--) { 291 idx = 6*(*vi++); 292 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 293 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 294 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 295 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 296 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 297 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 298 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 299 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 300 v += 36; 301 } 302 idx = 6*i; 303 t[idx] = s1;t[1+idx] = s2; 304 t[2+idx] = s3;t[3+idx] = s4; 305 t[4+idx] = s5;t[5+idx] = s6; 306 } 307 /* backward solve the upper triangular */ 308 for (i=n-1; i>=0; i--) { 309 v = aa + 36*diag[i] + 36; 310 vi = aj + diag[i] + 1; 311 nz = ai[i+1] - diag[i] - 1; 312 idt = 6*i; 313 s1 = t[idt]; s2 = t[1+idt]; 314 s3 = t[2+idt];s4 = t[3+idt]; 315 s5 = t[4+idt];s6 = t[5+idt]; 316 while (nz--) { 317 idx = 6*(*vi++); 318 x1 = t[idx]; x2 = t[1+idx]; 319 x3 = t[2+idx]; x4 = t[3+idx]; 320 x5 = t[4+idx]; x6 = t[5+idx]; 321 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 322 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 323 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 324 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 325 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 326 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 327 v += 36; 328 } 329 idc = 6*(*c--); 330 v = aa + 36*diag[i]; 331 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 332 v[18]*s4+v[24]*s5+v[30]*s6; 333 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 334 v[19]*s4+v[25]*s5+v[31]*s6; 335 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 336 v[20]*s4+v[26]*s5+v[32]*s6; 337 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 338 v[21]*s4+v[27]*s5+v[33]*s6; 339 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 340 v[22]*s4+v[28]*s5+v[34]*s6; 341 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 342 v[23]*s4+v[29]*s5+v[35]*s6; 343 } 344 345 PetscCall(ISRestoreIndices(isrow,&rout)); 346 PetscCall(ISRestoreIndices(iscol,&cout)); 347 PetscCall(VecRestoreArrayRead(bb,&b)); 348 PetscCall(VecRestoreArray(xx,&x)); 349 PetscCall(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n)); 350 PetscFunctionReturn(0); 351 } 352 353 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 354 { 355 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 356 IS iscol=a->col,isrow=a->row; 357 const PetscInt *r,*c,*rout,*cout; 358 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 359 PetscInt i,nz,idx,idt,idc,m; 360 const MatScalar *aa=a->a,*v; 361 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 362 const PetscScalar *b; 363 364 PetscFunctionBegin; 365 PetscCall(VecGetArrayRead(bb,&b)); 366 PetscCall(VecGetArray(xx,&x)); 367 t = a->solve_work; 368 369 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 370 PetscCall(ISGetIndices(iscol,&cout)); c = cout; 371 372 /* forward solve the lower triangular */ 373 idx = 6*r[0]; 374 t[0] = b[idx]; t[1] = b[1+idx]; 375 t[2] = b[2+idx]; t[3] = b[3+idx]; 376 t[4] = b[4+idx]; t[5] = b[5+idx]; 377 for (i=1; i<n; i++) { 378 v = aa + 36*ai[i]; 379 vi = aj + ai[i]; 380 nz = ai[i+1] - ai[i]; 381 idx = 6*r[i]; 382 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 383 s5 = b[4+idx]; s6 = b[5+idx]; 384 for (m=0; m<nz; m++) { 385 idx = 6*vi[m]; 386 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 387 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 388 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 389 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 390 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 391 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 392 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 393 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 394 v += 36; 395 } 396 idx = 6*i; 397 t[idx] = s1;t[1+idx] = s2; 398 t[2+idx] = s3;t[3+idx] = s4; 399 t[4+idx] = s5;t[5+idx] = s6; 400 } 401 /* backward solve the upper triangular */ 402 for (i=n-1; i>=0; i--) { 403 v = aa + 36*(adiag[i+1]+1); 404 vi = aj + adiag[i+1]+1; 405 nz = adiag[i] - adiag[i+1] - 1; 406 idt = 6*i; 407 s1 = t[idt]; s2 = t[1+idt]; 408 s3 = t[2+idt];s4 = t[3+idt]; 409 s5 = t[4+idt];s6 = t[5+idt]; 410 for (m=0; m<nz; m++) { 411 idx = 6*vi[m]; 412 x1 = t[idx]; x2 = t[1+idx]; 413 x3 = t[2+idx]; x4 = t[3+idx]; 414 x5 = t[4+idx]; x6 = t[5+idx]; 415 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 416 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 417 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 418 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 419 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 420 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 421 v += 36; 422 } 423 idc = 6*c[i]; 424 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 425 v[18]*s4+v[24]*s5+v[30]*s6; 426 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 427 v[19]*s4+v[25]*s5+v[31]*s6; 428 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 429 v[20]*s4+v[26]*s5+v[32]*s6; 430 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 431 v[21]*s4+v[27]*s5+v[33]*s6; 432 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 433 v[22]*s4+v[28]*s5+v[34]*s6; 434 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 435 v[23]*s4+v[29]*s5+v[35]*s6; 436 } 437 438 PetscCall(ISRestoreIndices(isrow,&rout)); 439 PetscCall(ISRestoreIndices(iscol,&cout)); 440 PetscCall(VecRestoreArrayRead(bb,&b)); 441 PetscCall(VecRestoreArray(xx,&x)); 442 PetscCall(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n)); 443 PetscFunctionReturn(0); 444 } 445 446 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 447 { 448 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 449 IS iscol=a->col,isrow=a->row; 450 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 451 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 452 PetscInt i,nz,idx,idt,idc; 453 const MatScalar *aa=a->a,*v; 454 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 455 const PetscScalar *b; 456 457 PetscFunctionBegin; 458 PetscCall(VecGetArrayRead(bb,&b)); 459 PetscCall(VecGetArray(xx,&x)); 460 t = a->solve_work; 461 462 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 463 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 464 465 /* forward solve the lower triangular */ 466 idx = 5*(*r++); 467 t[0] = b[idx]; t[1] = b[1+idx]; 468 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 469 for (i=1; i<n; i++) { 470 v = aa + 25*ai[i]; 471 vi = aj + ai[i]; 472 nz = diag[i] - ai[i]; 473 idx = 5*(*r++); 474 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 475 s5 = b[4+idx]; 476 while (nz--) { 477 idx = 5*(*vi++); 478 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 479 x4 = t[3+idx];x5 = t[4+idx]; 480 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 481 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 482 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 483 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 484 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 485 v += 25; 486 } 487 idx = 5*i; 488 t[idx] = s1;t[1+idx] = s2; 489 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 490 } 491 /* backward solve the upper triangular */ 492 for (i=n-1; i>=0; i--) { 493 v = aa + 25*diag[i] + 25; 494 vi = aj + diag[i] + 1; 495 nz = ai[i+1] - diag[i] - 1; 496 idt = 5*i; 497 s1 = t[idt]; s2 = t[1+idt]; 498 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 499 while (nz--) { 500 idx = 5*(*vi++); 501 x1 = t[idx]; x2 = t[1+idx]; 502 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 503 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 504 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 505 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 506 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 507 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 508 v += 25; 509 } 510 idc = 5*(*c--); 511 v = aa + 25*diag[i]; 512 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 513 v[15]*s4+v[20]*s5; 514 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 515 v[16]*s4+v[21]*s5; 516 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 517 v[17]*s4+v[22]*s5; 518 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 519 v[18]*s4+v[23]*s5; 520 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 521 v[19]*s4+v[24]*s5; 522 } 523 524 PetscCall(ISRestoreIndices(isrow,&rout)); 525 PetscCall(ISRestoreIndices(iscol,&cout)); 526 PetscCall(VecRestoreArrayRead(bb,&b)); 527 PetscCall(VecRestoreArray(xx,&x)); 528 PetscCall(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n)); 529 PetscFunctionReturn(0); 530 } 531 532 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 533 { 534 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 535 IS iscol=a->col,isrow=a->row; 536 const PetscInt *r,*c,*rout,*cout; 537 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 538 PetscInt i,nz,idx,idt,idc,m; 539 const MatScalar *aa=a->a,*v; 540 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 541 const PetscScalar *b; 542 543 PetscFunctionBegin; 544 PetscCall(VecGetArrayRead(bb,&b)); 545 PetscCall(VecGetArray(xx,&x)); 546 t = a->solve_work; 547 548 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 549 PetscCall(ISGetIndices(iscol,&cout)); c = cout; 550 551 /* forward solve the lower triangular */ 552 idx = 5*r[0]; 553 t[0] = b[idx]; t[1] = b[1+idx]; 554 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 555 for (i=1; i<n; i++) { 556 v = aa + 25*ai[i]; 557 vi = aj + ai[i]; 558 nz = ai[i+1] - ai[i]; 559 idx = 5*r[i]; 560 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 561 s5 = b[4+idx]; 562 for (m=0; m<nz; m++) { 563 idx = 5*vi[m]; 564 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 565 x4 = t[3+idx];x5 = t[4+idx]; 566 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 567 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 568 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 569 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 570 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 571 v += 25; 572 } 573 idx = 5*i; 574 t[idx] = s1;t[1+idx] = s2; 575 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 576 } 577 /* backward solve the upper triangular */ 578 for (i=n-1; i>=0; i--) { 579 v = aa + 25*(adiag[i+1]+1); 580 vi = aj + adiag[i+1]+1; 581 nz = adiag[i] - adiag[i+1] - 1; 582 idt = 5*i; 583 s1 = t[idt]; s2 = t[1+idt]; 584 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 585 for (m=0; m<nz; m++) { 586 idx = 5*vi[m]; 587 x1 = t[idx]; x2 = t[1+idx]; 588 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 589 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 590 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 591 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 592 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 593 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 594 v += 25; 595 } 596 idc = 5*c[i]; 597 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 598 v[15]*s4+v[20]*s5; 599 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 600 v[16]*s4+v[21]*s5; 601 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 602 v[17]*s4+v[22]*s5; 603 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 604 v[18]*s4+v[23]*s5; 605 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 606 v[19]*s4+v[24]*s5; 607 } 608 609 PetscCall(ISRestoreIndices(isrow,&rout)); 610 PetscCall(ISRestoreIndices(iscol,&cout)); 611 PetscCall(VecRestoreArrayRead(bb,&b)); 612 PetscCall(VecRestoreArray(xx,&x)); 613 PetscCall(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n)); 614 PetscFunctionReturn(0); 615 } 616 617 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 618 { 619 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 620 IS iscol=a->col,isrow=a->row; 621 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 622 PetscInt i,nz,idx,idt,idc; 623 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 624 const MatScalar *aa=a->a,*v; 625 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 626 const PetscScalar *b; 627 628 PetscFunctionBegin; 629 PetscCall(VecGetArrayRead(bb,&b)); 630 PetscCall(VecGetArray(xx,&x)); 631 t = a->solve_work; 632 633 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 634 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 635 636 /* forward solve the lower triangular */ 637 idx = 4*(*r++); 638 t[0] = b[idx]; t[1] = b[1+idx]; 639 t[2] = b[2+idx]; t[3] = b[3+idx]; 640 for (i=1; i<n; i++) { 641 v = aa + 16*ai[i]; 642 vi = aj + ai[i]; 643 nz = diag[i] - ai[i]; 644 idx = 4*(*r++); 645 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 646 while (nz--) { 647 idx = 4*(*vi++); 648 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 649 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 650 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 651 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 652 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 653 v += 16; 654 } 655 idx = 4*i; 656 t[idx] = s1;t[1+idx] = s2; 657 t[2+idx] = s3;t[3+idx] = s4; 658 } 659 /* backward solve the upper triangular */ 660 for (i=n-1; i>=0; i--) { 661 v = aa + 16*diag[i] + 16; 662 vi = aj + diag[i] + 1; 663 nz = ai[i+1] - diag[i] - 1; 664 idt = 4*i; 665 s1 = t[idt]; s2 = t[1+idt]; 666 s3 = t[2+idt];s4 = t[3+idt]; 667 while (nz--) { 668 idx = 4*(*vi++); 669 x1 = t[idx]; x2 = t[1+idx]; 670 x3 = t[2+idx]; x4 = t[3+idx]; 671 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 672 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 673 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 674 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 675 v += 16; 676 } 677 idc = 4*(*c--); 678 v = aa + 16*diag[i]; 679 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 680 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 681 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 682 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 683 } 684 685 PetscCall(ISRestoreIndices(isrow,&rout)); 686 PetscCall(ISRestoreIndices(iscol,&cout)); 687 PetscCall(VecRestoreArrayRead(bb,&b)); 688 PetscCall(VecRestoreArray(xx,&x)); 689 PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 690 PetscFunctionReturn(0); 691 } 692 693 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 694 { 695 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 696 IS iscol=a->col,isrow=a->row; 697 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 698 PetscInt i,nz,idx,idt,idc,m; 699 const PetscInt *r,*c,*rout,*cout; 700 const MatScalar *aa=a->a,*v; 701 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 702 const PetscScalar *b; 703 704 PetscFunctionBegin; 705 PetscCall(VecGetArrayRead(bb,&b)); 706 PetscCall(VecGetArray(xx,&x)); 707 t = a->solve_work; 708 709 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 710 PetscCall(ISGetIndices(iscol,&cout)); c = cout; 711 712 /* forward solve the lower triangular */ 713 idx = 4*r[0]; 714 t[0] = b[idx]; t[1] = b[1+idx]; 715 t[2] = b[2+idx]; t[3] = b[3+idx]; 716 for (i=1; i<n; i++) { 717 v = aa + 16*ai[i]; 718 vi = aj + ai[i]; 719 nz = ai[i+1] - ai[i]; 720 idx = 4*r[i]; 721 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 722 for (m=0; m<nz; m++) { 723 idx = 4*vi[m]; 724 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 725 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 726 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 727 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 728 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 729 v += 16; 730 } 731 idx = 4*i; 732 t[idx] = s1;t[1+idx] = s2; 733 t[2+idx] = s3;t[3+idx] = s4; 734 } 735 /* backward solve the upper triangular */ 736 for (i=n-1; i>=0; i--) { 737 v = aa + 16*(adiag[i+1]+1); 738 vi = aj + adiag[i+1]+1; 739 nz = adiag[i] - adiag[i+1] - 1; 740 idt = 4*i; 741 s1 = t[idt]; s2 = t[1+idt]; 742 s3 = t[2+idt];s4 = t[3+idt]; 743 for (m=0; m<nz; m++) { 744 idx = 4*vi[m]; 745 x1 = t[idx]; x2 = t[1+idx]; 746 x3 = t[2+idx]; x4 = t[3+idx]; 747 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 748 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 749 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 750 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 751 v += 16; 752 } 753 idc = 4*c[i]; 754 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 755 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 756 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 757 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 758 } 759 760 PetscCall(ISRestoreIndices(isrow,&rout)); 761 PetscCall(ISRestoreIndices(iscol,&cout)); 762 PetscCall(VecRestoreArrayRead(bb,&b)); 763 PetscCall(VecRestoreArray(xx,&x)); 764 PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 765 PetscFunctionReturn(0); 766 } 767 768 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 769 { 770 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 771 IS iscol=a->col,isrow=a->row; 772 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 773 PetscInt i,nz,idx,idt,idc; 774 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 775 const MatScalar *aa=a->a,*v; 776 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 777 PetscScalar *x; 778 const PetscScalar *b; 779 780 PetscFunctionBegin; 781 PetscCall(VecGetArrayRead(bb,&b)); 782 PetscCall(VecGetArray(xx,&x)); 783 t = (MatScalar*)a->solve_work; 784 785 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 786 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 787 788 /* forward solve the lower triangular */ 789 idx = 4*(*r++); 790 t[0] = (MatScalar)b[idx]; 791 t[1] = (MatScalar)b[1+idx]; 792 t[2] = (MatScalar)b[2+idx]; 793 t[3] = (MatScalar)b[3+idx]; 794 for (i=1; i<n; i++) { 795 v = aa + 16*ai[i]; 796 vi = aj + ai[i]; 797 nz = diag[i] - ai[i]; 798 idx = 4*(*r++); 799 s1 = (MatScalar)b[idx]; 800 s2 = (MatScalar)b[1+idx]; 801 s3 = (MatScalar)b[2+idx]; 802 s4 = (MatScalar)b[3+idx]; 803 while (nz--) { 804 idx = 4*(*vi++); 805 x1 = t[idx]; 806 x2 = t[1+idx]; 807 x3 = t[2+idx]; 808 x4 = t[3+idx]; 809 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 810 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 811 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 812 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 813 v += 16; 814 } 815 idx = 4*i; 816 t[idx] = s1; 817 t[1+idx] = s2; 818 t[2+idx] = s3; 819 t[3+idx] = s4; 820 } 821 /* backward solve the upper triangular */ 822 for (i=n-1; i>=0; i--) { 823 v = aa + 16*diag[i] + 16; 824 vi = aj + diag[i] + 1; 825 nz = ai[i+1] - diag[i] - 1; 826 idt = 4*i; 827 s1 = t[idt]; 828 s2 = t[1+idt]; 829 s3 = t[2+idt]; 830 s4 = t[3+idt]; 831 while (nz--) { 832 idx = 4*(*vi++); 833 x1 = t[idx]; 834 x2 = t[1+idx]; 835 x3 = t[2+idx]; 836 x4 = t[3+idx]; 837 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 838 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 839 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 840 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 841 v += 16; 842 } 843 idc = 4*(*c--); 844 v = aa + 16*diag[i]; 845 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 846 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 847 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 848 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 849 x[idc] = (PetscScalar)t[idt]; 850 x[1+idc] = (PetscScalar)t[1+idt]; 851 x[2+idc] = (PetscScalar)t[2+idt]; 852 x[3+idc] = (PetscScalar)t[3+idt]; 853 } 854 855 PetscCall(ISRestoreIndices(isrow,&rout)); 856 PetscCall(ISRestoreIndices(iscol,&cout)); 857 PetscCall(VecRestoreArrayRead(bb,&b)); 858 PetscCall(VecRestoreArray(xx,&x)); 859 PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 860 PetscFunctionReturn(0); 861 } 862 863 #if defined(PETSC_HAVE_SSE) 864 865 #include PETSC_HAVE_SSE 866 867 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 868 { 869 /* 870 Note: This code uses demotion of double 871 to float when performing the mixed-mode computation. 872 This may not be numerically reasonable for all applications. 873 */ 874 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 875 IS iscol=a->col,isrow=a->row; 876 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 877 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 878 MatScalar *aa=a->a,*v; 879 PetscScalar *x,*b,*t; 880 881 /* Make space in temp stack for 16 Byte Aligned arrays */ 882 float ssealignedspace[11],*tmps,*tmpx; 883 unsigned long offset; 884 885 PetscFunctionBegin; 886 SSE_SCOPE_BEGIN; 887 888 offset = (unsigned long)ssealignedspace % 16; 889 if (offset) offset = (16 - offset)/4; 890 tmps = &ssealignedspace[offset]; 891 tmpx = &ssealignedspace[offset+4]; 892 PREFETCH_NTA(aa+16*ai[1]); 893 894 PetscCall(VecGetArray(bb,&b)); 895 PetscCall(VecGetArray(xx,&x)); 896 t = a->solve_work; 897 898 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 899 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 900 901 /* forward solve the lower triangular */ 902 idx = 4*(*r++); 903 t[0] = b[idx]; t[1] = b[1+idx]; 904 t[2] = b[2+idx]; t[3] = b[3+idx]; 905 v = aa + 16*ai[1]; 906 907 for (i=1; i<n;) { 908 PREFETCH_NTA(&v[8]); 909 vi = aj + ai[i]; 910 nz = diag[i] - ai[i]; 911 idx = 4*(*r++); 912 913 /* Demote sum from double to float */ 914 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 915 LOAD_PS(tmps,XMM7); 916 917 while (nz--) { 918 PREFETCH_NTA(&v[16]); 919 idx = 4*(*vi++); 920 921 /* Demote solution (so far) from double to float */ 922 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 923 924 /* 4x4 Matrix-Vector product with negative accumulation: */ 925 SSE_INLINE_BEGIN_2(tmpx,v) 926 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 927 928 /* First Column */ 929 SSE_COPY_PS(XMM0,XMM6) 930 SSE_SHUFFLE(XMM0,XMM0,0x00) 931 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 932 SSE_SUB_PS(XMM7,XMM0) 933 934 /* Second Column */ 935 SSE_COPY_PS(XMM1,XMM6) 936 SSE_SHUFFLE(XMM1,XMM1,0x55) 937 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 938 SSE_SUB_PS(XMM7,XMM1) 939 940 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 941 942 /* Third Column */ 943 SSE_COPY_PS(XMM2,XMM6) 944 SSE_SHUFFLE(XMM2,XMM2,0xAA) 945 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 946 SSE_SUB_PS(XMM7,XMM2) 947 948 /* Fourth Column */ 949 SSE_COPY_PS(XMM3,XMM6) 950 SSE_SHUFFLE(XMM3,XMM3,0xFF) 951 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 952 SSE_SUB_PS(XMM7,XMM3) 953 SSE_INLINE_END_2 954 955 v += 16; 956 } 957 idx = 4*i; 958 v = aa + 16*ai[++i]; 959 PREFETCH_NTA(v); 960 STORE_PS(tmps,XMM7); 961 962 /* Promote result from float to double */ 963 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 964 } 965 /* backward solve the upper triangular */ 966 idt = 4*(n-1); 967 ai16 = 16*diag[n-1]; 968 v = aa + ai16 + 16; 969 for (i=n-1; i>=0;) { 970 PREFETCH_NTA(&v[8]); 971 vi = aj + diag[i] + 1; 972 nz = ai[i+1] - diag[i] - 1; 973 974 /* Demote accumulator from double to float */ 975 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 976 LOAD_PS(tmps,XMM7); 977 978 while (nz--) { 979 PREFETCH_NTA(&v[16]); 980 idx = 4*(*vi++); 981 982 /* Demote solution (so far) from double to float */ 983 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 984 985 /* 4x4 Matrix-Vector Product with negative accumulation: */ 986 SSE_INLINE_BEGIN_2(tmpx,v) 987 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 988 989 /* First Column */ 990 SSE_COPY_PS(XMM0,XMM6) 991 SSE_SHUFFLE(XMM0,XMM0,0x00) 992 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 993 SSE_SUB_PS(XMM7,XMM0) 994 995 /* Second Column */ 996 SSE_COPY_PS(XMM1,XMM6) 997 SSE_SHUFFLE(XMM1,XMM1,0x55) 998 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 999 SSE_SUB_PS(XMM7,XMM1) 1000 1001 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1002 1003 /* Third Column */ 1004 SSE_COPY_PS(XMM2,XMM6) 1005 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1006 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1007 SSE_SUB_PS(XMM7,XMM2) 1008 1009 /* Fourth Column */ 1010 SSE_COPY_PS(XMM3,XMM6) 1011 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1012 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1013 SSE_SUB_PS(XMM7,XMM3) 1014 SSE_INLINE_END_2 1015 v += 16; 1016 } 1017 v = aa + ai16; 1018 ai16 = 16*diag[--i]; 1019 PREFETCH_NTA(aa+ai16+16); 1020 /* 1021 Scale the result by the diagonal 4x4 block, 1022 which was inverted as part of the factorization 1023 */ 1024 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 1025 /* First Column */ 1026 SSE_COPY_PS(XMM0,XMM7) 1027 SSE_SHUFFLE(XMM0,XMM0,0x00) 1028 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 1029 1030 /* Second Column */ 1031 SSE_COPY_PS(XMM1,XMM7) 1032 SSE_SHUFFLE(XMM1,XMM1,0x55) 1033 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 1034 SSE_ADD_PS(XMM0,XMM1) 1035 1036 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 1037 1038 /* Third Column */ 1039 SSE_COPY_PS(XMM2,XMM7) 1040 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1041 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 1042 SSE_ADD_PS(XMM0,XMM2) 1043 1044 /* Fourth Column */ 1045 SSE_COPY_PS(XMM3,XMM7) 1046 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1047 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 1048 SSE_ADD_PS(XMM0,XMM3) 1049 1050 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 1051 SSE_INLINE_END_3 1052 1053 /* Promote solution from float to double */ 1054 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 1055 1056 /* Apply reordering to t and stream into x. */ 1057 /* This way, x doesn't pollute the cache. */ 1058 /* Be careful with size: 2 doubles = 4 floats! */ 1059 idc = 4*(*c--); 1060 SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc]) 1061 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1062 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 1063 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 1064 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1065 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 1066 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 1067 SSE_INLINE_END_2 1068 v = aa + ai16 + 16; 1069 idt -= 4; 1070 } 1071 1072 PetscCall(ISRestoreIndices(isrow,&rout)); 1073 PetscCall(ISRestoreIndices(iscol,&cout)); 1074 PetscCall(VecRestoreArray(bb,&b)); 1075 PetscCall(VecRestoreArray(xx,&x)); 1076 PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n)); 1077 SSE_SCOPE_END; 1078 PetscFunctionReturn(0); 1079 } 1080 1081 #endif 1082 1083 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1084 { 1085 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1086 IS iscol=a->col,isrow=a->row; 1087 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1088 PetscInt i,nz,idx,idt,idc; 1089 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1090 const MatScalar *aa=a->a,*v; 1091 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1092 const PetscScalar *b; 1093 1094 PetscFunctionBegin; 1095 PetscCall(VecGetArrayRead(bb,&b)); 1096 PetscCall(VecGetArray(xx,&x)); 1097 t = a->solve_work; 1098 1099 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1100 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 1101 1102 /* forward solve the lower triangular */ 1103 idx = 3*(*r++); 1104 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1105 for (i=1; i<n; i++) { 1106 v = aa + 9*ai[i]; 1107 vi = aj + ai[i]; 1108 nz = diag[i] - ai[i]; 1109 idx = 3*(*r++); 1110 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1111 while (nz--) { 1112 idx = 3*(*vi++); 1113 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1114 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1115 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1116 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1117 v += 9; 1118 } 1119 idx = 3*i; 1120 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1121 } 1122 /* backward solve the upper triangular */ 1123 for (i=n-1; i>=0; i--) { 1124 v = aa + 9*diag[i] + 9; 1125 vi = aj + diag[i] + 1; 1126 nz = ai[i+1] - diag[i] - 1; 1127 idt = 3*i; 1128 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1129 while (nz--) { 1130 idx = 3*(*vi++); 1131 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1132 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1133 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1134 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1135 v += 9; 1136 } 1137 idc = 3*(*c--); 1138 v = aa + 9*diag[i]; 1139 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1140 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1141 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1142 } 1143 PetscCall(ISRestoreIndices(isrow,&rout)); 1144 PetscCall(ISRestoreIndices(iscol,&cout)); 1145 PetscCall(VecRestoreArrayRead(bb,&b)); 1146 PetscCall(VecRestoreArray(xx,&x)); 1147 PetscCall(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n)); 1148 PetscFunctionReturn(0); 1149 } 1150 1151 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1152 { 1153 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1154 IS iscol=a->col,isrow=a->row; 1155 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1156 PetscInt i,nz,idx,idt,idc,m; 1157 const PetscInt *r,*c,*rout,*cout; 1158 const MatScalar *aa=a->a,*v; 1159 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 1160 const PetscScalar *b; 1161 1162 PetscFunctionBegin; 1163 PetscCall(VecGetArrayRead(bb,&b)); 1164 PetscCall(VecGetArray(xx,&x)); 1165 t = a->solve_work; 1166 1167 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1168 PetscCall(ISGetIndices(iscol,&cout)); c = cout; 1169 1170 /* forward solve the lower triangular */ 1171 idx = 3*r[0]; 1172 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1173 for (i=1; i<n; i++) { 1174 v = aa + 9*ai[i]; 1175 vi = aj + ai[i]; 1176 nz = ai[i+1] - ai[i]; 1177 idx = 3*r[i]; 1178 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1179 for (m=0; m<nz; m++) { 1180 idx = 3*vi[m]; 1181 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1182 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1183 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1184 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1185 v += 9; 1186 } 1187 idx = 3*i; 1188 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1189 } 1190 /* backward solve the upper triangular */ 1191 for (i=n-1; i>=0; i--) { 1192 v = aa + 9*(adiag[i+1]+1); 1193 vi = aj + adiag[i+1]+1; 1194 nz = adiag[i] - adiag[i+1] - 1; 1195 idt = 3*i; 1196 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1197 for (m=0; m<nz; m++) { 1198 idx = 3*vi[m]; 1199 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1200 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1201 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1202 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1203 v += 9; 1204 } 1205 idc = 3*c[i]; 1206 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1207 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1208 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1209 } 1210 PetscCall(ISRestoreIndices(isrow,&rout)); 1211 PetscCall(ISRestoreIndices(iscol,&cout)); 1212 PetscCall(VecRestoreArrayRead(bb,&b)); 1213 PetscCall(VecRestoreArray(xx,&x)); 1214 PetscCall(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n)); 1215 PetscFunctionReturn(0); 1216 } 1217 1218 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1219 { 1220 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1221 IS iscol=a->col,isrow=a->row; 1222 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1223 PetscInt i,nz,idx,idt,idc; 1224 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1225 const MatScalar *aa=a->a,*v; 1226 PetscScalar *x,s1,s2,x1,x2,*t; 1227 const PetscScalar *b; 1228 1229 PetscFunctionBegin; 1230 PetscCall(VecGetArrayRead(bb,&b)); 1231 PetscCall(VecGetArray(xx,&x)); 1232 t = a->solve_work; 1233 1234 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1235 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 1236 1237 /* forward solve the lower triangular */ 1238 idx = 2*(*r++); 1239 t[0] = b[idx]; t[1] = b[1+idx]; 1240 for (i=1; i<n; i++) { 1241 v = aa + 4*ai[i]; 1242 vi = aj + ai[i]; 1243 nz = diag[i] - ai[i]; 1244 idx = 2*(*r++); 1245 s1 = b[idx]; s2 = b[1+idx]; 1246 while (nz--) { 1247 idx = 2*(*vi++); 1248 x1 = t[idx]; x2 = t[1+idx]; 1249 s1 -= v[0]*x1 + v[2]*x2; 1250 s2 -= v[1]*x1 + v[3]*x2; 1251 v += 4; 1252 } 1253 idx = 2*i; 1254 t[idx] = s1; t[1+idx] = s2; 1255 } 1256 /* backward solve the upper triangular */ 1257 for (i=n-1; i>=0; i--) { 1258 v = aa + 4*diag[i] + 4; 1259 vi = aj + diag[i] + 1; 1260 nz = ai[i+1] - diag[i] - 1; 1261 idt = 2*i; 1262 s1 = t[idt]; s2 = t[1+idt]; 1263 while (nz--) { 1264 idx = 2*(*vi++); 1265 x1 = t[idx]; x2 = t[1+idx]; 1266 s1 -= v[0]*x1 + v[2]*x2; 1267 s2 -= v[1]*x1 + v[3]*x2; 1268 v += 4; 1269 } 1270 idc = 2*(*c--); 1271 v = aa + 4*diag[i]; 1272 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1273 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1274 } 1275 PetscCall(ISRestoreIndices(isrow,&rout)); 1276 PetscCall(ISRestoreIndices(iscol,&cout)); 1277 PetscCall(VecRestoreArrayRead(bb,&b)); 1278 PetscCall(VecRestoreArray(xx,&x)); 1279 PetscCall(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n)); 1280 PetscFunctionReturn(0); 1281 } 1282 1283 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1284 { 1285 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1286 IS iscol=a->col,isrow=a->row; 1287 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 1288 PetscInt i,nz,idx,jdx,idt,idc,m; 1289 const PetscInt *r,*c,*rout,*cout; 1290 const MatScalar *aa=a->a,*v; 1291 PetscScalar *x,s1,s2,x1,x2,*t; 1292 const PetscScalar *b; 1293 1294 PetscFunctionBegin; 1295 PetscCall(VecGetArrayRead(bb,&b)); 1296 PetscCall(VecGetArray(xx,&x)); 1297 t = a->solve_work; 1298 1299 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1300 PetscCall(ISGetIndices(iscol,&cout)); c = cout; 1301 1302 /* forward solve the lower triangular */ 1303 idx = 2*r[0]; 1304 t[0] = b[idx]; t[1] = b[1+idx]; 1305 for (i=1; i<n; i++) { 1306 v = aa + 4*ai[i]; 1307 vi = aj + ai[i]; 1308 nz = ai[i+1] - ai[i]; 1309 idx = 2*r[i]; 1310 s1 = b[idx]; s2 = b[1+idx]; 1311 for (m=0; m<nz; m++) { 1312 jdx = 2*vi[m]; 1313 x1 = t[jdx]; x2 = t[1+jdx]; 1314 s1 -= v[0]*x1 + v[2]*x2; 1315 s2 -= v[1]*x1 + v[3]*x2; 1316 v += 4; 1317 } 1318 idx = 2*i; 1319 t[idx] = s1; t[1+idx] = s2; 1320 } 1321 /* backward solve the upper triangular */ 1322 for (i=n-1; i>=0; i--) { 1323 v = aa + 4*(adiag[i+1]+1); 1324 vi = aj + adiag[i+1]+1; 1325 nz = adiag[i] - adiag[i+1] - 1; 1326 idt = 2*i; 1327 s1 = t[idt]; s2 = t[1+idt]; 1328 for (m=0; m<nz; m++) { 1329 idx = 2*vi[m]; 1330 x1 = t[idx]; x2 = t[1+idx]; 1331 s1 -= v[0]*x1 + v[2]*x2; 1332 s2 -= v[1]*x1 + v[3]*x2; 1333 v += 4; 1334 } 1335 idc = 2*c[i]; 1336 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 1337 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 1338 } 1339 PetscCall(ISRestoreIndices(isrow,&rout)); 1340 PetscCall(ISRestoreIndices(iscol,&cout)); 1341 PetscCall(VecRestoreArrayRead(bb,&b)); 1342 PetscCall(VecRestoreArray(xx,&x)); 1343 PetscCall(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n)); 1344 PetscFunctionReturn(0); 1345 } 1346 1347 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1348 { 1349 Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1350 IS iscol=a->col,isrow=a->row; 1351 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1352 PetscInt i,nz; 1353 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1354 const MatScalar *aa=a->a,*v; 1355 PetscScalar *x,s1,*t; 1356 const PetscScalar *b; 1357 1358 PetscFunctionBegin; 1359 if (!n) PetscFunctionReturn(0); 1360 1361 PetscCall(VecGetArrayRead(bb,&b)); 1362 PetscCall(VecGetArray(xx,&x)); 1363 t = a->solve_work; 1364 1365 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1366 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 1367 1368 /* forward solve the lower triangular */ 1369 t[0] = b[*r++]; 1370 for (i=1; i<n; i++) { 1371 v = aa + ai[i]; 1372 vi = aj + ai[i]; 1373 nz = diag[i] - ai[i]; 1374 s1 = b[*r++]; 1375 while (nz--) { 1376 s1 -= (*v++)*t[*vi++]; 1377 } 1378 t[i] = s1; 1379 } 1380 /* backward solve the upper triangular */ 1381 for (i=n-1; i>=0; i--) { 1382 v = aa + diag[i] + 1; 1383 vi = aj + diag[i] + 1; 1384 nz = ai[i+1] - diag[i] - 1; 1385 s1 = t[i]; 1386 while (nz--) { 1387 s1 -= (*v++)*t[*vi++]; 1388 } 1389 x[*c--] = t[i] = aa[diag[i]]*s1; 1390 } 1391 1392 PetscCall(ISRestoreIndices(isrow,&rout)); 1393 PetscCall(ISRestoreIndices(iscol,&cout)); 1394 PetscCall(VecRestoreArrayRead(bb,&b)); 1395 PetscCall(VecRestoreArray(xx,&x)); 1396 PetscCall(PetscLogFlops(2.0*1*(a->nz) - A->cmap->n)); 1397 PetscFunctionReturn(0); 1398 } 1399 1400 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 1401 { 1402 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1403 IS iscol = a->col,isrow = a->row; 1404 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 1405 const PetscInt *rout,*cout,*r,*c; 1406 PetscScalar *x,*tmp,sum; 1407 const PetscScalar *b; 1408 const MatScalar *aa = a->a,*v; 1409 1410 PetscFunctionBegin; 1411 if (!n) PetscFunctionReturn(0); 1412 1413 PetscCall(VecGetArrayRead(bb,&b)); 1414 PetscCall(VecGetArray(xx,&x)); 1415 tmp = a->solve_work; 1416 1417 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 1418 PetscCall(ISGetIndices(iscol,&cout)); c = cout; 1419 1420 /* forward solve the lower triangular */ 1421 tmp[0] = b[r[0]]; 1422 v = aa; 1423 vi = aj; 1424 for (i=1; i<n; i++) { 1425 nz = ai[i+1] - ai[i]; 1426 sum = b[r[i]]; 1427 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1428 tmp[i] = sum; 1429 v += nz; vi += nz; 1430 } 1431 1432 /* backward solve the upper triangular */ 1433 for (i=n-1; i>=0; i--) { 1434 v = aa + adiag[i+1]+1; 1435 vi = aj + adiag[i+1]+1; 1436 nz = adiag[i]-adiag[i+1]-1; 1437 sum = tmp[i]; 1438 PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 1439 x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 1440 } 1441 1442 PetscCall(ISRestoreIndices(isrow,&rout)); 1443 PetscCall(ISRestoreIndices(iscol,&cout)); 1444 PetscCall(VecRestoreArrayRead(bb,&b)); 1445 PetscCall(VecRestoreArray(xx,&x)); 1446 PetscCall(PetscLogFlops(2.0*a->nz - A->cmap->n)); 1447 PetscFunctionReturn(0); 1448 } 1449