1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 11 { 12 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 13 IS isrow=a->row; 14 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 15 PetscErrorCode ierr; 16 const PetscInt *r; 17 PetscInt nz,*vj,k,idx,k1; 18 PetscInt bs =A->rmap->bs,bs2 = a->bs2; 19 const MatScalar *aa=a->a,*v,*diag; 20 PetscScalar *x,*xk,*xj,*xk_tmp,*t; 21 const PetscScalar *b; 22 23 PetscFunctionBegin; 24 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 t = a->solve_work; 27 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 28 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 29 30 /* solve U^T * D * y = b by forward substitution */ 31 xk = t; 32 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 33 idx = bs*r[k]; 34 for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1]; 35 } 36 for (k=0; k<mbs; k++) { 37 v = aa + bs2*ai[k]; 38 xk = t + k*bs; /* Dk*xk = k-th block of x */ 39 ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */ 40 nz = ai[k+1] - ai[k]; 41 vj = aj + ai[k]; 42 xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */ 43 while (nz--) { 44 /* x(:) += U(k,:)^T*(Dk*xk) */ 45 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 46 vj++; xj = t + (*vj)*bs; 47 v += bs2; 48 } 49 /* xk = inv(Dk)*(Dk*xk) */ 50 diag = aa+k*bs2; /* ptr to inv(Dk) */ 51 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 52 } 53 54 /* solve U*x = y by back substitution */ 55 for (k=mbs-1; k>=0; k--) { 56 v = aa + bs2*ai[k]; 57 xk = t + k*bs; /* xk */ 58 nz = ai[k+1] - ai[k]; 59 vj = aj + ai[k]; 60 xj = t + (*vj)*bs; 61 while (nz--) { 62 /* xk += U(k,:)*x(:) */ 63 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 64 vj++; 65 v += bs2; xj = t + (*vj)*bs; 66 } 67 idx = bs*r[k]; 68 for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++; 69 } 70 71 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 72 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 73 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 74 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 75 ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr); 76 PetscFunctionReturn(0); 77 } 78 79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 80 { 81 PetscFunctionBegin; 82 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet"); 83 PetscFunctionReturn(0); 84 } 85 86 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 87 { 88 PetscFunctionBegin; 89 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented"); 90 PetscFunctionReturn(0); 91 } 92 93 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 94 { 95 PetscErrorCode ierr; 96 PetscInt nz,k; 97 const PetscInt *vj,bs2 = bs*bs; 98 const MatScalar *v,*diag; 99 PetscScalar *xk,*xj,*xk_tmp; 100 101 PetscFunctionBegin; 102 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 103 for (k=0; k<mbs; k++) { 104 v = aa + bs2*ai[k]; 105 xk = x + k*bs; /* Dk*xk = k-th block of x */ 106 ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */ 107 nz = ai[k+1] - ai[k]; 108 vj = aj + ai[k]; 109 xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */ 110 while (nz--) { 111 /* x(:) += U(k,:)^T*(Dk*xk) */ 112 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 113 vj++; xj = x + (*vj)*bs; 114 v += bs2; 115 } 116 /* xk = inv(Dk)*(Dk*xk) */ 117 diag = aa+k*bs2; /* ptr to inv(Dk) */ 118 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 119 } 120 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 121 PetscFunctionReturn(0); 122 } 123 124 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 125 { 126 PetscInt nz,k; 127 const PetscInt *vj,bs2 = bs*bs; 128 const MatScalar *v; 129 PetscScalar *xk,*xj; 130 131 PetscFunctionBegin; 132 for (k=mbs-1; k>=0; k--) { 133 v = aa + bs2*ai[k]; 134 xk = x + k*bs; /* xk */ 135 nz = ai[k+1] - ai[k]; 136 vj = aj + ai[k]; 137 xj = x + (*vj)*bs; 138 while (nz--) { 139 /* xk += U(k,:)*x(:) */ 140 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 141 vj++; 142 v += bs2; xj = x + (*vj)*bs; 143 } 144 } 145 PetscFunctionReturn(0); 146 } 147 148 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 149 { 150 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 151 PetscErrorCode ierr; 152 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 153 PetscInt bs =A->rmap->bs; 154 const MatScalar *aa=a->a; 155 PetscScalar *x; 156 const PetscScalar *b; 157 158 PetscFunctionBegin; 159 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 160 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 161 162 /* solve U^T * D * y = b by forward substitution */ 163 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */ 164 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 165 166 /* solve U*x = y by back substitution */ 167 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 168 169 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 170 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 171 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 172 PetscFunctionReturn(0); 173 } 174 175 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 176 { 177 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 178 PetscErrorCode ierr; 179 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 180 PetscInt bs =A->rmap->bs; 181 const MatScalar *aa=a->a; 182 const PetscScalar *b; 183 PetscScalar *x; 184 185 PetscFunctionBegin; 186 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 187 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 188 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */ 189 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 190 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 191 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 192 ierr = PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);CHKERRQ(ierr); 193 PetscFunctionReturn(0); 194 } 195 196 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 197 { 198 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 199 PetscErrorCode ierr; 200 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 201 PetscInt bs =A->rmap->bs; 202 const MatScalar *aa=a->a; 203 const PetscScalar *b; 204 PetscScalar *x; 205 206 PetscFunctionBegin; 207 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 208 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 209 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); 210 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 211 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 212 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 213 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 214 PetscFunctionReturn(0); 215 } 216 217 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 218 { 219 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 220 IS isrow=a->row; 221 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 222 PetscErrorCode ierr; 223 PetscInt nz,k,idx; 224 const MatScalar *aa=a->a,*v,*d; 225 PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp; 226 const PetscScalar *b; 227 228 PetscFunctionBegin; 229 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 230 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 231 t = a->solve_work; 232 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 233 234 /* solve U^T * D * y = b by forward substitution */ 235 tp = t; 236 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 237 idx = 7*r[k]; 238 tp[0] = b[idx]; 239 tp[1] = b[idx+1]; 240 tp[2] = b[idx+2]; 241 tp[3] = b[idx+3]; 242 tp[4] = b[idx+4]; 243 tp[5] = b[idx+5]; 244 tp[6] = b[idx+6]; 245 tp += 7; 246 } 247 248 for (k=0; k<mbs; k++) { 249 v = aa + 49*ai[k]; 250 vj = aj + ai[k]; 251 tp = t + k*7; 252 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; 253 nz = ai[k+1] - ai[k]; 254 tp = t + (*vj)*7; 255 while (nz--) { 256 tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 257 tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 258 tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 259 tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 260 tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 261 tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 262 tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 263 vj++; 264 tp = t + (*vj)*7; 265 v += 49; 266 } 267 268 /* xk = inv(Dk)*(Dk*xk) */ 269 d = aa+k*49; /* ptr to inv(Dk) */ 270 tp = t + k*7; 271 tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 272 tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 273 tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 274 tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 275 tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 276 tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 277 tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 278 } 279 280 /* solve U*x = y by back substitution */ 281 for (k=mbs-1; k>=0; k--) { 282 v = aa + 49*ai[k]; 283 vj = aj + ai[k]; 284 tp = t + k*7; 285 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */ 286 nz = ai[k+1] - ai[k]; 287 288 tp = t + (*vj)*7; 289 while (nz--) { 290 /* xk += U(k,:)*x(:) */ 291 x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6]; 292 x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6]; 293 x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6]; 294 x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6]; 295 x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6]; 296 x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6]; 297 x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6]; 298 vj++; 299 tp = t + (*vj)*7; 300 v += 49; 301 } 302 tp = t + k*7; 303 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6; 304 idx = 7*r[k]; 305 x[idx] = x0; 306 x[idx+1] = x1; 307 x[idx+2] = x2; 308 x[idx+3] = x3; 309 x[idx+4] = x4; 310 x[idx+5] = x5; 311 x[idx+6] = x6; 312 } 313 314 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 315 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 316 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 317 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 318 PetscFunctionReturn(0); 319 } 320 321 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 322 { 323 const MatScalar *v,*d; 324 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 325 PetscInt nz,k; 326 const PetscInt *vj; 327 328 PetscFunctionBegin; 329 for (k=0; k<mbs; k++) { 330 v = aa + 49*ai[k]; 331 xp = x + k*7; 332 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */ 333 nz = ai[k+1] - ai[k]; 334 vj = aj + ai[k]; 335 xp = x + (*vj)*7; 336 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 337 PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 338 while (nz--) { 339 /* x(:) += U(k,:)^T*(Dk*xk) */ 340 xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 341 xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 342 xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 343 xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 344 xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 345 xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 346 xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 347 vj++; 348 xp = x + (*vj)*7; 349 v += 49; 350 } 351 /* xk = inv(Dk)*(Dk*xk) */ 352 d = aa+k*49; /* ptr to inv(Dk) */ 353 xp = x + k*7; 354 xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 355 xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 356 xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 357 xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 358 xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 359 xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 360 xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 361 } 362 PetscFunctionReturn(0); 363 } 364 365 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 366 { 367 const MatScalar *v; 368 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 369 PetscInt nz,k; 370 const PetscInt *vj; 371 372 PetscFunctionBegin; 373 for (k=mbs-1; k>=0; k--) { 374 v = aa + 49*ai[k]; 375 xp = x + k*7; 376 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */ 377 nz = ai[k+1] - ai[k]; 378 vj = aj + ai[k]; 379 xp = x + (*vj)*7; 380 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 381 PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 382 while (nz--) { 383 /* xk += U(k,:)*x(:) */ 384 x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6]; 385 x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6]; 386 x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6]; 387 x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6]; 388 x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6]; 389 x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6]; 390 x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6]; 391 vj++; 392 v += 49; xp = x + (*vj)*7; 393 } 394 xp = x + k*7; 395 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6; 396 } 397 PetscFunctionReturn(0); 398 } 399 400 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 401 { 402 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 403 PetscErrorCode ierr; 404 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 405 const MatScalar *aa=a->a; 406 PetscScalar *x; 407 const PetscScalar *b; 408 409 PetscFunctionBegin; 410 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 411 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 412 413 /* solve U^T * D * y = b by forward substitution */ 414 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); /* x <- b */ 415 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 416 417 /* solve U*x = y by back substitution */ 418 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 419 420 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 421 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 422 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 423 PetscFunctionReturn(0); 424 } 425 426 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 427 { 428 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 429 PetscErrorCode ierr; 430 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 431 const MatScalar *aa=a->a; 432 PetscScalar *x; 433 const PetscScalar *b; 434 435 PetscFunctionBegin; 436 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 437 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 438 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); 439 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 440 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 441 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 442 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 443 PetscFunctionReturn(0); 444 } 445 446 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 447 { 448 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 449 PetscErrorCode ierr; 450 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 451 const MatScalar *aa=a->a; 452 PetscScalar *x; 453 const PetscScalar *b; 454 455 PetscFunctionBegin; 456 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 457 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 458 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); 459 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 460 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 461 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 462 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 463 PetscFunctionReturn(0); 464 } 465 466 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 467 { 468 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 469 IS isrow=a->row; 470 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 471 PetscErrorCode ierr; 472 PetscInt nz,k,idx; 473 const MatScalar *aa=a->a,*v,*d; 474 PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp; 475 const PetscScalar *b; 476 477 PetscFunctionBegin; 478 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 479 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480 t = a->solve_work; 481 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 482 483 /* solve U^T * D * y = b by forward substitution */ 484 tp = t; 485 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 486 idx = 6*r[k]; 487 tp[0] = b[idx]; 488 tp[1] = b[idx+1]; 489 tp[2] = b[idx+2]; 490 tp[3] = b[idx+3]; 491 tp[4] = b[idx+4]; 492 tp[5] = b[idx+5]; 493 tp += 6; 494 } 495 496 for (k=0; k<mbs; k++) { 497 v = aa + 36*ai[k]; 498 vj = aj + ai[k]; 499 tp = t + k*6; 500 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; 501 nz = ai[k+1] - ai[k]; 502 tp = t + (*vj)*6; 503 while (nz--) { 504 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 505 tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 506 tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 507 tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 508 tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 509 tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 510 vj++; 511 tp = t + (*vj)*6; 512 v += 36; 513 } 514 515 /* xk = inv(Dk)*(Dk*xk) */ 516 d = aa+k*36; /* ptr to inv(Dk) */ 517 tp = t + k*6; 518 tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 519 tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 520 tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 521 tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 522 tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 523 tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 524 } 525 526 /* solve U*x = y by back substitution */ 527 for (k=mbs-1; k>=0; k--) { 528 v = aa + 36*ai[k]; 529 vj = aj + ai[k]; 530 tp = t + k*6; 531 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */ 532 nz = ai[k+1] - ai[k]; 533 534 tp = t + (*vj)*6; 535 while (nz--) { 536 /* xk += U(k,:)*x(:) */ 537 x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5]; 538 x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5]; 539 x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5]; 540 x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5]; 541 x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5]; 542 x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5]; 543 vj++; 544 tp = t + (*vj)*6; 545 v += 36; 546 } 547 tp = t + k*6; 548 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; 549 idx = 6*r[k]; 550 x[idx] = x0; 551 x[idx+1] = x1; 552 x[idx+2] = x2; 553 x[idx+3] = x3; 554 x[idx+4] = x4; 555 x[idx+5] = x5; 556 } 557 558 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 559 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 560 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 561 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } 564 565 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 566 { 567 const MatScalar *v,*d; 568 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 569 PetscInt nz,k; 570 const PetscInt *vj; 571 572 PetscFunctionBegin; 573 for (k=0; k<mbs; k++) { 574 v = aa + 36*ai[k]; 575 xp = x + k*6; 576 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */ 577 nz = ai[k+1] - ai[k]; 578 vj = aj + ai[k]; 579 xp = x + (*vj)*6; 580 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 581 PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 582 while (nz--) { 583 /* x(:) += U(k,:)^T*(Dk*xk) */ 584 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 585 xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 586 xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 587 xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 588 xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 589 xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 590 vj++; 591 xp = x + (*vj)*6; 592 v += 36; 593 } 594 /* xk = inv(Dk)*(Dk*xk) */ 595 d = aa+k*36; /* ptr to inv(Dk) */ 596 xp = x + k*6; 597 xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 598 xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 599 xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 600 xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 601 xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 602 xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 603 } 604 PetscFunctionReturn(0); 605 } 606 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 607 { 608 const MatScalar *v; 609 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 610 PetscInt nz,k; 611 const PetscInt *vj; 612 613 PetscFunctionBegin; 614 for (k=mbs-1; k>=0; k--) { 615 v = aa + 36*ai[k]; 616 xp = x + k*6; 617 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */ 618 nz = ai[k+1] - ai[k]; 619 vj = aj + ai[k]; 620 xp = x + (*vj)*6; 621 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 622 PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 623 while (nz--) { 624 /* xk += U(k,:)*x(:) */ 625 x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5]; 626 x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5]; 627 x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5]; 628 x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5]; 629 x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5]; 630 x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5]; 631 vj++; 632 v += 36; xp = x + (*vj)*6; 633 } 634 xp = x + k*6; 635 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; 636 } 637 PetscFunctionReturn(0); 638 } 639 640 641 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 642 { 643 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 644 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 645 const MatScalar *aa=a->a; 646 PetscScalar *x; 647 const PetscScalar *b; 648 PetscErrorCode ierr; 649 650 PetscFunctionBegin; 651 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 652 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 653 654 /* solve U^T * D * y = b by forward substitution */ 655 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 656 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 657 658 /* solve U*x = y by back substitution */ 659 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 660 661 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 662 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 663 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 664 PetscFunctionReturn(0); 665 } 666 667 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 668 { 669 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 670 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 671 const MatScalar *aa=a->a; 672 PetscScalar *x; 673 const PetscScalar *b; 674 PetscErrorCode ierr; 675 676 PetscFunctionBegin; 677 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 678 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 679 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 680 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 681 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 682 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 683 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 684 PetscFunctionReturn(0); 685 } 686 687 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 688 { 689 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 690 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 691 const MatScalar *aa=a->a; 692 PetscScalar *x; 693 const PetscScalar *b; 694 PetscErrorCode ierr; 695 696 PetscFunctionBegin; 697 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 698 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 699 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 700 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 701 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 703 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 704 PetscFunctionReturn(0); 705 } 706 707 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 708 { 709 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 710 IS isrow=a->row; 711 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 712 PetscErrorCode ierr; 713 const PetscInt *r,*vj; 714 PetscInt nz,k,idx; 715 const MatScalar *aa=a->a,*v,*diag; 716 PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp; 717 const PetscScalar *b; 718 719 PetscFunctionBegin; 720 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 721 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 722 t = a->solve_work; 723 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 724 725 /* solve U^T * D * y = b by forward substitution */ 726 tp = t; 727 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 728 idx = 5*r[k]; 729 tp[0] = b[idx]; 730 tp[1] = b[idx+1]; 731 tp[2] = b[idx+2]; 732 tp[3] = b[idx+3]; 733 tp[4] = b[idx+4]; 734 tp += 5; 735 } 736 737 for (k=0; k<mbs; k++) { 738 v = aa + 25*ai[k]; 739 vj = aj + ai[k]; 740 tp = t + k*5; 741 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; 742 nz = ai[k+1] - ai[k]; 743 744 tp = t + (*vj)*5; 745 while (nz--) { 746 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 747 tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 748 tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4; 749 tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4; 750 tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4; 751 vj++; 752 tp = t + (*vj)*5; 753 v += 25; 754 } 755 756 /* xk = inv(Dk)*(Dk*xk) */ 757 diag = aa+k*25; /* ptr to inv(Dk) */ 758 tp = t + k*5; 759 tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 760 tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 761 tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 762 tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 763 tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 764 } 765 766 /* solve U*x = y by back substitution */ 767 for (k=mbs-1; k>=0; k--) { 768 v = aa + 25*ai[k]; 769 vj = aj + ai[k]; 770 tp = t + k*5; 771 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */ 772 nz = ai[k+1] - ai[k]; 773 774 tp = t + (*vj)*5; 775 while (nz--) { 776 /* xk += U(k,:)*x(:) */ 777 x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4]; 778 x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4]; 779 x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4]; 780 x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4]; 781 x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4]; 782 vj++; 783 tp = t + (*vj)*5; 784 v += 25; 785 } 786 tp = t + k*5; 787 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; 788 idx = 5*r[k]; 789 x[idx] = x0; 790 x[idx+1] = x1; 791 x[idx+2] = x2; 792 x[idx+3] = x3; 793 x[idx+4] = x4; 794 } 795 796 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 797 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 798 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 799 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 800 PetscFunctionReturn(0); 801 } 802 803 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 804 { 805 const MatScalar *v,*diag; 806 PetscScalar *xp,x0,x1,x2,x3,x4; 807 PetscInt nz,k; 808 const PetscInt *vj; 809 810 PetscFunctionBegin; 811 for (k=0; k<mbs; k++) { 812 v = aa + 25*ai[k]; 813 xp = x + k*5; 814 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */ 815 nz = ai[k+1] - ai[k]; 816 vj = aj + ai[k]; 817 xp = x + (*vj)*5; 818 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 819 PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 820 while (nz--) { 821 /* x(:) += U(k,:)^T*(Dk*xk) */ 822 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 823 xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 824 xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4; 825 xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4; 826 xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4; 827 vj++; 828 xp = x + (*vj)*5; 829 v += 25; 830 } 831 /* xk = inv(Dk)*(Dk*xk) */ 832 diag = aa+k*25; /* ptr to inv(Dk) */ 833 xp = x + k*5; 834 xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 835 xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 836 xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 837 xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 838 xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 839 } 840 PetscFunctionReturn(0); 841 } 842 843 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 844 { 845 const MatScalar *v; 846 PetscScalar *xp,x0,x1,x2,x3,x4; 847 PetscInt nz,k; 848 const PetscInt *vj; 849 850 PetscFunctionBegin; 851 for (k=mbs-1; k>=0; k--) { 852 v = aa + 25*ai[k]; 853 xp = x + k*5; 854 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */ 855 nz = ai[k+1] - ai[k]; 856 vj = aj + ai[k]; 857 xp = x + (*vj)*5; 858 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 859 PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 860 while (nz--) { 861 /* xk += U(k,:)*x(:) */ 862 x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4]; 863 x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4]; 864 x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4]; 865 x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4]; 866 x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4]; 867 vj++; 868 v += 25; xp = x + (*vj)*5; 869 } 870 xp = x + k*5; 871 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; 872 } 873 PetscFunctionReturn(0); 874 } 875 876 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 877 { 878 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 879 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 880 const MatScalar *aa=a->a; 881 PetscScalar *x; 882 const PetscScalar *b; 883 PetscErrorCode ierr; 884 885 PetscFunctionBegin; 886 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 887 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 888 889 /* solve U^T * D * y = b by forward substitution */ 890 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */ 891 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 892 893 /* solve U*x = y by back substitution */ 894 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 895 896 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 897 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 898 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 899 PetscFunctionReturn(0); 900 } 901 902 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 903 { 904 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 905 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 906 const MatScalar *aa=a->a; 907 PetscScalar *x; 908 const PetscScalar *b; 909 PetscErrorCode ierr; 910 911 PetscFunctionBegin; 912 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */ 915 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 916 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 917 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 918 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 923 { 924 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 925 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 926 const MatScalar *aa=a->a; 927 PetscScalar *x; 928 const PetscScalar *b; 929 PetscErrorCode ierr; 930 931 PetscFunctionBegin; 932 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 933 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 934 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); 935 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 936 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 937 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 938 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 939 PetscFunctionReturn(0); 940 } 941 942 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 943 { 944 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 945 IS isrow=a->row; 946 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 947 PetscErrorCode ierr; 948 const PetscInt *r,*vj; 949 PetscInt nz,k,idx; 950 const MatScalar *aa=a->a,*v,*diag; 951 PetscScalar *x,x0,x1,x2,x3,*t,*tp; 952 const PetscScalar *b; 953 954 PetscFunctionBegin; 955 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 956 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 957 t = a->solve_work; 958 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 959 960 /* solve U^T * D * y = b by forward substitution */ 961 tp = t; 962 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 963 idx = 4*r[k]; 964 tp[0] = b[idx]; 965 tp[1] = b[idx+1]; 966 tp[2] = b[idx+2]; 967 tp[3] = b[idx+3]; 968 tp += 4; 969 } 970 971 for (k=0; k<mbs; k++) { 972 v = aa + 16*ai[k]; 973 vj = aj + ai[k]; 974 tp = t + k*4; 975 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; 976 nz = ai[k+1] - ai[k]; 977 978 tp = t + (*vj)*4; 979 while (nz--) { 980 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 981 tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 982 tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 983 tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 984 vj++; 985 tp = t + (*vj)*4; 986 v += 16; 987 } 988 989 /* xk = inv(Dk)*(Dk*xk) */ 990 diag = aa+k*16; /* ptr to inv(Dk) */ 991 tp = t + k*4; 992 tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 993 tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 994 tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 995 tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 996 } 997 998 /* solve U*x = y by back substitution */ 999 for (k=mbs-1; k>=0; k--) { 1000 v = aa + 16*ai[k]; 1001 vj = aj + ai[k]; 1002 tp = t + k*4; 1003 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */ 1004 nz = ai[k+1] - ai[k]; 1005 1006 tp = t + (*vj)*4; 1007 while (nz--) { 1008 /* xk += U(k,:)*x(:) */ 1009 x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3]; 1010 x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3]; 1011 x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3]; 1012 x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3]; 1013 vj++; 1014 tp = t + (*vj)*4; 1015 v += 16; 1016 } 1017 tp = t + k*4; 1018 tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; 1019 idx = 4*r[k]; 1020 x[idx] = x0; 1021 x[idx+1] = x1; 1022 x[idx+2] = x2; 1023 x[idx+3] = x3; 1024 } 1025 1026 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1027 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1028 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1029 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1030 PetscFunctionReturn(0); 1031 } 1032 1033 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1034 { 1035 const MatScalar *v,*diag; 1036 PetscScalar *xp,x0,x1,x2,x3; 1037 PetscInt nz,k; 1038 const PetscInt *vj; 1039 1040 PetscFunctionBegin; 1041 for (k=0; k<mbs; k++) { 1042 v = aa + 16*ai[k]; 1043 xp = x + k*4; 1044 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */ 1045 nz = ai[k+1] - ai[k]; 1046 vj = aj + ai[k]; 1047 xp = x + (*vj)*4; 1048 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1049 PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1050 while (nz--) { 1051 /* x(:) += U(k,:)^T*(Dk*xk) */ 1052 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1053 xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1054 xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1055 xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1056 vj++; 1057 xp = x + (*vj)*4; 1058 v += 16; 1059 } 1060 /* xk = inv(Dk)*(Dk*xk) */ 1061 diag = aa+k*16; /* ptr to inv(Dk) */ 1062 xp = x + k*4; 1063 xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1064 xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1065 xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1066 xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1067 } 1068 PetscFunctionReturn(0); 1069 } 1070 1071 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1072 { 1073 const MatScalar *v; 1074 PetscScalar *xp,x0,x1,x2,x3; 1075 PetscInt nz,k; 1076 const PetscInt *vj; 1077 1078 PetscFunctionBegin; 1079 for (k=mbs-1; k>=0; k--) { 1080 v = aa + 16*ai[k]; 1081 xp = x + k*4; 1082 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */ 1083 nz = ai[k+1] - ai[k]; 1084 vj = aj + ai[k]; 1085 xp = x + (*vj)*4; 1086 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1087 PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1088 while (nz--) { 1089 /* xk += U(k,:)*x(:) */ 1090 x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3]; 1091 x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3]; 1092 x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3]; 1093 x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3]; 1094 vj++; 1095 v += 16; xp = x + (*vj)*4; 1096 } 1097 xp = x + k*4; 1098 xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3; 1099 } 1100 PetscFunctionReturn(0); 1101 } 1102 1103 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1104 { 1105 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1106 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1107 const MatScalar *aa=a->a; 1108 PetscScalar *x; 1109 const PetscScalar *b; 1110 PetscErrorCode ierr; 1111 1112 PetscFunctionBegin; 1113 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1114 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1115 1116 /* solve U^T * D * y = b by forward substitution */ 1117 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */ 1118 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1119 1120 /* solve U*x = y by back substitution */ 1121 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1122 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1123 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1124 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1125 PetscFunctionReturn(0); 1126 } 1127 1128 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1129 { 1130 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1131 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1132 const MatScalar *aa=a->a; 1133 PetscScalar *x; 1134 const PetscScalar *b; 1135 PetscErrorCode ierr; 1136 1137 PetscFunctionBegin; 1138 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1139 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1140 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */ 1141 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1142 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1143 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1144 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1145 PetscFunctionReturn(0); 1146 } 1147 1148 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1149 { 1150 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1151 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1152 const MatScalar *aa=a->a; 1153 PetscScalar *x; 1154 const PetscScalar *b; 1155 PetscErrorCode ierr; 1156 1157 PetscFunctionBegin; 1158 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1159 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1160 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); 1161 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1162 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1163 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1164 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1165 PetscFunctionReturn(0); 1166 } 1167 1168 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1169 { 1170 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1171 IS isrow=a->row; 1172 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1173 PetscErrorCode ierr; 1174 const PetscInt *r; 1175 PetscInt nz,k,idx; 1176 const PetscInt *vj; 1177 const MatScalar *aa=a->a,*v,*diag; 1178 PetscScalar *x,x0,x1,x2,*t,*tp; 1179 const PetscScalar *b; 1180 1181 PetscFunctionBegin; 1182 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1183 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1184 t = a->solve_work; 1185 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1186 1187 /* solve U^T * D * y = b by forward substitution */ 1188 tp = t; 1189 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1190 idx = 3*r[k]; 1191 tp[0] = b[idx]; 1192 tp[1] = b[idx+1]; 1193 tp[2] = b[idx+2]; 1194 tp += 3; 1195 } 1196 1197 for (k=0; k<mbs; k++) { 1198 v = aa + 9*ai[k]; 1199 vj = aj + ai[k]; 1200 tp = t + k*3; 1201 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; 1202 nz = ai[k+1] - ai[k]; 1203 1204 tp = t + (*vj)*3; 1205 while (nz--) { 1206 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1207 tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1208 tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1209 vj++; 1210 tp = t + (*vj)*3; 1211 v += 9; 1212 } 1213 1214 /* xk = inv(Dk)*(Dk*xk) */ 1215 diag = aa+k*9; /* ptr to inv(Dk) */ 1216 tp = t + k*3; 1217 tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1218 tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1219 tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1220 } 1221 1222 /* solve U*x = y by back substitution */ 1223 for (k=mbs-1; k>=0; k--) { 1224 v = aa + 9*ai[k]; 1225 vj = aj + ai[k]; 1226 tp = t + k*3; 1227 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */ 1228 nz = ai[k+1] - ai[k]; 1229 1230 tp = t + (*vj)*3; 1231 while (nz--) { 1232 /* xk += U(k,:)*x(:) */ 1233 x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2]; 1234 x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2]; 1235 x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2]; 1236 vj++; 1237 tp = t + (*vj)*3; 1238 v += 9; 1239 } 1240 tp = t + k*3; 1241 tp[0] = x0; tp[1] = x1; tp[2] = x2; 1242 idx = 3*r[k]; 1243 x[idx] = x0; 1244 x[idx+1] = x1; 1245 x[idx+2] = x2; 1246 } 1247 1248 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1249 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1250 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1251 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1252 PetscFunctionReturn(0); 1253 } 1254 1255 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1256 { 1257 const MatScalar *v,*diag; 1258 PetscScalar *xp,x0,x1,x2; 1259 PetscInt nz,k; 1260 const PetscInt *vj; 1261 1262 PetscFunctionBegin; 1263 for (k=0; k<mbs; k++) { 1264 v = aa + 9*ai[k]; 1265 xp = x + k*3; 1266 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */ 1267 nz = ai[k+1] - ai[k]; 1268 vj = aj + ai[k]; 1269 xp = x + (*vj)*3; 1270 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1271 PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1272 while (nz--) { 1273 /* x(:) += U(k,:)^T*(Dk*xk) */ 1274 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1275 xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1276 xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1277 vj++; 1278 xp = x + (*vj)*3; 1279 v += 9; 1280 } 1281 /* xk = inv(Dk)*(Dk*xk) */ 1282 diag = aa+k*9; /* ptr to inv(Dk) */ 1283 xp = x + k*3; 1284 xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1285 xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1286 xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1287 } 1288 PetscFunctionReturn(0); 1289 } 1290 1291 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1292 { 1293 const MatScalar *v; 1294 PetscScalar *xp,x0,x1,x2; 1295 PetscInt nz,k; 1296 const PetscInt *vj; 1297 1298 PetscFunctionBegin; 1299 for (k=mbs-1; k>=0; k--) { 1300 v = aa + 9*ai[k]; 1301 xp = x + k*3; 1302 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */ 1303 nz = ai[k+1] - ai[k]; 1304 vj = aj + ai[k]; 1305 xp = x + (*vj)*3; 1306 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1307 PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1308 while (nz--) { 1309 /* xk += U(k,:)*x(:) */ 1310 x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2]; 1311 x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2]; 1312 x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2]; 1313 vj++; 1314 v += 9; xp = x + (*vj)*3; 1315 } 1316 xp = x + k*3; 1317 xp[0] = x0; xp[1] = x1; xp[2] = x2; 1318 } 1319 PetscFunctionReturn(0); 1320 } 1321 1322 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1323 { 1324 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1325 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1326 const MatScalar *aa=a->a; 1327 PetscScalar *x; 1328 const PetscScalar *b; 1329 PetscErrorCode ierr; 1330 1331 PetscFunctionBegin; 1332 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1333 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1334 1335 /* solve U^T * D * y = b by forward substitution */ 1336 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1337 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1338 1339 /* solve U*x = y by back substitution */ 1340 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1341 1342 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1343 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1344 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1345 PetscFunctionReturn(0); 1346 } 1347 1348 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1349 { 1350 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1351 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1352 const MatScalar *aa=a->a; 1353 PetscScalar *x; 1354 const PetscScalar *b; 1355 PetscErrorCode ierr; 1356 1357 PetscFunctionBegin; 1358 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1359 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1360 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1361 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1362 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1363 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1364 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1365 PetscFunctionReturn(0); 1366 } 1367 1368 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1369 { 1370 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1371 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1372 const MatScalar *aa=a->a; 1373 PetscScalar *x; 1374 const PetscScalar *b; 1375 PetscErrorCode ierr; 1376 1377 PetscFunctionBegin; 1378 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1379 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1380 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1381 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1382 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1383 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1384 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1385 PetscFunctionReturn(0); 1386 } 1387 1388 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1389 { 1390 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1391 IS isrow=a->row; 1392 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1393 PetscErrorCode ierr; 1394 const PetscInt *r,*vj; 1395 PetscInt nz,k,k2,idx; 1396 const MatScalar *aa=a->a,*v,*diag; 1397 PetscScalar *x,x0,x1,*t; 1398 const PetscScalar *b; 1399 1400 PetscFunctionBegin; 1401 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1402 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1403 t = a->solve_work; 1404 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1405 1406 /* solve U^T * D * y = perm(b) by forward substitution */ 1407 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1408 idx = 2*r[k]; 1409 t[k*2] = b[idx]; 1410 t[k*2+1] = b[idx+1]; 1411 } 1412 for (k=0; k<mbs; k++) { 1413 v = aa + 4*ai[k]; 1414 vj = aj + ai[k]; 1415 k2 = k*2; 1416 x0 = t[k2]; x1 = t[k2+1]; 1417 nz = ai[k+1] - ai[k]; 1418 while (nz--) { 1419 t[(*vj)*2] += v[0]*x0 + v[1]*x1; 1420 t[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1421 vj++; v += 4; 1422 } 1423 diag = aa+k*4; /* ptr to inv(Dk) */ 1424 t[k2] = diag[0]*x0 + diag[2]*x1; 1425 t[k2+1] = diag[1]*x0 + diag[3]*x1; 1426 } 1427 1428 /* solve U*x = y by back substitution */ 1429 for (k=mbs-1; k>=0; k--) { 1430 v = aa + 4*ai[k]; 1431 vj = aj + ai[k]; 1432 k2 = k*2; 1433 x0 = t[k2]; x1 = t[k2+1]; 1434 nz = ai[k+1] - ai[k]; 1435 while (nz--) { 1436 x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1]; 1437 x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1]; 1438 vj++; 1439 v += 4; 1440 } 1441 t[k2] = x0; 1442 t[k2+1] = x1; 1443 idx = 2*r[k]; 1444 x[idx] = x0; 1445 x[idx+1] = x1; 1446 } 1447 1448 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1449 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1450 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1451 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1452 PetscFunctionReturn(0); 1453 } 1454 1455 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1456 { 1457 const MatScalar *v,*diag; 1458 PetscScalar x0,x1; 1459 PetscInt nz,k,k2; 1460 const PetscInt *vj; 1461 1462 PetscFunctionBegin; 1463 for (k=0; k<mbs; k++) { 1464 v = aa + 4*ai[k]; 1465 vj = aj + ai[k]; 1466 k2 = k*2; 1467 x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */ 1468 nz = ai[k+1] - ai[k]; 1469 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1470 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1471 while (nz--) { 1472 /* x(:) += U(k,:)^T*(Dk*xk) */ 1473 x[(*vj)*2] += v[0]*x0 + v[1]*x1; 1474 x[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1475 vj++; v += 4; 1476 } 1477 /* xk = inv(Dk)*(Dk*xk) */ 1478 diag = aa+k*4; /* ptr to inv(Dk) */ 1479 x[k2] = diag[0]*x0 + diag[2]*x1; 1480 x[k2+1] = diag[1]*x0 + diag[3]*x1; 1481 } 1482 PetscFunctionReturn(0); 1483 } 1484 1485 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1486 { 1487 const MatScalar *v; 1488 PetscScalar x0,x1; 1489 PetscInt nz,k,k2; 1490 const PetscInt *vj; 1491 1492 PetscFunctionBegin; 1493 for (k=mbs-1; k>=0; k--) { 1494 v = aa + 4*ai[k]; 1495 vj = aj + ai[k]; 1496 k2 = k*2; 1497 x0 = x[k2]; x1 = x[k2+1]; /* xk */ 1498 nz = ai[k+1] - ai[k]; 1499 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1500 PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1501 while (nz--) { 1502 /* xk += U(k,:)*x(:) */ 1503 x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1]; 1504 x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1]; 1505 vj++; 1506 v += 4; 1507 } 1508 x[k2] = x0; 1509 x[k2+1] = x1; 1510 } 1511 PetscFunctionReturn(0); 1512 } 1513 1514 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1515 { 1516 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1517 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1518 const MatScalar *aa=a->a; 1519 PetscScalar *x; 1520 const PetscScalar *b; 1521 PetscErrorCode ierr; 1522 1523 PetscFunctionBegin; 1524 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1525 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1526 1527 /* solve U^T * D * y = b by forward substitution */ 1528 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1529 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1530 1531 /* solve U*x = y by back substitution */ 1532 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1533 1534 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1535 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1536 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1537 PetscFunctionReturn(0); 1538 } 1539 1540 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1541 { 1542 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1543 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1544 const MatScalar *aa=a->a; 1545 PetscScalar *x; 1546 const PetscScalar *b; 1547 PetscErrorCode ierr; 1548 1549 PetscFunctionBegin; 1550 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1551 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1552 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1553 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1554 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1555 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1556 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1557 PetscFunctionReturn(0); 1558 } 1559 1560 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1561 { 1562 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1563 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1564 const MatScalar *aa=a->a; 1565 PetscScalar *x; 1566 const PetscScalar *b; 1567 PetscErrorCode ierr; 1568 1569 PetscFunctionBegin; 1570 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1571 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1572 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1573 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1574 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1575 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1576 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 1577 PetscFunctionReturn(0); 1578 } 1579 1580 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1581 { 1582 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1583 IS isrow=a->row; 1584 PetscErrorCode ierr; 1585 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1586 const MatScalar *aa=a->a,*v; 1587 const PetscScalar *b; 1588 PetscScalar *x,xk,*t; 1589 PetscInt nz,k,j; 1590 1591 PetscFunctionBegin; 1592 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1593 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1594 t = a->solve_work; 1595 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1596 1597 /* solve U^T*D*y = perm(b) by forward substitution */ 1598 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1599 for (k=0; k<mbs; k++) { 1600 v = aa + ai[k]; 1601 vj = aj + ai[k]; 1602 xk = t[k]; 1603 nz = ai[k+1] - ai[k] - 1; 1604 for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk; 1605 t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */ 1606 } 1607 1608 /* solve U*perm(x) = y by back substitution */ 1609 for (k=mbs-1; k>=0; k--) { 1610 v = aa + adiag[k] - 1; 1611 vj = aj + adiag[k] - 1; 1612 nz = ai[k+1] - ai[k] - 1; 1613 for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]]; 1614 x[rp[k]] = t[k]; 1615 } 1616 1617 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1618 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1619 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1620 ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr); 1621 PetscFunctionReturn(0); 1622 } 1623 1624 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1625 { 1626 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1627 IS isrow=a->row; 1628 PetscErrorCode ierr; 1629 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1630 const MatScalar *aa=a->a,*v; 1631 PetscScalar *x,xk,*t; 1632 const PetscScalar *b; 1633 PetscInt nz,k; 1634 1635 PetscFunctionBegin; 1636 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1637 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1638 t = a->solve_work; 1639 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1640 1641 /* solve U^T*D*y = perm(b) by forward substitution */ 1642 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1643 for (k=0; k<mbs; k++) { 1644 v = aa + ai[k] + 1; 1645 vj = aj + ai[k] + 1; 1646 xk = t[k]; 1647 nz = ai[k+1] - ai[k] - 1; 1648 while (nz--) t[*vj++] += (*v++) * xk; 1649 t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */ 1650 } 1651 1652 /* solve U*perm(x) = y by back substitution */ 1653 for (k=mbs-1; k>=0; k--) { 1654 v = aa + ai[k] + 1; 1655 vj = aj + ai[k] + 1; 1656 nz = ai[k+1] - ai[k] - 1; 1657 while (nz--) t[k] += (*v++) * t[*vj++]; 1658 x[rp[k]] = t[k]; 1659 } 1660 1661 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1662 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1663 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1664 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1665 PetscFunctionReturn(0); 1666 } 1667 1668 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1669 { 1670 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1671 IS isrow=a->row; 1672 PetscErrorCode ierr; 1673 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1674 const MatScalar *aa=a->a,*v; 1675 PetscReal diagk; 1676 PetscScalar *x,xk; 1677 const PetscScalar *b; 1678 PetscInt nz,k; 1679 1680 PetscFunctionBegin; 1681 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1682 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1683 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1684 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1685 1686 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1687 for (k=0; k<mbs; k++) { 1688 v = aa + ai[k]; 1689 vj = aj + ai[k]; 1690 xk = x[k]; 1691 nz = ai[k+1] - ai[k] - 1; 1692 while (nz--) x[*vj++] += (*v++) * xk; 1693 1694 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1695 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1696 x[k] = xk*PetscSqrtReal(diagk); 1697 } 1698 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1699 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1700 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1701 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1702 PetscFunctionReturn(0); 1703 } 1704 1705 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1706 { 1707 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1708 IS isrow=a->row; 1709 PetscErrorCode ierr; 1710 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1711 const MatScalar *aa=a->a,*v; 1712 PetscReal diagk; 1713 PetscScalar *x,xk; 1714 const PetscScalar *b; 1715 PetscInt nz,k; 1716 1717 PetscFunctionBegin; 1718 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1719 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1720 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1721 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1722 1723 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1724 for (k=0; k<mbs; k++) { 1725 v = aa + ai[k] + 1; 1726 vj = aj + ai[k] + 1; 1727 xk = x[k]; 1728 nz = ai[k+1] - ai[k] - 1; 1729 while (nz--) x[*vj++] += (*v++) * xk; 1730 1731 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1732 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1733 x[k] = xk*PetscSqrtReal(diagk); 1734 } 1735 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1736 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1737 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1738 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1739 PetscFunctionReturn(0); 1740 } 1741 1742 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1743 { 1744 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1745 IS isrow=a->row; 1746 PetscErrorCode ierr; 1747 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1748 const MatScalar *aa=a->a,*v; 1749 PetscReal diagk; 1750 PetscScalar *x,*t; 1751 const PetscScalar *b; 1752 PetscInt nz,k; 1753 1754 PetscFunctionBegin; 1755 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1756 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1757 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1758 t = a->solve_work; 1759 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1760 1761 for (k=mbs-1; k>=0; k--) { 1762 v = aa + ai[k]; 1763 vj = aj + ai[k]; 1764 diagk = PetscRealPart(aa[adiag[k]]); 1765 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1766 t[k] = b[k] * PetscSqrtReal(diagk); 1767 nz = ai[k+1] - ai[k] - 1; 1768 while (nz--) t[k] += (*v++) * t[*vj++]; 1769 x[rp[k]] = t[k]; 1770 } 1771 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1772 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1773 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1774 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1775 PetscFunctionReturn(0); 1776 } 1777 1778 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1779 { 1780 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1781 IS isrow=a->row; 1782 PetscErrorCode ierr; 1783 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1784 const MatScalar *aa=a->a,*v; 1785 PetscReal diagk; 1786 PetscScalar *x,*t; 1787 const PetscScalar *b; 1788 PetscInt nz,k; 1789 1790 PetscFunctionBegin; 1791 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1792 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1793 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1794 t = a->solve_work; 1795 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1796 1797 for (k=mbs-1; k>=0; k--) { 1798 v = aa + ai[k] + 1; 1799 vj = aj + ai[k] + 1; 1800 diagk = PetscRealPart(aa[ai[k]]); 1801 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1802 t[k] = b[k] * PetscSqrtReal(diagk); 1803 nz = ai[k+1] - ai[k] - 1; 1804 while (nz--) t[k] += (*v++) * t[*vj++]; 1805 x[rp[k]] = t[k]; 1806 } 1807 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1808 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1809 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1810 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1811 PetscFunctionReturn(0); 1812 } 1813 1814 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx) 1815 { 1816 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1817 PetscErrorCode ierr; 1818 1819 PetscFunctionBegin; 1820 if (A->rmap->bs == 1) { 1821 ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr); 1822 } else { 1823 IS isrow=a->row; 1824 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1825 const MatScalar *aa=a->a,*v; 1826 PetscScalar *x,*t; 1827 const PetscScalar *b; 1828 PetscInt nz,k,n,i,j; 1829 1830 if (bb->n > a->solves_work_n) { 1831 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1832 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1833 a->solves_work_n = bb->n; 1834 } 1835 n = bb->n; 1836 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1837 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1838 t = a->solves_work; 1839 1840 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1841 1842 /* solve U^T*D*y = perm(b) by forward substitution */ 1843 for (k=0; k<mbs; k++) { 1844 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1845 } 1846 for (k=0; k<mbs; k++) { 1847 v = aa + ai[k]; 1848 vj = aj + ai[k]; 1849 nz = ai[k+1] - ai[k] - 1; 1850 for (j=0; j<nz; j++) { 1851 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1852 v++;vj++; 1853 } 1854 for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1855 } 1856 1857 /* solve U*perm(x) = y by back substitution */ 1858 for (k=mbs-1; k>=0; k--) { 1859 v = aa + ai[k] - 1; 1860 vj = aj + ai[k] - 1; 1861 nz = ai[k+1] - ai[k] - 1; 1862 for (j=0; j<nz; j++) { 1863 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1864 v++;vj++; 1865 } 1866 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1867 } 1868 1869 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1870 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1871 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1872 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1873 } 1874 PetscFunctionReturn(0); 1875 } 1876 1877 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx) 1878 { 1879 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1880 PetscErrorCode ierr; 1881 1882 PetscFunctionBegin; 1883 if (A->rmap->bs == 1) { 1884 ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr); 1885 } else { 1886 IS isrow=a->row; 1887 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1888 const MatScalar *aa=a->a,*v; 1889 PetscScalar *x,*t; 1890 const PetscScalar *b; 1891 PetscInt nz,k,n,i; 1892 1893 if (bb->n > a->solves_work_n) { 1894 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1895 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1896 a->solves_work_n = bb->n; 1897 } 1898 n = bb->n; 1899 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1900 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1901 t = a->solves_work; 1902 1903 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1904 1905 /* solve U^T*D*y = perm(b) by forward substitution */ 1906 for (k=0; k<mbs; k++) { 1907 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1908 } 1909 for (k=0; k<mbs; k++) { 1910 v = aa + ai[k]; 1911 vj = aj + ai[k]; 1912 nz = ai[k+1] - ai[k]; 1913 while (nz--) { 1914 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1915 v++;vj++; 1916 } 1917 for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1918 } 1919 1920 /* solve U*perm(x) = y by back substitution */ 1921 for (k=mbs-1; k>=0; k--) { 1922 v = aa + ai[k]; 1923 vj = aj + ai[k]; 1924 nz = ai[k+1] - ai[k]; 1925 while (nz--) { 1926 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1927 v++;vj++; 1928 } 1929 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1930 } 1931 1932 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1933 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1934 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1935 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1936 } 1937 PetscFunctionReturn(0); 1938 } 1939 1940 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1941 { 1942 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1943 PetscErrorCode ierr; 1944 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1945 const MatScalar *aa=a->a,*v; 1946 const PetscScalar *b; 1947 PetscScalar *x,xi; 1948 PetscInt nz,i,j; 1949 1950 PetscFunctionBegin; 1951 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1952 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1953 /* solve U^T*D*y = b by forward substitution */ 1954 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 1955 for (i=0; i<mbs; i++) { 1956 v = aa + ai[i]; 1957 vj = aj + ai[i]; 1958 xi = x[i]; 1959 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 1960 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 1961 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 1962 } 1963 /* solve U*x = y by backward substitution */ 1964 for (i=mbs-2; i>=0; i--) { 1965 xi = x[i]; 1966 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 1967 vj = aj + adiag[i] - 1; 1968 nz = ai[i+1] - ai[i] - 1; 1969 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 1970 x[i] = xi; 1971 } 1972 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1973 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1974 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1975 PetscFunctionReturn(0); 1976 } 1977 1978 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X) 1979 { 1980 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1981 PetscErrorCode ierr; 1982 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1983 const MatScalar *aa=a->a,*v; 1984 const PetscScalar *b; 1985 PetscScalar *x,xi; 1986 PetscInt nz,i,j,neq,ldb,ldx; 1987 PetscBool isdense; 1988 1989 PetscFunctionBegin; 1990 if (!mbs) PetscFunctionReturn(0); 1991 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 1992 if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix"); 1993 if (X != B) { 1994 ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);CHKERRQ(ierr); 1995 if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix"); 1996 } 1997 ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr); 1998 ierr = MatDenseGetLDA(B,&ldb);CHKERRQ(ierr); 1999 ierr = MatDenseGetArray(X,&x);CHKERRQ(ierr); 2000 ierr = MatDenseGetLDA(X,&ldx);CHKERRQ(ierr); 2001 for (neq=0; neq<B->cmap->n; neq++) { 2002 /* solve U^T*D*y = b by forward substitution */ 2003 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2004 for (i=0; i<mbs; i++) { 2005 v = aa + ai[i]; 2006 vj = aj + ai[i]; 2007 xi = x[i]; 2008 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 2009 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 2010 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2011 } 2012 /* solve U*x = y by backward substitution */ 2013 for (i=mbs-2; i>=0; i--) { 2014 xi = x[i]; 2015 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2016 vj = aj + adiag[i] - 1; 2017 nz = ai[i+1] - ai[i] - 1; 2018 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 2019 x[i] = xi; 2020 } 2021 b += ldb; 2022 x += ldx; 2023 } 2024 ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr); 2025 ierr = MatDenseRestoreArray(X,&x);CHKERRQ(ierr); 2026 ierr = PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));CHKERRQ(ierr); 2027 PetscFunctionReturn(0); 2028 } 2029 2030 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2031 { 2032 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2033 PetscErrorCode ierr; 2034 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2035 const MatScalar *aa=a->a,*v; 2036 PetscScalar *x,xk; 2037 const PetscScalar *b; 2038 PetscInt nz,k; 2039 2040 PetscFunctionBegin; 2041 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2042 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2043 2044 /* solve U^T*D*y = b by forward substitution */ 2045 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2046 for (k=0; k<mbs; k++) { 2047 v = aa + ai[k] + 1; 2048 vj = aj + ai[k] + 1; 2049 xk = x[k]; 2050 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2051 while (nz--) x[*vj++] += (*v++) * xk; 2052 x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2053 } 2054 2055 /* solve U*x = y by back substitution */ 2056 for (k=mbs-2; k>=0; k--) { 2057 v = aa + ai[k] + 1; 2058 vj = aj + ai[k] + 1; 2059 xk = x[k]; 2060 nz = ai[k+1] - ai[k] - 1; 2061 while (nz--) { 2062 xk += (*v++) * x[*vj++]; 2063 } 2064 x[k] = xk; 2065 } 2066 2067 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2068 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2069 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2070 PetscFunctionReturn(0); 2071 } 2072 2073 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2074 { 2075 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2076 PetscErrorCode ierr; 2077 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2078 const MatScalar *aa=a->a,*v; 2079 PetscReal diagk; 2080 PetscScalar *x; 2081 const PetscScalar *b; 2082 PetscInt nz,k; 2083 2084 PetscFunctionBegin; 2085 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2086 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2087 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2088 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2089 for (k=0; k<mbs; k++) { 2090 v = aa + ai[k]; 2091 vj = aj + ai[k]; 2092 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2093 while (nz--) x[*vj++] += (*v++) * x[k]; 2094 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2095 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]])); 2096 x[k] *= PetscSqrtReal(diagk); 2097 } 2098 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2099 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2100 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2101 PetscFunctionReturn(0); 2102 } 2103 2104 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2105 { 2106 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2107 PetscErrorCode ierr; 2108 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2109 const MatScalar *aa=a->a,*v; 2110 PetscReal diagk; 2111 PetscScalar *x; 2112 const PetscScalar *b; 2113 PetscInt nz,k; 2114 2115 PetscFunctionBegin; 2116 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2117 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2118 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2119 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2120 for (k=0; k<mbs; k++) { 2121 v = aa + ai[k] + 1; 2122 vj = aj + ai[k] + 1; 2123 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2124 while (nz--) x[*vj++] += (*v++) * x[k]; 2125 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2126 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]])); 2127 x[k] *= PetscSqrtReal(diagk); 2128 } 2129 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2130 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2131 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2132 PetscFunctionReturn(0); 2133 } 2134 2135 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2136 { 2137 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2138 PetscErrorCode ierr; 2139 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2140 const MatScalar *aa=a->a,*v; 2141 PetscReal diagk; 2142 PetscScalar *x; 2143 const PetscScalar *b; 2144 PetscInt nz,k; 2145 2146 PetscFunctionBegin; 2147 /* solve D^(1/2)*U*x = b by back substitution */ 2148 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2149 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2150 2151 for (k=mbs-1; k>=0; k--) { 2152 v = aa + ai[k]; 2153 vj = aj + ai[k]; 2154 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2155 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2156 x[k] = PetscSqrtReal(diagk)*b[k]; 2157 nz = ai[k+1] - ai[k] - 1; 2158 while (nz--) x[k] += (*v++) * x[*vj++]; 2159 } 2160 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2161 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2162 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2163 PetscFunctionReturn(0); 2164 } 2165 2166 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2167 { 2168 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2169 PetscErrorCode ierr; 2170 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2171 const MatScalar *aa=a->a,*v; 2172 PetscReal diagk; 2173 PetscScalar *x; 2174 const PetscScalar *b; 2175 PetscInt nz,k; 2176 2177 PetscFunctionBegin; 2178 /* solve D^(1/2)*U*x = b by back substitution */ 2179 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2180 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2181 2182 for (k=mbs-1; k>=0; k--) { 2183 v = aa + ai[k] + 1; 2184 vj = aj + ai[k] + 1; 2185 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2186 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2187 x[k] = PetscSqrtReal(diagk)*b[k]; 2188 nz = ai[k+1] - ai[k] - 1; 2189 while (nz--) x[k] += (*v++) * x[*vj++]; 2190 } 2191 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2192 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2193 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2194 PetscFunctionReturn(0); 2195 } 2196 2197 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2198 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2199 { 2200 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2201 PetscErrorCode ierr; 2202 const PetscInt *rip,mbs = a->mbs,*ai,*aj; 2203 PetscInt *jutmp,bs = A->rmap->bs,i; 2204 PetscInt m,reallocs = 0,*levtmp; 2205 PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2206 PetscInt incrlev,*lev,shift,prow,nz; 2207 PetscReal f = info->fill,levels = info->levels; 2208 PetscBool perm_identity; 2209 2210 PetscFunctionBegin; 2211 /* check whether perm is the identity mapping */ 2212 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2213 2214 if (perm_identity) { 2215 a->permute = PETSC_FALSE; 2216 ai = a->i; aj = a->j; 2217 } else { /* non-trivial permutation */ 2218 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2219 a->permute = PETSC_TRUE; 2220 ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr); 2221 ai = a->inew; aj = a->jnew; 2222 } 2223 2224 /* initialization */ 2225 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2226 umax = (PetscInt)(f*ai[mbs] + 1); 2227 ierr = PetscMalloc1(umax,&lev);CHKERRQ(ierr); 2228 umax += mbs + 1; 2229 shift = mbs + 1; 2230 ierr = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr); 2231 ierr = PetscMalloc1(umax,&ju);CHKERRQ(ierr); 2232 iu[0] = mbs + 1; 2233 juidx = mbs + 1; 2234 /* prowl: linked list for pivot row */ 2235 ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr); 2236 /* q: linked list for col index */ 2237 2238 for (i=0; i<mbs; i++) { 2239 prowl[i] = mbs; 2240 q[i] = 0; 2241 } 2242 2243 /* for each row k */ 2244 for (k=0; k<mbs; k++) { 2245 nzk = 0; 2246 q[k] = mbs; 2247 /* copy current row into linked list */ 2248 nz = ai[rip[k]+1] - ai[rip[k]]; 2249 j = ai[rip[k]]; 2250 while (nz--) { 2251 vj = rip[aj[j++]]; 2252 if (vj > k) { 2253 qm = k; 2254 do { 2255 m = qm; qm = q[m]; 2256 } while (qm < vj); 2257 if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n"); 2258 nzk++; 2259 q[m] = vj; 2260 q[vj] = qm; 2261 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2262 } 2263 } 2264 2265 /* modify nonzero structure of k-th row by computing fill-in 2266 for each row prow to be merged in */ 2267 prow = k; 2268 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2269 2270 while (prow < k) { 2271 /* merge row prow into k-th row */ 2272 jmin = iu[prow] + 1; 2273 jmax = iu[prow+1]; 2274 qm = k; 2275 for (j=jmin; j<jmax; j++) { 2276 incrlev = lev[j-shift] + 1; 2277 if (incrlev > levels) continue; 2278 vj = ju[j]; 2279 do { 2280 m = qm; qm = q[m]; 2281 } while (qm < vj); 2282 if (qm != vj) { /* a fill */ 2283 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2284 levtmp[vj] = incrlev; 2285 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2286 } 2287 prow = prowl[prow]; /* next pivot row */ 2288 } 2289 2290 /* add k to row list for first nonzero element in k-th row */ 2291 if (nzk > 1) { 2292 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2293 prowl[k] = prowl[i]; prowl[i] = k; 2294 } 2295 iu[k+1] = iu[k] + nzk; 2296 2297 /* allocate more space to ju and lev if needed */ 2298 if (iu[k+1] > umax) { 2299 /* estimate how much additional space we will need */ 2300 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2301 /* just double the memory each time */ 2302 maxadd = umax; 2303 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2304 umax += maxadd; 2305 2306 /* allocate a longer ju */ 2307 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2308 ierr = PetscArraycpy(jutmp,ju,iu[k]);CHKERRQ(ierr); 2309 ierr = PetscFree(ju);CHKERRQ(ierr); 2310 ju = jutmp; 2311 2312 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2313 ierr = PetscArraycpy(jutmp,lev,iu[k]-shift);CHKERRQ(ierr); 2314 ierr = PetscFree(lev);CHKERRQ(ierr); 2315 lev = jutmp; 2316 reallocs += 2; /* count how many times we realloc */ 2317 } 2318 2319 /* save nonzero structure of k-th row in ju */ 2320 i=k; 2321 while (nzk--) { 2322 i = q[i]; 2323 ju[juidx] = i; 2324 lev[juidx-shift] = levtmp[i]; 2325 juidx++; 2326 } 2327 } 2328 2329 #if defined(PETSC_USE_INFO) 2330 if (ai[mbs] != 0) { 2331 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2332 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr); 2333 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2334 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr); 2335 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 2336 } else { 2337 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2338 } 2339 #endif 2340 2341 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2342 ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr); 2343 ierr = PetscFree(lev);CHKERRQ(ierr); 2344 2345 /* put together the new matrix */ 2346 ierr = MatSeqSBAIJSetPreallocation(B,bs,0,NULL);CHKERRQ(ierr); 2347 2348 /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */ 2349 b = (Mat_SeqSBAIJ*)(B)->data; 2350 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2351 2352 b->singlemalloc = PETSC_FALSE; 2353 b->free_a = PETSC_TRUE; 2354 b->free_ij = PETSC_TRUE; 2355 /* the next line frees the default space generated by the Create() */ 2356 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 2357 ierr = PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);CHKERRQ(ierr); 2358 b->j = ju; 2359 b->i = iu; 2360 b->diag = 0; 2361 b->ilen = 0; 2362 b->imax = 0; 2363 2364 ierr = ISDestroy(&b->row);CHKERRQ(ierr); 2365 ierr = ISDestroy(&b->icol);CHKERRQ(ierr); 2366 b->row = perm; 2367 b->icol = perm; 2368 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2369 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2370 ierr = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr); 2371 /* In b structure: Free imax, ilen, old a, old j. 2372 Allocate idnew, solve_work, new a, new j */ 2373 ierr = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2374 b->maxnz = b->nz = iu[mbs]; 2375 2376 (B)->info.factor_mallocs = reallocs; 2377 (B)->info.fill_ratio_given = f; 2378 if (ai[mbs] != 0) { 2379 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2380 } else { 2381 (B)->info.fill_ratio_needed = 0.0; 2382 } 2383 ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr); 2384 PetscFunctionReturn(0); 2385 } 2386 2387 /* 2388 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2389 */ 2390 #include <petscbt.h> 2391 #include <../src/mat/utils/freespace.h> 2392 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2393 { 2394 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2395 PetscErrorCode ierr; 2396 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2397 PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j; 2398 const PetscInt *rip; 2399 PetscInt reallocs=0,i,*ui,*udiag,*cols; 2400 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2401 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr; 2402 PetscReal fill =info->fill,levels=info->levels; 2403 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2404 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2405 PetscBT lnkbt; 2406 2407 PetscFunctionBegin; 2408 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 2409 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2410 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2411 if (bs > 1) { 2412 ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr); 2413 PetscFunctionReturn(0); 2414 } 2415 2416 /* check whether perm is the identity mapping */ 2417 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2418 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2419 a->permute = PETSC_FALSE; 2420 2421 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2422 ierr = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr); 2423 ui[0] = 0; 2424 2425 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2426 if (!levels) { 2427 /* reuse the column pointers and row offsets for memory savings */ 2428 for (i=0; i<am; i++) { 2429 ncols = ai[i+1] - ai[i]; 2430 ui[i+1] = ui[i] + ncols; 2431 udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */ 2432 } 2433 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2434 cols = uj; 2435 for (i=0; i<am; i++) { 2436 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2437 ncols = ai[i+1] - ai[i] -1; 2438 for (j=0; j<ncols; j++) *cols++ = aj[j]; 2439 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */ 2440 } 2441 } else { /* case: levels>0 */ 2442 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2443 2444 /* initialization */ 2445 /* jl: linked list for storing indices of the pivot rows 2446 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2447 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2448 for (i=0; i<am; i++) { 2449 jl[i] = am; il[i] = 0; 2450 } 2451 2452 /* create and initialize a linked list for storing column indices of the active row k */ 2453 nlnk = am + 1; 2454 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2455 2456 /* initial FreeSpace size is fill*(ai[am]+1) */ 2457 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2458 2459 current_space = free_space; 2460 2461 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2462 2463 current_space_lvl = free_space_lvl; 2464 2465 for (k=0; k<am; k++) { /* for each active row k */ 2466 /* initialize lnk by the column indices of row k */ 2467 nzk = 0; 2468 ncols = ai[k+1] - ai[k]; 2469 if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k); 2470 cols = aj+ai[k]; 2471 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2472 nzk += nlnk; 2473 2474 /* update lnk by computing fill-in for each pivot row to be merged in */ 2475 prow = jl[k]; /* 1st pivot row */ 2476 2477 while (prow < k) { 2478 nextprow = jl[prow]; 2479 2480 /* merge prow into k-th row */ 2481 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2482 jmax = ui[prow+1]; 2483 ncols = jmax-jmin; 2484 i = jmin - ui[prow]; 2485 2486 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2487 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2488 j = *(uj - 1); 2489 ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2490 nzk += nlnk; 2491 2492 /* update il and jl for prow */ 2493 if (jmin < jmax) { 2494 il[prow] = jmin; 2495 2496 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2497 } 2498 prow = nextprow; 2499 } 2500 2501 /* if free space is not available, make more free space */ 2502 if (current_space->local_remaining<nzk) { 2503 i = am - k + 1; /* num of unfactored rows */ 2504 i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2505 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2506 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2507 reallocs++; 2508 } 2509 2510 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2511 if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k); 2512 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2513 2514 /* add the k-th row into il and jl */ 2515 if (nzk > 1) { 2516 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2517 jl[k] = jl[i]; jl[i] = k; 2518 il[k] = ui[k] + 1; 2519 } 2520 uj_ptr[k] = current_space->array; 2521 uj_lvl_ptr[k] = current_space_lvl->array; 2522 2523 current_space->array += nzk; 2524 current_space->local_used += nzk; 2525 current_space->local_remaining -= nzk; 2526 current_space_lvl->array += nzk; 2527 current_space_lvl->local_used += nzk; 2528 current_space_lvl->local_remaining -= nzk; 2529 2530 ui[k+1] = ui[k] + nzk; 2531 } 2532 2533 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2534 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2535 2536 /* destroy list of free space and other temporary array(s) */ 2537 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2538 ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor */ 2539 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2540 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2541 2542 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2543 2544 /* put together the new matrix in MATSEQSBAIJ format */ 2545 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2546 2547 b = (Mat_SeqSBAIJ*)(fact)->data; 2548 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2549 2550 b->singlemalloc = PETSC_FALSE; 2551 b->free_a = PETSC_TRUE; 2552 b->free_ij = free_ij; 2553 2554 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2555 2556 b->j = uj; 2557 b->i = ui; 2558 b->diag = udiag; 2559 b->free_diag = PETSC_TRUE; 2560 b->ilen = 0; 2561 b->imax = 0; 2562 b->row = perm; 2563 b->col = perm; 2564 2565 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2566 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2567 2568 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2569 2570 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2571 ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2572 2573 b->maxnz = b->nz = ui[am]; 2574 2575 fact->info.factor_mallocs = reallocs; 2576 fact->info.fill_ratio_given = fill; 2577 if (ai[am] != 0) { 2578 fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am]; 2579 } else { 2580 fact->info.fill_ratio_needed = 0.0; 2581 } 2582 #if defined(PETSC_USE_INFO) 2583 if (ai[am] != 0) { 2584 PetscReal af = fact->info.fill_ratio_needed; 2585 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2586 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2587 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2588 } else { 2589 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2590 } 2591 #endif 2592 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2593 PetscFunctionReturn(0); 2594 } 2595 2596 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2597 { 2598 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2599 Mat_SeqSBAIJ *b; 2600 PetscErrorCode ierr; 2601 PetscBool perm_identity,free_ij = PETSC_TRUE; 2602 PetscInt bs=A->rmap->bs,am=a->mbs; 2603 const PetscInt *cols,*rip,*ai=a->i,*aj=a->j; 2604 PetscInt reallocs=0,i,*ui; 2605 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2606 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 2607 PetscReal fill =info->fill,levels=info->levels,ratio_needed; 2608 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2609 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2610 PetscBT lnkbt; 2611 2612 PetscFunctionBegin; 2613 /* 2614 This code originally uses Modified Sparse Row (MSR) storage 2615 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2616 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2617 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2618 thus the original code in MSR format is still used for these cases. 2619 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2620 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2621 */ 2622 if (bs > 1) { 2623 ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr); 2624 PetscFunctionReturn(0); 2625 } 2626 2627 /* check whether perm is the identity mapping */ 2628 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2629 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2630 a->permute = PETSC_FALSE; 2631 2632 /* special case that simply copies fill pattern */ 2633 if (!levels) { 2634 /* reuse the column pointers and row offsets for memory savings */ 2635 ui = a->i; 2636 uj = a->j; 2637 free_ij = PETSC_FALSE; 2638 ratio_needed = 1.0; 2639 } else { /* case: levels>0 */ 2640 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2641 2642 /* initialization */ 2643 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2644 ui[0] = 0; 2645 2646 /* jl: linked list for storing indices of the pivot rows 2647 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2648 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2649 for (i=0; i<am; i++) { 2650 jl[i] = am; il[i] = 0; 2651 } 2652 2653 /* create and initialize a linked list for storing column indices of the active row k */ 2654 nlnk = am + 1; 2655 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2656 2657 /* initial FreeSpace size is fill*(ai[am]+1) */ 2658 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2659 2660 current_space = free_space; 2661 2662 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2663 2664 current_space_lvl = free_space_lvl; 2665 2666 for (k=0; k<am; k++) { /* for each active row k */ 2667 /* initialize lnk by the column indices of row rip[k] */ 2668 nzk = 0; 2669 ncols = ai[rip[k]+1] - ai[rip[k]]; 2670 cols = aj+ai[rip[k]]; 2671 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2672 nzk += nlnk; 2673 2674 /* update lnk by computing fill-in for each pivot row to be merged in */ 2675 prow = jl[k]; /* 1st pivot row */ 2676 2677 while (prow < k) { 2678 nextprow = jl[prow]; 2679 2680 /* merge prow into k-th row */ 2681 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2682 jmax = ui[prow+1]; 2683 ncols = jmax-jmin; 2684 i = jmin - ui[prow]; 2685 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2686 j = *(uj_lvl_ptr[prow] + i - 1); 2687 cols_lvl = uj_lvl_ptr[prow]+i; 2688 ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2689 nzk += nlnk; 2690 2691 /* update il and jl for prow */ 2692 if (jmin < jmax) { 2693 il[prow] = jmin; 2694 j = *cols; 2695 jl[prow] = jl[j]; 2696 jl[j] = prow; 2697 } 2698 prow = nextprow; 2699 } 2700 2701 /* if free space is not available, make more free space */ 2702 if (current_space->local_remaining<nzk) { 2703 i = am - k + 1; /* num of unfactored rows */ 2704 i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2705 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2706 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2707 reallocs++; 2708 } 2709 2710 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2711 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2712 2713 /* add the k-th row into il and jl */ 2714 if (nzk-1 > 0) { 2715 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2716 jl[k] = jl[i]; jl[i] = k; 2717 il[k] = ui[k] + 1; 2718 } 2719 uj_ptr[k] = current_space->array; 2720 uj_lvl_ptr[k] = current_space_lvl->array; 2721 2722 current_space->array += nzk; 2723 current_space->local_used += nzk; 2724 current_space->local_remaining -= nzk; 2725 current_space_lvl->array += nzk; 2726 current_space_lvl->local_used += nzk; 2727 current_space_lvl->local_remaining -= nzk; 2728 2729 ui[k+1] = ui[k] + nzk; 2730 } 2731 2732 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2733 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2734 2735 /* destroy list of free space and other temporary array(s) */ 2736 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2737 ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 2738 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2739 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2740 if (ai[am] != 0) { 2741 ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 2742 } else { 2743 ratio_needed = 0.0; 2744 } 2745 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2746 2747 /* put together the new matrix in MATSEQSBAIJ format */ 2748 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2749 2750 b = (Mat_SeqSBAIJ*)(fact)->data; 2751 2752 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2753 2754 b->singlemalloc = PETSC_FALSE; 2755 b->free_a = PETSC_TRUE; 2756 b->free_ij = free_ij; 2757 2758 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2759 2760 b->j = uj; 2761 b->i = ui; 2762 b->diag = 0; 2763 b->ilen = 0; 2764 b->imax = 0; 2765 b->row = perm; 2766 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2767 2768 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2769 b->icol = perm; 2770 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2771 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2772 2773 b->maxnz = b->nz = ui[am]; 2774 2775 fact->info.factor_mallocs = reallocs; 2776 fact->info.fill_ratio_given = fill; 2777 fact->info.fill_ratio_needed = ratio_needed; 2778 #if defined(PETSC_USE_INFO) 2779 if (ai[am] != 0) { 2780 PetscReal af = fact->info.fill_ratio_needed; 2781 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2782 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2783 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2784 } else { 2785 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2786 } 2787 #endif 2788 if (perm_identity) { 2789 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2790 } else { 2791 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2792 } 2793 PetscFunctionReturn(0); 2794 } 2795 2796