1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 11 { 12 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 13 IS isrow=a->row; 14 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 15 PetscErrorCode ierr; 16 const PetscInt *r; 17 PetscInt nz,*vj,k,idx,k1; 18 PetscInt bs =A->rmap->bs,bs2 = a->bs2; 19 const MatScalar *aa=a->a,*v,*diag; 20 PetscScalar *x,*xk,*xj,*xk_tmp,*t; 21 const PetscScalar *b; 22 23 PetscFunctionBegin; 24 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 t = a->solve_work; 27 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 28 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 29 30 /* solve U^T * D * y = b by forward substitution */ 31 xk = t; 32 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 33 idx = bs*r[k]; 34 for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1]; 35 } 36 for (k=0; k<mbs; k++) { 37 v = aa + bs2*ai[k]; 38 xk = t + k*bs; /* Dk*xk = k-th block of x */ 39 ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */ 40 nz = ai[k+1] - ai[k]; 41 vj = aj + ai[k]; 42 xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */ 43 while (nz--) { 44 /* x(:) += U(k,:)^T*(Dk*xk) */ 45 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 46 vj++; xj = t + (*vj)*bs; 47 v += bs2; 48 } 49 /* xk = inv(Dk)*(Dk*xk) */ 50 diag = aa+k*bs2; /* ptr to inv(Dk) */ 51 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 52 } 53 54 /* solve U*x = y by back substitution */ 55 for (k=mbs-1; k>=0; k--) { 56 v = aa + bs2*ai[k]; 57 xk = t + k*bs; /* xk */ 58 nz = ai[k+1] - ai[k]; 59 vj = aj + ai[k]; 60 xj = t + (*vj)*bs; 61 while (nz--) { 62 /* xk += U(k,:)*x(:) */ 63 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 64 vj++; 65 v += bs2; xj = t + (*vj)*bs; 66 } 67 idx = bs*r[k]; 68 for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++; 69 } 70 71 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 72 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 73 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 74 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 75 ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr); 76 PetscFunctionReturn(0); 77 } 78 79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 80 { 81 PetscFunctionBegin; 82 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet"); 83 } 84 85 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 86 { 87 PetscFunctionBegin; 88 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented"); 89 } 90 91 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 92 { 93 PetscErrorCode ierr; 94 PetscInt nz,k; 95 const PetscInt *vj,bs2 = bs*bs; 96 const MatScalar *v,*diag; 97 PetscScalar *xk,*xj,*xk_tmp; 98 99 PetscFunctionBegin; 100 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 101 for (k=0; k<mbs; k++) { 102 v = aa + bs2*ai[k]; 103 xk = x + k*bs; /* Dk*xk = k-th block of x */ 104 ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */ 105 nz = ai[k+1] - ai[k]; 106 vj = aj + ai[k]; 107 xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */ 108 while (nz--) { 109 /* x(:) += U(k,:)^T*(Dk*xk) */ 110 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 111 vj++; xj = x + (*vj)*bs; 112 v += bs2; 113 } 114 /* xk = inv(Dk)*(Dk*xk) */ 115 diag = aa+k*bs2; /* ptr to inv(Dk) */ 116 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 117 } 118 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 123 { 124 PetscInt nz,k; 125 const PetscInt *vj,bs2 = bs*bs; 126 const MatScalar *v; 127 PetscScalar *xk,*xj; 128 129 PetscFunctionBegin; 130 for (k=mbs-1; k>=0; k--) { 131 v = aa + bs2*ai[k]; 132 xk = x + k*bs; /* xk */ 133 nz = ai[k+1] - ai[k]; 134 vj = aj + ai[k]; 135 xj = x + (*vj)*bs; 136 while (nz--) { 137 /* xk += U(k,:)*x(:) */ 138 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 139 vj++; 140 v += bs2; xj = x + (*vj)*bs; 141 } 142 } 143 PetscFunctionReturn(0); 144 } 145 146 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 147 { 148 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 149 PetscErrorCode ierr; 150 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 151 PetscInt bs =A->rmap->bs; 152 const MatScalar *aa=a->a; 153 PetscScalar *x; 154 const PetscScalar *b; 155 156 PetscFunctionBegin; 157 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 158 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 159 160 /* solve U^T * D * y = b by forward substitution */ 161 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */ 162 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 163 164 /* solve U*x = y by back substitution */ 165 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 166 167 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 168 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 169 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 170 PetscFunctionReturn(0); 171 } 172 173 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 174 { 175 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 176 PetscErrorCode ierr; 177 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 178 PetscInt bs =A->rmap->bs; 179 const MatScalar *aa=a->a; 180 const PetscScalar *b; 181 PetscScalar *x; 182 183 PetscFunctionBegin; 184 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 185 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 186 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */ 187 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 188 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 189 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 190 ierr = PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);CHKERRQ(ierr); 191 PetscFunctionReturn(0); 192 } 193 194 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 195 { 196 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 197 PetscErrorCode ierr; 198 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 199 PetscInt bs =A->rmap->bs; 200 const MatScalar *aa=a->a; 201 const PetscScalar *b; 202 PetscScalar *x; 203 204 PetscFunctionBegin; 205 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 206 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 207 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); 208 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 209 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 210 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 211 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 212 PetscFunctionReturn(0); 213 } 214 215 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 216 { 217 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 218 IS isrow=a->row; 219 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 220 PetscErrorCode ierr; 221 PetscInt nz,k,idx; 222 const MatScalar *aa=a->a,*v,*d; 223 PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp; 224 const PetscScalar *b; 225 226 PetscFunctionBegin; 227 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 228 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 229 t = a->solve_work; 230 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 231 232 /* solve U^T * D * y = b by forward substitution */ 233 tp = t; 234 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 235 idx = 7*r[k]; 236 tp[0] = b[idx]; 237 tp[1] = b[idx+1]; 238 tp[2] = b[idx+2]; 239 tp[3] = b[idx+3]; 240 tp[4] = b[idx+4]; 241 tp[5] = b[idx+5]; 242 tp[6] = b[idx+6]; 243 tp += 7; 244 } 245 246 for (k=0; k<mbs; k++) { 247 v = aa + 49*ai[k]; 248 vj = aj + ai[k]; 249 tp = t + k*7; 250 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; 251 nz = ai[k+1] - ai[k]; 252 tp = t + (*vj)*7; 253 while (nz--) { 254 tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 255 tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 256 tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 257 tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 258 tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 259 tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 260 tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 261 vj++; 262 tp = t + (*vj)*7; 263 v += 49; 264 } 265 266 /* xk = inv(Dk)*(Dk*xk) */ 267 d = aa+k*49; /* ptr to inv(Dk) */ 268 tp = t + k*7; 269 tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 270 tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 271 tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 272 tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 273 tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 274 tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 275 tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 276 } 277 278 /* solve U*x = y by back substitution */ 279 for (k=mbs-1; k>=0; k--) { 280 v = aa + 49*ai[k]; 281 vj = aj + ai[k]; 282 tp = t + k*7; 283 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */ 284 nz = ai[k+1] - ai[k]; 285 286 tp = t + (*vj)*7; 287 while (nz--) { 288 /* xk += U(k,:)*x(:) */ 289 x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6]; 290 x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6]; 291 x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6]; 292 x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6]; 293 x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6]; 294 x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6]; 295 x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6]; 296 vj++; 297 tp = t + (*vj)*7; 298 v += 49; 299 } 300 tp = t + k*7; 301 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6; 302 idx = 7*r[k]; 303 x[idx] = x0; 304 x[idx+1] = x1; 305 x[idx+2] = x2; 306 x[idx+3] = x3; 307 x[idx+4] = x4; 308 x[idx+5] = x5; 309 x[idx+6] = x6; 310 } 311 312 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 313 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 314 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 315 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 316 PetscFunctionReturn(0); 317 } 318 319 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 320 { 321 const MatScalar *v,*d; 322 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 323 PetscInt nz,k; 324 const PetscInt *vj; 325 326 PetscFunctionBegin; 327 for (k=0; k<mbs; k++) { 328 v = aa + 49*ai[k]; 329 xp = x + k*7; 330 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */ 331 nz = ai[k+1] - ai[k]; 332 vj = aj + ai[k]; 333 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 334 PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 335 while (nz--) { 336 xp = x + (*vj)*7; 337 /* x(:) += U(k,:)^T*(Dk*xk) */ 338 xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 339 xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 340 xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 341 xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 342 xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 343 xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 344 xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 345 vj++; 346 v += 49; 347 } 348 /* xk = inv(Dk)*(Dk*xk) */ 349 d = aa+k*49; /* ptr to inv(Dk) */ 350 xp = x + k*7; 351 xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 352 xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 353 xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 354 xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 355 xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 356 xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 357 xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 358 } 359 PetscFunctionReturn(0); 360 } 361 362 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 363 { 364 const MatScalar *v; 365 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 366 PetscInt nz,k; 367 const PetscInt *vj; 368 369 PetscFunctionBegin; 370 for (k=mbs-1; k>=0; k--) { 371 v = aa + 49*ai[k]; 372 xp = x + k*7; 373 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */ 374 nz = ai[k+1] - ai[k]; 375 vj = aj + ai[k]; 376 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 377 PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 378 while (nz--) { 379 xp = x + (*vj)*7; 380 /* xk += U(k,:)*x(:) */ 381 x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6]; 382 x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6]; 383 x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6]; 384 x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6]; 385 x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6]; 386 x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6]; 387 x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6]; 388 vj++; 389 v += 49; 390 } 391 xp = x + k*7; 392 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6; 393 } 394 PetscFunctionReturn(0); 395 } 396 397 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 398 { 399 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 400 PetscErrorCode ierr; 401 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 402 const MatScalar *aa=a->a; 403 PetscScalar *x; 404 const PetscScalar *b; 405 406 PetscFunctionBegin; 407 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 408 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 409 410 /* solve U^T * D * y = b by forward substitution */ 411 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); /* x <- b */ 412 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 413 414 /* solve U*x = y by back substitution */ 415 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 416 417 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 418 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 419 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 420 PetscFunctionReturn(0); 421 } 422 423 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 424 { 425 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 426 PetscErrorCode ierr; 427 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 428 const MatScalar *aa=a->a; 429 PetscScalar *x; 430 const PetscScalar *b; 431 432 PetscFunctionBegin; 433 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 434 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 435 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); 436 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 437 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 438 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 439 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 440 PetscFunctionReturn(0); 441 } 442 443 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 444 { 445 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 446 PetscErrorCode ierr; 447 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 448 const MatScalar *aa=a->a; 449 PetscScalar *x; 450 const PetscScalar *b; 451 452 PetscFunctionBegin; 453 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 454 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 455 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); 456 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 457 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 459 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 460 PetscFunctionReturn(0); 461 } 462 463 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 464 { 465 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 466 IS isrow=a->row; 467 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 468 PetscErrorCode ierr; 469 PetscInt nz,k,idx; 470 const MatScalar *aa=a->a,*v,*d; 471 PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp; 472 const PetscScalar *b; 473 474 PetscFunctionBegin; 475 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 476 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 477 t = a->solve_work; 478 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 479 480 /* solve U^T * D * y = b by forward substitution */ 481 tp = t; 482 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 483 idx = 6*r[k]; 484 tp[0] = b[idx]; 485 tp[1] = b[idx+1]; 486 tp[2] = b[idx+2]; 487 tp[3] = b[idx+3]; 488 tp[4] = b[idx+4]; 489 tp[5] = b[idx+5]; 490 tp += 6; 491 } 492 493 for (k=0; k<mbs; k++) { 494 v = aa + 36*ai[k]; 495 vj = aj + ai[k]; 496 tp = t + k*6; 497 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; 498 nz = ai[k+1] - ai[k]; 499 tp = t + (*vj)*6; 500 while (nz--) { 501 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 502 tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 503 tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 504 tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 505 tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 506 tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 507 vj++; 508 tp = t + (*vj)*6; 509 v += 36; 510 } 511 512 /* xk = inv(Dk)*(Dk*xk) */ 513 d = aa+k*36; /* ptr to inv(Dk) */ 514 tp = t + k*6; 515 tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 516 tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 517 tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 518 tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 519 tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 520 tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 521 } 522 523 /* solve U*x = y by back substitution */ 524 for (k=mbs-1; k>=0; k--) { 525 v = aa + 36*ai[k]; 526 vj = aj + ai[k]; 527 tp = t + k*6; 528 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */ 529 nz = ai[k+1] - ai[k]; 530 531 tp = t + (*vj)*6; 532 while (nz--) { 533 /* xk += U(k,:)*x(:) */ 534 x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5]; 535 x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5]; 536 x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5]; 537 x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5]; 538 x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5]; 539 x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5]; 540 vj++; 541 tp = t + (*vj)*6; 542 v += 36; 543 } 544 tp = t + k*6; 545 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; 546 idx = 6*r[k]; 547 x[idx] = x0; 548 x[idx+1] = x1; 549 x[idx+2] = x2; 550 x[idx+3] = x3; 551 x[idx+4] = x4; 552 x[idx+5] = x5; 553 } 554 555 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 556 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 557 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 558 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 559 PetscFunctionReturn(0); 560 } 561 562 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 563 { 564 const MatScalar *v,*d; 565 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 566 PetscInt nz,k; 567 const PetscInt *vj; 568 569 PetscFunctionBegin; 570 for (k=0; k<mbs; k++) { 571 v = aa + 36*ai[k]; 572 xp = x + k*6; 573 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */ 574 nz = ai[k+1] - ai[k]; 575 vj = aj + ai[k]; 576 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 577 PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 578 while (nz--) { 579 xp = x + (*vj)*6; 580 /* x(:) += U(k,:)^T*(Dk*xk) */ 581 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 582 xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 583 xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 584 xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 585 xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 586 xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 587 vj++; 588 v += 36; 589 } 590 /* xk = inv(Dk)*(Dk*xk) */ 591 d = aa+k*36; /* ptr to inv(Dk) */ 592 xp = x + k*6; 593 xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 594 xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 595 xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 596 xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 597 xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 598 xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 599 } 600 PetscFunctionReturn(0); 601 } 602 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 603 { 604 const MatScalar *v; 605 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 606 PetscInt nz,k; 607 const PetscInt *vj; 608 609 PetscFunctionBegin; 610 for (k=mbs-1; k>=0; k--) { 611 v = aa + 36*ai[k]; 612 xp = x + k*6; 613 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */ 614 nz = ai[k+1] - ai[k]; 615 vj = aj + ai[k]; 616 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 617 PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 618 while (nz--) { 619 xp = x + (*vj)*6; 620 /* xk += U(k,:)*x(:) */ 621 x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5]; 622 x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5]; 623 x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5]; 624 x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5]; 625 x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5]; 626 x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5]; 627 vj++; 628 v += 36; 629 } 630 xp = x + k*6; 631 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; 632 } 633 PetscFunctionReturn(0); 634 } 635 636 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 637 { 638 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 639 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 640 const MatScalar *aa=a->a; 641 PetscScalar *x; 642 const PetscScalar *b; 643 PetscErrorCode ierr; 644 645 PetscFunctionBegin; 646 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 647 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 648 649 /* solve U^T * D * y = b by forward substitution */ 650 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 651 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 652 653 /* solve U*x = y by back substitution */ 654 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 655 656 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 657 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 658 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 659 PetscFunctionReturn(0); 660 } 661 662 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 663 { 664 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 665 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 666 const MatScalar *aa=a->a; 667 PetscScalar *x; 668 const PetscScalar *b; 669 PetscErrorCode ierr; 670 671 PetscFunctionBegin; 672 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 673 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 674 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 675 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 676 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 677 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 678 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 679 PetscFunctionReturn(0); 680 } 681 682 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 683 { 684 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 685 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 686 const MatScalar *aa=a->a; 687 PetscScalar *x; 688 const PetscScalar *b; 689 PetscErrorCode ierr; 690 691 PetscFunctionBegin; 692 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 693 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 694 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 695 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 696 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 697 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 698 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 699 PetscFunctionReturn(0); 700 } 701 702 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 703 { 704 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 705 IS isrow=a->row; 706 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 707 PetscErrorCode ierr; 708 const PetscInt *r,*vj; 709 PetscInt nz,k,idx; 710 const MatScalar *aa=a->a,*v,*diag; 711 PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp; 712 const PetscScalar *b; 713 714 PetscFunctionBegin; 715 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 716 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 717 t = a->solve_work; 718 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 719 720 /* solve U^T * D * y = b by forward substitution */ 721 tp = t; 722 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 723 idx = 5*r[k]; 724 tp[0] = b[idx]; 725 tp[1] = b[idx+1]; 726 tp[2] = b[idx+2]; 727 tp[3] = b[idx+3]; 728 tp[4] = b[idx+4]; 729 tp += 5; 730 } 731 732 for (k=0; k<mbs; k++) { 733 v = aa + 25*ai[k]; 734 vj = aj + ai[k]; 735 tp = t + k*5; 736 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; 737 nz = ai[k+1] - ai[k]; 738 739 tp = t + (*vj)*5; 740 while (nz--) { 741 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 742 tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 743 tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4; 744 tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4; 745 tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4; 746 vj++; 747 tp = t + (*vj)*5; 748 v += 25; 749 } 750 751 /* xk = inv(Dk)*(Dk*xk) */ 752 diag = aa+k*25; /* ptr to inv(Dk) */ 753 tp = t + k*5; 754 tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 755 tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 756 tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 757 tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 758 tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 759 } 760 761 /* solve U*x = y by back substitution */ 762 for (k=mbs-1; k>=0; k--) { 763 v = aa + 25*ai[k]; 764 vj = aj + ai[k]; 765 tp = t + k*5; 766 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */ 767 nz = ai[k+1] - ai[k]; 768 769 tp = t + (*vj)*5; 770 while (nz--) { 771 /* xk += U(k,:)*x(:) */ 772 x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4]; 773 x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4]; 774 x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4]; 775 x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4]; 776 x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4]; 777 vj++; 778 tp = t + (*vj)*5; 779 v += 25; 780 } 781 tp = t + k*5; 782 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; 783 idx = 5*r[k]; 784 x[idx] = x0; 785 x[idx+1] = x1; 786 x[idx+2] = x2; 787 x[idx+3] = x3; 788 x[idx+4] = x4; 789 } 790 791 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 792 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 793 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 794 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 795 PetscFunctionReturn(0); 796 } 797 798 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 799 { 800 const MatScalar *v,*diag; 801 PetscScalar *xp,x0,x1,x2,x3,x4; 802 PetscInt nz,k; 803 const PetscInt *vj; 804 805 PetscFunctionBegin; 806 for (k=0; k<mbs; k++) { 807 v = aa + 25*ai[k]; 808 xp = x + k*5; 809 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */ 810 nz = ai[k+1] - ai[k]; 811 vj = aj + ai[k]; 812 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 813 PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 814 while (nz--) { 815 xp = x + (*vj)*5; 816 /* x(:) += U(k,:)^T*(Dk*xk) */ 817 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 818 xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 819 xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4; 820 xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4; 821 xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4; 822 vj++; 823 v += 25; 824 } 825 /* xk = inv(Dk)*(Dk*xk) */ 826 diag = aa+k*25; /* ptr to inv(Dk) */ 827 xp = x + k*5; 828 xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 829 xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 830 xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 831 xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 832 xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 833 } 834 PetscFunctionReturn(0); 835 } 836 837 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 838 { 839 const MatScalar *v; 840 PetscScalar *xp,x0,x1,x2,x3,x4; 841 PetscInt nz,k; 842 const PetscInt *vj; 843 844 PetscFunctionBegin; 845 for (k=mbs-1; k>=0; k--) { 846 v = aa + 25*ai[k]; 847 xp = x + k*5; 848 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */ 849 nz = ai[k+1] - ai[k]; 850 vj = aj + ai[k]; 851 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 852 PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 853 while (nz--) { 854 xp = x + (*vj)*5; 855 /* xk += U(k,:)*x(:) */ 856 x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4]; 857 x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4]; 858 x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4]; 859 x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4]; 860 x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4]; 861 vj++; 862 v += 25; 863 } 864 xp = x + k*5; 865 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; 866 } 867 PetscFunctionReturn(0); 868 } 869 870 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 871 { 872 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 873 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 874 const MatScalar *aa=a->a; 875 PetscScalar *x; 876 const PetscScalar *b; 877 PetscErrorCode ierr; 878 879 PetscFunctionBegin; 880 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 881 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 882 883 /* solve U^T * D * y = b by forward substitution */ 884 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */ 885 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 886 887 /* solve U*x = y by back substitution */ 888 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 889 890 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 891 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 892 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 893 PetscFunctionReturn(0); 894 } 895 896 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 897 { 898 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 899 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 900 const MatScalar *aa=a->a; 901 PetscScalar *x; 902 const PetscScalar *b; 903 PetscErrorCode ierr; 904 905 PetscFunctionBegin; 906 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 907 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 908 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */ 909 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 910 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 911 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 912 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 913 PetscFunctionReturn(0); 914 } 915 916 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 917 { 918 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 919 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 920 const MatScalar *aa=a->a; 921 PetscScalar *x; 922 const PetscScalar *b; 923 PetscErrorCode ierr; 924 925 PetscFunctionBegin; 926 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 927 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 928 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); 929 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 930 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 931 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 932 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 933 PetscFunctionReturn(0); 934 } 935 936 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 937 { 938 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 939 IS isrow=a->row; 940 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 941 PetscErrorCode ierr; 942 const PetscInt *r,*vj; 943 PetscInt nz,k,idx; 944 const MatScalar *aa=a->a,*v,*diag; 945 PetscScalar *x,x0,x1,x2,x3,*t,*tp; 946 const PetscScalar *b; 947 948 PetscFunctionBegin; 949 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 950 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 951 t = a->solve_work; 952 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 953 954 /* solve U^T * D * y = b by forward substitution */ 955 tp = t; 956 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 957 idx = 4*r[k]; 958 tp[0] = b[idx]; 959 tp[1] = b[idx+1]; 960 tp[2] = b[idx+2]; 961 tp[3] = b[idx+3]; 962 tp += 4; 963 } 964 965 for (k=0; k<mbs; k++) { 966 v = aa + 16*ai[k]; 967 vj = aj + ai[k]; 968 tp = t + k*4; 969 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; 970 nz = ai[k+1] - ai[k]; 971 972 tp = t + (*vj)*4; 973 while (nz--) { 974 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 975 tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 976 tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 977 tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 978 vj++; 979 tp = t + (*vj)*4; 980 v += 16; 981 } 982 983 /* xk = inv(Dk)*(Dk*xk) */ 984 diag = aa+k*16; /* ptr to inv(Dk) */ 985 tp = t + k*4; 986 tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 987 tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 988 tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 989 tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 990 } 991 992 /* solve U*x = y by back substitution */ 993 for (k=mbs-1; k>=0; k--) { 994 v = aa + 16*ai[k]; 995 vj = aj + ai[k]; 996 tp = t + k*4; 997 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */ 998 nz = ai[k+1] - ai[k]; 999 1000 tp = t + (*vj)*4; 1001 while (nz--) { 1002 /* xk += U(k,:)*x(:) */ 1003 x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3]; 1004 x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3]; 1005 x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3]; 1006 x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3]; 1007 vj++; 1008 tp = t + (*vj)*4; 1009 v += 16; 1010 } 1011 tp = t + k*4; 1012 tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; 1013 idx = 4*r[k]; 1014 x[idx] = x0; 1015 x[idx+1] = x1; 1016 x[idx+2] = x2; 1017 x[idx+3] = x3; 1018 } 1019 1020 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1021 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1022 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1023 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1024 PetscFunctionReturn(0); 1025 } 1026 1027 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1028 { 1029 const MatScalar *v,*diag; 1030 PetscScalar *xp,x0,x1,x2,x3; 1031 PetscInt nz,k; 1032 const PetscInt *vj; 1033 1034 PetscFunctionBegin; 1035 for (k=0; k<mbs; k++) { 1036 v = aa + 16*ai[k]; 1037 xp = x + k*4; 1038 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */ 1039 nz = ai[k+1] - ai[k]; 1040 vj = aj + ai[k]; 1041 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1042 PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1043 while (nz--) { 1044 xp = x + (*vj)*4; 1045 /* x(:) += U(k,:)^T*(Dk*xk) */ 1046 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1047 xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1048 xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1049 xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1050 vj++; 1051 v += 16; 1052 } 1053 /* xk = inv(Dk)*(Dk*xk) */ 1054 diag = aa+k*16; /* ptr to inv(Dk) */ 1055 xp = x + k*4; 1056 xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1057 xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1058 xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1059 xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1060 } 1061 PetscFunctionReturn(0); 1062 } 1063 1064 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1065 { 1066 const MatScalar *v; 1067 PetscScalar *xp,x0,x1,x2,x3; 1068 PetscInt nz,k; 1069 const PetscInt *vj; 1070 1071 PetscFunctionBegin; 1072 for (k=mbs-1; k>=0; k--) { 1073 v = aa + 16*ai[k]; 1074 xp = x + k*4; 1075 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */ 1076 nz = ai[k+1] - ai[k]; 1077 vj = aj + ai[k]; 1078 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1079 PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1080 while (nz--) { 1081 xp = x + (*vj)*4; 1082 /* xk += U(k,:)*x(:) */ 1083 x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3]; 1084 x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3]; 1085 x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3]; 1086 x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3]; 1087 vj++; 1088 v += 16; 1089 } 1090 xp = x + k*4; 1091 xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3; 1092 } 1093 PetscFunctionReturn(0); 1094 } 1095 1096 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1097 { 1098 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1099 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1100 const MatScalar *aa=a->a; 1101 PetscScalar *x; 1102 const PetscScalar *b; 1103 PetscErrorCode ierr; 1104 1105 PetscFunctionBegin; 1106 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1107 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1108 1109 /* solve U^T * D * y = b by forward substitution */ 1110 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */ 1111 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1112 1113 /* solve U*x = y by back substitution */ 1114 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1115 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1116 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1117 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1118 PetscFunctionReturn(0); 1119 } 1120 1121 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1122 { 1123 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1124 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1125 const MatScalar *aa=a->a; 1126 PetscScalar *x; 1127 const PetscScalar *b; 1128 PetscErrorCode ierr; 1129 1130 PetscFunctionBegin; 1131 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1133 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */ 1134 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1135 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1136 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1137 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1138 PetscFunctionReturn(0); 1139 } 1140 1141 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1142 { 1143 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1144 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1145 const MatScalar *aa=a->a; 1146 PetscScalar *x; 1147 const PetscScalar *b; 1148 PetscErrorCode ierr; 1149 1150 PetscFunctionBegin; 1151 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1152 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1153 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); 1154 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1155 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1156 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1157 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1158 PetscFunctionReturn(0); 1159 } 1160 1161 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1162 { 1163 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1164 IS isrow=a->row; 1165 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1166 PetscErrorCode ierr; 1167 const PetscInt *r; 1168 PetscInt nz,k,idx; 1169 const PetscInt *vj; 1170 const MatScalar *aa=a->a,*v,*diag; 1171 PetscScalar *x,x0,x1,x2,*t,*tp; 1172 const PetscScalar *b; 1173 1174 PetscFunctionBegin; 1175 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1176 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1177 t = a->solve_work; 1178 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1179 1180 /* solve U^T * D * y = b by forward substitution */ 1181 tp = t; 1182 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1183 idx = 3*r[k]; 1184 tp[0] = b[idx]; 1185 tp[1] = b[idx+1]; 1186 tp[2] = b[idx+2]; 1187 tp += 3; 1188 } 1189 1190 for (k=0; k<mbs; k++) { 1191 v = aa + 9*ai[k]; 1192 vj = aj + ai[k]; 1193 tp = t + k*3; 1194 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; 1195 nz = ai[k+1] - ai[k]; 1196 1197 tp = t + (*vj)*3; 1198 while (nz--) { 1199 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1200 tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1201 tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1202 vj++; 1203 tp = t + (*vj)*3; 1204 v += 9; 1205 } 1206 1207 /* xk = inv(Dk)*(Dk*xk) */ 1208 diag = aa+k*9; /* ptr to inv(Dk) */ 1209 tp = t + k*3; 1210 tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1211 tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1212 tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1213 } 1214 1215 /* solve U*x = y by back substitution */ 1216 for (k=mbs-1; k>=0; k--) { 1217 v = aa + 9*ai[k]; 1218 vj = aj + ai[k]; 1219 tp = t + k*3; 1220 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */ 1221 nz = ai[k+1] - ai[k]; 1222 1223 tp = t + (*vj)*3; 1224 while (nz--) { 1225 /* xk += U(k,:)*x(:) */ 1226 x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2]; 1227 x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2]; 1228 x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2]; 1229 vj++; 1230 tp = t + (*vj)*3; 1231 v += 9; 1232 } 1233 tp = t + k*3; 1234 tp[0] = x0; tp[1] = x1; tp[2] = x2; 1235 idx = 3*r[k]; 1236 x[idx] = x0; 1237 x[idx+1] = x1; 1238 x[idx+2] = x2; 1239 } 1240 1241 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1242 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1243 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1244 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1245 PetscFunctionReturn(0); 1246 } 1247 1248 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1249 { 1250 const MatScalar *v,*diag; 1251 PetscScalar *xp,x0,x1,x2; 1252 PetscInt nz,k; 1253 const PetscInt *vj; 1254 1255 PetscFunctionBegin; 1256 for (k=0; k<mbs; k++) { 1257 v = aa + 9*ai[k]; 1258 xp = x + k*3; 1259 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */ 1260 nz = ai[k+1] - ai[k]; 1261 vj = aj + ai[k]; 1262 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1263 PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1264 while (nz--) { 1265 xp = x + (*vj)*3; 1266 /* x(:) += U(k,:)^T*(Dk*xk) */ 1267 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1268 xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1269 xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1270 vj++; 1271 v += 9; 1272 } 1273 /* xk = inv(Dk)*(Dk*xk) */ 1274 diag = aa+k*9; /* ptr to inv(Dk) */ 1275 xp = x + k*3; 1276 xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1277 xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1278 xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1279 } 1280 PetscFunctionReturn(0); 1281 } 1282 1283 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1284 { 1285 const MatScalar *v; 1286 PetscScalar *xp,x0,x1,x2; 1287 PetscInt nz,k; 1288 const PetscInt *vj; 1289 1290 PetscFunctionBegin; 1291 for (k=mbs-1; k>=0; k--) { 1292 v = aa + 9*ai[k]; 1293 xp = x + k*3; 1294 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */ 1295 nz = ai[k+1] - ai[k]; 1296 vj = aj + ai[k]; 1297 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1298 PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1299 while (nz--) { 1300 xp = x + (*vj)*3; 1301 /* xk += U(k,:)*x(:) */ 1302 x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2]; 1303 x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2]; 1304 x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2]; 1305 vj++; 1306 v += 9; 1307 } 1308 xp = x + k*3; 1309 xp[0] = x0; xp[1] = x1; xp[2] = x2; 1310 } 1311 PetscFunctionReturn(0); 1312 } 1313 1314 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1315 { 1316 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1317 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1318 const MatScalar *aa=a->a; 1319 PetscScalar *x; 1320 const PetscScalar *b; 1321 PetscErrorCode ierr; 1322 1323 PetscFunctionBegin; 1324 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1325 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1326 1327 /* solve U^T * D * y = b by forward substitution */ 1328 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1329 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1330 1331 /* solve U*x = y by back substitution */ 1332 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1333 1334 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1335 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1336 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1337 PetscFunctionReturn(0); 1338 } 1339 1340 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1341 { 1342 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1343 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1344 const MatScalar *aa=a->a; 1345 PetscScalar *x; 1346 const PetscScalar *b; 1347 PetscErrorCode ierr; 1348 1349 PetscFunctionBegin; 1350 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1351 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1352 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1353 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1354 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1355 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1356 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1357 PetscFunctionReturn(0); 1358 } 1359 1360 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1361 { 1362 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1363 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1364 const MatScalar *aa=a->a; 1365 PetscScalar *x; 1366 const PetscScalar *b; 1367 PetscErrorCode ierr; 1368 1369 PetscFunctionBegin; 1370 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1371 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1372 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1373 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1374 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1375 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1376 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1377 PetscFunctionReturn(0); 1378 } 1379 1380 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1381 { 1382 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1383 IS isrow=a->row; 1384 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1385 PetscErrorCode ierr; 1386 const PetscInt *r,*vj; 1387 PetscInt nz,k,k2,idx; 1388 const MatScalar *aa=a->a,*v,*diag; 1389 PetscScalar *x,x0,x1,*t; 1390 const PetscScalar *b; 1391 1392 PetscFunctionBegin; 1393 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1394 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1395 t = a->solve_work; 1396 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1397 1398 /* solve U^T * D * y = perm(b) by forward substitution */ 1399 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1400 idx = 2*r[k]; 1401 t[k*2] = b[idx]; 1402 t[k*2+1] = b[idx+1]; 1403 } 1404 for (k=0; k<mbs; k++) { 1405 v = aa + 4*ai[k]; 1406 vj = aj + ai[k]; 1407 k2 = k*2; 1408 x0 = t[k2]; x1 = t[k2+1]; 1409 nz = ai[k+1] - ai[k]; 1410 while (nz--) { 1411 t[(*vj)*2] += v[0]*x0 + v[1]*x1; 1412 t[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1413 vj++; v += 4; 1414 } 1415 diag = aa+k*4; /* ptr to inv(Dk) */ 1416 t[k2] = diag[0]*x0 + diag[2]*x1; 1417 t[k2+1] = diag[1]*x0 + diag[3]*x1; 1418 } 1419 1420 /* solve U*x = y by back substitution */ 1421 for (k=mbs-1; k>=0; k--) { 1422 v = aa + 4*ai[k]; 1423 vj = aj + ai[k]; 1424 k2 = k*2; 1425 x0 = t[k2]; x1 = t[k2+1]; 1426 nz = ai[k+1] - ai[k]; 1427 while (nz--) { 1428 x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1]; 1429 x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1]; 1430 vj++; 1431 v += 4; 1432 } 1433 t[k2] = x0; 1434 t[k2+1] = x1; 1435 idx = 2*r[k]; 1436 x[idx] = x0; 1437 x[idx+1] = x1; 1438 } 1439 1440 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1441 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1442 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1443 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1444 PetscFunctionReturn(0); 1445 } 1446 1447 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1448 { 1449 const MatScalar *v,*diag; 1450 PetscScalar x0,x1; 1451 PetscInt nz,k,k2; 1452 const PetscInt *vj; 1453 1454 PetscFunctionBegin; 1455 for (k=0; k<mbs; k++) { 1456 v = aa + 4*ai[k]; 1457 vj = aj + ai[k]; 1458 k2 = k*2; 1459 x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */ 1460 nz = ai[k+1] - ai[k]; 1461 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1462 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1463 while (nz--) { 1464 /* x(:) += U(k,:)^T*(Dk*xk) */ 1465 x[(*vj)*2] += v[0]*x0 + v[1]*x1; 1466 x[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1467 vj++; v += 4; 1468 } 1469 /* xk = inv(Dk)*(Dk*xk) */ 1470 diag = aa+k*4; /* ptr to inv(Dk) */ 1471 x[k2] = diag[0]*x0 + diag[2]*x1; 1472 x[k2+1] = diag[1]*x0 + diag[3]*x1; 1473 } 1474 PetscFunctionReturn(0); 1475 } 1476 1477 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1478 { 1479 const MatScalar *v; 1480 PetscScalar x0,x1; 1481 PetscInt nz,k,k2; 1482 const PetscInt *vj; 1483 1484 PetscFunctionBegin; 1485 for (k=mbs-1; k>=0; k--) { 1486 v = aa + 4*ai[k]; 1487 vj = aj + ai[k]; 1488 k2 = k*2; 1489 x0 = x[k2]; x1 = x[k2+1]; /* xk */ 1490 nz = ai[k+1] - ai[k]; 1491 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1492 PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1493 while (nz--) { 1494 /* xk += U(k,:)*x(:) */ 1495 x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1]; 1496 x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1]; 1497 vj++; 1498 v += 4; 1499 } 1500 x[k2] = x0; 1501 x[k2+1] = x1; 1502 } 1503 PetscFunctionReturn(0); 1504 } 1505 1506 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1507 { 1508 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1509 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1510 const MatScalar *aa=a->a; 1511 PetscScalar *x; 1512 const PetscScalar *b; 1513 PetscErrorCode ierr; 1514 1515 PetscFunctionBegin; 1516 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1517 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1518 1519 /* solve U^T * D * y = b by forward substitution */ 1520 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1521 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1522 1523 /* solve U*x = y by back substitution */ 1524 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1525 1526 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1527 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1528 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1529 PetscFunctionReturn(0); 1530 } 1531 1532 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1533 { 1534 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1535 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1536 const MatScalar *aa=a->a; 1537 PetscScalar *x; 1538 const PetscScalar *b; 1539 PetscErrorCode ierr; 1540 1541 PetscFunctionBegin; 1542 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1543 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1544 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1545 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1546 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1547 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1548 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1549 PetscFunctionReturn(0); 1550 } 1551 1552 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1553 { 1554 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1555 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1556 const MatScalar *aa=a->a; 1557 PetscScalar *x; 1558 const PetscScalar *b; 1559 PetscErrorCode ierr; 1560 1561 PetscFunctionBegin; 1562 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1563 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1564 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1565 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1566 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1567 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1568 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 1569 PetscFunctionReturn(0); 1570 } 1571 1572 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1573 { 1574 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1575 IS isrow=a->row; 1576 PetscErrorCode ierr; 1577 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1578 const MatScalar *aa=a->a,*v; 1579 const PetscScalar *b; 1580 PetscScalar *x,xk,*t; 1581 PetscInt nz,k,j; 1582 1583 PetscFunctionBegin; 1584 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1585 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1586 t = a->solve_work; 1587 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1588 1589 /* solve U^T*D*y = perm(b) by forward substitution */ 1590 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1591 for (k=0; k<mbs; k++) { 1592 v = aa + ai[k]; 1593 vj = aj + ai[k]; 1594 xk = t[k]; 1595 nz = ai[k+1] - ai[k] - 1; 1596 for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk; 1597 t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */ 1598 } 1599 1600 /* solve U*perm(x) = y by back substitution */ 1601 for (k=mbs-1; k>=0; k--) { 1602 v = aa + adiag[k] - 1; 1603 vj = aj + adiag[k] - 1; 1604 nz = ai[k+1] - ai[k] - 1; 1605 for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]]; 1606 x[rp[k]] = t[k]; 1607 } 1608 1609 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1610 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1611 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1612 ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr); 1613 PetscFunctionReturn(0); 1614 } 1615 1616 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1617 { 1618 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1619 IS isrow=a->row; 1620 PetscErrorCode ierr; 1621 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1622 const MatScalar *aa=a->a,*v; 1623 PetscScalar *x,xk,*t; 1624 const PetscScalar *b; 1625 PetscInt nz,k; 1626 1627 PetscFunctionBegin; 1628 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1629 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1630 t = a->solve_work; 1631 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1632 1633 /* solve U^T*D*y = perm(b) by forward substitution */ 1634 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1635 for (k=0; k<mbs; k++) { 1636 v = aa + ai[k] + 1; 1637 vj = aj + ai[k] + 1; 1638 xk = t[k]; 1639 nz = ai[k+1] - ai[k] - 1; 1640 while (nz--) t[*vj++] += (*v++) * xk; 1641 t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */ 1642 } 1643 1644 /* solve U*perm(x) = y by back substitution */ 1645 for (k=mbs-1; k>=0; k--) { 1646 v = aa + ai[k] + 1; 1647 vj = aj + ai[k] + 1; 1648 nz = ai[k+1] - ai[k] - 1; 1649 while (nz--) t[k] += (*v++) * t[*vj++]; 1650 x[rp[k]] = t[k]; 1651 } 1652 1653 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1654 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1655 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1656 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1657 PetscFunctionReturn(0); 1658 } 1659 1660 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1661 { 1662 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1663 IS isrow=a->row; 1664 PetscErrorCode ierr; 1665 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1666 const MatScalar *aa=a->a,*v; 1667 PetscReal diagk; 1668 PetscScalar *x,xk; 1669 const PetscScalar *b; 1670 PetscInt nz,k; 1671 1672 PetscFunctionBegin; 1673 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1674 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1675 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1676 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1677 1678 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1679 for (k=0; k<mbs; k++) { 1680 v = aa + ai[k]; 1681 vj = aj + ai[k]; 1682 xk = x[k]; 1683 nz = ai[k+1] - ai[k] - 1; 1684 while (nz--) x[*vj++] += (*v++) * xk; 1685 1686 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1687 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1688 x[k] = xk*PetscSqrtReal(diagk); 1689 } 1690 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1691 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1692 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1693 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1694 PetscFunctionReturn(0); 1695 } 1696 1697 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1698 { 1699 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1700 IS isrow=a->row; 1701 PetscErrorCode ierr; 1702 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1703 const MatScalar *aa=a->a,*v; 1704 PetscReal diagk; 1705 PetscScalar *x,xk; 1706 const PetscScalar *b; 1707 PetscInt nz,k; 1708 1709 PetscFunctionBegin; 1710 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1711 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1712 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1713 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1714 1715 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1716 for (k=0; k<mbs; k++) { 1717 v = aa + ai[k] + 1; 1718 vj = aj + ai[k] + 1; 1719 xk = x[k]; 1720 nz = ai[k+1] - ai[k] - 1; 1721 while (nz--) x[*vj++] += (*v++) * xk; 1722 1723 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1724 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1725 x[k] = xk*PetscSqrtReal(diagk); 1726 } 1727 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1728 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1729 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1730 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1731 PetscFunctionReturn(0); 1732 } 1733 1734 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1735 { 1736 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1737 IS isrow=a->row; 1738 PetscErrorCode ierr; 1739 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1740 const MatScalar *aa=a->a,*v; 1741 PetscReal diagk; 1742 PetscScalar *x,*t; 1743 const PetscScalar *b; 1744 PetscInt nz,k; 1745 1746 PetscFunctionBegin; 1747 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1748 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1749 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1750 t = a->solve_work; 1751 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1752 1753 for (k=mbs-1; k>=0; k--) { 1754 v = aa + ai[k]; 1755 vj = aj + ai[k]; 1756 diagk = PetscRealPart(aa[adiag[k]]); 1757 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1758 t[k] = b[k] * PetscSqrtReal(diagk); 1759 nz = ai[k+1] - ai[k] - 1; 1760 while (nz--) t[k] += (*v++) * t[*vj++]; 1761 x[rp[k]] = t[k]; 1762 } 1763 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1764 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1765 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1766 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1767 PetscFunctionReturn(0); 1768 } 1769 1770 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1771 { 1772 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1773 IS isrow=a->row; 1774 PetscErrorCode ierr; 1775 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1776 const MatScalar *aa=a->a,*v; 1777 PetscReal diagk; 1778 PetscScalar *x,*t; 1779 const PetscScalar *b; 1780 PetscInt nz,k; 1781 1782 PetscFunctionBegin; 1783 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1784 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1785 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1786 t = a->solve_work; 1787 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1788 1789 for (k=mbs-1; k>=0; k--) { 1790 v = aa + ai[k] + 1; 1791 vj = aj + ai[k] + 1; 1792 diagk = PetscRealPart(aa[ai[k]]); 1793 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1794 t[k] = b[k] * PetscSqrtReal(diagk); 1795 nz = ai[k+1] - ai[k] - 1; 1796 while (nz--) t[k] += (*v++) * t[*vj++]; 1797 x[rp[k]] = t[k]; 1798 } 1799 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1800 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1801 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1802 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1803 PetscFunctionReturn(0); 1804 } 1805 1806 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx) 1807 { 1808 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1809 PetscErrorCode ierr; 1810 1811 PetscFunctionBegin; 1812 if (A->rmap->bs == 1) { 1813 ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr); 1814 } else { 1815 IS isrow=a->row; 1816 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1817 const MatScalar *aa=a->a,*v; 1818 PetscScalar *x,*t; 1819 const PetscScalar *b; 1820 PetscInt nz,k,n,i,j; 1821 1822 if (bb->n > a->solves_work_n) { 1823 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1824 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1825 a->solves_work_n = bb->n; 1826 } 1827 n = bb->n; 1828 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1829 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1830 t = a->solves_work; 1831 1832 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1833 1834 /* solve U^T*D*y = perm(b) by forward substitution */ 1835 for (k=0; k<mbs; k++) { 1836 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1837 } 1838 for (k=0; k<mbs; k++) { 1839 v = aa + ai[k]; 1840 vj = aj + ai[k]; 1841 nz = ai[k+1] - ai[k] - 1; 1842 for (j=0; j<nz; j++) { 1843 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1844 v++;vj++; 1845 } 1846 for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1847 } 1848 1849 /* solve U*perm(x) = y by back substitution */ 1850 for (k=mbs-1; k>=0; k--) { 1851 v = aa + ai[k] - 1; 1852 vj = aj + ai[k] - 1; 1853 nz = ai[k+1] - ai[k] - 1; 1854 for (j=0; j<nz; j++) { 1855 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1856 v++;vj++; 1857 } 1858 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1859 } 1860 1861 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1862 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1863 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1864 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1865 } 1866 PetscFunctionReturn(0); 1867 } 1868 1869 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx) 1870 { 1871 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1872 PetscErrorCode ierr; 1873 1874 PetscFunctionBegin; 1875 if (A->rmap->bs == 1) { 1876 ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr); 1877 } else { 1878 IS isrow=a->row; 1879 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1880 const MatScalar *aa=a->a,*v; 1881 PetscScalar *x,*t; 1882 const PetscScalar *b; 1883 PetscInt nz,k,n,i; 1884 1885 if (bb->n > a->solves_work_n) { 1886 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1887 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1888 a->solves_work_n = bb->n; 1889 } 1890 n = bb->n; 1891 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1892 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1893 t = a->solves_work; 1894 1895 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1896 1897 /* solve U^T*D*y = perm(b) by forward substitution */ 1898 for (k=0; k<mbs; k++) { 1899 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1900 } 1901 for (k=0; k<mbs; k++) { 1902 v = aa + ai[k]; 1903 vj = aj + ai[k]; 1904 nz = ai[k+1] - ai[k]; 1905 while (nz--) { 1906 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1907 v++;vj++; 1908 } 1909 for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1910 } 1911 1912 /* solve U*perm(x) = y by back substitution */ 1913 for (k=mbs-1; k>=0; k--) { 1914 v = aa + ai[k]; 1915 vj = aj + ai[k]; 1916 nz = ai[k+1] - ai[k]; 1917 while (nz--) { 1918 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1919 v++;vj++; 1920 } 1921 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1922 } 1923 1924 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1925 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1926 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1927 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1928 } 1929 PetscFunctionReturn(0); 1930 } 1931 1932 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1933 { 1934 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1935 PetscErrorCode ierr; 1936 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1937 const MatScalar *aa=a->a,*v; 1938 const PetscScalar *b; 1939 PetscScalar *x,xi; 1940 PetscInt nz,i,j; 1941 1942 PetscFunctionBegin; 1943 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1944 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1945 /* solve U^T*D*y = b by forward substitution */ 1946 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 1947 for (i=0; i<mbs; i++) { 1948 v = aa + ai[i]; 1949 vj = aj + ai[i]; 1950 xi = x[i]; 1951 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 1952 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 1953 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 1954 } 1955 /* solve U*x = y by backward substitution */ 1956 for (i=mbs-2; i>=0; i--) { 1957 xi = x[i]; 1958 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 1959 vj = aj + adiag[i] - 1; 1960 nz = ai[i+1] - ai[i] - 1; 1961 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 1962 x[i] = xi; 1963 } 1964 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1965 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1966 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1967 PetscFunctionReturn(0); 1968 } 1969 1970 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X) 1971 { 1972 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1973 PetscErrorCode ierr; 1974 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1975 const MatScalar *aa=a->a,*v; 1976 const PetscScalar *b; 1977 PetscScalar *x,xi; 1978 PetscInt nz,i,j,neq,ldb,ldx; 1979 PetscBool isdense; 1980 1981 PetscFunctionBegin; 1982 if (!mbs) PetscFunctionReturn(0); 1983 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 1984 if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix"); 1985 if (X != B) { 1986 ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);CHKERRQ(ierr); 1987 if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix"); 1988 } 1989 ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr); 1990 ierr = MatDenseGetLDA(B,&ldb);CHKERRQ(ierr); 1991 ierr = MatDenseGetArray(X,&x);CHKERRQ(ierr); 1992 ierr = MatDenseGetLDA(X,&ldx);CHKERRQ(ierr); 1993 for (neq=0; neq<B->cmap->n; neq++) { 1994 /* solve U^T*D*y = b by forward substitution */ 1995 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 1996 for (i=0; i<mbs; i++) { 1997 v = aa + ai[i]; 1998 vj = aj + ai[i]; 1999 xi = x[i]; 2000 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 2001 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 2002 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2003 } 2004 /* solve U*x = y by backward substitution */ 2005 for (i=mbs-2; i>=0; i--) { 2006 xi = x[i]; 2007 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2008 vj = aj + adiag[i] - 1; 2009 nz = ai[i+1] - ai[i] - 1; 2010 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 2011 x[i] = xi; 2012 } 2013 b += ldb; 2014 x += ldx; 2015 } 2016 ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr); 2017 ierr = MatDenseRestoreArray(X,&x);CHKERRQ(ierr); 2018 ierr = PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));CHKERRQ(ierr); 2019 PetscFunctionReturn(0); 2020 } 2021 2022 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2023 { 2024 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2025 PetscErrorCode ierr; 2026 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2027 const MatScalar *aa=a->a,*v; 2028 PetscScalar *x,xk; 2029 const PetscScalar *b; 2030 PetscInt nz,k; 2031 2032 PetscFunctionBegin; 2033 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2034 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2035 2036 /* solve U^T*D*y = b by forward substitution */ 2037 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2038 for (k=0; k<mbs; k++) { 2039 v = aa + ai[k] + 1; 2040 vj = aj + ai[k] + 1; 2041 xk = x[k]; 2042 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2043 while (nz--) x[*vj++] += (*v++) * xk; 2044 x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2045 } 2046 2047 /* solve U*x = y by back substitution */ 2048 for (k=mbs-2; k>=0; k--) { 2049 v = aa + ai[k] + 1; 2050 vj = aj + ai[k] + 1; 2051 xk = x[k]; 2052 nz = ai[k+1] - ai[k] - 1; 2053 while (nz--) { 2054 xk += (*v++) * x[*vj++]; 2055 } 2056 x[k] = xk; 2057 } 2058 2059 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2060 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2061 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2062 PetscFunctionReturn(0); 2063 } 2064 2065 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2066 { 2067 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2068 PetscErrorCode ierr; 2069 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2070 const MatScalar *aa=a->a,*v; 2071 PetscReal diagk; 2072 PetscScalar *x; 2073 const PetscScalar *b; 2074 PetscInt nz,k; 2075 2076 PetscFunctionBegin; 2077 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2078 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2079 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2080 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2081 for (k=0; k<mbs; k++) { 2082 v = aa + ai[k]; 2083 vj = aj + ai[k]; 2084 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2085 while (nz--) x[*vj++] += (*v++) * x[k]; 2086 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2087 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]])); 2088 x[k] *= PetscSqrtReal(diagk); 2089 } 2090 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2091 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2092 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2093 PetscFunctionReturn(0); 2094 } 2095 2096 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2097 { 2098 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2099 PetscErrorCode ierr; 2100 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2101 const MatScalar *aa=a->a,*v; 2102 PetscReal diagk; 2103 PetscScalar *x; 2104 const PetscScalar *b; 2105 PetscInt nz,k; 2106 2107 PetscFunctionBegin; 2108 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2109 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2110 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2111 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2112 for (k=0; k<mbs; k++) { 2113 v = aa + ai[k] + 1; 2114 vj = aj + ai[k] + 1; 2115 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2116 while (nz--) x[*vj++] += (*v++) * x[k]; 2117 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2118 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]])); 2119 x[k] *= PetscSqrtReal(diagk); 2120 } 2121 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2122 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2123 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2124 PetscFunctionReturn(0); 2125 } 2126 2127 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2128 { 2129 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2130 PetscErrorCode ierr; 2131 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2132 const MatScalar *aa=a->a,*v; 2133 PetscReal diagk; 2134 PetscScalar *x; 2135 const PetscScalar *b; 2136 PetscInt nz,k; 2137 2138 PetscFunctionBegin; 2139 /* solve D^(1/2)*U*x = b by back substitution */ 2140 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2141 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2142 2143 for (k=mbs-1; k>=0; k--) { 2144 v = aa + ai[k]; 2145 vj = aj + ai[k]; 2146 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2147 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2148 x[k] = PetscSqrtReal(diagk)*b[k]; 2149 nz = ai[k+1] - ai[k] - 1; 2150 while (nz--) x[k] += (*v++) * x[*vj++]; 2151 } 2152 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2153 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2154 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2155 PetscFunctionReturn(0); 2156 } 2157 2158 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2159 { 2160 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2161 PetscErrorCode ierr; 2162 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2163 const MatScalar *aa=a->a,*v; 2164 PetscReal diagk; 2165 PetscScalar *x; 2166 const PetscScalar *b; 2167 PetscInt nz,k; 2168 2169 PetscFunctionBegin; 2170 /* solve D^(1/2)*U*x = b by back substitution */ 2171 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2172 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2173 2174 for (k=mbs-1; k>=0; k--) { 2175 v = aa + ai[k] + 1; 2176 vj = aj + ai[k] + 1; 2177 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2178 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2179 x[k] = PetscSqrtReal(diagk)*b[k]; 2180 nz = ai[k+1] - ai[k] - 1; 2181 while (nz--) x[k] += (*v++) * x[*vj++]; 2182 } 2183 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2184 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2185 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2186 PetscFunctionReturn(0); 2187 } 2188 2189 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2190 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2191 { 2192 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2193 PetscErrorCode ierr; 2194 const PetscInt *rip,mbs = a->mbs,*ai,*aj; 2195 PetscInt *jutmp,bs = A->rmap->bs,i; 2196 PetscInt m,reallocs = 0,*levtmp; 2197 PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2198 PetscInt incrlev,*lev,shift,prow,nz; 2199 PetscReal f = info->fill,levels = info->levels; 2200 PetscBool perm_identity; 2201 2202 PetscFunctionBegin; 2203 /* check whether perm is the identity mapping */ 2204 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2205 2206 if (perm_identity) { 2207 a->permute = PETSC_FALSE; 2208 ai = a->i; aj = a->j; 2209 } else { /* non-trivial permutation */ 2210 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2211 } 2212 2213 /* initialization */ 2214 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2215 umax = (PetscInt)(f*ai[mbs] + 1); 2216 ierr = PetscMalloc1(umax,&lev);CHKERRQ(ierr); 2217 umax += mbs + 1; 2218 shift = mbs + 1; 2219 ierr = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr); 2220 ierr = PetscMalloc1(umax,&ju);CHKERRQ(ierr); 2221 iu[0] = mbs + 1; 2222 juidx = mbs + 1; 2223 /* prowl: linked list for pivot row */ 2224 ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr); 2225 /* q: linked list for col index */ 2226 2227 for (i=0; i<mbs; i++) { 2228 prowl[i] = mbs; 2229 q[i] = 0; 2230 } 2231 2232 /* for each row k */ 2233 for (k=0; k<mbs; k++) { 2234 nzk = 0; 2235 q[k] = mbs; 2236 /* copy current row into linked list */ 2237 nz = ai[rip[k]+1] - ai[rip[k]]; 2238 j = ai[rip[k]]; 2239 while (nz--) { 2240 vj = rip[aj[j++]]; 2241 if (vj > k) { 2242 qm = k; 2243 do { 2244 m = qm; qm = q[m]; 2245 } while (qm < vj); 2246 if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n"); 2247 nzk++; 2248 q[m] = vj; 2249 q[vj] = qm; 2250 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2251 } 2252 } 2253 2254 /* modify nonzero structure of k-th row by computing fill-in 2255 for each row prow to be merged in */ 2256 prow = k; 2257 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2258 2259 while (prow < k) { 2260 /* merge row prow into k-th row */ 2261 jmin = iu[prow] + 1; 2262 jmax = iu[prow+1]; 2263 qm = k; 2264 for (j=jmin; j<jmax; j++) { 2265 incrlev = lev[j-shift] + 1; 2266 if (incrlev > levels) continue; 2267 vj = ju[j]; 2268 do { 2269 m = qm; qm = q[m]; 2270 } while (qm < vj); 2271 if (qm != vj) { /* a fill */ 2272 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2273 levtmp[vj] = incrlev; 2274 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2275 } 2276 prow = prowl[prow]; /* next pivot row */ 2277 } 2278 2279 /* add k to row list for first nonzero element in k-th row */ 2280 if (nzk > 1) { 2281 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2282 prowl[k] = prowl[i]; prowl[i] = k; 2283 } 2284 iu[k+1] = iu[k] + nzk; 2285 2286 /* allocate more space to ju and lev if needed */ 2287 if (iu[k+1] > umax) { 2288 /* estimate how much additional space we will need */ 2289 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2290 /* just double the memory each time */ 2291 maxadd = umax; 2292 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2293 umax += maxadd; 2294 2295 /* allocate a longer ju */ 2296 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2297 ierr = PetscArraycpy(jutmp,ju,iu[k]);CHKERRQ(ierr); 2298 ierr = PetscFree(ju);CHKERRQ(ierr); 2299 ju = jutmp; 2300 2301 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2302 ierr = PetscArraycpy(jutmp,lev,iu[k]-shift);CHKERRQ(ierr); 2303 ierr = PetscFree(lev);CHKERRQ(ierr); 2304 lev = jutmp; 2305 reallocs += 2; /* count how many times we realloc */ 2306 } 2307 2308 /* save nonzero structure of k-th row in ju */ 2309 i=k; 2310 while (nzk--) { 2311 i = q[i]; 2312 ju[juidx] = i; 2313 lev[juidx-shift] = levtmp[i]; 2314 juidx++; 2315 } 2316 } 2317 2318 #if defined(PETSC_USE_INFO) 2319 if (ai[mbs] != 0) { 2320 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2321 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr); 2322 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2323 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr); 2324 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 2325 } else { 2326 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2327 } 2328 #endif 2329 2330 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2331 ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr); 2332 ierr = PetscFree(lev);CHKERRQ(ierr); 2333 2334 /* put together the new matrix */ 2335 ierr = MatSeqSBAIJSetPreallocation(B,bs,0,NULL);CHKERRQ(ierr); 2336 2337 /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */ 2338 b = (Mat_SeqSBAIJ*)(B)->data; 2339 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2340 2341 b->singlemalloc = PETSC_FALSE; 2342 b->free_a = PETSC_TRUE; 2343 b->free_ij = PETSC_TRUE; 2344 /* the next line frees the default space generated by the Create() */ 2345 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 2346 ierr = PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);CHKERRQ(ierr); 2347 b->j = ju; 2348 b->i = iu; 2349 b->diag = NULL; 2350 b->ilen = NULL; 2351 b->imax = NULL; 2352 2353 ierr = ISDestroy(&b->row);CHKERRQ(ierr); 2354 ierr = ISDestroy(&b->icol);CHKERRQ(ierr); 2355 b->row = perm; 2356 b->icol = perm; 2357 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2358 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2359 ierr = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr); 2360 /* In b structure: Free imax, ilen, old a, old j. 2361 Allocate idnew, solve_work, new a, new j */ 2362 ierr = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2363 b->maxnz = b->nz = iu[mbs]; 2364 2365 (B)->info.factor_mallocs = reallocs; 2366 (B)->info.fill_ratio_given = f; 2367 if (ai[mbs] != 0) { 2368 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2369 } else { 2370 (B)->info.fill_ratio_needed = 0.0; 2371 } 2372 ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr); 2373 PetscFunctionReturn(0); 2374 } 2375 2376 /* 2377 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2378 */ 2379 #include <petscbt.h> 2380 #include <../src/mat/utils/freespace.h> 2381 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2382 { 2383 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2384 PetscErrorCode ierr; 2385 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2386 PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j; 2387 const PetscInt *rip; 2388 PetscInt reallocs=0,i,*ui,*udiag,*cols; 2389 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2390 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr; 2391 PetscReal fill =info->fill,levels=info->levels; 2392 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2393 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2394 PetscBT lnkbt; 2395 2396 PetscFunctionBegin; 2397 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 2398 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2399 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2400 if (bs > 1) { 2401 ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr); 2402 PetscFunctionReturn(0); 2403 } 2404 2405 /* check whether perm is the identity mapping */ 2406 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2407 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2408 a->permute = PETSC_FALSE; 2409 2410 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2411 ierr = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr); 2412 ui[0] = 0; 2413 2414 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2415 if (!levels) { 2416 /* reuse the column pointers and row offsets for memory savings */ 2417 for (i=0; i<am; i++) { 2418 ncols = ai[i+1] - ai[i]; 2419 ui[i+1] = ui[i] + ncols; 2420 udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */ 2421 } 2422 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2423 cols = uj; 2424 for (i=0; i<am; i++) { 2425 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2426 ncols = ai[i+1] - ai[i] -1; 2427 for (j=0; j<ncols; j++) *cols++ = aj[j]; 2428 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */ 2429 } 2430 } else { /* case: levels>0 */ 2431 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2432 2433 /* initialization */ 2434 /* jl: linked list for storing indices of the pivot rows 2435 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2436 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2437 for (i=0; i<am; i++) { 2438 jl[i] = am; il[i] = 0; 2439 } 2440 2441 /* create and initialize a linked list for storing column indices of the active row k */ 2442 nlnk = am + 1; 2443 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2444 2445 /* initial FreeSpace size is fill*(ai[am]+1) */ 2446 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2447 2448 current_space = free_space; 2449 2450 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2451 2452 current_space_lvl = free_space_lvl; 2453 2454 for (k=0; k<am; k++) { /* for each active row k */ 2455 /* initialize lnk by the column indices of row k */ 2456 nzk = 0; 2457 ncols = ai[k+1] - ai[k]; 2458 if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k); 2459 cols = aj+ai[k]; 2460 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2461 nzk += nlnk; 2462 2463 /* update lnk by computing fill-in for each pivot row to be merged in */ 2464 prow = jl[k]; /* 1st pivot row */ 2465 2466 while (prow < k) { 2467 nextprow = jl[prow]; 2468 2469 /* merge prow into k-th row */ 2470 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2471 jmax = ui[prow+1]; 2472 ncols = jmax-jmin; 2473 i = jmin - ui[prow]; 2474 2475 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2476 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2477 j = *(uj - 1); 2478 ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2479 nzk += nlnk; 2480 2481 /* update il and jl for prow */ 2482 if (jmin < jmax) { 2483 il[prow] = jmin; 2484 2485 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2486 } 2487 prow = nextprow; 2488 } 2489 2490 /* if free space is not available, make more free space */ 2491 if (current_space->local_remaining<nzk) { 2492 i = am - k + 1; /* num of unfactored rows */ 2493 i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2494 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2495 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2496 reallocs++; 2497 } 2498 2499 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2500 if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k); 2501 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2502 2503 /* add the k-th row into il and jl */ 2504 if (nzk > 1) { 2505 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2506 jl[k] = jl[i]; jl[i] = k; 2507 il[k] = ui[k] + 1; 2508 } 2509 uj_ptr[k] = current_space->array; 2510 uj_lvl_ptr[k] = current_space_lvl->array; 2511 2512 current_space->array += nzk; 2513 current_space->local_used += nzk; 2514 current_space->local_remaining -= nzk; 2515 current_space_lvl->array += nzk; 2516 current_space_lvl->local_used += nzk; 2517 current_space_lvl->local_remaining -= nzk; 2518 2519 ui[k+1] = ui[k] + nzk; 2520 } 2521 2522 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2523 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2524 2525 /* destroy list of free space and other temporary array(s) */ 2526 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2527 ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor */ 2528 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2529 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2530 2531 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2532 2533 /* put together the new matrix in MATSEQSBAIJ format */ 2534 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2535 2536 b = (Mat_SeqSBAIJ*)(fact)->data; 2537 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2538 2539 b->singlemalloc = PETSC_FALSE; 2540 b->free_a = PETSC_TRUE; 2541 b->free_ij = free_ij; 2542 2543 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2544 2545 b->j = uj; 2546 b->i = ui; 2547 b->diag = udiag; 2548 b->free_diag = PETSC_TRUE; 2549 b->ilen = NULL; 2550 b->imax = NULL; 2551 b->row = perm; 2552 b->col = perm; 2553 2554 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2555 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2556 2557 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2558 2559 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2560 ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2561 2562 b->maxnz = b->nz = ui[am]; 2563 2564 fact->info.factor_mallocs = reallocs; 2565 fact->info.fill_ratio_given = fill; 2566 if (ai[am] != 0) { 2567 fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am]; 2568 } else { 2569 fact->info.fill_ratio_needed = 0.0; 2570 } 2571 #if defined(PETSC_USE_INFO) 2572 if (ai[am] != 0) { 2573 PetscReal af = fact->info.fill_ratio_needed; 2574 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2575 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2576 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2577 } else { 2578 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2579 } 2580 #endif 2581 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2582 PetscFunctionReturn(0); 2583 } 2584 2585 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2586 { 2587 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2588 Mat_SeqSBAIJ *b; 2589 PetscErrorCode ierr; 2590 PetscBool perm_identity,free_ij = PETSC_TRUE; 2591 PetscInt bs=A->rmap->bs,am=a->mbs; 2592 const PetscInt *cols,*rip,*ai=a->i,*aj=a->j; 2593 PetscInt reallocs=0,i,*ui; 2594 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2595 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 2596 PetscReal fill =info->fill,levels=info->levels,ratio_needed; 2597 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2598 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2599 PetscBT lnkbt; 2600 2601 PetscFunctionBegin; 2602 /* 2603 This code originally uses Modified Sparse Row (MSR) storage 2604 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2605 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2606 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2607 thus the original code in MSR format is still used for these cases. 2608 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2609 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2610 */ 2611 if (bs > 1) { 2612 ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr); 2613 PetscFunctionReturn(0); 2614 } 2615 2616 /* check whether perm is the identity mapping */ 2617 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2618 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2619 a->permute = PETSC_FALSE; 2620 2621 /* special case that simply copies fill pattern */ 2622 if (!levels) { 2623 /* reuse the column pointers and row offsets for memory savings */ 2624 ui = a->i; 2625 uj = a->j; 2626 free_ij = PETSC_FALSE; 2627 ratio_needed = 1.0; 2628 } else { /* case: levels>0 */ 2629 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2630 2631 /* initialization */ 2632 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2633 ui[0] = 0; 2634 2635 /* jl: linked list for storing indices of the pivot rows 2636 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2637 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2638 for (i=0; i<am; i++) { 2639 jl[i] = am; il[i] = 0; 2640 } 2641 2642 /* create and initialize a linked list for storing column indices of the active row k */ 2643 nlnk = am + 1; 2644 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2645 2646 /* initial FreeSpace size is fill*(ai[am]+1) */ 2647 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2648 2649 current_space = free_space; 2650 2651 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2652 2653 current_space_lvl = free_space_lvl; 2654 2655 for (k=0; k<am; k++) { /* for each active row k */ 2656 /* initialize lnk by the column indices of row rip[k] */ 2657 nzk = 0; 2658 ncols = ai[rip[k]+1] - ai[rip[k]]; 2659 cols = aj+ai[rip[k]]; 2660 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2661 nzk += nlnk; 2662 2663 /* update lnk by computing fill-in for each pivot row to be merged in */ 2664 prow = jl[k]; /* 1st pivot row */ 2665 2666 while (prow < k) { 2667 nextprow = jl[prow]; 2668 2669 /* merge prow into k-th row */ 2670 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2671 jmax = ui[prow+1]; 2672 ncols = jmax-jmin; 2673 i = jmin - ui[prow]; 2674 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2675 j = *(uj_lvl_ptr[prow] + i - 1); 2676 cols_lvl = uj_lvl_ptr[prow]+i; 2677 ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2678 nzk += nlnk; 2679 2680 /* update il and jl for prow */ 2681 if (jmin < jmax) { 2682 il[prow] = jmin; 2683 j = *cols; 2684 jl[prow] = jl[j]; 2685 jl[j] = prow; 2686 } 2687 prow = nextprow; 2688 } 2689 2690 /* if free space is not available, make more free space */ 2691 if (current_space->local_remaining<nzk) { 2692 i = am - k + 1; /* num of unfactored rows */ 2693 i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2694 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2695 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2696 reallocs++; 2697 } 2698 2699 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2700 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2701 2702 /* add the k-th row into il and jl */ 2703 if (nzk-1 > 0) { 2704 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2705 jl[k] = jl[i]; jl[i] = k; 2706 il[k] = ui[k] + 1; 2707 } 2708 uj_ptr[k] = current_space->array; 2709 uj_lvl_ptr[k] = current_space_lvl->array; 2710 2711 current_space->array += nzk; 2712 current_space->local_used += nzk; 2713 current_space->local_remaining -= nzk; 2714 current_space_lvl->array += nzk; 2715 current_space_lvl->local_used += nzk; 2716 current_space_lvl->local_remaining -= nzk; 2717 2718 ui[k+1] = ui[k] + nzk; 2719 } 2720 2721 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2722 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2723 2724 /* destroy list of free space and other temporary array(s) */ 2725 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2726 ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 2727 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2728 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2729 if (ai[am] != 0) { 2730 ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 2731 } else { 2732 ratio_needed = 0.0; 2733 } 2734 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2735 2736 /* put together the new matrix in MATSEQSBAIJ format */ 2737 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2738 2739 b = (Mat_SeqSBAIJ*)(fact)->data; 2740 2741 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2742 2743 b->singlemalloc = PETSC_FALSE; 2744 b->free_a = PETSC_TRUE; 2745 b->free_ij = free_ij; 2746 2747 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2748 2749 b->j = uj; 2750 b->i = ui; 2751 b->diag = NULL; 2752 b->ilen = NULL; 2753 b->imax = NULL; 2754 b->row = perm; 2755 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2756 2757 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2758 b->icol = perm; 2759 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2760 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2761 2762 b->maxnz = b->nz = ui[am]; 2763 2764 fact->info.factor_mallocs = reallocs; 2765 fact->info.fill_ratio_given = fill; 2766 fact->info.fill_ratio_needed = ratio_needed; 2767 #if defined(PETSC_USE_INFO) 2768 if (ai[am] != 0) { 2769 PetscReal af = fact->info.fill_ratio_needed; 2770 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2771 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2772 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2773 } else { 2774 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2775 } 2776 #endif 2777 if (perm_identity) { 2778 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2779 } else { 2780 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2781 } 2782 PetscFunctionReturn(0); 2783 } 2784