1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 11 { 12 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 13 IS isrow=a->row; 14 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 15 PetscErrorCode ierr; 16 const PetscInt *r; 17 PetscInt nz,*vj,k,idx,k1; 18 PetscInt bs =A->rmap->bs,bs2 = a->bs2; 19 const MatScalar *aa=a->a,*v,*diag; 20 PetscScalar *x,*xk,*xj,*xk_tmp,*t; 21 const PetscScalar *b; 22 23 PetscFunctionBegin; 24 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 t = a->solve_work; 27 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 28 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 29 30 /* solve U^T * D * y = b by forward substitution */ 31 xk = t; 32 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 33 idx = bs*r[k]; 34 for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1]; 35 } 36 for (k=0; k<mbs; k++) { 37 v = aa + bs2*ai[k]; 38 xk = t + k*bs; /* Dk*xk = k-th block of x */ 39 ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */ 40 nz = ai[k+1] - ai[k]; 41 vj = aj + ai[k]; 42 xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */ 43 while (nz--) { 44 /* x(:) += U(k,:)^T*(Dk*xk) */ 45 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 46 vj++; xj = t + (*vj)*bs; 47 v += bs2; 48 } 49 /* xk = inv(Dk)*(Dk*xk) */ 50 diag = aa+k*bs2; /* ptr to inv(Dk) */ 51 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 52 } 53 54 /* solve U*x = y by back substitution */ 55 for (k=mbs-1; k>=0; k--) { 56 v = aa + bs2*ai[k]; 57 xk = t + k*bs; /* xk */ 58 nz = ai[k+1] - ai[k]; 59 vj = aj + ai[k]; 60 xj = t + (*vj)*bs; 61 while (nz--) { 62 /* xk += U(k,:)*x(:) */ 63 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 64 vj++; 65 v += bs2; xj = t + (*vj)*bs; 66 } 67 idx = bs*r[k]; 68 for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++; 69 } 70 71 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 72 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 73 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 74 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 75 ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr); 76 PetscFunctionReturn(0); 77 } 78 79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 80 { 81 PetscFunctionBegin; 82 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet"); 83 } 84 85 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 86 { 87 PetscFunctionBegin; 88 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented"); 89 } 90 91 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 92 { 93 PetscErrorCode ierr; 94 PetscInt nz,k; 95 const PetscInt *vj,bs2 = bs*bs; 96 const MatScalar *v,*diag; 97 PetscScalar *xk,*xj,*xk_tmp; 98 99 PetscFunctionBegin; 100 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 101 for (k=0; k<mbs; k++) { 102 v = aa + bs2*ai[k]; 103 xk = x + k*bs; /* Dk*xk = k-th block of x */ 104 ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */ 105 nz = ai[k+1] - ai[k]; 106 vj = aj + ai[k]; 107 xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */ 108 while (nz--) { 109 /* x(:) += U(k,:)^T*(Dk*xk) */ 110 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 111 vj++; xj = x + (*vj)*bs; 112 v += bs2; 113 } 114 /* xk = inv(Dk)*(Dk*xk) */ 115 diag = aa+k*bs2; /* ptr to inv(Dk) */ 116 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 117 } 118 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 119 PetscFunctionReturn(0); 120 } 121 122 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 123 { 124 PetscInt nz,k; 125 const PetscInt *vj,bs2 = bs*bs; 126 const MatScalar *v; 127 PetscScalar *xk,*xj; 128 129 PetscFunctionBegin; 130 for (k=mbs-1; k>=0; k--) { 131 v = aa + bs2*ai[k]; 132 xk = x + k*bs; /* xk */ 133 nz = ai[k+1] - ai[k]; 134 vj = aj + ai[k]; 135 xj = x + (*vj)*bs; 136 while (nz--) { 137 /* xk += U(k,:)*x(:) */ 138 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 139 vj++; 140 v += bs2; xj = x + (*vj)*bs; 141 } 142 } 143 PetscFunctionReturn(0); 144 } 145 146 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 147 { 148 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 149 PetscErrorCode ierr; 150 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 151 PetscInt bs =A->rmap->bs; 152 const MatScalar *aa=a->a; 153 PetscScalar *x; 154 const PetscScalar *b; 155 156 PetscFunctionBegin; 157 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 158 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 159 160 /* solve U^T * D * y = b by forward substitution */ 161 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */ 162 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 163 164 /* solve U*x = y by back substitution */ 165 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 166 167 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 168 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 169 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 170 PetscFunctionReturn(0); 171 } 172 173 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 174 { 175 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 176 PetscErrorCode ierr; 177 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 178 PetscInt bs =A->rmap->bs; 179 const MatScalar *aa=a->a; 180 const PetscScalar *b; 181 PetscScalar *x; 182 183 PetscFunctionBegin; 184 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 185 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 186 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */ 187 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 188 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 189 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 190 ierr = PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);CHKERRQ(ierr); 191 PetscFunctionReturn(0); 192 } 193 194 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 195 { 196 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 197 PetscErrorCode ierr; 198 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 199 PetscInt bs =A->rmap->bs; 200 const MatScalar *aa=a->a; 201 const PetscScalar *b; 202 PetscScalar *x; 203 204 PetscFunctionBegin; 205 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 206 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 207 ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); 208 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 209 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 210 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 211 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 212 PetscFunctionReturn(0); 213 } 214 215 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 216 { 217 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 218 IS isrow=a->row; 219 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 220 PetscErrorCode ierr; 221 PetscInt nz,k,idx; 222 const MatScalar *aa=a->a,*v,*d; 223 PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp; 224 const PetscScalar *b; 225 226 PetscFunctionBegin; 227 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 228 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 229 t = a->solve_work; 230 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 231 232 /* solve U^T * D * y = b by forward substitution */ 233 tp = t; 234 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 235 idx = 7*r[k]; 236 tp[0] = b[idx]; 237 tp[1] = b[idx+1]; 238 tp[2] = b[idx+2]; 239 tp[3] = b[idx+3]; 240 tp[4] = b[idx+4]; 241 tp[5] = b[idx+5]; 242 tp[6] = b[idx+6]; 243 tp += 7; 244 } 245 246 for (k=0; k<mbs; k++) { 247 v = aa + 49*ai[k]; 248 vj = aj + ai[k]; 249 tp = t + k*7; 250 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; 251 nz = ai[k+1] - ai[k]; 252 tp = t + (*vj)*7; 253 while (nz--) { 254 tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 255 tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 256 tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 257 tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 258 tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 259 tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 260 tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 261 vj++; 262 tp = t + (*vj)*7; 263 v += 49; 264 } 265 266 /* xk = inv(Dk)*(Dk*xk) */ 267 d = aa+k*49; /* ptr to inv(Dk) */ 268 tp = t + k*7; 269 tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 270 tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 271 tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 272 tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 273 tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 274 tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 275 tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 276 } 277 278 /* solve U*x = y by back substitution */ 279 for (k=mbs-1; k>=0; k--) { 280 v = aa + 49*ai[k]; 281 vj = aj + ai[k]; 282 tp = t + k*7; 283 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */ 284 nz = ai[k+1] - ai[k]; 285 286 tp = t + (*vj)*7; 287 while (nz--) { 288 /* xk += U(k,:)*x(:) */ 289 x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6]; 290 x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6]; 291 x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6]; 292 x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6]; 293 x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6]; 294 x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6]; 295 x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6]; 296 vj++; 297 tp = t + (*vj)*7; 298 v += 49; 299 } 300 tp = t + k*7; 301 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6; 302 idx = 7*r[k]; 303 x[idx] = x0; 304 x[idx+1] = x1; 305 x[idx+2] = x2; 306 x[idx+3] = x3; 307 x[idx+4] = x4; 308 x[idx+5] = x5; 309 x[idx+6] = x6; 310 } 311 312 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 313 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 314 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 315 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 316 PetscFunctionReturn(0); 317 } 318 319 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 320 { 321 const MatScalar *v,*d; 322 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 323 PetscInt nz,k; 324 const PetscInt *vj; 325 326 PetscFunctionBegin; 327 for (k=0; k<mbs; k++) { 328 v = aa + 49*ai[k]; 329 xp = x + k*7; 330 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */ 331 nz = ai[k+1] - ai[k]; 332 vj = aj + ai[k]; 333 xp = x + (*vj)*7; 334 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 335 PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 336 while (nz--) { 337 /* x(:) += U(k,:)^T*(Dk*xk) */ 338 xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 339 xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 340 xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 341 xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 342 xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 343 xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 344 xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 345 vj++; 346 xp = x + (*vj)*7; 347 v += 49; 348 } 349 /* xk = inv(Dk)*(Dk*xk) */ 350 d = aa+k*49; /* ptr to inv(Dk) */ 351 xp = x + k*7; 352 xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 353 xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 354 xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 355 xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 356 xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 357 xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 358 xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 359 } 360 PetscFunctionReturn(0); 361 } 362 363 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 364 { 365 const MatScalar *v; 366 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 367 PetscInt nz,k; 368 const PetscInt *vj; 369 370 PetscFunctionBegin; 371 for (k=mbs-1; k>=0; k--) { 372 v = aa + 49*ai[k]; 373 xp = x + k*7; 374 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */ 375 nz = ai[k+1] - ai[k]; 376 vj = aj + ai[k]; 377 xp = x + (*vj)*7; 378 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 379 PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 380 while (nz--) { 381 /* xk += U(k,:)*x(:) */ 382 x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6]; 383 x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6]; 384 x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6]; 385 x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6]; 386 x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6]; 387 x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6]; 388 x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6]; 389 vj++; 390 v += 49; xp = x + (*vj)*7; 391 } 392 xp = x + k*7; 393 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6; 394 } 395 PetscFunctionReturn(0); 396 } 397 398 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 399 { 400 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 401 PetscErrorCode ierr; 402 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 403 const MatScalar *aa=a->a; 404 PetscScalar *x; 405 const PetscScalar *b; 406 407 PetscFunctionBegin; 408 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 409 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 410 411 /* solve U^T * D * y = b by forward substitution */ 412 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); /* x <- b */ 413 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 414 415 /* solve U*x = y by back substitution */ 416 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 417 418 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 419 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 420 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 421 PetscFunctionReturn(0); 422 } 423 424 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 425 { 426 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 427 PetscErrorCode ierr; 428 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 429 const MatScalar *aa=a->a; 430 PetscScalar *x; 431 const PetscScalar *b; 432 433 PetscFunctionBegin; 434 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 435 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 436 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); 437 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 438 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 439 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 440 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 441 PetscFunctionReturn(0); 442 } 443 444 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 445 { 446 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 447 PetscErrorCode ierr; 448 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 449 const MatScalar *aa=a->a; 450 PetscScalar *x; 451 const PetscScalar *b; 452 453 PetscFunctionBegin; 454 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 455 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 456 ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); 457 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 458 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 459 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 460 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 461 PetscFunctionReturn(0); 462 } 463 464 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 465 { 466 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 467 IS isrow=a->row; 468 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 469 PetscErrorCode ierr; 470 PetscInt nz,k,idx; 471 const MatScalar *aa=a->a,*v,*d; 472 PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp; 473 const PetscScalar *b; 474 475 PetscFunctionBegin; 476 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 477 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 478 t = a->solve_work; 479 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 480 481 /* solve U^T * D * y = b by forward substitution */ 482 tp = t; 483 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 484 idx = 6*r[k]; 485 tp[0] = b[idx]; 486 tp[1] = b[idx+1]; 487 tp[2] = b[idx+2]; 488 tp[3] = b[idx+3]; 489 tp[4] = b[idx+4]; 490 tp[5] = b[idx+5]; 491 tp += 6; 492 } 493 494 for (k=0; k<mbs; k++) { 495 v = aa + 36*ai[k]; 496 vj = aj + ai[k]; 497 tp = t + k*6; 498 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; 499 nz = ai[k+1] - ai[k]; 500 tp = t + (*vj)*6; 501 while (nz--) { 502 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 503 tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 504 tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 505 tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 506 tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 507 tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 508 vj++; 509 tp = t + (*vj)*6; 510 v += 36; 511 } 512 513 /* xk = inv(Dk)*(Dk*xk) */ 514 d = aa+k*36; /* ptr to inv(Dk) */ 515 tp = t + k*6; 516 tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 517 tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 518 tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 519 tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 520 tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 521 tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 522 } 523 524 /* solve U*x = y by back substitution */ 525 for (k=mbs-1; k>=0; k--) { 526 v = aa + 36*ai[k]; 527 vj = aj + ai[k]; 528 tp = t + k*6; 529 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */ 530 nz = ai[k+1] - ai[k]; 531 532 tp = t + (*vj)*6; 533 while (nz--) { 534 /* xk += U(k,:)*x(:) */ 535 x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5]; 536 x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5]; 537 x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5]; 538 x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5]; 539 x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5]; 540 x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5]; 541 vj++; 542 tp = t + (*vj)*6; 543 v += 36; 544 } 545 tp = t + k*6; 546 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; 547 idx = 6*r[k]; 548 x[idx] = x0; 549 x[idx+1] = x1; 550 x[idx+2] = x2; 551 x[idx+3] = x3; 552 x[idx+4] = x4; 553 x[idx+5] = x5; 554 } 555 556 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 557 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 558 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 559 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 560 PetscFunctionReturn(0); 561 } 562 563 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 564 { 565 const MatScalar *v,*d; 566 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 567 PetscInt nz,k; 568 const PetscInt *vj; 569 570 PetscFunctionBegin; 571 for (k=0; k<mbs; k++) { 572 v = aa + 36*ai[k]; 573 xp = x + k*6; 574 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */ 575 nz = ai[k+1] - ai[k]; 576 vj = aj + ai[k]; 577 xp = x + (*vj)*6; 578 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 579 PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 580 while (nz--) { 581 /* x(:) += U(k,:)^T*(Dk*xk) */ 582 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 583 xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 584 xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 585 xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 586 xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 587 xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 588 vj++; 589 xp = x + (*vj)*6; 590 v += 36; 591 } 592 /* xk = inv(Dk)*(Dk*xk) */ 593 d = aa+k*36; /* ptr to inv(Dk) */ 594 xp = x + k*6; 595 xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 596 xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 597 xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 598 xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 599 xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 600 xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 601 } 602 PetscFunctionReturn(0); 603 } 604 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 605 { 606 const MatScalar *v; 607 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 608 PetscInt nz,k; 609 const PetscInt *vj; 610 611 PetscFunctionBegin; 612 for (k=mbs-1; k>=0; k--) { 613 v = aa + 36*ai[k]; 614 xp = x + k*6; 615 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */ 616 nz = ai[k+1] - ai[k]; 617 vj = aj + ai[k]; 618 xp = x + (*vj)*6; 619 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 620 PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 621 while (nz--) { 622 /* xk += U(k,:)*x(:) */ 623 x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5]; 624 x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5]; 625 x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5]; 626 x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5]; 627 x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5]; 628 x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5]; 629 vj++; 630 v += 36; xp = x + (*vj)*6; 631 } 632 xp = x + k*6; 633 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; 634 } 635 PetscFunctionReturn(0); 636 } 637 638 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 639 { 640 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 641 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 642 const MatScalar *aa=a->a; 643 PetscScalar *x; 644 const PetscScalar *b; 645 PetscErrorCode ierr; 646 647 PetscFunctionBegin; 648 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 649 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 650 651 /* solve U^T * D * y = b by forward substitution */ 652 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 653 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 654 655 /* solve U*x = y by back substitution */ 656 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 657 658 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 659 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 660 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 661 PetscFunctionReturn(0); 662 } 663 664 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 665 { 666 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 667 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 668 const MatScalar *aa=a->a; 669 PetscScalar *x; 670 const PetscScalar *b; 671 PetscErrorCode ierr; 672 673 PetscFunctionBegin; 674 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 675 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 676 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 677 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 678 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 679 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 680 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 681 PetscFunctionReturn(0); 682 } 683 684 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 685 { 686 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 687 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 688 const MatScalar *aa=a->a; 689 PetscScalar *x; 690 const PetscScalar *b; 691 PetscErrorCode ierr; 692 693 PetscFunctionBegin; 694 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 695 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 696 ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */ 697 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 698 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 699 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 700 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 701 PetscFunctionReturn(0); 702 } 703 704 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 705 { 706 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 707 IS isrow=a->row; 708 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 709 PetscErrorCode ierr; 710 const PetscInt *r,*vj; 711 PetscInt nz,k,idx; 712 const MatScalar *aa=a->a,*v,*diag; 713 PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp; 714 const PetscScalar *b; 715 716 PetscFunctionBegin; 717 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 718 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719 t = a->solve_work; 720 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 721 722 /* solve U^T * D * y = b by forward substitution */ 723 tp = t; 724 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 725 idx = 5*r[k]; 726 tp[0] = b[idx]; 727 tp[1] = b[idx+1]; 728 tp[2] = b[idx+2]; 729 tp[3] = b[idx+3]; 730 tp[4] = b[idx+4]; 731 tp += 5; 732 } 733 734 for (k=0; k<mbs; k++) { 735 v = aa + 25*ai[k]; 736 vj = aj + ai[k]; 737 tp = t + k*5; 738 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; 739 nz = ai[k+1] - ai[k]; 740 741 tp = t + (*vj)*5; 742 while (nz--) { 743 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 744 tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 745 tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4; 746 tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4; 747 tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4; 748 vj++; 749 tp = t + (*vj)*5; 750 v += 25; 751 } 752 753 /* xk = inv(Dk)*(Dk*xk) */ 754 diag = aa+k*25; /* ptr to inv(Dk) */ 755 tp = t + k*5; 756 tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 757 tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 758 tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 759 tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 760 tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 761 } 762 763 /* solve U*x = y by back substitution */ 764 for (k=mbs-1; k>=0; k--) { 765 v = aa + 25*ai[k]; 766 vj = aj + ai[k]; 767 tp = t + k*5; 768 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */ 769 nz = ai[k+1] - ai[k]; 770 771 tp = t + (*vj)*5; 772 while (nz--) { 773 /* xk += U(k,:)*x(:) */ 774 x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4]; 775 x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4]; 776 x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4]; 777 x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4]; 778 x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4]; 779 vj++; 780 tp = t + (*vj)*5; 781 v += 25; 782 } 783 tp = t + k*5; 784 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; 785 idx = 5*r[k]; 786 x[idx] = x0; 787 x[idx+1] = x1; 788 x[idx+2] = x2; 789 x[idx+3] = x3; 790 x[idx+4] = x4; 791 } 792 793 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 794 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 795 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 796 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 797 PetscFunctionReturn(0); 798 } 799 800 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 801 { 802 const MatScalar *v,*diag; 803 PetscScalar *xp,x0,x1,x2,x3,x4; 804 PetscInt nz,k; 805 const PetscInt *vj; 806 807 PetscFunctionBegin; 808 for (k=0; k<mbs; k++) { 809 v = aa + 25*ai[k]; 810 xp = x + k*5; 811 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */ 812 nz = ai[k+1] - ai[k]; 813 vj = aj + ai[k]; 814 xp = x + (*vj)*5; 815 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 816 PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 817 while (nz--) { 818 /* x(:) += U(k,:)^T*(Dk*xk) */ 819 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 820 xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 821 xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4; 822 xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4; 823 xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4; 824 vj++; 825 xp = x + (*vj)*5; 826 v += 25; 827 } 828 /* xk = inv(Dk)*(Dk*xk) */ 829 diag = aa+k*25; /* ptr to inv(Dk) */ 830 xp = x + k*5; 831 xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 832 xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 833 xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 834 xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 835 xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 836 } 837 PetscFunctionReturn(0); 838 } 839 840 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 841 { 842 const MatScalar *v; 843 PetscScalar *xp,x0,x1,x2,x3,x4; 844 PetscInt nz,k; 845 const PetscInt *vj; 846 847 PetscFunctionBegin; 848 for (k=mbs-1; k>=0; k--) { 849 v = aa + 25*ai[k]; 850 xp = x + k*5; 851 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */ 852 nz = ai[k+1] - ai[k]; 853 vj = aj + ai[k]; 854 xp = x + (*vj)*5; 855 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 856 PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 857 while (nz--) { 858 /* xk += U(k,:)*x(:) */ 859 x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4]; 860 x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4]; 861 x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4]; 862 x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4]; 863 x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4]; 864 vj++; 865 v += 25; xp = x + (*vj)*5; 866 } 867 xp = x + k*5; 868 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; 869 } 870 PetscFunctionReturn(0); 871 } 872 873 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 874 { 875 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 876 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 877 const MatScalar *aa=a->a; 878 PetscScalar *x; 879 const PetscScalar *b; 880 PetscErrorCode ierr; 881 882 PetscFunctionBegin; 883 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 884 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 885 886 /* solve U^T * D * y = b by forward substitution */ 887 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */ 888 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 889 890 /* solve U*x = y by back substitution */ 891 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 892 893 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 894 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 895 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 896 PetscFunctionReturn(0); 897 } 898 899 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 900 { 901 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 902 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 903 const MatScalar *aa=a->a; 904 PetscScalar *x; 905 const PetscScalar *b; 906 PetscErrorCode ierr; 907 908 PetscFunctionBegin; 909 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 910 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 911 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */ 912 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 913 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 914 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 915 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 916 PetscFunctionReturn(0); 917 } 918 919 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 920 { 921 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 922 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 923 const MatScalar *aa=a->a; 924 PetscScalar *x; 925 const PetscScalar *b; 926 PetscErrorCode ierr; 927 928 PetscFunctionBegin; 929 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 930 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 931 ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); 932 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 933 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 934 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 935 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 936 PetscFunctionReturn(0); 937 } 938 939 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 940 { 941 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 942 IS isrow=a->row; 943 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 944 PetscErrorCode ierr; 945 const PetscInt *r,*vj; 946 PetscInt nz,k,idx; 947 const MatScalar *aa=a->a,*v,*diag; 948 PetscScalar *x,x0,x1,x2,x3,*t,*tp; 949 const PetscScalar *b; 950 951 PetscFunctionBegin; 952 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 953 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 954 t = a->solve_work; 955 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 956 957 /* solve U^T * D * y = b by forward substitution */ 958 tp = t; 959 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 960 idx = 4*r[k]; 961 tp[0] = b[idx]; 962 tp[1] = b[idx+1]; 963 tp[2] = b[idx+2]; 964 tp[3] = b[idx+3]; 965 tp += 4; 966 } 967 968 for (k=0; k<mbs; k++) { 969 v = aa + 16*ai[k]; 970 vj = aj + ai[k]; 971 tp = t + k*4; 972 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; 973 nz = ai[k+1] - ai[k]; 974 975 tp = t + (*vj)*4; 976 while (nz--) { 977 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 978 tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 979 tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 980 tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 981 vj++; 982 tp = t + (*vj)*4; 983 v += 16; 984 } 985 986 /* xk = inv(Dk)*(Dk*xk) */ 987 diag = aa+k*16; /* ptr to inv(Dk) */ 988 tp = t + k*4; 989 tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 990 tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 991 tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 992 tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 993 } 994 995 /* solve U*x = y by back substitution */ 996 for (k=mbs-1; k>=0; k--) { 997 v = aa + 16*ai[k]; 998 vj = aj + ai[k]; 999 tp = t + k*4; 1000 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */ 1001 nz = ai[k+1] - ai[k]; 1002 1003 tp = t + (*vj)*4; 1004 while (nz--) { 1005 /* xk += U(k,:)*x(:) */ 1006 x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3]; 1007 x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3]; 1008 x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3]; 1009 x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3]; 1010 vj++; 1011 tp = t + (*vj)*4; 1012 v += 16; 1013 } 1014 tp = t + k*4; 1015 tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; 1016 idx = 4*r[k]; 1017 x[idx] = x0; 1018 x[idx+1] = x1; 1019 x[idx+2] = x2; 1020 x[idx+3] = x3; 1021 } 1022 1023 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1024 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1025 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1026 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1027 PetscFunctionReturn(0); 1028 } 1029 1030 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1031 { 1032 const MatScalar *v,*diag; 1033 PetscScalar *xp,x0,x1,x2,x3; 1034 PetscInt nz,k; 1035 const PetscInt *vj; 1036 1037 PetscFunctionBegin; 1038 for (k=0; k<mbs; k++) { 1039 v = aa + 16*ai[k]; 1040 xp = x + k*4; 1041 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */ 1042 nz = ai[k+1] - ai[k]; 1043 vj = aj + ai[k]; 1044 xp = x + (*vj)*4; 1045 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1046 PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1047 while (nz--) { 1048 /* x(:) += U(k,:)^T*(Dk*xk) */ 1049 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1050 xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1051 xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1052 xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1053 vj++; 1054 xp = x + (*vj)*4; 1055 v += 16; 1056 } 1057 /* xk = inv(Dk)*(Dk*xk) */ 1058 diag = aa+k*16; /* ptr to inv(Dk) */ 1059 xp = x + k*4; 1060 xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1061 xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1062 xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1063 xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1064 } 1065 PetscFunctionReturn(0); 1066 } 1067 1068 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1069 { 1070 const MatScalar *v; 1071 PetscScalar *xp,x0,x1,x2,x3; 1072 PetscInt nz,k; 1073 const PetscInt *vj; 1074 1075 PetscFunctionBegin; 1076 for (k=mbs-1; k>=0; k--) { 1077 v = aa + 16*ai[k]; 1078 xp = x + k*4; 1079 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */ 1080 nz = ai[k+1] - ai[k]; 1081 vj = aj + ai[k]; 1082 xp = x + (*vj)*4; 1083 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1084 PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1085 while (nz--) { 1086 /* xk += U(k,:)*x(:) */ 1087 x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3]; 1088 x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3]; 1089 x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3]; 1090 x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3]; 1091 vj++; 1092 v += 16; xp = x + (*vj)*4; 1093 } 1094 xp = x + k*4; 1095 xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3; 1096 } 1097 PetscFunctionReturn(0); 1098 } 1099 1100 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1101 { 1102 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1103 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1104 const MatScalar *aa=a->a; 1105 PetscScalar *x; 1106 const PetscScalar *b; 1107 PetscErrorCode ierr; 1108 1109 PetscFunctionBegin; 1110 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1111 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1112 1113 /* solve U^T * D * y = b by forward substitution */ 1114 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */ 1115 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1116 1117 /* solve U*x = y by back substitution */ 1118 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1119 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1120 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1121 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1122 PetscFunctionReturn(0); 1123 } 1124 1125 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1126 { 1127 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1128 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1129 const MatScalar *aa=a->a; 1130 PetscScalar *x; 1131 const PetscScalar *b; 1132 PetscErrorCode ierr; 1133 1134 PetscFunctionBegin; 1135 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1136 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1137 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */ 1138 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1139 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1140 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1141 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1142 PetscFunctionReturn(0); 1143 } 1144 1145 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1146 { 1147 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1148 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1149 const MatScalar *aa=a->a; 1150 PetscScalar *x; 1151 const PetscScalar *b; 1152 PetscErrorCode ierr; 1153 1154 PetscFunctionBegin; 1155 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1156 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1157 ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); 1158 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1159 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1160 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1161 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1162 PetscFunctionReturn(0); 1163 } 1164 1165 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1166 { 1167 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1168 IS isrow=a->row; 1169 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1170 PetscErrorCode ierr; 1171 const PetscInt *r; 1172 PetscInt nz,k,idx; 1173 const PetscInt *vj; 1174 const MatScalar *aa=a->a,*v,*diag; 1175 PetscScalar *x,x0,x1,x2,*t,*tp; 1176 const PetscScalar *b; 1177 1178 PetscFunctionBegin; 1179 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1180 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1181 t = a->solve_work; 1182 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1183 1184 /* solve U^T * D * y = b by forward substitution */ 1185 tp = t; 1186 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1187 idx = 3*r[k]; 1188 tp[0] = b[idx]; 1189 tp[1] = b[idx+1]; 1190 tp[2] = b[idx+2]; 1191 tp += 3; 1192 } 1193 1194 for (k=0; k<mbs; k++) { 1195 v = aa + 9*ai[k]; 1196 vj = aj + ai[k]; 1197 tp = t + k*3; 1198 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; 1199 nz = ai[k+1] - ai[k]; 1200 1201 tp = t + (*vj)*3; 1202 while (nz--) { 1203 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1204 tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1205 tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1206 vj++; 1207 tp = t + (*vj)*3; 1208 v += 9; 1209 } 1210 1211 /* xk = inv(Dk)*(Dk*xk) */ 1212 diag = aa+k*9; /* ptr to inv(Dk) */ 1213 tp = t + k*3; 1214 tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1215 tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1216 tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1217 } 1218 1219 /* solve U*x = y by back substitution */ 1220 for (k=mbs-1; k>=0; k--) { 1221 v = aa + 9*ai[k]; 1222 vj = aj + ai[k]; 1223 tp = t + k*3; 1224 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */ 1225 nz = ai[k+1] - ai[k]; 1226 1227 tp = t + (*vj)*3; 1228 while (nz--) { 1229 /* xk += U(k,:)*x(:) */ 1230 x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2]; 1231 x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2]; 1232 x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2]; 1233 vj++; 1234 tp = t + (*vj)*3; 1235 v += 9; 1236 } 1237 tp = t + k*3; 1238 tp[0] = x0; tp[1] = x1; tp[2] = x2; 1239 idx = 3*r[k]; 1240 x[idx] = x0; 1241 x[idx+1] = x1; 1242 x[idx+2] = x2; 1243 } 1244 1245 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1246 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1247 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1248 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1249 PetscFunctionReturn(0); 1250 } 1251 1252 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1253 { 1254 const MatScalar *v,*diag; 1255 PetscScalar *xp,x0,x1,x2; 1256 PetscInt nz,k; 1257 const PetscInt *vj; 1258 1259 PetscFunctionBegin; 1260 for (k=0; k<mbs; k++) { 1261 v = aa + 9*ai[k]; 1262 xp = x + k*3; 1263 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */ 1264 nz = ai[k+1] - ai[k]; 1265 vj = aj + ai[k]; 1266 xp = x + (*vj)*3; 1267 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1268 PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1269 while (nz--) { 1270 /* x(:) += U(k,:)^T*(Dk*xk) */ 1271 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1272 xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1273 xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1274 vj++; 1275 xp = x + (*vj)*3; 1276 v += 9; 1277 } 1278 /* xk = inv(Dk)*(Dk*xk) */ 1279 diag = aa+k*9; /* ptr to inv(Dk) */ 1280 xp = x + k*3; 1281 xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1282 xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1283 xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1284 } 1285 PetscFunctionReturn(0); 1286 } 1287 1288 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1289 { 1290 const MatScalar *v; 1291 PetscScalar *xp,x0,x1,x2; 1292 PetscInt nz,k; 1293 const PetscInt *vj; 1294 1295 PetscFunctionBegin; 1296 for (k=mbs-1; k>=0; k--) { 1297 v = aa + 9*ai[k]; 1298 xp = x + k*3; 1299 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */ 1300 nz = ai[k+1] - ai[k]; 1301 vj = aj + ai[k]; 1302 xp = x + (*vj)*3; 1303 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1304 PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1305 while (nz--) { 1306 /* xk += U(k,:)*x(:) */ 1307 x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2]; 1308 x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2]; 1309 x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2]; 1310 vj++; 1311 v += 9; xp = x + (*vj)*3; 1312 } 1313 xp = x + k*3; 1314 xp[0] = x0; xp[1] = x1; xp[2] = x2; 1315 } 1316 PetscFunctionReturn(0); 1317 } 1318 1319 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1320 { 1321 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1322 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1323 const MatScalar *aa=a->a; 1324 PetscScalar *x; 1325 const PetscScalar *b; 1326 PetscErrorCode ierr; 1327 1328 PetscFunctionBegin; 1329 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1330 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1331 1332 /* solve U^T * D * y = b by forward substitution */ 1333 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1334 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1335 1336 /* solve U*x = y by back substitution */ 1337 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1338 1339 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1340 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1341 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1342 PetscFunctionReturn(0); 1343 } 1344 1345 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1346 { 1347 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1348 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1349 const MatScalar *aa=a->a; 1350 PetscScalar *x; 1351 const PetscScalar *b; 1352 PetscErrorCode ierr; 1353 1354 PetscFunctionBegin; 1355 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1356 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1357 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1358 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1359 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1361 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1362 PetscFunctionReturn(0); 1363 } 1364 1365 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1366 { 1367 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1368 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1369 const MatScalar *aa=a->a; 1370 PetscScalar *x; 1371 const PetscScalar *b; 1372 PetscErrorCode ierr; 1373 1374 PetscFunctionBegin; 1375 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1376 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1377 ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr); 1378 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1379 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1380 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1381 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1382 PetscFunctionReturn(0); 1383 } 1384 1385 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1386 { 1387 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1388 IS isrow=a->row; 1389 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1390 PetscErrorCode ierr; 1391 const PetscInt *r,*vj; 1392 PetscInt nz,k,k2,idx; 1393 const MatScalar *aa=a->a,*v,*diag; 1394 PetscScalar *x,x0,x1,*t; 1395 const PetscScalar *b; 1396 1397 PetscFunctionBegin; 1398 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1399 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1400 t = a->solve_work; 1401 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1402 1403 /* solve U^T * D * y = perm(b) by forward substitution */ 1404 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1405 idx = 2*r[k]; 1406 t[k*2] = b[idx]; 1407 t[k*2+1] = b[idx+1]; 1408 } 1409 for (k=0; k<mbs; k++) { 1410 v = aa + 4*ai[k]; 1411 vj = aj + ai[k]; 1412 k2 = k*2; 1413 x0 = t[k2]; x1 = t[k2+1]; 1414 nz = ai[k+1] - ai[k]; 1415 while (nz--) { 1416 t[(*vj)*2] += v[0]*x0 + v[1]*x1; 1417 t[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1418 vj++; v += 4; 1419 } 1420 diag = aa+k*4; /* ptr to inv(Dk) */ 1421 t[k2] = diag[0]*x0 + diag[2]*x1; 1422 t[k2+1] = diag[1]*x0 + diag[3]*x1; 1423 } 1424 1425 /* solve U*x = y by back substitution */ 1426 for (k=mbs-1; k>=0; k--) { 1427 v = aa + 4*ai[k]; 1428 vj = aj + ai[k]; 1429 k2 = k*2; 1430 x0 = t[k2]; x1 = t[k2+1]; 1431 nz = ai[k+1] - ai[k]; 1432 while (nz--) { 1433 x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1]; 1434 x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1]; 1435 vj++; 1436 v += 4; 1437 } 1438 t[k2] = x0; 1439 t[k2+1] = x1; 1440 idx = 2*r[k]; 1441 x[idx] = x0; 1442 x[idx+1] = x1; 1443 } 1444 1445 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1446 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1447 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1448 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1449 PetscFunctionReturn(0); 1450 } 1451 1452 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1453 { 1454 const MatScalar *v,*diag; 1455 PetscScalar x0,x1; 1456 PetscInt nz,k,k2; 1457 const PetscInt *vj; 1458 1459 PetscFunctionBegin; 1460 for (k=0; k<mbs; k++) { 1461 v = aa + 4*ai[k]; 1462 vj = aj + ai[k]; 1463 k2 = k*2; 1464 x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */ 1465 nz = ai[k+1] - ai[k]; 1466 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1467 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1468 while (nz--) { 1469 /* x(:) += U(k,:)^T*(Dk*xk) */ 1470 x[(*vj)*2] += v[0]*x0 + v[1]*x1; 1471 x[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1472 vj++; v += 4; 1473 } 1474 /* xk = inv(Dk)*(Dk*xk) */ 1475 diag = aa+k*4; /* ptr to inv(Dk) */ 1476 x[k2] = diag[0]*x0 + diag[2]*x1; 1477 x[k2+1] = diag[1]*x0 + diag[3]*x1; 1478 } 1479 PetscFunctionReturn(0); 1480 } 1481 1482 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1483 { 1484 const MatScalar *v; 1485 PetscScalar x0,x1; 1486 PetscInt nz,k,k2; 1487 const PetscInt *vj; 1488 1489 PetscFunctionBegin; 1490 for (k=mbs-1; k>=0; k--) { 1491 v = aa + 4*ai[k]; 1492 vj = aj + ai[k]; 1493 k2 = k*2; 1494 x0 = x[k2]; x1 = x[k2+1]; /* xk */ 1495 nz = ai[k+1] - ai[k]; 1496 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1497 PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1498 while (nz--) { 1499 /* xk += U(k,:)*x(:) */ 1500 x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1]; 1501 x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1]; 1502 vj++; 1503 v += 4; 1504 } 1505 x[k2] = x0; 1506 x[k2+1] = x1; 1507 } 1508 PetscFunctionReturn(0); 1509 } 1510 1511 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1512 { 1513 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1514 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1515 const MatScalar *aa=a->a; 1516 PetscScalar *x; 1517 const PetscScalar *b; 1518 PetscErrorCode ierr; 1519 1520 PetscFunctionBegin; 1521 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1522 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1523 1524 /* solve U^T * D * y = b by forward substitution */ 1525 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1526 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1527 1528 /* solve U*x = y by back substitution */ 1529 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1530 1531 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1532 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1533 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1534 PetscFunctionReturn(0); 1535 } 1536 1537 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1538 { 1539 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1540 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1541 const MatScalar *aa=a->a; 1542 PetscScalar *x; 1543 const PetscScalar *b; 1544 PetscErrorCode ierr; 1545 1546 PetscFunctionBegin; 1547 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1548 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1549 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1550 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1551 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1552 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1553 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1554 PetscFunctionReturn(0); 1555 } 1556 1557 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1558 { 1559 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1560 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1561 const MatScalar *aa=a->a; 1562 PetscScalar *x; 1563 const PetscScalar *b; 1564 PetscErrorCode ierr; 1565 1566 PetscFunctionBegin; 1567 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1568 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1569 ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr); 1570 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1571 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1572 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1573 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 1574 PetscFunctionReturn(0); 1575 } 1576 1577 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1578 { 1579 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1580 IS isrow=a->row; 1581 PetscErrorCode ierr; 1582 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1583 const MatScalar *aa=a->a,*v; 1584 const PetscScalar *b; 1585 PetscScalar *x,xk,*t; 1586 PetscInt nz,k,j; 1587 1588 PetscFunctionBegin; 1589 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1590 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1591 t = a->solve_work; 1592 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1593 1594 /* solve U^T*D*y = perm(b) by forward substitution */ 1595 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1596 for (k=0; k<mbs; k++) { 1597 v = aa + ai[k]; 1598 vj = aj + ai[k]; 1599 xk = t[k]; 1600 nz = ai[k+1] - ai[k] - 1; 1601 for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk; 1602 t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */ 1603 } 1604 1605 /* solve U*perm(x) = y by back substitution */ 1606 for (k=mbs-1; k>=0; k--) { 1607 v = aa + adiag[k] - 1; 1608 vj = aj + adiag[k] - 1; 1609 nz = ai[k+1] - ai[k] - 1; 1610 for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]]; 1611 x[rp[k]] = t[k]; 1612 } 1613 1614 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1615 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1616 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1617 ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr); 1618 PetscFunctionReturn(0); 1619 } 1620 1621 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1622 { 1623 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1624 IS isrow=a->row; 1625 PetscErrorCode ierr; 1626 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1627 const MatScalar *aa=a->a,*v; 1628 PetscScalar *x,xk,*t; 1629 const PetscScalar *b; 1630 PetscInt nz,k; 1631 1632 PetscFunctionBegin; 1633 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1634 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1635 t = a->solve_work; 1636 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1637 1638 /* solve U^T*D*y = perm(b) by forward substitution */ 1639 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1640 for (k=0; k<mbs; k++) { 1641 v = aa + ai[k] + 1; 1642 vj = aj + ai[k] + 1; 1643 xk = t[k]; 1644 nz = ai[k+1] - ai[k] - 1; 1645 while (nz--) t[*vj++] += (*v++) * xk; 1646 t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */ 1647 } 1648 1649 /* solve U*perm(x) = y by back substitution */ 1650 for (k=mbs-1; k>=0; k--) { 1651 v = aa + ai[k] + 1; 1652 vj = aj + ai[k] + 1; 1653 nz = ai[k+1] - ai[k] - 1; 1654 while (nz--) t[k] += (*v++) * t[*vj++]; 1655 x[rp[k]] = t[k]; 1656 } 1657 1658 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1659 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1660 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1661 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1662 PetscFunctionReturn(0); 1663 } 1664 1665 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1666 { 1667 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1668 IS isrow=a->row; 1669 PetscErrorCode ierr; 1670 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1671 const MatScalar *aa=a->a,*v; 1672 PetscReal diagk; 1673 PetscScalar *x,xk; 1674 const PetscScalar *b; 1675 PetscInt nz,k; 1676 1677 PetscFunctionBegin; 1678 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1679 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1680 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1681 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1682 1683 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1684 for (k=0; k<mbs; k++) { 1685 v = aa + ai[k]; 1686 vj = aj + ai[k]; 1687 xk = x[k]; 1688 nz = ai[k+1] - ai[k] - 1; 1689 while (nz--) x[*vj++] += (*v++) * xk; 1690 1691 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1692 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1693 x[k] = xk*PetscSqrtReal(diagk); 1694 } 1695 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1696 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1697 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1698 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1699 PetscFunctionReturn(0); 1700 } 1701 1702 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1703 { 1704 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1705 IS isrow=a->row; 1706 PetscErrorCode ierr; 1707 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1708 const MatScalar *aa=a->a,*v; 1709 PetscReal diagk; 1710 PetscScalar *x,xk; 1711 const PetscScalar *b; 1712 PetscInt nz,k; 1713 1714 PetscFunctionBegin; 1715 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1716 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1717 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1718 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1719 1720 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1721 for (k=0; k<mbs; k++) { 1722 v = aa + ai[k] + 1; 1723 vj = aj + ai[k] + 1; 1724 xk = x[k]; 1725 nz = ai[k+1] - ai[k] - 1; 1726 while (nz--) x[*vj++] += (*v++) * xk; 1727 1728 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1729 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1730 x[k] = xk*PetscSqrtReal(diagk); 1731 } 1732 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1733 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1734 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1735 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1736 PetscFunctionReturn(0); 1737 } 1738 1739 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1740 { 1741 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1742 IS isrow=a->row; 1743 PetscErrorCode ierr; 1744 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag; 1745 const MatScalar *aa=a->a,*v; 1746 PetscReal diagk; 1747 PetscScalar *x,*t; 1748 const PetscScalar *b; 1749 PetscInt nz,k; 1750 1751 PetscFunctionBegin; 1752 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1753 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1754 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1755 t = a->solve_work; 1756 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1757 1758 for (k=mbs-1; k>=0; k--) { 1759 v = aa + ai[k]; 1760 vj = aj + ai[k]; 1761 diagk = PetscRealPart(aa[adiag[k]]); 1762 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1763 t[k] = b[k] * PetscSqrtReal(diagk); 1764 nz = ai[k+1] - ai[k] - 1; 1765 while (nz--) t[k] += (*v++) * t[*vj++]; 1766 x[rp[k]] = t[k]; 1767 } 1768 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1769 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1770 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1771 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1772 PetscFunctionReturn(0); 1773 } 1774 1775 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1776 { 1777 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1778 IS isrow=a->row; 1779 PetscErrorCode ierr; 1780 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1781 const MatScalar *aa=a->a,*v; 1782 PetscReal diagk; 1783 PetscScalar *x,*t; 1784 const PetscScalar *b; 1785 PetscInt nz,k; 1786 1787 PetscFunctionBegin; 1788 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1789 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1790 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1791 t = a->solve_work; 1792 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1793 1794 for (k=mbs-1; k>=0; k--) { 1795 v = aa + ai[k] + 1; 1796 vj = aj + ai[k] + 1; 1797 diagk = PetscRealPart(aa[ai[k]]); 1798 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1799 t[k] = b[k] * PetscSqrtReal(diagk); 1800 nz = ai[k+1] - ai[k] - 1; 1801 while (nz--) t[k] += (*v++) * t[*vj++]; 1802 x[rp[k]] = t[k]; 1803 } 1804 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1805 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1806 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1807 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1808 PetscFunctionReturn(0); 1809 } 1810 1811 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx) 1812 { 1813 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1814 PetscErrorCode ierr; 1815 1816 PetscFunctionBegin; 1817 if (A->rmap->bs == 1) { 1818 ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr); 1819 } else { 1820 IS isrow=a->row; 1821 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1822 const MatScalar *aa=a->a,*v; 1823 PetscScalar *x,*t; 1824 const PetscScalar *b; 1825 PetscInt nz,k,n,i,j; 1826 1827 if (bb->n > a->solves_work_n) { 1828 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1829 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1830 a->solves_work_n = bb->n; 1831 } 1832 n = bb->n; 1833 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1834 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1835 t = a->solves_work; 1836 1837 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1838 1839 /* solve U^T*D*y = perm(b) by forward substitution */ 1840 for (k=0; k<mbs; k++) { 1841 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1842 } 1843 for (k=0; k<mbs; k++) { 1844 v = aa + ai[k]; 1845 vj = aj + ai[k]; 1846 nz = ai[k+1] - ai[k] - 1; 1847 for (j=0; j<nz; j++) { 1848 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1849 v++;vj++; 1850 } 1851 for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1852 } 1853 1854 /* solve U*perm(x) = y by back substitution */ 1855 for (k=mbs-1; k>=0; k--) { 1856 v = aa + ai[k] - 1; 1857 vj = aj + ai[k] - 1; 1858 nz = ai[k+1] - ai[k] - 1; 1859 for (j=0; j<nz; j++) { 1860 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1861 v++;vj++; 1862 } 1863 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1864 } 1865 1866 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1867 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1868 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1869 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1870 } 1871 PetscFunctionReturn(0); 1872 } 1873 1874 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx) 1875 { 1876 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1877 PetscErrorCode ierr; 1878 1879 PetscFunctionBegin; 1880 if (A->rmap->bs == 1) { 1881 ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr); 1882 } else { 1883 IS isrow=a->row; 1884 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1885 const MatScalar *aa=a->a,*v; 1886 PetscScalar *x,*t; 1887 const PetscScalar *b; 1888 PetscInt nz,k,n,i; 1889 1890 if (bb->n > a->solves_work_n) { 1891 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1892 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1893 a->solves_work_n = bb->n; 1894 } 1895 n = bb->n; 1896 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1897 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1898 t = a->solves_work; 1899 1900 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1901 1902 /* solve U^T*D*y = perm(b) by forward substitution */ 1903 for (k=0; k<mbs; k++) { 1904 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1905 } 1906 for (k=0; k<mbs; k++) { 1907 v = aa + ai[k]; 1908 vj = aj + ai[k]; 1909 nz = ai[k+1] - ai[k]; 1910 while (nz--) { 1911 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1912 v++;vj++; 1913 } 1914 for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1915 } 1916 1917 /* solve U*perm(x) = y by back substitution */ 1918 for (k=mbs-1; k>=0; k--) { 1919 v = aa + ai[k]; 1920 vj = aj + ai[k]; 1921 nz = ai[k+1] - ai[k]; 1922 while (nz--) { 1923 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1924 v++;vj++; 1925 } 1926 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1927 } 1928 1929 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1930 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1931 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1932 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1933 } 1934 PetscFunctionReturn(0); 1935 } 1936 1937 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1938 { 1939 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1940 PetscErrorCode ierr; 1941 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1942 const MatScalar *aa=a->a,*v; 1943 const PetscScalar *b; 1944 PetscScalar *x,xi; 1945 PetscInt nz,i,j; 1946 1947 PetscFunctionBegin; 1948 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1949 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1950 /* solve U^T*D*y = b by forward substitution */ 1951 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 1952 for (i=0; i<mbs; i++) { 1953 v = aa + ai[i]; 1954 vj = aj + ai[i]; 1955 xi = x[i]; 1956 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 1957 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 1958 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 1959 } 1960 /* solve U*x = y by backward substitution */ 1961 for (i=mbs-2; i>=0; i--) { 1962 xi = x[i]; 1963 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 1964 vj = aj + adiag[i] - 1; 1965 nz = ai[i+1] - ai[i] - 1; 1966 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 1967 x[i] = xi; 1968 } 1969 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1970 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1971 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1972 PetscFunctionReturn(0); 1973 } 1974 1975 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X) 1976 { 1977 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1978 PetscErrorCode ierr; 1979 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag; 1980 const MatScalar *aa=a->a,*v; 1981 const PetscScalar *b; 1982 PetscScalar *x,xi; 1983 PetscInt nz,i,j,neq,ldb,ldx; 1984 PetscBool isdense; 1985 1986 PetscFunctionBegin; 1987 if (!mbs) PetscFunctionReturn(0); 1988 ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);CHKERRQ(ierr); 1989 if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix"); 1990 if (X != B) { 1991 ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);CHKERRQ(ierr); 1992 if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix"); 1993 } 1994 ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr); 1995 ierr = MatDenseGetLDA(B,&ldb);CHKERRQ(ierr); 1996 ierr = MatDenseGetArray(X,&x);CHKERRQ(ierr); 1997 ierr = MatDenseGetLDA(X,&ldx);CHKERRQ(ierr); 1998 for (neq=0; neq<B->cmap->n; neq++) { 1999 /* solve U^T*D*y = b by forward substitution */ 2000 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2001 for (i=0; i<mbs; i++) { 2002 v = aa + ai[i]; 2003 vj = aj + ai[i]; 2004 xi = x[i]; 2005 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 2006 for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi; 2007 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2008 } 2009 /* solve U*x = y by backward substitution */ 2010 for (i=mbs-2; i>=0; i--) { 2011 xi = x[i]; 2012 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2013 vj = aj + adiag[i] - 1; 2014 nz = ai[i+1] - ai[i] - 1; 2015 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 2016 x[i] = xi; 2017 } 2018 b += ldb; 2019 x += ldx; 2020 } 2021 ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr); 2022 ierr = MatDenseRestoreArray(X,&x);CHKERRQ(ierr); 2023 ierr = PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));CHKERRQ(ierr); 2024 PetscFunctionReturn(0); 2025 } 2026 2027 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2028 { 2029 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2030 PetscErrorCode ierr; 2031 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2032 const MatScalar *aa=a->a,*v; 2033 PetscScalar *x,xk; 2034 const PetscScalar *b; 2035 PetscInt nz,k; 2036 2037 PetscFunctionBegin; 2038 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2039 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2040 2041 /* solve U^T*D*y = b by forward substitution */ 2042 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2043 for (k=0; k<mbs; k++) { 2044 v = aa + ai[k] + 1; 2045 vj = aj + ai[k] + 1; 2046 xk = x[k]; 2047 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2048 while (nz--) x[*vj++] += (*v++) * xk; 2049 x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2050 } 2051 2052 /* solve U*x = y by back substitution */ 2053 for (k=mbs-2; k>=0; k--) { 2054 v = aa + ai[k] + 1; 2055 vj = aj + ai[k] + 1; 2056 xk = x[k]; 2057 nz = ai[k+1] - ai[k] - 1; 2058 while (nz--) { 2059 xk += (*v++) * x[*vj++]; 2060 } 2061 x[k] = xk; 2062 } 2063 2064 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2065 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2066 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2067 PetscFunctionReturn(0); 2068 } 2069 2070 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2071 { 2072 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2073 PetscErrorCode ierr; 2074 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2075 const MatScalar *aa=a->a,*v; 2076 PetscReal diagk; 2077 PetscScalar *x; 2078 const PetscScalar *b; 2079 PetscInt nz,k; 2080 2081 PetscFunctionBegin; 2082 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2083 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2084 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2085 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2086 for (k=0; k<mbs; k++) { 2087 v = aa + ai[k]; 2088 vj = aj + ai[k]; 2089 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2090 while (nz--) x[*vj++] += (*v++) * x[k]; 2091 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2092 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]])); 2093 x[k] *= PetscSqrtReal(diagk); 2094 } 2095 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2096 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2097 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2098 PetscFunctionReturn(0); 2099 } 2100 2101 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2102 { 2103 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2104 PetscErrorCode ierr; 2105 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2106 const MatScalar *aa=a->a,*v; 2107 PetscReal diagk; 2108 PetscScalar *x; 2109 const PetscScalar *b; 2110 PetscInt nz,k; 2111 2112 PetscFunctionBegin; 2113 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2114 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2115 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2116 ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr); 2117 for (k=0; k<mbs; k++) { 2118 v = aa + ai[k] + 1; 2119 vj = aj + ai[k] + 1; 2120 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2121 while (nz--) x[*vj++] += (*v++) * x[k]; 2122 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2123 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]])); 2124 x[k] *= PetscSqrtReal(diagk); 2125 } 2126 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2127 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2128 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2129 PetscFunctionReturn(0); 2130 } 2131 2132 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2133 { 2134 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2135 PetscErrorCode ierr; 2136 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj; 2137 const MatScalar *aa=a->a,*v; 2138 PetscReal diagk; 2139 PetscScalar *x; 2140 const PetscScalar *b; 2141 PetscInt nz,k; 2142 2143 PetscFunctionBegin; 2144 /* solve D^(1/2)*U*x = b by back substitution */ 2145 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2146 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2147 2148 for (k=mbs-1; k>=0; k--) { 2149 v = aa + ai[k]; 2150 vj = aj + ai[k]; 2151 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2152 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2153 x[k] = PetscSqrtReal(diagk)*b[k]; 2154 nz = ai[k+1] - ai[k] - 1; 2155 while (nz--) x[k] += (*v++) * x[*vj++]; 2156 } 2157 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2158 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2159 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2160 PetscFunctionReturn(0); 2161 } 2162 2163 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2164 { 2165 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2166 PetscErrorCode ierr; 2167 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2168 const MatScalar *aa=a->a,*v; 2169 PetscReal diagk; 2170 PetscScalar *x; 2171 const PetscScalar *b; 2172 PetscInt nz,k; 2173 2174 PetscFunctionBegin; 2175 /* solve D^(1/2)*U*x = b by back substitution */ 2176 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2177 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2178 2179 for (k=mbs-1; k>=0; k--) { 2180 v = aa + ai[k] + 1; 2181 vj = aj + ai[k] + 1; 2182 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2183 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2184 x[k] = PetscSqrtReal(diagk)*b[k]; 2185 nz = ai[k+1] - ai[k] - 1; 2186 while (nz--) x[k] += (*v++) * x[*vj++]; 2187 } 2188 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2189 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2190 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2191 PetscFunctionReturn(0); 2192 } 2193 2194 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2195 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2196 { 2197 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2198 PetscErrorCode ierr; 2199 const PetscInt *rip,mbs = a->mbs,*ai,*aj; 2200 PetscInt *jutmp,bs = A->rmap->bs,i; 2201 PetscInt m,reallocs = 0,*levtmp; 2202 PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2203 PetscInt incrlev,*lev,shift,prow,nz; 2204 PetscReal f = info->fill,levels = info->levels; 2205 PetscBool perm_identity; 2206 2207 PetscFunctionBegin; 2208 /* check whether perm is the identity mapping */ 2209 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2210 2211 if (perm_identity) { 2212 a->permute = PETSC_FALSE; 2213 ai = a->i; aj = a->j; 2214 } else { /* non-trivial permutation */ 2215 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2216 } 2217 2218 /* initialization */ 2219 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2220 umax = (PetscInt)(f*ai[mbs] + 1); 2221 ierr = PetscMalloc1(umax,&lev);CHKERRQ(ierr); 2222 umax += mbs + 1; 2223 shift = mbs + 1; 2224 ierr = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr); 2225 ierr = PetscMalloc1(umax,&ju);CHKERRQ(ierr); 2226 iu[0] = mbs + 1; 2227 juidx = mbs + 1; 2228 /* prowl: linked list for pivot row */ 2229 ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr); 2230 /* q: linked list for col index */ 2231 2232 for (i=0; i<mbs; i++) { 2233 prowl[i] = mbs; 2234 q[i] = 0; 2235 } 2236 2237 /* for each row k */ 2238 for (k=0; k<mbs; k++) { 2239 nzk = 0; 2240 q[k] = mbs; 2241 /* copy current row into linked list */ 2242 nz = ai[rip[k]+1] - ai[rip[k]]; 2243 j = ai[rip[k]]; 2244 while (nz--) { 2245 vj = rip[aj[j++]]; 2246 if (vj > k) { 2247 qm = k; 2248 do { 2249 m = qm; qm = q[m]; 2250 } while (qm < vj); 2251 if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n"); 2252 nzk++; 2253 q[m] = vj; 2254 q[vj] = qm; 2255 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2256 } 2257 } 2258 2259 /* modify nonzero structure of k-th row by computing fill-in 2260 for each row prow to be merged in */ 2261 prow = k; 2262 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2263 2264 while (prow < k) { 2265 /* merge row prow into k-th row */ 2266 jmin = iu[prow] + 1; 2267 jmax = iu[prow+1]; 2268 qm = k; 2269 for (j=jmin; j<jmax; j++) { 2270 incrlev = lev[j-shift] + 1; 2271 if (incrlev > levels) continue; 2272 vj = ju[j]; 2273 do { 2274 m = qm; qm = q[m]; 2275 } while (qm < vj); 2276 if (qm != vj) { /* a fill */ 2277 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2278 levtmp[vj] = incrlev; 2279 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2280 } 2281 prow = prowl[prow]; /* next pivot row */ 2282 } 2283 2284 /* add k to row list for first nonzero element in k-th row */ 2285 if (nzk > 1) { 2286 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2287 prowl[k] = prowl[i]; prowl[i] = k; 2288 } 2289 iu[k+1] = iu[k] + nzk; 2290 2291 /* allocate more space to ju and lev if needed */ 2292 if (iu[k+1] > umax) { 2293 /* estimate how much additional space we will need */ 2294 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2295 /* just double the memory each time */ 2296 maxadd = umax; 2297 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2298 umax += maxadd; 2299 2300 /* allocate a longer ju */ 2301 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2302 ierr = PetscArraycpy(jutmp,ju,iu[k]);CHKERRQ(ierr); 2303 ierr = PetscFree(ju);CHKERRQ(ierr); 2304 ju = jutmp; 2305 2306 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2307 ierr = PetscArraycpy(jutmp,lev,iu[k]-shift);CHKERRQ(ierr); 2308 ierr = PetscFree(lev);CHKERRQ(ierr); 2309 lev = jutmp; 2310 reallocs += 2; /* count how many times we realloc */ 2311 } 2312 2313 /* save nonzero structure of k-th row in ju */ 2314 i=k; 2315 while (nzk--) { 2316 i = q[i]; 2317 ju[juidx] = i; 2318 lev[juidx-shift] = levtmp[i]; 2319 juidx++; 2320 } 2321 } 2322 2323 #if defined(PETSC_USE_INFO) 2324 if (ai[mbs] != 0) { 2325 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2326 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr); 2327 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2328 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr); 2329 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 2330 } else { 2331 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2332 } 2333 #endif 2334 2335 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2336 ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr); 2337 ierr = PetscFree(lev);CHKERRQ(ierr); 2338 2339 /* put together the new matrix */ 2340 ierr = MatSeqSBAIJSetPreallocation(B,bs,0,NULL);CHKERRQ(ierr); 2341 2342 /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */ 2343 b = (Mat_SeqSBAIJ*)(B)->data; 2344 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2345 2346 b->singlemalloc = PETSC_FALSE; 2347 b->free_a = PETSC_TRUE; 2348 b->free_ij = PETSC_TRUE; 2349 /* the next line frees the default space generated by the Create() */ 2350 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 2351 ierr = PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);CHKERRQ(ierr); 2352 b->j = ju; 2353 b->i = iu; 2354 b->diag = NULL; 2355 b->ilen = NULL; 2356 b->imax = NULL; 2357 2358 ierr = ISDestroy(&b->row);CHKERRQ(ierr); 2359 ierr = ISDestroy(&b->icol);CHKERRQ(ierr); 2360 b->row = perm; 2361 b->icol = perm; 2362 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2363 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2364 ierr = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr); 2365 /* In b structure: Free imax, ilen, old a, old j. 2366 Allocate idnew, solve_work, new a, new j */ 2367 ierr = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2368 b->maxnz = b->nz = iu[mbs]; 2369 2370 (B)->info.factor_mallocs = reallocs; 2371 (B)->info.fill_ratio_given = f; 2372 if (ai[mbs] != 0) { 2373 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2374 } else { 2375 (B)->info.fill_ratio_needed = 0.0; 2376 } 2377 ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr); 2378 PetscFunctionReturn(0); 2379 } 2380 2381 /* 2382 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2383 */ 2384 #include <petscbt.h> 2385 #include <../src/mat/utils/freespace.h> 2386 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2387 { 2388 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2389 PetscErrorCode ierr; 2390 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2391 PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j; 2392 const PetscInt *rip; 2393 PetscInt reallocs=0,i,*ui,*udiag,*cols; 2394 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2395 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr; 2396 PetscReal fill =info->fill,levels=info->levels; 2397 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2398 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2399 PetscBT lnkbt; 2400 2401 PetscFunctionBegin; 2402 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 2403 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2404 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2405 if (bs > 1) { 2406 ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr); 2407 PetscFunctionReturn(0); 2408 } 2409 2410 /* check whether perm is the identity mapping */ 2411 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2412 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2413 a->permute = PETSC_FALSE; 2414 2415 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2416 ierr = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr); 2417 ui[0] = 0; 2418 2419 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2420 if (!levels) { 2421 /* reuse the column pointers and row offsets for memory savings */ 2422 for (i=0; i<am; i++) { 2423 ncols = ai[i+1] - ai[i]; 2424 ui[i+1] = ui[i] + ncols; 2425 udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */ 2426 } 2427 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2428 cols = uj; 2429 for (i=0; i<am; i++) { 2430 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2431 ncols = ai[i+1] - ai[i] -1; 2432 for (j=0; j<ncols; j++) *cols++ = aj[j]; 2433 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */ 2434 } 2435 } else { /* case: levels>0 */ 2436 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2437 2438 /* initialization */ 2439 /* jl: linked list for storing indices of the pivot rows 2440 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2441 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2442 for (i=0; i<am; i++) { 2443 jl[i] = am; il[i] = 0; 2444 } 2445 2446 /* create and initialize a linked list for storing column indices of the active row k */ 2447 nlnk = am + 1; 2448 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2449 2450 /* initial FreeSpace size is fill*(ai[am]+1) */ 2451 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2452 2453 current_space = free_space; 2454 2455 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2456 2457 current_space_lvl = free_space_lvl; 2458 2459 for (k=0; k<am; k++) { /* for each active row k */ 2460 /* initialize lnk by the column indices of row k */ 2461 nzk = 0; 2462 ncols = ai[k+1] - ai[k]; 2463 if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k); 2464 cols = aj+ai[k]; 2465 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2466 nzk += nlnk; 2467 2468 /* update lnk by computing fill-in for each pivot row to be merged in */ 2469 prow = jl[k]; /* 1st pivot row */ 2470 2471 while (prow < k) { 2472 nextprow = jl[prow]; 2473 2474 /* merge prow into k-th row */ 2475 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2476 jmax = ui[prow+1]; 2477 ncols = jmax-jmin; 2478 i = jmin - ui[prow]; 2479 2480 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2481 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2482 j = *(uj - 1); 2483 ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2484 nzk += nlnk; 2485 2486 /* update il and jl for prow */ 2487 if (jmin < jmax) { 2488 il[prow] = jmin; 2489 2490 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2491 } 2492 prow = nextprow; 2493 } 2494 2495 /* if free space is not available, make more free space */ 2496 if (current_space->local_remaining<nzk) { 2497 i = am - k + 1; /* num of unfactored rows */ 2498 i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2499 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2500 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2501 reallocs++; 2502 } 2503 2504 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2505 if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k); 2506 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2507 2508 /* add the k-th row into il and jl */ 2509 if (nzk > 1) { 2510 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2511 jl[k] = jl[i]; jl[i] = k; 2512 il[k] = ui[k] + 1; 2513 } 2514 uj_ptr[k] = current_space->array; 2515 uj_lvl_ptr[k] = current_space_lvl->array; 2516 2517 current_space->array += nzk; 2518 current_space->local_used += nzk; 2519 current_space->local_remaining -= nzk; 2520 current_space_lvl->array += nzk; 2521 current_space_lvl->local_used += nzk; 2522 current_space_lvl->local_remaining -= nzk; 2523 2524 ui[k+1] = ui[k] + nzk; 2525 } 2526 2527 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2528 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2529 2530 /* destroy list of free space and other temporary array(s) */ 2531 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2532 ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor */ 2533 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2534 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2535 2536 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2537 2538 /* put together the new matrix in MATSEQSBAIJ format */ 2539 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2540 2541 b = (Mat_SeqSBAIJ*)(fact)->data; 2542 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2543 2544 b->singlemalloc = PETSC_FALSE; 2545 b->free_a = PETSC_TRUE; 2546 b->free_ij = free_ij; 2547 2548 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2549 2550 b->j = uj; 2551 b->i = ui; 2552 b->diag = udiag; 2553 b->free_diag = PETSC_TRUE; 2554 b->ilen = NULL; 2555 b->imax = NULL; 2556 b->row = perm; 2557 b->col = perm; 2558 2559 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2560 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2561 2562 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2563 2564 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2565 ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2566 2567 b->maxnz = b->nz = ui[am]; 2568 2569 fact->info.factor_mallocs = reallocs; 2570 fact->info.fill_ratio_given = fill; 2571 if (ai[am] != 0) { 2572 fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am]; 2573 } else { 2574 fact->info.fill_ratio_needed = 0.0; 2575 } 2576 #if defined(PETSC_USE_INFO) 2577 if (ai[am] != 0) { 2578 PetscReal af = fact->info.fill_ratio_needed; 2579 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2580 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2581 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2582 } else { 2583 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2584 } 2585 #endif 2586 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2587 PetscFunctionReturn(0); 2588 } 2589 2590 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2591 { 2592 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2593 Mat_SeqSBAIJ *b; 2594 PetscErrorCode ierr; 2595 PetscBool perm_identity,free_ij = PETSC_TRUE; 2596 PetscInt bs=A->rmap->bs,am=a->mbs; 2597 const PetscInt *cols,*rip,*ai=a->i,*aj=a->j; 2598 PetscInt reallocs=0,i,*ui; 2599 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2600 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 2601 PetscReal fill =info->fill,levels=info->levels,ratio_needed; 2602 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2603 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2604 PetscBT lnkbt; 2605 2606 PetscFunctionBegin; 2607 /* 2608 This code originally uses Modified Sparse Row (MSR) storage 2609 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2610 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2611 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2612 thus the original code in MSR format is still used for these cases. 2613 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2614 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2615 */ 2616 if (bs > 1) { 2617 ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr); 2618 PetscFunctionReturn(0); 2619 } 2620 2621 /* check whether perm is the identity mapping */ 2622 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2623 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2624 a->permute = PETSC_FALSE; 2625 2626 /* special case that simply copies fill pattern */ 2627 if (!levels) { 2628 /* reuse the column pointers and row offsets for memory savings */ 2629 ui = a->i; 2630 uj = a->j; 2631 free_ij = PETSC_FALSE; 2632 ratio_needed = 1.0; 2633 } else { /* case: levels>0 */ 2634 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2635 2636 /* initialization */ 2637 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2638 ui[0] = 0; 2639 2640 /* jl: linked list for storing indices of the pivot rows 2641 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2642 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2643 for (i=0; i<am; i++) { 2644 jl[i] = am; il[i] = 0; 2645 } 2646 2647 /* create and initialize a linked list for storing column indices of the active row k */ 2648 nlnk = am + 1; 2649 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2650 2651 /* initial FreeSpace size is fill*(ai[am]+1) */ 2652 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2653 2654 current_space = free_space; 2655 2656 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2657 2658 current_space_lvl = free_space_lvl; 2659 2660 for (k=0; k<am; k++) { /* for each active row k */ 2661 /* initialize lnk by the column indices of row rip[k] */ 2662 nzk = 0; 2663 ncols = ai[rip[k]+1] - ai[rip[k]]; 2664 cols = aj+ai[rip[k]]; 2665 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2666 nzk += nlnk; 2667 2668 /* update lnk by computing fill-in for each pivot row to be merged in */ 2669 prow = jl[k]; /* 1st pivot row */ 2670 2671 while (prow < k) { 2672 nextprow = jl[prow]; 2673 2674 /* merge prow into k-th row */ 2675 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2676 jmax = ui[prow+1]; 2677 ncols = jmax-jmin; 2678 i = jmin - ui[prow]; 2679 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2680 j = *(uj_lvl_ptr[prow] + i - 1); 2681 cols_lvl = uj_lvl_ptr[prow]+i; 2682 ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2683 nzk += nlnk; 2684 2685 /* update il and jl for prow */ 2686 if (jmin < jmax) { 2687 il[prow] = jmin; 2688 j = *cols; 2689 jl[prow] = jl[j]; 2690 jl[j] = prow; 2691 } 2692 prow = nextprow; 2693 } 2694 2695 /* if free space is not available, make more free space */ 2696 if (current_space->local_remaining<nzk) { 2697 i = am - k + 1; /* num of unfactored rows */ 2698 i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2699 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2700 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2701 reallocs++; 2702 } 2703 2704 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2705 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2706 2707 /* add the k-th row into il and jl */ 2708 if (nzk-1 > 0) { 2709 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2710 jl[k] = jl[i]; jl[i] = k; 2711 il[k] = ui[k] + 1; 2712 } 2713 uj_ptr[k] = current_space->array; 2714 uj_lvl_ptr[k] = current_space_lvl->array; 2715 2716 current_space->array += nzk; 2717 current_space->local_used += nzk; 2718 current_space->local_remaining -= nzk; 2719 current_space_lvl->array += nzk; 2720 current_space_lvl->local_used += nzk; 2721 current_space_lvl->local_remaining -= nzk; 2722 2723 ui[k+1] = ui[k] + nzk; 2724 } 2725 2726 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2727 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2728 2729 /* destroy list of free space and other temporary array(s) */ 2730 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2731 ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 2732 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2733 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2734 if (ai[am] != 0) { 2735 ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 2736 } else { 2737 ratio_needed = 0.0; 2738 } 2739 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2740 2741 /* put together the new matrix in MATSEQSBAIJ format */ 2742 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2743 2744 b = (Mat_SeqSBAIJ*)(fact)->data; 2745 2746 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2747 2748 b->singlemalloc = PETSC_FALSE; 2749 b->free_a = PETSC_TRUE; 2750 b->free_ij = free_ij; 2751 2752 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2753 2754 b->j = uj; 2755 b->i = ui; 2756 b->diag = NULL; 2757 b->ilen = NULL; 2758 b->imax = NULL; 2759 b->row = perm; 2760 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2761 2762 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2763 b->icol = perm; 2764 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2765 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2766 2767 b->maxnz = b->nz = ui[am]; 2768 2769 fact->info.factor_mallocs = reallocs; 2770 fact->info.fill_ratio_given = fill; 2771 fact->info.fill_ratio_needed = ratio_needed; 2772 #if defined(PETSC_USE_INFO) 2773 if (ai[am] != 0) { 2774 PetscReal af = fact->info.fill_ratio_needed; 2775 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2776 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2777 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2778 } else { 2779 ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr); 2780 } 2781 #endif 2782 if (perm_identity) { 2783 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2784 } else { 2785 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2786 } 2787 PetscFunctionReturn(0); 2788 } 2789