1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 11 { 12 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 13 IS isrow=a->row; 14 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 15 PetscErrorCode ierr; 16 const PetscInt *r; 17 PetscInt nz,*vj,k,idx,k1; 18 PetscInt bs =A->rmap->bs,bs2 = a->bs2; 19 const MatScalar *aa=a->a,*v,*diag; 20 PetscScalar *x,*xk,*xj,*xk_tmp,*t; 21 const PetscScalar *b; 22 23 PetscFunctionBegin; 24 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 t = a->solve_work; 27 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 28 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 29 30 /* solve U^T * D * y = b by forward substitution */ 31 xk = t; 32 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 33 idx = bs*r[k]; 34 for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1]; 35 } 36 for (k=0; k<mbs; k++) { 37 v = aa + bs2*ai[k]; 38 xk = t + k*bs; /* Dk*xk = k-th block of x */ 39 ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */ 40 nz = ai[k+1] - ai[k]; 41 vj = aj + ai[k]; 42 xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */ 43 while (nz--) { 44 /* x(:) += U(k,:)^T*(Dk*xk) */ 45 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 46 vj++; xj = t + (*vj)*bs; 47 v += bs2; 48 } 49 /* xk = inv(Dk)*(Dk*xk) */ 50 diag = aa+k*bs2; /* ptr to inv(Dk) */ 51 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 52 } 53 54 /* solve U*x = y by back substitution */ 55 for (k=mbs-1; k>=0; k--) { 56 v = aa + bs2*ai[k]; 57 xk = t + k*bs; /* xk */ 58 nz = ai[k+1] - ai[k]; 59 vj = aj + ai[k]; 60 xj = t + (*vj)*bs; 61 while (nz--) { 62 /* xk += U(k,:)*x(:) */ 63 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 64 vj++; 65 v += bs2; xj = t + (*vj)*bs; 66 } 67 idx = bs*r[k]; 68 for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++; 69 } 70 71 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 72 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 73 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 74 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 75 ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr); 76 PetscFunctionReturn(0); 77 } 78 79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 80 { 81 PetscFunctionBegin; 82 SETERRQ(PETSC_COMM_SELF,1,"not implemented yet"); 83 PetscFunctionReturn(0); 84 } 85 86 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 87 { 88 PetscFunctionBegin; 89 SETERRQ(PETSC_COMM_SELF,1,"not implemented yet"); 90 PetscFunctionReturn(0); 91 } 92 93 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 94 { 95 PetscErrorCode ierr; 96 PetscInt nz,k; 97 const PetscInt *vj,bs2 = bs*bs; 98 const MatScalar *v,*diag; 99 PetscScalar *xk,*xj,*xk_tmp; 100 101 PetscFunctionBegin; 102 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 103 for (k=0; k<mbs; k++) { 104 v = aa + bs2*ai[k]; 105 xk = x + k*bs; /* Dk*xk = k-th block of x */ 106 ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */ 107 nz = ai[k+1] - ai[k]; 108 vj = aj + ai[k]; 109 xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */ 110 while (nz--) { 111 /* x(:) += U(k,:)^T*(Dk*xk) */ 112 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 113 vj++; xj = x + (*vj)*bs; 114 v += bs2; 115 } 116 /* xk = inv(Dk)*(Dk*xk) */ 117 diag = aa+k*bs2; /* ptr to inv(Dk) */ 118 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 119 } 120 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 121 PetscFunctionReturn(0); 122 } 123 124 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 125 { 126 PetscInt nz,k; 127 const PetscInt *vj,bs2 = bs*bs; 128 const MatScalar *v; 129 PetscScalar *xk,*xj; 130 131 PetscFunctionBegin; 132 for (k=mbs-1; k>=0; k--) { 133 v = aa + bs2*ai[k]; 134 xk = x + k*bs; /* xk */ 135 nz = ai[k+1] - ai[k]; 136 vj = aj + ai[k]; 137 xj = x + (*vj)*bs; 138 while (nz--) { 139 /* xk += U(k,:)*x(:) */ 140 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 141 vj++; 142 v += bs2; xj = x + (*vj)*bs; 143 } 144 } 145 PetscFunctionReturn(0); 146 } 147 148 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 149 { 150 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 151 PetscErrorCode ierr; 152 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 153 PetscInt bs =A->rmap->bs; 154 const MatScalar *aa=a->a; 155 PetscScalar *x; 156 const PetscScalar *b; 157 158 PetscFunctionBegin; 159 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 160 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 161 162 /* solve U^T * D * y = b by forward substitution */ 163 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 164 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 165 166 /* solve U*x = y by back substitution */ 167 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 168 169 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 170 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 171 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 172 PetscFunctionReturn(0); 173 } 174 175 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 176 { 177 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 178 PetscErrorCode ierr; 179 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 180 PetscInt bs =A->rmap->bs; 181 const MatScalar *aa=a->a; 182 const PetscScalar *b; 183 PetscScalar *x; 184 185 PetscFunctionBegin; 186 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 187 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 188 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 189 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 190 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 191 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 192 ierr = PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);CHKERRQ(ierr); 193 PetscFunctionReturn(0); 194 } 195 196 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 197 { 198 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 199 PetscErrorCode ierr; 200 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 201 PetscInt bs =A->rmap->bs; 202 const MatScalar *aa=a->a; 203 const PetscScalar *b; 204 PetscScalar *x; 205 206 PetscFunctionBegin; 207 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 208 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 209 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 210 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 211 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 212 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 213 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 214 PetscFunctionReturn(0); 215 } 216 217 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 218 { 219 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 220 IS isrow=a->row; 221 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 222 PetscErrorCode ierr; 223 PetscInt nz,k,idx; 224 const MatScalar *aa=a->a,*v,*d; 225 PetscScalar *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp; 226 const PetscScalar *b; 227 228 PetscFunctionBegin; 229 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 230 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 231 t = a->solve_work; 232 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 233 234 /* solve U^T * D * y = b by forward substitution */ 235 tp = t; 236 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 237 idx = 7*r[k]; 238 tp[0] = b[idx]; 239 tp[1] = b[idx+1]; 240 tp[2] = b[idx+2]; 241 tp[3] = b[idx+3]; 242 tp[4] = b[idx+4]; 243 tp[5] = b[idx+5]; 244 tp[6] = b[idx+6]; 245 tp += 7; 246 } 247 248 for (k=0; k<mbs; k++) { 249 v = aa + 49*ai[k]; 250 vj = aj + ai[k]; 251 tp = t + k*7; 252 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; 253 nz = ai[k+1] - ai[k]; 254 tp = t + (*vj)*7; 255 while (nz--) { 256 tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 257 tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 258 tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 259 tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 260 tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 261 tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 262 tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 263 vj++; 264 tp = t + (*vj)*7; 265 v += 49; 266 } 267 268 /* xk = inv(Dk)*(Dk*xk) */ 269 d = aa+k*49; /* ptr to inv(Dk) */ 270 tp = t + k*7; 271 tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 272 tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 273 tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 274 tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 275 tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 276 tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 277 tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 278 } 279 280 /* solve U*x = y by back substitution */ 281 for (k=mbs-1; k>=0; k--) { 282 v = aa + 49*ai[k]; 283 vj = aj + ai[k]; 284 tp = t + k*7; 285 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */ 286 nz = ai[k+1] - ai[k]; 287 288 tp = t + (*vj)*7; 289 while (nz--) { 290 /* xk += U(k,:)*x(:) */ 291 x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6]; 292 x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6]; 293 x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6]; 294 x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6]; 295 x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6]; 296 x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6]; 297 x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6]; 298 vj++; 299 tp = t + (*vj)*7; 300 v += 49; 301 } 302 tp = t + k*7; 303 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6; 304 idx = 7*r[k]; 305 x[idx] = x0; 306 x[idx+1] = x1; 307 x[idx+2] = x2; 308 x[idx+3] = x3; 309 x[idx+4] = x4; 310 x[idx+5] = x5; 311 x[idx+6] = x6; 312 } 313 314 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 315 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 316 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 317 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 318 PetscFunctionReturn(0); 319 } 320 321 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 322 { 323 const MatScalar *v,*d; 324 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 325 PetscInt nz,k; 326 const PetscInt *vj; 327 328 PetscFunctionBegin; 329 for (k=0; k<mbs; k++) { 330 v = aa + 49*ai[k]; 331 xp = x + k*7; 332 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */ 333 nz = ai[k+1] - ai[k]; 334 vj = aj + ai[k]; 335 xp = x + (*vj)*7; 336 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 337 PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 338 while (nz--) { 339 /* x(:) += U(k,:)^T*(Dk*xk) */ 340 xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 341 xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 342 xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 343 xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 344 xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 345 xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 346 xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 347 vj++; 348 xp = x + (*vj)*7; 349 v += 49; 350 } 351 /* xk = inv(Dk)*(Dk*xk) */ 352 d = aa+k*49; /* ptr to inv(Dk) */ 353 xp = x + k*7; 354 xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 355 xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 356 xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 357 xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 358 xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 359 xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 360 xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 361 } 362 PetscFunctionReturn(0); 363 } 364 365 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 366 { 367 const MatScalar *v; 368 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 369 PetscInt nz,k; 370 const PetscInt *vj; 371 372 PetscFunctionBegin; 373 for (k=mbs-1; k>=0; k--) { 374 v = aa + 49*ai[k]; 375 xp = x + k*7; 376 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */ 377 nz = ai[k+1] - ai[k]; 378 vj = aj + ai[k]; 379 xp = x + (*vj)*7; 380 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 381 PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 382 while (nz--) { 383 /* xk += U(k,:)*x(:) */ 384 x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6]; 385 x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6]; 386 x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6]; 387 x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6]; 388 x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6]; 389 x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6]; 390 x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6]; 391 vj++; 392 v += 49; xp = x + (*vj)*7; 393 } 394 xp = x + k*7; 395 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6; 396 } 397 PetscFunctionReturn(0); 398 } 399 400 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 401 { 402 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 403 PetscErrorCode ierr; 404 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 405 const MatScalar *aa=a->a; 406 PetscScalar *x; 407 const PetscScalar *b; 408 409 PetscFunctionBegin; 410 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 411 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 412 413 /* solve U^T * D * y = b by forward substitution */ 414 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 415 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 416 417 /* solve U*x = y by back substitution */ 418 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 419 420 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 421 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 422 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 423 PetscFunctionReturn(0); 424 } 425 426 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 427 { 428 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 429 PetscErrorCode ierr; 430 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 431 const MatScalar *aa=a->a; 432 PetscScalar *x; 433 const PetscScalar *b; 434 435 PetscFunctionBegin; 436 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 437 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 438 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 439 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 440 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 441 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 442 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 443 PetscFunctionReturn(0); 444 } 445 446 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 447 { 448 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 449 PetscErrorCode ierr; 450 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 451 const MatScalar *aa=a->a; 452 PetscScalar *x; 453 const PetscScalar *b; 454 455 PetscFunctionBegin; 456 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 457 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 458 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 459 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 460 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 461 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 462 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 463 PetscFunctionReturn(0); 464 } 465 466 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 467 { 468 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 469 IS isrow=a->row; 470 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,*r,*vj; 471 PetscErrorCode ierr; 472 PetscInt nz,k,idx; 473 const MatScalar *aa=a->a,*v,*d; 474 PetscScalar *x,x0,x1,x2,x3,x4,x5,*t,*tp; 475 const PetscScalar *b; 476 477 PetscFunctionBegin; 478 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 479 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 480 t = a->solve_work; 481 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 482 483 /* solve U^T * D * y = b by forward substitution */ 484 tp = t; 485 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 486 idx = 6*r[k]; 487 tp[0] = b[idx]; 488 tp[1] = b[idx+1]; 489 tp[2] = b[idx+2]; 490 tp[3] = b[idx+3]; 491 tp[4] = b[idx+4]; 492 tp[5] = b[idx+5]; 493 tp += 6; 494 } 495 496 for (k=0; k<mbs; k++) { 497 v = aa + 36*ai[k]; 498 vj = aj + ai[k]; 499 tp = t + k*6; 500 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; 501 nz = ai[k+1] - ai[k]; 502 tp = t + (*vj)*6; 503 while (nz--) { 504 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 505 tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 506 tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 507 tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 508 tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 509 tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 510 vj++; 511 tp = t + (*vj)*6; 512 v += 36; 513 } 514 515 /* xk = inv(Dk)*(Dk*xk) */ 516 d = aa+k*36; /* ptr to inv(Dk) */ 517 tp = t + k*6; 518 tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 519 tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 520 tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 521 tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 522 tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 523 tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 524 } 525 526 /* solve U*x = y by back substitution */ 527 for (k=mbs-1; k>=0; k--) { 528 v = aa + 36*ai[k]; 529 vj = aj + ai[k]; 530 tp = t + k*6; 531 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */ 532 nz = ai[k+1] - ai[k]; 533 534 tp = t + (*vj)*6; 535 while (nz--) { 536 /* xk += U(k,:)*x(:) */ 537 x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5]; 538 x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5]; 539 x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5]; 540 x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5]; 541 x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5]; 542 x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5]; 543 vj++; 544 tp = t + (*vj)*6; 545 v += 36; 546 } 547 tp = t + k*6; 548 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; 549 idx = 6*r[k]; 550 x[idx] = x0; 551 x[idx+1] = x1; 552 x[idx+2] = x2; 553 x[idx+3] = x3; 554 x[idx+4] = x4; 555 x[idx+5] = x5; 556 } 557 558 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 559 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 560 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 561 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } 564 565 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 566 { 567 const MatScalar *v,*d; 568 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 569 PetscInt nz,k; 570 const PetscInt *vj; 571 572 PetscFunctionBegin; 573 for (k=0; k<mbs; k++) { 574 v = aa + 36*ai[k]; 575 xp = x + k*6; 576 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */ 577 nz = ai[k+1] - ai[k]; 578 vj = aj + ai[k]; 579 xp = x + (*vj)*6; 580 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 581 PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 582 while (nz--) { 583 /* x(:) += U(k,:)^T*(Dk*xk) */ 584 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 585 xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 586 xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 587 xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 588 xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 589 xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 590 vj++; 591 xp = x + (*vj)*6; 592 v += 36; 593 } 594 /* xk = inv(Dk)*(Dk*xk) */ 595 d = aa+k*36; /* ptr to inv(Dk) */ 596 xp = x + k*6; 597 xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 598 xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 599 xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 600 xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 601 xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 602 xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 603 } 604 PetscFunctionReturn(0); 605 } 606 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 607 { 608 const MatScalar *v; 609 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 610 PetscInt nz,k; 611 const PetscInt *vj; 612 613 PetscFunctionBegin; 614 for (k=mbs-1; k>=0; k--) { 615 v = aa + 36*ai[k]; 616 xp = x + k*6; 617 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */ 618 nz = ai[k+1] - ai[k]; 619 vj = aj + ai[k]; 620 xp = x + (*vj)*6; 621 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 622 PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 623 while (nz--) { 624 /* xk += U(k,:)*x(:) */ 625 x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5]; 626 x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5]; 627 x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5]; 628 x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5]; 629 x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5]; 630 x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5]; 631 vj++; 632 v += 36; xp = x + (*vj)*6; 633 } 634 xp = x + k*6; 635 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; 636 } 637 PetscFunctionReturn(0); 638 } 639 640 641 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 642 { 643 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 644 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 645 const MatScalar *aa=a->a; 646 PetscScalar *x; 647 const PetscScalar *b; 648 PetscErrorCode ierr; 649 650 PetscFunctionBegin; 651 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 652 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 653 654 /* solve U^T * D * y = b by forward substitution */ 655 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 656 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 657 658 /* solve U*x = y by back substitution */ 659 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 660 661 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 662 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 663 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 664 PetscFunctionReturn(0); 665 } 666 667 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 668 { 669 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 670 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 671 const MatScalar *aa=a->a; 672 PetscScalar *x; 673 const PetscScalar *b; 674 PetscErrorCode ierr; 675 676 PetscFunctionBegin; 677 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 678 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 679 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 680 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 681 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 682 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 683 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 684 PetscFunctionReturn(0); 685 } 686 687 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 688 { 689 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 690 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 691 const MatScalar *aa=a->a; 692 PetscScalar *x; 693 const PetscScalar *b; 694 PetscErrorCode ierr; 695 696 PetscFunctionBegin; 697 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 698 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 699 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 700 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 701 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 703 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 704 PetscFunctionReturn(0); 705 } 706 707 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 708 { 709 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 710 IS isrow=a->row; 711 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 712 PetscErrorCode ierr; 713 const PetscInt *r,*vj; 714 PetscInt nz,k,idx; 715 const MatScalar *aa=a->a,*v,*diag; 716 PetscScalar *x,x0,x1,x2,x3,x4,*t,*tp; 717 const PetscScalar *b; 718 719 PetscFunctionBegin; 720 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 721 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 722 t = a->solve_work; 723 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 724 725 /* solve U^T * D * y = b by forward substitution */ 726 tp = t; 727 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 728 idx = 5*r[k]; 729 tp[0] = b[idx]; 730 tp[1] = b[idx+1]; 731 tp[2] = b[idx+2]; 732 tp[3] = b[idx+3]; 733 tp[4] = b[idx+4]; 734 tp += 5; 735 } 736 737 for (k=0; k<mbs; k++) { 738 v = aa + 25*ai[k]; 739 vj = aj + ai[k]; 740 tp = t + k*5; 741 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; 742 nz = ai[k+1] - ai[k]; 743 744 tp = t + (*vj)*5; 745 while (nz--) { 746 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 747 tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 748 tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4; 749 tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4; 750 tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4; 751 vj++; 752 tp = t + (*vj)*5; 753 v += 25; 754 } 755 756 /* xk = inv(Dk)*(Dk*xk) */ 757 diag = aa+k*25; /* ptr to inv(Dk) */ 758 tp = t + k*5; 759 tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 760 tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 761 tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 762 tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 763 tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 764 } 765 766 /* solve U*x = y by back substitution */ 767 for (k=mbs-1; k>=0; k--) { 768 v = aa + 25*ai[k]; 769 vj = aj + ai[k]; 770 tp = t + k*5; 771 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */ 772 nz = ai[k+1] - ai[k]; 773 774 tp = t + (*vj)*5; 775 while (nz--) { 776 /* xk += U(k,:)*x(:) */ 777 x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4]; 778 x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4]; 779 x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4]; 780 x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4]; 781 x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4]; 782 vj++; 783 tp = t + (*vj)*5; 784 v += 25; 785 } 786 tp = t + k*5; 787 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; 788 idx = 5*r[k]; 789 x[idx] = x0; 790 x[idx+1] = x1; 791 x[idx+2] = x2; 792 x[idx+3] = x3; 793 x[idx+4] = x4; 794 } 795 796 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 797 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 798 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 799 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 800 PetscFunctionReturn(0); 801 } 802 803 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 804 { 805 const MatScalar *v,*diag; 806 PetscScalar *xp,x0,x1,x2,x3,x4; 807 PetscInt nz,k; 808 const PetscInt *vj; 809 810 PetscFunctionBegin; 811 for (k=0; k<mbs; k++) { 812 v = aa + 25*ai[k]; 813 xp = x + k*5; 814 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */ 815 nz = ai[k+1] - ai[k]; 816 vj = aj + ai[k]; 817 xp = x + (*vj)*5; 818 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 819 PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 820 while (nz--) { 821 /* x(:) += U(k,:)^T*(Dk*xk) */ 822 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 823 xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 824 xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4; 825 xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4; 826 xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4; 827 vj++; 828 xp = x + (*vj)*5; 829 v += 25; 830 } 831 /* xk = inv(Dk)*(Dk*xk) */ 832 diag = aa+k*25; /* ptr to inv(Dk) */ 833 xp = x + k*5; 834 xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 835 xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 836 xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 837 xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 838 xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 839 } 840 PetscFunctionReturn(0); 841 } 842 843 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 844 { 845 const MatScalar *v; 846 PetscScalar *xp,x0,x1,x2,x3,x4; 847 PetscInt nz,k; 848 const PetscInt *vj; 849 850 PetscFunctionBegin; 851 for (k=mbs-1; k>=0; k--) { 852 v = aa + 25*ai[k]; 853 xp = x + k*5; 854 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */ 855 nz = ai[k+1] - ai[k]; 856 vj = aj + ai[k]; 857 xp = x + (*vj)*5; 858 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 859 PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 860 while (nz--) { 861 /* xk += U(k,:)*x(:) */ 862 x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4]; 863 x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4]; 864 x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4]; 865 x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4]; 866 x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4]; 867 vj++; 868 v += 25; xp = x + (*vj)*5; 869 } 870 xp = x + k*5; 871 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; 872 } 873 PetscFunctionReturn(0); 874 } 875 876 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 877 { 878 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 879 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 880 const MatScalar *aa=a->a; 881 PetscScalar *x; 882 const PetscScalar *b; 883 PetscErrorCode ierr; 884 885 PetscFunctionBegin; 886 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 887 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 888 889 /* solve U^T * D * y = b by forward substitution */ 890 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 891 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 892 893 /* solve U*x = y by back substitution */ 894 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 895 896 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 897 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 898 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 899 PetscFunctionReturn(0); 900 } 901 902 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 903 { 904 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 905 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 906 const MatScalar *aa=a->a; 907 PetscScalar *x; 908 const PetscScalar *b; 909 PetscErrorCode ierr; 910 911 PetscFunctionBegin; 912 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 914 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 915 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 916 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 917 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 918 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 919 PetscFunctionReturn(0); 920 } 921 922 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 923 { 924 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 925 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 926 const MatScalar *aa=a->a; 927 PetscScalar *x; 928 const PetscScalar *b; 929 PetscErrorCode ierr; 930 931 PetscFunctionBegin; 932 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 933 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 934 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 935 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 936 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 937 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 938 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 939 PetscFunctionReturn(0); 940 } 941 942 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 943 { 944 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 945 IS isrow=a->row; 946 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 947 PetscErrorCode ierr; 948 const PetscInt *r,*vj; 949 PetscInt nz,k,idx; 950 const MatScalar *aa=a->a,*v,*diag; 951 PetscScalar *x,x0,x1,x2,x3,*t,*tp; 952 const PetscScalar *b; 953 954 PetscFunctionBegin; 955 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 956 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 957 t = a->solve_work; 958 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 959 960 /* solve U^T * D * y = b by forward substitution */ 961 tp = t; 962 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 963 idx = 4*r[k]; 964 tp[0] = b[idx]; 965 tp[1] = b[idx+1]; 966 tp[2] = b[idx+2]; 967 tp[3] = b[idx+3]; 968 tp += 4; 969 } 970 971 for (k=0; k<mbs; k++) { 972 v = aa + 16*ai[k]; 973 vj = aj + ai[k]; 974 tp = t + k*4; 975 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; 976 nz = ai[k+1] - ai[k]; 977 978 tp = t + (*vj)*4; 979 while (nz--) { 980 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 981 tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 982 tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 983 tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 984 vj++; 985 tp = t + (*vj)*4; 986 v += 16; 987 } 988 989 /* xk = inv(Dk)*(Dk*xk) */ 990 diag = aa+k*16; /* ptr to inv(Dk) */ 991 tp = t + k*4; 992 tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 993 tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 994 tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 995 tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 996 } 997 998 /* solve U*x = y by back substitution */ 999 for (k=mbs-1; k>=0; k--) { 1000 v = aa + 16*ai[k]; 1001 vj = aj + ai[k]; 1002 tp = t + k*4; 1003 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */ 1004 nz = ai[k+1] - ai[k]; 1005 1006 tp = t + (*vj)*4; 1007 while (nz--) { 1008 /* xk += U(k,:)*x(:) */ 1009 x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3]; 1010 x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3]; 1011 x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3]; 1012 x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3]; 1013 vj++; 1014 tp = t + (*vj)*4; 1015 v += 16; 1016 } 1017 tp = t + k*4; 1018 tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; 1019 idx = 4*r[k]; 1020 x[idx] = x0; 1021 x[idx+1] = x1; 1022 x[idx+2] = x2; 1023 x[idx+3] = x3; 1024 } 1025 1026 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1027 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1028 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1029 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1030 PetscFunctionReturn(0); 1031 } 1032 1033 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1034 { 1035 const MatScalar *v,*diag; 1036 PetscScalar *xp,x0,x1,x2,x3; 1037 PetscInt nz,k; 1038 const PetscInt *vj; 1039 1040 PetscFunctionBegin; 1041 for (k=0; k<mbs; k++) { 1042 v = aa + 16*ai[k]; 1043 xp = x + k*4; 1044 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */ 1045 nz = ai[k+1] - ai[k]; 1046 vj = aj + ai[k]; 1047 xp = x + (*vj)*4; 1048 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1049 PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1050 while (nz--) { 1051 /* x(:) += U(k,:)^T*(Dk*xk) */ 1052 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1053 xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1054 xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1055 xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1056 vj++; 1057 xp = x + (*vj)*4; 1058 v += 16; 1059 } 1060 /* xk = inv(Dk)*(Dk*xk) */ 1061 diag = aa+k*16; /* ptr to inv(Dk) */ 1062 xp = x + k*4; 1063 xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1064 xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1065 xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1066 xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1067 } 1068 PetscFunctionReturn(0); 1069 } 1070 1071 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1072 { 1073 const MatScalar *v; 1074 PetscScalar *xp,x0,x1,x2,x3; 1075 PetscInt nz,k; 1076 const PetscInt *vj; 1077 1078 PetscFunctionBegin; 1079 for (k=mbs-1; k>=0; k--) { 1080 v = aa + 16*ai[k]; 1081 xp = x + k*4; 1082 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */ 1083 nz = ai[k+1] - ai[k]; 1084 vj = aj + ai[k]; 1085 xp = x + (*vj)*4; 1086 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1087 PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1088 while (nz--) { 1089 /* xk += U(k,:)*x(:) */ 1090 x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3]; 1091 x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3]; 1092 x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3]; 1093 x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3]; 1094 vj++; 1095 v += 16; xp = x + (*vj)*4; 1096 } 1097 xp = x + k*4; 1098 xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3; 1099 } 1100 PetscFunctionReturn(0); 1101 } 1102 1103 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1104 { 1105 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1106 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1107 const MatScalar *aa=a->a; 1108 PetscScalar *x; 1109 const PetscScalar *b; 1110 PetscErrorCode ierr; 1111 1112 PetscFunctionBegin; 1113 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1114 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1115 1116 /* solve U^T * D * y = b by forward substitution */ 1117 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 1118 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1119 1120 /* solve U*x = y by back substitution */ 1121 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1122 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1123 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1124 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1125 PetscFunctionReturn(0); 1126 } 1127 1128 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1129 { 1130 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1131 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1132 const MatScalar *aa=a->a; 1133 PetscScalar *x; 1134 const PetscScalar *b; 1135 PetscErrorCode ierr; 1136 1137 PetscFunctionBegin; 1138 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1139 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1140 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 1141 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1142 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1143 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1144 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1145 PetscFunctionReturn(0); 1146 } 1147 1148 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1149 { 1150 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1151 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1152 const MatScalar *aa=a->a; 1153 PetscScalar *x; 1154 const PetscScalar *b; 1155 PetscErrorCode ierr; 1156 1157 PetscFunctionBegin; 1158 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1159 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1160 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1161 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1162 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1163 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1164 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1165 PetscFunctionReturn(0); 1166 } 1167 1168 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1169 { 1170 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1171 IS isrow=a->row; 1172 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1173 PetscErrorCode ierr; 1174 const PetscInt *r; 1175 PetscInt nz,k,idx; 1176 const PetscInt *vj; 1177 const MatScalar *aa=a->a,*v,*diag; 1178 PetscScalar *x,x0,x1,x2,*t,*tp; 1179 const PetscScalar *b; 1180 1181 PetscFunctionBegin; 1182 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1183 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1184 t = a->solve_work; 1185 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1186 1187 /* solve U^T * D * y = b by forward substitution */ 1188 tp = t; 1189 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1190 idx = 3*r[k]; 1191 tp[0] = b[idx]; 1192 tp[1] = b[idx+1]; 1193 tp[2] = b[idx+2]; 1194 tp += 3; 1195 } 1196 1197 for (k=0; k<mbs; k++) { 1198 v = aa + 9*ai[k]; 1199 vj = aj + ai[k]; 1200 tp = t + k*3; 1201 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; 1202 nz = ai[k+1] - ai[k]; 1203 1204 tp = t + (*vj)*3; 1205 while (nz--) { 1206 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1207 tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1208 tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1209 vj++; 1210 tp = t + (*vj)*3; 1211 v += 9; 1212 } 1213 1214 /* xk = inv(Dk)*(Dk*xk) */ 1215 diag = aa+k*9; /* ptr to inv(Dk) */ 1216 tp = t + k*3; 1217 tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1218 tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1219 tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1220 } 1221 1222 /* solve U*x = y by back substitution */ 1223 for (k=mbs-1; k>=0; k--) { 1224 v = aa + 9*ai[k]; 1225 vj = aj + ai[k]; 1226 tp = t + k*3; 1227 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */ 1228 nz = ai[k+1] - ai[k]; 1229 1230 tp = t + (*vj)*3; 1231 while (nz--) { 1232 /* xk += U(k,:)*x(:) */ 1233 x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2]; 1234 x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2]; 1235 x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2]; 1236 vj++; 1237 tp = t + (*vj)*3; 1238 v += 9; 1239 } 1240 tp = t + k*3; 1241 tp[0] = x0; tp[1] = x1; tp[2] = x2; 1242 idx = 3*r[k]; 1243 x[idx] = x0; 1244 x[idx+1] = x1; 1245 x[idx+2] = x2; 1246 } 1247 1248 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1249 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1250 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1251 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1252 PetscFunctionReturn(0); 1253 } 1254 1255 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1256 { 1257 const MatScalar *v,*diag; 1258 PetscScalar *xp,x0,x1,x2; 1259 PetscInt nz,k; 1260 const PetscInt *vj; 1261 1262 PetscFunctionBegin; 1263 for (k=0; k<mbs; k++) { 1264 v = aa + 9*ai[k]; 1265 xp = x + k*3; 1266 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */ 1267 nz = ai[k+1] - ai[k]; 1268 vj = aj + ai[k]; 1269 xp = x + (*vj)*3; 1270 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1271 PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1272 while (nz--) { 1273 /* x(:) += U(k,:)^T*(Dk*xk) */ 1274 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1275 xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1276 xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1277 vj++; 1278 xp = x + (*vj)*3; 1279 v += 9; 1280 } 1281 /* xk = inv(Dk)*(Dk*xk) */ 1282 diag = aa+k*9; /* ptr to inv(Dk) */ 1283 xp = x + k*3; 1284 xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1285 xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1286 xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1287 } 1288 PetscFunctionReturn(0); 1289 } 1290 1291 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1292 { 1293 const MatScalar *v; 1294 PetscScalar *xp,x0,x1,x2; 1295 PetscInt nz,k; 1296 const PetscInt *vj; 1297 1298 PetscFunctionBegin; 1299 for (k=mbs-1; k>=0; k--) { 1300 v = aa + 9*ai[k]; 1301 xp = x + k*3; 1302 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */ 1303 nz = ai[k+1] - ai[k]; 1304 vj = aj + ai[k]; 1305 xp = x + (*vj)*3; 1306 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1307 PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1308 while (nz--) { 1309 /* xk += U(k,:)*x(:) */ 1310 x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2]; 1311 x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2]; 1312 x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2]; 1313 vj++; 1314 v += 9; xp = x + (*vj)*3; 1315 } 1316 xp = x + k*3; 1317 xp[0] = x0; xp[1] = x1; xp[2] = x2; 1318 } 1319 PetscFunctionReturn(0); 1320 } 1321 1322 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1323 { 1324 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1325 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1326 const MatScalar *aa=a->a; 1327 PetscScalar *x; 1328 const PetscScalar *b; 1329 PetscErrorCode ierr; 1330 1331 PetscFunctionBegin; 1332 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1333 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1334 1335 /* solve U^T * D * y = b by forward substitution */ 1336 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1337 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1338 1339 /* solve U*x = y by back substitution */ 1340 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1341 1342 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1343 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1344 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1345 PetscFunctionReturn(0); 1346 } 1347 1348 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1349 { 1350 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1351 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1352 const MatScalar *aa=a->a; 1353 PetscScalar *x; 1354 const PetscScalar *b; 1355 PetscErrorCode ierr; 1356 1357 PetscFunctionBegin; 1358 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1359 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1360 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1361 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1362 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1363 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1364 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1365 PetscFunctionReturn(0); 1366 } 1367 1368 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1369 { 1370 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1371 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1372 const MatScalar *aa=a->a; 1373 PetscScalar *x; 1374 const PetscScalar *b; 1375 PetscErrorCode ierr; 1376 1377 PetscFunctionBegin; 1378 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1379 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1380 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1381 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1382 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1383 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1384 ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr); 1385 PetscFunctionReturn(0); 1386 } 1387 1388 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1389 { 1390 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1391 IS isrow=a->row; 1392 const PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 1393 PetscErrorCode ierr; 1394 const PetscInt *r,*vj; 1395 PetscInt nz,k,k2,idx; 1396 const MatScalar *aa=a->a,*v,*diag; 1397 PetscScalar *x,x0,x1,*t; 1398 const PetscScalar *b; 1399 1400 PetscFunctionBegin; 1401 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1402 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1403 t = a->solve_work; 1404 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1405 1406 /* solve U^T * D * y = perm(b) by forward substitution */ 1407 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1408 idx = 2*r[k]; 1409 t[k*2] = b[idx]; 1410 t[k*2+1] = b[idx+1]; 1411 } 1412 for (k=0; k<mbs; k++) { 1413 v = aa + 4*ai[k]; 1414 vj = aj + ai[k]; 1415 k2 = k*2; 1416 x0 = t[k2]; x1 = t[k2+1]; 1417 nz = ai[k+1] - ai[k]; 1418 while (nz--) { 1419 t[(*vj)*2] += v[0]*x0 + v[1]*x1; 1420 t[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1421 vj++; v += 4; 1422 } 1423 diag = aa+k*4; /* ptr to inv(Dk) */ 1424 t[k2] = diag[0]*x0 + diag[2]*x1; 1425 t[k2+1] = diag[1]*x0 + diag[3]*x1; 1426 } 1427 1428 /* solve U*x = y by back substitution */ 1429 for (k=mbs-1; k>=0; k--) { 1430 v = aa + 4*ai[k]; 1431 vj = aj + ai[k]; 1432 k2 = k*2; 1433 x0 = t[k2]; x1 = t[k2+1]; 1434 nz = ai[k+1] - ai[k]; 1435 while (nz--) { 1436 x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1]; 1437 x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1]; 1438 vj++; 1439 v += 4; 1440 } 1441 t[k2] = x0; 1442 t[k2+1] = x1; 1443 idx = 2*r[k]; 1444 x[idx] = x0; 1445 x[idx+1] = x1; 1446 } 1447 1448 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1449 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1450 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1451 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1452 PetscFunctionReturn(0); 1453 } 1454 1455 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1456 { 1457 const MatScalar *v,*diag; 1458 PetscScalar x0,x1; 1459 PetscInt nz,k,k2; 1460 const PetscInt *vj; 1461 1462 PetscFunctionBegin; 1463 for (k=0; k<mbs; k++) { 1464 v = aa + 4*ai[k]; 1465 vj = aj + ai[k]; 1466 k2 = k*2; 1467 x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */ 1468 nz = ai[k+1] - ai[k]; 1469 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1470 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1471 while (nz--) { 1472 /* x(:) += U(k,:)^T*(Dk*xk) */ 1473 x[(*vj)*2] += v[0]*x0 + v[1]*x1; 1474 x[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1475 vj++; v += 4; 1476 } 1477 /* xk = inv(Dk)*(Dk*xk) */ 1478 diag = aa+k*4; /* ptr to inv(Dk) */ 1479 x[k2] = diag[0]*x0 + diag[2]*x1; 1480 x[k2+1] = diag[1]*x0 + diag[3]*x1; 1481 } 1482 PetscFunctionReturn(0); 1483 } 1484 1485 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x) 1486 { 1487 const MatScalar *v; 1488 PetscScalar x0,x1; 1489 PetscInt nz,k,k2; 1490 const PetscInt *vj; 1491 1492 PetscFunctionBegin; 1493 for (k=mbs-1; k>=0; k--) { 1494 v = aa + 4*ai[k]; 1495 vj = aj + ai[k]; 1496 k2 = k*2; 1497 x0 = x[k2]; x1 = x[k2+1]; /* xk */ 1498 nz = ai[k+1] - ai[k]; 1499 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1500 PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1501 while (nz--) { 1502 /* xk += U(k,:)*x(:) */ 1503 x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1]; 1504 x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1]; 1505 vj++; 1506 v += 4; 1507 } 1508 x[k2] = x0; 1509 x[k2+1] = x1; 1510 } 1511 PetscFunctionReturn(0); 1512 } 1513 1514 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1515 { 1516 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1517 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1518 const MatScalar *aa=a->a; 1519 PetscScalar *x; 1520 const PetscScalar *b; 1521 PetscErrorCode ierr; 1522 1523 PetscFunctionBegin; 1524 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1525 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1526 1527 /* solve U^T * D * y = b by forward substitution */ 1528 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1529 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1530 1531 /* solve U*x = y by back substitution */ 1532 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1533 1534 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1535 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1536 ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr); 1537 PetscFunctionReturn(0); 1538 } 1539 1540 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1541 { 1542 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1543 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1544 const MatScalar *aa=a->a; 1545 PetscScalar *x; 1546 const PetscScalar *b; 1547 PetscErrorCode ierr; 1548 1549 PetscFunctionBegin; 1550 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1551 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1552 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1553 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1554 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1555 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1556 ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr); 1557 PetscFunctionReturn(0); 1558 } 1559 1560 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1561 { 1562 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1563 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 1564 const MatScalar *aa=a->a; 1565 PetscScalar *x; 1566 const PetscScalar *b; 1567 PetscErrorCode ierr; 1568 1569 PetscFunctionBegin; 1570 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1571 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1572 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1573 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1574 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1575 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1576 ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr); 1577 PetscFunctionReturn(0); 1578 } 1579 1580 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1581 { 1582 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1583 IS isrow=a->row; 1584 PetscErrorCode ierr; 1585 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1586 const MatScalar *aa=a->a,*v; 1587 const PetscScalar *b; 1588 PetscScalar *x,xk,*t; 1589 PetscInt nz,k,j; 1590 1591 PetscFunctionBegin; 1592 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1593 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1594 t = a->solve_work; 1595 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1596 1597 /* solve U^T*D*y = perm(b) by forward substitution */ 1598 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1599 for (k=0; k<mbs; k++) { 1600 v = aa + ai[k]; 1601 vj = aj + ai[k]; 1602 xk = t[k]; 1603 nz = ai[k+1] - ai[k] - 1; 1604 for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk; 1605 t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */ 1606 } 1607 1608 /* solve U*perm(x) = y by back substitution */ 1609 for (k=mbs-1; k>=0; k--) { 1610 v = aa + adiag[k] - 1; 1611 vj = aj + adiag[k] - 1; 1612 nz = ai[k+1] - ai[k] - 1; 1613 for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]]; 1614 x[rp[k]] = t[k]; 1615 } 1616 1617 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1618 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1619 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1620 ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr); 1621 PetscFunctionReturn(0); 1622 } 1623 1624 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1625 { 1626 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1627 IS isrow=a->row; 1628 PetscErrorCode ierr; 1629 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1630 const MatScalar *aa=a->a,*v; 1631 PetscScalar *x,xk,*t; 1632 const PetscScalar *b; 1633 PetscInt nz,k; 1634 1635 PetscFunctionBegin; 1636 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1637 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1638 t = a->solve_work; 1639 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1640 1641 /* solve U^T*D*y = perm(b) by forward substitution */ 1642 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1643 for (k=0; k<mbs; k++) { 1644 v = aa + ai[k] + 1; 1645 vj = aj + ai[k] + 1; 1646 xk = t[k]; 1647 nz = ai[k+1] - ai[k] - 1; 1648 while (nz--) t[*vj++] += (*v++) * xk; 1649 t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */ 1650 } 1651 1652 /* solve U*perm(x) = y by back substitution */ 1653 for (k=mbs-1; k>=0; k--) { 1654 v = aa + ai[k] + 1; 1655 vj = aj + ai[k] + 1; 1656 nz = ai[k+1] - ai[k] - 1; 1657 while (nz--) t[k] += (*v++) * t[*vj++]; 1658 x[rp[k]] = t[k]; 1659 } 1660 1661 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1662 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1663 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1664 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1665 PetscFunctionReturn(0); 1666 } 1667 1668 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1669 { 1670 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1671 IS isrow=a->row; 1672 PetscErrorCode ierr; 1673 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1674 const MatScalar *aa=a->a,*v; 1675 PetscReal diagk; 1676 PetscScalar *x,xk; 1677 const PetscScalar *b; 1678 PetscInt nz,k; 1679 1680 PetscFunctionBegin; 1681 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1682 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1683 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1684 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1685 1686 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1687 for (k=0; k<mbs; k++) { 1688 v = aa + ai[k]; 1689 vj = aj + ai[k]; 1690 xk = x[k]; 1691 nz = ai[k+1] - ai[k] - 1; 1692 while (nz--) x[*vj++] += (*v++) * xk; 1693 1694 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1695 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1696 x[k] = xk*PetscSqrtReal(diagk); 1697 } 1698 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1699 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1700 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1701 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1702 PetscFunctionReturn(0); 1703 } 1704 1705 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1706 { 1707 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1708 IS isrow=a->row; 1709 PetscErrorCode ierr; 1710 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1711 const MatScalar *aa=a->a,*v; 1712 PetscReal diagk; 1713 PetscScalar *x,xk; 1714 const PetscScalar *b; 1715 PetscInt nz,k; 1716 1717 PetscFunctionBegin; 1718 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1719 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1720 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1721 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1722 1723 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1724 for (k=0; k<mbs; k++) { 1725 v = aa + ai[k] + 1; 1726 vj = aj + ai[k] + 1; 1727 xk = x[k]; 1728 nz = ai[k+1] - ai[k] - 1; 1729 while (nz--) x[*vj++] += (*v++) * xk; 1730 1731 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1732 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1733 x[k] = xk*PetscSqrtReal(diagk); 1734 } 1735 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1736 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1737 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1738 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1739 PetscFunctionReturn(0); 1740 } 1741 1742 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1743 { 1744 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1745 IS isrow=a->row; 1746 PetscErrorCode ierr; 1747 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1748 const MatScalar *aa=a->a,*v; 1749 PetscReal diagk; 1750 PetscScalar *x,*t; 1751 const PetscScalar *b; 1752 PetscInt nz,k; 1753 1754 PetscFunctionBegin; 1755 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1756 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1757 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1758 t = a->solve_work; 1759 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1760 1761 for (k=mbs-1; k>=0; k--) { 1762 v = aa + ai[k]; 1763 vj = aj + ai[k]; 1764 diagk = PetscRealPart(aa[adiag[k]]); 1765 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1766 t[k] = b[k] * PetscSqrtReal(diagk); 1767 nz = ai[k+1] - ai[k] - 1; 1768 while (nz--) t[k] += (*v++) * t[*vj++]; 1769 x[rp[k]] = t[k]; 1770 } 1771 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1772 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1773 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1774 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1775 PetscFunctionReturn(0); 1776 } 1777 1778 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1779 { 1780 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1781 IS isrow=a->row; 1782 PetscErrorCode ierr; 1783 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1784 const MatScalar *aa=a->a,*v; 1785 PetscReal diagk; 1786 PetscScalar *x,*t; 1787 const PetscScalar *b; 1788 PetscInt nz,k; 1789 1790 PetscFunctionBegin; 1791 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1792 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1793 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1794 t = a->solve_work; 1795 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1796 1797 for (k=mbs-1; k>=0; k--) { 1798 v = aa + ai[k] + 1; 1799 vj = aj + ai[k] + 1; 1800 diagk = PetscRealPart(aa[ai[k]]); 1801 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1802 t[k] = b[k] * PetscSqrtReal(diagk); 1803 nz = ai[k+1] - ai[k] - 1; 1804 while (nz--) t[k] += (*v++) * t[*vj++]; 1805 x[rp[k]] = t[k]; 1806 } 1807 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1808 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1809 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1810 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1811 PetscFunctionReturn(0); 1812 } 1813 1814 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx) 1815 { 1816 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1817 PetscErrorCode ierr; 1818 1819 PetscFunctionBegin; 1820 if (A->rmap->bs == 1) { 1821 ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr); 1822 } else { 1823 IS isrow=a->row; 1824 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1825 const MatScalar *aa=a->a,*v; 1826 PetscScalar *x,*t; 1827 const PetscScalar *b; 1828 PetscInt nz,k,n,i,j; 1829 1830 if (bb->n > a->solves_work_n) { 1831 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1832 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1833 a->solves_work_n = bb->n; 1834 } 1835 n = bb->n; 1836 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1837 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1838 t = a->solves_work; 1839 1840 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1841 1842 /* solve U^T*D*y = perm(b) by forward substitution */ 1843 for (k=0; k<mbs; k++) { 1844 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1845 } 1846 for (k=0; k<mbs; k++) { 1847 v = aa + ai[k]; 1848 vj = aj + ai[k]; 1849 nz = ai[k+1] - ai[k] - 1; 1850 for (j=0; j<nz; j++) { 1851 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1852 v++;vj++; 1853 } 1854 for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1855 } 1856 1857 /* solve U*perm(x) = y by back substitution */ 1858 for (k=mbs-1; k>=0; k--) { 1859 v = aa + ai[k] - 1; 1860 vj = aj + ai[k] - 1; 1861 nz = ai[k+1] - ai[k] - 1; 1862 for (j=0; j<nz; j++) { 1863 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1864 v++;vj++; 1865 } 1866 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1867 } 1868 1869 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1870 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1871 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1872 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1873 } 1874 PetscFunctionReturn(0); 1875 } 1876 1877 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx) 1878 { 1879 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1880 PetscErrorCode ierr; 1881 1882 PetscFunctionBegin; 1883 if (A->rmap->bs == 1) { 1884 ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr); 1885 } else { 1886 IS isrow=a->row; 1887 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1888 const MatScalar *aa=a->a,*v; 1889 PetscScalar *x,*t; 1890 const PetscScalar *b; 1891 PetscInt nz,k,n,i; 1892 1893 if (bb->n > a->solves_work_n) { 1894 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1895 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1896 a->solves_work_n = bb->n; 1897 } 1898 n = bb->n; 1899 ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr); 1900 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1901 t = a->solves_work; 1902 1903 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1904 1905 /* solve U^T*D*y = perm(b) by forward substitution */ 1906 for (k=0; k<mbs; k++) { 1907 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1908 } 1909 for (k=0; k<mbs; k++) { 1910 v = aa + ai[k]; 1911 vj = aj + ai[k]; 1912 nz = ai[k+1] - ai[k]; 1913 while (nz--) { 1914 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1915 v++;vj++; 1916 } 1917 for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1918 } 1919 1920 /* solve U*perm(x) = y by back substitution */ 1921 for (k=mbs-1; k>=0; k--) { 1922 v = aa + ai[k]; 1923 vj = aj + ai[k]; 1924 nz = ai[k+1] - ai[k]; 1925 while (nz--) { 1926 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1927 v++;vj++; 1928 } 1929 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1930 } 1931 1932 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1933 ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr); 1934 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1935 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1936 } 1937 PetscFunctionReturn(0); 1938 } 1939 1940 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1941 { 1942 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1943 PetscErrorCode ierr; 1944 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag; 1945 const MatScalar *aa=a->a,*v; 1946 const PetscScalar *b; 1947 PetscScalar *x,xi; 1948 PetscInt nz,i,j; 1949 1950 PetscFunctionBegin; 1951 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1952 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1953 1954 /* solve U^T*D*y = b by forward substitution */ 1955 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1956 for (i=0; i<mbs; i++) { 1957 v = aa + ai[i]; 1958 vj = aj + ai[i]; 1959 xi = x[i]; 1960 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 1961 for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi; 1962 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 1963 } 1964 1965 /* solve U*x = y by backward substitution */ 1966 for (i=mbs-2; i>=0; i--) { 1967 xi = x[i]; 1968 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 1969 vj = aj + adiag[i] - 1; 1970 nz = ai[i+1] - ai[i] - 1; 1971 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 1972 x[i] = xi; 1973 } 1974 1975 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1976 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1977 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1978 PetscFunctionReturn(0); 1979 } 1980 1981 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1982 { 1983 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1984 PetscErrorCode ierr; 1985 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 1986 const MatScalar *aa=a->a,*v; 1987 PetscScalar *x,xk; 1988 const PetscScalar *b; 1989 PetscInt nz,k; 1990 1991 PetscFunctionBegin; 1992 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1993 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1994 1995 /* solve U^T*D*y = b by forward substitution */ 1996 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1997 for (k=0; k<mbs; k++) { 1998 v = aa + ai[k] + 1; 1999 vj = aj + ai[k] + 1; 2000 xk = x[k]; 2001 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2002 while (nz--) x[*vj++] += (*v++) * xk; 2003 x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2004 } 2005 2006 /* solve U*x = y by back substitution */ 2007 for (k=mbs-2; k>=0; k--) { 2008 v = aa + ai[k] + 1; 2009 vj = aj + ai[k] + 1; 2010 xk = x[k]; 2011 nz = ai[k+1] - ai[k] - 1; 2012 while (nz--) { 2013 xk += (*v++) * x[*vj++]; 2014 } 2015 x[k] = xk; 2016 } 2017 2018 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2019 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2020 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2021 PetscFunctionReturn(0); 2022 } 2023 2024 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2025 { 2026 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2027 PetscErrorCode ierr; 2028 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj; 2029 const MatScalar *aa=a->a,*v; 2030 PetscReal diagk; 2031 PetscScalar *x; 2032 const PetscScalar *b; 2033 PetscInt nz,k; 2034 2035 PetscFunctionBegin; 2036 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2037 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2038 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2039 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2040 for (k=0; k<mbs; k++) { 2041 v = aa + ai[k]; 2042 vj = aj + ai[k]; 2043 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2044 while (nz--) x[*vj++] += (*v++) * x[k]; 2045 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2046 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]])); 2047 x[k] *= PetscSqrtReal(diagk); 2048 } 2049 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2050 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2051 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2052 PetscFunctionReturn(0); 2053 } 2054 2055 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2056 { 2057 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2058 PetscErrorCode ierr; 2059 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2060 const MatScalar *aa=a->a,*v; 2061 PetscReal diagk; 2062 PetscScalar *x; 2063 const PetscScalar *b; 2064 PetscInt nz,k; 2065 2066 PetscFunctionBegin; 2067 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2068 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2069 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2070 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2071 for (k=0; k<mbs; k++) { 2072 v = aa + ai[k] + 1; 2073 vj = aj + ai[k] + 1; 2074 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2075 while (nz--) x[*vj++] += (*v++) * x[k]; 2076 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2077 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]])); 2078 x[k] *= PetscSqrtReal(diagk); 2079 } 2080 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2081 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2082 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2083 PetscFunctionReturn(0); 2084 } 2085 2086 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2087 { 2088 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2089 PetscErrorCode ierr; 2090 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag,*vj; 2091 const MatScalar *aa=a->a,*v; 2092 PetscReal diagk; 2093 PetscScalar *x; 2094 const PetscScalar *b; 2095 PetscInt nz,k; 2096 2097 PetscFunctionBegin; 2098 /* solve D^(1/2)*U*x = b by back substitution */ 2099 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2100 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2101 2102 for (k=mbs-1; k>=0; k--) { 2103 v = aa + ai[k]; 2104 vj = aj + ai[k]; 2105 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2106 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2107 x[k] = PetscSqrtReal(diagk)*b[k]; 2108 nz = ai[k+1] - ai[k] - 1; 2109 while (nz--) x[k] += (*v++) * x[*vj++]; 2110 } 2111 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2113 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2114 PetscFunctionReturn(0); 2115 } 2116 2117 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2118 { 2119 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2120 PetscErrorCode ierr; 2121 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj; 2122 const MatScalar *aa=a->a,*v; 2123 PetscReal diagk; 2124 PetscScalar *x; 2125 const PetscScalar *b; 2126 PetscInt nz,k; 2127 2128 PetscFunctionBegin; 2129 /* solve D^(1/2)*U*x = b by back substitution */ 2130 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2131 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2132 2133 for (k=mbs-1; k>=0; k--) { 2134 v = aa + ai[k] + 1; 2135 vj = aj + ai[k] + 1; 2136 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2137 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2138 x[k] = PetscSqrtReal(diagk)*b[k]; 2139 nz = ai[k+1] - ai[k] - 1; 2140 while (nz--) x[k] += (*v++) * x[*vj++]; 2141 } 2142 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2143 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2144 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2145 PetscFunctionReturn(0); 2146 } 2147 2148 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2149 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2150 { 2151 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2152 PetscErrorCode ierr; 2153 const PetscInt *rip,mbs = a->mbs,*ai,*aj; 2154 PetscInt *jutmp,bs = A->rmap->bs,i; 2155 PetscInt m,reallocs = 0,*levtmp; 2156 PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2157 PetscInt incrlev,*lev,shift,prow,nz; 2158 PetscReal f = info->fill,levels = info->levels; 2159 PetscBool perm_identity; 2160 2161 PetscFunctionBegin; 2162 /* check whether perm is the identity mapping */ 2163 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2164 2165 if (perm_identity) { 2166 a->permute = PETSC_FALSE; 2167 ai = a->i; aj = a->j; 2168 } else { /* non-trivial permutation */ 2169 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2170 a->permute = PETSC_TRUE; 2171 ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr); 2172 ai = a->inew; aj = a->jnew; 2173 } 2174 2175 /* initialization */ 2176 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2177 umax = (PetscInt)(f*ai[mbs] + 1); 2178 ierr = PetscMalloc1(umax,&lev);CHKERRQ(ierr); 2179 umax += mbs + 1; 2180 shift = mbs + 1; 2181 ierr = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr); 2182 ierr = PetscMalloc1(umax,&ju);CHKERRQ(ierr); 2183 iu[0] = mbs + 1; 2184 juidx = mbs + 1; 2185 /* prowl: linked list for pivot row */ 2186 ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr); 2187 /* q: linked list for col index */ 2188 2189 for (i=0; i<mbs; i++) { 2190 prowl[i] = mbs; 2191 q[i] = 0; 2192 } 2193 2194 /* for each row k */ 2195 for (k=0; k<mbs; k++) { 2196 nzk = 0; 2197 q[k] = mbs; 2198 /* copy current row into linked list */ 2199 nz = ai[rip[k]+1] - ai[rip[k]]; 2200 j = ai[rip[k]]; 2201 while (nz--) { 2202 vj = rip[aj[j++]]; 2203 if (vj > k) { 2204 qm = k; 2205 do { 2206 m = qm; qm = q[m]; 2207 } while (qm < vj); 2208 if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n"); 2209 nzk++; 2210 q[m] = vj; 2211 q[vj] = qm; 2212 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2213 } 2214 } 2215 2216 /* modify nonzero structure of k-th row by computing fill-in 2217 for each row prow to be merged in */ 2218 prow = k; 2219 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2220 2221 while (prow < k) { 2222 /* merge row prow into k-th row */ 2223 jmin = iu[prow] + 1; 2224 jmax = iu[prow+1]; 2225 qm = k; 2226 for (j=jmin; j<jmax; j++) { 2227 incrlev = lev[j-shift] + 1; 2228 if (incrlev > levels) continue; 2229 vj = ju[j]; 2230 do { 2231 m = qm; qm = q[m]; 2232 } while (qm < vj); 2233 if (qm != vj) { /* a fill */ 2234 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2235 levtmp[vj] = incrlev; 2236 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2237 } 2238 prow = prowl[prow]; /* next pivot row */ 2239 } 2240 2241 /* add k to row list for first nonzero element in k-th row */ 2242 if (nzk > 1) { 2243 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2244 prowl[k] = prowl[i]; prowl[i] = k; 2245 } 2246 iu[k+1] = iu[k] + nzk; 2247 2248 /* allocate more space to ju and lev if needed */ 2249 if (iu[k+1] > umax) { 2250 /* estimate how much additional space we will need */ 2251 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2252 /* just double the memory each time */ 2253 maxadd = umax; 2254 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2255 umax += maxadd; 2256 2257 /* allocate a longer ju */ 2258 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2259 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr); 2260 ierr = PetscFree(ju);CHKERRQ(ierr); 2261 ju = jutmp; 2262 2263 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2264 ierr = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr); 2265 ierr = PetscFree(lev);CHKERRQ(ierr); 2266 lev = jutmp; 2267 reallocs += 2; /* count how many times we realloc */ 2268 } 2269 2270 /* save nonzero structure of k-th row in ju */ 2271 i=k; 2272 while (nzk--) { 2273 i = q[i]; 2274 ju[juidx] = i; 2275 lev[juidx-shift] = levtmp[i]; 2276 juidx++; 2277 } 2278 } 2279 2280 #if defined(PETSC_USE_INFO) 2281 if (ai[mbs] != 0) { 2282 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2283 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr); 2284 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2285 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr); 2286 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 2287 } else { 2288 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2289 } 2290 #endif 2291 2292 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2293 ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr); 2294 ierr = PetscFree(lev);CHKERRQ(ierr); 2295 2296 /* put together the new matrix */ 2297 ierr = MatSeqSBAIJSetPreallocation(B,bs,0,NULL);CHKERRQ(ierr); 2298 2299 /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */ 2300 b = (Mat_SeqSBAIJ*)(B)->data; 2301 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2302 2303 b->singlemalloc = PETSC_FALSE; 2304 b->free_a = PETSC_TRUE; 2305 b->free_ij = PETSC_TRUE; 2306 /* the next line frees the default space generated by the Create() */ 2307 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 2308 ierr = PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);CHKERRQ(ierr); 2309 b->j = ju; 2310 b->i = iu; 2311 b->diag = 0; 2312 b->ilen = 0; 2313 b->imax = 0; 2314 2315 ierr = ISDestroy(&b->row);CHKERRQ(ierr); 2316 ierr = ISDestroy(&b->icol);CHKERRQ(ierr); 2317 b->row = perm; 2318 b->icol = perm; 2319 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2320 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2321 ierr = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr); 2322 /* In b structure: Free imax, ilen, old a, old j. 2323 Allocate idnew, solve_work, new a, new j */ 2324 ierr = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2325 b->maxnz = b->nz = iu[mbs]; 2326 2327 (B)->info.factor_mallocs = reallocs; 2328 (B)->info.fill_ratio_given = f; 2329 if (ai[mbs] != 0) { 2330 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2331 } else { 2332 (B)->info.fill_ratio_needed = 0.0; 2333 } 2334 ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr); 2335 PetscFunctionReturn(0); 2336 } 2337 2338 /* 2339 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2340 */ 2341 #include <petscbt.h> 2342 #include <../src/mat/utils/freespace.h> 2343 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2344 { 2345 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2346 PetscErrorCode ierr; 2347 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2348 PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j; 2349 const PetscInt *rip; 2350 PetscInt reallocs=0,i,*ui,*udiag,*cols; 2351 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2352 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr; 2353 PetscReal fill =info->fill,levels=info->levels; 2354 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2355 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2356 PetscBT lnkbt; 2357 2358 PetscFunctionBegin; 2359 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 2360 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2361 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2362 if (bs > 1) { 2363 ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr); 2364 PetscFunctionReturn(0); 2365 } 2366 2367 /* check whether perm is the identity mapping */ 2368 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2369 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2370 a->permute = PETSC_FALSE; 2371 2372 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2373 ierr = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr); 2374 ui[0] = 0; 2375 2376 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2377 if (!levels) { 2378 /* reuse the column pointers and row offsets for memory savings */ 2379 for (i=0; i<am; i++) { 2380 ncols = ai[i+1] - ai[i]; 2381 ui[i+1] = ui[i] + ncols; 2382 udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */ 2383 } 2384 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2385 cols = uj; 2386 for (i=0; i<am; i++) { 2387 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2388 ncols = ai[i+1] - ai[i] -1; 2389 for (j=0; j<ncols; j++) *cols++ = aj[j]; 2390 *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */ 2391 } 2392 } else { /* case: levels>0 */ 2393 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2394 2395 /* initialization */ 2396 /* jl: linked list for storing indices of the pivot rows 2397 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2398 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2399 for (i=0; i<am; i++) { 2400 jl[i] = am; il[i] = 0; 2401 } 2402 2403 /* create and initialize a linked list for storing column indices of the active row k */ 2404 nlnk = am + 1; 2405 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2406 2407 /* initial FreeSpace size is fill*(ai[am]+1) */ 2408 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2409 2410 current_space = free_space; 2411 2412 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2413 2414 current_space_lvl = free_space_lvl; 2415 2416 for (k=0; k<am; k++) { /* for each active row k */ 2417 /* initialize lnk by the column indices of row k */ 2418 nzk = 0; 2419 ncols = ai[k+1] - ai[k]; 2420 if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k); 2421 cols = aj+ai[k]; 2422 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2423 nzk += nlnk; 2424 2425 /* update lnk by computing fill-in for each pivot row to be merged in */ 2426 prow = jl[k]; /* 1st pivot row */ 2427 2428 while (prow < k) { 2429 nextprow = jl[prow]; 2430 2431 /* merge prow into k-th row */ 2432 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2433 jmax = ui[prow+1]; 2434 ncols = jmax-jmin; 2435 i = jmin - ui[prow]; 2436 2437 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2438 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2439 j = *(uj - 1); 2440 ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2441 nzk += nlnk; 2442 2443 /* update il and jl for prow */ 2444 if (jmin < jmax) { 2445 il[prow] = jmin; 2446 2447 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2448 } 2449 prow = nextprow; 2450 } 2451 2452 /* if free space is not available, make more free space */ 2453 if (current_space->local_remaining<nzk) { 2454 i = am - k + 1; /* num of unfactored rows */ 2455 i = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2456 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2457 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2458 reallocs++; 2459 } 2460 2461 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2462 if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k); 2463 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2464 2465 /* add the k-th row into il and jl */ 2466 if (nzk > 1) { 2467 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2468 jl[k] = jl[i]; jl[i] = k; 2469 il[k] = ui[k] + 1; 2470 } 2471 uj_ptr[k] = current_space->array; 2472 uj_lvl_ptr[k] = current_space_lvl->array; 2473 2474 current_space->array += nzk; 2475 current_space->local_used += nzk; 2476 current_space->local_remaining -= nzk; 2477 current_space_lvl->array += nzk; 2478 current_space_lvl->local_used += nzk; 2479 current_space_lvl->local_remaining -= nzk; 2480 2481 ui[k+1] = ui[k] + nzk; 2482 } 2483 2484 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2485 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2486 2487 /* destroy list of free space and other temporary array(s) */ 2488 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2489 ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor */ 2490 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2491 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2492 2493 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2494 2495 /* put together the new matrix in MATSEQSBAIJ format */ 2496 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2497 2498 b = (Mat_SeqSBAIJ*)(fact)->data; 2499 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2500 2501 b->singlemalloc = PETSC_FALSE; 2502 b->free_a = PETSC_TRUE; 2503 b->free_ij = free_ij; 2504 2505 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2506 2507 b->j = uj; 2508 b->i = ui; 2509 b->diag = udiag; 2510 b->free_diag = PETSC_TRUE; 2511 b->ilen = 0; 2512 b->imax = 0; 2513 b->row = perm; 2514 b->col = perm; 2515 2516 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2517 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2518 2519 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2520 2521 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2522 ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2523 2524 b->maxnz = b->nz = ui[am]; 2525 2526 fact->info.factor_mallocs = reallocs; 2527 fact->info.fill_ratio_given = fill; 2528 if (ai[am] != 0) { 2529 fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am]; 2530 } else { 2531 fact->info.fill_ratio_needed = 0.0; 2532 } 2533 #if defined(PETSC_USE_INFO) 2534 if (ai[am] != 0) { 2535 PetscReal af = fact->info.fill_ratio_needed; 2536 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2537 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2538 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2539 } else { 2540 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2541 } 2542 #endif 2543 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2544 PetscFunctionReturn(0); 2545 } 2546 2547 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2548 { 2549 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2550 Mat_SeqSBAIJ *b; 2551 PetscErrorCode ierr; 2552 PetscBool perm_identity,free_ij = PETSC_TRUE; 2553 PetscInt bs=A->rmap->bs,am=a->mbs; 2554 const PetscInt *cols,*rip,*ai=a->i,*aj=a->j; 2555 PetscInt reallocs=0,i,*ui; 2556 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2557 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 2558 PetscReal fill =info->fill,levels=info->levels,ratio_needed; 2559 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2560 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2561 PetscBT lnkbt; 2562 2563 PetscFunctionBegin; 2564 /* 2565 This code originally uses Modified Sparse Row (MSR) storage 2566 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2567 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2568 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2569 thus the original code in MSR format is still used for these cases. 2570 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2571 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2572 */ 2573 if (bs > 1) { 2574 ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr); 2575 PetscFunctionReturn(0); 2576 } 2577 2578 /* check whether perm is the identity mapping */ 2579 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2580 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2581 a->permute = PETSC_FALSE; 2582 2583 /* special case that simply copies fill pattern */ 2584 if (!levels) { 2585 /* reuse the column pointers and row offsets for memory savings */ 2586 ui = a->i; 2587 uj = a->j; 2588 free_ij = PETSC_FALSE; 2589 ratio_needed = 1.0; 2590 } else { /* case: levels>0 */ 2591 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2592 2593 /* initialization */ 2594 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2595 ui[0] = 0; 2596 2597 /* jl: linked list for storing indices of the pivot rows 2598 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2599 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2600 for (i=0; i<am; i++) { 2601 jl[i] = am; il[i] = 0; 2602 } 2603 2604 /* create and initialize a linked list for storing column indices of the active row k */ 2605 nlnk = am + 1; 2606 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2607 2608 /* initial FreeSpace size is fill*(ai[am]+1) */ 2609 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr); 2610 2611 current_space = free_space; 2612 2613 ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr); 2614 2615 current_space_lvl = free_space_lvl; 2616 2617 for (k=0; k<am; k++) { /* for each active row k */ 2618 /* initialize lnk by the column indices of row rip[k] */ 2619 nzk = 0; 2620 ncols = ai[rip[k]+1] - ai[rip[k]]; 2621 cols = aj+ai[rip[k]]; 2622 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2623 nzk += nlnk; 2624 2625 /* update lnk by computing fill-in for each pivot row to be merged in */ 2626 prow = jl[k]; /* 1st pivot row */ 2627 2628 while (prow < k) { 2629 nextprow = jl[prow]; 2630 2631 /* merge prow into k-th row */ 2632 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2633 jmax = ui[prow+1]; 2634 ncols = jmax-jmin; 2635 i = jmin - ui[prow]; 2636 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2637 j = *(uj_lvl_ptr[prow] + i - 1); 2638 cols_lvl = uj_lvl_ptr[prow]+i; 2639 ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2640 nzk += nlnk; 2641 2642 /* update il and jl for prow */ 2643 if (jmin < jmax) { 2644 il[prow] = jmin; 2645 j = *cols; 2646 jl[prow] = jl[j]; 2647 jl[j] = prow; 2648 } 2649 prow = nextprow; 2650 } 2651 2652 /* if free space is not available, make more free space */ 2653 if (current_space->local_remaining<nzk) { 2654 i = am - k + 1; /* num of unfactored rows */ 2655 i = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2656 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2657 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2658 reallocs++; 2659 } 2660 2661 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2662 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2663 2664 /* add the k-th row into il and jl */ 2665 if (nzk-1 > 0) { 2666 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2667 jl[k] = jl[i]; jl[i] = k; 2668 il[k] = ui[k] + 1; 2669 } 2670 uj_ptr[k] = current_space->array; 2671 uj_lvl_ptr[k] = current_space_lvl->array; 2672 2673 current_space->array += nzk; 2674 current_space->local_used += nzk; 2675 current_space->local_remaining -= nzk; 2676 current_space_lvl->array += nzk; 2677 current_space_lvl->local_used += nzk; 2678 current_space_lvl->local_remaining -= nzk; 2679 2680 ui[k+1] = ui[k] + nzk; 2681 } 2682 2683 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2684 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2685 2686 /* destroy list of free space and other temporary array(s) */ 2687 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2688 ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 2689 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2690 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2691 if (ai[am] != 0) { 2692 ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 2693 } else { 2694 ratio_needed = 0.0; 2695 } 2696 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2697 2698 /* put together the new matrix in MATSEQSBAIJ format */ 2699 ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2700 2701 b = (Mat_SeqSBAIJ*)(fact)->data; 2702 2703 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2704 2705 b->singlemalloc = PETSC_FALSE; 2706 b->free_a = PETSC_TRUE; 2707 b->free_ij = free_ij; 2708 2709 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2710 2711 b->j = uj; 2712 b->i = ui; 2713 b->diag = 0; 2714 b->ilen = 0; 2715 b->imax = 0; 2716 b->row = perm; 2717 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2718 2719 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2720 b->icol = perm; 2721 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2722 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2723 2724 b->maxnz = b->nz = ui[am]; 2725 2726 fact->info.factor_mallocs = reallocs; 2727 fact->info.fill_ratio_given = fill; 2728 fact->info.fill_ratio_needed = ratio_needed; 2729 #if defined(PETSC_USE_INFO) 2730 if (ai[am] != 0) { 2731 PetscReal af = fact->info.fill_ratio_needed; 2732 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2733 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2734 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2735 } else { 2736 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2737 } 2738 #endif 2739 if (perm_identity) { 2740 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2741 } else { 2742 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2743 } 2744 PetscFunctionReturn(0); 2745 } 2746 2747