1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc-private/kernels/blockinvert.h> 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_inplace" 12 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 13 { 14 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 15 IS isrow=a->row; 16 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j; 17 PetscErrorCode ierr; 18 const PetscInt *r; 19 PetscInt nz,*vj,k,idx,k1; 20 PetscInt bs =A->rmap->bs,bs2 = a->bs2; 21 MatScalar *aa=a->a,*v,*diag; 22 PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t; 23 24 PetscFunctionBegin; 25 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 26 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27 t = a->solve_work; 28 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 29 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 30 31 /* solve U^T * D * y = b by forward substitution */ 32 xk = t; 33 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 34 idx = bs*r[k]; 35 for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1]; 36 } 37 for (k=0; k<mbs; k++) { 38 v = aa + bs2*ai[k]; 39 xk = t + k*bs; /* Dk*xk = k-th block of x */ 40 ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */ 41 nz = ai[k+1] - ai[k]; 42 vj = aj + ai[k]; 43 xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */ 44 while (nz--) { 45 /* x(:) += U(k,:)^T*(Dk*xk) */ 46 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 47 vj++; xj = t + (*vj)*bs; 48 v += bs2; 49 } 50 /* xk = inv(Dk)*(Dk*xk) */ 51 diag = aa+k*bs2; /* ptr to inv(Dk) */ 52 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 53 } 54 55 /* solve U*x = y by back substitution */ 56 for (k=mbs-1; k>=0; k--) { 57 v = aa + bs2*ai[k]; 58 xk = t + k*bs; /* xk */ 59 nz = ai[k+1] - ai[k]; 60 vj = aj + ai[k]; 61 xj = t + (*vj)*bs; 62 while (nz--) { 63 /* xk += U(k,:)*x(:) */ 64 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 65 vj++; 66 v += bs2; xj = t + (*vj)*bs; 67 } 68 idx = bs*r[k]; 69 for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++; 70 } 71 72 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 73 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 74 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 75 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 76 ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr); 77 PetscFunctionReturn(0); 78 } 79 80 #undef __FUNCT__ 81 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_inplace" 82 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 83 { 84 PetscFunctionBegin; 85 SETERRQ(PETSC_COMM_SELF,1,"not implemented yet"); 86 PetscFunctionReturn(0); 87 } 88 89 #undef __FUNCT__ 90 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_inplace" 91 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 92 { 93 PetscFunctionBegin; 94 SETERRQ(PETSC_COMM_SELF,1,"not implemented yet"); 95 PetscFunctionReturn(0); 96 } 97 98 #undef __FUNCT__ 99 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering" 100 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 101 { 102 PetscErrorCode ierr; 103 PetscInt nz,*vj,k; 104 PetscInt bs2 = bs*bs; 105 MatScalar *v,*diag; 106 PetscScalar *xk,*xj,*xk_tmp; 107 108 PetscFunctionBegin; 109 ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr); 110 for (k=0; k<mbs; k++) { 111 v = aa + bs2*ai[k]; 112 xk = x + k*bs; /* Dk*xk = k-th block of x */ 113 ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */ 114 nz = ai[k+1] - ai[k]; 115 vj = aj + ai[k]; 116 xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */ 117 while (nz--) { 118 /* x(:) += U(k,:)^T*(Dk*xk) */ 119 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 120 vj++; xj = x + (*vj)*bs; 121 v += bs2; 122 } 123 /* xk = inv(Dk)*(Dk*xk) */ 124 diag = aa+k*bs2; /* ptr to inv(Dk) */ 125 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 126 } 127 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 128 PetscFunctionReturn(0); 129 } 130 131 #undef __FUNCT__ 132 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering" 133 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 134 { 135 PetscInt nz,*vj,k; 136 PetscInt bs2 = bs*bs; 137 MatScalar *v; 138 PetscScalar *xk,*xj; 139 140 PetscFunctionBegin; 141 for (k=mbs-1; k>=0; k--) { 142 v = aa + bs2*ai[k]; 143 xk = x + k*bs; /* xk */ 144 nz = ai[k+1] - ai[k]; 145 vj = aj + ai[k]; 146 xj = x + (*vj)*bs; 147 while (nz--) { 148 /* xk += U(k,:)*x(:) */ 149 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 150 vj++; 151 v += bs2; xj = x + (*vj)*bs; 152 } 153 } 154 PetscFunctionReturn(0); 155 } 156 157 #undef __FUNCT__ 158 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace" 159 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 160 { 161 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 162 PetscErrorCode ierr; 163 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 164 PetscInt bs =A->rmap->bs; 165 MatScalar *aa=a->a; 166 PetscScalar *x,*b; 167 168 PetscFunctionBegin; 169 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 170 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 171 172 /* solve U^T * D * y = b by forward substitution */ 173 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 174 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 175 176 /* solve U*x = y by back substitution */ 177 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 178 179 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 180 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 181 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 182 PetscFunctionReturn(0); 183 } 184 185 #undef __FUNCT__ 186 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace" 187 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 188 { 189 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 190 PetscErrorCode ierr; 191 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 192 PetscInt bs =A->rmap->bs; 193 MatScalar *aa=a->a; 194 PetscScalar *x,*b; 195 196 PetscFunctionBegin; 197 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 198 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 199 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 200 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 201 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 202 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 203 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 204 PetscFunctionReturn(0); 205 } 206 207 #undef __FUNCT__ 208 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace" 209 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 210 { 211 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 212 PetscErrorCode ierr; 213 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 214 PetscInt bs =A->rmap->bs; 215 MatScalar *aa=a->a; 216 PetscScalar *x,*b; 217 218 PetscFunctionBegin; 219 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 220 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 221 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 222 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 223 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 224 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 225 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 226 PetscFunctionReturn(0); 227 } 228 229 #undef __FUNCT__ 230 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_inplace" 231 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 232 { 233 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 234 IS isrow=a->row; 235 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs; 236 PetscErrorCode ierr; 237 const PetscInt *r; 238 PetscInt nz,*vj,k,idx; 239 MatScalar *aa=a->a,*v,*d; 240 PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp; 241 242 PetscFunctionBegin; 243 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 244 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 245 t = a->solve_work; 246 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 247 248 /* solve U^T * D * y = b by forward substitution */ 249 tp = t; 250 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 251 idx = 7*r[k]; 252 tp[0] = b[idx]; 253 tp[1] = b[idx+1]; 254 tp[2] = b[idx+2]; 255 tp[3] = b[idx+3]; 256 tp[4] = b[idx+4]; 257 tp[5] = b[idx+5]; 258 tp[6] = b[idx+6]; 259 tp += 7; 260 } 261 262 for (k=0; k<mbs; k++) { 263 v = aa + 49*ai[k]; 264 vj = aj + ai[k]; 265 tp = t + k*7; 266 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; 267 nz = ai[k+1] - ai[k]; 268 tp = t + (*vj)*7; 269 while (nz--) { 270 tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 271 tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 272 tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 273 tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 274 tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 275 tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 276 tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 277 vj++; 278 tp = t + (*vj)*7; 279 v += 49; 280 } 281 282 /* xk = inv(Dk)*(Dk*xk) */ 283 d = aa+k*49; /* ptr to inv(Dk) */ 284 tp = t + k*7; 285 tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 286 tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 287 tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 288 tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 289 tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 290 tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 291 tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 292 } 293 294 /* solve U*x = y by back substitution */ 295 for (k=mbs-1; k>=0; k--) { 296 v = aa + 49*ai[k]; 297 vj = aj + ai[k]; 298 tp = t + k*7; 299 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */ 300 nz = ai[k+1] - ai[k]; 301 302 tp = t + (*vj)*7; 303 while (nz--) { 304 /* xk += U(k,:)*x(:) */ 305 x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6]; 306 x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6]; 307 x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6]; 308 x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6]; 309 x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6]; 310 x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6]; 311 x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6]; 312 vj++; 313 tp = t + (*vj)*7; 314 v += 49; 315 } 316 tp = t + k*7; 317 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6; 318 idx = 7*r[k]; 319 x[idx] = x0; 320 x[idx+1] = x1; 321 x[idx+2] = x2; 322 x[idx+3] = x3; 323 x[idx+4] = x4; 324 x[idx+5] = x5; 325 x[idx+6] = x6; 326 } 327 328 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 329 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 330 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 331 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 332 PetscFunctionReturn(0); 333 } 334 335 #undef __FUNCT__ 336 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering" 337 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 338 { 339 MatScalar *v,*d; 340 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 341 PetscInt nz,*vj,k; 342 343 PetscFunctionBegin; 344 for (k=0; k<mbs; k++) { 345 v = aa + 49*ai[k]; 346 xp = x + k*7; 347 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */ 348 nz = ai[k+1] - ai[k]; 349 vj = aj + ai[k]; 350 xp = x + (*vj)*7; 351 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 352 PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 353 while (nz--) { 354 /* x(:) += U(k,:)^T*(Dk*xk) */ 355 xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 356 xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 357 xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 358 xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 359 xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 360 xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 361 xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 362 vj++; 363 xp = x + (*vj)*7; 364 v += 49; 365 } 366 /* xk = inv(Dk)*(Dk*xk) */ 367 d = aa+k*49; /* ptr to inv(Dk) */ 368 xp = x + k*7; 369 xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 370 xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 371 xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 372 xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 373 xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 374 xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 375 xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 376 } 377 PetscFunctionReturn(0); 378 } 379 380 #undef __FUNCT__ 381 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering" 382 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 383 { 384 MatScalar *v; 385 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 386 PetscInt nz,*vj,k; 387 388 PetscFunctionBegin; 389 for (k=mbs-1; k>=0; k--) { 390 v = aa + 49*ai[k]; 391 xp = x + k*7; 392 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */ 393 nz = ai[k+1] - ai[k]; 394 vj = aj + ai[k]; 395 xp = x + (*vj)*7; 396 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 397 PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 398 while (nz--) { 399 /* xk += U(k,:)*x(:) */ 400 x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6]; 401 x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6]; 402 x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6]; 403 x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6]; 404 x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6]; 405 x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6]; 406 x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6]; 407 vj++; 408 v += 49; xp = x + (*vj)*7; 409 } 410 xp = x + k*7; 411 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6; 412 } 413 PetscFunctionReturn(0); 414 } 415 416 #undef __FUNCT__ 417 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace" 418 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 419 { 420 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 421 PetscErrorCode ierr; 422 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 423 MatScalar *aa=a->a; 424 PetscScalar *x,*b; 425 426 PetscFunctionBegin; 427 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 428 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 429 430 /* solve U^T * D * y = b by forward substitution */ 431 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 432 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 433 434 /* solve U*x = y by back substitution */ 435 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 436 437 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 438 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 439 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 440 PetscFunctionReturn(0); 441 } 442 443 #undef __FUNCT__ 444 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace" 445 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 446 { 447 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 448 PetscErrorCode ierr; 449 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 450 MatScalar *aa=a->a; 451 PetscScalar *x,*b; 452 453 PetscFunctionBegin; 454 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 455 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 456 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 457 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 458 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 459 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 460 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 461 PetscFunctionReturn(0); 462 } 463 464 #undef __FUNCT__ 465 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace" 466 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 467 { 468 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 469 PetscErrorCode ierr; 470 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 471 MatScalar *aa=a->a; 472 PetscScalar *x,*b; 473 474 PetscFunctionBegin; 475 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 476 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 477 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 478 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 479 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 480 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 481 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 482 PetscFunctionReturn(0); 483 } 484 485 #undef __FUNCT__ 486 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_inplace" 487 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 488 { 489 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 490 IS isrow=a->row; 491 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 492 PetscErrorCode ierr; 493 const PetscInt *r; 494 PetscInt nz,*vj,k,idx; 495 MatScalar *aa=a->a,*v,*d; 496 PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp; 497 498 PetscFunctionBegin; 499 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 500 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 501 t = a->solve_work; 502 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 503 504 /* solve U^T * D * y = b by forward substitution */ 505 tp = t; 506 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 507 idx = 6*r[k]; 508 tp[0] = b[idx]; 509 tp[1] = b[idx+1]; 510 tp[2] = b[idx+2]; 511 tp[3] = b[idx+3]; 512 tp[4] = b[idx+4]; 513 tp[5] = b[idx+5]; 514 tp += 6; 515 } 516 517 for (k=0; k<mbs; k++) { 518 v = aa + 36*ai[k]; 519 vj = aj + ai[k]; 520 tp = t + k*6; 521 x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; 522 nz = ai[k+1] - ai[k]; 523 tp = t + (*vj)*6; 524 while (nz--) { 525 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 526 tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 527 tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 528 tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 529 tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 530 tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 531 vj++; 532 tp = t + (*vj)*6; 533 v += 36; 534 } 535 536 /* xk = inv(Dk)*(Dk*xk) */ 537 d = aa+k*36; /* ptr to inv(Dk) */ 538 tp = t + k*6; 539 tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 540 tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 541 tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 542 tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 543 tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 544 tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 545 } 546 547 /* solve U*x = y by back substitution */ 548 for (k=mbs-1; k>=0; k--) { 549 v = aa + 36*ai[k]; 550 vj = aj + ai[k]; 551 tp = t + k*6; 552 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */ 553 nz = ai[k+1] - ai[k]; 554 555 tp = t + (*vj)*6; 556 while (nz--) { 557 /* xk += U(k,:)*x(:) */ 558 x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5]; 559 x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5]; 560 x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5]; 561 x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5]; 562 x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5]; 563 x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5]; 564 vj++; 565 tp = t + (*vj)*6; 566 v += 36; 567 } 568 tp = t + k*6; 569 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; 570 idx = 6*r[k]; 571 x[idx] = x0; 572 x[idx+1] = x1; 573 x[idx+2] = x2; 574 x[idx+3] = x3; 575 x[idx+4] = x4; 576 x[idx+5] = x5; 577 } 578 579 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 580 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 581 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 582 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 583 PetscFunctionReturn(0); 584 } 585 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering" 588 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 589 { 590 MatScalar *v,*d; 591 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 592 PetscInt nz,*vj,k; 593 594 PetscFunctionBegin; 595 for (k=0; k<mbs; k++) { 596 v = aa + 36*ai[k]; 597 xp = x + k*6; 598 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */ 599 nz = ai[k+1] - ai[k]; 600 vj = aj + ai[k]; 601 xp = x + (*vj)*6; 602 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 603 PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 604 while (nz--) { 605 /* x(:) += U(k,:)^T*(Dk*xk) */ 606 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 607 xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 608 xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 609 xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 610 xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 611 xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 612 vj++; 613 xp = x + (*vj)*6; 614 v += 36; 615 } 616 /* xk = inv(Dk)*(Dk*xk) */ 617 d = aa+k*36; /* ptr to inv(Dk) */ 618 xp = x + k*6; 619 xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 620 xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 621 xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 622 xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 623 xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 624 xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 625 } 626 PetscFunctionReturn(0); 627 } 628 #undef __FUNCT__ 629 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering" 630 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 631 { 632 MatScalar *v; 633 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 634 PetscInt nz,*vj,k; 635 636 PetscFunctionBegin; 637 for (k=mbs-1; k>=0; k--) { 638 v = aa + 36*ai[k]; 639 xp = x + k*6; 640 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */ 641 nz = ai[k+1] - ai[k]; 642 vj = aj + ai[k]; 643 xp = x + (*vj)*6; 644 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 645 PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 646 while (nz--) { 647 /* xk += U(k,:)*x(:) */ 648 x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5]; 649 x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5]; 650 x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5]; 651 x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5]; 652 x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5]; 653 x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5]; 654 vj++; 655 v += 36; xp = x + (*vj)*6; 656 } 657 xp = x + k*6; 658 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; 659 } 660 PetscFunctionReturn(0); 661 } 662 663 664 #undef __FUNCT__ 665 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace" 666 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 667 { 668 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 669 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 670 MatScalar *aa=a->a; 671 PetscScalar *x,*b; 672 PetscErrorCode ierr; 673 674 PetscFunctionBegin; 675 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 676 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 677 678 /* solve U^T * D * y = b by forward substitution */ 679 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 680 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 681 682 /* solve U*x = y by back substitution */ 683 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 684 685 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 686 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 687 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 688 PetscFunctionReturn(0); 689 } 690 691 #undef __FUNCT__ 692 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace" 693 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 694 { 695 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 696 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 697 MatScalar *aa=a->a; 698 PetscScalar *x,*b; 699 PetscErrorCode ierr; 700 701 PetscFunctionBegin; 702 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 703 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 704 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 705 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 706 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 707 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 708 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 709 PetscFunctionReturn(0); 710 } 711 712 #undef __FUNCT__ 713 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace" 714 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 715 { 716 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 717 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 718 MatScalar *aa=a->a; 719 PetscScalar *x,*b; 720 PetscErrorCode ierr; 721 722 PetscFunctionBegin; 723 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 724 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 725 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 726 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 727 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 728 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 729 ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr); 730 PetscFunctionReturn(0); 731 } 732 733 #undef __FUNCT__ 734 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_inplace" 735 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 736 { 737 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 738 IS isrow=a->row; 739 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2; 740 PetscErrorCode ierr; 741 const PetscInt *r; 742 PetscInt nz,*vj,k,idx; 743 MatScalar *aa=a->a,*v,*diag; 744 PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp; 745 746 PetscFunctionBegin; 747 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 748 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 749 t = a->solve_work; 750 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 751 752 /* solve U^T * D * y = b by forward substitution */ 753 tp = t; 754 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 755 idx = 5*r[k]; 756 tp[0] = b[idx]; 757 tp[1] = b[idx+1]; 758 tp[2] = b[idx+2]; 759 tp[3] = b[idx+3]; 760 tp[4] = b[idx+4]; 761 tp += 5; 762 } 763 764 for (k=0; k<mbs; k++) { 765 v = aa + 25*ai[k]; 766 vj = aj + ai[k]; 767 tp = t + k*5; 768 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; 769 nz = ai[k+1] - ai[k]; 770 771 tp = t + (*vj)*5; 772 while (nz--) { 773 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 774 tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 775 tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4; 776 tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4; 777 tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4; 778 vj++; 779 tp = t + (*vj)*5; 780 v += 25; 781 } 782 783 /* xk = inv(Dk)*(Dk*xk) */ 784 diag = aa+k*25; /* ptr to inv(Dk) */ 785 tp = t + k*5; 786 tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 787 tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 788 tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 789 tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 790 tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 791 } 792 793 /* solve U*x = y by back substitution */ 794 for (k=mbs-1; k>=0; k--) { 795 v = aa + 25*ai[k]; 796 vj = aj + ai[k]; 797 tp = t + k*5; 798 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */ 799 nz = ai[k+1] - ai[k]; 800 801 tp = t + (*vj)*5; 802 while (nz--) { 803 /* xk += U(k,:)*x(:) */ 804 x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4]; 805 x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4]; 806 x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4]; 807 x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4]; 808 x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4]; 809 vj++; 810 tp = t + (*vj)*5; 811 v += 25; 812 } 813 tp = t + k*5; 814 tp[0] = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; 815 idx = 5*r[k]; 816 x[idx] = x0; 817 x[idx+1] = x1; 818 x[idx+2] = x2; 819 x[idx+3] = x3; 820 x[idx+4] = x4; 821 } 822 823 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 824 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 825 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 826 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 827 PetscFunctionReturn(0); 828 } 829 830 #undef __FUNCT__ 831 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering" 832 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 833 { 834 MatScalar *v,*diag; 835 PetscScalar *xp,x0,x1,x2,x3,x4; 836 PetscInt nz,*vj,k; 837 838 PetscFunctionBegin; 839 for (k=0; k<mbs; k++) { 840 v = aa + 25*ai[k]; 841 xp = x + k*5; 842 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */ 843 nz = ai[k+1] - ai[k]; 844 vj = aj + ai[k]; 845 xp = x + (*vj)*5; 846 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 847 PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 848 while (nz--) { 849 /* x(:) += U(k,:)^T*(Dk*xk) */ 850 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 851 xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 852 xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4; 853 xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4; 854 xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4; 855 vj++; 856 xp = x + (*vj)*5; 857 v += 25; 858 } 859 /* xk = inv(Dk)*(Dk*xk) */ 860 diag = aa+k*25; /* ptr to inv(Dk) */ 861 xp = x + k*5; 862 xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 863 xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 864 xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 865 xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 866 xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 867 } 868 PetscFunctionReturn(0); 869 } 870 871 #undef __FUNCT__ 872 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering" 873 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 874 { 875 MatScalar *v; 876 PetscScalar *xp,x0,x1,x2,x3,x4; 877 PetscInt nz,*vj,k; 878 879 PetscFunctionBegin; 880 for (k=mbs-1; k>=0; k--) { 881 v = aa + 25*ai[k]; 882 xp = x + k*5; 883 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */ 884 nz = ai[k+1] - ai[k]; 885 vj = aj + ai[k]; 886 xp = x + (*vj)*5; 887 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 888 PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 889 while (nz--) { 890 /* xk += U(k,:)*x(:) */ 891 x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4]; 892 x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4]; 893 x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4]; 894 x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4]; 895 x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4]; 896 vj++; 897 v += 25; xp = x + (*vj)*5; 898 } 899 xp = x + k*5; 900 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; 901 } 902 PetscFunctionReturn(0); 903 } 904 905 #undef __FUNCT__ 906 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace" 907 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 908 { 909 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 910 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2; 911 MatScalar *aa=a->a; 912 PetscScalar *x,*b; 913 PetscErrorCode ierr; 914 915 PetscFunctionBegin; 916 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 917 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 918 919 /* solve U^T * D * y = b by forward substitution */ 920 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 921 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 922 923 /* solve U*x = y by back substitution */ 924 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 925 926 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 927 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 928 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 929 PetscFunctionReturn(0); 930 } 931 932 #undef __FUNCT__ 933 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace" 934 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 935 { 936 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 937 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 938 MatScalar *aa=a->a; 939 PetscScalar *x,*b; 940 PetscErrorCode ierr; 941 942 PetscFunctionBegin; 943 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 944 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 945 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 946 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 947 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 948 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 949 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 950 PetscFunctionReturn(0); 951 } 952 953 #undef __FUNCT__ 954 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace" 955 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 956 { 957 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 958 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 959 MatScalar *aa=a->a; 960 PetscScalar *x,*b; 961 PetscErrorCode ierr; 962 963 PetscFunctionBegin; 964 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 965 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 966 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 967 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 968 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 969 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 970 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 971 PetscFunctionReturn(0); 972 } 973 974 #undef __FUNCT__ 975 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_inplace" 976 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 977 { 978 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 979 IS isrow=a->row; 980 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 981 PetscErrorCode ierr; 982 const PetscInt *r; 983 PetscInt nz,*vj,k,idx; 984 MatScalar *aa=a->a,*v,*diag; 985 PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp; 986 987 PetscFunctionBegin; 988 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 989 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 990 t = a->solve_work; 991 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 992 993 /* solve U^T * D * y = b by forward substitution */ 994 tp = t; 995 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 996 idx = 4*r[k]; 997 tp[0] = b[idx]; 998 tp[1] = b[idx+1]; 999 tp[2] = b[idx+2]; 1000 tp[3] = b[idx+3]; 1001 tp += 4; 1002 } 1003 1004 for (k=0; k<mbs; k++) { 1005 v = aa + 16*ai[k]; 1006 vj = aj + ai[k]; 1007 tp = t + k*4; 1008 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; 1009 nz = ai[k+1] - ai[k]; 1010 1011 tp = t + (*vj)*4; 1012 while (nz--) { 1013 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1014 tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1015 tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1016 tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1017 vj++; 1018 tp = t + (*vj)*4; 1019 v += 16; 1020 } 1021 1022 /* xk = inv(Dk)*(Dk*xk) */ 1023 diag = aa+k*16; /* ptr to inv(Dk) */ 1024 tp = t + k*4; 1025 tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1026 tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1027 tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1028 tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1029 } 1030 1031 /* solve U*x = y by back substitution */ 1032 for (k=mbs-1; k>=0; k--) { 1033 v = aa + 16*ai[k]; 1034 vj = aj + ai[k]; 1035 tp = t + k*4; 1036 x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */ 1037 nz = ai[k+1] - ai[k]; 1038 1039 tp = t + (*vj)*4; 1040 while (nz--) { 1041 /* xk += U(k,:)*x(:) */ 1042 x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3]; 1043 x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3]; 1044 x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3]; 1045 x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3]; 1046 vj++; 1047 tp = t + (*vj)*4; 1048 v += 16; 1049 } 1050 tp = t + k*4; 1051 tp[0] =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; 1052 idx = 4*r[k]; 1053 x[idx] = x0; 1054 x[idx+1] = x1; 1055 x[idx+2] = x2; 1056 x[idx+3] = x3; 1057 } 1058 1059 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1060 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1061 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1062 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1063 PetscFunctionReturn(0); 1064 } 1065 1066 #undef __FUNCT__ 1067 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering" 1068 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1069 { 1070 MatScalar *v,*diag; 1071 PetscScalar *xp,x0,x1,x2,x3; 1072 PetscInt nz,*vj,k; 1073 1074 PetscFunctionBegin; 1075 for (k=0; k<mbs; k++) { 1076 v = aa + 16*ai[k]; 1077 xp = x + k*4; 1078 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */ 1079 nz = ai[k+1] - ai[k]; 1080 vj = aj + ai[k]; 1081 xp = x + (*vj)*4; 1082 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1083 PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1084 while (nz--) { 1085 /* x(:) += U(k,:)^T*(Dk*xk) */ 1086 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1087 xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1088 xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1089 xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1090 vj++; 1091 xp = x + (*vj)*4; 1092 v += 16; 1093 } 1094 /* xk = inv(Dk)*(Dk*xk) */ 1095 diag = aa+k*16; /* ptr to inv(Dk) */ 1096 xp = x + k*4; 1097 xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1098 xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1099 xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1100 xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1101 } 1102 PetscFunctionReturn(0); 1103 } 1104 1105 #undef __FUNCT__ 1106 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering" 1107 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1108 { 1109 MatScalar *v; 1110 PetscScalar *xp,x0,x1,x2,x3; 1111 PetscInt nz,*vj,k; 1112 1113 PetscFunctionBegin; 1114 for (k=mbs-1; k>=0; k--) { 1115 v = aa + 16*ai[k]; 1116 xp = x + k*4; 1117 x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */ 1118 nz = ai[k+1] - ai[k]; 1119 vj = aj + ai[k]; 1120 xp = x + (*vj)*4; 1121 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1122 PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1123 while (nz--) { 1124 /* xk += U(k,:)*x(:) */ 1125 x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3]; 1126 x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3]; 1127 x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3]; 1128 x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3]; 1129 vj++; 1130 v += 16; xp = x + (*vj)*4; 1131 } 1132 xp = x + k*4; 1133 xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3; 1134 } 1135 PetscFunctionReturn(0); 1136 } 1137 1138 #undef __FUNCT__ 1139 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace" 1140 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1141 { 1142 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1143 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1144 MatScalar *aa=a->a; 1145 PetscScalar *x,*b; 1146 PetscErrorCode ierr; 1147 1148 PetscFunctionBegin; 1149 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1150 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1151 1152 /* solve U^T * D * y = b by forward substitution */ 1153 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 1154 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1155 1156 /* solve U*x = y by back substitution */ 1157 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1158 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1159 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1160 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1161 PetscFunctionReturn(0); 1162 } 1163 1164 #undef __FUNCT__ 1165 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace" 1166 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1167 { 1168 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1169 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1170 MatScalar *aa=a->a; 1171 PetscScalar *x,*b; 1172 PetscErrorCode ierr; 1173 1174 PetscFunctionBegin; 1175 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1176 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 1178 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1179 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1180 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1181 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 1182 PetscFunctionReturn(0); 1183 } 1184 1185 #undef __FUNCT__ 1186 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace" 1187 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1188 { 1189 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1190 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 1191 MatScalar *aa=a->a; 1192 PetscScalar *x,*b; 1193 PetscErrorCode ierr; 1194 1195 PetscFunctionBegin; 1196 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1197 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1198 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1199 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1200 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1201 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1202 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 1203 PetscFunctionReturn(0); 1204 } 1205 1206 #undef __FUNCT__ 1207 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_inplace" 1208 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1209 { 1210 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1211 IS isrow=a->row; 1212 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1213 PetscErrorCode ierr; 1214 const PetscInt *r; 1215 PetscInt nz,*vj,k,idx; 1216 MatScalar *aa=a->a,*v,*diag; 1217 PetscScalar *x,*b,x0,x1,x2,*t,*tp; 1218 1219 PetscFunctionBegin; 1220 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1221 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1222 t = a->solve_work; 1223 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1224 1225 /* solve U^T * D * y = b by forward substitution */ 1226 tp = t; 1227 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1228 idx = 3*r[k]; 1229 tp[0] = b[idx]; 1230 tp[1] = b[idx+1]; 1231 tp[2] = b[idx+2]; 1232 tp += 3; 1233 } 1234 1235 for (k=0; k<mbs; k++) { 1236 v = aa + 9*ai[k]; 1237 vj = aj + ai[k]; 1238 tp = t + k*3; 1239 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; 1240 nz = ai[k+1] - ai[k]; 1241 1242 tp = t + (*vj)*3; 1243 while (nz--) { 1244 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1245 tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1246 tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1247 vj++; 1248 tp = t + (*vj)*3; 1249 v += 9; 1250 } 1251 1252 /* xk = inv(Dk)*(Dk*xk) */ 1253 diag = aa+k*9; /* ptr to inv(Dk) */ 1254 tp = t + k*3; 1255 tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1256 tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1257 tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1258 } 1259 1260 /* solve U*x = y by back substitution */ 1261 for (k=mbs-1; k>=0; k--) { 1262 v = aa + 9*ai[k]; 1263 vj = aj + ai[k]; 1264 tp = t + k*3; 1265 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */ 1266 nz = ai[k+1] - ai[k]; 1267 1268 tp = t + (*vj)*3; 1269 while (nz--) { 1270 /* xk += U(k,:)*x(:) */ 1271 x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2]; 1272 x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2]; 1273 x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2]; 1274 vj++; 1275 tp = t + (*vj)*3; 1276 v += 9; 1277 } 1278 tp = t + k*3; 1279 tp[0] = x0; tp[1] = x1; tp[2] = x2; 1280 idx = 3*r[k]; 1281 x[idx] = x0; 1282 x[idx+1] = x1; 1283 x[idx+2] = x2; 1284 } 1285 1286 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1287 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1288 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1289 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1290 PetscFunctionReturn(0); 1291 } 1292 1293 #undef __FUNCT__ 1294 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering" 1295 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1296 { 1297 MatScalar *v,*diag; 1298 PetscScalar *xp,x0,x1,x2; 1299 PetscInt nz,*vj,k; 1300 1301 PetscFunctionBegin; 1302 for (k=0; k<mbs; k++) { 1303 v = aa + 9*ai[k]; 1304 xp = x + k*3; 1305 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */ 1306 nz = ai[k+1] - ai[k]; 1307 vj = aj + ai[k]; 1308 xp = x + (*vj)*3; 1309 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1310 PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1311 while (nz--) { 1312 /* x(:) += U(k,:)^T*(Dk*xk) */ 1313 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1314 xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1315 xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1316 vj++; 1317 xp = x + (*vj)*3; 1318 v += 9; 1319 } 1320 /* xk = inv(Dk)*(Dk*xk) */ 1321 diag = aa+k*9; /* ptr to inv(Dk) */ 1322 xp = x + k*3; 1323 xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1324 xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1325 xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1326 } 1327 PetscFunctionReturn(0); 1328 } 1329 1330 #undef __FUNCT__ 1331 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering" 1332 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1333 { 1334 MatScalar *v; 1335 PetscScalar *xp,x0,x1,x2; 1336 PetscInt nz,*vj,k; 1337 1338 PetscFunctionBegin; 1339 for (k=mbs-1; k>=0; k--) { 1340 v = aa + 9*ai[k]; 1341 xp = x + k*3; 1342 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */ 1343 nz = ai[k+1] - ai[k]; 1344 vj = aj + ai[k]; 1345 xp = x + (*vj)*3; 1346 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1347 PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1348 while (nz--) { 1349 /* xk += U(k,:)*x(:) */ 1350 x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2]; 1351 x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2]; 1352 x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2]; 1353 vj++; 1354 v += 9; xp = x + (*vj)*3; 1355 } 1356 xp = x + k*3; 1357 xp[0] = x0; xp[1] = x1; xp[2] = x2; 1358 } 1359 PetscFunctionReturn(0); 1360 } 1361 1362 #undef __FUNCT__ 1363 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace" 1364 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1365 { 1366 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1367 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1368 MatScalar *aa=a->a; 1369 PetscScalar *x,*b; 1370 PetscErrorCode ierr; 1371 1372 PetscFunctionBegin; 1373 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1374 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1375 1376 /* solve U^T * D * y = b by forward substitution */ 1377 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1378 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1379 1380 /* solve U*x = y by back substitution */ 1381 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1382 1383 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1384 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1385 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1386 PetscFunctionReturn(0); 1387 } 1388 1389 #undef __FUNCT__ 1390 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace" 1391 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1392 { 1393 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1394 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1395 MatScalar *aa=a->a; 1396 PetscScalar *x,*b; 1397 PetscErrorCode ierr; 1398 1399 PetscFunctionBegin; 1400 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1401 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1402 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1403 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1404 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1405 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1406 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 1407 PetscFunctionReturn(0); 1408 } 1409 1410 #undef __FUNCT__ 1411 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace" 1412 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1413 { 1414 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1415 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 1416 MatScalar *aa=a->a; 1417 PetscScalar *x,*b; 1418 PetscErrorCode ierr; 1419 1420 PetscFunctionBegin; 1421 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1422 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1423 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1424 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1425 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1426 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1427 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 1428 PetscFunctionReturn(0); 1429 } 1430 1431 #undef __FUNCT__ 1432 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_inplace" 1433 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1434 { 1435 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1436 IS isrow=a->row; 1437 PetscInt mbs =a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1438 PetscErrorCode ierr; 1439 const PetscInt *r; 1440 PetscInt nz,*vj,k,k2,idx; 1441 MatScalar *aa=a->a,*v,*diag; 1442 PetscScalar *x,*b,x0,x1,*t; 1443 1444 PetscFunctionBegin; 1445 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1446 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1447 t = a->solve_work; 1448 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1449 1450 /* solve U^T * D * y = perm(b) by forward substitution */ 1451 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1452 idx = 2*r[k]; 1453 t[k*2] = b[idx]; 1454 t[k*2+1] = b[idx+1]; 1455 } 1456 for (k=0; k<mbs; k++) { 1457 v = aa + 4*ai[k]; 1458 vj = aj + ai[k]; 1459 k2 = k*2; 1460 x0 = t[k2]; x1 = t[k2+1]; 1461 nz = ai[k+1] - ai[k]; 1462 while (nz--) { 1463 t[(*vj)*2] += v[0]*x0 + v[1]*x1; 1464 t[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1465 vj++; v += 4; 1466 } 1467 diag = aa+k*4; /* ptr to inv(Dk) */ 1468 t[k2] = diag[0]*x0 + diag[2]*x1; 1469 t[k2+1] = diag[1]*x0 + diag[3]*x1; 1470 } 1471 1472 /* solve U*x = y by back substitution */ 1473 for (k=mbs-1; k>=0; k--) { 1474 v = aa + 4*ai[k]; 1475 vj = aj + ai[k]; 1476 k2 = k*2; 1477 x0 = t[k2]; x1 = t[k2+1]; 1478 nz = ai[k+1] - ai[k]; 1479 while (nz--) { 1480 x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1]; 1481 x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1]; 1482 vj++; 1483 v += 4; 1484 } 1485 t[k2] = x0; 1486 t[k2+1] = x1; 1487 idx = 2*r[k]; 1488 x[idx] = x0; 1489 x[idx+1] = x1; 1490 } 1491 1492 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1493 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1494 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1495 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1496 PetscFunctionReturn(0); 1497 } 1498 1499 #undef __FUNCT__ 1500 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering" 1501 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1502 { 1503 MatScalar *v,*diag; 1504 PetscScalar x0,x1; 1505 PetscInt nz,*vj,k,k2; 1506 1507 PetscFunctionBegin; 1508 for (k=0; k<mbs; k++) { 1509 v = aa + 4*ai[k]; 1510 vj = aj + ai[k]; 1511 k2 = k*2; 1512 x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */ 1513 nz = ai[k+1] - ai[k]; 1514 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1515 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1516 while (nz--) { 1517 /* x(:) += U(k,:)^T*(Dk*xk) */ 1518 x[(*vj)*2] += v[0]*x0 + v[1]*x1; 1519 x[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1520 vj++; v += 4; 1521 } 1522 /* xk = inv(Dk)*(Dk*xk) */ 1523 diag = aa+k*4; /* ptr to inv(Dk) */ 1524 x[k2] = diag[0]*x0 + diag[2]*x1; 1525 x[k2+1] = diag[1]*x0 + diag[3]*x1; 1526 } 1527 PetscFunctionReturn(0); 1528 } 1529 1530 #undef __FUNCT__ 1531 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering" 1532 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1533 { 1534 MatScalar *v; 1535 PetscScalar x0,x1; 1536 PetscInt nz,*vj,k,k2; 1537 1538 PetscFunctionBegin; 1539 for (k=mbs-1; k>=0; k--) { 1540 v = aa + 4*ai[k]; 1541 vj = aj + ai[k]; 1542 k2 = k*2; 1543 x0 = x[k2]; x1 = x[k2+1]; /* xk */ 1544 nz = ai[k+1] - ai[k]; 1545 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1546 PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1547 while (nz--) { 1548 /* xk += U(k,:)*x(:) */ 1549 x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1]; 1550 x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1]; 1551 vj++; 1552 v += 4; 1553 } 1554 x[k2] = x0; 1555 x[k2+1] = x1; 1556 } 1557 PetscFunctionReturn(0); 1558 } 1559 1560 #undef __FUNCT__ 1561 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace" 1562 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1563 { 1564 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1565 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1566 MatScalar *aa=a->a; 1567 PetscScalar *x,*b; 1568 PetscErrorCode ierr; 1569 1570 PetscFunctionBegin; 1571 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1572 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1573 1574 /* solve U^T * D * y = b by forward substitution */ 1575 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1576 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1577 1578 /* solve U*x = y by back substitution */ 1579 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1580 1581 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1582 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1583 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1584 PetscFunctionReturn(0); 1585 } 1586 1587 #undef __FUNCT__ 1588 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace" 1589 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1590 { 1591 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1592 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1593 MatScalar *aa=a->a; 1594 PetscScalar *x,*b; 1595 PetscErrorCode ierr; 1596 1597 PetscFunctionBegin; 1598 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1600 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1601 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1602 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1603 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1604 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 1605 PetscFunctionReturn(0); 1606 } 1607 1608 #undef __FUNCT__ 1609 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace" 1610 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1611 { 1612 Mat_SeqSBAIJ *a =(Mat_SeqSBAIJ*)A->data; 1613 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 1614 MatScalar *aa=a->a; 1615 PetscScalar *x,*b; 1616 PetscErrorCode ierr; 1617 1618 PetscFunctionBegin; 1619 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1620 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1621 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1622 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1623 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1624 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1625 ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr); 1626 PetscFunctionReturn(0); 1627 } 1628 1629 #undef __FUNCT__ 1630 #define __FUNCT__ "MatSolve_SeqSBAIJ_1" 1631 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1632 { 1633 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1634 IS isrow=a->row; 1635 PetscErrorCode ierr; 1636 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1637 const MatScalar *aa=a->a,*v; 1638 const PetscScalar *b; 1639 PetscScalar *x,xk,*t; 1640 PetscInt nz,k,j; 1641 1642 PetscFunctionBegin; 1643 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1644 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1645 t = a->solve_work; 1646 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1647 1648 /* solve U^T*D*y = perm(b) by forward substitution */ 1649 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1650 for (k=0; k<mbs; k++) { 1651 v = aa + ai[k]; 1652 vj = aj + ai[k]; 1653 xk = t[k]; 1654 nz = ai[k+1] - ai[k] - 1; 1655 for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk; 1656 t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */ 1657 } 1658 1659 /* solve U*perm(x) = y by back substitution */ 1660 for (k=mbs-1; k>=0; k--) { 1661 v = aa + adiag[k] - 1; 1662 vj = aj + adiag[k] - 1; 1663 nz = ai[k+1] - ai[k] - 1; 1664 for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]]; 1665 x[rp[k]] = t[k]; 1666 } 1667 1668 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1669 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1670 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1671 ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr); 1672 PetscFunctionReturn(0); 1673 } 1674 1675 #undef __FUNCT__ 1676 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_inplace" 1677 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1678 { 1679 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1680 IS isrow=a->row; 1681 PetscErrorCode ierr; 1682 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1683 const MatScalar *aa=a->a,*v; 1684 PetscScalar *x,*b,xk,*t; 1685 PetscInt nz,k; 1686 1687 PetscFunctionBegin; 1688 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1689 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1690 t = a->solve_work; 1691 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1692 1693 /* solve U^T*D*y = perm(b) by forward substitution */ 1694 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1695 for (k=0; k<mbs; k++) { 1696 v = aa + ai[k] + 1; 1697 vj = aj + ai[k] + 1; 1698 xk = t[k]; 1699 nz = ai[k+1] - ai[k] - 1; 1700 while (nz--) t[*vj++] += (*v++) * xk; 1701 t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */ 1702 } 1703 1704 /* solve U*perm(x) = y by back substitution */ 1705 for (k=mbs-1; k>=0; k--) { 1706 v = aa + ai[k] + 1; 1707 vj = aj + ai[k] + 1; 1708 nz = ai[k+1] - ai[k] - 1; 1709 while (nz--) t[k] += (*v++) * t[*vj++]; 1710 x[rp[k]] = t[k]; 1711 } 1712 1713 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1714 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1715 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1716 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1717 PetscFunctionReturn(0); 1718 } 1719 1720 #undef __FUNCT__ 1721 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1" 1722 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1723 { 1724 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1725 IS isrow=a->row; 1726 PetscErrorCode ierr; 1727 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1728 const MatScalar *aa=a->a,*v; 1729 PetscReal diagk; 1730 PetscScalar *x,*b,xk; 1731 PetscInt nz,k; 1732 1733 PetscFunctionBegin; 1734 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1735 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1736 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1737 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1738 1739 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1740 for (k=0; k<mbs; k++) { 1741 v = aa + ai[k]; 1742 vj = aj + ai[k]; 1743 xk = x[k]; 1744 nz = ai[k+1] - ai[k] - 1; 1745 while (nz--) x[*vj++] += (*v++) * xk; 1746 1747 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1748 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1749 x[k] = xk*PetscSqrtReal(diagk); 1750 } 1751 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1752 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1753 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1754 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1755 PetscFunctionReturn(0); 1756 } 1757 1758 #undef __FUNCT__ 1759 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_inplace" 1760 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1761 { 1762 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1763 IS isrow=a->row; 1764 PetscErrorCode ierr; 1765 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1766 const MatScalar *aa=a->a,*v; 1767 PetscReal diagk; 1768 PetscScalar *x,*b,xk; 1769 PetscInt nz,k; 1770 1771 PetscFunctionBegin; 1772 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1773 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1774 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1775 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1776 1777 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1778 for (k=0; k<mbs; k++) { 1779 v = aa + ai[k] + 1; 1780 vj = aj + ai[k] + 1; 1781 xk = x[k]; 1782 nz = ai[k+1] - ai[k] - 1; 1783 while (nz--) x[*vj++] += (*v++) * xk; 1784 1785 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1786 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1787 x[k] = xk*PetscSqrtReal(diagk); 1788 } 1789 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1790 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1791 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1792 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1793 PetscFunctionReturn(0); 1794 } 1795 1796 #undef __FUNCT__ 1797 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1" 1798 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1799 { 1800 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1801 IS isrow=a->row; 1802 PetscErrorCode ierr; 1803 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1804 const MatScalar *aa=a->a,*v; 1805 PetscReal diagk; 1806 PetscScalar *x,*b,*t; 1807 PetscInt nz,k; 1808 1809 PetscFunctionBegin; 1810 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1811 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1812 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1813 t = a->solve_work; 1814 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1815 1816 for (k=mbs-1; k>=0; k--) { 1817 v = aa + ai[k]; 1818 vj = aj + ai[k]; 1819 diagk = PetscRealPart(aa[adiag[k]]); 1820 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1821 t[k] = b[k] * PetscSqrtReal(diagk); 1822 nz = ai[k+1] - ai[k] - 1; 1823 while (nz--) t[k] += (*v++) * t[*vj++]; 1824 x[rp[k]] = t[k]; 1825 } 1826 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1827 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1828 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1829 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1830 PetscFunctionReturn(0); 1831 } 1832 1833 #undef __FUNCT__ 1834 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_inplace" 1835 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1836 { 1837 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1838 IS isrow=a->row; 1839 PetscErrorCode ierr; 1840 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1841 const MatScalar *aa=a->a,*v; 1842 PetscReal diagk; 1843 PetscScalar *x,*b,*t; 1844 PetscInt nz,k; 1845 1846 PetscFunctionBegin; 1847 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1848 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1849 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1850 t = a->solve_work; 1851 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1852 1853 for (k=mbs-1; k>=0; k--) { 1854 v = aa + ai[k] + 1; 1855 vj = aj + ai[k] + 1; 1856 diagk = PetscRealPart(aa[ai[k]]); 1857 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1858 t[k] = b[k] * PetscSqrtReal(diagk); 1859 nz = ai[k+1] - ai[k] - 1; 1860 while (nz--) t[k] += (*v++) * t[*vj++]; 1861 x[rp[k]] = t[k]; 1862 } 1863 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1864 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1865 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1866 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1867 PetscFunctionReturn(0); 1868 } 1869 1870 #undef __FUNCT__ 1871 #define __FUNCT__ "MatSolves_SeqSBAIJ_1" 1872 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx) 1873 { 1874 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1875 PetscErrorCode ierr; 1876 1877 PetscFunctionBegin; 1878 if (A->rmap->bs == 1) { 1879 ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr); 1880 } else { 1881 IS isrow=a->row; 1882 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1883 const MatScalar *aa=a->a,*v; 1884 PetscScalar *x,*b,*t; 1885 PetscInt nz,k,n,i,j; 1886 if (bb->n > a->solves_work_n) { 1887 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1888 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1889 1890 a->solves_work_n = bb->n; 1891 } 1892 n = bb->n; 1893 ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr); 1894 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1895 t = a->solves_work; 1896 1897 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1898 1899 /* solve U^T*D*y = perm(b) by forward substitution */ 1900 for (k=0; k<mbs; k++) { 1901 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1902 } 1903 for (k=0; k<mbs; k++) { 1904 v = aa + ai[k]; 1905 vj = aj + ai[k]; 1906 nz = ai[k+1] - ai[k] - 1; 1907 for (j=0; j<nz; j++) { 1908 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1909 v++;vj++; 1910 } 1911 for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1912 } 1913 1914 /* solve U*perm(x) = y by back substitution */ 1915 for (k=mbs-1; k>=0; k--) { 1916 v = aa + ai[k] - 1; 1917 vj = aj + ai[k] - 1; 1918 nz = ai[k+1] - ai[k] - 1; 1919 for (j=0; j<nz; j++) { 1920 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1921 v++;vj++; 1922 } 1923 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1924 } 1925 1926 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1927 ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr); 1928 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1929 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1930 } 1931 PetscFunctionReturn(0); 1932 } 1933 1934 #undef __FUNCT__ 1935 #define __FUNCT__ "MatSolves_SeqSBAIJ_1_inplace" 1936 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx) 1937 { 1938 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 1939 PetscErrorCode ierr; 1940 1941 PetscFunctionBegin; 1942 if (A->rmap->bs == 1) { 1943 ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr); 1944 } else { 1945 IS isrow=a->row; 1946 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1947 const MatScalar *aa=a->a,*v; 1948 PetscScalar *x,*b,*t; 1949 PetscInt nz,k,n,i; 1950 if (bb->n > a->solves_work_n) { 1951 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1952 ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr); 1953 1954 a->solves_work_n = bb->n; 1955 } 1956 n = bb->n; 1957 ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr); 1958 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1959 t = a->solves_work; 1960 1961 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1962 1963 /* solve U^T*D*y = perm(b) by forward substitution */ 1964 for (k=0; k<mbs; k++) { 1965 for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */ 1966 } 1967 for (k=0; k<mbs; k++) { 1968 v = aa + ai[k]; 1969 vj = aj + ai[k]; 1970 nz = ai[k+1] - ai[k]; 1971 while (nz--) { 1972 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1973 v++;vj++; 1974 } 1975 for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1976 } 1977 1978 /* solve U*perm(x) = y by back substitution */ 1979 for (k=mbs-1; k>=0; k--) { 1980 v = aa + ai[k]; 1981 vj = aj + ai[k]; 1982 nz = ai[k+1] - ai[k]; 1983 while (nz--) { 1984 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1985 v++;vj++; 1986 } 1987 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1988 } 1989 1990 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1991 ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr); 1992 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1993 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1994 } 1995 PetscFunctionReturn(0); 1996 } 1997 1998 #undef __FUNCT__ 1999 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering" 2000 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2001 { 2002 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2003 PetscErrorCode ierr; 2004 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag; 2005 const MatScalar *aa=a->a,*v; 2006 const PetscScalar *b; 2007 PetscScalar *x,xi; 2008 PetscInt nz,i,j; 2009 2010 PetscFunctionBegin; 2011 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2012 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2013 2014 /* solve U^T*D*y = b by forward substitution */ 2015 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2016 for (i=0; i<mbs; i++) { 2017 v = aa + ai[i]; 2018 vj = aj + ai[i]; 2019 xi = x[i]; 2020 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 2021 for (j=0; j<nz; j++) x[vj[j]] += v[j]* xi; 2022 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2023 } 2024 2025 /* solve U*x = y by backward substitution */ 2026 for (i=mbs-2; i>=0; i--) { 2027 xi = x[i]; 2028 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2029 vj = aj + adiag[i] - 1; 2030 nz = ai[i+1] - ai[i] - 1; 2031 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 2032 x[i] = xi; 2033 } 2034 2035 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2036 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2037 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2038 PetscFunctionReturn(0); 2039 } 2040 2041 #undef __FUNCT__ 2042 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace" 2043 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2044 { 2045 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2046 PetscErrorCode ierr; 2047 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 2048 MatScalar *aa=a->a,*v; 2049 PetscScalar *x,*b,xk; 2050 PetscInt nz,*vj,k; 2051 2052 PetscFunctionBegin; 2053 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2054 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2055 2056 /* solve U^T*D*y = b by forward substitution */ 2057 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2058 for (k=0; k<mbs; k++) { 2059 v = aa + ai[k] + 1; 2060 vj = aj + ai[k] + 1; 2061 xk = x[k]; 2062 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2063 while (nz--) x[*vj++] += (*v++) * xk; 2064 x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2065 } 2066 2067 /* solve U*x = y by back substitution */ 2068 for (k=mbs-2; k>=0; k--) { 2069 v = aa + ai[k] + 1; 2070 vj = aj + ai[k] + 1; 2071 xk = x[k]; 2072 nz = ai[k+1] - ai[k] - 1; 2073 while (nz--) { 2074 xk += (*v++) * x[*vj++]; 2075 } 2076 x[k] = xk; 2077 } 2078 2079 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2080 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2081 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2082 PetscFunctionReturn(0); 2083 } 2084 2085 #undef __FUNCT__ 2086 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering" 2087 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2088 { 2089 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2090 PetscErrorCode ierr; 2091 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag; 2092 const MatScalar *aa=a->a,*v; 2093 PetscReal diagk; 2094 PetscScalar *x,*b; 2095 PetscInt nz,*vj,k; 2096 2097 PetscFunctionBegin; 2098 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2099 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2100 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2101 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2102 for (k=0; k<mbs; k++) { 2103 v = aa + ai[k]; 2104 vj = aj + ai[k]; 2105 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2106 while (nz--) x[*vj++] += (*v++) * x[k]; 2107 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2108 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]])); 2109 x[k] *= PetscSqrtReal(diagk); 2110 } 2111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2113 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2114 PetscFunctionReturn(0); 2115 } 2116 2117 #undef __FUNCT__ 2118 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace" 2119 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2120 { 2121 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2122 PetscErrorCode ierr; 2123 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 2124 MatScalar *aa=a->a,*v; 2125 PetscReal diagk; 2126 PetscScalar *x,*b; 2127 PetscInt nz,*vj,k; 2128 2129 PetscFunctionBegin; 2130 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2133 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2134 for (k=0; k<mbs; k++) { 2135 v = aa + ai[k] + 1; 2136 vj = aj + ai[k] + 1; 2137 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2138 while (nz--) x[*vj++] += (*v++) * x[k]; 2139 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2140 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]])); 2141 x[k] *= PetscSqrtReal(diagk); 2142 } 2143 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2144 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2145 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2146 PetscFunctionReturn(0); 2147 } 2148 2149 #undef __FUNCT__ 2150 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering" 2151 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2152 { 2153 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2154 PetscErrorCode ierr; 2155 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag; 2156 MatScalar *aa=a->a,*v; 2157 PetscReal diagk; 2158 PetscScalar *x,*b; 2159 PetscInt nz,*vj,k; 2160 2161 PetscFunctionBegin; 2162 /* solve D^(1/2)*U*x = b by back substitution */ 2163 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2164 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2165 2166 for (k=mbs-1; k>=0; k--) { 2167 v = aa + ai[k]; 2168 vj = aj + ai[k]; 2169 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2170 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2171 x[k] = PetscSqrtReal(diagk)*b[k]; 2172 nz = ai[k+1] - ai[k] - 1; 2173 while (nz--) x[k] += (*v++) * x[*vj++]; 2174 } 2175 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2177 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2178 PetscFunctionReturn(0); 2179 } 2180 2181 #undef __FUNCT__ 2182 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace" 2183 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2184 { 2185 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2186 PetscErrorCode ierr; 2187 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 2188 MatScalar *aa=a->a,*v; 2189 PetscReal diagk; 2190 PetscScalar *x,*b; 2191 PetscInt nz,*vj,k; 2192 2193 PetscFunctionBegin; 2194 /* solve D^(1/2)*U*x = b by back substitution */ 2195 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2196 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2197 2198 for (k=mbs-1; k>=0; k--) { 2199 v = aa + ai[k] + 1; 2200 vj = aj + ai[k] + 1; 2201 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2202 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2203 x[k] = PetscSqrtReal(diagk)*b[k]; 2204 nz = ai[k+1] - ai[k] - 1; 2205 while (nz--) x[k] += (*v++) * x[*vj++]; 2206 } 2207 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2208 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2209 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2210 PetscFunctionReturn(0); 2211 } 2212 2213 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2214 #undef __FUNCT__ 2215 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_MSR" 2216 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2217 { 2218 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2219 PetscErrorCode ierr; 2220 const PetscInt *rip,mbs = a->mbs,*ai,*aj; 2221 PetscInt *jutmp,bs = A->rmap->bs,bs2=a->bs2,i; 2222 PetscInt m,reallocs = 0,*levtmp; 2223 PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2224 PetscInt incrlev,*lev,shift,prow,nz; 2225 PetscReal f = info->fill,levels = info->levels; 2226 PetscBool perm_identity; 2227 2228 PetscFunctionBegin; 2229 /* check whether perm is the identity mapping */ 2230 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2231 2232 if (perm_identity) { 2233 a->permute = PETSC_FALSE; 2234 ai = a->i; aj = a->j; 2235 } else { /* non-trivial permutation */ 2236 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2237 a->permute = PETSC_TRUE; 2238 ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr); 2239 ai = a->inew; aj = a->jnew; 2240 } 2241 2242 /* initialization */ 2243 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2244 umax = (PetscInt)(f*ai[mbs] + 1); 2245 ierr = PetscMalloc1(umax,&lev);CHKERRQ(ierr); 2246 umax += mbs + 1; 2247 shift = mbs + 1; 2248 ierr = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr); 2249 ierr = PetscMalloc1(umax,&ju);CHKERRQ(ierr); 2250 iu[0] = mbs + 1; 2251 juidx = mbs + 1; 2252 /* prowl: linked list for pivot row */ 2253 ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr); 2254 /* q: linked list for col index */ 2255 2256 for (i=0; i<mbs; i++) { 2257 prowl[i] = mbs; 2258 q[i] = 0; 2259 } 2260 2261 /* for each row k */ 2262 for (k=0; k<mbs; k++) { 2263 nzk = 0; 2264 q[k] = mbs; 2265 /* copy current row into linked list */ 2266 nz = ai[rip[k]+1] - ai[rip[k]]; 2267 j = ai[rip[k]]; 2268 while (nz--) { 2269 vj = rip[aj[j++]]; 2270 if (vj > k) { 2271 qm = k; 2272 do { 2273 m = qm; qm = q[m]; 2274 } while (qm < vj); 2275 if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n"); 2276 nzk++; 2277 q[m] = vj; 2278 q[vj] = qm; 2279 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2280 } 2281 } 2282 2283 /* modify nonzero structure of k-th row by computing fill-in 2284 for each row prow to be merged in */ 2285 prow = k; 2286 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2287 2288 while (prow < k) { 2289 /* merge row prow into k-th row */ 2290 jmin = iu[prow] + 1; 2291 jmax = iu[prow+1]; 2292 qm = k; 2293 for (j=jmin; j<jmax; j++) { 2294 incrlev = lev[j-shift] + 1; 2295 if (incrlev > levels) continue; 2296 vj = ju[j]; 2297 do { 2298 m = qm; qm = q[m]; 2299 } while (qm < vj); 2300 if (qm != vj) { /* a fill */ 2301 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2302 levtmp[vj] = incrlev; 2303 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2304 } 2305 prow = prowl[prow]; /* next pivot row */ 2306 } 2307 2308 /* add k to row list for first nonzero element in k-th row */ 2309 if (nzk > 1) { 2310 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2311 prowl[k] = prowl[i]; prowl[i] = k; 2312 } 2313 iu[k+1] = iu[k] + nzk; 2314 2315 /* allocate more space to ju and lev if needed */ 2316 if (iu[k+1] > umax) { 2317 /* estimate how much additional space we will need */ 2318 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2319 /* just double the memory each time */ 2320 maxadd = umax; 2321 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2322 umax += maxadd; 2323 2324 /* allocate a longer ju */ 2325 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2326 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr); 2327 ierr = PetscFree(ju);CHKERRQ(ierr); 2328 ju = jutmp; 2329 2330 ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr); 2331 ierr = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr); 2332 ierr = PetscFree(lev);CHKERRQ(ierr); 2333 lev = jutmp; 2334 reallocs += 2; /* count how many times we realloc */ 2335 } 2336 2337 /* save nonzero structure of k-th row in ju */ 2338 i=k; 2339 while (nzk--) { 2340 i = q[i]; 2341 ju[juidx] = i; 2342 lev[juidx-shift] = levtmp[i]; 2343 juidx++; 2344 } 2345 } 2346 2347 #if defined(PETSC_USE_INFO) 2348 if (ai[mbs] != 0) { 2349 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2350 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr); 2351 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2352 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr); 2353 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 2354 } else { 2355 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2356 } 2357 #endif 2358 2359 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2360 ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr); 2361 ierr = PetscFree(lev);CHKERRQ(ierr); 2362 2363 /* put together the new matrix */ 2364 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,NULL);CHKERRQ(ierr); 2365 2366 /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */ 2367 b = (Mat_SeqSBAIJ*)(B)->data; 2368 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2369 2370 b->singlemalloc = PETSC_FALSE; 2371 b->free_a = PETSC_TRUE; 2372 b->free_ij = PETSC_TRUE; 2373 /* the next line frees the default space generated by the Create() */ 2374 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 2375 ierr = PetscMalloc1((iu[mbs]+1)*bs2,&b->a);CHKERRQ(ierr); 2376 b->j = ju; 2377 b->i = iu; 2378 b->diag = 0; 2379 b->ilen = 0; 2380 b->imax = 0; 2381 2382 ierr = ISDestroy(&b->row);CHKERRQ(ierr); 2383 ierr = ISDestroy(&b->icol);CHKERRQ(ierr); 2384 b->row = perm; 2385 b->icol = perm; 2386 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2387 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2388 ierr = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr); 2389 /* In b structure: Free imax, ilen, old a, old j. 2390 Allocate idnew, solve_work, new a, new j */ 2391 ierr = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2392 b->maxnz = b->nz = iu[mbs]; 2393 2394 (B)->info.factor_mallocs = reallocs; 2395 (B)->info.fill_ratio_given = f; 2396 if (ai[mbs] != 0) { 2397 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2398 } else { 2399 (B)->info.fill_ratio_needed = 0.0; 2400 } 2401 ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr); 2402 PetscFunctionReturn(0); 2403 } 2404 2405 /* 2406 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2407 */ 2408 #include <petscbt.h> 2409 #include <../src/mat/utils/freespace.h> 2410 #undef __FUNCT__ 2411 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ" 2412 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2413 { 2414 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2415 PetscErrorCode ierr; 2416 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2417 PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j; 2418 const PetscInt *rip; 2419 PetscInt reallocs=0,i,*ui,*udiag,*cols; 2420 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2421 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr; 2422 PetscReal fill =info->fill,levels=info->levels; 2423 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2424 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2425 PetscBT lnkbt; 2426 2427 PetscFunctionBegin; 2428 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 2429 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2430 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2431 if (bs > 1) { 2432 ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr); 2433 PetscFunctionReturn(0); 2434 } 2435 2436 /* check whether perm is the identity mapping */ 2437 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2438 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2439 a->permute = PETSC_FALSE; 2440 2441 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2442 ierr = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr); 2443 ui[0] = 0; 2444 2445 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2446 if (!levels) { 2447 /* reuse the column pointers and row offsets for memory savings */ 2448 for (i=0; i<am; i++) { 2449 ncols = ai[i+1] - ai[i]; 2450 ui[i+1] = ui[i] + ncols; 2451 udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */ 2452 } 2453 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2454 cols = uj; 2455 for (i=0; i<am; i++) { 2456 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2457 ncols = ai[i+1] - ai[i] -1; 2458 for (j=0; j<ncols; j++) *cols++ = aj[j]; 2459 *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */ 2460 } 2461 } else { /* case: levels>0 */ 2462 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2463 2464 /* initialization */ 2465 /* jl: linked list for storing indices of the pivot rows 2466 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2467 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2468 for (i=0; i<am; i++) { 2469 jl[i] = am; il[i] = 0; 2470 } 2471 2472 /* create and initialize a linked list for storing column indices of the active row k */ 2473 nlnk = am + 1; 2474 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2475 2476 /* initial FreeSpace size is fill*(ai[am]+1) */ 2477 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr); 2478 2479 current_space = free_space; 2480 2481 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr); 2482 2483 current_space_lvl = free_space_lvl; 2484 2485 for (k=0; k<am; k++) { /* for each active row k */ 2486 /* initialize lnk by the column indices of row k */ 2487 nzk = 0; 2488 ncols = ai[k+1] - ai[k]; 2489 if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k); 2490 cols = aj+ai[k]; 2491 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2492 nzk += nlnk; 2493 2494 /* update lnk by computing fill-in for each pivot row to be merged in */ 2495 prow = jl[k]; /* 1st pivot row */ 2496 2497 while (prow < k) { 2498 nextprow = jl[prow]; 2499 2500 /* merge prow into k-th row */ 2501 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2502 jmax = ui[prow+1]; 2503 ncols = jmax-jmin; 2504 i = jmin - ui[prow]; 2505 2506 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2507 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2508 j = *(uj - 1); 2509 ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2510 nzk += nlnk; 2511 2512 /* update il and jl for prow */ 2513 if (jmin < jmax) { 2514 il[prow] = jmin; 2515 2516 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2517 } 2518 prow = nextprow; 2519 } 2520 2521 /* if free space is not available, make more free space */ 2522 if (current_space->local_remaining<nzk) { 2523 i = am - k + 1; /* num of unfactored rows */ 2524 i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2525 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2526 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2527 reallocs++; 2528 } 2529 2530 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2531 if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k); 2532 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2533 2534 /* add the k-th row into il and jl */ 2535 if (nzk > 1) { 2536 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2537 jl[k] = jl[i]; jl[i] = k; 2538 il[k] = ui[k] + 1; 2539 } 2540 uj_ptr[k] = current_space->array; 2541 uj_lvl_ptr[k] = current_space_lvl->array; 2542 2543 current_space->array += nzk; 2544 current_space->local_used += nzk; 2545 current_space->local_remaining -= nzk; 2546 current_space_lvl->array += nzk; 2547 current_space_lvl->local_used += nzk; 2548 current_space_lvl->local_remaining -= nzk; 2549 2550 ui[k+1] = ui[k] + nzk; 2551 } 2552 2553 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2554 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2555 2556 /* destroy list of free space and other temporary array(s) */ 2557 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2558 ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor */ 2559 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2560 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2561 2562 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2563 2564 /* put together the new matrix in MATSEQSBAIJ format */ 2565 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2566 2567 b = (Mat_SeqSBAIJ*)(fact)->data; 2568 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2569 2570 b->singlemalloc = PETSC_FALSE; 2571 b->free_a = PETSC_TRUE; 2572 b->free_ij = free_ij; 2573 2574 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2575 2576 b->j = uj; 2577 b->i = ui; 2578 b->diag = udiag; 2579 b->free_diag = PETSC_TRUE; 2580 b->ilen = 0; 2581 b->imax = 0; 2582 b->row = perm; 2583 b->col = perm; 2584 2585 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2586 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2587 2588 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2589 2590 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2591 ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2592 2593 b->maxnz = b->nz = ui[am]; 2594 2595 fact->info.factor_mallocs = reallocs; 2596 fact->info.fill_ratio_given = fill; 2597 if (ai[am] != 0) { 2598 fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am]; 2599 } else { 2600 fact->info.fill_ratio_needed = 0.0; 2601 } 2602 #if defined(PETSC_USE_INFO) 2603 if (ai[am] != 0) { 2604 PetscReal af = fact->info.fill_ratio_needed; 2605 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2606 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2607 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2608 } else { 2609 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2610 } 2611 #endif 2612 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2613 PetscFunctionReturn(0); 2614 } 2615 2616 #undef __FUNCT__ 2617 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_inplace" 2618 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2619 { 2620 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2621 Mat_SeqSBAIJ *b; 2622 PetscErrorCode ierr; 2623 PetscBool perm_identity,free_ij = PETSC_TRUE; 2624 PetscInt bs=A->rmap->bs,am=a->mbs; 2625 const PetscInt *cols,*rip,*ai=a->i,*aj=a->j; 2626 PetscInt reallocs=0,i,*ui; 2627 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2628 PetscInt nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 2629 PetscReal fill =info->fill,levels=info->levels,ratio_needed; 2630 PetscFreeSpaceList free_space =NULL,current_space=NULL; 2631 PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL; 2632 PetscBT lnkbt; 2633 2634 PetscFunctionBegin; 2635 /* 2636 This code originally uses Modified Sparse Row (MSR) storage 2637 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2638 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2639 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2640 thus the original code in MSR format is still used for these cases. 2641 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2642 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2643 */ 2644 if (bs > 1) { 2645 ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr); 2646 PetscFunctionReturn(0); 2647 } 2648 2649 /* check whether perm is the identity mapping */ 2650 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2651 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2652 a->permute = PETSC_FALSE; 2653 2654 /* special case that simply copies fill pattern */ 2655 if (!levels) { 2656 /* reuse the column pointers and row offsets for memory savings */ 2657 ui = a->i; 2658 uj = a->j; 2659 free_ij = PETSC_FALSE; 2660 ratio_needed = 1.0; 2661 } else { /* case: levels>0 */ 2662 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2663 2664 /* initialization */ 2665 ierr = PetscMalloc1(am+1,&ui);CHKERRQ(ierr); 2666 ui[0] = 0; 2667 2668 /* jl: linked list for storing indices of the pivot rows 2669 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2670 ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr); 2671 for (i=0; i<am; i++) { 2672 jl[i] = am; il[i] = 0; 2673 } 2674 2675 /* create and initialize a linked list for storing column indices of the active row k */ 2676 nlnk = am + 1; 2677 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2678 2679 /* initial FreeSpace size is fill*(ai[am]+1) */ 2680 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr); 2681 2682 current_space = free_space; 2683 2684 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr); 2685 2686 current_space_lvl = free_space_lvl; 2687 2688 for (k=0; k<am; k++) { /* for each active row k */ 2689 /* initialize lnk by the column indices of row rip[k] */ 2690 nzk = 0; 2691 ncols = ai[rip[k]+1] - ai[rip[k]]; 2692 cols = aj+ai[rip[k]]; 2693 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2694 nzk += nlnk; 2695 2696 /* update lnk by computing fill-in for each pivot row to be merged in */ 2697 prow = jl[k]; /* 1st pivot row */ 2698 2699 while (prow < k) { 2700 nextprow = jl[prow]; 2701 2702 /* merge prow into k-th row */ 2703 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2704 jmax = ui[prow+1]; 2705 ncols = jmax-jmin; 2706 i = jmin - ui[prow]; 2707 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2708 j = *(uj_lvl_ptr[prow] + i - 1); 2709 cols_lvl = uj_lvl_ptr[prow]+i; 2710 ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2711 nzk += nlnk; 2712 2713 /* update il and jl for prow */ 2714 if (jmin < jmax) { 2715 il[prow] = jmin; 2716 j = *cols; 2717 jl[prow] = jl[j]; 2718 jl[j] = prow; 2719 } 2720 prow = nextprow; 2721 } 2722 2723 /* if free space is not available, make more free space */ 2724 if (current_space->local_remaining<nzk) { 2725 i = am - k + 1; /* num of unfactored rows */ 2726 i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2727 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2728 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2729 reallocs++; 2730 } 2731 2732 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2733 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2734 2735 /* add the k-th row into il and jl */ 2736 if (nzk-1 > 0) { 2737 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2738 jl[k] = jl[i]; jl[i] = k; 2739 il[k] = ui[k] + 1; 2740 } 2741 uj_ptr[k] = current_space->array; 2742 uj_lvl_ptr[k] = current_space_lvl->array; 2743 2744 current_space->array += nzk; 2745 current_space->local_used += nzk; 2746 current_space->local_remaining -= nzk; 2747 current_space_lvl->array += nzk; 2748 current_space_lvl->local_used += nzk; 2749 current_space_lvl->local_remaining -= nzk; 2750 2751 ui[k+1] = ui[k] + nzk; 2752 } 2753 2754 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2755 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2756 2757 /* destroy list of free space and other temporary array(s) */ 2758 ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr); 2759 ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 2760 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2761 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2762 if (ai[am] != 0) { 2763 ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 2764 } else { 2765 ratio_needed = 0.0; 2766 } 2767 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2768 2769 /* put together the new matrix in MATSEQSBAIJ format */ 2770 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr); 2771 2772 b = (Mat_SeqSBAIJ*)(fact)->data; 2773 2774 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2775 2776 b->singlemalloc = PETSC_FALSE; 2777 b->free_a = PETSC_TRUE; 2778 b->free_ij = free_ij; 2779 2780 ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr); 2781 2782 b->j = uj; 2783 b->i = ui; 2784 b->diag = 0; 2785 b->ilen = 0; 2786 b->imax = 0; 2787 b->row = perm; 2788 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2789 2790 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2791 b->icol = perm; 2792 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2793 ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr); 2794 2795 b->maxnz = b->nz = ui[am]; 2796 2797 fact->info.factor_mallocs = reallocs; 2798 fact->info.fill_ratio_given = fill; 2799 fact->info.fill_ratio_needed = ratio_needed; 2800 #if defined(PETSC_USE_INFO) 2801 if (ai[am] != 0) { 2802 PetscReal af = fact->info.fill_ratio_needed; 2803 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr); 2804 ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr); 2805 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr); 2806 } else { 2807 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2808 } 2809 #endif 2810 if (perm_identity) { 2811 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2812 } else { 2813 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2814 } 2815 PetscFunctionReturn(0); 2816 } 2817 2818