1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <../src/mat/blockinvert.h> 9 10 #undef __FUNCT__ 11 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_inplace" 12 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 13 { 14 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 15 IS isrow=a->row; 16 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 17 PetscErrorCode ierr; 18 const PetscInt *r; 19 PetscInt nz,*vj,k,idx,k1; 20 PetscInt bs=A->rmap->bs,bs2 = a->bs2; 21 MatScalar *aa=a->a,*v,*diag; 22 PetscScalar *x,*xk,*xj,*b,*xk_tmp,*t; 23 24 PetscFunctionBegin; 25 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 26 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27 t = a->solve_work; 28 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 29 ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr); 30 31 /* solve U^T * D * y = b by forward substitution */ 32 xk = t; 33 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 34 idx = bs*r[k]; 35 for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1]; 36 } 37 for (k=0; k<mbs; k++) { 38 v = aa + bs2*ai[k]; 39 xk = t + k*bs; /* Dk*xk = k-th block of x */ 40 ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */ 41 nz = ai[k+1] - ai[k]; 42 vj = aj + ai[k]; 43 xj = t + (*vj)*bs; /* *vj-th block of x, *vj>k */ 44 while (nz--) { 45 /* x(:) += U(k,:)^T*(Dk*xk) */ 46 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 47 vj++; xj = t + (*vj)*bs; 48 v += bs2; 49 } 50 /* xk = inv(Dk)*(Dk*xk) */ 51 diag = aa+k*bs2; /* ptr to inv(Dk) */ 52 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 53 } 54 55 /* solve U*x = y by back substitution */ 56 for (k=mbs-1; k>=0; k--) { 57 v = aa + bs2*ai[k]; 58 xk = t + k*bs; /* xk */ 59 nz = ai[k+1] - ai[k]; 60 vj = aj + ai[k]; 61 xj = t + (*vj)*bs; 62 while (nz--) { 63 /* xk += U(k,:)*x(:) */ 64 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 65 vj++; 66 v += bs2; xj = t + (*vj)*bs; 67 } 68 idx = bs*r[k]; 69 for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++; 70 } 71 72 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 73 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 74 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 75 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 76 ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr); 77 PetscFunctionReturn(0); 78 } 79 80 #undef __FUNCT__ 81 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_inplace" 82 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 83 { 84 PetscFunctionBegin; 85 SETERRQ(PETSC_COMM_SELF,1,"not implemented yet"); 86 PetscFunctionReturn(0); 87 } 88 89 #undef __FUNCT__ 90 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_inplace" 91 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 92 { 93 PetscFunctionBegin; 94 SETERRQ(PETSC_COMM_SELF,1,"not implemented yet"); 95 PetscFunctionReturn(0); 96 } 97 98 #undef __FUNCT__ 99 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering" 100 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 101 { 102 PetscErrorCode ierr; 103 PetscInt nz,*vj,k; 104 PetscInt bs2 = bs*bs; 105 MatScalar *v,*diag; 106 PetscScalar *xk,*xj,*xk_tmp; 107 108 PetscFunctionBegin; 109 ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr); 110 for (k=0; k<mbs; k++) { 111 v = aa + bs2*ai[k]; 112 xk = x + k*bs; /* Dk*xk = k-th block of x */ 113 ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */ 114 nz = ai[k+1] - ai[k]; 115 vj = aj + ai[k]; 116 xj = x + (*vj)*bs; /* *vj-th block of x, *vj>k */ 117 while (nz--) { 118 /* x(:) += U(k,:)^T*(Dk*xk) */ 119 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */ 120 vj++; xj = x + (*vj)*bs; 121 v += bs2; 122 } 123 /* xk = inv(Dk)*(Dk*xk) */ 124 diag = aa+k*bs2; /* ptr to inv(Dk) */ 125 PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */ 126 } 127 ierr = PetscFree(xk_tmp);CHKERRQ(ierr); 128 PetscFunctionReturn(0); 129 } 130 131 #undef __FUNCT__ 132 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering" 133 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x) 134 { 135 PetscInt nz,*vj,k; 136 PetscInt bs2 = bs*bs; 137 MatScalar *v; 138 PetscScalar *xk,*xj; 139 140 PetscFunctionBegin; 141 for (k=mbs-1; k>=0; k--) { 142 v = aa + bs2*ai[k]; 143 xk = x + k*bs; /* xk */ 144 nz = ai[k+1] - ai[k]; 145 vj = aj + ai[k]; 146 xj = x + (*vj)*bs; 147 while (nz--) { 148 /* xk += U(k,:)*x(:) */ 149 PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */ 150 vj++; 151 v += bs2; xj = x + (*vj)*bs; 152 } 153 } 154 PetscFunctionReturn(0); 155 } 156 157 #undef __FUNCT__ 158 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace" 159 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 160 { 161 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 162 PetscErrorCode ierr; 163 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 164 PetscInt bs=A->rmap->bs; 165 MatScalar *aa=a->a; 166 PetscScalar *x,*b; 167 168 PetscFunctionBegin; 169 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 170 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 171 172 /* solve U^T * D * y = b by forward substitution */ 173 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 174 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 175 176 /* solve U*x = y by back substitution */ 177 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 178 179 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 180 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 181 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 182 PetscFunctionReturn(0); 183 } 184 185 #undef __FUNCT__ 186 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace" 187 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 188 { 189 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 190 PetscErrorCode ierr; 191 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 192 PetscInt bs=A->rmap->bs; 193 MatScalar *aa=a->a; 194 PetscScalar *x,*b; 195 196 PetscFunctionBegin; 197 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 198 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 199 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 200 ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 201 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 202 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 203 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 204 PetscFunctionReturn(0); 205 } 206 207 #undef __FUNCT__ 208 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace" 209 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 210 { 211 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 212 PetscErrorCode ierr; 213 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 214 PetscInt bs=A->rmap->bs; 215 MatScalar *aa=a->a; 216 PetscScalar *x,*b; 217 218 PetscFunctionBegin; 219 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 220 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 221 ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 222 ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr); 223 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 224 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 225 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 226 PetscFunctionReturn(0); 227 } 228 229 #undef __FUNCT__ 230 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_inplace" 231 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 232 { 233 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 234 IS isrow=a->row; 235 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs; 236 PetscErrorCode ierr; 237 const PetscInt *r; 238 PetscInt nz,*vj,k,idx; 239 MatScalar *aa=a->a,*v,*d; 240 PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp; 241 242 PetscFunctionBegin; 243 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 244 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 245 t = a->solve_work; 246 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 247 248 /* solve U^T * D * y = b by forward substitution */ 249 tp = t; 250 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 251 idx = 7*r[k]; 252 tp[0] = b[idx]; 253 tp[1] = b[idx+1]; 254 tp[2] = b[idx+2]; 255 tp[3] = b[idx+3]; 256 tp[4] = b[idx+4]; 257 tp[5] = b[idx+5]; 258 tp[6] = b[idx+6]; 259 tp += 7; 260 } 261 262 for (k=0; k<mbs; k++) { 263 v = aa + 49*ai[k]; 264 vj = aj + ai[k]; 265 tp = t + k*7; 266 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; 267 nz = ai[k+1] - ai[k]; 268 tp = t + (*vj)*7; 269 while (nz--) { 270 tp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 271 tp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 272 tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 273 tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 274 tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 275 tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 276 tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 277 vj++; tp = t + (*vj)*7; 278 v += 49; 279 } 280 281 /* xk = inv(Dk)*(Dk*xk) */ 282 d = aa+k*49; /* ptr to inv(Dk) */ 283 tp = t + k*7; 284 tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 285 tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 286 tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 287 tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 288 tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 289 tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 290 tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 291 } 292 293 /* solve U*x = y by back substitution */ 294 for (k=mbs-1; k>=0; k--) { 295 v = aa + 49*ai[k]; 296 vj = aj + ai[k]; 297 tp = t + k*7; 298 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6]; /* xk */ 299 nz = ai[k+1] - ai[k]; 300 301 tp = t + (*vj)*7; 302 while (nz--) { 303 /* xk += U(k,:)*x(:) */ 304 x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6]; 305 x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6]; 306 x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6]; 307 x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6]; 308 x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6]; 309 x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6]; 310 x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6]; 311 vj++; tp = t + (*vj)*7; 312 v += 49; 313 } 314 tp = t + k*7; 315 tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6; 316 idx = 7*r[k]; 317 x[idx] = x0; 318 x[idx+1] = x1; 319 x[idx+2] = x2; 320 x[idx+3] = x3; 321 x[idx+4] = x4; 322 x[idx+5] = x5; 323 x[idx+6] = x6; 324 } 325 326 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 327 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 328 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 329 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 330 PetscFunctionReturn(0); 331 } 332 333 #undef __FUNCT__ 334 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering" 335 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 336 { 337 MatScalar *v,*d; 338 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 339 PetscInt nz,*vj,k; 340 341 PetscFunctionBegin; 342 for (k=0; k<mbs; k++) { 343 v = aa + 49*ai[k]; 344 xp = x + k*7; 345 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */ 346 nz = ai[k+1] - ai[k]; 347 vj = aj + ai[k]; 348 xp = x + (*vj)*7; 349 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 350 PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 351 while (nz--) { 352 /* x(:) += U(k,:)^T*(Dk*xk) */ 353 xp[0]+= v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6; 354 xp[1]+= v[7]*x0 + v[8]*x1 + v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6; 355 xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6; 356 xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6; 357 xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6; 358 xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6; 359 xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6; 360 vj++; xp = x + (*vj)*7; 361 v += 49; 362 } 363 /* xk = inv(Dk)*(Dk*xk) */ 364 d = aa+k*49; /* ptr to inv(Dk) */ 365 xp = x + k*7; 366 xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6; 367 xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6; 368 xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6; 369 xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6; 370 xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6; 371 xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6; 372 xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6; 373 } 374 PetscFunctionReturn(0); 375 } 376 377 #undef __FUNCT__ 378 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering" 379 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 380 { 381 MatScalar *v; 382 PetscScalar *xp,x0,x1,x2,x3,x4,x5,x6; 383 PetscInt nz,*vj,k; 384 385 PetscFunctionBegin; 386 for (k=mbs-1; k>=0; k--) { 387 v = aa + 49*ai[k]; 388 xp = x + k*7; 389 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */ 390 nz = ai[k+1] - ai[k]; 391 vj = aj + ai[k]; 392 xp = x + (*vj)*7; 393 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 394 PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 395 while (nz--) { 396 /* xk += U(k,:)*x(:) */ 397 x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6]; 398 x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6]; 399 x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6]; 400 x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6]; 401 x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6]; 402 x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6]; 403 x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6]; 404 vj++; 405 v += 49; xp = x + (*vj)*7; 406 } 407 xp = x + k*7; 408 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6; 409 } 410 PetscFunctionReturn(0); 411 } 412 413 #undef __FUNCT__ 414 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace" 415 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 416 { 417 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 418 PetscErrorCode ierr; 419 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 420 MatScalar *aa=a->a; 421 PetscScalar *x,*b; 422 423 PetscFunctionBegin; 424 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 425 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 426 427 /* solve U^T * D * y = b by forward substitution */ 428 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 429 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 430 431 /* solve U*x = y by back substitution */ 432 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 433 434 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 435 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 436 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 437 PetscFunctionReturn(0); 438 } 439 440 #undef __FUNCT__ 441 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace" 442 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 443 { 444 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 445 PetscErrorCode ierr; 446 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 447 MatScalar *aa=a->a; 448 PetscScalar *x,*b; 449 450 PetscFunctionBegin; 451 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 452 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 453 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 454 ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 455 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 457 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 458 PetscFunctionReturn(0); 459 } 460 461 #undef __FUNCT__ 462 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace" 463 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 464 { 465 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 466 PetscErrorCode ierr; 467 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 468 MatScalar *aa=a->a; 469 PetscScalar *x,*b; 470 471 PetscFunctionBegin; 472 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 473 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 474 ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 475 ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 476 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 477 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 478 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 479 PetscFunctionReturn(0); 480 } 481 482 #undef __FUNCT__ 483 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_inplace" 484 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 485 { 486 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 487 IS isrow=a->row; 488 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 489 PetscErrorCode ierr; 490 const PetscInt *r; 491 PetscInt nz,*vj,k,idx; 492 MatScalar *aa=a->a,*v,*d; 493 PetscScalar *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp; 494 495 PetscFunctionBegin; 496 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 497 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 498 t = a->solve_work; 499 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 500 501 /* solve U^T * D * y = b by forward substitution */ 502 tp = t; 503 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 504 idx = 6*r[k]; 505 tp[0] = b[idx]; 506 tp[1] = b[idx+1]; 507 tp[2] = b[idx+2]; 508 tp[3] = b[idx+3]; 509 tp[4] = b[idx+4]; 510 tp[5] = b[idx+5]; 511 tp += 6; 512 } 513 514 for (k=0; k<mbs; k++) { 515 v = aa + 36*ai[k]; 516 vj = aj + ai[k]; 517 tp = t + k*6; 518 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; 519 nz = ai[k+1] - ai[k]; 520 tp = t + (*vj)*6; 521 while (nz--) { 522 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 523 tp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 524 tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 525 tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 526 tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 527 tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 528 vj++; tp = t + (*vj)*6; 529 v += 36; 530 } 531 532 /* xk = inv(Dk)*(Dk*xk) */ 533 d = aa+k*36; /* ptr to inv(Dk) */ 534 tp = t + k*6; 535 tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 536 tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 537 tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 538 tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 539 tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 540 tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 541 } 542 543 /* solve U*x = y by back substitution */ 544 for (k=mbs-1; k>=0; k--) { 545 v = aa + 36*ai[k]; 546 vj = aj + ai[k]; 547 tp = t + k*6; 548 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */ 549 nz = ai[k+1] - ai[k]; 550 551 tp = t + (*vj)*6; 552 while (nz--) { 553 /* xk += U(k,:)*x(:) */ 554 x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5]; 555 x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5]; 556 x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5]; 557 x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5]; 558 x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5]; 559 x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5]; 560 vj++; tp = t + (*vj)*6; 561 v += 36; 562 } 563 tp = t + k*6; 564 tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; 565 idx = 6*r[k]; 566 x[idx] = x0; 567 x[idx+1] = x1; 568 x[idx+2] = x2; 569 x[idx+3] = x3; 570 x[idx+4] = x4; 571 x[idx+5] = x5; 572 } 573 574 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 575 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 576 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 577 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 578 PetscFunctionReturn(0); 579 } 580 581 #undef __FUNCT__ 582 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering" 583 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 584 { 585 MatScalar *v,*d; 586 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 587 PetscInt nz,*vj,k; 588 589 PetscFunctionBegin; 590 for (k=0; k<mbs; k++) { 591 v = aa + 36*ai[k]; 592 xp = x + k*6; 593 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */ 594 nz = ai[k+1] - ai[k]; 595 vj = aj + ai[k]; 596 xp = x + (*vj)*6; 597 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 598 PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 599 while (nz--) { 600 /* x(:) += U(k,:)^T*(Dk*xk) */ 601 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5; 602 xp[1] += v[6]*x0 + v[7]*x1 + v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5; 603 xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5; 604 xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5; 605 xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5; 606 xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5; 607 vj++; xp = x + (*vj)*6; 608 v += 36; 609 } 610 /* xk = inv(Dk)*(Dk*xk) */ 611 d = aa+k*36; /* ptr to inv(Dk) */ 612 xp = x + k*6; 613 xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5; 614 xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5; 615 xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5; 616 xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5; 617 xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5; 618 xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5; 619 } 620 PetscFunctionReturn(0); 621 } 622 #undef __FUNCT__ 623 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering" 624 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 625 { 626 MatScalar *v; 627 PetscScalar *xp,x0,x1,x2,x3,x4,x5; 628 PetscInt nz,*vj,k; 629 630 PetscFunctionBegin; 631 for (k=mbs-1; k>=0; k--) { 632 v = aa + 36*ai[k]; 633 xp = x + k*6; 634 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */ 635 nz = ai[k+1] - ai[k]; 636 vj = aj + ai[k]; 637 xp = x + (*vj)*6; 638 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 639 PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 640 while (nz--) { 641 /* xk += U(k,:)*x(:) */ 642 x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5]; 643 x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5]; 644 x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5]; 645 x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5]; 646 x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5]; 647 x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5]; 648 vj++; 649 v += 36; xp = x + (*vj)*6; 650 } 651 xp = x + k*6; 652 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; 653 } 654 PetscFunctionReturn(0); 655 } 656 657 658 #undef __FUNCT__ 659 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace" 660 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 661 { 662 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 663 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 664 MatScalar *aa=a->a; 665 PetscScalar *x,*b; 666 PetscErrorCode ierr; 667 668 PetscFunctionBegin; 669 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 670 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 671 672 /* solve U^T * D * y = b by forward substitution */ 673 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 674 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 675 676 /* solve U*x = y by back substitution */ 677 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 678 679 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 680 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 681 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 682 PetscFunctionReturn(0); 683 } 684 685 #undef __FUNCT__ 686 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace" 687 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 688 { 689 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 690 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 691 MatScalar *aa=a->a; 692 PetscScalar *x,*b; 693 PetscErrorCode ierr; 694 695 PetscFunctionBegin; 696 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 697 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 698 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 699 ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 700 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 701 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 702 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 703 PetscFunctionReturn(0); 704 } 705 706 #undef __FUNCT__ 707 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace" 708 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 709 { 710 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 711 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 712 MatScalar *aa=a->a; 713 PetscScalar *x,*b; 714 PetscErrorCode ierr; 715 716 PetscFunctionBegin; 717 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 718 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 719 ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 720 ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 721 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 722 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 723 ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr); 724 PetscFunctionReturn(0); 725 } 726 727 #undef __FUNCT__ 728 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_inplace" 729 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 730 { 731 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 732 IS isrow=a->row; 733 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2; 734 PetscErrorCode ierr; 735 const PetscInt *r; 736 PetscInt nz,*vj,k,idx; 737 MatScalar *aa=a->a,*v,*diag; 738 PetscScalar *x,*b,x0,x1,x2,x3,x4,*t,*tp; 739 740 PetscFunctionBegin; 741 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 742 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 743 t = a->solve_work; 744 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 745 746 /* solve U^T * D * y = b by forward substitution */ 747 tp = t; 748 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 749 idx = 5*r[k]; 750 tp[0] = b[idx]; 751 tp[1] = b[idx+1]; 752 tp[2] = b[idx+2]; 753 tp[3] = b[idx+3]; 754 tp[4] = b[idx+4]; 755 tp += 5; 756 } 757 758 for (k=0; k<mbs; k++) { 759 v = aa + 25*ai[k]; 760 vj = aj + ai[k]; 761 tp = t + k*5; 762 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; 763 nz = ai[k+1] - ai[k]; 764 765 tp = t + (*vj)*5; 766 while (nz--) { 767 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 768 tp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 769 tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4; 770 tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4; 771 tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4; 772 vj++; tp = t + (*vj)*5; 773 v += 25; 774 } 775 776 /* xk = inv(Dk)*(Dk*xk) */ 777 diag = aa+k*25; /* ptr to inv(Dk) */ 778 tp = t + k*5; 779 tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 780 tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 781 tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 782 tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 783 tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 784 } 785 786 /* solve U*x = y by back substitution */ 787 for (k=mbs-1; k>=0; k--) { 788 v = aa + 25*ai[k]; 789 vj = aj + ai[k]; 790 tp = t + k*5; 791 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */ 792 nz = ai[k+1] - ai[k]; 793 794 tp = t + (*vj)*5; 795 while (nz--) { 796 /* xk += U(k,:)*x(:) */ 797 x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4]; 798 x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4]; 799 x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4]; 800 x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4]; 801 x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4]; 802 vj++; tp = t + (*vj)*5; 803 v += 25; 804 } 805 tp = t + k*5; 806 tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; 807 idx = 5*r[k]; 808 x[idx] = x0; 809 x[idx+1] = x1; 810 x[idx+2] = x2; 811 x[idx+3] = x3; 812 x[idx+4] = x4; 813 } 814 815 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 816 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 817 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 818 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 819 PetscFunctionReturn(0); 820 } 821 822 #undef __FUNCT__ 823 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering" 824 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 825 { 826 MatScalar *v,*diag; 827 PetscScalar *xp,x0,x1,x2,x3,x4; 828 PetscInt nz,*vj,k; 829 830 PetscFunctionBegin; 831 for (k=0; k<mbs; k++) { 832 v = aa + 25*ai[k]; 833 xp = x + k*5; 834 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */ 835 nz = ai[k+1] - ai[k]; 836 vj = aj + ai[k]; 837 xp = x + (*vj)*5; 838 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 839 PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 840 while (nz--) { 841 /* x(:) += U(k,:)^T*(Dk*xk) */ 842 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4; 843 xp[1] += v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4; 844 xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4; 845 xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4; 846 xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4; 847 vj++; xp = x + (*vj)*5; 848 v += 25; 849 } 850 /* xk = inv(Dk)*(Dk*xk) */ 851 diag = aa+k*25; /* ptr to inv(Dk) */ 852 xp = x + k*5; 853 xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4; 854 xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4; 855 xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4; 856 xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4; 857 xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4; 858 } 859 PetscFunctionReturn(0); 860 } 861 862 #undef __FUNCT__ 863 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering" 864 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 865 { 866 MatScalar *v; 867 PetscScalar *xp,x0,x1,x2,x3,x4; 868 PetscInt nz,*vj,k; 869 870 PetscFunctionBegin; 871 for (k=mbs-1; k>=0; k--) { 872 v = aa + 25*ai[k]; 873 xp = x + k*5; 874 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */ 875 nz = ai[k+1] - ai[k]; 876 vj = aj + ai[k]; 877 xp = x + (*vj)*5; 878 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 879 PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 880 while (nz--) { 881 /* xk += U(k,:)*x(:) */ 882 x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4]; 883 x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4]; 884 x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4]; 885 x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4]; 886 x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4]; 887 vj++; 888 v += 25; xp = x + (*vj)*5; 889 } 890 xp = x + k*5; 891 xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; 892 } 893 PetscFunctionReturn(0); 894 } 895 896 #undef __FUNCT__ 897 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace" 898 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 899 { 900 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 901 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2; 902 MatScalar *aa=a->a; 903 PetscScalar *x,*b; 904 PetscErrorCode ierr; 905 906 PetscFunctionBegin; 907 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 908 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 909 910 /* solve U^T * D * y = b by forward substitution */ 911 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 912 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 913 914 /* solve U*x = y by back substitution */ 915 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 916 917 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 918 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 919 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 920 PetscFunctionReturn(0); 921 } 922 923 #undef __FUNCT__ 924 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace" 925 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 926 { 927 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 928 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 929 MatScalar *aa=a->a; 930 PetscScalar *x,*b; 931 PetscErrorCode ierr; 932 933 PetscFunctionBegin; 934 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 935 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 936 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 937 ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 938 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 939 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 940 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 941 PetscFunctionReturn(0); 942 } 943 944 #undef __FUNCT__ 945 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace" 946 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 947 { 948 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 949 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 950 MatScalar *aa=a->a; 951 PetscScalar *x,*b; 952 PetscErrorCode ierr; 953 954 PetscFunctionBegin; 955 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 956 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 957 ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 958 ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 959 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 960 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 961 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 962 PetscFunctionReturn(0); 963 } 964 965 #undef __FUNCT__ 966 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_inplace" 967 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 968 { 969 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 970 IS isrow=a->row; 971 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 972 PetscErrorCode ierr; 973 const PetscInt *r; 974 PetscInt nz,*vj,k,idx; 975 MatScalar *aa=a->a,*v,*diag; 976 PetscScalar *x,*b,x0,x1,x2,x3,*t,*tp; 977 978 PetscFunctionBegin; 979 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 980 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 981 t = a->solve_work; 982 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 983 984 /* solve U^T * D * y = b by forward substitution */ 985 tp = t; 986 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 987 idx = 4*r[k]; 988 tp[0] = b[idx]; 989 tp[1] = b[idx+1]; 990 tp[2] = b[idx+2]; 991 tp[3] = b[idx+3]; 992 tp += 4; 993 } 994 995 for (k=0; k<mbs; k++) { 996 v = aa + 16*ai[k]; 997 vj = aj + ai[k]; 998 tp = t + k*4; 999 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; 1000 nz = ai[k+1] - ai[k]; 1001 1002 tp = t + (*vj)*4; 1003 while (nz--) { 1004 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1005 tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1006 tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1007 tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1008 vj++; tp = t + (*vj)*4; 1009 v += 16; 1010 } 1011 1012 /* xk = inv(Dk)*(Dk*xk) */ 1013 diag = aa+k*16; /* ptr to inv(Dk) */ 1014 tp = t + k*4; 1015 tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1016 tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1017 tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1018 tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1019 } 1020 1021 /* solve U*x = y by back substitution */ 1022 for (k=mbs-1; k>=0; k--) { 1023 v = aa + 16*ai[k]; 1024 vj = aj + ai[k]; 1025 tp = t + k*4; 1026 x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */ 1027 nz = ai[k+1] - ai[k]; 1028 1029 tp = t + (*vj)*4; 1030 while (nz--) { 1031 /* xk += U(k,:)*x(:) */ 1032 x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3]; 1033 x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3]; 1034 x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3]; 1035 x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3]; 1036 vj++; tp = t + (*vj)*4; 1037 v += 16; 1038 } 1039 tp = t + k*4; 1040 tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; 1041 idx = 4*r[k]; 1042 x[idx] = x0; 1043 x[idx+1] = x1; 1044 x[idx+2] = x2; 1045 x[idx+3] = x3; 1046 } 1047 1048 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1049 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1050 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1051 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1052 PetscFunctionReturn(0); 1053 } 1054 1055 #undef __FUNCT__ 1056 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering" 1057 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1058 { 1059 MatScalar *v,*diag; 1060 PetscScalar *xp,x0,x1,x2,x3; 1061 PetscInt nz,*vj,k; 1062 1063 PetscFunctionBegin; 1064 for (k=0; k<mbs; k++) { 1065 v = aa + 16*ai[k]; 1066 xp = x + k*4; 1067 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */ 1068 nz = ai[k+1] - ai[k]; 1069 vj = aj + ai[k]; 1070 xp = x + (*vj)*4; 1071 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1072 PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1073 while (nz--) { 1074 /* x(:) += U(k,:)^T*(Dk*xk) */ 1075 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3; 1076 xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3; 1077 xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3; 1078 xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3; 1079 vj++; xp = x + (*vj)*4; 1080 v += 16; 1081 } 1082 /* xk = inv(Dk)*(Dk*xk) */ 1083 diag = aa+k*16; /* ptr to inv(Dk) */ 1084 xp = x + k*4; 1085 xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3; 1086 xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3; 1087 xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3; 1088 xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3; 1089 } 1090 PetscFunctionReturn(0); 1091 } 1092 1093 #undef __FUNCT__ 1094 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering" 1095 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1096 { 1097 MatScalar *v; 1098 PetscScalar *xp,x0,x1,x2,x3; 1099 PetscInt nz,*vj,k; 1100 1101 PetscFunctionBegin; 1102 for (k=mbs-1; k>=0; k--) { 1103 v = aa + 16*ai[k]; 1104 xp = x + k*4; 1105 x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */ 1106 nz = ai[k+1] - ai[k]; 1107 vj = aj + ai[k]; 1108 xp = x + (*vj)*4; 1109 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1110 PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1111 while (nz--) { 1112 /* xk += U(k,:)*x(:) */ 1113 x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3]; 1114 x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3]; 1115 x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3]; 1116 x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3]; 1117 vj++; 1118 v += 16; xp = x + (*vj)*4; 1119 } 1120 xp = x + k*4; 1121 xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3; 1122 } 1123 PetscFunctionReturn(0); 1124 } 1125 1126 #undef __FUNCT__ 1127 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace" 1128 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1129 { 1130 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1131 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1132 MatScalar *aa=a->a; 1133 PetscScalar *x,*b; 1134 PetscErrorCode ierr; 1135 1136 PetscFunctionBegin; 1137 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1138 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1139 1140 /* solve U^T * D * y = b by forward substitution */ 1141 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 1142 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1143 1144 /* solve U*x = y by back substitution */ 1145 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1146 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1147 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1148 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1149 PetscFunctionReturn(0); 1150 } 1151 1152 #undef __FUNCT__ 1153 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace" 1154 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1155 { 1156 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1157 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1158 MatScalar *aa=a->a; 1159 PetscScalar *x,*b; 1160 PetscErrorCode ierr; 1161 1162 PetscFunctionBegin; 1163 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1164 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1165 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */ 1166 ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1168 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1169 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 1170 PetscFunctionReturn(0); 1171 } 1172 1173 #undef __FUNCT__ 1174 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace" 1175 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1176 { 1177 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1178 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 1179 MatScalar *aa=a->a; 1180 PetscScalar *x,*b; 1181 PetscErrorCode ierr; 1182 1183 PetscFunctionBegin; 1184 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1185 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1186 ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1187 ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1188 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1189 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1190 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 1191 PetscFunctionReturn(0); 1192 } 1193 1194 #undef __FUNCT__ 1195 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_inplace" 1196 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1197 { 1198 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1199 IS isrow=a->row; 1200 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1201 PetscErrorCode ierr; 1202 const PetscInt *r; 1203 PetscInt nz,*vj,k,idx; 1204 MatScalar *aa=a->a,*v,*diag; 1205 PetscScalar *x,*b,x0,x1,x2,*t,*tp; 1206 1207 PetscFunctionBegin; 1208 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1209 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1210 t = a->solve_work; 1211 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1212 1213 /* solve U^T * D * y = b by forward substitution */ 1214 tp = t; 1215 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1216 idx = 3*r[k]; 1217 tp[0] = b[idx]; 1218 tp[1] = b[idx+1]; 1219 tp[2] = b[idx+2]; 1220 tp += 3; 1221 } 1222 1223 for (k=0; k<mbs; k++) { 1224 v = aa + 9*ai[k]; 1225 vj = aj + ai[k]; 1226 tp = t + k*3; 1227 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; 1228 nz = ai[k+1] - ai[k]; 1229 1230 tp = t + (*vj)*3; 1231 while (nz--) { 1232 tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1233 tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1234 tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1235 vj++; tp = t + (*vj)*3; 1236 v += 9; 1237 } 1238 1239 /* xk = inv(Dk)*(Dk*xk) */ 1240 diag = aa+k*9; /* ptr to inv(Dk) */ 1241 tp = t + k*3; 1242 tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1243 tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1244 tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1245 } 1246 1247 /* solve U*x = y by back substitution */ 1248 for (k=mbs-1; k>=0; k--) { 1249 v = aa + 9*ai[k]; 1250 vj = aj + ai[k]; 1251 tp = t + k*3; 1252 x0 = tp[0]; x1 = tp[1]; x2 = tp[2]; /* xk */ 1253 nz = ai[k+1] - ai[k]; 1254 1255 tp = t + (*vj)*3; 1256 while (nz--) { 1257 /* xk += U(k,:)*x(:) */ 1258 x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2]; 1259 x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2]; 1260 x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2]; 1261 vj++; tp = t + (*vj)*3; 1262 v += 9; 1263 } 1264 tp = t + k*3; 1265 tp[0] = x0; tp[1] = x1; tp[2] = x2; 1266 idx = 3*r[k]; 1267 x[idx] = x0; 1268 x[idx+1] = x1; 1269 x[idx+2] = x2; 1270 } 1271 1272 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1273 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1274 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1275 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1276 PetscFunctionReturn(0); 1277 } 1278 1279 #undef __FUNCT__ 1280 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering" 1281 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1282 { 1283 MatScalar *v,*diag; 1284 PetscScalar *xp,x0,x1,x2; 1285 PetscInt nz,*vj,k; 1286 1287 PetscFunctionBegin; 1288 for (k=0; k<mbs; k++) { 1289 v = aa + 9*ai[k]; 1290 xp = x + k*3; 1291 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */ 1292 nz = ai[k+1] - ai[k]; 1293 vj = aj + ai[k]; 1294 xp = x + (*vj)*3; 1295 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1296 PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1297 while (nz--) { 1298 /* x(:) += U(k,:)^T*(Dk*xk) */ 1299 xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2; 1300 xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2; 1301 xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2; 1302 vj++; xp = x + (*vj)*3; 1303 v += 9; 1304 } 1305 /* xk = inv(Dk)*(Dk*xk) */ 1306 diag = aa+k*9; /* ptr to inv(Dk) */ 1307 xp = x + k*3; 1308 xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2; 1309 xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2; 1310 xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2; 1311 } 1312 PetscFunctionReturn(0); 1313 } 1314 1315 #undef __FUNCT__ 1316 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering" 1317 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1318 { 1319 MatScalar *v; 1320 PetscScalar *xp,x0,x1,x2; 1321 PetscInt nz,*vj,k; 1322 1323 PetscFunctionBegin; 1324 for (k=mbs-1; k>=0; k--) { 1325 v = aa + 9*ai[k]; 1326 xp = x + k*3; 1327 x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* xk */ 1328 nz = ai[k+1] - ai[k]; 1329 vj = aj + ai[k]; 1330 xp = x + (*vj)*3; 1331 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1332 PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1333 while (nz--) { 1334 /* xk += U(k,:)*x(:) */ 1335 x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2]; 1336 x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2]; 1337 x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2]; 1338 vj++; 1339 v += 9; xp = x + (*vj)*3; 1340 } 1341 xp = x + k*3; 1342 xp[0] = x0; xp[1] = x1; xp[2] = x2; 1343 } 1344 PetscFunctionReturn(0); 1345 } 1346 1347 #undef __FUNCT__ 1348 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace" 1349 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1350 { 1351 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1352 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1353 MatScalar *aa=a->a; 1354 PetscScalar *x,*b; 1355 PetscErrorCode ierr; 1356 1357 PetscFunctionBegin; 1358 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1359 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1360 1361 /* solve U^T * D * y = b by forward substitution */ 1362 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1363 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1364 1365 /* solve U*x = y by back substitution */ 1366 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1367 1368 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1369 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1370 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1371 PetscFunctionReturn(0); 1372 } 1373 1374 #undef __FUNCT__ 1375 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace" 1376 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1377 { 1378 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1379 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1380 MatScalar *aa=a->a; 1381 PetscScalar *x,*b; 1382 PetscErrorCode ierr; 1383 1384 PetscFunctionBegin; 1385 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1386 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1387 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1388 ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1389 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1390 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1391 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 1392 PetscFunctionReturn(0); 1393 } 1394 1395 #undef __FUNCT__ 1396 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace" 1397 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1398 { 1399 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1400 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 1401 MatScalar *aa=a->a; 1402 PetscScalar *x,*b; 1403 PetscErrorCode ierr; 1404 1405 PetscFunctionBegin; 1406 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1407 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1408 ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1409 ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1410 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1411 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1412 ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr); 1413 PetscFunctionReturn(0); 1414 } 1415 1416 #undef __FUNCT__ 1417 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_inplace" 1418 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1419 { 1420 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ *)A->data; 1421 IS isrow=a->row; 1422 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1423 PetscErrorCode ierr; 1424 const PetscInt *r; 1425 PetscInt nz,*vj,k,k2,idx; 1426 MatScalar *aa=a->a,*v,*diag; 1427 PetscScalar *x,*b,x0,x1,*t; 1428 1429 PetscFunctionBegin; 1430 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1431 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1432 t = a->solve_work; 1433 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1434 1435 /* solve U^T * D * y = perm(b) by forward substitution */ 1436 for (k=0; k<mbs; k++) { /* t <- perm(b) */ 1437 idx = 2*r[k]; 1438 t[k*2] = b[idx]; 1439 t[k*2+1] = b[idx+1]; 1440 } 1441 for (k=0; k<mbs; k++) { 1442 v = aa + 4*ai[k]; 1443 vj = aj + ai[k]; 1444 k2 = k*2; 1445 x0 = t[k2]; x1 = t[k2+1]; 1446 nz = ai[k+1] - ai[k]; 1447 while (nz--) { 1448 t[(*vj)*2] += v[0]*x0 + v[1]*x1; 1449 t[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1450 vj++; v += 4; 1451 } 1452 diag = aa+k*4; /* ptr to inv(Dk) */ 1453 t[k2] = diag[0]*x0 + diag[2]*x1; 1454 t[k2+1] = diag[1]*x0 + diag[3]*x1; 1455 } 1456 1457 /* solve U*x = y by back substitution */ 1458 for (k=mbs-1; k>=0; k--) { 1459 v = aa + 4*ai[k]; 1460 vj = aj + ai[k]; 1461 k2 = k*2; 1462 x0 = t[k2]; x1 = t[k2+1]; 1463 nz = ai[k+1] - ai[k]; 1464 while (nz--) { 1465 x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1]; 1466 x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1]; 1467 vj++; v += 4; 1468 } 1469 t[k2] = x0; 1470 t[k2+1] = x1; 1471 idx = 2*r[k]; 1472 x[idx] = x0; 1473 x[idx+1] = x1; 1474 } 1475 1476 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1477 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1478 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1479 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1480 PetscFunctionReturn(0); 1481 } 1482 1483 #undef __FUNCT__ 1484 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering" 1485 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1486 { 1487 MatScalar *v,*diag; 1488 PetscScalar x0,x1; 1489 PetscInt nz,*vj,k,k2; 1490 1491 PetscFunctionBegin; 1492 for (k=0; k<mbs; k++) { 1493 v = aa + 4*ai[k]; 1494 vj = aj + ai[k]; 1495 k2 = k*2; 1496 x0 = x[k2]; x1 = x[k2+1]; /* Dk*xk = k-th block of x */ 1497 nz = ai[k+1] - ai[k]; 1498 PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1499 PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1500 while (nz--) { 1501 /* x(:) += U(k,:)^T*(Dk*xk) */ 1502 x[(*vj)*2] += v[0]*x0 + v[1]*x1; 1503 x[(*vj)*2+1] += v[2]*x0 + v[3]*x1; 1504 vj++; v += 4; 1505 } 1506 /* xk = inv(Dk)*(Dk*xk) */ 1507 diag = aa+k*4; /* ptr to inv(Dk) */ 1508 x[k2] = diag[0]*x0 + diag[2]*x1; 1509 x[k2+1] = diag[1]*x0 + diag[3]*x1; 1510 } 1511 PetscFunctionReturn(0); 1512 } 1513 1514 #undef __FUNCT__ 1515 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering" 1516 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x) 1517 { 1518 MatScalar *v; 1519 PetscScalar x0,x1; 1520 PetscInt nz,*vj,k,k2; 1521 1522 PetscFunctionBegin; 1523 for (k=mbs-1; k>=0; k--) { 1524 v = aa + 4*ai[k]; 1525 vj = aj + ai[k]; 1526 k2 = k*2; 1527 x0 = x[k2]; x1 = x[k2+1]; /* xk */ 1528 nz = ai[k+1] - ai[k]; 1529 PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1530 PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1531 while (nz--) { 1532 /* xk += U(k,:)*x(:) */ 1533 x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1]; 1534 x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1]; 1535 vj++; v += 4; 1536 } 1537 x[k2] = x0; 1538 x[k2+1] = x1; 1539 } 1540 PetscFunctionReturn(0); 1541 } 1542 1543 #undef __FUNCT__ 1544 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace" 1545 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1546 { 1547 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1548 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1549 MatScalar *aa=a->a; 1550 PetscScalar *x,*b; 1551 PetscErrorCode ierr; 1552 1553 PetscFunctionBegin; 1554 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1555 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1556 1557 /* solve U^T * D * y = b by forward substitution */ 1558 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1559 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1560 1561 /* solve U*x = y by back substitution */ 1562 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1563 1564 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1565 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1566 ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr); 1567 PetscFunctionReturn(0); 1568 } 1569 1570 #undef __FUNCT__ 1571 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace" 1572 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1573 { 1574 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1575 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2; 1576 MatScalar *aa=a->a; 1577 PetscScalar *x,*b; 1578 PetscErrorCode ierr; 1579 1580 PetscFunctionBegin; 1581 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1582 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1583 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1584 ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1585 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1586 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1587 ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr); 1588 PetscFunctionReturn(0); 1589 } 1590 1591 #undef __FUNCT__ 1592 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace" 1593 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 1594 { 1595 Mat_SeqSBAIJ *a=(Mat_SeqSBAIJ*)A->data; 1596 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2; 1597 MatScalar *aa=a->a; 1598 PetscScalar *x,*b; 1599 PetscErrorCode ierr; 1600 1601 PetscFunctionBegin; 1602 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1603 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1604 ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1605 ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr); 1606 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1607 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1608 ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr); 1609 PetscFunctionReturn(0); 1610 } 1611 1612 #undef __FUNCT__ 1613 #define __FUNCT__ "MatSolve_SeqSBAIJ_1" 1614 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1615 { 1616 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1617 IS isrow=a->row; 1618 PetscErrorCode ierr; 1619 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1620 const MatScalar *aa=a->a,*v; 1621 const PetscScalar *b; 1622 PetscScalar *x,xk,*t; 1623 PetscInt nz,k,j; 1624 1625 PetscFunctionBegin; 1626 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1627 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1628 t = a->solve_work; 1629 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1630 1631 /* solve U^T*D*y = perm(b) by forward substitution */ 1632 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1633 for (k=0; k<mbs; k++) { 1634 v = aa + ai[k]; 1635 vj = aj + ai[k]; 1636 xk = t[k]; 1637 nz = ai[k+1] - ai[k] - 1; 1638 for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk; 1639 t[k] = xk*v[nz]; /* v[nz] = 1/D(k) */ 1640 } 1641 1642 /* solve U*perm(x) = y by back substitution */ 1643 for (k=mbs-1; k>=0; k--) { 1644 v = aa + adiag[k] - 1; 1645 vj = aj + adiag[k] - 1; 1646 nz = ai[k+1] - ai[k] - 1; 1647 for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]]; 1648 x[rp[k]] = t[k]; 1649 } 1650 1651 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1652 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1653 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1654 ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr); 1655 PetscFunctionReturn(0); 1656 } 1657 1658 #undef __FUNCT__ 1659 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_inplace" 1660 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1661 { 1662 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1663 IS isrow=a->row; 1664 PetscErrorCode ierr; 1665 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1666 const MatScalar *aa=a->a,*v; 1667 PetscScalar *x,*b,xk,*t; 1668 PetscInt nz,k; 1669 1670 PetscFunctionBegin; 1671 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1672 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1673 t = a->solve_work; 1674 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1675 1676 /* solve U^T*D*y = perm(b) by forward substitution */ 1677 for (k=0; k<mbs; k++) t[k] = b[rp[k]]; 1678 for (k=0; k<mbs; k++) { 1679 v = aa + ai[k] + 1; 1680 vj = aj + ai[k] + 1; 1681 xk = t[k]; 1682 nz = ai[k+1] - ai[k] - 1; 1683 while (nz--) t[*vj++] += (*v++) * xk; 1684 t[k] = xk*aa[ai[k]]; /* aa[k] = 1/D(k) */ 1685 } 1686 1687 /* solve U*perm(x) = y by back substitution */ 1688 for (k=mbs-1; k>=0; k--) { 1689 v = aa + ai[k] + 1; 1690 vj = aj + ai[k] + 1; 1691 nz = ai[k+1] - ai[k] - 1; 1692 while (nz--) t[k] += (*v++) * t[*vj++]; 1693 x[rp[k]] = t[k]; 1694 } 1695 1696 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1697 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1698 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1699 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 1700 PetscFunctionReturn(0); 1701 } 1702 1703 #undef __FUNCT__ 1704 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1" 1705 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1706 { 1707 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1708 IS isrow=a->row; 1709 PetscErrorCode ierr; 1710 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1711 const MatScalar *aa=a->a,*v; 1712 PetscReal diagk; 1713 PetscScalar *x,*b,xk; 1714 PetscInt nz,k; 1715 1716 PetscFunctionBegin; 1717 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1718 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1719 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1720 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1721 1722 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1723 for (k=0; k<mbs; k++) { 1724 v = aa + ai[k]; 1725 vj = aj + ai[k]; 1726 xk = x[k]; 1727 nz = ai[k+1] - ai[k] - 1; 1728 while (nz--) x[*vj++] += (*v++) * xk; 1729 1730 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1731 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1732 x[k] = xk*PetscSqrtReal(diagk); 1733 } 1734 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1735 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1736 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1737 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1738 PetscFunctionReturn(0); 1739 } 1740 1741 #undef __FUNCT__ 1742 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_inplace" 1743 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1744 { 1745 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1746 IS isrow=a->row; 1747 PetscErrorCode ierr; 1748 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1749 const MatScalar *aa=a->a,*v; 1750 PetscReal diagk; 1751 PetscScalar *x,*b,xk; 1752 PetscInt nz,k; 1753 1754 PetscFunctionBegin; 1755 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1756 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1757 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1758 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1759 1760 for (k=0; k<mbs; k++) x[k] = b[rp[k]]; 1761 for (k=0; k<mbs; k++) { 1762 v = aa + ai[k] + 1; 1763 vj = aj + ai[k] + 1; 1764 xk = x[k]; 1765 nz = ai[k+1] - ai[k] - 1; 1766 while (nz--) x[*vj++] += (*v++) * xk; 1767 1768 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1769 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1770 x[k] = xk*PetscSqrtReal(diagk); 1771 } 1772 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1773 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1774 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1775 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1776 PetscFunctionReturn(0); 1777 } 1778 1779 #undef __FUNCT__ 1780 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1" 1781 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 1782 { 1783 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1784 IS isrow=a->row; 1785 PetscErrorCode ierr; 1786 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag; 1787 const MatScalar *aa=a->a,*v; 1788 PetscReal diagk; 1789 PetscScalar *x,*b,*t; 1790 PetscInt nz,k; 1791 1792 PetscFunctionBegin; 1793 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1794 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1795 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1796 t = a->solve_work; 1797 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1798 1799 for (k=mbs-1; k>=0; k--) { 1800 v = aa + ai[k]; 1801 vj = aj + ai[k]; 1802 diagk = PetscRealPart(aa[adiag[k]]); 1803 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1804 t[k] = b[k] * PetscSqrtReal(diagk); 1805 nz = ai[k+1] - ai[k] - 1; 1806 while (nz--) t[k] += (*v++) * t[*vj++]; 1807 x[rp[k]] = t[k]; 1808 } 1809 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1810 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1811 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1812 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1813 PetscFunctionReturn(0); 1814 } 1815 1816 #undef __FUNCT__ 1817 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_inplace" 1818 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 1819 { 1820 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1821 IS isrow=a->row; 1822 PetscErrorCode ierr; 1823 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj; 1824 const MatScalar *aa=a->a,*v; 1825 PetscReal diagk; 1826 PetscScalar *x,*b,*t; 1827 PetscInt nz,k; 1828 1829 PetscFunctionBegin; 1830 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1831 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1832 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1833 t = a->solve_work; 1834 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1835 1836 for (k=mbs-1; k>=0; k--) { 1837 v = aa + ai[k] + 1; 1838 vj = aj + ai[k] + 1; 1839 diagk = PetscRealPart(aa[ai[k]]); 1840 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 1841 t[k] = b[k] * PetscSqrtReal(diagk); 1842 nz = ai[k+1] - ai[k] - 1; 1843 while (nz--) t[k] += (*v++) * t[*vj++]; 1844 x[rp[k]] = t[k]; 1845 } 1846 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1847 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1848 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1849 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 1850 PetscFunctionReturn(0); 1851 } 1852 1853 #undef __FUNCT__ 1854 #define __FUNCT__ "MatSolves_SeqSBAIJ_1" 1855 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx) 1856 { 1857 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1858 PetscErrorCode ierr; 1859 1860 PetscFunctionBegin; 1861 if (A->rmap->bs == 1) { 1862 ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr); 1863 } else { 1864 IS isrow=a->row; 1865 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1866 const MatScalar *aa=a->a,*v; 1867 PetscScalar *x,*b,*t; 1868 PetscInt nz,k,n,i,j; 1869 if (bb->n > a->solves_work_n) { 1870 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1871 ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr); 1872 a->solves_work_n = bb->n; 1873 } 1874 n = bb->n; 1875 ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr); 1876 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1877 t = a->solves_work; 1878 1879 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1880 1881 /* solve U^T*D*y = perm(b) by forward substitution */ 1882 for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */ 1883 for (k=0; k<mbs; k++) { 1884 v = aa + ai[k]; 1885 vj = aj + ai[k]; 1886 nz = ai[k+1] - ai[k] - 1; 1887 for (j=0; j<nz; j++) { 1888 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1889 v++;vj++; 1890 } 1891 for (i=0; i<n; i++) t[n*k+i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1892 } 1893 1894 /* solve U*perm(x) = y by back substitution */ 1895 for (k=mbs-1; k>=0; k--) { 1896 v = aa + ai[k] - 1; 1897 vj = aj + ai[k] - 1; 1898 nz = ai[k+1] - ai[k] - 1; 1899 for (j=0; j<nz; j++) { 1900 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1901 v++;vj++; 1902 } 1903 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1904 } 1905 1906 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1907 ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr); 1908 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1909 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1910 } 1911 PetscFunctionReturn(0); 1912 } 1913 1914 #undef __FUNCT__ 1915 #define __FUNCT__ "MatSolves_SeqSBAIJ_1_inplace" 1916 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx) 1917 { 1918 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1919 PetscErrorCode ierr; 1920 1921 PetscFunctionBegin; 1922 if (A->rmap->bs == 1) { 1923 ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr); 1924 } else { 1925 IS isrow=a->row; 1926 const PetscInt *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp; 1927 const MatScalar *aa=a->a,*v; 1928 PetscScalar *x,*b,*t; 1929 PetscInt nz,k,n,i; 1930 if (bb->n > a->solves_work_n) { 1931 ierr = PetscFree(a->solves_work);CHKERRQ(ierr); 1932 ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr); 1933 a->solves_work_n = bb->n; 1934 } 1935 n = bb->n; 1936 ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr); 1937 ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr); 1938 t = a->solves_work; 1939 1940 ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr); 1941 1942 /* solve U^T*D*y = perm(b) by forward substitution */ 1943 for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */ 1944 for (k=0; k<mbs; k++) { 1945 v = aa + ai[k]; 1946 vj = aj + ai[k]; 1947 nz = ai[k+1] - ai[k]; 1948 while (nz--) { 1949 for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i]; 1950 v++;vj++; 1951 } 1952 for (i=0; i<n; i++) t[n*k+i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 1953 } 1954 1955 /* solve U*perm(x) = y by back substitution */ 1956 for (k=mbs-1; k>=0; k--) { 1957 v = aa + ai[k]; 1958 vj = aj + ai[k]; 1959 nz = ai[k+1] - ai[k]; 1960 while (nz--) { 1961 for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i]; 1962 v++;vj++; 1963 } 1964 for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i]; 1965 } 1966 1967 ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr); 1968 ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr); 1969 ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr); 1970 ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr); 1971 } 1972 PetscFunctionReturn(0); 1973 } 1974 1975 #undef __FUNCT__ 1976 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering" 1977 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1978 { 1979 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1980 PetscErrorCode ierr; 1981 const PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag; 1982 const MatScalar *aa=a->a,*v; 1983 const PetscScalar *b; 1984 PetscScalar *x,xi; 1985 PetscInt nz,i,j; 1986 1987 PetscFunctionBegin; 1988 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1989 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1990 1991 /* solve U^T*D*y = b by forward substitution */ 1992 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 1993 for (i=0; i<mbs; i++) { 1994 v = aa + ai[i]; 1995 vj = aj + ai[i]; 1996 xi = x[i]; 1997 nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */ 1998 for (j=0; j<nz; j++) { 1999 x[vj[j]] += v[j]* xi; 2000 } 2001 x[i] = xi*v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2002 } 2003 2004 /* solve U*x = y by backward substitution */ 2005 for (i=mbs-2; i>=0; i--) { 2006 xi = x[i]; 2007 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2008 vj = aj + adiag[i] - 1; 2009 nz = ai[i+1] - ai[i] - 1; 2010 for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]]; 2011 x[i] = xi; 2012 } 2013 2014 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2015 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2016 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2017 PetscFunctionReturn(0); 2018 } 2019 2020 #undef __FUNCT__ 2021 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace" 2022 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2023 { 2024 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2025 PetscErrorCode ierr; 2026 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 2027 MatScalar *aa=a->a,*v; 2028 PetscScalar *x,*b,xk; 2029 PetscInt nz,*vj,k; 2030 2031 PetscFunctionBegin; 2032 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2033 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2034 2035 /* solve U^T*D*y = b by forward substitution */ 2036 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2037 for (k=0; k<mbs; k++) { 2038 v = aa + ai[k] + 1; 2039 vj = aj + ai[k] + 1; 2040 xk = x[k]; 2041 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2042 while (nz--) x[*vj++] += (*v++) * xk; 2043 x[k] = xk*aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2044 } 2045 2046 /* solve U*x = y by back substitution */ 2047 for (k=mbs-2; k>=0; k--) { 2048 v = aa + ai[k] + 1; 2049 vj = aj + ai[k] + 1; 2050 xk = x[k]; 2051 nz = ai[k+1] - ai[k] - 1; 2052 while (nz--) { 2053 xk += (*v++) * x[*vj++]; 2054 } 2055 x[k] = xk; 2056 } 2057 2058 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2059 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2060 ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr); 2061 PetscFunctionReturn(0); 2062 } 2063 2064 #undef __FUNCT__ 2065 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering" 2066 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2067 { 2068 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2069 PetscErrorCode ierr; 2070 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag; 2071 const MatScalar *aa=a->a,*v; 2072 PetscReal diagk; 2073 PetscScalar *x,*b; 2074 PetscInt nz,*vj,k; 2075 2076 PetscFunctionBegin; 2077 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2078 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2079 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2080 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2081 for (k=0; k<mbs; k++) { 2082 v = aa + ai[k]; 2083 vj = aj + ai[k]; 2084 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2085 while (nz--) x[*vj++] += (*v++) * x[k]; 2086 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2087 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]])); 2088 x[k] *= PetscSqrtReal(diagk); 2089 } 2090 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2091 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2092 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2093 PetscFunctionReturn(0); 2094 } 2095 2096 #undef __FUNCT__ 2097 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace" 2098 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2099 { 2100 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2101 PetscErrorCode ierr; 2102 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 2103 MatScalar *aa=a->a,*v; 2104 PetscReal diagk; 2105 PetscScalar *x,*b; 2106 PetscInt nz,*vj,k; 2107 2108 PetscFunctionBegin; 2109 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2110 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2111 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2112 ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr); 2113 for (k=0; k<mbs; k++) { 2114 v = aa + ai[k] + 1; 2115 vj = aj + ai[k] + 1; 2116 nz = ai[k+1] - ai[k] - 1; /* exclude diag[k] */ 2117 while (nz--) x[*vj++] += (*v++) * x[k]; 2118 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2119 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]])); 2120 x[k] *= PetscSqrtReal(diagk); 2121 } 2122 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2123 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2124 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2125 PetscFunctionReturn(0); 2126 } 2127 2128 #undef __FUNCT__ 2129 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering" 2130 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2131 { 2132 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2133 PetscErrorCode ierr; 2134 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag; 2135 MatScalar *aa=a->a,*v; 2136 PetscReal diagk; 2137 PetscScalar *x,*b; 2138 PetscInt nz,*vj,k; 2139 2140 PetscFunctionBegin; 2141 /* solve D^(1/2)*U*x = b by back substitution */ 2142 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2143 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2144 2145 for (k=mbs-1; k>=0; k--) { 2146 v = aa + ai[k]; 2147 vj = aj + ai[k]; 2148 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2149 if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2150 x[k] = PetscSqrtReal(diagk)*b[k]; 2151 nz = ai[k+1] - ai[k] - 1; 2152 while (nz--) x[k] += (*v++) * x[*vj++]; 2153 } 2154 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2155 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2156 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2157 PetscFunctionReturn(0); 2158 } 2159 2160 #undef __FUNCT__ 2161 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace" 2162 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2163 { 2164 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2165 PetscErrorCode ierr; 2166 PetscInt mbs=a->mbs,*ai=a->i,*aj=a->j; 2167 MatScalar *aa=a->a,*v; 2168 PetscReal diagk; 2169 PetscScalar *x,*b; 2170 PetscInt nz,*vj,k; 2171 2172 PetscFunctionBegin; 2173 /* solve D^(1/2)*U*x = b by back substitution */ 2174 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2175 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2176 2177 for (k=mbs-1; k>=0; k--) { 2178 v = aa + ai[k] + 1; 2179 vj = aj + ai[k] + 1; 2180 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2181 if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative"); 2182 x[k] = PetscSqrtReal(diagk)*b[k]; 2183 nz = ai[k+1] - ai[k] - 1; 2184 while (nz--) x[k] += (*v++) * x[*vj++]; 2185 } 2186 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2187 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2188 ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr); 2189 PetscFunctionReturn(0); 2190 } 2191 2192 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2193 #undef __FUNCT__ 2194 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_MSR" 2195 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info) 2196 { 2197 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2198 PetscErrorCode ierr; 2199 const PetscInt *rip,mbs = a->mbs,*ai,*aj; 2200 PetscInt *jutmp,bs = A->rmap->bs,bs2=a->bs2,i; 2201 PetscInt m,reallocs = 0,*levtmp; 2202 PetscInt *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2203 PetscInt incrlev,*lev,shift,prow,nz; 2204 PetscReal f = info->fill,levels = info->levels; 2205 PetscBool perm_identity; 2206 2207 PetscFunctionBegin; 2208 /* check whether perm is the identity mapping */ 2209 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2210 2211 if (perm_identity) { 2212 a->permute = PETSC_FALSE; 2213 ai = a->i; aj = a->j; 2214 } else { /* non-trivial permutation */ 2215 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2216 a->permute = PETSC_TRUE; 2217 ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr); 2218 ai = a->inew; aj = a->jnew; 2219 } 2220 2221 /* initialization */ 2222 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2223 umax = (PetscInt)(f*ai[mbs] + 1); 2224 ierr = PetscMalloc(umax*sizeof(PetscInt),&lev);CHKERRQ(ierr); 2225 umax += mbs + 1; 2226 shift = mbs + 1; 2227 ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr); 2228 ierr = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr); 2229 iu[0] = mbs + 1; 2230 juidx = mbs + 1; 2231 /* prowl: linked list for pivot row */ 2232 ierr = PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);CHKERRQ(ierr); 2233 /* q: linked list for col index */ 2234 2235 for (i=0; i<mbs; i++) { 2236 prowl[i] = mbs; 2237 q[i] = 0; 2238 } 2239 2240 /* for each row k */ 2241 for (k=0; k<mbs; k++) { 2242 nzk = 0; 2243 q[k] = mbs; 2244 /* copy current row into linked list */ 2245 nz = ai[rip[k]+1] - ai[rip[k]]; 2246 j = ai[rip[k]]; 2247 while (nz--) { 2248 vj = rip[aj[j++]]; 2249 if (vj > k) { 2250 qm = k; 2251 do { 2252 m = qm; qm = q[m]; 2253 } while (qm < vj); 2254 if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n"); 2255 nzk++; 2256 q[m] = vj; 2257 q[vj] = qm; 2258 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2259 } 2260 } 2261 2262 /* modify nonzero structure of k-th row by computing fill-in 2263 for each row prow to be merged in */ 2264 prow = k; 2265 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2266 2267 while (prow < k) { 2268 /* merge row prow into k-th row */ 2269 jmin = iu[prow] + 1; 2270 jmax = iu[prow+1]; 2271 qm = k; 2272 for (j=jmin; j<jmax; j++) { 2273 incrlev = lev[j-shift] + 1; 2274 if (incrlev > levels) continue; 2275 vj = ju[j]; 2276 do { 2277 m = qm; qm = q[m]; 2278 } while (qm < vj); 2279 if (qm != vj) { /* a fill */ 2280 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2281 levtmp[vj] = incrlev; 2282 } else { 2283 if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2284 } 2285 } 2286 prow = prowl[prow]; /* next pivot row */ 2287 } 2288 2289 /* add k to row list for first nonzero element in k-th row */ 2290 if (nzk > 1) { 2291 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2292 prowl[k] = prowl[i]; prowl[i] = k; 2293 } 2294 iu[k+1] = iu[k] + nzk; 2295 2296 /* allocate more space to ju and lev if needed */ 2297 if (iu[k+1] > umax) { 2298 /* estimate how much additional space we will need */ 2299 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2300 /* just double the memory each time */ 2301 maxadd = umax; 2302 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2303 umax += maxadd; 2304 2305 /* allocate a longer ju */ 2306 ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr); 2307 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr); 2308 ierr = PetscFree(ju);CHKERRQ(ierr); 2309 ju = jutmp; 2310 2311 ierr = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr); 2312 ierr = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr); 2313 ierr = PetscFree(lev);CHKERRQ(ierr); 2314 lev = jutmp; 2315 reallocs += 2; /* count how many times we realloc */ 2316 } 2317 2318 /* save nonzero structure of k-th row in ju */ 2319 i=k; 2320 while (nzk --) { 2321 i = q[i]; 2322 ju[juidx] = i; 2323 lev[juidx-shift] = levtmp[i]; 2324 juidx++; 2325 } 2326 } 2327 2328 #if defined(PETSC_USE_INFO) 2329 if (ai[mbs] != 0) { 2330 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2331 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 2332 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 2333 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 2334 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 2335 } else { 2336 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2337 } 2338 #endif 2339 2340 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2341 ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr); 2342 ierr = PetscFree(lev);CHKERRQ(ierr); 2343 2344 /* put together the new matrix */ 2345 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);CHKERRQ(ierr); 2346 2347 /* ierr = PetscLogObjectParent(B,iperm);CHKERRQ(ierr); */ 2348 b = (Mat_SeqSBAIJ*)(B)->data; 2349 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2350 b->singlemalloc = PETSC_FALSE; 2351 b->free_a = PETSC_TRUE; 2352 b->free_ij = PETSC_TRUE; 2353 /* the next line frees the default space generated by the Create() */ 2354 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 2355 ierr = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr); 2356 b->j = ju; 2357 b->i = iu; 2358 b->diag = 0; 2359 b->ilen = 0; 2360 b->imax = 0; 2361 2362 ierr = ISDestroy(&b->row);CHKERRQ(ierr); 2363 ierr = ISDestroy(&b->icol);CHKERRQ(ierr); 2364 b->row = perm; 2365 b->icol = perm; 2366 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2367 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2368 ierr = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2369 /* In b structure: Free imax, ilen, old a, old j. 2370 Allocate idnew, solve_work, new a, new j */ 2371 ierr = PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2372 b->maxnz = b->nz = iu[mbs]; 2373 2374 (B)->info.factor_mallocs = reallocs; 2375 (B)->info.fill_ratio_given = f; 2376 if (ai[mbs] != 0) { 2377 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2378 } else { 2379 (B)->info.fill_ratio_needed = 0.0; 2380 } 2381 ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr); 2382 PetscFunctionReturn(0); 2383 } 2384 2385 /* 2386 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2387 */ 2388 #include <petscbt.h> 2389 #include <../src/mat/utils/freespace.h> 2390 #undef __FUNCT__ 2391 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ" 2392 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2393 { 2394 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2395 PetscErrorCode ierr; 2396 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2397 PetscInt bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j; 2398 const PetscInt *rip; 2399 PetscInt reallocs=0,i,*ui,*udiag,*cols; 2400 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2401 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr; 2402 PetscReal fill=info->fill,levels=info->levels; 2403 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 2404 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 2405 PetscBT lnkbt; 2406 2407 PetscFunctionBegin; 2408 if (bs > 1) { 2409 ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr); 2410 PetscFunctionReturn(0); 2411 } 2412 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 2413 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2414 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2415 2416 /* check whether perm is the identity mapping */ 2417 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2418 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2419 a->permute = PETSC_FALSE; 2420 2421 ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr); 2422 ierr = PetscMalloc((am+1)*sizeof(PetscInt),&udiag);CHKERRQ(ierr); 2423 ui[0] = 0; 2424 2425 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2426 if (!levels) { 2427 /* reuse the column pointers and row offsets for memory savings */ 2428 for (i=0; i<am; i++) { 2429 ncols = ai[i+1] - ai[i]; 2430 ui[i+1] = ui[i] + ncols; 2431 udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */ 2432 } 2433 ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr); 2434 cols = uj; 2435 for (i=0; i<am; i++) { 2436 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2437 ncols = ai[i+1] - ai[i] -1; 2438 for (j=0; j<ncols; j++) *cols++ = aj[j]; 2439 *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */ 2440 } 2441 } else { /* case: levels>0 */ 2442 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2443 2444 /* initialization */ 2445 /* jl: linked list for storing indices of the pivot rows 2446 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2447 ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr); 2448 for (i=0; i<am; i++) { 2449 jl[i] = am; il[i] = 0; 2450 } 2451 2452 /* create and initialize a linked list for storing column indices of the active row k */ 2453 nlnk = am + 1; 2454 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2455 2456 /* initial FreeSpace size is fill*(ai[am]+1) */ 2457 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr); 2458 current_space = free_space; 2459 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr); 2460 current_space_lvl = free_space_lvl; 2461 2462 for (k=0; k<am; k++) { /* for each active row k */ 2463 /* initialize lnk by the column indices of row k */ 2464 nzk = 0; 2465 ncols = ai[k+1] - ai[k]; 2466 if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k); 2467 cols = aj+ai[k]; 2468 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2469 nzk += nlnk; 2470 2471 /* update lnk by computing fill-in for each pivot row to be merged in */ 2472 prow = jl[k]; /* 1st pivot row */ 2473 2474 while (prow < k) { 2475 nextprow = jl[prow]; 2476 2477 /* merge prow into k-th row */ 2478 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2479 jmax = ui[prow+1]; 2480 ncols = jmax-jmin; 2481 i = jmin - ui[prow]; 2482 2483 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2484 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2485 j = *(uj - 1); 2486 ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2487 nzk += nlnk; 2488 2489 /* update il and jl for prow */ 2490 if (jmin < jmax) { 2491 il[prow] = jmin; 2492 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2493 } 2494 prow = nextprow; 2495 } 2496 2497 /* if free space is not available, make more free space */ 2498 if (current_space->local_remaining<nzk) { 2499 i = am - k + 1; /* num of unfactored rows */ 2500 i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2501 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2502 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2503 reallocs++; 2504 } 2505 2506 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2507 if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k); 2508 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2509 2510 /* add the k-th row into il and jl */ 2511 if (nzk > 1) { 2512 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2513 jl[k] = jl[i]; jl[i] = k; 2514 il[k] = ui[k] + 1; 2515 } 2516 uj_ptr[k] = current_space->array; 2517 uj_lvl_ptr[k] = current_space_lvl->array; 2518 2519 current_space->array += nzk; 2520 current_space->local_used += nzk; 2521 current_space->local_remaining -= nzk; 2522 current_space_lvl->array += nzk; 2523 current_space_lvl->local_used += nzk; 2524 current_space_lvl->local_remaining -= nzk; 2525 2526 ui[k+1] = ui[k] + nzk; 2527 } 2528 2529 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2530 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2531 2532 /* destroy list of free space and other temporary array(s) */ 2533 ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr); 2534 ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor */ 2535 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2536 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2537 2538 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2539 2540 /* put together the new matrix in MATSEQSBAIJ format */ 2541 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 2542 2543 b = (Mat_SeqSBAIJ*)(fact)->data; 2544 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2545 b->singlemalloc = PETSC_FALSE; 2546 b->free_a = PETSC_TRUE; 2547 b->free_ij = free_ij; 2548 ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 2549 b->j = uj; 2550 b->i = ui; 2551 b->diag = udiag; 2552 b->free_diag = PETSC_TRUE; 2553 b->ilen = 0; 2554 b->imax = 0; 2555 b->row = perm; 2556 b->col = perm; 2557 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2558 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2559 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2560 ierr = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2561 ierr = PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 2562 b->maxnz = b->nz = ui[am]; 2563 2564 fact->info.factor_mallocs = reallocs; 2565 fact->info.fill_ratio_given = fill; 2566 if (ai[am] != 0) { 2567 fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am]; 2568 } else { 2569 fact->info.fill_ratio_needed = 0.0; 2570 } 2571 #if defined(PETSC_USE_INFO) 2572 if (ai[am] != 0) { 2573 PetscReal af = fact->info.fill_ratio_needed; 2574 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr); 2575 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 2576 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr); 2577 } else { 2578 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2579 } 2580 #endif 2581 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2582 PetscFunctionReturn(0); 2583 } 2584 2585 #undef __FUNCT__ 2586 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_inplace" 2587 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 2588 { 2589 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 2590 Mat_SeqSBAIJ *b; 2591 PetscErrorCode ierr; 2592 PetscBool perm_identity,free_ij = PETSC_TRUE,missing; 2593 PetscInt bs=A->rmap->bs,am=a->mbs,d; 2594 const PetscInt *cols,*rip,*ai=a->i,*aj=a->j; 2595 PetscInt reallocs=0,i,*ui; 2596 PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 2597 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 2598 PetscReal fill=info->fill,levels=info->levels,ratio_needed; 2599 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 2600 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 2601 PetscBT lnkbt; 2602 2603 PetscFunctionBegin; 2604 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 2605 if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 2606 2607 /* 2608 This code originally uses Modified Sparse Row (MSR) storage 2609 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2610 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2611 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2612 thus the original code in MSR format is still used for these cases. 2613 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2614 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2615 */ 2616 if (bs > 1) { 2617 ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr); 2618 PetscFunctionReturn(0); 2619 } 2620 2621 /* check whether perm is the identity mapping */ 2622 ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 2623 if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format"); 2624 a->permute = PETSC_FALSE; 2625 2626 /* special case that simply copies fill pattern */ 2627 if (!levels) { 2628 /* reuse the column pointers and row offsets for memory savings */ 2629 ui = a->i; 2630 uj = a->j; 2631 free_ij = PETSC_FALSE; 2632 ratio_needed = 1.0; 2633 } else { /* case: levels>0 */ 2634 ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 2635 2636 /* initialization */ 2637 ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr); 2638 ui[0] = 0; 2639 2640 /* jl: linked list for storing indices of the pivot rows 2641 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2642 ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr); 2643 for (i=0; i<am; i++) { 2644 jl[i] = am; il[i] = 0; 2645 } 2646 2647 /* create and initialize a linked list for storing column indices of the active row k */ 2648 nlnk = am + 1; 2649 ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2650 2651 /* initial FreeSpace size is fill*(ai[am]+1) */ 2652 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr); 2653 current_space = free_space; 2654 ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr); 2655 current_space_lvl = free_space_lvl; 2656 2657 for (k=0; k<am; k++) { /* for each active row k */ 2658 /* initialize lnk by the column indices of row rip[k] */ 2659 nzk = 0; 2660 ncols = ai[rip[k]+1] - ai[rip[k]]; 2661 cols = aj+ai[rip[k]]; 2662 ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 2663 nzk += nlnk; 2664 2665 /* update lnk by computing fill-in for each pivot row to be merged in */ 2666 prow = jl[k]; /* 1st pivot row */ 2667 2668 while (prow < k) { 2669 nextprow = jl[prow]; 2670 2671 /* merge prow into k-th row */ 2672 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2673 jmax = ui[prow+1]; 2674 ncols = jmax-jmin; 2675 i = jmin - ui[prow]; 2676 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2677 j = *(uj_lvl_ptr[prow] + i - 1); 2678 cols_lvl = uj_lvl_ptr[prow]+i; 2679 ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr); 2680 nzk += nlnk; 2681 2682 /* update il and jl for prow */ 2683 if (jmin < jmax) { 2684 il[prow] = jmin; 2685 j = *cols; jl[prow] = jl[j]; jl[j] = prow; 2686 } 2687 prow = nextprow; 2688 } 2689 2690 /* if free space is not available, make more free space */ 2691 if (current_space->local_remaining<nzk) { 2692 i = am - k + 1; /* num of unfactored rows */ 2693 i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2694 ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 2695 ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 2696 reallocs++; 2697 } 2698 2699 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2700 ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 2701 2702 /* add the k-th row into il and jl */ 2703 if (nzk-1 > 0) { 2704 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2705 jl[k] = jl[i]; jl[i] = k; 2706 il[k] = ui[k] + 1; 2707 } 2708 uj_ptr[k] = current_space->array; 2709 uj_lvl_ptr[k] = current_space_lvl->array; 2710 2711 current_space->array += nzk; 2712 current_space->local_used += nzk; 2713 current_space->local_remaining -= nzk; 2714 current_space_lvl->array += nzk; 2715 current_space_lvl->local_used += nzk; 2716 current_space_lvl->local_remaining -= nzk; 2717 2718 ui[k+1] = ui[k] + nzk; 2719 } 2720 2721 ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 2722 ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr); 2723 2724 /* destroy list of free space and other temporary array(s) */ 2725 ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr); 2726 ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 2727 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 2728 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 2729 if (ai[am] != 0) { 2730 ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 2731 } else { 2732 ratio_needed = 0.0; 2733 } 2734 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2735 2736 /* put together the new matrix in MATSEQSBAIJ format */ 2737 ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 2738 2739 b = (Mat_SeqSBAIJ*)(fact)->data; 2740 ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr); 2741 b->singlemalloc = PETSC_FALSE; 2742 b->free_a = PETSC_TRUE; 2743 b->free_ij = free_ij; 2744 ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 2745 b->j = uj; 2746 b->i = ui; 2747 b->diag = 0; 2748 b->ilen = 0; 2749 b->imax = 0; 2750 b->row = perm; 2751 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2752 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2753 b->icol = perm; 2754 ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 2755 ierr = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2756 b->maxnz = b->nz = ui[am]; 2757 2758 fact->info.factor_mallocs = reallocs; 2759 fact->info.fill_ratio_given = fill; 2760 fact->info.fill_ratio_needed = ratio_needed; 2761 #if defined(PETSC_USE_INFO) 2762 if (ai[am] != 0) { 2763 PetscReal af = fact->info.fill_ratio_needed; 2764 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr); 2765 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 2766 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr); 2767 } else { 2768 ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 2769 } 2770 #endif 2771 if (perm_identity) { 2772 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2773 } else { 2774 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2775 } 2776 PetscFunctionReturn(0); 2777 } 2778 2779