1 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include <../src/mat/impls/sbaij/seq/sbaij.h> 7 #include <../src/mat/impls/baij/seq/baij.h> 8 #include <petsc/private/kernels/blockinvert.h> 9 10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 11 { 12 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 13 IS isrow = a->row; 14 PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 15 const PetscInt *r; 16 PetscInt nz, *vj, k, idx, k1; 17 PetscInt bs = A->rmap->bs, bs2 = a->bs2; 18 const MatScalar *aa = a->a, *v, *diag; 19 PetscScalar *x, *xk, *xj, *xk_tmp, *t; 20 const PetscScalar *b; 21 22 PetscFunctionBegin; 23 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 24 PetscCall(VecGetArrayRead(bb, &b)); 25 PetscCall(VecGetArray(xx, &x)); 26 t = a->solve_work; 27 PetscCall(ISGetIndices(isrow, &r)); 28 PetscCall(PetscMalloc1(bs, &xk_tmp)); 29 30 /* solve U^T * D * y = b by forward substitution */ 31 xk = t; 32 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 33 idx = bs * r[k]; 34 for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1]; 35 } 36 for (k = 0; k < mbs; k++) { 37 v = aa + bs2 * ai[k]; 38 xk = t + k * bs; /* Dk*xk = k-th block of x */ 39 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */ 40 nz = ai[k + 1] - ai[k]; 41 vj = aj + ai[k]; 42 xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */ 43 while (nz--) { 44 /* x(:) += U(k,:)^T*(Dk*xk) */ 45 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */ 46 vj++; 47 xj = t + (*vj) * bs; 48 v += bs2; 49 } 50 /* xk = inv(Dk)*(Dk*xk) */ 51 diag = aa + k * bs2; /* ptr to inv(Dk) */ 52 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */ 53 } 54 55 /* solve U*x = y by back substitution */ 56 for (k = mbs - 1; k >= 0; k--) { 57 v = aa + bs2 * ai[k]; 58 xk = t + k * bs; /* xk */ 59 nz = ai[k + 1] - ai[k]; 60 vj = aj + ai[k]; 61 xj = t + (*vj) * bs; 62 while (nz--) { 63 /* xk += U(k,:)*x(:) */ 64 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */ 65 vj++; 66 v += bs2; 67 xj = t + (*vj) * bs; 68 } 69 idx = bs * r[k]; 70 for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++; 71 } 72 73 PetscCall(PetscFree(xk_tmp)); 74 PetscCall(ISRestoreIndices(isrow, &r)); 75 PetscCall(VecRestoreArrayRead(bb, &b)); 76 PetscCall(VecRestoreArray(xx, &x)); 77 PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs)); 78 PetscFunctionReturn(PETSC_SUCCESS); 79 } 80 81 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 82 { 83 PetscFunctionBegin; 84 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet"); 85 } 86 87 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 88 { 89 PetscFunctionBegin; 90 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented"); 91 } 92 93 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) 94 { 95 PetscInt nz, k; 96 const PetscInt *vj, bs2 = bs * bs; 97 const MatScalar *v, *diag; 98 PetscScalar *xk, *xj, *xk_tmp; 99 100 PetscFunctionBegin; 101 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 102 PetscCall(PetscMalloc1(bs, &xk_tmp)); 103 for (k = 0; k < mbs; k++) { 104 v = aa + bs2 * ai[k]; 105 xk = x + k * bs; /* Dk*xk = k-th block of x */ 106 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */ 107 nz = ai[k + 1] - ai[k]; 108 vj = aj + ai[k]; 109 xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */ 110 while (nz--) { 111 /* x(:) += U(k,:)^T*(Dk*xk) */ 112 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */ 113 vj++; 114 xj = x + (size_t)(*vj) * bs; 115 v += bs2; 116 } 117 /* xk = inv(Dk)*(Dk*xk) */ 118 diag = aa + k * bs2; /* ptr to inv(Dk) */ 119 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */ 120 } 121 PetscCall(PetscFree(xk_tmp)); 122 PetscFunctionReturn(PETSC_SUCCESS); 123 } 124 125 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) 126 { 127 PetscInt nz, k; 128 const PetscInt *vj, bs2 = bs * bs; 129 const MatScalar *v; 130 PetscScalar *xk, *xj; 131 132 PetscFunctionBegin; 133 for (k = mbs - 1; k >= 0; k--) { 134 v = aa + bs2 * ai[k]; 135 xk = x + k * bs; /* xk */ 136 nz = ai[k + 1] - ai[k]; 137 vj = aj + ai[k]; 138 xj = x + (size_t)(*vj) * bs; 139 while (nz--) { 140 /* xk += U(k,:)*x(:) */ 141 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */ 142 vj++; 143 v += bs2; 144 xj = x + (size_t)(*vj) * bs; 145 } 146 } 147 PetscFunctionReturn(PETSC_SUCCESS); 148 } 149 150 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 151 { 152 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 153 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 154 PetscInt bs = A->rmap->bs; 155 const MatScalar *aa = a->a; 156 PetscScalar *x; 157 const PetscScalar *b; 158 159 PetscFunctionBegin; 160 PetscCall(VecGetArrayRead(bb, &b)); 161 PetscCall(VecGetArray(xx, &x)); 162 163 /* solve U^T * D * y = b by forward substitution */ 164 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */ 165 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 166 167 /* solve U*x = y by back substitution */ 168 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 169 170 PetscCall(VecRestoreArrayRead(bb, &b)); 171 PetscCall(VecRestoreArray(xx, &x)); 172 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs)); 173 PetscFunctionReturn(PETSC_SUCCESS); 174 } 175 176 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 177 { 178 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 179 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 180 PetscInt bs = A->rmap->bs; 181 const MatScalar *aa = a->a; 182 const PetscScalar *b; 183 PetscScalar *x; 184 185 PetscFunctionBegin; 186 PetscCall(VecGetArrayRead(bb, &b)); 187 PetscCall(VecGetArray(xx, &x)); 188 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */ 189 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 190 PetscCall(VecRestoreArrayRead(bb, &b)); 191 PetscCall(VecRestoreArray(xx, &x)); 192 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs)); 193 PetscFunctionReturn(PETSC_SUCCESS); 194 } 195 196 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 197 { 198 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 199 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 200 PetscInt bs = A->rmap->bs; 201 const MatScalar *aa = a->a; 202 const PetscScalar *b; 203 PetscScalar *x; 204 205 PetscFunctionBegin; 206 PetscCall(VecGetArrayRead(bb, &b)); 207 PetscCall(VecGetArray(xx, &x)); 208 PetscCall(PetscArraycpy(x, b, bs * mbs)); 209 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 210 PetscCall(VecRestoreArrayRead(bb, &b)); 211 PetscCall(VecRestoreArray(xx, &x)); 212 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 213 PetscFunctionReturn(PETSC_SUCCESS); 214 } 215 216 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx) 217 { 218 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 219 IS isrow = a->row; 220 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj; 221 PetscInt nz, k, idx; 222 const MatScalar *aa = a->a, *v, *d; 223 PetscScalar *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp; 224 const PetscScalar *b; 225 226 PetscFunctionBegin; 227 PetscCall(VecGetArrayRead(bb, &b)); 228 PetscCall(VecGetArray(xx, &x)); 229 t = a->solve_work; 230 PetscCall(ISGetIndices(isrow, &r)); 231 232 /* solve U^T * D * y = b by forward substitution */ 233 tp = t; 234 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 235 idx = 7 * r[k]; 236 tp[0] = b[idx]; 237 tp[1] = b[idx + 1]; 238 tp[2] = b[idx + 2]; 239 tp[3] = b[idx + 3]; 240 tp[4] = b[idx + 4]; 241 tp[5] = b[idx + 5]; 242 tp[6] = b[idx + 6]; 243 tp += 7; 244 } 245 246 for (k = 0; k < mbs; k++) { 247 v = aa + 49 * ai[k]; 248 vj = aj + ai[k]; 249 tp = t + k * 7; 250 x0 = tp[0]; 251 x1 = tp[1]; 252 x2 = tp[2]; 253 x3 = tp[3]; 254 x4 = tp[4]; 255 x5 = tp[5]; 256 x6 = tp[6]; 257 nz = ai[k + 1] - ai[k]; 258 tp = t + (*vj) * 7; 259 while (nz--) { 260 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6; 261 tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6; 262 tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6; 263 tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6; 264 tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6; 265 tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6; 266 tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6; 267 vj++; 268 tp = t + (*vj) * 7; 269 v += 49; 270 } 271 272 /* xk = inv(Dk)*(Dk*xk) */ 273 d = aa + k * 49; /* ptr to inv(Dk) */ 274 tp = t + k * 7; 275 tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6; 276 tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6; 277 tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6; 278 tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6; 279 tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6; 280 tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6; 281 tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6; 282 } 283 284 /* solve U*x = y by back substitution */ 285 for (k = mbs - 1; k >= 0; k--) { 286 v = aa + 49 * ai[k]; 287 vj = aj + ai[k]; 288 tp = t + k * 7; 289 x0 = tp[0]; 290 x1 = tp[1]; 291 x2 = tp[2]; 292 x3 = tp[3]; 293 x4 = tp[4]; 294 x5 = tp[5]; 295 x6 = tp[6]; /* xk */ 296 nz = ai[k + 1] - ai[k]; 297 298 tp = t + (*vj) * 7; 299 while (nz--) { 300 /* xk += U(k,:)*x(:) */ 301 x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6]; 302 x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6]; 303 x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6]; 304 x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6]; 305 x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6]; 306 x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6]; 307 x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6]; 308 vj++; 309 tp = t + (*vj) * 7; 310 v += 49; 311 } 312 tp = t + k * 7; 313 tp[0] = x0; 314 tp[1] = x1; 315 tp[2] = x2; 316 tp[3] = x3; 317 tp[4] = x4; 318 tp[5] = x5; 319 tp[6] = x6; 320 idx = 7 * r[k]; 321 x[idx] = x0; 322 x[idx + 1] = x1; 323 x[idx + 2] = x2; 324 x[idx + 3] = x3; 325 x[idx + 4] = x4; 326 x[idx + 5] = x5; 327 x[idx + 6] = x6; 328 } 329 330 PetscCall(ISRestoreIndices(isrow, &r)); 331 PetscCall(VecRestoreArrayRead(bb, &b)); 332 PetscCall(VecRestoreArray(xx, &x)); 333 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 334 PetscFunctionReturn(PETSC_SUCCESS); 335 } 336 337 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 338 { 339 const MatScalar *v, *d; 340 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6; 341 PetscInt nz, k; 342 const PetscInt *vj; 343 344 PetscFunctionBegin; 345 for (k = 0; k < mbs; k++) { 346 v = aa + 49 * ai[k]; 347 xp = x + k * 7; 348 x0 = xp[0]; 349 x1 = xp[1]; 350 x2 = xp[2]; 351 x3 = xp[3]; 352 x4 = xp[4]; 353 x5 = xp[5]; 354 x6 = xp[6]; /* Dk*xk = k-th block of x */ 355 nz = ai[k + 1] - ai[k]; 356 vj = aj + ai[k]; 357 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 358 PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 359 while (nz--) { 360 xp = x + (*vj) * 7; 361 /* x(:) += U(k,:)^T*(Dk*xk) */ 362 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6; 363 xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6; 364 xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6; 365 xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6; 366 xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6; 367 xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6; 368 xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6; 369 vj++; 370 v += 49; 371 } 372 /* xk = inv(Dk)*(Dk*xk) */ 373 d = aa + k * 49; /* ptr to inv(Dk) */ 374 xp = x + k * 7; 375 xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6; 376 xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6; 377 xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6; 378 xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6; 379 xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6; 380 xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6; 381 xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6; 382 } 383 PetscFunctionReturn(PETSC_SUCCESS); 384 } 385 386 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 387 { 388 const MatScalar *v; 389 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6; 390 PetscInt nz, k; 391 const PetscInt *vj; 392 393 PetscFunctionBegin; 394 for (k = mbs - 1; k >= 0; k--) { 395 v = aa + 49 * ai[k]; 396 xp = x + k * 7; 397 x0 = xp[0]; 398 x1 = xp[1]; 399 x2 = xp[2]; 400 x3 = xp[3]; 401 x4 = xp[4]; 402 x5 = xp[5]; 403 x6 = xp[6]; /* xk */ 404 nz = ai[k + 1] - ai[k]; 405 vj = aj + ai[k]; 406 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 407 PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 408 while (nz--) { 409 xp = x + (*vj) * 7; 410 /* xk += U(k,:)*x(:) */ 411 x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6]; 412 x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6]; 413 x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6]; 414 x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6]; 415 x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6]; 416 x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6]; 417 x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6]; 418 vj++; 419 v += 49; 420 } 421 xp = x + k * 7; 422 xp[0] = x0; 423 xp[1] = x1; 424 xp[2] = x2; 425 xp[3] = x3; 426 xp[4] = x4; 427 xp[5] = x5; 428 xp[6] = x6; 429 } 430 PetscFunctionReturn(PETSC_SUCCESS); 431 } 432 433 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 434 { 435 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 436 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 437 const MatScalar *aa = a->a; 438 PetscScalar *x; 439 const PetscScalar *b; 440 441 PetscFunctionBegin; 442 PetscCall(VecGetArrayRead(bb, &b)); 443 PetscCall(VecGetArray(xx, &x)); 444 445 /* solve U^T * D * y = b by forward substitution */ 446 PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */ 447 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 448 449 /* solve U*x = y by back substitution */ 450 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 451 452 PetscCall(VecRestoreArrayRead(bb, &b)); 453 PetscCall(VecRestoreArray(xx, &x)); 454 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 455 PetscFunctionReturn(PETSC_SUCCESS); 456 } 457 458 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 459 { 460 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 461 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 462 const MatScalar *aa = a->a; 463 PetscScalar *x; 464 const PetscScalar *b; 465 466 PetscFunctionBegin; 467 PetscCall(VecGetArrayRead(bb, &b)); 468 PetscCall(VecGetArray(xx, &x)); 469 PetscCall(PetscArraycpy(x, b, 7 * mbs)); 470 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 471 PetscCall(VecRestoreArrayRead(bb, &b)); 472 PetscCall(VecRestoreArray(xx, &x)); 473 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 474 PetscFunctionReturn(PETSC_SUCCESS); 475 } 476 477 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 478 { 479 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 480 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 481 const MatScalar *aa = a->a; 482 PetscScalar *x; 483 const PetscScalar *b; 484 485 PetscFunctionBegin; 486 PetscCall(VecGetArrayRead(bb, &b)); 487 PetscCall(VecGetArray(xx, &x)); 488 PetscCall(PetscArraycpy(x, b, 7 * mbs)); 489 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 490 PetscCall(VecRestoreArrayRead(bb, &b)); 491 PetscCall(VecRestoreArray(xx, &x)); 492 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 493 PetscFunctionReturn(PETSC_SUCCESS); 494 } 495 496 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx) 497 { 498 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 499 IS isrow = a->row; 500 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj; 501 PetscInt nz, k, idx; 502 const MatScalar *aa = a->a, *v, *d; 503 PetscScalar *x, x0, x1, x2, x3, x4, x5, *t, *tp; 504 const PetscScalar *b; 505 506 PetscFunctionBegin; 507 PetscCall(VecGetArrayRead(bb, &b)); 508 PetscCall(VecGetArray(xx, &x)); 509 t = a->solve_work; 510 PetscCall(ISGetIndices(isrow, &r)); 511 512 /* solve U^T * D * y = b by forward substitution */ 513 tp = t; 514 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 515 idx = 6 * r[k]; 516 tp[0] = b[idx]; 517 tp[1] = b[idx + 1]; 518 tp[2] = b[idx + 2]; 519 tp[3] = b[idx + 3]; 520 tp[4] = b[idx + 4]; 521 tp[5] = b[idx + 5]; 522 tp += 6; 523 } 524 525 for (k = 0; k < mbs; k++) { 526 v = aa + 36 * ai[k]; 527 vj = aj + ai[k]; 528 tp = t + k * 6; 529 x0 = tp[0]; 530 x1 = tp[1]; 531 x2 = tp[2]; 532 x3 = tp[3]; 533 x4 = tp[4]; 534 x5 = tp[5]; 535 nz = ai[k + 1] - ai[k]; 536 tp = t + (*vj) * 6; 537 while (nz--) { 538 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5; 539 tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5; 540 tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5; 541 tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5; 542 tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5; 543 tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5; 544 vj++; 545 tp = t + (*vj) * 6; 546 v += 36; 547 } 548 549 /* xk = inv(Dk)*(Dk*xk) */ 550 d = aa + k * 36; /* ptr to inv(Dk) */ 551 tp = t + k * 6; 552 tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5; 553 tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5; 554 tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5; 555 tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5; 556 tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5; 557 tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5; 558 } 559 560 /* solve U*x = y by back substitution */ 561 for (k = mbs - 1; k >= 0; k--) { 562 v = aa + 36 * ai[k]; 563 vj = aj + ai[k]; 564 tp = t + k * 6; 565 x0 = tp[0]; 566 x1 = tp[1]; 567 x2 = tp[2]; 568 x3 = tp[3]; 569 x4 = tp[4]; 570 x5 = tp[5]; /* xk */ 571 nz = ai[k + 1] - ai[k]; 572 573 tp = t + (*vj) * 6; 574 while (nz--) { 575 /* xk += U(k,:)*x(:) */ 576 x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5]; 577 x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5]; 578 x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5]; 579 x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5]; 580 x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5]; 581 x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5]; 582 vj++; 583 tp = t + (*vj) * 6; 584 v += 36; 585 } 586 tp = t + k * 6; 587 tp[0] = x0; 588 tp[1] = x1; 589 tp[2] = x2; 590 tp[3] = x3; 591 tp[4] = x4; 592 tp[5] = x5; 593 idx = 6 * r[k]; 594 x[idx] = x0; 595 x[idx + 1] = x1; 596 x[idx + 2] = x2; 597 x[idx + 3] = x3; 598 x[idx + 4] = x4; 599 x[idx + 5] = x5; 600 } 601 602 PetscCall(ISRestoreIndices(isrow, &r)); 603 PetscCall(VecRestoreArrayRead(bb, &b)); 604 PetscCall(VecRestoreArray(xx, &x)); 605 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 606 PetscFunctionReturn(PETSC_SUCCESS); 607 } 608 609 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 610 { 611 const MatScalar *v, *d; 612 PetscScalar *xp, x0, x1, x2, x3, x4, x5; 613 PetscInt nz, k; 614 const PetscInt *vj; 615 616 PetscFunctionBegin; 617 for (k = 0; k < mbs; k++) { 618 v = aa + 36 * ai[k]; 619 xp = x + k * 6; 620 x0 = xp[0]; 621 x1 = xp[1]; 622 x2 = xp[2]; 623 x3 = xp[3]; 624 x4 = xp[4]; 625 x5 = xp[5]; /* Dk*xk = k-th block of x */ 626 nz = ai[k + 1] - ai[k]; 627 vj = aj + ai[k]; 628 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 629 PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 630 while (nz--) { 631 xp = x + (*vj) * 6; 632 /* x(:) += U(k,:)^T*(Dk*xk) */ 633 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5; 634 xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5; 635 xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5; 636 xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5; 637 xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5; 638 xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5; 639 vj++; 640 v += 36; 641 } 642 /* xk = inv(Dk)*(Dk*xk) */ 643 d = aa + k * 36; /* ptr to inv(Dk) */ 644 xp = x + k * 6; 645 xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5; 646 xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5; 647 xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5; 648 xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5; 649 xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5; 650 xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5; 651 } 652 PetscFunctionReturn(PETSC_SUCCESS); 653 } 654 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 655 { 656 const MatScalar *v; 657 PetscScalar *xp, x0, x1, x2, x3, x4, x5; 658 PetscInt nz, k; 659 const PetscInt *vj; 660 661 PetscFunctionBegin; 662 for (k = mbs - 1; k >= 0; k--) { 663 v = aa + 36 * ai[k]; 664 xp = x + k * 6; 665 x0 = xp[0]; 666 x1 = xp[1]; 667 x2 = xp[2]; 668 x3 = xp[3]; 669 x4 = xp[4]; 670 x5 = xp[5]; /* xk */ 671 nz = ai[k + 1] - ai[k]; 672 vj = aj + ai[k]; 673 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 674 PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 675 while (nz--) { 676 xp = x + (*vj) * 6; 677 /* xk += U(k,:)*x(:) */ 678 x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5]; 679 x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5]; 680 x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5]; 681 x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5]; 682 x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5]; 683 x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5]; 684 vj++; 685 v += 36; 686 } 687 xp = x + k * 6; 688 xp[0] = x0; 689 xp[1] = x1; 690 xp[2] = x2; 691 xp[3] = x3; 692 xp[4] = x4; 693 xp[5] = x5; 694 } 695 PetscFunctionReturn(PETSC_SUCCESS); 696 } 697 698 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 699 { 700 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 701 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 702 const MatScalar *aa = a->a; 703 PetscScalar *x; 704 const PetscScalar *b; 705 706 PetscFunctionBegin; 707 PetscCall(VecGetArrayRead(bb, &b)); 708 PetscCall(VecGetArray(xx, &x)); 709 710 /* solve U^T * D * y = b by forward substitution */ 711 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 712 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 713 714 /* solve U*x = y by back substitution */ 715 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 716 717 PetscCall(VecRestoreArrayRead(bb, &b)); 718 PetscCall(VecRestoreArray(xx, &x)); 719 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 720 PetscFunctionReturn(PETSC_SUCCESS); 721 } 722 723 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 724 { 725 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 726 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 727 const MatScalar *aa = a->a; 728 PetscScalar *x; 729 const PetscScalar *b; 730 731 PetscFunctionBegin; 732 PetscCall(VecGetArrayRead(bb, &b)); 733 PetscCall(VecGetArray(xx, &x)); 734 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 735 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 736 PetscCall(VecRestoreArrayRead(bb, &b)); 737 PetscCall(VecRestoreArray(xx, &x)); 738 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 739 PetscFunctionReturn(PETSC_SUCCESS); 740 } 741 742 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 743 { 744 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 745 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 746 const MatScalar *aa = a->a; 747 PetscScalar *x; 748 const PetscScalar *b; 749 750 PetscFunctionBegin; 751 PetscCall(VecGetArrayRead(bb, &b)); 752 PetscCall(VecGetArray(xx, &x)); 753 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 754 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 755 PetscCall(VecRestoreArrayRead(bb, &b)); 756 PetscCall(VecRestoreArray(xx, &x)); 757 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 758 PetscFunctionReturn(PETSC_SUCCESS); 759 } 760 761 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx) 762 { 763 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 764 IS isrow = a->row; 765 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 766 const PetscInt *r, *vj; 767 PetscInt nz, k, idx; 768 const MatScalar *aa = a->a, *v, *diag; 769 PetscScalar *x, x0, x1, x2, x3, x4, *t, *tp; 770 const PetscScalar *b; 771 772 PetscFunctionBegin; 773 PetscCall(VecGetArrayRead(bb, &b)); 774 PetscCall(VecGetArray(xx, &x)); 775 t = a->solve_work; 776 PetscCall(ISGetIndices(isrow, &r)); 777 778 /* solve U^T * D * y = b by forward substitution */ 779 tp = t; 780 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 781 idx = 5 * r[k]; 782 tp[0] = b[idx]; 783 tp[1] = b[idx + 1]; 784 tp[2] = b[idx + 2]; 785 tp[3] = b[idx + 3]; 786 tp[4] = b[idx + 4]; 787 tp += 5; 788 } 789 790 for (k = 0; k < mbs; k++) { 791 v = aa + 25 * ai[k]; 792 vj = aj + ai[k]; 793 tp = t + k * 5; 794 x0 = tp[0]; 795 x1 = tp[1]; 796 x2 = tp[2]; 797 x3 = tp[3]; 798 x4 = tp[4]; 799 nz = ai[k + 1] - ai[k]; 800 801 tp = t + (*vj) * 5; 802 while (nz--) { 803 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4; 804 tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4; 805 tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4; 806 tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4; 807 tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4; 808 vj++; 809 tp = t + (*vj) * 5; 810 v += 25; 811 } 812 813 /* xk = inv(Dk)*(Dk*xk) */ 814 diag = aa + k * 25; /* ptr to inv(Dk) */ 815 tp = t + k * 5; 816 tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4; 817 tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4; 818 tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4; 819 tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4; 820 tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4; 821 } 822 823 /* solve U*x = y by back substitution */ 824 for (k = mbs - 1; k >= 0; k--) { 825 v = aa + 25 * ai[k]; 826 vj = aj + ai[k]; 827 tp = t + k * 5; 828 x0 = tp[0]; 829 x1 = tp[1]; 830 x2 = tp[2]; 831 x3 = tp[3]; 832 x4 = tp[4]; /* xk */ 833 nz = ai[k + 1] - ai[k]; 834 835 tp = t + (*vj) * 5; 836 while (nz--) { 837 /* xk += U(k,:)*x(:) */ 838 x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4]; 839 x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4]; 840 x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4]; 841 x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4]; 842 x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4]; 843 vj++; 844 tp = t + (*vj) * 5; 845 v += 25; 846 } 847 tp = t + k * 5; 848 tp[0] = x0; 849 tp[1] = x1; 850 tp[2] = x2; 851 tp[3] = x3; 852 tp[4] = x4; 853 idx = 5 * r[k]; 854 x[idx] = x0; 855 x[idx + 1] = x1; 856 x[idx + 2] = x2; 857 x[idx + 3] = x3; 858 x[idx + 4] = x4; 859 } 860 861 PetscCall(ISRestoreIndices(isrow, &r)); 862 PetscCall(VecRestoreArrayRead(bb, &b)); 863 PetscCall(VecRestoreArray(xx, &x)); 864 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 865 PetscFunctionReturn(PETSC_SUCCESS); 866 } 867 868 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 869 { 870 const MatScalar *v, *diag; 871 PetscScalar *xp, x0, x1, x2, x3, x4; 872 PetscInt nz, k; 873 const PetscInt *vj; 874 875 PetscFunctionBegin; 876 for (k = 0; k < mbs; k++) { 877 v = aa + 25 * ai[k]; 878 xp = x + k * 5; 879 x0 = xp[0]; 880 x1 = xp[1]; 881 x2 = xp[2]; 882 x3 = xp[3]; 883 x4 = xp[4]; /* Dk*xk = k-th block of x */ 884 nz = ai[k + 1] - ai[k]; 885 vj = aj + ai[k]; 886 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 887 PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 888 while (nz--) { 889 xp = x + (*vj) * 5; 890 /* x(:) += U(k,:)^T*(Dk*xk) */ 891 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4; 892 xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4; 893 xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4; 894 xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4; 895 xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4; 896 vj++; 897 v += 25; 898 } 899 /* xk = inv(Dk)*(Dk*xk) */ 900 diag = aa + k * 25; /* ptr to inv(Dk) */ 901 xp = x + k * 5; 902 xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4; 903 xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4; 904 xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4; 905 xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4; 906 xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4; 907 } 908 PetscFunctionReturn(PETSC_SUCCESS); 909 } 910 911 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 912 { 913 const MatScalar *v; 914 PetscScalar *xp, x0, x1, x2, x3, x4; 915 PetscInt nz, k; 916 const PetscInt *vj; 917 918 PetscFunctionBegin; 919 for (k = mbs - 1; k >= 0; k--) { 920 v = aa + 25 * ai[k]; 921 xp = x + k * 5; 922 x0 = xp[0]; 923 x1 = xp[1]; 924 x2 = xp[2]; 925 x3 = xp[3]; 926 x4 = xp[4]; /* xk */ 927 nz = ai[k + 1] - ai[k]; 928 vj = aj + ai[k]; 929 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 930 PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 931 while (nz--) { 932 xp = x + (*vj) * 5; 933 /* xk += U(k,:)*x(:) */ 934 x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4]; 935 x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4]; 936 x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4]; 937 x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4]; 938 x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4]; 939 vj++; 940 v += 25; 941 } 942 xp = x + k * 5; 943 xp[0] = x0; 944 xp[1] = x1; 945 xp[2] = x2; 946 xp[3] = x3; 947 xp[4] = x4; 948 } 949 PetscFunctionReturn(PETSC_SUCCESS); 950 } 951 952 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 953 { 954 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 955 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 956 const MatScalar *aa = a->a; 957 PetscScalar *x; 958 const PetscScalar *b; 959 960 PetscFunctionBegin; 961 PetscCall(VecGetArrayRead(bb, &b)); 962 PetscCall(VecGetArray(xx, &x)); 963 964 /* solve U^T * D * y = b by forward substitution */ 965 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */ 966 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 967 968 /* solve U*x = y by back substitution */ 969 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 970 971 PetscCall(VecRestoreArrayRead(bb, &b)); 972 PetscCall(VecRestoreArray(xx, &x)); 973 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 974 PetscFunctionReturn(PETSC_SUCCESS); 975 } 976 977 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 978 { 979 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 980 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 981 const MatScalar *aa = a->a; 982 PetscScalar *x; 983 const PetscScalar *b; 984 985 PetscFunctionBegin; 986 PetscCall(VecGetArrayRead(bb, &b)); 987 PetscCall(VecGetArray(xx, &x)); 988 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */ 989 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 990 PetscCall(VecRestoreArrayRead(bb, &b)); 991 PetscCall(VecRestoreArray(xx, &x)); 992 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 993 PetscFunctionReturn(PETSC_SUCCESS); 994 } 995 996 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 997 { 998 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 999 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1000 const MatScalar *aa = a->a; 1001 PetscScalar *x; 1002 const PetscScalar *b; 1003 1004 PetscFunctionBegin; 1005 PetscCall(VecGetArrayRead(bb, &b)); 1006 PetscCall(VecGetArray(xx, &x)); 1007 PetscCall(PetscArraycpy(x, b, 5 * mbs)); 1008 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 1009 PetscCall(VecRestoreArrayRead(bb, &b)); 1010 PetscCall(VecRestoreArray(xx, &x)); 1011 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1012 PetscFunctionReturn(PETSC_SUCCESS); 1013 } 1014 1015 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx) 1016 { 1017 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1018 IS isrow = a->row; 1019 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1020 const PetscInt *r, *vj; 1021 PetscInt nz, k, idx; 1022 const MatScalar *aa = a->a, *v, *diag; 1023 PetscScalar *x, x0, x1, x2, x3, *t, *tp; 1024 const PetscScalar *b; 1025 1026 PetscFunctionBegin; 1027 PetscCall(VecGetArrayRead(bb, &b)); 1028 PetscCall(VecGetArray(xx, &x)); 1029 t = a->solve_work; 1030 PetscCall(ISGetIndices(isrow, &r)); 1031 1032 /* solve U^T * D * y = b by forward substitution */ 1033 tp = t; 1034 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1035 idx = 4 * r[k]; 1036 tp[0] = b[idx]; 1037 tp[1] = b[idx + 1]; 1038 tp[2] = b[idx + 2]; 1039 tp[3] = b[idx + 3]; 1040 tp += 4; 1041 } 1042 1043 for (k = 0; k < mbs; k++) { 1044 v = aa + 16 * ai[k]; 1045 vj = aj + ai[k]; 1046 tp = t + k * 4; 1047 x0 = tp[0]; 1048 x1 = tp[1]; 1049 x2 = tp[2]; 1050 x3 = tp[3]; 1051 nz = ai[k + 1] - ai[k]; 1052 1053 tp = t + (*vj) * 4; 1054 while (nz--) { 1055 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3; 1056 tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3; 1057 tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3; 1058 tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3; 1059 vj++; 1060 tp = t + (*vj) * 4; 1061 v += 16; 1062 } 1063 1064 /* xk = inv(Dk)*(Dk*xk) */ 1065 diag = aa + k * 16; /* ptr to inv(Dk) */ 1066 tp = t + k * 4; 1067 tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3; 1068 tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3; 1069 tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3; 1070 tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3; 1071 } 1072 1073 /* solve U*x = y by back substitution */ 1074 for (k = mbs - 1; k >= 0; k--) { 1075 v = aa + 16 * ai[k]; 1076 vj = aj + ai[k]; 1077 tp = t + k * 4; 1078 x0 = tp[0]; 1079 x1 = tp[1]; 1080 x2 = tp[2]; 1081 x3 = tp[3]; /* xk */ 1082 nz = ai[k + 1] - ai[k]; 1083 1084 tp = t + (*vj) * 4; 1085 while (nz--) { 1086 /* xk += U(k,:)*x(:) */ 1087 x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3]; 1088 x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3]; 1089 x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3]; 1090 x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3]; 1091 vj++; 1092 tp = t + (*vj) * 4; 1093 v += 16; 1094 } 1095 tp = t + k * 4; 1096 tp[0] = x0; 1097 tp[1] = x1; 1098 tp[2] = x2; 1099 tp[3] = x3; 1100 idx = 4 * r[k]; 1101 x[idx] = x0; 1102 x[idx + 1] = x1; 1103 x[idx + 2] = x2; 1104 x[idx + 3] = x3; 1105 } 1106 1107 PetscCall(ISRestoreIndices(isrow, &r)); 1108 PetscCall(VecRestoreArrayRead(bb, &b)); 1109 PetscCall(VecRestoreArray(xx, &x)); 1110 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1111 PetscFunctionReturn(PETSC_SUCCESS); 1112 } 1113 1114 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1115 { 1116 const MatScalar *v, *diag; 1117 PetscScalar *xp, x0, x1, x2, x3; 1118 PetscInt nz, k; 1119 const PetscInt *vj; 1120 1121 PetscFunctionBegin; 1122 for (k = 0; k < mbs; k++) { 1123 v = aa + 16 * ai[k]; 1124 xp = x + k * 4; 1125 x0 = xp[0]; 1126 x1 = xp[1]; 1127 x2 = xp[2]; 1128 x3 = xp[3]; /* Dk*xk = k-th block of x */ 1129 nz = ai[k + 1] - ai[k]; 1130 vj = aj + ai[k]; 1131 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1132 PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1133 while (nz--) { 1134 xp = x + (*vj) * 4; 1135 /* x(:) += U(k,:)^T*(Dk*xk) */ 1136 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3; 1137 xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3; 1138 xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3; 1139 xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3; 1140 vj++; 1141 v += 16; 1142 } 1143 /* xk = inv(Dk)*(Dk*xk) */ 1144 diag = aa + k * 16; /* ptr to inv(Dk) */ 1145 xp = x + k * 4; 1146 xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3; 1147 xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3; 1148 xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3; 1149 xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3; 1150 } 1151 PetscFunctionReturn(PETSC_SUCCESS); 1152 } 1153 1154 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1155 { 1156 const MatScalar *v; 1157 PetscScalar *xp, x0, x1, x2, x3; 1158 PetscInt nz, k; 1159 const PetscInt *vj; 1160 1161 PetscFunctionBegin; 1162 for (k = mbs - 1; k >= 0; k--) { 1163 v = aa + 16 * ai[k]; 1164 xp = x + k * 4; 1165 x0 = xp[0]; 1166 x1 = xp[1]; 1167 x2 = xp[2]; 1168 x3 = xp[3]; /* xk */ 1169 nz = ai[k + 1] - ai[k]; 1170 vj = aj + ai[k]; 1171 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1172 PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1173 while (nz--) { 1174 xp = x + (*vj) * 4; 1175 /* xk += U(k,:)*x(:) */ 1176 x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3]; 1177 x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3]; 1178 x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3]; 1179 x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3]; 1180 vj++; 1181 v += 16; 1182 } 1183 xp = x + k * 4; 1184 xp[0] = x0; 1185 xp[1] = x1; 1186 xp[2] = x2; 1187 xp[3] = x3; 1188 } 1189 PetscFunctionReturn(PETSC_SUCCESS); 1190 } 1191 1192 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1193 { 1194 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1195 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1196 const MatScalar *aa = a->a; 1197 PetscScalar *x; 1198 const PetscScalar *b; 1199 1200 PetscFunctionBegin; 1201 PetscCall(VecGetArrayRead(bb, &b)); 1202 PetscCall(VecGetArray(xx, &x)); 1203 1204 /* solve U^T * D * y = b by forward substitution */ 1205 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */ 1206 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1207 1208 /* solve U*x = y by back substitution */ 1209 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1210 PetscCall(VecRestoreArrayRead(bb, &b)); 1211 PetscCall(VecRestoreArray(xx, &x)); 1212 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1213 PetscFunctionReturn(PETSC_SUCCESS); 1214 } 1215 1216 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1217 { 1218 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1219 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1220 const MatScalar *aa = a->a; 1221 PetscScalar *x; 1222 const PetscScalar *b; 1223 1224 PetscFunctionBegin; 1225 PetscCall(VecGetArrayRead(bb, &b)); 1226 PetscCall(VecGetArray(xx, &x)); 1227 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */ 1228 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1229 PetscCall(VecRestoreArrayRead(bb, &b)); 1230 PetscCall(VecRestoreArray(xx, &x)); 1231 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1232 PetscFunctionReturn(PETSC_SUCCESS); 1233 } 1234 1235 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1236 { 1237 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1238 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1239 const MatScalar *aa = a->a; 1240 PetscScalar *x; 1241 const PetscScalar *b; 1242 1243 PetscFunctionBegin; 1244 PetscCall(VecGetArrayRead(bb, &b)); 1245 PetscCall(VecGetArray(xx, &x)); 1246 PetscCall(PetscArraycpy(x, b, 4 * mbs)); 1247 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1248 PetscCall(VecRestoreArrayRead(bb, &b)); 1249 PetscCall(VecRestoreArray(xx, &x)); 1250 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1251 PetscFunctionReturn(PETSC_SUCCESS); 1252 } 1253 1254 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx) 1255 { 1256 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1257 IS isrow = a->row; 1258 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1259 const PetscInt *r; 1260 PetscInt nz, k, idx; 1261 const PetscInt *vj; 1262 const MatScalar *aa = a->a, *v, *diag; 1263 PetscScalar *x, x0, x1, x2, *t, *tp; 1264 const PetscScalar *b; 1265 1266 PetscFunctionBegin; 1267 PetscCall(VecGetArrayRead(bb, &b)); 1268 PetscCall(VecGetArray(xx, &x)); 1269 t = a->solve_work; 1270 PetscCall(ISGetIndices(isrow, &r)); 1271 1272 /* solve U^T * D * y = b by forward substitution */ 1273 tp = t; 1274 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1275 idx = 3 * r[k]; 1276 tp[0] = b[idx]; 1277 tp[1] = b[idx + 1]; 1278 tp[2] = b[idx + 2]; 1279 tp += 3; 1280 } 1281 1282 for (k = 0; k < mbs; k++) { 1283 v = aa + 9 * ai[k]; 1284 vj = aj + ai[k]; 1285 tp = t + k * 3; 1286 x0 = tp[0]; 1287 x1 = tp[1]; 1288 x2 = tp[2]; 1289 nz = ai[k + 1] - ai[k]; 1290 1291 tp = t + (*vj) * 3; 1292 while (nz--) { 1293 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2; 1294 tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2; 1295 tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2; 1296 vj++; 1297 tp = t + (*vj) * 3; 1298 v += 9; 1299 } 1300 1301 /* xk = inv(Dk)*(Dk*xk) */ 1302 diag = aa + k * 9; /* ptr to inv(Dk) */ 1303 tp = t + k * 3; 1304 tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2; 1305 tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2; 1306 tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2; 1307 } 1308 1309 /* solve U*x = y by back substitution */ 1310 for (k = mbs - 1; k >= 0; k--) { 1311 v = aa + 9 * ai[k]; 1312 vj = aj + ai[k]; 1313 tp = t + k * 3; 1314 x0 = tp[0]; 1315 x1 = tp[1]; 1316 x2 = tp[2]; /* xk */ 1317 nz = ai[k + 1] - ai[k]; 1318 1319 tp = t + (*vj) * 3; 1320 while (nz--) { 1321 /* xk += U(k,:)*x(:) */ 1322 x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2]; 1323 x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2]; 1324 x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2]; 1325 vj++; 1326 tp = t + (*vj) * 3; 1327 v += 9; 1328 } 1329 tp = t + k * 3; 1330 tp[0] = x0; 1331 tp[1] = x1; 1332 tp[2] = x2; 1333 idx = 3 * r[k]; 1334 x[idx] = x0; 1335 x[idx + 1] = x1; 1336 x[idx + 2] = x2; 1337 } 1338 1339 PetscCall(ISRestoreIndices(isrow, &r)); 1340 PetscCall(VecRestoreArrayRead(bb, &b)); 1341 PetscCall(VecRestoreArray(xx, &x)); 1342 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1343 PetscFunctionReturn(PETSC_SUCCESS); 1344 } 1345 1346 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1347 { 1348 const MatScalar *v, *diag; 1349 PetscScalar *xp, x0, x1, x2; 1350 PetscInt nz, k; 1351 const PetscInt *vj; 1352 1353 PetscFunctionBegin; 1354 for (k = 0; k < mbs; k++) { 1355 v = aa + 9 * ai[k]; 1356 xp = x + k * 3; 1357 x0 = xp[0]; 1358 x1 = xp[1]; 1359 x2 = xp[2]; /* Dk*xk = k-th block of x */ 1360 nz = ai[k + 1] - ai[k]; 1361 vj = aj + ai[k]; 1362 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1363 PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1364 while (nz--) { 1365 xp = x + (*vj) * 3; 1366 /* x(:) += U(k,:)^T*(Dk*xk) */ 1367 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2; 1368 xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2; 1369 xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2; 1370 vj++; 1371 v += 9; 1372 } 1373 /* xk = inv(Dk)*(Dk*xk) */ 1374 diag = aa + k * 9; /* ptr to inv(Dk) */ 1375 xp = x + k * 3; 1376 xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2; 1377 xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2; 1378 xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2; 1379 } 1380 PetscFunctionReturn(PETSC_SUCCESS); 1381 } 1382 1383 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1384 { 1385 const MatScalar *v; 1386 PetscScalar *xp, x0, x1, x2; 1387 PetscInt nz, k; 1388 const PetscInt *vj; 1389 1390 PetscFunctionBegin; 1391 for (k = mbs - 1; k >= 0; k--) { 1392 v = aa + 9 * ai[k]; 1393 xp = x + k * 3; 1394 x0 = xp[0]; 1395 x1 = xp[1]; 1396 x2 = xp[2]; /* xk */ 1397 nz = ai[k + 1] - ai[k]; 1398 vj = aj + ai[k]; 1399 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1400 PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1401 while (nz--) { 1402 xp = x + (*vj) * 3; 1403 /* xk += U(k,:)*x(:) */ 1404 x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2]; 1405 x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2]; 1406 x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2]; 1407 vj++; 1408 v += 9; 1409 } 1410 xp = x + k * 3; 1411 xp[0] = x0; 1412 xp[1] = x1; 1413 xp[2] = x2; 1414 } 1415 PetscFunctionReturn(PETSC_SUCCESS); 1416 } 1417 1418 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1419 { 1420 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1421 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1422 const MatScalar *aa = a->a; 1423 PetscScalar *x; 1424 const PetscScalar *b; 1425 1426 PetscFunctionBegin; 1427 PetscCall(VecGetArrayRead(bb, &b)); 1428 PetscCall(VecGetArray(xx, &x)); 1429 1430 /* solve U^T * D * y = b by forward substitution */ 1431 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1432 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1433 1434 /* solve U*x = y by back substitution */ 1435 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1436 1437 PetscCall(VecRestoreArrayRead(bb, &b)); 1438 PetscCall(VecRestoreArray(xx, &x)); 1439 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1440 PetscFunctionReturn(PETSC_SUCCESS); 1441 } 1442 1443 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1444 { 1445 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1446 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1447 const MatScalar *aa = a->a; 1448 PetscScalar *x; 1449 const PetscScalar *b; 1450 1451 PetscFunctionBegin; 1452 PetscCall(VecGetArrayRead(bb, &b)); 1453 PetscCall(VecGetArray(xx, &x)); 1454 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1455 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1456 PetscCall(VecRestoreArrayRead(bb, &b)); 1457 PetscCall(VecRestoreArray(xx, &x)); 1458 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1459 PetscFunctionReturn(PETSC_SUCCESS); 1460 } 1461 1462 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1463 { 1464 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1465 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1466 const MatScalar *aa = a->a; 1467 PetscScalar *x; 1468 const PetscScalar *b; 1469 1470 PetscFunctionBegin; 1471 PetscCall(VecGetArrayRead(bb, &b)); 1472 PetscCall(VecGetArray(xx, &x)); 1473 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1474 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1475 PetscCall(VecRestoreArrayRead(bb, &b)); 1476 PetscCall(VecRestoreArray(xx, &x)); 1477 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1478 PetscFunctionReturn(PETSC_SUCCESS); 1479 } 1480 1481 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx) 1482 { 1483 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1484 IS isrow = a->row; 1485 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1486 const PetscInt *r, *vj; 1487 PetscInt nz, k, k2, idx; 1488 const MatScalar *aa = a->a, *v, *diag; 1489 PetscScalar *x, x0, x1, *t; 1490 const PetscScalar *b; 1491 1492 PetscFunctionBegin; 1493 PetscCall(VecGetArrayRead(bb, &b)); 1494 PetscCall(VecGetArray(xx, &x)); 1495 t = a->solve_work; 1496 PetscCall(ISGetIndices(isrow, &r)); 1497 1498 /* solve U^T * D * y = perm(b) by forward substitution */ 1499 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1500 idx = 2 * r[k]; 1501 t[k * 2] = b[idx]; 1502 t[k * 2 + 1] = b[idx + 1]; 1503 } 1504 for (k = 0; k < mbs; k++) { 1505 v = aa + 4 * ai[k]; 1506 vj = aj + ai[k]; 1507 k2 = k * 2; 1508 x0 = t[k2]; 1509 x1 = t[k2 + 1]; 1510 nz = ai[k + 1] - ai[k]; 1511 while (nz--) { 1512 t[(*vj) * 2] += v[0] * x0 + v[1] * x1; 1513 t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1; 1514 vj++; 1515 v += 4; 1516 } 1517 diag = aa + k * 4; /* ptr to inv(Dk) */ 1518 t[k2] = diag[0] * x0 + diag[2] * x1; 1519 t[k2 + 1] = diag[1] * x0 + diag[3] * x1; 1520 } 1521 1522 /* solve U*x = y by back substitution */ 1523 for (k = mbs - 1; k >= 0; k--) { 1524 v = aa + 4 * ai[k]; 1525 vj = aj + ai[k]; 1526 k2 = k * 2; 1527 x0 = t[k2]; 1528 x1 = t[k2 + 1]; 1529 nz = ai[k + 1] - ai[k]; 1530 while (nz--) { 1531 x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1]; 1532 x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1]; 1533 vj++; 1534 v += 4; 1535 } 1536 t[k2] = x0; 1537 t[k2 + 1] = x1; 1538 idx = 2 * r[k]; 1539 x[idx] = x0; 1540 x[idx + 1] = x1; 1541 } 1542 1543 PetscCall(ISRestoreIndices(isrow, &r)); 1544 PetscCall(VecRestoreArrayRead(bb, &b)); 1545 PetscCall(VecRestoreArray(xx, &x)); 1546 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1547 PetscFunctionReturn(PETSC_SUCCESS); 1548 } 1549 1550 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1551 { 1552 const MatScalar *v, *diag; 1553 PetscScalar x0, x1; 1554 PetscInt nz, k, k2; 1555 const PetscInt *vj; 1556 1557 PetscFunctionBegin; 1558 for (k = 0; k < mbs; k++) { 1559 v = aa + 4 * ai[k]; 1560 vj = aj + ai[k]; 1561 k2 = k * 2; 1562 x0 = x[k2]; 1563 x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */ 1564 nz = ai[k + 1] - ai[k]; 1565 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1566 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1567 while (nz--) { 1568 /* x(:) += U(k,:)^T*(Dk*xk) */ 1569 x[(*vj) * 2] += v[0] * x0 + v[1] * x1; 1570 x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1; 1571 vj++; 1572 v += 4; 1573 } 1574 /* xk = inv(Dk)*(Dk*xk) */ 1575 diag = aa + k * 4; /* ptr to inv(Dk) */ 1576 x[k2] = diag[0] * x0 + diag[2] * x1; 1577 x[k2 + 1] = diag[1] * x0 + diag[3] * x1; 1578 } 1579 PetscFunctionReturn(PETSC_SUCCESS); 1580 } 1581 1582 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1583 { 1584 const MatScalar *v; 1585 PetscScalar x0, x1; 1586 PetscInt nz, k, k2; 1587 const PetscInt *vj; 1588 1589 PetscFunctionBegin; 1590 for (k = mbs - 1; k >= 0; k--) { 1591 v = aa + 4 * ai[k]; 1592 vj = aj + ai[k]; 1593 k2 = k * 2; 1594 x0 = x[k2]; 1595 x1 = x[k2 + 1]; /* xk */ 1596 nz = ai[k + 1] - ai[k]; 1597 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1598 PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1599 while (nz--) { 1600 /* xk += U(k,:)*x(:) */ 1601 x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1]; 1602 x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1]; 1603 vj++; 1604 v += 4; 1605 } 1606 x[k2] = x0; 1607 x[k2 + 1] = x1; 1608 } 1609 PetscFunctionReturn(PETSC_SUCCESS); 1610 } 1611 1612 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1613 { 1614 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1615 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1616 const MatScalar *aa = a->a; 1617 PetscScalar *x; 1618 const PetscScalar *b; 1619 1620 PetscFunctionBegin; 1621 PetscCall(VecGetArrayRead(bb, &b)); 1622 PetscCall(VecGetArray(xx, &x)); 1623 1624 /* solve U^T * D * y = b by forward substitution */ 1625 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1626 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1627 1628 /* solve U*x = y by back substitution */ 1629 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1630 1631 PetscCall(VecRestoreArrayRead(bb, &b)); 1632 PetscCall(VecRestoreArray(xx, &x)); 1633 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1634 PetscFunctionReturn(PETSC_SUCCESS); 1635 } 1636 1637 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1638 { 1639 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1640 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1641 const MatScalar *aa = a->a; 1642 PetscScalar *x; 1643 const PetscScalar *b; 1644 1645 PetscFunctionBegin; 1646 PetscCall(VecGetArrayRead(bb, &b)); 1647 PetscCall(VecGetArray(xx, &x)); 1648 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1649 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1650 PetscCall(VecRestoreArrayRead(bb, &b)); 1651 PetscCall(VecRestoreArray(xx, &x)); 1652 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1653 PetscFunctionReturn(PETSC_SUCCESS); 1654 } 1655 1656 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1657 { 1658 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1659 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1660 const MatScalar *aa = a->a; 1661 PetscScalar *x; 1662 const PetscScalar *b; 1663 1664 PetscFunctionBegin; 1665 PetscCall(VecGetArrayRead(bb, &b)); 1666 PetscCall(VecGetArray(xx, &x)); 1667 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1668 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1669 PetscCall(VecRestoreArrayRead(bb, &b)); 1670 PetscCall(VecRestoreArray(xx, &x)); 1671 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1672 PetscFunctionReturn(PETSC_SUCCESS); 1673 } 1674 1675 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) 1676 { 1677 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1678 IS isrow = a->row; 1679 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1680 const MatScalar *aa = a->a, *v; 1681 const PetscScalar *b; 1682 PetscScalar *x, xk, *t; 1683 PetscInt nz, k, j; 1684 1685 PetscFunctionBegin; 1686 PetscCall(VecGetArrayRead(bb, &b)); 1687 PetscCall(VecGetArray(xx, &x)); 1688 t = a->solve_work; 1689 PetscCall(ISGetIndices(isrow, &rp)); 1690 1691 /* solve U^T*D*y = perm(b) by forward substitution */ 1692 for (k = 0; k < mbs; k++) t[k] = b[rp[k]]; 1693 for (k = 0; k < mbs; k++) { 1694 v = aa + ai[k]; 1695 vj = aj + ai[k]; 1696 xk = t[k]; 1697 nz = ai[k + 1] - ai[k] - 1; 1698 for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk; 1699 t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */ 1700 } 1701 1702 /* solve U*perm(x) = y by back substitution */ 1703 for (k = mbs - 1; k >= 0; k--) { 1704 v = aa + adiag[k] - 1; 1705 vj = aj + adiag[k] - 1; 1706 nz = ai[k + 1] - ai[k] - 1; 1707 for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]]; 1708 x[rp[k]] = t[k]; 1709 } 1710 1711 PetscCall(ISRestoreIndices(isrow, &rp)); 1712 PetscCall(VecRestoreArrayRead(bb, &b)); 1713 PetscCall(VecRestoreArray(xx, &x)); 1714 PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs)); 1715 PetscFunctionReturn(PETSC_SUCCESS); 1716 } 1717 1718 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1719 { 1720 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1721 IS isrow = a->row; 1722 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1723 const MatScalar *aa = a->a, *v; 1724 PetscScalar *x, xk, *t; 1725 const PetscScalar *b; 1726 PetscInt nz, k; 1727 1728 PetscFunctionBegin; 1729 PetscCall(VecGetArrayRead(bb, &b)); 1730 PetscCall(VecGetArray(xx, &x)); 1731 t = a->solve_work; 1732 PetscCall(ISGetIndices(isrow, &rp)); 1733 1734 /* solve U^T*D*y = perm(b) by forward substitution */ 1735 for (k = 0; k < mbs; k++) t[k] = b[rp[k]]; 1736 for (k = 0; k < mbs; k++) { 1737 v = aa + ai[k] + 1; 1738 vj = aj + ai[k] + 1; 1739 xk = t[k]; 1740 nz = ai[k + 1] - ai[k] - 1; 1741 while (nz--) t[*vj++] += (*v++) * xk; 1742 t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */ 1743 } 1744 1745 /* solve U*perm(x) = y by back substitution */ 1746 for (k = mbs - 1; k >= 0; k--) { 1747 v = aa + ai[k] + 1; 1748 vj = aj + ai[k] + 1; 1749 nz = ai[k + 1] - ai[k] - 1; 1750 while (nz--) t[k] += (*v++) * t[*vj++]; 1751 x[rp[k]] = t[k]; 1752 } 1753 1754 PetscCall(ISRestoreIndices(isrow, &rp)); 1755 PetscCall(VecRestoreArrayRead(bb, &b)); 1756 PetscCall(VecRestoreArray(xx, &x)); 1757 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 1758 PetscFunctionReturn(PETSC_SUCCESS); 1759 } 1760 1761 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) 1762 { 1763 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1764 IS isrow = a->row; 1765 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1766 const MatScalar *aa = a->a, *v; 1767 PetscReal diagk; 1768 PetscScalar *x, xk; 1769 const PetscScalar *b; 1770 PetscInt nz, k; 1771 1772 PetscFunctionBegin; 1773 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1774 PetscCall(VecGetArrayRead(bb, &b)); 1775 PetscCall(VecGetArray(xx, &x)); 1776 PetscCall(ISGetIndices(isrow, &rp)); 1777 1778 for (k = 0; k < mbs; k++) x[k] = b[rp[k]]; 1779 for (k = 0; k < mbs; k++) { 1780 v = aa + ai[k]; 1781 vj = aj + ai[k]; 1782 xk = x[k]; 1783 nz = ai[k + 1] - ai[k] - 1; 1784 while (nz--) x[*vj++] += (*v++) * xk; 1785 1786 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1787 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1788 x[k] = xk * PetscSqrtReal(diagk); 1789 } 1790 PetscCall(ISRestoreIndices(isrow, &rp)); 1791 PetscCall(VecRestoreArrayRead(bb, &b)); 1792 PetscCall(VecRestoreArray(xx, &x)); 1793 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1794 PetscFunctionReturn(PETSC_SUCCESS); 1795 } 1796 1797 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1798 { 1799 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1800 IS isrow = a->row; 1801 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1802 const MatScalar *aa = a->a, *v; 1803 PetscReal diagk; 1804 PetscScalar *x, xk; 1805 const PetscScalar *b; 1806 PetscInt nz, k; 1807 1808 PetscFunctionBegin; 1809 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1810 PetscCall(VecGetArrayRead(bb, &b)); 1811 PetscCall(VecGetArray(xx, &x)); 1812 PetscCall(ISGetIndices(isrow, &rp)); 1813 1814 for (k = 0; k < mbs; k++) x[k] = b[rp[k]]; 1815 for (k = 0; k < mbs; k++) { 1816 v = aa + ai[k] + 1; 1817 vj = aj + ai[k] + 1; 1818 xk = x[k]; 1819 nz = ai[k + 1] - ai[k] - 1; 1820 while (nz--) x[*vj++] += (*v++) * xk; 1821 1822 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1823 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1824 x[k] = xk * PetscSqrtReal(diagk); 1825 } 1826 PetscCall(ISRestoreIndices(isrow, &rp)); 1827 PetscCall(VecRestoreArrayRead(bb, &b)); 1828 PetscCall(VecRestoreArray(xx, &x)); 1829 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1830 PetscFunctionReturn(PETSC_SUCCESS); 1831 } 1832 1833 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) 1834 { 1835 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1836 IS isrow = a->row; 1837 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1838 const MatScalar *aa = a->a, *v; 1839 PetscReal diagk; 1840 PetscScalar *x, *t; 1841 const PetscScalar *b; 1842 PetscInt nz, k; 1843 1844 PetscFunctionBegin; 1845 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1846 PetscCall(VecGetArrayRead(bb, &b)); 1847 PetscCall(VecGetArray(xx, &x)); 1848 t = a->solve_work; 1849 PetscCall(ISGetIndices(isrow, &rp)); 1850 1851 for (k = mbs - 1; k >= 0; k--) { 1852 v = aa + ai[k]; 1853 vj = aj + ai[k]; 1854 diagk = PetscRealPart(aa[adiag[k]]); 1855 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1856 t[k] = b[k] * PetscSqrtReal(diagk); 1857 nz = ai[k + 1] - ai[k] - 1; 1858 while (nz--) t[k] += (*v++) * t[*vj++]; 1859 x[rp[k]] = t[k]; 1860 } 1861 PetscCall(ISRestoreIndices(isrow, &rp)); 1862 PetscCall(VecRestoreArrayRead(bb, &b)); 1863 PetscCall(VecRestoreArray(xx, &x)); 1864 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1865 PetscFunctionReturn(PETSC_SUCCESS); 1866 } 1867 1868 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1869 { 1870 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1871 IS isrow = a->row; 1872 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1873 const MatScalar *aa = a->a, *v; 1874 PetscReal diagk; 1875 PetscScalar *x, *t; 1876 const PetscScalar *b; 1877 PetscInt nz, k; 1878 1879 PetscFunctionBegin; 1880 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1881 PetscCall(VecGetArrayRead(bb, &b)); 1882 PetscCall(VecGetArray(xx, &x)); 1883 t = a->solve_work; 1884 PetscCall(ISGetIndices(isrow, &rp)); 1885 1886 for (k = mbs - 1; k >= 0; k--) { 1887 v = aa + ai[k] + 1; 1888 vj = aj + ai[k] + 1; 1889 diagk = PetscRealPart(aa[ai[k]]); 1890 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1891 t[k] = b[k] * PetscSqrtReal(diagk); 1892 nz = ai[k + 1] - ai[k] - 1; 1893 while (nz--) t[k] += (*v++) * t[*vj++]; 1894 x[rp[k]] = t[k]; 1895 } 1896 PetscCall(ISRestoreIndices(isrow, &rp)); 1897 PetscCall(VecRestoreArrayRead(bb, &b)); 1898 PetscCall(VecRestoreArray(xx, &x)); 1899 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1900 PetscFunctionReturn(PETSC_SUCCESS); 1901 } 1902 1903 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx) 1904 { 1905 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1906 1907 PetscFunctionBegin; 1908 if (A->rmap->bs == 1) { 1909 PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v)); 1910 } else { 1911 IS isrow = a->row; 1912 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp; 1913 const MatScalar *aa = a->a, *v; 1914 PetscScalar *x, *t; 1915 const PetscScalar *b; 1916 PetscInt nz, k, n, i, j; 1917 1918 if (bb->n > a->solves_work_n) { 1919 PetscCall(PetscFree(a->solves_work)); 1920 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work)); 1921 a->solves_work_n = bb->n; 1922 } 1923 n = bb->n; 1924 PetscCall(VecGetArrayRead(bb->v, &b)); 1925 PetscCall(VecGetArray(xx->v, &x)); 1926 t = a->solves_work; 1927 1928 PetscCall(ISGetIndices(isrow, &rp)); 1929 1930 /* solve U^T*D*y = perm(b) by forward substitution */ 1931 for (k = 0; k < mbs; k++) { 1932 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */ 1933 } 1934 for (k = 0; k < mbs; k++) { 1935 v = aa + ai[k]; 1936 vj = aj + ai[k]; 1937 nz = ai[k + 1] - ai[k] - 1; 1938 for (j = 0; j < nz; j++) { 1939 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i]; 1940 v++; 1941 vj++; 1942 } 1943 for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1944 } 1945 1946 /* solve U*perm(x) = y by back substitution */ 1947 for (k = mbs - 1; k >= 0; k--) { 1948 v = aa + ai[k] - 1; 1949 vj = aj + ai[k] - 1; 1950 nz = ai[k + 1] - ai[k] - 1; 1951 for (j = 0; j < nz; j++) { 1952 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i]; 1953 v++; 1954 vj++; 1955 } 1956 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i]; 1957 } 1958 1959 PetscCall(ISRestoreIndices(isrow, &rp)); 1960 PetscCall(VecRestoreArrayRead(bb->v, &b)); 1961 PetscCall(VecRestoreArray(xx->v, &x)); 1962 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs))); 1963 } 1964 PetscFunctionReturn(PETSC_SUCCESS); 1965 } 1966 1967 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx) 1968 { 1969 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1970 1971 PetscFunctionBegin; 1972 if (A->rmap->bs == 1) { 1973 PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v)); 1974 } else { 1975 IS isrow = a->row; 1976 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp; 1977 const MatScalar *aa = a->a, *v; 1978 PetscScalar *x, *t; 1979 const PetscScalar *b; 1980 PetscInt nz, k, n, i; 1981 1982 if (bb->n > a->solves_work_n) { 1983 PetscCall(PetscFree(a->solves_work)); 1984 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work)); 1985 a->solves_work_n = bb->n; 1986 } 1987 n = bb->n; 1988 PetscCall(VecGetArrayRead(bb->v, &b)); 1989 PetscCall(VecGetArray(xx->v, &x)); 1990 t = a->solves_work; 1991 1992 PetscCall(ISGetIndices(isrow, &rp)); 1993 1994 /* solve U^T*D*y = perm(b) by forward substitution */ 1995 for (k = 0; k < mbs; k++) { 1996 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */ 1997 } 1998 for (k = 0; k < mbs; k++) { 1999 v = aa + ai[k]; 2000 vj = aj + ai[k]; 2001 nz = ai[k + 1] - ai[k]; 2002 while (nz--) { 2003 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i]; 2004 v++; 2005 vj++; 2006 } 2007 for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 2008 } 2009 2010 /* solve U*perm(x) = y by back substitution */ 2011 for (k = mbs - 1; k >= 0; k--) { 2012 v = aa + ai[k]; 2013 vj = aj + ai[k]; 2014 nz = ai[k + 1] - ai[k]; 2015 while (nz--) { 2016 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i]; 2017 v++; 2018 vj++; 2019 } 2020 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i]; 2021 } 2022 2023 PetscCall(ISRestoreIndices(isrow, &rp)); 2024 PetscCall(VecRestoreArrayRead(bb->v, &b)); 2025 PetscCall(VecRestoreArray(xx->v, &x)); 2026 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs))); 2027 } 2028 PetscFunctionReturn(PETSC_SUCCESS); 2029 } 2030 2031 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) 2032 { 2033 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2034 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag; 2035 const MatScalar *aa = a->a, *v; 2036 const PetscScalar *b; 2037 PetscScalar *x, xi; 2038 PetscInt nz, i, j; 2039 2040 PetscFunctionBegin; 2041 PetscCall(VecGetArrayRead(bb, &b)); 2042 PetscCall(VecGetArray(xx, &x)); 2043 /* solve U^T*D*y = b by forward substitution */ 2044 PetscCall(PetscArraycpy(x, b, mbs)); 2045 for (i = 0; i < mbs; i++) { 2046 v = aa + ai[i]; 2047 vj = aj + ai[i]; 2048 xi = x[i]; 2049 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 2050 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi; 2051 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2052 } 2053 /* solve U*x = y by backward substitution */ 2054 for (i = mbs - 2; i >= 0; i--) { 2055 xi = x[i]; 2056 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2057 vj = aj + adiag[i] - 1; 2058 nz = ai[i + 1] - ai[i] - 1; 2059 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]]; 2060 x[i] = xi; 2061 } 2062 PetscCall(VecRestoreArrayRead(bb, &b)); 2063 PetscCall(VecRestoreArray(xx, &x)); 2064 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 2065 PetscFunctionReturn(PETSC_SUCCESS); 2066 } 2067 2068 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X) 2069 { 2070 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2071 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag; 2072 const MatScalar *aa = a->a, *v; 2073 const PetscScalar *b; 2074 PetscScalar *x, xi; 2075 PetscInt nz, i, j, neq, ldb, ldx; 2076 PetscBool isdense; 2077 2078 PetscFunctionBegin; 2079 if (!mbs) PetscFunctionReturn(PETSC_SUCCESS); 2080 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense)); 2081 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix"); 2082 if (X != B) { 2083 PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense)); 2084 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix"); 2085 } 2086 PetscCall(MatDenseGetArrayRead(B, &b)); 2087 PetscCall(MatDenseGetLDA(B, &ldb)); 2088 PetscCall(MatDenseGetArray(X, &x)); 2089 PetscCall(MatDenseGetLDA(X, &ldx)); 2090 for (neq = 0; neq < B->cmap->n; neq++) { 2091 /* solve U^T*D*y = b by forward substitution */ 2092 PetscCall(PetscArraycpy(x, b, mbs)); 2093 for (i = 0; i < mbs; i++) { 2094 v = aa + ai[i]; 2095 vj = aj + ai[i]; 2096 xi = x[i]; 2097 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 2098 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi; 2099 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2100 } 2101 /* solve U*x = y by backward substitution */ 2102 for (i = mbs - 2; i >= 0; i--) { 2103 xi = x[i]; 2104 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2105 vj = aj + adiag[i] - 1; 2106 nz = ai[i + 1] - ai[i] - 1; 2107 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]]; 2108 x[i] = xi; 2109 } 2110 b += ldb; 2111 x += ldx; 2112 } 2113 PetscCall(MatDenseRestoreArrayRead(B, &b)); 2114 PetscCall(MatDenseRestoreArray(X, &x)); 2115 PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs))); 2116 PetscFunctionReturn(PETSC_SUCCESS); 2117 } 2118 2119 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 2120 { 2121 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2122 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2123 const MatScalar *aa = a->a, *v; 2124 PetscScalar *x, xk; 2125 const PetscScalar *b; 2126 PetscInt nz, k; 2127 2128 PetscFunctionBegin; 2129 PetscCall(VecGetArrayRead(bb, &b)); 2130 PetscCall(VecGetArray(xx, &x)); 2131 2132 /* solve U^T*D*y = b by forward substitution */ 2133 PetscCall(PetscArraycpy(x, b, mbs)); 2134 for (k = 0; k < mbs; k++) { 2135 v = aa + ai[k] + 1; 2136 vj = aj + ai[k] + 1; 2137 xk = x[k]; 2138 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2139 while (nz--) x[*vj++] += (*v++) * xk; 2140 x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2141 } 2142 2143 /* solve U*x = y by back substitution */ 2144 for (k = mbs - 2; k >= 0; k--) { 2145 v = aa + ai[k] + 1; 2146 vj = aj + ai[k] + 1; 2147 xk = x[k]; 2148 nz = ai[k + 1] - ai[k] - 1; 2149 while (nz--) xk += (*v++) * x[*vj++]; 2150 x[k] = xk; 2151 } 2152 2153 PetscCall(VecRestoreArrayRead(bb, &b)); 2154 PetscCall(VecRestoreArray(xx, &x)); 2155 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 2156 PetscFunctionReturn(PETSC_SUCCESS); 2157 } 2158 2159 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) 2160 { 2161 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2162 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj; 2163 const MatScalar *aa = a->a, *v; 2164 PetscReal diagk; 2165 PetscScalar *x; 2166 const PetscScalar *b; 2167 PetscInt nz, k; 2168 2169 PetscFunctionBegin; 2170 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2171 PetscCall(VecGetArrayRead(bb, &b)); 2172 PetscCall(VecGetArray(xx, &x)); 2173 PetscCall(PetscArraycpy(x, b, mbs)); 2174 for (k = 0; k < mbs; k++) { 2175 v = aa + ai[k]; 2176 vj = aj + ai[k]; 2177 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2178 while (nz--) x[*vj++] += (*v++) * x[k]; 2179 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2180 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]])); 2181 x[k] *= PetscSqrtReal(diagk); 2182 } 2183 PetscCall(VecRestoreArrayRead(bb, &b)); 2184 PetscCall(VecRestoreArray(xx, &x)); 2185 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2186 PetscFunctionReturn(PETSC_SUCCESS); 2187 } 2188 2189 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 2190 { 2191 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2192 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2193 const MatScalar *aa = a->a, *v; 2194 PetscReal diagk; 2195 PetscScalar *x; 2196 const PetscScalar *b; 2197 PetscInt nz, k; 2198 2199 PetscFunctionBegin; 2200 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2201 PetscCall(VecGetArrayRead(bb, &b)); 2202 PetscCall(VecGetArray(xx, &x)); 2203 PetscCall(PetscArraycpy(x, b, mbs)); 2204 for (k = 0; k < mbs; k++) { 2205 v = aa + ai[k] + 1; 2206 vj = aj + ai[k] + 1; 2207 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2208 while (nz--) x[*vj++] += (*v++) * x[k]; 2209 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2210 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]])); 2211 x[k] *= PetscSqrtReal(diagk); 2212 } 2213 PetscCall(VecRestoreArrayRead(bb, &b)); 2214 PetscCall(VecRestoreArray(xx, &x)); 2215 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2216 PetscFunctionReturn(PETSC_SUCCESS); 2217 } 2218 2219 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) 2220 { 2221 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2222 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj; 2223 const MatScalar *aa = a->a, *v; 2224 PetscReal diagk; 2225 PetscScalar *x; 2226 const PetscScalar *b; 2227 PetscInt nz, k; 2228 2229 PetscFunctionBegin; 2230 /* solve D^(1/2)*U*x = b by back substitution */ 2231 PetscCall(VecGetArrayRead(bb, &b)); 2232 PetscCall(VecGetArray(xx, &x)); 2233 2234 for (k = mbs - 1; k >= 0; k--) { 2235 v = aa + ai[k]; 2236 vj = aj + ai[k]; 2237 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2238 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 2239 x[k] = PetscSqrtReal(diagk) * b[k]; 2240 nz = ai[k + 1] - ai[k] - 1; 2241 while (nz--) x[k] += (*v++) * x[*vj++]; 2242 } 2243 PetscCall(VecRestoreArrayRead(bb, &b)); 2244 PetscCall(VecRestoreArray(xx, &x)); 2245 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2246 PetscFunctionReturn(PETSC_SUCCESS); 2247 } 2248 2249 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 2250 { 2251 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2252 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2253 const MatScalar *aa = a->a, *v; 2254 PetscReal diagk; 2255 PetscScalar *x; 2256 const PetscScalar *b; 2257 PetscInt nz, k; 2258 2259 PetscFunctionBegin; 2260 /* solve D^(1/2)*U*x = b by back substitution */ 2261 PetscCall(VecGetArrayRead(bb, &b)); 2262 PetscCall(VecGetArray(xx, &x)); 2263 2264 for (k = mbs - 1; k >= 0; k--) { 2265 v = aa + ai[k] + 1; 2266 vj = aj + ai[k] + 1; 2267 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2268 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 2269 x[k] = PetscSqrtReal(diagk) * b[k]; 2270 nz = ai[k + 1] - ai[k] - 1; 2271 while (nz--) x[k] += (*v++) * x[*vj++]; 2272 } 2273 PetscCall(VecRestoreArrayRead(bb, &b)); 2274 PetscCall(VecRestoreArray(xx, &x)); 2275 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2276 PetscFunctionReturn(PETSC_SUCCESS); 2277 } 2278 2279 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2280 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2281 { 2282 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b; 2283 const PetscInt *rip, mbs = a->mbs, *ai, *aj; 2284 PetscInt *jutmp, bs = A->rmap->bs, i; 2285 PetscInt m, reallocs = 0, *levtmp; 2286 PetscInt *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd; 2287 PetscInt incrlev, *lev, shift, prow, nz; 2288 PetscReal f = info->fill, levels = info->levels; 2289 PetscBool perm_identity; 2290 2291 PetscFunctionBegin; 2292 /* check whether perm is the identity mapping */ 2293 PetscCall(ISIdentity(perm, &perm_identity)); 2294 2295 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2296 a->permute = PETSC_FALSE; 2297 ai = a->i; 2298 aj = a->j; 2299 2300 /* initialization */ 2301 PetscCall(ISGetIndices(perm, &rip)); 2302 umax = (PetscInt)(f * ai[mbs] + 1); 2303 PetscCall(PetscMalloc1(umax, &lev)); 2304 umax += mbs + 1; 2305 shift = mbs + 1; 2306 PetscCall(PetscMalloc1(mbs + 1, &iu)); 2307 PetscCall(PetscMalloc1(umax, &ju)); 2308 iu[0] = mbs + 1; 2309 juidx = mbs + 1; 2310 /* prowl: linked list for pivot row */ 2311 PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp)); 2312 /* q: linked list for col index */ 2313 2314 for (i = 0; i < mbs; i++) { 2315 prowl[i] = mbs; 2316 q[i] = 0; 2317 } 2318 2319 /* for each row k */ 2320 for (k = 0; k < mbs; k++) { 2321 nzk = 0; 2322 q[k] = mbs; 2323 /* copy current row into linked list */ 2324 nz = ai[rip[k] + 1] - ai[rip[k]]; 2325 j = ai[rip[k]]; 2326 while (nz--) { 2327 vj = rip[aj[j++]]; 2328 if (vj > k) { 2329 qm = k; 2330 do { 2331 m = qm; 2332 qm = q[m]; 2333 } while (qm < vj); 2334 PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A"); 2335 nzk++; 2336 q[m] = vj; 2337 q[vj] = qm; 2338 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2339 } 2340 } 2341 2342 /* modify nonzero structure of k-th row by computing fill-in 2343 for each row prow to be merged in */ 2344 prow = k; 2345 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2346 2347 while (prow < k) { 2348 /* merge row prow into k-th row */ 2349 jmin = iu[prow] + 1; 2350 jmax = iu[prow + 1]; 2351 qm = k; 2352 for (j = jmin; j < jmax; j++) { 2353 incrlev = lev[j - shift] + 1; 2354 if (incrlev > levels) continue; 2355 vj = ju[j]; 2356 do { 2357 m = qm; 2358 qm = q[m]; 2359 } while (qm < vj); 2360 if (qm != vj) { /* a fill */ 2361 nzk++; 2362 q[m] = vj; 2363 q[vj] = qm; 2364 qm = vj; 2365 levtmp[vj] = incrlev; 2366 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2367 } 2368 prow = prowl[prow]; /* next pivot row */ 2369 } 2370 2371 /* add k to row list for first nonzero element in k-th row */ 2372 if (nzk > 1) { 2373 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2374 prowl[k] = prowl[i]; 2375 prowl[i] = k; 2376 } 2377 iu[k + 1] = iu[k] + nzk; 2378 2379 /* allocate more space to ju and lev if needed */ 2380 if (iu[k + 1] > umax) { 2381 /* estimate how much additional space we will need */ 2382 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2383 /* just double the memory each time */ 2384 maxadd = umax; 2385 if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2; 2386 umax += maxadd; 2387 2388 /* allocate a longer ju */ 2389 PetscCall(PetscMalloc1(umax, &jutmp)); 2390 PetscCall(PetscArraycpy(jutmp, ju, iu[k])); 2391 PetscCall(PetscFree(ju)); 2392 ju = jutmp; 2393 2394 PetscCall(PetscMalloc1(umax, &jutmp)); 2395 PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift)); 2396 PetscCall(PetscFree(lev)); 2397 lev = jutmp; 2398 reallocs += 2; /* count how many times we realloc */ 2399 } 2400 2401 /* save nonzero structure of k-th row in ju */ 2402 i = k; 2403 while (nzk--) { 2404 i = q[i]; 2405 ju[juidx] = i; 2406 lev[juidx - shift] = levtmp[i]; 2407 juidx++; 2408 } 2409 } 2410 2411 #if defined(PETSC_USE_INFO) 2412 if (ai[mbs] != 0) { 2413 PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]); 2414 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af)); 2415 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2416 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af)); 2417 PetscCall(PetscInfo(A, "for best performance.\n")); 2418 } else { 2419 PetscCall(PetscInfo(A, "Empty matrix\n")); 2420 } 2421 #endif 2422 2423 PetscCall(ISRestoreIndices(perm, &rip)); 2424 PetscCall(PetscFree3(prowl, q, levtmp)); 2425 PetscCall(PetscFree(lev)); 2426 2427 /* put together the new matrix */ 2428 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL)); 2429 2430 b = (Mat_SeqSBAIJ *)(B)->data; 2431 PetscCall(PetscFree2(b->imax, b->ilen)); 2432 2433 b->singlemalloc = PETSC_FALSE; 2434 b->free_a = PETSC_TRUE; 2435 b->free_ij = PETSC_TRUE; 2436 /* the next line frees the default space generated by the Create() */ 2437 PetscCall(PetscFree3(b->a, b->j, b->i)); 2438 PetscCall(PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a)); 2439 b->j = ju; 2440 b->i = iu; 2441 b->diag = NULL; 2442 b->ilen = NULL; 2443 b->imax = NULL; 2444 2445 PetscCall(ISDestroy(&b->row)); 2446 PetscCall(ISDestroy(&b->icol)); 2447 b->row = perm; 2448 b->icol = perm; 2449 PetscCall(PetscObjectReference((PetscObject)perm)); 2450 PetscCall(PetscObjectReference((PetscObject)perm)); 2451 PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work)); 2452 /* In b structure: Free imax, ilen, old a, old j. 2453 Allocate idnew, solve_work, new a, new j */ 2454 b->maxnz = b->nz = iu[mbs]; 2455 2456 (B)->info.factor_mallocs = reallocs; 2457 (B)->info.fill_ratio_given = f; 2458 if (ai[mbs] != 0) { 2459 (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]); 2460 } else { 2461 (B)->info.fill_ratio_needed = 0.0; 2462 } 2463 PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity)); 2464 PetscFunctionReturn(PETSC_SUCCESS); 2465 } 2466 2467 /* 2468 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2469 */ 2470 #include <petscbt.h> 2471 #include <../src/mat/utils/freespace.h> 2472 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 2473 { 2474 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b; 2475 PetscBool perm_identity, free_ij = PETSC_TRUE, missing; 2476 PetscInt bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j; 2477 const PetscInt *rip; 2478 PetscInt reallocs = 0, i, *ui, *udiag, *cols; 2479 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 2480 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr; 2481 PetscReal fill = info->fill, levels = info->levels; 2482 PetscFreeSpaceList free_space = NULL, current_space = NULL; 2483 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 2484 PetscBT lnkbt; 2485 2486 PetscFunctionBegin; 2487 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2488 PetscCall(MatMissingDiagonal(A, &missing, &d)); 2489 PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d); 2490 if (bs > 1) { 2491 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info)); 2492 PetscFunctionReturn(PETSC_SUCCESS); 2493 } 2494 2495 /* check whether perm is the identity mapping */ 2496 PetscCall(ISIdentity(perm, &perm_identity)); 2497 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2498 a->permute = PETSC_FALSE; 2499 2500 PetscCall(PetscMalloc1(am + 1, &ui)); 2501 PetscCall(PetscMalloc1(am + 1, &udiag)); 2502 ui[0] = 0; 2503 2504 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2505 if (!levels) { 2506 /* reuse the column pointers and row offsets for memory savings */ 2507 for (i = 0; i < am; i++) { 2508 ncols = ai[i + 1] - ai[i]; 2509 ui[i + 1] = ui[i] + ncols; 2510 udiag[i] = ui[i + 1] - 1; /* points to the last entry of U(i,:) */ 2511 } 2512 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2513 cols = uj; 2514 for (i = 0; i < am; i++) { 2515 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2516 ncols = ai[i + 1] - ai[i] - 1; 2517 for (j = 0; j < ncols; j++) *cols++ = aj[j]; 2518 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */ 2519 } 2520 } else { /* case: levels>0 */ 2521 PetscCall(ISGetIndices(perm, &rip)); 2522 2523 /* initialization */ 2524 /* jl: linked list for storing indices of the pivot rows 2525 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2526 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 2527 for (i = 0; i < am; i++) { 2528 jl[i] = am; 2529 il[i] = 0; 2530 } 2531 2532 /* create and initialize a linked list for storing column indices of the active row k */ 2533 nlnk = am + 1; 2534 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 2535 2536 /* initial FreeSpace size is fill*(ai[am]+1) */ 2537 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space)); 2538 2539 current_space = free_space; 2540 2541 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl)); 2542 2543 current_space_lvl = free_space_lvl; 2544 2545 for (k = 0; k < am; k++) { /* for each active row k */ 2546 /* initialize lnk by the column indices of row k */ 2547 nzk = 0; 2548 ncols = ai[k + 1] - ai[k]; 2549 PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k); 2550 cols = aj + ai[k]; 2551 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt)); 2552 nzk += nlnk; 2553 2554 /* update lnk by computing fill-in for each pivot row to be merged in */ 2555 prow = jl[k]; /* 1st pivot row */ 2556 2557 while (prow < k) { 2558 nextprow = jl[prow]; 2559 2560 /* merge prow into k-th row */ 2561 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2562 jmax = ui[prow + 1]; 2563 ncols = jmax - jmin; 2564 i = jmin - ui[prow]; 2565 2566 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2567 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2568 j = *(uj - 1); 2569 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j)); 2570 nzk += nlnk; 2571 2572 /* update il and jl for prow */ 2573 if (jmin < jmax) { 2574 il[prow] = jmin; 2575 2576 j = *cols; 2577 jl[prow] = jl[j]; 2578 jl[j] = prow; 2579 } 2580 prow = nextprow; 2581 } 2582 2583 /* if free space is not available, make more free space */ 2584 if (current_space->local_remaining < nzk) { 2585 i = am - k + 1; /* num of unfactored rows */ 2586 i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2587 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 2588 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 2589 reallocs++; 2590 } 2591 2592 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2593 PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k); 2594 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 2595 2596 /* add the k-th row into il and jl */ 2597 if (nzk > 1) { 2598 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2599 jl[k] = jl[i]; 2600 jl[i] = k; 2601 il[k] = ui[k] + 1; 2602 } 2603 uj_ptr[k] = current_space->array; 2604 uj_lvl_ptr[k] = current_space_lvl->array; 2605 2606 current_space->array += nzk; 2607 current_space->local_used += nzk; 2608 current_space->local_remaining -= nzk; 2609 current_space_lvl->array += nzk; 2610 current_space_lvl->local_used += nzk; 2611 current_space_lvl->local_remaining -= nzk; 2612 2613 ui[k + 1] = ui[k] + nzk; 2614 } 2615 2616 PetscCall(ISRestoreIndices(perm, &rip)); 2617 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 2618 2619 /* destroy list of free space and other temporary array(s) */ 2620 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2621 PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor */ 2622 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 2623 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2624 2625 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2626 2627 /* put together the new matrix in MATSEQSBAIJ format */ 2628 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 2629 2630 b = (Mat_SeqSBAIJ *)(fact)->data; 2631 PetscCall(PetscFree2(b->imax, b->ilen)); 2632 2633 b->singlemalloc = PETSC_FALSE; 2634 b->free_a = PETSC_TRUE; 2635 b->free_ij = free_ij; 2636 2637 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 2638 2639 b->j = uj; 2640 b->i = ui; 2641 b->diag = udiag; 2642 b->free_diag = PETSC_TRUE; 2643 b->ilen = NULL; 2644 b->imax = NULL; 2645 b->row = perm; 2646 b->col = perm; 2647 2648 PetscCall(PetscObjectReference((PetscObject)perm)); 2649 PetscCall(PetscObjectReference((PetscObject)perm)); 2650 2651 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2652 2653 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 2654 2655 b->maxnz = b->nz = ui[am]; 2656 2657 fact->info.factor_mallocs = reallocs; 2658 fact->info.fill_ratio_given = fill; 2659 if (ai[am] != 0) { 2660 fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am]; 2661 } else { 2662 fact->info.fill_ratio_needed = 0.0; 2663 } 2664 #if defined(PETSC_USE_INFO) 2665 if (ai[am] != 0) { 2666 PetscReal af = fact->info.fill_ratio_needed; 2667 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 2668 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2669 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 2670 } else { 2671 PetscCall(PetscInfo(A, "Empty matrix\n")); 2672 } 2673 #endif 2674 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2675 PetscFunctionReturn(PETSC_SUCCESS); 2676 } 2677 2678 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 2679 { 2680 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2681 Mat_SeqSBAIJ *b; 2682 PetscBool perm_identity, free_ij = PETSC_TRUE; 2683 PetscInt bs = A->rmap->bs, am = a->mbs; 2684 const PetscInt *cols, *rip, *ai = a->i, *aj = a->j; 2685 PetscInt reallocs = 0, i, *ui; 2686 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 2687 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 2688 PetscReal fill = info->fill, levels = info->levels, ratio_needed; 2689 PetscFreeSpaceList free_space = NULL, current_space = NULL; 2690 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 2691 PetscBT lnkbt; 2692 2693 PetscFunctionBegin; 2694 /* 2695 This code originally uses Modified Sparse Row (MSR) storage 2696 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2697 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2698 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2699 thus the original code in MSR format is still used for these cases. 2700 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2701 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2702 */ 2703 if (bs > 1) { 2704 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info)); 2705 PetscFunctionReturn(PETSC_SUCCESS); 2706 } 2707 2708 /* check whether perm is the identity mapping */ 2709 PetscCall(ISIdentity(perm, &perm_identity)); 2710 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2711 a->permute = PETSC_FALSE; 2712 2713 /* special case that simply copies fill pattern */ 2714 if (!levels) { 2715 /* reuse the column pointers and row offsets for memory savings */ 2716 ui = a->i; 2717 uj = a->j; 2718 free_ij = PETSC_FALSE; 2719 ratio_needed = 1.0; 2720 } else { /* case: levels>0 */ 2721 PetscCall(ISGetIndices(perm, &rip)); 2722 2723 /* initialization */ 2724 PetscCall(PetscMalloc1(am + 1, &ui)); 2725 ui[0] = 0; 2726 2727 /* jl: linked list for storing indices of the pivot rows 2728 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2729 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 2730 for (i = 0; i < am; i++) { 2731 jl[i] = am; 2732 il[i] = 0; 2733 } 2734 2735 /* create and initialize a linked list for storing column indices of the active row k */ 2736 nlnk = am + 1; 2737 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 2738 2739 /* initial FreeSpace size is fill*(ai[am]+1) */ 2740 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space)); 2741 2742 current_space = free_space; 2743 2744 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl)); 2745 2746 current_space_lvl = free_space_lvl; 2747 2748 for (k = 0; k < am; k++) { /* for each active row k */ 2749 /* initialize lnk by the column indices of row rip[k] */ 2750 nzk = 0; 2751 ncols = ai[rip[k] + 1] - ai[rip[k]]; 2752 cols = aj + ai[rip[k]]; 2753 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt)); 2754 nzk += nlnk; 2755 2756 /* update lnk by computing fill-in for each pivot row to be merged in */ 2757 prow = jl[k]; /* 1st pivot row */ 2758 2759 while (prow < k) { 2760 nextprow = jl[prow]; 2761 2762 /* merge prow into k-th row */ 2763 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2764 jmax = ui[prow + 1]; 2765 ncols = jmax - jmin; 2766 i = jmin - ui[prow]; 2767 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2768 j = *(uj_lvl_ptr[prow] + i - 1); 2769 cols_lvl = uj_lvl_ptr[prow] + i; 2770 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j)); 2771 nzk += nlnk; 2772 2773 /* update il and jl for prow */ 2774 if (jmin < jmax) { 2775 il[prow] = jmin; 2776 j = *cols; 2777 jl[prow] = jl[j]; 2778 jl[j] = prow; 2779 } 2780 prow = nextprow; 2781 } 2782 2783 /* if free space is not available, make more free space */ 2784 if (current_space->local_remaining < nzk) { 2785 i = am - k + 1; /* num of unfactored rows */ 2786 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2787 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 2788 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 2789 reallocs++; 2790 } 2791 2792 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2793 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 2794 2795 /* add the k-th row into il and jl */ 2796 if (nzk - 1 > 0) { 2797 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2798 jl[k] = jl[i]; 2799 jl[i] = k; 2800 il[k] = ui[k] + 1; 2801 } 2802 uj_ptr[k] = current_space->array; 2803 uj_lvl_ptr[k] = current_space_lvl->array; 2804 2805 current_space->array += nzk; 2806 current_space->local_used += nzk; 2807 current_space->local_remaining -= nzk; 2808 current_space_lvl->array += nzk; 2809 current_space_lvl->local_used += nzk; 2810 current_space_lvl->local_remaining -= nzk; 2811 2812 ui[k + 1] = ui[k] + nzk; 2813 } 2814 2815 PetscCall(ISRestoreIndices(perm, &rip)); 2816 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 2817 2818 /* destroy list of free space and other temporary array(s) */ 2819 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2820 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 2821 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 2822 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2823 if (ai[am] != 0) { 2824 ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]); 2825 } else { 2826 ratio_needed = 0.0; 2827 } 2828 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2829 2830 /* put together the new matrix in MATSEQSBAIJ format */ 2831 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 2832 2833 b = (Mat_SeqSBAIJ *)(fact)->data; 2834 2835 PetscCall(PetscFree2(b->imax, b->ilen)); 2836 2837 b->singlemalloc = PETSC_FALSE; 2838 b->free_a = PETSC_TRUE; 2839 b->free_ij = free_ij; 2840 2841 PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 2842 2843 b->j = uj; 2844 b->i = ui; 2845 b->diag = NULL; 2846 b->ilen = NULL; 2847 b->imax = NULL; 2848 b->row = perm; 2849 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2850 2851 PetscCall(PetscObjectReference((PetscObject)perm)); 2852 b->icol = perm; 2853 PetscCall(PetscObjectReference((PetscObject)perm)); 2854 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 2855 2856 b->maxnz = b->nz = ui[am]; 2857 2858 fact->info.factor_mallocs = reallocs; 2859 fact->info.fill_ratio_given = fill; 2860 fact->info.fill_ratio_needed = ratio_needed; 2861 #if defined(PETSC_USE_INFO) 2862 if (ai[am] != 0) { 2863 PetscReal af = fact->info.fill_ratio_needed; 2864 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 2865 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2866 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 2867 } else { 2868 PetscCall(PetscInfo(A, "Empty matrix\n")); 2869 } 2870 #endif 2871 if (perm_identity) { 2872 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2873 } else { 2874 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2875 } 2876 PetscFunctionReturn(PETSC_SUCCESS); 2877 } 2878