1 /* 2 Factorization code for SBAIJ format. 3 */ 4 5 #include <../src/mat/impls/sbaij/seq/sbaij.h> 6 #include <../src/mat/impls/baij/seq/baij.h> 7 #include <petsc/private/kernels/blockinvert.h> 8 9 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 10 { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 12 IS isrow = a->row; 13 PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 14 const PetscInt *r; 15 PetscInt nz, *vj, k, idx, k1; 16 PetscInt bs = A->rmap->bs, bs2 = a->bs2; 17 const MatScalar *aa = a->a, *v, *diag; 18 PetscScalar *x, *xk, *xj, *xk_tmp, *t; 19 const PetscScalar *b; 20 21 PetscFunctionBegin; 22 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 23 PetscCall(VecGetArrayRead(bb, &b)); 24 PetscCall(VecGetArray(xx, &x)); 25 t = a->solve_work; 26 PetscCall(ISGetIndices(isrow, &r)); 27 PetscCall(PetscMalloc1(bs, &xk_tmp)); 28 29 /* solve U^T * D * y = b by forward substitution */ 30 xk = t; 31 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 32 idx = bs * r[k]; 33 for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1]; 34 } 35 for (k = 0; k < mbs; k++) { 36 v = aa + bs2 * ai[k]; 37 xk = t + k * bs; /* Dk*xk = k-th block of x */ 38 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */ 39 nz = ai[k + 1] - ai[k]; 40 vj = aj + ai[k]; 41 xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */ 42 while (nz--) { 43 /* x(:) += U(k,:)^T*(Dk*xk) */ 44 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */ 45 vj++; 46 xj = t + (*vj) * bs; 47 v += bs2; 48 } 49 /* xk = inv(Dk)*(Dk*xk) */ 50 diag = aa + k * bs2; /* ptr to inv(Dk) */ 51 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */ 52 } 53 54 /* solve U*x = y by back substitution */ 55 for (k = mbs - 1; k >= 0; k--) { 56 v = aa + bs2 * ai[k]; 57 xk = t + k * bs; /* xk */ 58 nz = ai[k + 1] - ai[k]; 59 vj = aj + ai[k]; 60 xj = t + (*vj) * bs; 61 while (nz--) { 62 /* xk += U(k,:)*x(:) */ 63 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */ 64 vj++; 65 v += bs2; 66 xj = t + (*vj) * bs; 67 } 68 idx = bs * r[k]; 69 for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++; 70 } 71 72 PetscCall(PetscFree(xk_tmp)); 73 PetscCall(ISRestoreIndices(isrow, &r)); 74 PetscCall(VecRestoreArrayRead(bb, &b)); 75 PetscCall(VecRestoreArray(xx, &x)); 76 PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs)); 77 PetscFunctionReturn(PETSC_SUCCESS); 78 } 79 80 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 81 { 82 PetscFunctionBegin; 83 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet"); 84 } 85 86 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 87 { 88 PetscFunctionBegin; 89 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented"); 90 } 91 92 static PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) 93 { 94 PetscInt nz, k; 95 const PetscInt *vj, bs2 = bs * bs; 96 const MatScalar *v, *diag; 97 PetscScalar *xk, *xj, *xk_tmp; 98 99 PetscFunctionBegin; 100 PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 101 PetscCall(PetscMalloc1(bs, &xk_tmp)); 102 for (k = 0; k < mbs; k++) { 103 v = aa + bs2 * ai[k]; 104 xk = x + k * bs; /* Dk*xk = k-th block of x */ 105 PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */ 106 nz = ai[k + 1] - ai[k]; 107 vj = aj + ai[k]; 108 xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */ 109 while (nz--) { 110 /* x(:) += U(k,:)^T*(Dk*xk) */ 111 PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */ 112 vj++; 113 xj = x + (size_t)(*vj) * bs; 114 v += bs2; 115 } 116 /* xk = inv(Dk)*(Dk*xk) */ 117 diag = aa + k * bs2; /* ptr to inv(Dk) */ 118 PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */ 119 } 120 PetscCall(PetscFree(xk_tmp)); 121 PetscFunctionReturn(PETSC_SUCCESS); 122 } 123 124 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) 125 { 126 PetscInt nz, k; 127 const PetscInt *vj, bs2 = bs * bs; 128 const MatScalar *v; 129 PetscScalar *xk, *xj; 130 131 PetscFunctionBegin; 132 for (k = mbs - 1; k >= 0; k--) { 133 v = aa + bs2 * ai[k]; 134 xk = x + k * bs; /* xk */ 135 nz = ai[k + 1] - ai[k]; 136 vj = aj + ai[k]; 137 xj = x + (size_t)(*vj) * bs; 138 while (nz--) { 139 /* xk += U(k,:)*x(:) */ 140 PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */ 141 vj++; 142 v += bs2; 143 xj = x + (size_t)(*vj) * bs; 144 } 145 } 146 PetscFunctionReturn(PETSC_SUCCESS); 147 } 148 149 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 150 { 151 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 152 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 153 PetscInt bs = A->rmap->bs; 154 const MatScalar *aa = a->a; 155 PetscScalar *x; 156 const PetscScalar *b; 157 158 PetscFunctionBegin; 159 PetscCall(VecGetArrayRead(bb, &b)); 160 PetscCall(VecGetArray(xx, &x)); 161 162 /* solve U^T * D * y = b by forward substitution */ 163 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */ 164 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 165 166 /* solve U*x = y by back substitution */ 167 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 168 169 PetscCall(VecRestoreArrayRead(bb, &b)); 170 PetscCall(VecRestoreArray(xx, &x)); 171 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs)); 172 PetscFunctionReturn(PETSC_SUCCESS); 173 } 174 175 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 176 { 177 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 178 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 179 PetscInt bs = A->rmap->bs; 180 const MatScalar *aa = a->a; 181 const PetscScalar *b; 182 PetscScalar *x; 183 184 PetscFunctionBegin; 185 PetscCall(VecGetArrayRead(bb, &b)); 186 PetscCall(VecGetArray(xx, &x)); 187 PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */ 188 PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 189 PetscCall(VecRestoreArrayRead(bb, &b)); 190 PetscCall(VecRestoreArray(xx, &x)); 191 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs)); 192 PetscFunctionReturn(PETSC_SUCCESS); 193 } 194 195 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 196 { 197 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 198 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 199 PetscInt bs = A->rmap->bs; 200 const MatScalar *aa = a->a; 201 const PetscScalar *b; 202 PetscScalar *x; 203 204 PetscFunctionBegin; 205 PetscCall(VecGetArrayRead(bb, &b)); 206 PetscCall(VecGetArray(xx, &x)); 207 PetscCall(PetscArraycpy(x, b, bs * mbs)); 208 PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x)); 209 PetscCall(VecRestoreArrayRead(bb, &b)); 210 PetscCall(VecRestoreArray(xx, &x)); 211 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 212 PetscFunctionReturn(PETSC_SUCCESS); 213 } 214 215 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx) 216 { 217 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 218 IS isrow = a->row; 219 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj; 220 PetscInt nz, k, idx; 221 const MatScalar *aa = a->a, *v, *d; 222 PetscScalar *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp; 223 const PetscScalar *b; 224 225 PetscFunctionBegin; 226 PetscCall(VecGetArrayRead(bb, &b)); 227 PetscCall(VecGetArray(xx, &x)); 228 t = a->solve_work; 229 PetscCall(ISGetIndices(isrow, &r)); 230 231 /* solve U^T * D * y = b by forward substitution */ 232 tp = t; 233 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 234 idx = 7 * r[k]; 235 tp[0] = b[idx]; 236 tp[1] = b[idx + 1]; 237 tp[2] = b[idx + 2]; 238 tp[3] = b[idx + 3]; 239 tp[4] = b[idx + 4]; 240 tp[5] = b[idx + 5]; 241 tp[6] = b[idx + 6]; 242 tp += 7; 243 } 244 245 for (k = 0; k < mbs; k++) { 246 v = aa + 49 * ai[k]; 247 vj = aj + ai[k]; 248 tp = t + k * 7; 249 x0 = tp[0]; 250 x1 = tp[1]; 251 x2 = tp[2]; 252 x3 = tp[3]; 253 x4 = tp[4]; 254 x5 = tp[5]; 255 x6 = tp[6]; 256 nz = ai[k + 1] - ai[k]; 257 tp = t + (*vj) * 7; 258 while (nz--) { 259 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6; 260 tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6; 261 tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6; 262 tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6; 263 tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6; 264 tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6; 265 tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6; 266 vj++; 267 tp = t + (*vj) * 7; 268 v += 49; 269 } 270 271 /* xk = inv(Dk)*(Dk*xk) */ 272 d = aa + k * 49; /* ptr to inv(Dk) */ 273 tp = t + k * 7; 274 tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6; 275 tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6; 276 tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6; 277 tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6; 278 tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6; 279 tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6; 280 tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6; 281 } 282 283 /* solve U*x = y by back substitution */ 284 for (k = mbs - 1; k >= 0; k--) { 285 v = aa + 49 * ai[k]; 286 vj = aj + ai[k]; 287 tp = t + k * 7; 288 x0 = tp[0]; 289 x1 = tp[1]; 290 x2 = tp[2]; 291 x3 = tp[3]; 292 x4 = tp[4]; 293 x5 = tp[5]; 294 x6 = tp[6]; /* xk */ 295 nz = ai[k + 1] - ai[k]; 296 297 tp = t + (*vj) * 7; 298 while (nz--) { 299 /* xk += U(k,:)*x(:) */ 300 x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6]; 301 x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6]; 302 x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6]; 303 x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6]; 304 x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6]; 305 x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6]; 306 x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6]; 307 vj++; 308 tp = t + (*vj) * 7; 309 v += 49; 310 } 311 tp = t + k * 7; 312 tp[0] = x0; 313 tp[1] = x1; 314 tp[2] = x2; 315 tp[3] = x3; 316 tp[4] = x4; 317 tp[5] = x5; 318 tp[6] = x6; 319 idx = 7 * r[k]; 320 x[idx] = x0; 321 x[idx + 1] = x1; 322 x[idx + 2] = x2; 323 x[idx + 3] = x3; 324 x[idx + 4] = x4; 325 x[idx + 5] = x5; 326 x[idx + 6] = x6; 327 } 328 329 PetscCall(ISRestoreIndices(isrow, &r)); 330 PetscCall(VecRestoreArrayRead(bb, &b)); 331 PetscCall(VecRestoreArray(xx, &x)); 332 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 333 PetscFunctionReturn(PETSC_SUCCESS); 334 } 335 336 static PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 337 { 338 const MatScalar *v, *d; 339 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6; 340 PetscInt nz, k; 341 const PetscInt *vj; 342 343 PetscFunctionBegin; 344 for (k = 0; k < mbs; k++) { 345 v = aa + 49 * ai[k]; 346 xp = x + k * 7; 347 x0 = xp[0]; 348 x1 = xp[1]; 349 x2 = xp[2]; 350 x3 = xp[3]; 351 x4 = xp[4]; 352 x5 = xp[5]; 353 x6 = xp[6]; /* Dk*xk = k-th block of x */ 354 nz = ai[k + 1] - ai[k]; 355 vj = aj + ai[k]; 356 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 357 PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 358 while (nz--) { 359 xp = x + (*vj) * 7; 360 /* x(:) += U(k,:)^T*(Dk*xk) */ 361 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6; 362 xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6; 363 xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6; 364 xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6; 365 xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6; 366 xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6; 367 xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6; 368 vj++; 369 v += 49; 370 } 371 /* xk = inv(Dk)*(Dk*xk) */ 372 d = aa + k * 49; /* ptr to inv(Dk) */ 373 xp = x + k * 7; 374 xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6; 375 xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6; 376 xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6; 377 xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6; 378 xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6; 379 xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6; 380 xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6; 381 } 382 PetscFunctionReturn(PETSC_SUCCESS); 383 } 384 385 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 386 { 387 const MatScalar *v; 388 PetscScalar *xp, x0, x1, x2, x3, x4, x5, x6; 389 PetscInt nz, k; 390 const PetscInt *vj; 391 392 PetscFunctionBegin; 393 for (k = mbs - 1; k >= 0; k--) { 394 v = aa + 49 * ai[k]; 395 xp = x + k * 7; 396 x0 = xp[0]; 397 x1 = xp[1]; 398 x2 = xp[2]; 399 x3 = xp[3]; 400 x4 = xp[4]; 401 x5 = xp[5]; 402 x6 = xp[6]; /* xk */ 403 nz = ai[k + 1] - ai[k]; 404 vj = aj + ai[k]; 405 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 406 PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 407 while (nz--) { 408 xp = x + (*vj) * 7; 409 /* xk += U(k,:)*x(:) */ 410 x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6]; 411 x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6]; 412 x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6]; 413 x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6]; 414 x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6]; 415 x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6]; 416 x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6]; 417 vj++; 418 v += 49; 419 } 420 xp = x + k * 7; 421 xp[0] = x0; 422 xp[1] = x1; 423 xp[2] = x2; 424 xp[3] = x3; 425 xp[4] = x4; 426 xp[5] = x5; 427 xp[6] = x6; 428 } 429 PetscFunctionReturn(PETSC_SUCCESS); 430 } 431 432 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 433 { 434 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 435 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 436 const MatScalar *aa = a->a; 437 PetscScalar *x; 438 const PetscScalar *b; 439 440 PetscFunctionBegin; 441 PetscCall(VecGetArrayRead(bb, &b)); 442 PetscCall(VecGetArray(xx, &x)); 443 444 /* solve U^T * D * y = b by forward substitution */ 445 PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */ 446 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 447 448 /* solve U*x = y by back substitution */ 449 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 450 451 PetscCall(VecRestoreArrayRead(bb, &b)); 452 PetscCall(VecRestoreArray(xx, &x)); 453 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 454 PetscFunctionReturn(PETSC_SUCCESS); 455 } 456 457 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 458 { 459 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 460 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 461 const MatScalar *aa = a->a; 462 PetscScalar *x; 463 const PetscScalar *b; 464 465 PetscFunctionBegin; 466 PetscCall(VecGetArrayRead(bb, &b)); 467 PetscCall(VecGetArray(xx, &x)); 468 PetscCall(PetscArraycpy(x, b, 7 * mbs)); 469 PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 470 PetscCall(VecRestoreArrayRead(bb, &b)); 471 PetscCall(VecRestoreArray(xx, &x)); 472 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 473 PetscFunctionReturn(PETSC_SUCCESS); 474 } 475 476 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 477 { 478 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 479 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 480 const MatScalar *aa = a->a; 481 PetscScalar *x; 482 const PetscScalar *b; 483 484 PetscFunctionBegin; 485 PetscCall(VecGetArrayRead(bb, &b)); 486 PetscCall(VecGetArray(xx, &x)); 487 PetscCall(PetscArraycpy(x, b, 7 * mbs)); 488 PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x)); 489 PetscCall(VecRestoreArrayRead(bb, &b)); 490 PetscCall(VecRestoreArray(xx, &x)); 491 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 492 PetscFunctionReturn(PETSC_SUCCESS); 493 } 494 495 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx) 496 { 497 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 498 IS isrow = a->row; 499 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj; 500 PetscInt nz, k, idx; 501 const MatScalar *aa = a->a, *v, *d; 502 PetscScalar *x, x0, x1, x2, x3, x4, x5, *t, *tp; 503 const PetscScalar *b; 504 505 PetscFunctionBegin; 506 PetscCall(VecGetArrayRead(bb, &b)); 507 PetscCall(VecGetArray(xx, &x)); 508 t = a->solve_work; 509 PetscCall(ISGetIndices(isrow, &r)); 510 511 /* solve U^T * D * y = b by forward substitution */ 512 tp = t; 513 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 514 idx = 6 * r[k]; 515 tp[0] = b[idx]; 516 tp[1] = b[idx + 1]; 517 tp[2] = b[idx + 2]; 518 tp[3] = b[idx + 3]; 519 tp[4] = b[idx + 4]; 520 tp[5] = b[idx + 5]; 521 tp += 6; 522 } 523 524 for (k = 0; k < mbs; k++) { 525 v = aa + 36 * ai[k]; 526 vj = aj + ai[k]; 527 tp = t + k * 6; 528 x0 = tp[0]; 529 x1 = tp[1]; 530 x2 = tp[2]; 531 x3 = tp[3]; 532 x4 = tp[4]; 533 x5 = tp[5]; 534 nz = ai[k + 1] - ai[k]; 535 tp = t + (*vj) * 6; 536 while (nz--) { 537 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5; 538 tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5; 539 tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5; 540 tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5; 541 tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5; 542 tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5; 543 vj++; 544 tp = t + (*vj) * 6; 545 v += 36; 546 } 547 548 /* xk = inv(Dk)*(Dk*xk) */ 549 d = aa + k * 36; /* ptr to inv(Dk) */ 550 tp = t + k * 6; 551 tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5; 552 tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5; 553 tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5; 554 tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5; 555 tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5; 556 tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5; 557 } 558 559 /* solve U*x = y by back substitution */ 560 for (k = mbs - 1; k >= 0; k--) { 561 v = aa + 36 * ai[k]; 562 vj = aj + ai[k]; 563 tp = t + k * 6; 564 x0 = tp[0]; 565 x1 = tp[1]; 566 x2 = tp[2]; 567 x3 = tp[3]; 568 x4 = tp[4]; 569 x5 = tp[5]; /* xk */ 570 nz = ai[k + 1] - ai[k]; 571 572 tp = t + (*vj) * 6; 573 while (nz--) { 574 /* xk += U(k,:)*x(:) */ 575 x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5]; 576 x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5]; 577 x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5]; 578 x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5]; 579 x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5]; 580 x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5]; 581 vj++; 582 tp = t + (*vj) * 6; 583 v += 36; 584 } 585 tp = t + k * 6; 586 tp[0] = x0; 587 tp[1] = x1; 588 tp[2] = x2; 589 tp[3] = x3; 590 tp[4] = x4; 591 tp[5] = x5; 592 idx = 6 * r[k]; 593 x[idx] = x0; 594 x[idx + 1] = x1; 595 x[idx + 2] = x2; 596 x[idx + 3] = x3; 597 x[idx + 4] = x4; 598 x[idx + 5] = x5; 599 } 600 601 PetscCall(ISRestoreIndices(isrow, &r)); 602 PetscCall(VecRestoreArrayRead(bb, &b)); 603 PetscCall(VecRestoreArray(xx, &x)); 604 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 605 PetscFunctionReturn(PETSC_SUCCESS); 606 } 607 608 static PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 609 { 610 const MatScalar *v, *d; 611 PetscScalar *xp, x0, x1, x2, x3, x4, x5; 612 PetscInt nz, k; 613 const PetscInt *vj; 614 615 PetscFunctionBegin; 616 for (k = 0; k < mbs; k++) { 617 v = aa + 36 * ai[k]; 618 xp = x + k * 6; 619 x0 = xp[0]; 620 x1 = xp[1]; 621 x2 = xp[2]; 622 x3 = xp[3]; 623 x4 = xp[4]; 624 x5 = xp[5]; /* Dk*xk = k-th block of x */ 625 nz = ai[k + 1] - ai[k]; 626 vj = aj + ai[k]; 627 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 628 PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 629 while (nz--) { 630 xp = x + (*vj) * 6; 631 /* x(:) += U(k,:)^T*(Dk*xk) */ 632 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5; 633 xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5; 634 xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5; 635 xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5; 636 xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5; 637 xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5; 638 vj++; 639 v += 36; 640 } 641 /* xk = inv(Dk)*(Dk*xk) */ 642 d = aa + k * 36; /* ptr to inv(Dk) */ 643 xp = x + k * 6; 644 xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5; 645 xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5; 646 xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5; 647 xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5; 648 xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5; 649 xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5; 650 } 651 PetscFunctionReturn(PETSC_SUCCESS); 652 } 653 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 654 { 655 const MatScalar *v; 656 PetscScalar *xp, x0, x1, x2, x3, x4, x5; 657 PetscInt nz, k; 658 const PetscInt *vj; 659 660 PetscFunctionBegin; 661 for (k = mbs - 1; k >= 0; k--) { 662 v = aa + 36 * ai[k]; 663 xp = x + k * 6; 664 x0 = xp[0]; 665 x1 = xp[1]; 666 x2 = xp[2]; 667 x3 = xp[3]; 668 x4 = xp[4]; 669 x5 = xp[5]; /* xk */ 670 nz = ai[k + 1] - ai[k]; 671 vj = aj + ai[k]; 672 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 673 PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 674 while (nz--) { 675 xp = x + (*vj) * 6; 676 /* xk += U(k,:)*x(:) */ 677 x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5]; 678 x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5]; 679 x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5]; 680 x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5]; 681 x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5]; 682 x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5]; 683 vj++; 684 v += 36; 685 } 686 xp = x + k * 6; 687 xp[0] = x0; 688 xp[1] = x1; 689 xp[2] = x2; 690 xp[3] = x3; 691 xp[4] = x4; 692 xp[5] = x5; 693 } 694 PetscFunctionReturn(PETSC_SUCCESS); 695 } 696 697 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 698 { 699 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 700 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 701 const MatScalar *aa = a->a; 702 PetscScalar *x; 703 const PetscScalar *b; 704 705 PetscFunctionBegin; 706 PetscCall(VecGetArrayRead(bb, &b)); 707 PetscCall(VecGetArray(xx, &x)); 708 709 /* solve U^T * D * y = b by forward substitution */ 710 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 711 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 712 713 /* solve U*x = y by back substitution */ 714 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 715 716 PetscCall(VecRestoreArrayRead(bb, &b)); 717 PetscCall(VecRestoreArray(xx, &x)); 718 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 719 PetscFunctionReturn(PETSC_SUCCESS); 720 } 721 722 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 723 { 724 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 725 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 726 const MatScalar *aa = a->a; 727 PetscScalar *x; 728 const PetscScalar *b; 729 730 PetscFunctionBegin; 731 PetscCall(VecGetArrayRead(bb, &b)); 732 PetscCall(VecGetArray(xx, &x)); 733 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 734 PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 735 PetscCall(VecRestoreArrayRead(bb, &b)); 736 PetscCall(VecRestoreArray(xx, &x)); 737 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 738 PetscFunctionReturn(PETSC_SUCCESS); 739 } 740 741 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 742 { 743 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 744 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 745 const MatScalar *aa = a->a; 746 PetscScalar *x; 747 const PetscScalar *b; 748 749 PetscFunctionBegin; 750 PetscCall(VecGetArrayRead(bb, &b)); 751 PetscCall(VecGetArray(xx, &x)); 752 PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */ 753 PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x)); 754 PetscCall(VecRestoreArrayRead(bb, &b)); 755 PetscCall(VecRestoreArray(xx, &x)); 756 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 757 PetscFunctionReturn(PETSC_SUCCESS); 758 } 759 760 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx) 761 { 762 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 763 IS isrow = a->row; 764 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 765 const PetscInt *r, *vj; 766 PetscInt nz, k, idx; 767 const MatScalar *aa = a->a, *v, *diag; 768 PetscScalar *x, x0, x1, x2, x3, x4, *t, *tp; 769 const PetscScalar *b; 770 771 PetscFunctionBegin; 772 PetscCall(VecGetArrayRead(bb, &b)); 773 PetscCall(VecGetArray(xx, &x)); 774 t = a->solve_work; 775 PetscCall(ISGetIndices(isrow, &r)); 776 777 /* solve U^T * D * y = b by forward substitution */ 778 tp = t; 779 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 780 idx = 5 * r[k]; 781 tp[0] = b[idx]; 782 tp[1] = b[idx + 1]; 783 tp[2] = b[idx + 2]; 784 tp[3] = b[idx + 3]; 785 tp[4] = b[idx + 4]; 786 tp += 5; 787 } 788 789 for (k = 0; k < mbs; k++) { 790 v = aa + 25 * ai[k]; 791 vj = aj + ai[k]; 792 tp = t + k * 5; 793 x0 = tp[0]; 794 x1 = tp[1]; 795 x2 = tp[2]; 796 x3 = tp[3]; 797 x4 = tp[4]; 798 nz = ai[k + 1] - ai[k]; 799 800 tp = t + (*vj) * 5; 801 while (nz--) { 802 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4; 803 tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4; 804 tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4; 805 tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4; 806 tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4; 807 vj++; 808 tp = t + (*vj) * 5; 809 v += 25; 810 } 811 812 /* xk = inv(Dk)*(Dk*xk) */ 813 diag = aa + k * 25; /* ptr to inv(Dk) */ 814 tp = t + k * 5; 815 tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4; 816 tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4; 817 tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4; 818 tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4; 819 tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4; 820 } 821 822 /* solve U*x = y by back substitution */ 823 for (k = mbs - 1; k >= 0; k--) { 824 v = aa + 25 * ai[k]; 825 vj = aj + ai[k]; 826 tp = t + k * 5; 827 x0 = tp[0]; 828 x1 = tp[1]; 829 x2 = tp[2]; 830 x3 = tp[3]; 831 x4 = tp[4]; /* xk */ 832 nz = ai[k + 1] - ai[k]; 833 834 tp = t + (*vj) * 5; 835 while (nz--) { 836 /* xk += U(k,:)*x(:) */ 837 x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4]; 838 x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4]; 839 x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4]; 840 x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4]; 841 x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4]; 842 vj++; 843 tp = t + (*vj) * 5; 844 v += 25; 845 } 846 tp = t + k * 5; 847 tp[0] = x0; 848 tp[1] = x1; 849 tp[2] = x2; 850 tp[3] = x3; 851 tp[4] = x4; 852 idx = 5 * r[k]; 853 x[idx] = x0; 854 x[idx + 1] = x1; 855 x[idx + 2] = x2; 856 x[idx + 3] = x3; 857 x[idx + 4] = x4; 858 } 859 860 PetscCall(ISRestoreIndices(isrow, &r)); 861 PetscCall(VecRestoreArrayRead(bb, &b)); 862 PetscCall(VecRestoreArray(xx, &x)); 863 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 864 PetscFunctionReturn(PETSC_SUCCESS); 865 } 866 867 static PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 868 { 869 const MatScalar *v, *diag; 870 PetscScalar *xp, x0, x1, x2, x3, x4; 871 PetscInt nz, k; 872 const PetscInt *vj; 873 874 PetscFunctionBegin; 875 for (k = 0; k < mbs; k++) { 876 v = aa + 25 * ai[k]; 877 xp = x + k * 5; 878 x0 = xp[0]; 879 x1 = xp[1]; 880 x2 = xp[2]; 881 x3 = xp[3]; 882 x4 = xp[4]; /* Dk*xk = k-th block of x */ 883 nz = ai[k + 1] - ai[k]; 884 vj = aj + ai[k]; 885 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 886 PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 887 while (nz--) { 888 xp = x + (*vj) * 5; 889 /* x(:) += U(k,:)^T*(Dk*xk) */ 890 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4; 891 xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4; 892 xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4; 893 xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4; 894 xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4; 895 vj++; 896 v += 25; 897 } 898 /* xk = inv(Dk)*(Dk*xk) */ 899 diag = aa + k * 25; /* ptr to inv(Dk) */ 900 xp = x + k * 5; 901 xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4; 902 xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4; 903 xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4; 904 xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4; 905 xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4; 906 } 907 PetscFunctionReturn(PETSC_SUCCESS); 908 } 909 910 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 911 { 912 const MatScalar *v; 913 PetscScalar *xp, x0, x1, x2, x3, x4; 914 PetscInt nz, k; 915 const PetscInt *vj; 916 917 PetscFunctionBegin; 918 for (k = mbs - 1; k >= 0; k--) { 919 v = aa + 25 * ai[k]; 920 xp = x + k * 5; 921 x0 = xp[0]; 922 x1 = xp[1]; 923 x2 = xp[2]; 924 x3 = xp[3]; 925 x4 = xp[4]; /* xk */ 926 nz = ai[k + 1] - ai[k]; 927 vj = aj + ai[k]; 928 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 929 PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 930 while (nz--) { 931 xp = x + (*vj) * 5; 932 /* xk += U(k,:)*x(:) */ 933 x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4]; 934 x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4]; 935 x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4]; 936 x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4]; 937 x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4]; 938 vj++; 939 v += 25; 940 } 941 xp = x + k * 5; 942 xp[0] = x0; 943 xp[1] = x1; 944 xp[2] = x2; 945 xp[3] = x3; 946 xp[4] = x4; 947 } 948 PetscFunctionReturn(PETSC_SUCCESS); 949 } 950 951 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 952 { 953 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 954 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 955 const MatScalar *aa = a->a; 956 PetscScalar *x; 957 const PetscScalar *b; 958 959 PetscFunctionBegin; 960 PetscCall(VecGetArrayRead(bb, &b)); 961 PetscCall(VecGetArray(xx, &x)); 962 963 /* solve U^T * D * y = b by forward substitution */ 964 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */ 965 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 966 967 /* solve U*x = y by back substitution */ 968 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 969 970 PetscCall(VecRestoreArrayRead(bb, &b)); 971 PetscCall(VecRestoreArray(xx, &x)); 972 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 973 PetscFunctionReturn(PETSC_SUCCESS); 974 } 975 976 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 977 { 978 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 979 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 980 const MatScalar *aa = a->a; 981 PetscScalar *x; 982 const PetscScalar *b; 983 984 PetscFunctionBegin; 985 PetscCall(VecGetArrayRead(bb, &b)); 986 PetscCall(VecGetArray(xx, &x)); 987 PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */ 988 PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 989 PetscCall(VecRestoreArrayRead(bb, &b)); 990 PetscCall(VecRestoreArray(xx, &x)); 991 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 992 PetscFunctionReturn(PETSC_SUCCESS); 993 } 994 995 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 996 { 997 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 998 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 999 const MatScalar *aa = a->a; 1000 PetscScalar *x; 1001 const PetscScalar *b; 1002 1003 PetscFunctionBegin; 1004 PetscCall(VecGetArrayRead(bb, &b)); 1005 PetscCall(VecGetArray(xx, &x)); 1006 PetscCall(PetscArraycpy(x, b, 5 * mbs)); 1007 PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x)); 1008 PetscCall(VecRestoreArrayRead(bb, &b)); 1009 PetscCall(VecRestoreArray(xx, &x)); 1010 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1011 PetscFunctionReturn(PETSC_SUCCESS); 1012 } 1013 1014 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx) 1015 { 1016 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1017 IS isrow = a->row; 1018 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1019 const PetscInt *r, *vj; 1020 PetscInt nz, k, idx; 1021 const MatScalar *aa = a->a, *v, *diag; 1022 PetscScalar *x, x0, x1, x2, x3, *t, *tp; 1023 const PetscScalar *b; 1024 1025 PetscFunctionBegin; 1026 PetscCall(VecGetArrayRead(bb, &b)); 1027 PetscCall(VecGetArray(xx, &x)); 1028 t = a->solve_work; 1029 PetscCall(ISGetIndices(isrow, &r)); 1030 1031 /* solve U^T * D * y = b by forward substitution */ 1032 tp = t; 1033 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1034 idx = 4 * r[k]; 1035 tp[0] = b[idx]; 1036 tp[1] = b[idx + 1]; 1037 tp[2] = b[idx + 2]; 1038 tp[3] = b[idx + 3]; 1039 tp += 4; 1040 } 1041 1042 for (k = 0; k < mbs; k++) { 1043 v = aa + 16 * ai[k]; 1044 vj = aj + ai[k]; 1045 tp = t + k * 4; 1046 x0 = tp[0]; 1047 x1 = tp[1]; 1048 x2 = tp[2]; 1049 x3 = tp[3]; 1050 nz = ai[k + 1] - ai[k]; 1051 1052 tp = t + (*vj) * 4; 1053 while (nz--) { 1054 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3; 1055 tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3; 1056 tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3; 1057 tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3; 1058 vj++; 1059 tp = t + (*vj) * 4; 1060 v += 16; 1061 } 1062 1063 /* xk = inv(Dk)*(Dk*xk) */ 1064 diag = aa + k * 16; /* ptr to inv(Dk) */ 1065 tp = t + k * 4; 1066 tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3; 1067 tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3; 1068 tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3; 1069 tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3; 1070 } 1071 1072 /* solve U*x = y by back substitution */ 1073 for (k = mbs - 1; k >= 0; k--) { 1074 v = aa + 16 * ai[k]; 1075 vj = aj + ai[k]; 1076 tp = t + k * 4; 1077 x0 = tp[0]; 1078 x1 = tp[1]; 1079 x2 = tp[2]; 1080 x3 = tp[3]; /* xk */ 1081 nz = ai[k + 1] - ai[k]; 1082 1083 tp = t + (*vj) * 4; 1084 while (nz--) { 1085 /* xk += U(k,:)*x(:) */ 1086 x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3]; 1087 x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3]; 1088 x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3]; 1089 x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3]; 1090 vj++; 1091 tp = t + (*vj) * 4; 1092 v += 16; 1093 } 1094 tp = t + k * 4; 1095 tp[0] = x0; 1096 tp[1] = x1; 1097 tp[2] = x2; 1098 tp[3] = x3; 1099 idx = 4 * r[k]; 1100 x[idx] = x0; 1101 x[idx + 1] = x1; 1102 x[idx + 2] = x2; 1103 x[idx + 3] = x3; 1104 } 1105 1106 PetscCall(ISRestoreIndices(isrow, &r)); 1107 PetscCall(VecRestoreArrayRead(bb, &b)); 1108 PetscCall(VecRestoreArray(xx, &x)); 1109 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1110 PetscFunctionReturn(PETSC_SUCCESS); 1111 } 1112 1113 static PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1114 { 1115 const MatScalar *v, *diag; 1116 PetscScalar *xp, x0, x1, x2, x3; 1117 PetscInt nz, k; 1118 const PetscInt *vj; 1119 1120 PetscFunctionBegin; 1121 for (k = 0; k < mbs; k++) { 1122 v = aa + 16 * ai[k]; 1123 xp = x + k * 4; 1124 x0 = xp[0]; 1125 x1 = xp[1]; 1126 x2 = xp[2]; 1127 x3 = xp[3]; /* Dk*xk = k-th block of x */ 1128 nz = ai[k + 1] - ai[k]; 1129 vj = aj + ai[k]; 1130 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1131 PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1132 while (nz--) { 1133 xp = x + (*vj) * 4; 1134 /* x(:) += U(k,:)^T*(Dk*xk) */ 1135 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3; 1136 xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3; 1137 xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3; 1138 xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3; 1139 vj++; 1140 v += 16; 1141 } 1142 /* xk = inv(Dk)*(Dk*xk) */ 1143 diag = aa + k * 16; /* ptr to inv(Dk) */ 1144 xp = x + k * 4; 1145 xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3; 1146 xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3; 1147 xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3; 1148 xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3; 1149 } 1150 PetscFunctionReturn(PETSC_SUCCESS); 1151 } 1152 1153 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1154 { 1155 const MatScalar *v; 1156 PetscScalar *xp, x0, x1, x2, x3; 1157 PetscInt nz, k; 1158 const PetscInt *vj; 1159 1160 PetscFunctionBegin; 1161 for (k = mbs - 1; k >= 0; k--) { 1162 v = aa + 16 * ai[k]; 1163 xp = x + k * 4; 1164 x0 = xp[0]; 1165 x1 = xp[1]; 1166 x2 = xp[2]; 1167 x3 = xp[3]; /* xk */ 1168 nz = ai[k + 1] - ai[k]; 1169 vj = aj + ai[k]; 1170 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1171 PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1172 while (nz--) { 1173 xp = x + (*vj) * 4; 1174 /* xk += U(k,:)*x(:) */ 1175 x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3]; 1176 x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3]; 1177 x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3]; 1178 x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3]; 1179 vj++; 1180 v += 16; 1181 } 1182 xp = x + k * 4; 1183 xp[0] = x0; 1184 xp[1] = x1; 1185 xp[2] = x2; 1186 xp[3] = x3; 1187 } 1188 PetscFunctionReturn(PETSC_SUCCESS); 1189 } 1190 1191 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1192 { 1193 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1194 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1195 const MatScalar *aa = a->a; 1196 PetscScalar *x; 1197 const PetscScalar *b; 1198 1199 PetscFunctionBegin; 1200 PetscCall(VecGetArrayRead(bb, &b)); 1201 PetscCall(VecGetArray(xx, &x)); 1202 1203 /* solve U^T * D * y = b by forward substitution */ 1204 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */ 1205 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1206 1207 /* solve U*x = y by back substitution */ 1208 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1209 PetscCall(VecRestoreArrayRead(bb, &b)); 1210 PetscCall(VecRestoreArray(xx, &x)); 1211 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1212 PetscFunctionReturn(PETSC_SUCCESS); 1213 } 1214 1215 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1216 { 1217 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1218 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1219 const MatScalar *aa = a->a; 1220 PetscScalar *x; 1221 const PetscScalar *b; 1222 1223 PetscFunctionBegin; 1224 PetscCall(VecGetArrayRead(bb, &b)); 1225 PetscCall(VecGetArray(xx, &x)); 1226 PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */ 1227 PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1228 PetscCall(VecRestoreArrayRead(bb, &b)); 1229 PetscCall(VecRestoreArray(xx, &x)); 1230 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1231 PetscFunctionReturn(PETSC_SUCCESS); 1232 } 1233 1234 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1235 { 1236 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1237 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1238 const MatScalar *aa = a->a; 1239 PetscScalar *x; 1240 const PetscScalar *b; 1241 1242 PetscFunctionBegin; 1243 PetscCall(VecGetArrayRead(bb, &b)); 1244 PetscCall(VecGetArray(xx, &x)); 1245 PetscCall(PetscArraycpy(x, b, 4 * mbs)); 1246 PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x)); 1247 PetscCall(VecRestoreArrayRead(bb, &b)); 1248 PetscCall(VecRestoreArray(xx, &x)); 1249 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1250 PetscFunctionReturn(PETSC_SUCCESS); 1251 } 1252 1253 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx) 1254 { 1255 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1256 IS isrow = a->row; 1257 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1258 const PetscInt *r; 1259 PetscInt nz, k, idx; 1260 const PetscInt *vj; 1261 const MatScalar *aa = a->a, *v, *diag; 1262 PetscScalar *x, x0, x1, x2, *t, *tp; 1263 const PetscScalar *b; 1264 1265 PetscFunctionBegin; 1266 PetscCall(VecGetArrayRead(bb, &b)); 1267 PetscCall(VecGetArray(xx, &x)); 1268 t = a->solve_work; 1269 PetscCall(ISGetIndices(isrow, &r)); 1270 1271 /* solve U^T * D * y = b by forward substitution */ 1272 tp = t; 1273 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1274 idx = 3 * r[k]; 1275 tp[0] = b[idx]; 1276 tp[1] = b[idx + 1]; 1277 tp[2] = b[idx + 2]; 1278 tp += 3; 1279 } 1280 1281 for (k = 0; k < mbs; k++) { 1282 v = aa + 9 * ai[k]; 1283 vj = aj + ai[k]; 1284 tp = t + k * 3; 1285 x0 = tp[0]; 1286 x1 = tp[1]; 1287 x2 = tp[2]; 1288 nz = ai[k + 1] - ai[k]; 1289 1290 tp = t + (*vj) * 3; 1291 while (nz--) { 1292 tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2; 1293 tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2; 1294 tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2; 1295 vj++; 1296 tp = t + (*vj) * 3; 1297 v += 9; 1298 } 1299 1300 /* xk = inv(Dk)*(Dk*xk) */ 1301 diag = aa + k * 9; /* ptr to inv(Dk) */ 1302 tp = t + k * 3; 1303 tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2; 1304 tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2; 1305 tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2; 1306 } 1307 1308 /* solve U*x = y by back substitution */ 1309 for (k = mbs - 1; k >= 0; k--) { 1310 v = aa + 9 * ai[k]; 1311 vj = aj + ai[k]; 1312 tp = t + k * 3; 1313 x0 = tp[0]; 1314 x1 = tp[1]; 1315 x2 = tp[2]; /* xk */ 1316 nz = ai[k + 1] - ai[k]; 1317 1318 tp = t + (*vj) * 3; 1319 while (nz--) { 1320 /* xk += U(k,:)*x(:) */ 1321 x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2]; 1322 x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2]; 1323 x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2]; 1324 vj++; 1325 tp = t + (*vj) * 3; 1326 v += 9; 1327 } 1328 tp = t + k * 3; 1329 tp[0] = x0; 1330 tp[1] = x1; 1331 tp[2] = x2; 1332 idx = 3 * r[k]; 1333 x[idx] = x0; 1334 x[idx + 1] = x1; 1335 x[idx + 2] = x2; 1336 } 1337 1338 PetscCall(ISRestoreIndices(isrow, &r)); 1339 PetscCall(VecRestoreArrayRead(bb, &b)); 1340 PetscCall(VecRestoreArray(xx, &x)); 1341 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1342 PetscFunctionReturn(PETSC_SUCCESS); 1343 } 1344 1345 static PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1346 { 1347 const MatScalar *v, *diag; 1348 PetscScalar *xp, x0, x1, x2; 1349 PetscInt nz, k; 1350 const PetscInt *vj; 1351 1352 PetscFunctionBegin; 1353 for (k = 0; k < mbs; k++) { 1354 v = aa + 9 * ai[k]; 1355 xp = x + k * 3; 1356 x0 = xp[0]; 1357 x1 = xp[1]; 1358 x2 = xp[2]; /* Dk*xk = k-th block of x */ 1359 nz = ai[k + 1] - ai[k]; 1360 vj = aj + ai[k]; 1361 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1362 PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1363 while (nz--) { 1364 xp = x + (*vj) * 3; 1365 /* x(:) += U(k,:)^T*(Dk*xk) */ 1366 xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2; 1367 xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2; 1368 xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2; 1369 vj++; 1370 v += 9; 1371 } 1372 /* xk = inv(Dk)*(Dk*xk) */ 1373 diag = aa + k * 9; /* ptr to inv(Dk) */ 1374 xp = x + k * 3; 1375 xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2; 1376 xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2; 1377 xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2; 1378 } 1379 PetscFunctionReturn(PETSC_SUCCESS); 1380 } 1381 1382 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1383 { 1384 const MatScalar *v; 1385 PetscScalar *xp, x0, x1, x2; 1386 PetscInt nz, k; 1387 const PetscInt *vj; 1388 1389 PetscFunctionBegin; 1390 for (k = mbs - 1; k >= 0; k--) { 1391 v = aa + 9 * ai[k]; 1392 xp = x + k * 3; 1393 x0 = xp[0]; 1394 x1 = xp[1]; 1395 x2 = xp[2]; /* xk */ 1396 nz = ai[k + 1] - ai[k]; 1397 vj = aj + ai[k]; 1398 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1399 PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1400 while (nz--) { 1401 xp = x + (*vj) * 3; 1402 /* xk += U(k,:)*x(:) */ 1403 x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2]; 1404 x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2]; 1405 x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2]; 1406 vj++; 1407 v += 9; 1408 } 1409 xp = x + k * 3; 1410 xp[0] = x0; 1411 xp[1] = x1; 1412 xp[2] = x2; 1413 } 1414 PetscFunctionReturn(PETSC_SUCCESS); 1415 } 1416 1417 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1418 { 1419 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1420 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1421 const MatScalar *aa = a->a; 1422 PetscScalar *x; 1423 const PetscScalar *b; 1424 1425 PetscFunctionBegin; 1426 PetscCall(VecGetArrayRead(bb, &b)); 1427 PetscCall(VecGetArray(xx, &x)); 1428 1429 /* solve U^T * D * y = b by forward substitution */ 1430 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1431 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1432 1433 /* solve U*x = y by back substitution */ 1434 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1435 1436 PetscCall(VecRestoreArrayRead(bb, &b)); 1437 PetscCall(VecRestoreArray(xx, &x)); 1438 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1439 PetscFunctionReturn(PETSC_SUCCESS); 1440 } 1441 1442 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1443 { 1444 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1445 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1446 const MatScalar *aa = a->a; 1447 PetscScalar *x; 1448 const PetscScalar *b; 1449 1450 PetscFunctionBegin; 1451 PetscCall(VecGetArrayRead(bb, &b)); 1452 PetscCall(VecGetArray(xx, &x)); 1453 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1454 PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1455 PetscCall(VecRestoreArrayRead(bb, &b)); 1456 PetscCall(VecRestoreArray(xx, &x)); 1457 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1458 PetscFunctionReturn(PETSC_SUCCESS); 1459 } 1460 1461 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1462 { 1463 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1464 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1465 const MatScalar *aa = a->a; 1466 PetscScalar *x; 1467 const PetscScalar *b; 1468 1469 PetscFunctionBegin; 1470 PetscCall(VecGetArrayRead(bb, &b)); 1471 PetscCall(VecGetArray(xx, &x)); 1472 PetscCall(PetscArraycpy(x, b, 3 * mbs)); 1473 PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x)); 1474 PetscCall(VecRestoreArrayRead(bb, &b)); 1475 PetscCall(VecRestoreArray(xx, &x)); 1476 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1477 PetscFunctionReturn(PETSC_SUCCESS); 1478 } 1479 1480 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx) 1481 { 1482 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1483 IS isrow = a->row; 1484 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1485 const PetscInt *r, *vj; 1486 PetscInt nz, k, k2, idx; 1487 const MatScalar *aa = a->a, *v, *diag; 1488 PetscScalar *x, x0, x1, *t; 1489 const PetscScalar *b; 1490 1491 PetscFunctionBegin; 1492 PetscCall(VecGetArrayRead(bb, &b)); 1493 PetscCall(VecGetArray(xx, &x)); 1494 t = a->solve_work; 1495 PetscCall(ISGetIndices(isrow, &r)); 1496 1497 /* solve U^T * D * y = perm(b) by forward substitution */ 1498 for (k = 0; k < mbs; k++) { /* t <- perm(b) */ 1499 idx = 2 * r[k]; 1500 t[k * 2] = b[idx]; 1501 t[k * 2 + 1] = b[idx + 1]; 1502 } 1503 for (k = 0; k < mbs; k++) { 1504 v = aa + 4 * ai[k]; 1505 vj = aj + ai[k]; 1506 k2 = k * 2; 1507 x0 = t[k2]; 1508 x1 = t[k2 + 1]; 1509 nz = ai[k + 1] - ai[k]; 1510 while (nz--) { 1511 t[(*vj) * 2] += v[0] * x0 + v[1] * x1; 1512 t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1; 1513 vj++; 1514 v += 4; 1515 } 1516 diag = aa + k * 4; /* ptr to inv(Dk) */ 1517 t[k2] = diag[0] * x0 + diag[2] * x1; 1518 t[k2 + 1] = diag[1] * x0 + diag[3] * x1; 1519 } 1520 1521 /* solve U*x = y by back substitution */ 1522 for (k = mbs - 1; k >= 0; k--) { 1523 v = aa + 4 * ai[k]; 1524 vj = aj + ai[k]; 1525 k2 = k * 2; 1526 x0 = t[k2]; 1527 x1 = t[k2 + 1]; 1528 nz = ai[k + 1] - ai[k]; 1529 while (nz--) { 1530 x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1]; 1531 x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1]; 1532 vj++; 1533 v += 4; 1534 } 1535 t[k2] = x0; 1536 t[k2 + 1] = x1; 1537 idx = 2 * r[k]; 1538 x[idx] = x0; 1539 x[idx + 1] = x1; 1540 } 1541 1542 PetscCall(ISRestoreIndices(isrow, &r)); 1543 PetscCall(VecRestoreArrayRead(bb, &b)); 1544 PetscCall(VecRestoreArray(xx, &x)); 1545 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1546 PetscFunctionReturn(PETSC_SUCCESS); 1547 } 1548 1549 static PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1550 { 1551 const MatScalar *v, *diag; 1552 PetscScalar x0, x1; 1553 PetscInt nz, k, k2; 1554 const PetscInt *vj; 1555 1556 PetscFunctionBegin; 1557 for (k = 0; k < mbs; k++) { 1558 v = aa + 4 * ai[k]; 1559 vj = aj + ai[k]; 1560 k2 = k * 2; 1561 x0 = x[k2]; 1562 x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */ 1563 nz = ai[k + 1] - ai[k]; 1564 PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1565 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1566 while (nz--) { 1567 /* x(:) += U(k,:)^T*(Dk*xk) */ 1568 x[(*vj) * 2] += v[0] * x0 + v[1] * x1; 1569 x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1; 1570 vj++; 1571 v += 4; 1572 } 1573 /* xk = inv(Dk)*(Dk*xk) */ 1574 diag = aa + k * 4; /* ptr to inv(Dk) */ 1575 x[k2] = diag[0] * x0 + diag[2] * x1; 1576 x[k2 + 1] = diag[1] * x0 + diag[3] * x1; 1577 } 1578 PetscFunctionReturn(PETSC_SUCCESS); 1579 } 1580 1581 static PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) 1582 { 1583 const MatScalar *v; 1584 PetscScalar x0, x1; 1585 PetscInt nz, k, k2; 1586 const PetscInt *vj; 1587 1588 PetscFunctionBegin; 1589 for (k = mbs - 1; k >= 0; k--) { 1590 v = aa + 4 * ai[k]; 1591 vj = aj + ai[k]; 1592 k2 = k * 2; 1593 x0 = x[k2]; 1594 x1 = x[k2 + 1]; /* xk */ 1595 nz = ai[k + 1] - ai[k]; 1596 PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 1597 PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 1598 while (nz--) { 1599 /* xk += U(k,:)*x(:) */ 1600 x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1]; 1601 x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1]; 1602 vj++; 1603 v += 4; 1604 } 1605 x[k2] = x0; 1606 x[k2 + 1] = x1; 1607 } 1608 PetscFunctionReturn(PETSC_SUCCESS); 1609 } 1610 1611 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1612 { 1613 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1614 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1615 const MatScalar *aa = a->a; 1616 PetscScalar *x; 1617 const PetscScalar *b; 1618 1619 PetscFunctionBegin; 1620 PetscCall(VecGetArrayRead(bb, &b)); 1621 PetscCall(VecGetArray(xx, &x)); 1622 1623 /* solve U^T * D * y = b by forward substitution */ 1624 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1625 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1626 1627 /* solve U*x = y by back substitution */ 1628 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1629 1630 PetscCall(VecRestoreArrayRead(bb, &b)); 1631 PetscCall(VecRestoreArray(xx, &x)); 1632 PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs)); 1633 PetscFunctionReturn(PETSC_SUCCESS); 1634 } 1635 1636 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1637 { 1638 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1639 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1640 const MatScalar *aa = a->a; 1641 PetscScalar *x; 1642 const PetscScalar *b; 1643 1644 PetscFunctionBegin; 1645 PetscCall(VecGetArrayRead(bb, &b)); 1646 PetscCall(VecGetArray(xx, &x)); 1647 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1648 PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1649 PetscCall(VecRestoreArrayRead(bb, &b)); 1650 PetscCall(VecRestoreArray(xx, &x)); 1651 PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs)); 1652 PetscFunctionReturn(PETSC_SUCCESS); 1653 } 1654 1655 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 1656 { 1657 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1658 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j; 1659 const MatScalar *aa = a->a; 1660 PetscScalar *x; 1661 const PetscScalar *b; 1662 1663 PetscFunctionBegin; 1664 PetscCall(VecGetArrayRead(bb, &b)); 1665 PetscCall(VecGetArray(xx, &x)); 1666 PetscCall(PetscArraycpy(x, b, 2 * mbs)); 1667 PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x)); 1668 PetscCall(VecRestoreArrayRead(bb, &b)); 1669 PetscCall(VecRestoreArray(xx, &x)); 1670 PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs))); 1671 PetscFunctionReturn(PETSC_SUCCESS); 1672 } 1673 1674 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) 1675 { 1676 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1677 IS isrow = a->row; 1678 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1679 const MatScalar *aa = a->a, *v; 1680 const PetscScalar *b; 1681 PetscScalar *x, xk, *t; 1682 PetscInt nz, k, j; 1683 1684 PetscFunctionBegin; 1685 PetscCall(VecGetArrayRead(bb, &b)); 1686 PetscCall(VecGetArray(xx, &x)); 1687 t = a->solve_work; 1688 PetscCall(ISGetIndices(isrow, &rp)); 1689 1690 /* solve U^T*D*y = perm(b) by forward substitution */ 1691 for (k = 0; k < mbs; k++) t[k] = b[rp[k]]; 1692 for (k = 0; k < mbs; k++) { 1693 v = aa + ai[k]; 1694 vj = aj + ai[k]; 1695 xk = t[k]; 1696 nz = ai[k + 1] - ai[k] - 1; 1697 for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk; 1698 t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */ 1699 } 1700 1701 /* solve U*perm(x) = y by back substitution */ 1702 for (k = mbs - 1; k >= 0; k--) { 1703 v = aa + adiag[k] - 1; 1704 vj = aj + adiag[k] - 1; 1705 nz = ai[k + 1] - ai[k] - 1; 1706 for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]]; 1707 x[rp[k]] = t[k]; 1708 } 1709 1710 PetscCall(ISRestoreIndices(isrow, &rp)); 1711 PetscCall(VecRestoreArrayRead(bb, &b)); 1712 PetscCall(VecRestoreArray(xx, &x)); 1713 PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs)); 1714 PetscFunctionReturn(PETSC_SUCCESS); 1715 } 1716 1717 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1718 { 1719 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1720 IS isrow = a->row; 1721 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1722 const MatScalar *aa = a->a, *v; 1723 PetscScalar *x, xk, *t; 1724 const PetscScalar *b; 1725 PetscInt nz, k; 1726 1727 PetscFunctionBegin; 1728 PetscCall(VecGetArrayRead(bb, &b)); 1729 PetscCall(VecGetArray(xx, &x)); 1730 t = a->solve_work; 1731 PetscCall(ISGetIndices(isrow, &rp)); 1732 1733 /* solve U^T*D*y = perm(b) by forward substitution */ 1734 for (k = 0; k < mbs; k++) t[k] = b[rp[k]]; 1735 for (k = 0; k < mbs; k++) { 1736 v = aa + ai[k] + 1; 1737 vj = aj + ai[k] + 1; 1738 xk = t[k]; 1739 nz = ai[k + 1] - ai[k] - 1; 1740 while (nz--) t[*vj++] += (*v++) * xk; 1741 t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */ 1742 } 1743 1744 /* solve U*perm(x) = y by back substitution */ 1745 for (k = mbs - 1; k >= 0; k--) { 1746 v = aa + ai[k] + 1; 1747 vj = aj + ai[k] + 1; 1748 nz = ai[k + 1] - ai[k] - 1; 1749 while (nz--) t[k] += (*v++) * t[*vj++]; 1750 x[rp[k]] = t[k]; 1751 } 1752 1753 PetscCall(ISRestoreIndices(isrow, &rp)); 1754 PetscCall(VecRestoreArrayRead(bb, &b)); 1755 PetscCall(VecRestoreArray(xx, &x)); 1756 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 1757 PetscFunctionReturn(PETSC_SUCCESS); 1758 } 1759 1760 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) 1761 { 1762 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1763 IS isrow = a->row; 1764 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1765 const MatScalar *aa = a->a, *v; 1766 PetscReal diagk; 1767 PetscScalar *x, xk; 1768 const PetscScalar *b; 1769 PetscInt nz, k; 1770 1771 PetscFunctionBegin; 1772 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1773 PetscCall(VecGetArrayRead(bb, &b)); 1774 PetscCall(VecGetArray(xx, &x)); 1775 PetscCall(ISGetIndices(isrow, &rp)); 1776 1777 for (k = 0; k < mbs; k++) x[k] = b[rp[k]]; 1778 for (k = 0; k < mbs; k++) { 1779 v = aa + ai[k]; 1780 vj = aj + ai[k]; 1781 xk = x[k]; 1782 nz = ai[k + 1] - ai[k] - 1; 1783 while (nz--) x[*vj++] += (*v++) * xk; 1784 1785 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1786 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1787 x[k] = xk * PetscSqrtReal(diagk); 1788 } 1789 PetscCall(ISRestoreIndices(isrow, &rp)); 1790 PetscCall(VecRestoreArrayRead(bb, &b)); 1791 PetscCall(VecRestoreArray(xx, &x)); 1792 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1793 PetscFunctionReturn(PETSC_SUCCESS); 1794 } 1795 1796 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1797 { 1798 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1799 IS isrow = a->row; 1800 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1801 const MatScalar *aa = a->a, *v; 1802 PetscReal diagk; 1803 PetscScalar *x, xk; 1804 const PetscScalar *b; 1805 PetscInt nz, k; 1806 1807 PetscFunctionBegin; 1808 /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */ 1809 PetscCall(VecGetArrayRead(bb, &b)); 1810 PetscCall(VecGetArray(xx, &x)); 1811 PetscCall(ISGetIndices(isrow, &rp)); 1812 1813 for (k = 0; k < mbs; k++) x[k] = b[rp[k]]; 1814 for (k = 0; k < mbs; k++) { 1815 v = aa + ai[k] + 1; 1816 vj = aj + ai[k] + 1; 1817 xk = x[k]; 1818 nz = ai[k + 1] - ai[k] - 1; 1819 while (nz--) x[*vj++] += (*v++) * xk; 1820 1821 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 1822 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1823 x[k] = xk * PetscSqrtReal(diagk); 1824 } 1825 PetscCall(ISRestoreIndices(isrow, &rp)); 1826 PetscCall(VecRestoreArrayRead(bb, &b)); 1827 PetscCall(VecRestoreArray(xx, &x)); 1828 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1829 PetscFunctionReturn(PETSC_SUCCESS); 1830 } 1831 1832 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) 1833 { 1834 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1835 IS isrow = a->row; 1836 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag; 1837 const MatScalar *aa = a->a, *v; 1838 PetscReal diagk; 1839 PetscScalar *x, *t; 1840 const PetscScalar *b; 1841 PetscInt nz, k; 1842 1843 PetscFunctionBegin; 1844 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1845 PetscCall(VecGetArrayRead(bb, &b)); 1846 PetscCall(VecGetArray(xx, &x)); 1847 t = a->solve_work; 1848 PetscCall(ISGetIndices(isrow, &rp)); 1849 1850 for (k = mbs - 1; k >= 0; k--) { 1851 v = aa + ai[k]; 1852 vj = aj + ai[k]; 1853 diagk = PetscRealPart(aa[adiag[k]]); 1854 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1855 t[k] = b[k] * PetscSqrtReal(diagk); 1856 nz = ai[k + 1] - ai[k] - 1; 1857 while (nz--) t[k] += (*v++) * t[*vj++]; 1858 x[rp[k]] = t[k]; 1859 } 1860 PetscCall(ISRestoreIndices(isrow, &rp)); 1861 PetscCall(VecRestoreArrayRead(bb, &b)); 1862 PetscCall(VecRestoreArray(xx, &x)); 1863 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1864 PetscFunctionReturn(PETSC_SUCCESS); 1865 } 1866 1867 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1868 { 1869 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1870 IS isrow = a->row; 1871 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj; 1872 const MatScalar *aa = a->a, *v; 1873 PetscReal diagk; 1874 PetscScalar *x, *t; 1875 const PetscScalar *b; 1876 PetscInt nz, k; 1877 1878 PetscFunctionBegin; 1879 /* solve D^(1/2)*U*perm(x) = b by back substitution */ 1880 PetscCall(VecGetArrayRead(bb, &b)); 1881 PetscCall(VecGetArray(xx, &x)); 1882 t = a->solve_work; 1883 PetscCall(ISGetIndices(isrow, &rp)); 1884 1885 for (k = mbs - 1; k >= 0; k--) { 1886 v = aa + ai[k] + 1; 1887 vj = aj + ai[k] + 1; 1888 diagk = PetscRealPart(aa[ai[k]]); 1889 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 1890 t[k] = b[k] * PetscSqrtReal(diagk); 1891 nz = ai[k + 1] - ai[k] - 1; 1892 while (nz--) t[k] += (*v++) * t[*vj++]; 1893 x[rp[k]] = t[k]; 1894 } 1895 PetscCall(ISRestoreIndices(isrow, &rp)); 1896 PetscCall(VecRestoreArrayRead(bb, &b)); 1897 PetscCall(VecRestoreArray(xx, &x)); 1898 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 1899 PetscFunctionReturn(PETSC_SUCCESS); 1900 } 1901 1902 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx) 1903 { 1904 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1905 1906 PetscFunctionBegin; 1907 if (A->rmap->bs == 1) { 1908 PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v)); 1909 } else { 1910 IS isrow = a->row; 1911 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp; 1912 const MatScalar *aa = a->a, *v; 1913 PetscScalar *x, *t; 1914 const PetscScalar *b; 1915 PetscInt nz, k, n, i, j; 1916 1917 if (bb->n > a->solves_work_n) { 1918 PetscCall(PetscFree(a->solves_work)); 1919 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work)); 1920 a->solves_work_n = bb->n; 1921 } 1922 n = bb->n; 1923 PetscCall(VecGetArrayRead(bb->v, &b)); 1924 PetscCall(VecGetArray(xx->v, &x)); 1925 t = a->solves_work; 1926 1927 PetscCall(ISGetIndices(isrow, &rp)); 1928 1929 /* solve U^T*D*y = perm(b) by forward substitution */ 1930 for (k = 0; k < mbs; k++) { 1931 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */ 1932 } 1933 for (k = 0; k < mbs; k++) { 1934 v = aa + ai[k]; 1935 vj = aj + ai[k]; 1936 nz = ai[k + 1] - ai[k] - 1; 1937 for (j = 0; j < nz; j++) { 1938 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i]; 1939 v++; 1940 vj++; 1941 } 1942 for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */ 1943 } 1944 1945 /* solve U*perm(x) = y by back substitution */ 1946 for (k = mbs - 1; k >= 0; k--) { 1947 v = aa + ai[k] - 1; 1948 vj = aj + ai[k] - 1; 1949 nz = ai[k + 1] - ai[k] - 1; 1950 for (j = 0; j < nz; j++) { 1951 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i]; 1952 v++; 1953 vj++; 1954 } 1955 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i]; 1956 } 1957 1958 PetscCall(ISRestoreIndices(isrow, &rp)); 1959 PetscCall(VecRestoreArrayRead(bb->v, &b)); 1960 PetscCall(VecRestoreArray(xx->v, &x)); 1961 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs))); 1962 } 1963 PetscFunctionReturn(PETSC_SUCCESS); 1964 } 1965 1966 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx) 1967 { 1968 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 1969 1970 PetscFunctionBegin; 1971 if (A->rmap->bs == 1) { 1972 PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v)); 1973 } else { 1974 IS isrow = a->row; 1975 const PetscInt *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp; 1976 const MatScalar *aa = a->a, *v; 1977 PetscScalar *x, *t; 1978 const PetscScalar *b; 1979 PetscInt nz, k, n, i; 1980 1981 if (bb->n > a->solves_work_n) { 1982 PetscCall(PetscFree(a->solves_work)); 1983 PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work)); 1984 a->solves_work_n = bb->n; 1985 } 1986 n = bb->n; 1987 PetscCall(VecGetArrayRead(bb->v, &b)); 1988 PetscCall(VecGetArray(xx->v, &x)); 1989 t = a->solves_work; 1990 1991 PetscCall(ISGetIndices(isrow, &rp)); 1992 1993 /* solve U^T*D*y = perm(b) by forward substitution */ 1994 for (k = 0; k < mbs; k++) { 1995 for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */ 1996 } 1997 for (k = 0; k < mbs; k++) { 1998 v = aa + ai[k]; 1999 vj = aj + ai[k]; 2000 nz = ai[k + 1] - ai[k]; 2001 while (nz--) { 2002 for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i]; 2003 v++; 2004 vj++; 2005 } 2006 for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */ 2007 } 2008 2009 /* solve U*perm(x) = y by back substitution */ 2010 for (k = mbs - 1; k >= 0; k--) { 2011 v = aa + ai[k]; 2012 vj = aj + ai[k]; 2013 nz = ai[k + 1] - ai[k]; 2014 while (nz--) { 2015 for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i]; 2016 v++; 2017 vj++; 2018 } 2019 for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i]; 2020 } 2021 2022 PetscCall(ISRestoreIndices(isrow, &rp)); 2023 PetscCall(VecRestoreArrayRead(bb->v, &b)); 2024 PetscCall(VecRestoreArray(xx->v, &x)); 2025 PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs))); 2026 } 2027 PetscFunctionReturn(PETSC_SUCCESS); 2028 } 2029 2030 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) 2031 { 2032 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2033 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag; 2034 const MatScalar *aa = a->a, *v; 2035 const PetscScalar *b; 2036 PetscScalar *x, xi; 2037 PetscInt nz, i, j; 2038 2039 PetscFunctionBegin; 2040 PetscCall(VecGetArrayRead(bb, &b)); 2041 PetscCall(VecGetArray(xx, &x)); 2042 /* solve U^T*D*y = b by forward substitution */ 2043 PetscCall(PetscArraycpy(x, b, mbs)); 2044 for (i = 0; i < mbs; i++) { 2045 v = aa + ai[i]; 2046 vj = aj + ai[i]; 2047 xi = x[i]; 2048 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 2049 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi; 2050 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2051 } 2052 /* solve U*x = y by backward substitution */ 2053 for (i = mbs - 2; i >= 0; i--) { 2054 xi = x[i]; 2055 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2056 vj = aj + adiag[i] - 1; 2057 nz = ai[i + 1] - ai[i] - 1; 2058 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]]; 2059 x[i] = xi; 2060 } 2061 PetscCall(VecRestoreArrayRead(bb, &b)); 2062 PetscCall(VecRestoreArray(xx, &x)); 2063 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 2064 PetscFunctionReturn(PETSC_SUCCESS); 2065 } 2066 2067 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X) 2068 { 2069 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2070 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag; 2071 const MatScalar *aa = a->a, *v; 2072 const PetscScalar *b; 2073 PetscScalar *x, xi; 2074 PetscInt nz, i, j, neq, ldb, ldx; 2075 PetscBool isdense; 2076 2077 PetscFunctionBegin; 2078 if (!mbs) PetscFunctionReturn(PETSC_SUCCESS); 2079 PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense)); 2080 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix"); 2081 if (X != B) { 2082 PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense)); 2083 PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix"); 2084 } 2085 PetscCall(MatDenseGetArrayRead(B, &b)); 2086 PetscCall(MatDenseGetLDA(B, &ldb)); 2087 PetscCall(MatDenseGetArray(X, &x)); 2088 PetscCall(MatDenseGetLDA(X, &ldx)); 2089 for (neq = 0; neq < B->cmap->n; neq++) { 2090 /* solve U^T*D*y = b by forward substitution */ 2091 PetscCall(PetscArraycpy(x, b, mbs)); 2092 for (i = 0; i < mbs; i++) { 2093 v = aa + ai[i]; 2094 vj = aj + ai[i]; 2095 xi = x[i]; 2096 nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */ 2097 for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi; 2098 x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */ 2099 } 2100 /* solve U*x = y by backward substitution */ 2101 for (i = mbs - 2; i >= 0; i--) { 2102 xi = x[i]; 2103 v = aa + adiag[i] - 1; /* end of row i, excluding diag */ 2104 vj = aj + adiag[i] - 1; 2105 nz = ai[i + 1] - ai[i] - 1; 2106 for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]]; 2107 x[i] = xi; 2108 } 2109 b += ldb; 2110 x += ldx; 2111 } 2112 PetscCall(MatDenseRestoreArrayRead(B, &b)); 2113 PetscCall(MatDenseRestoreArray(X, &x)); 2114 PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs))); 2115 PetscFunctionReturn(PETSC_SUCCESS); 2116 } 2117 2118 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 2119 { 2120 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2121 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2122 const MatScalar *aa = a->a, *v; 2123 PetscScalar *x, xk; 2124 const PetscScalar *b; 2125 PetscInt nz, k; 2126 2127 PetscFunctionBegin; 2128 PetscCall(VecGetArrayRead(bb, &b)); 2129 PetscCall(VecGetArray(xx, &x)); 2130 2131 /* solve U^T*D*y = b by forward substitution */ 2132 PetscCall(PetscArraycpy(x, b, mbs)); 2133 for (k = 0; k < mbs; k++) { 2134 v = aa + ai[k] + 1; 2135 vj = aj + ai[k] + 1; 2136 xk = x[k]; 2137 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2138 while (nz--) x[*vj++] += (*v++) * xk; 2139 x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */ 2140 } 2141 2142 /* solve U*x = y by back substitution */ 2143 for (k = mbs - 2; k >= 0; k--) { 2144 v = aa + ai[k] + 1; 2145 vj = aj + ai[k] + 1; 2146 xk = x[k]; 2147 nz = ai[k + 1] - ai[k] - 1; 2148 while (nz--) xk += (*v++) * x[*vj++]; 2149 x[k] = xk; 2150 } 2151 2152 PetscCall(VecRestoreArrayRead(bb, &b)); 2153 PetscCall(VecRestoreArray(xx, &x)); 2154 PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs)); 2155 PetscFunctionReturn(PETSC_SUCCESS); 2156 } 2157 2158 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) 2159 { 2160 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2161 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj; 2162 const MatScalar *aa = a->a, *v; 2163 PetscReal diagk; 2164 PetscScalar *x; 2165 const PetscScalar *b; 2166 PetscInt nz, k; 2167 2168 PetscFunctionBegin; 2169 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2170 PetscCall(VecGetArrayRead(bb, &b)); 2171 PetscCall(VecGetArray(xx, &x)); 2172 PetscCall(PetscArraycpy(x, b, mbs)); 2173 for (k = 0; k < mbs; k++) { 2174 v = aa + ai[k]; 2175 vj = aj + ai[k]; 2176 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2177 while (nz--) x[*vj++] += (*v++) * x[k]; 2178 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */ 2179 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]])); 2180 x[k] *= PetscSqrtReal(diagk); 2181 } 2182 PetscCall(VecRestoreArrayRead(bb, &b)); 2183 PetscCall(VecRestoreArray(xx, &x)); 2184 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2185 PetscFunctionReturn(PETSC_SUCCESS); 2186 } 2187 2188 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 2189 { 2190 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2191 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2192 const MatScalar *aa = a->a, *v; 2193 PetscReal diagk; 2194 PetscScalar *x; 2195 const PetscScalar *b; 2196 PetscInt nz, k; 2197 2198 PetscFunctionBegin; 2199 /* solve U^T*D^(1/2)*x = b by forward substitution */ 2200 PetscCall(VecGetArrayRead(bb, &b)); 2201 PetscCall(VecGetArray(xx, &x)); 2202 PetscCall(PetscArraycpy(x, b, mbs)); 2203 for (k = 0; k < mbs; k++) { 2204 v = aa + ai[k] + 1; 2205 vj = aj + ai[k] + 1; 2206 nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */ 2207 while (nz--) x[*vj++] += (*v++) * x[k]; 2208 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2209 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]])); 2210 x[k] *= PetscSqrtReal(diagk); 2211 } 2212 PetscCall(VecRestoreArrayRead(bb, &b)); 2213 PetscCall(VecRestoreArray(xx, &x)); 2214 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2215 PetscFunctionReturn(PETSC_SUCCESS); 2216 } 2217 2218 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) 2219 { 2220 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2221 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj; 2222 const MatScalar *aa = a->a, *v; 2223 PetscReal diagk; 2224 PetscScalar *x; 2225 const PetscScalar *b; 2226 PetscInt nz, k; 2227 2228 PetscFunctionBegin; 2229 /* solve D^(1/2)*U*x = b by back substitution */ 2230 PetscCall(VecGetArrayRead(bb, &b)); 2231 PetscCall(VecGetArray(xx, &x)); 2232 2233 for (k = mbs - 1; k >= 0; k--) { 2234 v = aa + ai[k]; 2235 vj = aj + ai[k]; 2236 diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2237 PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 2238 x[k] = PetscSqrtReal(diagk) * b[k]; 2239 nz = ai[k + 1] - ai[k] - 1; 2240 while (nz--) x[k] += (*v++) * x[*vj++]; 2241 } 2242 PetscCall(VecRestoreArrayRead(bb, &b)); 2243 PetscCall(VecRestoreArray(xx, &x)); 2244 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2245 PetscFunctionReturn(PETSC_SUCCESS); 2246 } 2247 2248 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 2249 { 2250 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2251 const PetscInt mbs = a->mbs, *ai = a->i, *aj = a->j, *vj; 2252 const MatScalar *aa = a->a, *v; 2253 PetscReal diagk; 2254 PetscScalar *x; 2255 const PetscScalar *b; 2256 PetscInt nz, k; 2257 2258 PetscFunctionBegin; 2259 /* solve D^(1/2)*U*x = b by back substitution */ 2260 PetscCall(VecGetArrayRead(bb, &b)); 2261 PetscCall(VecGetArray(xx, &x)); 2262 2263 for (k = mbs - 1; k >= 0; k--) { 2264 v = aa + ai[k] + 1; 2265 vj = aj + ai[k] + 1; 2266 diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */ 2267 PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative"); 2268 x[k] = PetscSqrtReal(diagk) * b[k]; 2269 nz = ai[k + 1] - ai[k] - 1; 2270 while (nz--) x[k] += (*v++) * x[*vj++]; 2271 } 2272 PetscCall(VecRestoreArrayRead(bb, &b)); 2273 PetscCall(VecRestoreArray(xx, &x)); 2274 PetscCall(PetscLogFlops(2.0 * a->nz - mbs)); 2275 PetscFunctionReturn(PETSC_SUCCESS); 2276 } 2277 2278 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */ 2279 static PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info) 2280 { 2281 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b; 2282 const PetscInt *rip, mbs = a->mbs, *ai, *aj; 2283 PetscInt *jutmp, bs = A->rmap->bs, i; 2284 PetscInt m, reallocs = 0, *levtmp; 2285 PetscInt *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd; 2286 PetscInt incrlev, *lev, shift, prow, nz; 2287 PetscReal f = info->fill, levels = info->levels; 2288 PetscBool perm_identity; 2289 2290 PetscFunctionBegin; 2291 /* check whether perm is the identity mapping */ 2292 PetscCall(ISIdentity(perm, &perm_identity)); 2293 2294 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2295 a->permute = PETSC_FALSE; 2296 ai = a->i; 2297 aj = a->j; 2298 2299 /* initialization */ 2300 PetscCall(ISGetIndices(perm, &rip)); 2301 umax = (PetscInt)(f * ai[mbs] + 1); 2302 PetscCall(PetscMalloc1(umax, &lev)); 2303 umax += mbs + 1; 2304 shift = mbs + 1; 2305 PetscCall(PetscMalloc1(mbs + 1, &iu)); 2306 PetscCall(PetscMalloc1(umax, &ju)); 2307 iu[0] = mbs + 1; 2308 juidx = mbs + 1; 2309 /* prowl: linked list for pivot row */ 2310 PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp)); 2311 /* q: linked list for col index */ 2312 2313 for (i = 0; i < mbs; i++) { 2314 prowl[i] = mbs; 2315 q[i] = 0; 2316 } 2317 2318 /* for each row k */ 2319 for (k = 0; k < mbs; k++) { 2320 nzk = 0; 2321 q[k] = mbs; 2322 /* copy current row into linked list */ 2323 nz = ai[rip[k] + 1] - ai[rip[k]]; 2324 j = ai[rip[k]]; 2325 while (nz--) { 2326 vj = rip[aj[j++]]; 2327 if (vj > k) { 2328 qm = k; 2329 do { 2330 m = qm; 2331 qm = q[m]; 2332 } while (qm < vj); 2333 PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A"); 2334 nzk++; 2335 q[m] = vj; 2336 q[vj] = qm; 2337 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2338 } 2339 } 2340 2341 /* modify nonzero structure of k-th row by computing fill-in 2342 for each row prow to be merged in */ 2343 prow = k; 2344 prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */ 2345 2346 while (prow < k) { 2347 /* merge row prow into k-th row */ 2348 jmin = iu[prow] + 1; 2349 jmax = iu[prow + 1]; 2350 qm = k; 2351 for (j = jmin; j < jmax; j++) { 2352 incrlev = lev[j - shift] + 1; 2353 if (incrlev > levels) continue; 2354 vj = ju[j]; 2355 do { 2356 m = qm; 2357 qm = q[m]; 2358 } while (qm < vj); 2359 if (qm != vj) { /* a fill */ 2360 nzk++; 2361 q[m] = vj; 2362 q[vj] = qm; 2363 qm = vj; 2364 levtmp[vj] = incrlev; 2365 } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev; 2366 } 2367 prow = prowl[prow]; /* next pivot row */ 2368 } 2369 2370 /* add k to row list for first nonzero element in k-th row */ 2371 if (nzk > 1) { 2372 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2373 prowl[k] = prowl[i]; 2374 prowl[i] = k; 2375 } 2376 iu[k + 1] = iu[k] + nzk; 2377 2378 /* allocate more space to ju and lev if needed */ 2379 if (iu[k + 1] > umax) { 2380 /* estimate how much additional space we will need */ 2381 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2382 /* just double the memory each time */ 2383 maxadd = umax; 2384 if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2; 2385 umax += maxadd; 2386 2387 /* allocate a longer ju */ 2388 PetscCall(PetscMalloc1(umax, &jutmp)); 2389 PetscCall(PetscArraycpy(jutmp, ju, iu[k])); 2390 PetscCall(PetscFree(ju)); 2391 ju = jutmp; 2392 2393 PetscCall(PetscMalloc1(umax, &jutmp)); 2394 PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift)); 2395 PetscCall(PetscFree(lev)); 2396 lev = jutmp; 2397 reallocs += 2; /* count how many times we realloc */ 2398 } 2399 2400 /* save nonzero structure of k-th row in ju */ 2401 i = k; 2402 while (nzk--) { 2403 i = q[i]; 2404 ju[juidx] = i; 2405 lev[juidx - shift] = levtmp[i]; 2406 juidx++; 2407 } 2408 } 2409 2410 #if defined(PETSC_USE_INFO) 2411 if (ai[mbs] != 0) { 2412 PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]); 2413 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af)); 2414 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2415 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af)); 2416 PetscCall(PetscInfo(A, "for best performance.\n")); 2417 } else { 2418 PetscCall(PetscInfo(A, "Empty matrix\n")); 2419 } 2420 #endif 2421 2422 PetscCall(ISRestoreIndices(perm, &rip)); 2423 PetscCall(PetscFree3(prowl, q, levtmp)); 2424 PetscCall(PetscFree(lev)); 2425 2426 /* put together the new matrix */ 2427 PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL)); 2428 2429 b = (Mat_SeqSBAIJ *)B->data; 2430 PetscCall(PetscFree2(b->imax, b->ilen)); 2431 2432 /* the next line frees the default space generated by the Create() */ 2433 PetscCall(MatSeqXAIJFreeAIJ(B, &b->a, &b->j, &b->i)); 2434 PetscCall(PetscShmgetAllocateArray((iu[mbs] + 1) * a->bs2, sizeof(PetscScalar), (void **)&b->a)); 2435 b->free_a = PETSC_TRUE; 2436 b->free_ij = PETSC_TRUE; 2437 b->j = ju; 2438 b->i = iu; 2439 b->diag = NULL; 2440 b->ilen = NULL; 2441 b->imax = NULL; 2442 2443 PetscCall(ISDestroy(&b->row)); 2444 PetscCall(ISDestroy(&b->icol)); 2445 b->row = perm; 2446 b->icol = perm; 2447 PetscCall(PetscObjectReference((PetscObject)perm)); 2448 PetscCall(PetscObjectReference((PetscObject)perm)); 2449 PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work)); 2450 /* In b structure: Free imax, ilen, old a, old j. 2451 Allocate idnew, solve_work, new a, new j */ 2452 b->maxnz = b->nz = iu[mbs]; 2453 2454 B->info.factor_mallocs = reallocs; 2455 B->info.fill_ratio_given = f; 2456 if (ai[mbs] != 0) { 2457 B->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]); 2458 } else { 2459 B->info.fill_ratio_needed = 0.0; 2460 } 2461 PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity)); 2462 PetscFunctionReturn(PETSC_SUCCESS); 2463 } 2464 2465 /* 2466 See MatICCFactorSymbolic_SeqAIJ() for description of its data structure 2467 */ 2468 #include <petscbt.h> 2469 #include <../src/mat/utils/freespace.h> 2470 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 2471 { 2472 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data, *b; 2473 PetscBool perm_identity, free_ij = PETSC_TRUE, diagDense; 2474 PetscInt bs = A->rmap->bs, am = a->mbs, *ai = a->i, *aj = a->j; 2475 const PetscInt *rip; 2476 PetscInt reallocs = 0, i, *ui, *udiag, *cols; 2477 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 2478 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr; 2479 PetscReal fill = info->fill, levels = info->levels; 2480 PetscFreeSpaceList free_space = NULL, current_space = NULL; 2481 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 2482 PetscBT lnkbt; 2483 2484 PetscFunctionBegin; 2485 PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n); 2486 PetscCall(MatGetDiagonalMarkers_SeqSBAIJ(A, NULL, &diagDense)); 2487 PetscCheck(diagDense, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry"); 2488 if (bs > 1) { 2489 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info)); 2490 PetscFunctionReturn(PETSC_SUCCESS); 2491 } 2492 2493 /* check whether perm is the identity mapping */ 2494 PetscCall(ISIdentity(perm, &perm_identity)); 2495 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2496 a->permute = PETSC_FALSE; 2497 2498 PetscCall(PetscMalloc1(am + 1, &ui)); 2499 PetscCall(PetscMalloc1(am + 1, &udiag)); 2500 ui[0] = 0; 2501 2502 /* ICC(0) without matrix ordering: simply rearrange column indices */ 2503 if (!levels) { 2504 /* reuse the column pointers and row offsets for memory savings */ 2505 for (i = 0; i < am; i++) { 2506 ncols = ai[i + 1] - ai[i]; 2507 ui[i + 1] = ui[i] + ncols; 2508 udiag[i] = ui[i + 1] - 1; /* points to the last entry of U(i,:) */ 2509 } 2510 PetscCall(PetscMalloc1(ui[am], &uj)); 2511 cols = uj; 2512 for (i = 0; i < am; i++) { 2513 aj = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */ 2514 ncols = ai[i + 1] - ai[i] - 1; 2515 for (j = 0; j < ncols; j++) *cols++ = aj[j]; 2516 *cols++ = i; /* diagonal is located as the last entry of U(i,:) */ 2517 } 2518 } else { /* case: levels>0 */ 2519 PetscCall(ISGetIndices(perm, &rip)); 2520 2521 /* initialization */ 2522 /* jl: linked list for storing indices of the pivot rows 2523 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2524 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 2525 for (i = 0; i < am; i++) { 2526 jl[i] = am; 2527 il[i] = 0; 2528 } 2529 2530 /* create and initialize a linked list for storing column indices of the active row k */ 2531 nlnk = am + 1; 2532 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 2533 2534 /* initial FreeSpace size is fill*(ai[am]+1) */ 2535 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space)); 2536 2537 current_space = free_space; 2538 2539 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl)); 2540 2541 current_space_lvl = free_space_lvl; 2542 2543 for (k = 0; k < am; k++) { /* for each active row k */ 2544 /* initialize lnk by the column indices of row k */ 2545 nzk = 0; 2546 ncols = ai[k + 1] - ai[k]; 2547 PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k); 2548 cols = aj + ai[k]; 2549 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt)); 2550 nzk += nlnk; 2551 2552 /* update lnk by computing fill-in for each pivot row to be merged in */ 2553 prow = jl[k]; /* 1st pivot row */ 2554 2555 while (prow < k) { 2556 nextprow = jl[prow]; 2557 2558 /* merge prow into k-th row */ 2559 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2560 jmax = ui[prow + 1]; 2561 ncols = jmax - jmin; 2562 i = jmin - ui[prow]; 2563 2564 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2565 uj = uj_lvl_ptr[prow] + i; /* levels of cols */ 2566 j = *(uj - 1); 2567 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j)); 2568 nzk += nlnk; 2569 2570 /* update il and jl for prow */ 2571 if (jmin < jmax) { 2572 il[prow] = jmin; 2573 2574 j = *cols; 2575 jl[prow] = jl[j]; 2576 jl[j] = prow; 2577 } 2578 prow = nextprow; 2579 } 2580 2581 /* if free space is not available, make more free space */ 2582 if (current_space->local_remaining < nzk) { 2583 i = am - k + 1; /* num of unfactored rows */ 2584 i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2585 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 2586 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 2587 reallocs++; 2588 } 2589 2590 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2591 PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k); 2592 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 2593 2594 /* add the k-th row into il and jl */ 2595 if (nzk > 1) { 2596 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2597 jl[k] = jl[i]; 2598 jl[i] = k; 2599 il[k] = ui[k] + 1; 2600 } 2601 uj_ptr[k] = current_space->array; 2602 uj_lvl_ptr[k] = current_space_lvl->array; 2603 2604 current_space->array += nzk; 2605 current_space->local_used += nzk; 2606 current_space->local_remaining -= nzk; 2607 current_space_lvl->array += nzk; 2608 current_space_lvl->local_used += nzk; 2609 current_space_lvl->local_remaining -= nzk; 2610 2611 ui[k + 1] = ui[k] + nzk; 2612 } 2613 2614 PetscCall(ISRestoreIndices(perm, &rip)); 2615 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 2616 2617 /* destroy list of free space and other temporary array(s) */ 2618 PetscCall(PetscMalloc1(ui[am], &uj)); 2619 PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor */ 2620 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 2621 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2622 2623 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2624 2625 /* put together the new matrix in MATSEQSBAIJ format */ 2626 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 2627 2628 b = (Mat_SeqSBAIJ *)fact->data; 2629 PetscCall(PetscFree2(b->imax, b->ilen)); 2630 PetscCall(PetscShmgetAllocateArray(ui[am], sizeof(PetscScalar), (void **)&b->a)); 2631 b->free_a = PETSC_TRUE; 2632 b->free_ij = free_ij; 2633 b->j = uj; 2634 b->i = ui; 2635 b->diag = udiag; 2636 b->ilen = NULL; 2637 b->imax = NULL; 2638 b->row = perm; 2639 b->col = perm; 2640 2641 PetscCall(PetscObjectReference((PetscObject)perm)); 2642 PetscCall(PetscObjectReference((PetscObject)perm)); 2643 2644 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2645 2646 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 2647 2648 b->maxnz = b->nz = ui[am]; 2649 2650 fact->info.factor_mallocs = reallocs; 2651 fact->info.fill_ratio_given = fill; 2652 if (ai[am] != 0) { 2653 fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am]; 2654 } else { 2655 fact->info.fill_ratio_needed = 0.0; 2656 } 2657 #if defined(PETSC_USE_INFO) 2658 if (ai[am] != 0) { 2659 PetscReal af = fact->info.fill_ratio_needed; 2660 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 2661 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2662 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 2663 } else { 2664 PetscCall(PetscInfo(A, "Empty matrix\n")); 2665 } 2666 #endif 2667 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering; 2668 PetscFunctionReturn(PETSC_SUCCESS); 2669 } 2670 2671 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 2672 { 2673 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2674 Mat_SeqSBAIJ *b; 2675 PetscBool perm_identity, free_ij = PETSC_TRUE; 2676 PetscInt bs = A->rmap->bs, am = a->mbs; 2677 const PetscInt *cols, *rip, *ai = a->i, *aj = a->j; 2678 PetscInt reallocs = 0, i, *ui; 2679 PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 2680 PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 2681 PetscReal fill = info->fill, levels = info->levels, ratio_needed; 2682 PetscFreeSpaceList free_space = NULL, current_space = NULL; 2683 PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 2684 PetscBT lnkbt; 2685 2686 PetscFunctionBegin; 2687 /* 2688 This code originally uses Modified Sparse Row (MSR) storage 2689 (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice! 2690 Then it is rewritten so the factor B takes seqsbaij format. However the associated 2691 MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1, 2692 thus the original code in MSR format is still used for these cases. 2693 The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever 2694 MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor. 2695 */ 2696 if (bs > 1) { 2697 PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info)); 2698 PetscFunctionReturn(PETSC_SUCCESS); 2699 } 2700 2701 /* check whether perm is the identity mapping */ 2702 PetscCall(ISIdentity(perm, &perm_identity)); 2703 PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format"); 2704 a->permute = PETSC_FALSE; 2705 2706 /* special case that simply copies fill pattern */ 2707 if (!levels) { 2708 /* reuse the column pointers and row offsets for memory savings */ 2709 ui = a->i; 2710 uj = a->j; 2711 free_ij = PETSC_FALSE; 2712 ratio_needed = 1.0; 2713 } else { /* case: levels>0 */ 2714 PetscCall(ISGetIndices(perm, &rip)); 2715 2716 /* initialization */ 2717 PetscCall(PetscMalloc1(am + 1, &ui)); 2718 ui[0] = 0; 2719 2720 /* jl: linked list for storing indices of the pivot rows 2721 il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 2722 PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 2723 for (i = 0; i < am; i++) { 2724 jl[i] = am; 2725 il[i] = 0; 2726 } 2727 2728 /* create and initialize a linked list for storing column indices of the active row k */ 2729 nlnk = am + 1; 2730 PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 2731 2732 /* initial FreeSpace size is fill*(ai[am]+1) */ 2733 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space)); 2734 2735 current_space = free_space; 2736 2737 PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl)); 2738 2739 current_space_lvl = free_space_lvl; 2740 2741 for (k = 0; k < am; k++) { /* for each active row k */ 2742 /* initialize lnk by the column indices of row rip[k] */ 2743 nzk = 0; 2744 ncols = ai[rip[k] + 1] - ai[rip[k]]; 2745 cols = aj + ai[rip[k]]; 2746 PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt)); 2747 nzk += nlnk; 2748 2749 /* update lnk by computing fill-in for each pivot row to be merged in */ 2750 prow = jl[k]; /* 1st pivot row */ 2751 2752 while (prow < k) { 2753 nextprow = jl[prow]; 2754 2755 /* merge prow into k-th row */ 2756 jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 2757 jmax = ui[prow + 1]; 2758 ncols = jmax - jmin; 2759 i = jmin - ui[prow]; 2760 cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 2761 j = *(uj_lvl_ptr[prow] + i - 1); 2762 cols_lvl = uj_lvl_ptr[prow] + i; 2763 PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j)); 2764 nzk += nlnk; 2765 2766 /* update il and jl for prow */ 2767 if (jmin < jmax) { 2768 il[prow] = jmin; 2769 j = *cols; 2770 jl[prow] = jl[j]; 2771 jl[j] = prow; 2772 } 2773 prow = nextprow; 2774 } 2775 2776 /* if free space is not available, make more free space */ 2777 if (current_space->local_remaining < nzk) { 2778 i = am - k + 1; /* num of unfactored rows */ 2779 i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 2780 PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 2781 PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 2782 reallocs++; 2783 } 2784 2785 /* copy data into free_space and free_space_lvl, then initialize lnk */ 2786 PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 2787 2788 /* add the k-th row into il and jl */ 2789 if (nzk - 1 > 0) { 2790 i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 2791 jl[k] = jl[i]; 2792 jl[i] = k; 2793 il[k] = ui[k] + 1; 2794 } 2795 uj_ptr[k] = current_space->array; 2796 uj_lvl_ptr[k] = current_space_lvl->array; 2797 2798 current_space->array += nzk; 2799 current_space->local_used += nzk; 2800 current_space->local_remaining -= nzk; 2801 current_space_lvl->array += nzk; 2802 current_space_lvl->local_used += nzk; 2803 current_space_lvl->local_remaining -= nzk; 2804 2805 ui[k + 1] = ui[k] + nzk; 2806 } 2807 2808 PetscCall(ISRestoreIndices(perm, &rip)); 2809 PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 2810 2811 /* destroy list of free space and other temporary array(s) */ 2812 PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 2813 PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 2814 PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 2815 PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 2816 if (ai[am] != 0) { 2817 ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]); 2818 } else { 2819 ratio_needed = 0.0; 2820 } 2821 } /* end of case: levels>0 || (levels=0 && !perm_identity) */ 2822 2823 /* put together the new matrix in MATSEQSBAIJ format */ 2824 PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL)); 2825 2826 b = (Mat_SeqSBAIJ *)fact->data; 2827 PetscCall(PetscFree2(b->imax, b->ilen)); 2828 PetscCall(PetscShmgetAllocateArray(ui[am], sizeof(PetscScalar), (void **)&b->a)); 2829 b->free_a = PETSC_TRUE; 2830 b->free_ij = free_ij; 2831 b->j = uj; 2832 b->i = ui; 2833 b->diag = NULL; 2834 b->ilen = NULL; 2835 b->imax = NULL; 2836 b->row = perm; 2837 b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 2838 2839 PetscCall(PetscObjectReference((PetscObject)perm)); 2840 b->icol = perm; 2841 PetscCall(PetscObjectReference((PetscObject)perm)); 2842 PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 2843 2844 b->maxnz = b->nz = ui[am]; 2845 2846 fact->info.factor_mallocs = reallocs; 2847 fact->info.fill_ratio_given = fill; 2848 fact->info.fill_ratio_needed = ratio_needed; 2849 #if defined(PETSC_USE_INFO) 2850 if (ai[am] != 0) { 2851 PetscReal af = fact->info.fill_ratio_needed; 2852 PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 2853 PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 2854 PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 2855 } else { 2856 PetscCall(PetscInfo(A, "Empty matrix\n")); 2857 } 2858 #endif 2859 if (perm_identity) { 2860 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace; 2861 } else { 2862 fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace; 2863 } 2864 PetscFunctionReturn(PETSC_SUCCESS); 2865 } 2866