1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) { 9 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 10 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 11 12 PetscFunctionBegin; 13 n = A->cmap->n; 14 m = A->rmap->n; 15 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 16 ns_row = a->inode.size; 17 18 min_mn = (m < n) ? m : n; 19 if (!ns) { 20 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) 21 ; 22 for (; count + 1 < n; count++, i++) 23 ; 24 if (count < n) i++; 25 *size = i; 26 PetscFunctionReturn(0); 27 } 28 PetscCall(PetscMalloc1(n + 1, &ns_col)); 29 30 /* Use the same row structure wherever feasible. */ 31 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i]; 32 33 /* if m < n; pad up the remainder with inode_limit */ 34 for (; count + 1 < n; count++, i++) ns_col[i] = 1; 35 /* The last node is the odd ball. padd it up with the remaining rows; */ 36 if (count < n) { 37 ns_col[i] = n - count; 38 i++; 39 } else if (count > n) { 40 /* Adjust for the over estimation */ 41 ns_col[i - 1] += n - count; 42 } 43 *size = i; 44 *ns = ns_col; 45 PetscFunctionReturn(0); 46 } 47 48 /* 49 This builds symmetric version of nonzero structure, 50 */ 51 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) { 52 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 53 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 54 PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2; 55 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 56 57 PetscFunctionBegin; 58 nslim_row = a->inode.node_count; 59 m = A->rmap->n; 60 n = A->cmap->n; 61 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 62 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 63 64 /* Use the row_inode as column_inode */ 65 nslim_col = nslim_row; 66 ns_col = ns_row; 67 68 /* allocate space for reformated inode structure */ 69 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 70 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1]; 71 72 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 73 nsz = ns_col[i1]; 74 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 75 } 76 /* allocate space for row pointers */ 77 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 78 *iia = ia; 79 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 80 81 /* determine the number of columns in each row */ 82 ia[0] = oshift; 83 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 84 j = aj + ai[row] + ishift; 85 jmax = aj + ai[row + 1] + ishift; 86 if (j == jmax) continue; /* empty row */ 87 col = *j++ + ishift; 88 i2 = tvc[col]; 89 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 90 ia[i1 + 1]++; 91 ia[i2 + 1]++; 92 i2++; /* Start col of next node */ 93 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 94 i2 = tvc[col]; 95 } 96 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 97 } 98 99 /* shift ia[i] to point to next row */ 100 for (i1 = 1; i1 < nslim_row + 1; i1++) { 101 row = ia[i1 - 1]; 102 ia[i1] += row; 103 work[i1 - 1] = row - oshift; 104 } 105 106 /* allocate space for column pointers */ 107 nz = ia[nslim_row] + (!ishift); 108 PetscCall(PetscMalloc1(nz, &ja)); 109 *jja = ja; 110 111 /* loop over lower triangular part putting into ja */ 112 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 113 j = aj + ai[row] + ishift; 114 jmax = aj + ai[row + 1] + ishift; 115 if (j == jmax) continue; /* empty row */ 116 col = *j++ + ishift; 117 i2 = tvc[col]; 118 while (i2 < i1 && j < jmax) { 119 ja[work[i2]++] = i1 + oshift; 120 ja[work[i1]++] = i2 + oshift; 121 ++i2; 122 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 123 i2 = tvc[col]; 124 } 125 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 126 } 127 PetscCall(PetscFree(work)); 128 PetscCall(PetscFree2(tns, tvc)); 129 PetscFunctionReturn(0); 130 } 131 132 /* 133 This builds nonsymmetric version of nonzero structure, 134 */ 135 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) { 136 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 137 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 138 PetscInt *tns, *tvc, nsz, i1, i2; 139 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size; 140 141 PetscFunctionBegin; 142 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 143 nslim_row = a->inode.node_count; 144 n = A->cmap->n; 145 146 /* Create The column_inode for this matrix */ 147 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 148 149 /* allocate space for reformated column_inode structure */ 150 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 151 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 152 153 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 154 nsz = ns_col[i1]; 155 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 156 } 157 /* allocate space for row pointers */ 158 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 159 *iia = ia; 160 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 161 162 /* determine the number of columns in each row */ 163 ia[0] = oshift; 164 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 165 j = aj + ai[row] + ishift; 166 nz = ai[row + 1] - ai[row]; 167 if (!nz) continue; /* empty row */ 168 col = *j++ + ishift; 169 i2 = tvc[col]; 170 while (nz-- > 0) { /* off-diagonal elements */ 171 ia[i1 + 1]++; 172 i2++; /* Start col of next node */ 173 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 174 if (nz > 0) i2 = tvc[col]; 175 } 176 } 177 178 /* shift ia[i] to point to next row */ 179 for (i1 = 1; i1 < nslim_row + 1; i1++) { 180 row = ia[i1 - 1]; 181 ia[i1] += row; 182 work[i1 - 1] = row - oshift; 183 } 184 185 /* allocate space for column pointers */ 186 nz = ia[nslim_row] + (!ishift); 187 PetscCall(PetscMalloc1(nz, &ja)); 188 *jja = ja; 189 190 /* loop over matrix putting into ja */ 191 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 192 j = aj + ai[row] + ishift; 193 nz = ai[row + 1] - ai[row]; 194 if (!nz) continue; /* empty row */ 195 col = *j++ + ishift; 196 i2 = tvc[col]; 197 while (nz-- > 0) { 198 ja[work[i1]++] = i2 + oshift; 199 ++i2; 200 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 201 if (nz > 0) i2 = tvc[col]; 202 } 203 } 204 PetscCall(PetscFree(ns_col)); 205 PetscCall(PetscFree(work)); 206 PetscCall(PetscFree2(tns, tvc)); 207 PetscFunctionReturn(0); 208 } 209 210 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) { 211 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 212 213 PetscFunctionBegin; 214 if (n) *n = a->inode.node_count; 215 if (!ia) PetscFunctionReturn(0); 216 if (!blockcompressed) { 217 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 218 } else if (symmetric) { 219 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 220 } else { 221 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 222 } 223 PetscFunctionReturn(0); 224 } 225 226 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) { 227 PetscFunctionBegin; 228 if (!ia) PetscFunctionReturn(0); 229 230 if (!blockcompressed) { 231 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 232 } else { 233 PetscCall(PetscFree(*ia)); 234 PetscCall(PetscFree(*ja)); 235 } 236 PetscFunctionReturn(0); 237 } 238 239 /* ----------------------------------------------------------- */ 240 241 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) { 242 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 243 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 244 PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j; 245 246 PetscFunctionBegin; 247 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 248 nslim_row = a->inode.node_count; 249 n = A->cmap->n; 250 251 /* Create The column_inode for this matrix */ 252 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 253 254 /* allocate space for reformated column_inode structure */ 255 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 256 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 257 258 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 259 nsz = ns_col[i1]; 260 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 261 } 262 /* allocate space for column pointers */ 263 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 264 *iia = ia; 265 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 266 267 /* determine the number of columns in each row */ 268 ia[0] = oshift; 269 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 270 j = aj + ai[row] + ishift; 271 col = *j++ + ishift; 272 i2 = tvc[col]; 273 nz = ai[row + 1] - ai[row]; 274 while (nz-- > 0) { /* off-diagonal elements */ 275 /* ia[i1+1]++; */ 276 ia[i2 + 1]++; 277 i2++; 278 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 279 if (nz > 0) i2 = tvc[col]; 280 } 281 } 282 283 /* shift ia[i] to point to next col */ 284 for (i1 = 1; i1 < nslim_col + 1; i1++) { 285 col = ia[i1 - 1]; 286 ia[i1] += col; 287 work[i1 - 1] = col - oshift; 288 } 289 290 /* allocate space for column pointers */ 291 nz = ia[nslim_col] + (!ishift); 292 PetscCall(PetscMalloc1(nz, &ja)); 293 *jja = ja; 294 295 /* loop over matrix putting into ja */ 296 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 297 j = aj + ai[row] + ishift; 298 col = *j++ + ishift; 299 i2 = tvc[col]; 300 nz = ai[row + 1] - ai[row]; 301 while (nz-- > 0) { 302 /* ja[work[i1]++] = i2 + oshift; */ 303 ja[work[i2]++] = i1 + oshift; 304 i2++; 305 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 306 if (nz > 0) i2 = tvc[col]; 307 } 308 } 309 PetscCall(PetscFree(ns_col)); 310 PetscCall(PetscFree(work)); 311 PetscCall(PetscFree2(tns, tvc)); 312 PetscFunctionReturn(0); 313 } 314 315 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) { 316 PetscFunctionBegin; 317 PetscCall(MatCreateColInode_Private(A, n, NULL)); 318 if (!ia) PetscFunctionReturn(0); 319 320 if (!blockcompressed) { 321 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 322 } else if (symmetric) { 323 /* Since the indices are symmetric it doesn't matter */ 324 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 325 } else { 326 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 327 } 328 PetscFunctionReturn(0); 329 } 330 331 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) { 332 PetscFunctionBegin; 333 if (!ia) PetscFunctionReturn(0); 334 if (!blockcompressed) { 335 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 336 } else { 337 PetscCall(PetscFree(*ia)); 338 PetscCall(PetscFree(*ja)); 339 } 340 PetscFunctionReturn(0); 341 } 342 343 /* ----------------------------------------------------------- */ 344 345 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) { 346 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 347 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 348 PetscScalar *y; 349 const PetscScalar *x; 350 const MatScalar *v1, *v2, *v3, *v4, *v5; 351 PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 352 const PetscInt *idx, *ns, *ii; 353 354 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 355 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 356 #endif 357 358 PetscFunctionBegin; 359 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 360 node_max = a->inode.node_count; 361 ns = a->inode.size; /* Node Size array */ 362 PetscCall(VecGetArrayRead(xx, &x)); 363 PetscCall(VecGetArray(yy, &y)); 364 idx = a->j; 365 v1 = a->a; 366 ii = a->i; 367 368 for (i = 0, row = 0; i < node_max; ++i) { 369 nsz = ns[i]; 370 n = ii[1] - ii[0]; 371 nonzerorow += (n > 0) * nsz; 372 ii += nsz; 373 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 374 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 375 sz = n; /* No of non zeros in this row */ 376 /* Switch on the size of Node */ 377 switch (nsz) { /* Each loop in 'case' is unrolled */ 378 case 1: 379 sum1 = 0.; 380 381 for (n = 0; n < sz - 1; n += 2) { 382 i1 = idx[0]; /* The instructions are ordered to */ 383 i2 = idx[1]; /* make the compiler's job easy */ 384 idx += 2; 385 tmp0 = x[i1]; 386 tmp1 = x[i2]; 387 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 388 v1 += 2; 389 } 390 391 if (n == sz - 1) { /* Take care of the last nonzero */ 392 tmp0 = x[*idx++]; 393 sum1 += *v1++ * tmp0; 394 } 395 y[row++] = sum1; 396 break; 397 case 2: 398 sum1 = 0.; 399 sum2 = 0.; 400 v2 = v1 + n; 401 402 for (n = 0; n < sz - 1; n += 2) { 403 i1 = idx[0]; 404 i2 = idx[1]; 405 idx += 2; 406 tmp0 = x[i1]; 407 tmp1 = x[i2]; 408 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 409 v1 += 2; 410 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 411 v2 += 2; 412 } 413 if (n == sz - 1) { 414 tmp0 = x[*idx++]; 415 sum1 += *v1++ * tmp0; 416 sum2 += *v2++ * tmp0; 417 } 418 y[row++] = sum1; 419 y[row++] = sum2; 420 v1 = v2; /* Since the next block to be processed starts there*/ 421 idx += sz; 422 break; 423 case 3: 424 sum1 = 0.; 425 sum2 = 0.; 426 sum3 = 0.; 427 v2 = v1 + n; 428 v3 = v2 + n; 429 430 for (n = 0; n < sz - 1; n += 2) { 431 i1 = idx[0]; 432 i2 = idx[1]; 433 idx += 2; 434 tmp0 = x[i1]; 435 tmp1 = x[i2]; 436 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 437 v1 += 2; 438 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 439 v2 += 2; 440 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 441 v3 += 2; 442 } 443 if (n == sz - 1) { 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 sum2 += *v2++ * tmp0; 447 sum3 += *v3++ * tmp0; 448 } 449 y[row++] = sum1; 450 y[row++] = sum2; 451 y[row++] = sum3; 452 v1 = v3; /* Since the next block to be processed starts there*/ 453 idx += 2 * sz; 454 break; 455 case 4: 456 sum1 = 0.; 457 sum2 = 0.; 458 sum3 = 0.; 459 sum4 = 0.; 460 v2 = v1 + n; 461 v3 = v2 + n; 462 v4 = v3 + n; 463 464 for (n = 0; n < sz - 1; n += 2) { 465 i1 = idx[0]; 466 i2 = idx[1]; 467 idx += 2; 468 tmp0 = x[i1]; 469 tmp1 = x[i2]; 470 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 471 v1 += 2; 472 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 473 v2 += 2; 474 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 475 v3 += 2; 476 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 477 v4 += 2; 478 } 479 if (n == sz - 1) { 480 tmp0 = x[*idx++]; 481 sum1 += *v1++ * tmp0; 482 sum2 += *v2++ * tmp0; 483 sum3 += *v3++ * tmp0; 484 sum4 += *v4++ * tmp0; 485 } 486 y[row++] = sum1; 487 y[row++] = sum2; 488 y[row++] = sum3; 489 y[row++] = sum4; 490 v1 = v4; /* Since the next block to be processed starts there*/ 491 idx += 3 * sz; 492 break; 493 case 5: 494 sum1 = 0.; 495 sum2 = 0.; 496 sum3 = 0.; 497 sum4 = 0.; 498 sum5 = 0.; 499 v2 = v1 + n; 500 v3 = v2 + n; 501 v4 = v3 + n; 502 v5 = v4 + n; 503 504 for (n = 0; n < sz - 1; n += 2) { 505 i1 = idx[0]; 506 i2 = idx[1]; 507 idx += 2; 508 tmp0 = x[i1]; 509 tmp1 = x[i2]; 510 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 511 v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 513 v2 += 2; 514 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 515 v3 += 2; 516 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 517 v4 += 2; 518 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 519 v5 += 2; 520 } 521 if (n == sz - 1) { 522 tmp0 = x[*idx++]; 523 sum1 += *v1++ * tmp0; 524 sum2 += *v2++ * tmp0; 525 sum3 += *v3++ * tmp0; 526 sum4 += *v4++ * tmp0; 527 sum5 += *v5++ * tmp0; 528 } 529 y[row++] = sum1; 530 y[row++] = sum2; 531 y[row++] = sum3; 532 y[row++] = sum4; 533 y[row++] = sum5; 534 v1 = v5; /* Since the next block to be processed starts there */ 535 idx += 4 * sz; 536 break; 537 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 538 } 539 } 540 PetscCall(VecRestoreArrayRead(xx, &x)); 541 PetscCall(VecRestoreArray(yy, &y)); 542 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 543 PetscFunctionReturn(0); 544 } 545 /* ----------------------------------------------------------- */ 546 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 547 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) { 548 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 549 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 550 const MatScalar *v1, *v2, *v3, *v4, *v5; 551 const PetscScalar *x; 552 PetscScalar *y, *z, *zt; 553 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 554 const PetscInt *idx, *ns, *ii; 555 556 PetscFunctionBegin; 557 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 558 node_max = a->inode.node_count; 559 ns = a->inode.size; /* Node Size array */ 560 561 PetscCall(VecGetArrayRead(xx, &x)); 562 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 563 zt = z; 564 565 idx = a->j; 566 v1 = a->a; 567 ii = a->i; 568 569 for (i = 0, row = 0; i < node_max; ++i) { 570 nsz = ns[i]; 571 n = ii[1] - ii[0]; 572 ii += nsz; 573 sz = n; /* No of non zeros in this row */ 574 /* Switch on the size of Node */ 575 switch (nsz) { /* Each loop in 'case' is unrolled */ 576 case 1: 577 sum1 = *zt++; 578 579 for (n = 0; n < sz - 1; n += 2) { 580 i1 = idx[0]; /* The instructions are ordered to */ 581 i2 = idx[1]; /* make the compiler's job easy */ 582 idx += 2; 583 tmp0 = x[i1]; 584 tmp1 = x[i2]; 585 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 586 v1 += 2; 587 } 588 589 if (n == sz - 1) { /* Take care of the last nonzero */ 590 tmp0 = x[*idx++]; 591 sum1 += *v1++ * tmp0; 592 } 593 y[row++] = sum1; 594 break; 595 case 2: 596 sum1 = *zt++; 597 sum2 = *zt++; 598 v2 = v1 + n; 599 600 for (n = 0; n < sz - 1; n += 2) { 601 i1 = idx[0]; 602 i2 = idx[1]; 603 idx += 2; 604 tmp0 = x[i1]; 605 tmp1 = x[i2]; 606 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 607 v1 += 2; 608 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 609 v2 += 2; 610 } 611 if (n == sz - 1) { 612 tmp0 = x[*idx++]; 613 sum1 += *v1++ * tmp0; 614 sum2 += *v2++ * tmp0; 615 } 616 y[row++] = sum1; 617 y[row++] = sum2; 618 v1 = v2; /* Since the next block to be processed starts there*/ 619 idx += sz; 620 break; 621 case 3: 622 sum1 = *zt++; 623 sum2 = *zt++; 624 sum3 = *zt++; 625 v2 = v1 + n; 626 v3 = v2 + n; 627 628 for (n = 0; n < sz - 1; n += 2) { 629 i1 = idx[0]; 630 i2 = idx[1]; 631 idx += 2; 632 tmp0 = x[i1]; 633 tmp1 = x[i2]; 634 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 635 v1 += 2; 636 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 637 v2 += 2; 638 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 639 v3 += 2; 640 } 641 if (n == sz - 1) { 642 tmp0 = x[*idx++]; 643 sum1 += *v1++ * tmp0; 644 sum2 += *v2++ * tmp0; 645 sum3 += *v3++ * tmp0; 646 } 647 y[row++] = sum1; 648 y[row++] = sum2; 649 y[row++] = sum3; 650 v1 = v3; /* Since the next block to be processed starts there*/ 651 idx += 2 * sz; 652 break; 653 case 4: 654 sum1 = *zt++; 655 sum2 = *zt++; 656 sum3 = *zt++; 657 sum4 = *zt++; 658 v2 = v1 + n; 659 v3 = v2 + n; 660 v4 = v3 + n; 661 662 for (n = 0; n < sz - 1; n += 2) { 663 i1 = idx[0]; 664 i2 = idx[1]; 665 idx += 2; 666 tmp0 = x[i1]; 667 tmp1 = x[i2]; 668 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 669 v1 += 2; 670 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 671 v2 += 2; 672 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 673 v3 += 2; 674 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 675 v4 += 2; 676 } 677 if (n == sz - 1) { 678 tmp0 = x[*idx++]; 679 sum1 += *v1++ * tmp0; 680 sum2 += *v2++ * tmp0; 681 sum3 += *v3++ * tmp0; 682 sum4 += *v4++ * tmp0; 683 } 684 y[row++] = sum1; 685 y[row++] = sum2; 686 y[row++] = sum3; 687 y[row++] = sum4; 688 v1 = v4; /* Since the next block to be processed starts there*/ 689 idx += 3 * sz; 690 break; 691 case 5: 692 sum1 = *zt++; 693 sum2 = *zt++; 694 sum3 = *zt++; 695 sum4 = *zt++; 696 sum5 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 v5 = v4 + n; 701 702 for (n = 0; n < sz - 1; n += 2) { 703 i1 = idx[0]; 704 i2 = idx[1]; 705 idx += 2; 706 tmp0 = x[i1]; 707 tmp1 = x[i2]; 708 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 709 v1 += 2; 710 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 711 v2 += 2; 712 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 713 v3 += 2; 714 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 715 v4 += 2; 716 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 717 v5 += 2; 718 } 719 if (n == sz - 1) { 720 tmp0 = x[*idx++]; 721 sum1 += *v1++ * tmp0; 722 sum2 += *v2++ * tmp0; 723 sum3 += *v3++ * tmp0; 724 sum4 += *v4++ * tmp0; 725 sum5 += *v5++ * tmp0; 726 } 727 y[row++] = sum1; 728 y[row++] = sum2; 729 y[row++] = sum3; 730 y[row++] = sum4; 731 y[row++] = sum5; 732 v1 = v5; /* Since the next block to be processed starts there */ 733 idx += 4 * sz; 734 break; 735 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 736 } 737 } 738 PetscCall(VecRestoreArrayRead(xx, &x)); 739 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 740 PetscCall(PetscLogFlops(2.0 * a->nz)); 741 PetscFunctionReturn(0); 742 } 743 744 /* ----------------------------------------------------------- */ 745 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) { 746 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 747 IS iscol = a->col, isrow = a->row; 748 const PetscInt *r, *c, *rout, *cout; 749 PetscInt i, j, n = A->rmap->n, nz; 750 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 751 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 752 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 753 PetscScalar sum1, sum2, sum3, sum4, sum5; 754 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 755 const PetscScalar *b; 756 757 PetscFunctionBegin; 758 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 759 node_max = a->inode.node_count; 760 ns = a->inode.size; /* Node Size array */ 761 762 PetscCall(VecGetArrayRead(bb, &b)); 763 PetscCall(VecGetArrayWrite(xx, &x)); 764 tmp = a->solve_work; 765 766 PetscCall(ISGetIndices(isrow, &rout)); 767 r = rout; 768 PetscCall(ISGetIndices(iscol, &cout)); 769 c = cout + (n - 1); 770 771 /* forward solve the lower triangular */ 772 tmps = tmp; 773 aa = a_a; 774 aj = a_j; 775 ad = a->diag; 776 777 for (i = 0, row = 0; i < node_max; ++i) { 778 nsz = ns[i]; 779 aii = ai[row]; 780 v1 = aa + aii; 781 vi = aj + aii; 782 nz = ad[row] - aii; 783 if (i < node_max - 1) { 784 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 785 * but our indexing to determine it's size could. */ 786 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 787 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 788 PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 789 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 790 } 791 792 switch (nsz) { /* Each loop in 'case' is unrolled */ 793 case 1: 794 sum1 = b[*r++]; 795 for (j = 0; j < nz - 1; j += 2) { 796 i0 = vi[0]; 797 i1 = vi[1]; 798 vi += 2; 799 tmp0 = tmps[i0]; 800 tmp1 = tmps[i1]; 801 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 802 v1 += 2; 803 } 804 if (j == nz - 1) { 805 tmp0 = tmps[*vi++]; 806 sum1 -= *v1++ * tmp0; 807 } 808 tmp[row++] = sum1; 809 break; 810 case 2: 811 sum1 = b[*r++]; 812 sum2 = b[*r++]; 813 v2 = aa + ai[row + 1]; 814 815 for (j = 0; j < nz - 1; j += 2) { 816 i0 = vi[0]; 817 i1 = vi[1]; 818 vi += 2; 819 tmp0 = tmps[i0]; 820 tmp1 = tmps[i1]; 821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 822 v1 += 2; 823 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 824 v2 += 2; 825 } 826 if (j == nz - 1) { 827 tmp0 = tmps[*vi++]; 828 sum1 -= *v1++ * tmp0; 829 sum2 -= *v2++ * tmp0; 830 } 831 sum2 -= *v2++ * sum1; 832 tmp[row++] = sum1; 833 tmp[row++] = sum2; 834 break; 835 case 3: 836 sum1 = b[*r++]; 837 sum2 = b[*r++]; 838 sum3 = b[*r++]; 839 v2 = aa + ai[row + 1]; 840 v3 = aa + ai[row + 2]; 841 842 for (j = 0; j < nz - 1; j += 2) { 843 i0 = vi[0]; 844 i1 = vi[1]; 845 vi += 2; 846 tmp0 = tmps[i0]; 847 tmp1 = tmps[i1]; 848 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 849 v1 += 2; 850 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 851 v2 += 2; 852 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 853 v3 += 2; 854 } 855 if (j == nz - 1) { 856 tmp0 = tmps[*vi++]; 857 sum1 -= *v1++ * tmp0; 858 sum2 -= *v2++ * tmp0; 859 sum3 -= *v3++ * tmp0; 860 } 861 sum2 -= *v2++ * sum1; 862 sum3 -= *v3++ * sum1; 863 sum3 -= *v3++ * sum2; 864 865 tmp[row++] = sum1; 866 tmp[row++] = sum2; 867 tmp[row++] = sum3; 868 break; 869 870 case 4: 871 sum1 = b[*r++]; 872 sum2 = b[*r++]; 873 sum3 = b[*r++]; 874 sum4 = b[*r++]; 875 v2 = aa + ai[row + 1]; 876 v3 = aa + ai[row + 2]; 877 v4 = aa + ai[row + 3]; 878 879 for (j = 0; j < nz - 1; j += 2) { 880 i0 = vi[0]; 881 i1 = vi[1]; 882 vi += 2; 883 tmp0 = tmps[i0]; 884 tmp1 = tmps[i1]; 885 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 886 v1 += 2; 887 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 888 v2 += 2; 889 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 890 v3 += 2; 891 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 892 v4 += 2; 893 } 894 if (j == nz - 1) { 895 tmp0 = tmps[*vi++]; 896 sum1 -= *v1++ * tmp0; 897 sum2 -= *v2++ * tmp0; 898 sum3 -= *v3++ * tmp0; 899 sum4 -= *v4++ * tmp0; 900 } 901 sum2 -= *v2++ * sum1; 902 sum3 -= *v3++ * sum1; 903 sum4 -= *v4++ * sum1; 904 sum3 -= *v3++ * sum2; 905 sum4 -= *v4++ * sum2; 906 sum4 -= *v4++ * sum3; 907 908 tmp[row++] = sum1; 909 tmp[row++] = sum2; 910 tmp[row++] = sum3; 911 tmp[row++] = sum4; 912 break; 913 case 5: 914 sum1 = b[*r++]; 915 sum2 = b[*r++]; 916 sum3 = b[*r++]; 917 sum4 = b[*r++]; 918 sum5 = b[*r++]; 919 v2 = aa + ai[row + 1]; 920 v3 = aa + ai[row + 2]; 921 v4 = aa + ai[row + 3]; 922 v5 = aa + ai[row + 4]; 923 924 for (j = 0; j < nz - 1; j += 2) { 925 i0 = vi[0]; 926 i1 = vi[1]; 927 vi += 2; 928 tmp0 = tmps[i0]; 929 tmp1 = tmps[i1]; 930 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 931 v1 += 2; 932 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 933 v2 += 2; 934 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 935 v3 += 2; 936 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 937 v4 += 2; 938 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 939 v5 += 2; 940 } 941 if (j == nz - 1) { 942 tmp0 = tmps[*vi++]; 943 sum1 -= *v1++ * tmp0; 944 sum2 -= *v2++ * tmp0; 945 sum3 -= *v3++ * tmp0; 946 sum4 -= *v4++ * tmp0; 947 sum5 -= *v5++ * tmp0; 948 } 949 950 sum2 -= *v2++ * sum1; 951 sum3 -= *v3++ * sum1; 952 sum4 -= *v4++ * sum1; 953 sum5 -= *v5++ * sum1; 954 sum3 -= *v3++ * sum2; 955 sum4 -= *v4++ * sum2; 956 sum5 -= *v5++ * sum2; 957 sum4 -= *v4++ * sum3; 958 sum5 -= *v5++ * sum3; 959 sum5 -= *v5++ * sum4; 960 961 tmp[row++] = sum1; 962 tmp[row++] = sum2; 963 tmp[row++] = sum3; 964 tmp[row++] = sum4; 965 tmp[row++] = sum5; 966 break; 967 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 968 } 969 } 970 /* backward solve the upper triangular */ 971 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 972 nsz = ns[i]; 973 aii = ai[row + 1] - 1; 974 v1 = aa + aii; 975 vi = aj + aii; 976 nz = aii - ad[row]; 977 switch (nsz) { /* Each loop in 'case' is unrolled */ 978 case 1: 979 sum1 = tmp[row]; 980 981 for (j = nz; j > 1; j -= 2) { 982 vi -= 2; 983 i0 = vi[2]; 984 i1 = vi[1]; 985 tmp0 = tmps[i0]; 986 tmp1 = tmps[i1]; 987 v1 -= 2; 988 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 989 } 990 if (j == 1) { 991 tmp0 = tmps[*vi--]; 992 sum1 -= *v1-- * tmp0; 993 } 994 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 995 row--; 996 break; 997 case 2: 998 sum1 = tmp[row]; 999 sum2 = tmp[row - 1]; 1000 v2 = aa + ai[row] - 1; 1001 for (j = nz; j > 1; j -= 2) { 1002 vi -= 2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 v2 -= 2; 1009 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1010 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1011 } 1012 if (j == 1) { 1013 tmp0 = tmps[*vi--]; 1014 sum1 -= *v1-- * tmp0; 1015 sum2 -= *v2-- * tmp0; 1016 } 1017 1018 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1019 row--; 1020 sum2 -= *v2-- * tmp0; 1021 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1022 row--; 1023 break; 1024 case 3: 1025 sum1 = tmp[row]; 1026 sum2 = tmp[row - 1]; 1027 sum3 = tmp[row - 2]; 1028 v2 = aa + ai[row] - 1; 1029 v3 = aa + ai[row - 1] - 1; 1030 for (j = nz; j > 1; j -= 2) { 1031 vi -= 2; 1032 i0 = vi[2]; 1033 i1 = vi[1]; 1034 tmp0 = tmps[i0]; 1035 tmp1 = tmps[i1]; 1036 v1 -= 2; 1037 v2 -= 2; 1038 v3 -= 2; 1039 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1040 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1041 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1042 } 1043 if (j == 1) { 1044 tmp0 = tmps[*vi--]; 1045 sum1 -= *v1-- * tmp0; 1046 sum2 -= *v2-- * tmp0; 1047 sum3 -= *v3-- * tmp0; 1048 } 1049 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1050 row--; 1051 sum2 -= *v2-- * tmp0; 1052 sum3 -= *v3-- * tmp0; 1053 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1054 row--; 1055 sum3 -= *v3-- * tmp0; 1056 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1057 row--; 1058 1059 break; 1060 case 4: 1061 sum1 = tmp[row]; 1062 sum2 = tmp[row - 1]; 1063 sum3 = tmp[row - 2]; 1064 sum4 = tmp[row - 3]; 1065 v2 = aa + ai[row] - 1; 1066 v3 = aa + ai[row - 1] - 1; 1067 v4 = aa + ai[row - 2] - 1; 1068 1069 for (j = nz; j > 1; j -= 2) { 1070 vi -= 2; 1071 i0 = vi[2]; 1072 i1 = vi[1]; 1073 tmp0 = tmps[i0]; 1074 tmp1 = tmps[i1]; 1075 v1 -= 2; 1076 v2 -= 2; 1077 v3 -= 2; 1078 v4 -= 2; 1079 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1080 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1081 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1082 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1083 } 1084 if (j == 1) { 1085 tmp0 = tmps[*vi--]; 1086 sum1 -= *v1-- * tmp0; 1087 sum2 -= *v2-- * tmp0; 1088 sum3 -= *v3-- * tmp0; 1089 sum4 -= *v4-- * tmp0; 1090 } 1091 1092 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1093 row--; 1094 sum2 -= *v2-- * tmp0; 1095 sum3 -= *v3-- * tmp0; 1096 sum4 -= *v4-- * tmp0; 1097 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1098 row--; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1102 row--; 1103 sum4 -= *v4-- * tmp0; 1104 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1105 row--; 1106 break; 1107 case 5: 1108 sum1 = tmp[row]; 1109 sum2 = tmp[row - 1]; 1110 sum3 = tmp[row - 2]; 1111 sum4 = tmp[row - 3]; 1112 sum5 = tmp[row - 4]; 1113 v2 = aa + ai[row] - 1; 1114 v3 = aa + ai[row - 1] - 1; 1115 v4 = aa + ai[row - 2] - 1; 1116 v5 = aa + ai[row - 3] - 1; 1117 for (j = nz; j > 1; j -= 2) { 1118 vi -= 2; 1119 i0 = vi[2]; 1120 i1 = vi[1]; 1121 tmp0 = tmps[i0]; 1122 tmp1 = tmps[i1]; 1123 v1 -= 2; 1124 v2 -= 2; 1125 v3 -= 2; 1126 v4 -= 2; 1127 v5 -= 2; 1128 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1129 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1130 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1131 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1132 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1133 } 1134 if (j == 1) { 1135 tmp0 = tmps[*vi--]; 1136 sum1 -= *v1-- * tmp0; 1137 sum2 -= *v2-- * tmp0; 1138 sum3 -= *v3-- * tmp0; 1139 sum4 -= *v4-- * tmp0; 1140 sum5 -= *v5-- * tmp0; 1141 } 1142 1143 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1144 row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1150 row--; 1151 sum3 -= *v3-- * tmp0; 1152 sum4 -= *v4-- * tmp0; 1153 sum5 -= *v5-- * tmp0; 1154 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1155 row--; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1159 row--; 1160 sum5 -= *v5-- * tmp0; 1161 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1162 row--; 1163 break; 1164 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1165 } 1166 } 1167 PetscCall(ISRestoreIndices(isrow, &rout)); 1168 PetscCall(ISRestoreIndices(iscol, &cout)); 1169 PetscCall(VecRestoreArrayRead(bb, &b)); 1170 PetscCall(VecRestoreArrayWrite(xx, &x)); 1171 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1172 PetscFunctionReturn(0); 1173 } 1174 1175 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) { 1176 Mat C = B; 1177 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1178 IS isrow = b->row, isicol = b->icol; 1179 const PetscInt *r, *ic, *ics; 1180 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1181 PetscInt i, j, k, nz, nzL, row, *pj; 1182 const PetscInt *ajtmp, *bjtmp; 1183 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1184 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1185 FactorShiftCtx sctx; 1186 const PetscInt *ddiag; 1187 PetscReal rs; 1188 MatScalar d; 1189 PetscInt inod, nodesz, node_max, col; 1190 const PetscInt *ns; 1191 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1192 1193 PetscFunctionBegin; 1194 /* MatPivotSetUp(): initialize shift context sctx */ 1195 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1196 1197 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1198 ddiag = a->diag; 1199 sctx.shift_top = info->zeropivot; 1200 for (i = 0; i < n; i++) { 1201 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1202 d = (aa)[ddiag[i]]; 1203 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1204 v = aa + ai[i]; 1205 nz = ai[i + 1] - ai[i]; 1206 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1207 if (rs > sctx.shift_top) sctx.shift_top = rs; 1208 } 1209 sctx.shift_top *= 1.1; 1210 sctx.nshift_max = 5; 1211 sctx.shift_lo = 0.; 1212 sctx.shift_hi = 1.; 1213 } 1214 1215 PetscCall(ISGetIndices(isrow, &r)); 1216 PetscCall(ISGetIndices(isicol, &ic)); 1217 1218 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1219 ics = ic; 1220 1221 node_max = a->inode.node_count; 1222 ns = a->inode.size; 1223 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1224 1225 /* If max inode size > 4, split it into two inodes.*/ 1226 /* also map the inode sizes according to the ordering */ 1227 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1228 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1229 if (ns[i] > 4) { 1230 tmp_vec1[j] = 4; 1231 ++j; 1232 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 1233 } else { 1234 tmp_vec1[j] = ns[i]; 1235 } 1236 } 1237 /* Use the correct node_max */ 1238 node_max = j; 1239 1240 /* Now reorder the inode info based on mat re-ordering info */ 1241 /* First create a row -> inode_size_array_index map */ 1242 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1243 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1244 for (i = 0, row = 0; i < node_max; i++) { 1245 nodesz = tmp_vec1[i]; 1246 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1247 } 1248 /* Using nsmap, create a reordered ns structure */ 1249 for (i = 0, j = 0; i < node_max; i++) { 1250 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1251 tmp_vec2[i] = nodesz; 1252 j += nodesz; 1253 } 1254 PetscCall(PetscFree(nsmap)); 1255 PetscCall(PetscFree(tmp_vec1)); 1256 1257 /* Now use the correct ns */ 1258 ns = tmp_vec2; 1259 1260 do { 1261 sctx.newshift = PETSC_FALSE; 1262 /* Now loop over each block-row, and do the factorization */ 1263 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1264 nodesz = ns[inod]; 1265 1266 switch (nodesz) { 1267 case 1: 1268 /*----------*/ 1269 /* zero rtmp1 */ 1270 /* L part */ 1271 nz = bi[i + 1] - bi[i]; 1272 bjtmp = bj + bi[i]; 1273 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1274 1275 /* U part */ 1276 nz = bdiag[i] - bdiag[i + 1]; 1277 bjtmp = bj + bdiag[i + 1] + 1; 1278 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1279 1280 /* load in initial (unfactored row) */ 1281 nz = ai[r[i] + 1] - ai[r[i]]; 1282 ajtmp = aj + ai[r[i]]; 1283 v = aa + ai[r[i]]; 1284 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1285 1286 /* ZeropivotApply() */ 1287 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1288 1289 /* elimination */ 1290 bjtmp = bj + bi[i]; 1291 row = *bjtmp++; 1292 nzL = bi[i + 1] - bi[i]; 1293 for (k = 0; k < nzL; k++) { 1294 pc = rtmp1 + row; 1295 if (*pc != 0.0) { 1296 pv = b->a + bdiag[row]; 1297 mul1 = *pc * (*pv); 1298 *pc = mul1; 1299 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1300 pv = b->a + bdiag[row + 1] + 1; 1301 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1302 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1303 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1304 } 1305 row = *bjtmp++; 1306 } 1307 1308 /* finished row so stick it into b->a */ 1309 rs = 0.0; 1310 /* L part */ 1311 pv = b->a + bi[i]; 1312 pj = b->j + bi[i]; 1313 nz = bi[i + 1] - bi[i]; 1314 for (j = 0; j < nz; j++) { 1315 pv[j] = rtmp1[pj[j]]; 1316 rs += PetscAbsScalar(pv[j]); 1317 } 1318 1319 /* U part */ 1320 pv = b->a + bdiag[i + 1] + 1; 1321 pj = b->j + bdiag[i + 1] + 1; 1322 nz = bdiag[i] - bdiag[i + 1] - 1; 1323 for (j = 0; j < nz; j++) { 1324 pv[j] = rtmp1[pj[j]]; 1325 rs += PetscAbsScalar(pv[j]); 1326 } 1327 1328 /* Check zero pivot */ 1329 sctx.rs = rs; 1330 sctx.pv = rtmp1[i]; 1331 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1332 if (sctx.newshift) break; 1333 1334 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1335 pv = b->a + bdiag[i]; 1336 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1337 break; 1338 1339 case 2: 1340 /*----------*/ 1341 /* zero rtmp1 and rtmp2 */ 1342 /* L part */ 1343 nz = bi[i + 1] - bi[i]; 1344 bjtmp = bj + bi[i]; 1345 for (j = 0; j < nz; j++) { 1346 col = bjtmp[j]; 1347 rtmp1[col] = 0.0; 1348 rtmp2[col] = 0.0; 1349 } 1350 1351 /* U part */ 1352 nz = bdiag[i] - bdiag[i + 1]; 1353 bjtmp = bj + bdiag[i + 1] + 1; 1354 for (j = 0; j < nz; j++) { 1355 col = bjtmp[j]; 1356 rtmp1[col] = 0.0; 1357 rtmp2[col] = 0.0; 1358 } 1359 1360 /* load in initial (unfactored row) */ 1361 nz = ai[r[i] + 1] - ai[r[i]]; 1362 ajtmp = aj + ai[r[i]]; 1363 v1 = aa + ai[r[i]]; 1364 v2 = aa + ai[r[i] + 1]; 1365 for (j = 0; j < nz; j++) { 1366 col = ics[ajtmp[j]]; 1367 rtmp1[col] = v1[j]; 1368 rtmp2[col] = v2[j]; 1369 } 1370 /* ZeropivotApply(): shift the diagonal of the matrix */ 1371 rtmp1[i] += sctx.shift_amount; 1372 rtmp2[i + 1] += sctx.shift_amount; 1373 1374 /* elimination */ 1375 bjtmp = bj + bi[i]; 1376 row = *bjtmp++; /* pivot row */ 1377 nzL = bi[i + 1] - bi[i]; 1378 for (k = 0; k < nzL; k++) { 1379 pc1 = rtmp1 + row; 1380 pc2 = rtmp2 + row; 1381 if (*pc1 != 0.0 || *pc2 != 0.0) { 1382 pv = b->a + bdiag[row]; 1383 mul1 = *pc1 * (*pv); 1384 mul2 = *pc2 * (*pv); 1385 *pc1 = mul1; 1386 *pc2 = mul2; 1387 1388 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1389 pv = b->a + bdiag[row + 1] + 1; 1390 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1391 for (j = 0; j < nz; j++) { 1392 col = pj[j]; 1393 rtmp1[col] -= mul1 * pv[j]; 1394 rtmp2[col] -= mul2 * pv[j]; 1395 } 1396 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1397 } 1398 row = *bjtmp++; 1399 } 1400 1401 /* finished row i; check zero pivot, then stick row i into b->a */ 1402 rs = 0.0; 1403 /* L part */ 1404 pc1 = b->a + bi[i]; 1405 pj = b->j + bi[i]; 1406 nz = bi[i + 1] - bi[i]; 1407 for (j = 0; j < nz; j++) { 1408 col = pj[j]; 1409 pc1[j] = rtmp1[col]; 1410 rs += PetscAbsScalar(pc1[j]); 1411 } 1412 /* U part */ 1413 pc1 = b->a + bdiag[i + 1] + 1; 1414 pj = b->j + bdiag[i + 1] + 1; 1415 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1416 for (j = 0; j < nz; j++) { 1417 col = pj[j]; 1418 pc1[j] = rtmp1[col]; 1419 rs += PetscAbsScalar(pc1[j]); 1420 } 1421 1422 sctx.rs = rs; 1423 sctx.pv = rtmp1[i]; 1424 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1425 if (sctx.newshift) break; 1426 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1427 *pc1 = 1.0 / sctx.pv; 1428 1429 /* Now take care of diagonal 2x2 block. */ 1430 pc2 = rtmp2 + i; 1431 if (*pc2 != 0.0) { 1432 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1433 *pc2 = mul1; /* insert L entry */ 1434 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1435 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1436 for (j = 0; j < nz; j++) { 1437 col = pj[j]; 1438 rtmp2[col] -= mul1 * rtmp1[col]; 1439 } 1440 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1441 } 1442 1443 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1444 rs = 0.0; 1445 /* L part */ 1446 pc2 = b->a + bi[i + 1]; 1447 pj = b->j + bi[i + 1]; 1448 nz = bi[i + 2] - bi[i + 1]; 1449 for (j = 0; j < nz; j++) { 1450 col = pj[j]; 1451 pc2[j] = rtmp2[col]; 1452 rs += PetscAbsScalar(pc2[j]); 1453 } 1454 /* U part */ 1455 pc2 = b->a + bdiag[i + 2] + 1; 1456 pj = b->j + bdiag[i + 2] + 1; 1457 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1458 for (j = 0; j < nz; j++) { 1459 col = pj[j]; 1460 pc2[j] = rtmp2[col]; 1461 rs += PetscAbsScalar(pc2[j]); 1462 } 1463 1464 sctx.rs = rs; 1465 sctx.pv = rtmp2[i + 1]; 1466 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1467 if (sctx.newshift) break; 1468 pc2 = b->a + bdiag[i + 1]; 1469 *pc2 = 1.0 / sctx.pv; 1470 break; 1471 1472 case 3: 1473 /*----------*/ 1474 /* zero rtmp */ 1475 /* L part */ 1476 nz = bi[i + 1] - bi[i]; 1477 bjtmp = bj + bi[i]; 1478 for (j = 0; j < nz; j++) { 1479 col = bjtmp[j]; 1480 rtmp1[col] = 0.0; 1481 rtmp2[col] = 0.0; 1482 rtmp3[col] = 0.0; 1483 } 1484 1485 /* U part */ 1486 nz = bdiag[i] - bdiag[i + 1]; 1487 bjtmp = bj + bdiag[i + 1] + 1; 1488 for (j = 0; j < nz; j++) { 1489 col = bjtmp[j]; 1490 rtmp1[col] = 0.0; 1491 rtmp2[col] = 0.0; 1492 rtmp3[col] = 0.0; 1493 } 1494 1495 /* load in initial (unfactored row) */ 1496 nz = ai[r[i] + 1] - ai[r[i]]; 1497 ajtmp = aj + ai[r[i]]; 1498 v1 = aa + ai[r[i]]; 1499 v2 = aa + ai[r[i] + 1]; 1500 v3 = aa + ai[r[i] + 2]; 1501 for (j = 0; j < nz; j++) { 1502 col = ics[ajtmp[j]]; 1503 rtmp1[col] = v1[j]; 1504 rtmp2[col] = v2[j]; 1505 rtmp3[col] = v3[j]; 1506 } 1507 /* ZeropivotApply(): shift the diagonal of the matrix */ 1508 rtmp1[i] += sctx.shift_amount; 1509 rtmp2[i + 1] += sctx.shift_amount; 1510 rtmp3[i + 2] += sctx.shift_amount; 1511 1512 /* elimination */ 1513 bjtmp = bj + bi[i]; 1514 row = *bjtmp++; /* pivot row */ 1515 nzL = bi[i + 1] - bi[i]; 1516 for (k = 0; k < nzL; k++) { 1517 pc1 = rtmp1 + row; 1518 pc2 = rtmp2 + row; 1519 pc3 = rtmp3 + row; 1520 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1521 pv = b->a + bdiag[row]; 1522 mul1 = *pc1 * (*pv); 1523 mul2 = *pc2 * (*pv); 1524 mul3 = *pc3 * (*pv); 1525 *pc1 = mul1; 1526 *pc2 = mul2; 1527 *pc3 = mul3; 1528 1529 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1530 pv = b->a + bdiag[row + 1] + 1; 1531 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1532 for (j = 0; j < nz; j++) { 1533 col = pj[j]; 1534 rtmp1[col] -= mul1 * pv[j]; 1535 rtmp2[col] -= mul2 * pv[j]; 1536 rtmp3[col] -= mul3 * pv[j]; 1537 } 1538 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1539 } 1540 row = *bjtmp++; 1541 } 1542 1543 /* finished row i; check zero pivot, then stick row i into b->a */ 1544 rs = 0.0; 1545 /* L part */ 1546 pc1 = b->a + bi[i]; 1547 pj = b->j + bi[i]; 1548 nz = bi[i + 1] - bi[i]; 1549 for (j = 0; j < nz; j++) { 1550 col = pj[j]; 1551 pc1[j] = rtmp1[col]; 1552 rs += PetscAbsScalar(pc1[j]); 1553 } 1554 /* U part */ 1555 pc1 = b->a + bdiag[i + 1] + 1; 1556 pj = b->j + bdiag[i + 1] + 1; 1557 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1558 for (j = 0; j < nz; j++) { 1559 col = pj[j]; 1560 pc1[j] = rtmp1[col]; 1561 rs += PetscAbsScalar(pc1[j]); 1562 } 1563 1564 sctx.rs = rs; 1565 sctx.pv = rtmp1[i]; 1566 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1567 if (sctx.newshift) break; 1568 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1569 *pc1 = 1.0 / sctx.pv; 1570 1571 /* Now take care of 1st column of diagonal 3x3 block. */ 1572 pc2 = rtmp2 + i; 1573 pc3 = rtmp3 + i; 1574 if (*pc2 != 0.0 || *pc3 != 0.0) { 1575 mul2 = (*pc2) * (*pc1); 1576 *pc2 = mul2; 1577 mul3 = (*pc3) * (*pc1); 1578 *pc3 = mul3; 1579 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1580 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1581 for (j = 0; j < nz; j++) { 1582 col = pj[j]; 1583 rtmp2[col] -= mul2 * rtmp1[col]; 1584 rtmp3[col] -= mul3 * rtmp1[col]; 1585 } 1586 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1587 } 1588 1589 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1590 rs = 0.0; 1591 /* L part */ 1592 pc2 = b->a + bi[i + 1]; 1593 pj = b->j + bi[i + 1]; 1594 nz = bi[i + 2] - bi[i + 1]; 1595 for (j = 0; j < nz; j++) { 1596 col = pj[j]; 1597 pc2[j] = rtmp2[col]; 1598 rs += PetscAbsScalar(pc2[j]); 1599 } 1600 /* U part */ 1601 pc2 = b->a + bdiag[i + 2] + 1; 1602 pj = b->j + bdiag[i + 2] + 1; 1603 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1604 for (j = 0; j < nz; j++) { 1605 col = pj[j]; 1606 pc2[j] = rtmp2[col]; 1607 rs += PetscAbsScalar(pc2[j]); 1608 } 1609 1610 sctx.rs = rs; 1611 sctx.pv = rtmp2[i + 1]; 1612 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1613 if (sctx.newshift) break; 1614 pc2 = b->a + bdiag[i + 1]; 1615 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1616 1617 /* Now take care of 2nd column of diagonal 3x3 block. */ 1618 pc3 = rtmp3 + i + 1; 1619 if (*pc3 != 0.0) { 1620 mul3 = (*pc3) * (*pc2); 1621 *pc3 = mul3; 1622 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1623 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1624 for (j = 0; j < nz; j++) { 1625 col = pj[j]; 1626 rtmp3[col] -= mul3 * rtmp2[col]; 1627 } 1628 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1629 } 1630 1631 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1632 rs = 0.0; 1633 /* L part */ 1634 pc3 = b->a + bi[i + 2]; 1635 pj = b->j + bi[i + 2]; 1636 nz = bi[i + 3] - bi[i + 2]; 1637 for (j = 0; j < nz; j++) { 1638 col = pj[j]; 1639 pc3[j] = rtmp3[col]; 1640 rs += PetscAbsScalar(pc3[j]); 1641 } 1642 /* U part */ 1643 pc3 = b->a + bdiag[i + 3] + 1; 1644 pj = b->j + bdiag[i + 3] + 1; 1645 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1646 for (j = 0; j < nz; j++) { 1647 col = pj[j]; 1648 pc3[j] = rtmp3[col]; 1649 rs += PetscAbsScalar(pc3[j]); 1650 } 1651 1652 sctx.rs = rs; 1653 sctx.pv = rtmp3[i + 2]; 1654 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1655 if (sctx.newshift) break; 1656 pc3 = b->a + bdiag[i + 2]; 1657 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1658 break; 1659 case 4: 1660 /*----------*/ 1661 /* zero rtmp */ 1662 /* L part */ 1663 nz = bi[i + 1] - bi[i]; 1664 bjtmp = bj + bi[i]; 1665 for (j = 0; j < nz; j++) { 1666 col = bjtmp[j]; 1667 rtmp1[col] = 0.0; 1668 rtmp2[col] = 0.0; 1669 rtmp3[col] = 0.0; 1670 rtmp4[col] = 0.0; 1671 } 1672 1673 /* U part */ 1674 nz = bdiag[i] - bdiag[i + 1]; 1675 bjtmp = bj + bdiag[i + 1] + 1; 1676 for (j = 0; j < nz; j++) { 1677 col = bjtmp[j]; 1678 rtmp1[col] = 0.0; 1679 rtmp2[col] = 0.0; 1680 rtmp3[col] = 0.0; 1681 rtmp4[col] = 0.0; 1682 } 1683 1684 /* load in initial (unfactored row) */ 1685 nz = ai[r[i] + 1] - ai[r[i]]; 1686 ajtmp = aj + ai[r[i]]; 1687 v1 = aa + ai[r[i]]; 1688 v2 = aa + ai[r[i] + 1]; 1689 v3 = aa + ai[r[i] + 2]; 1690 v4 = aa + ai[r[i] + 3]; 1691 for (j = 0; j < nz; j++) { 1692 col = ics[ajtmp[j]]; 1693 rtmp1[col] = v1[j]; 1694 rtmp2[col] = v2[j]; 1695 rtmp3[col] = v3[j]; 1696 rtmp4[col] = v4[j]; 1697 } 1698 /* ZeropivotApply(): shift the diagonal of the matrix */ 1699 rtmp1[i] += sctx.shift_amount; 1700 rtmp2[i + 1] += sctx.shift_amount; 1701 rtmp3[i + 2] += sctx.shift_amount; 1702 rtmp4[i + 3] += sctx.shift_amount; 1703 1704 /* elimination */ 1705 bjtmp = bj + bi[i]; 1706 row = *bjtmp++; /* pivot row */ 1707 nzL = bi[i + 1] - bi[i]; 1708 for (k = 0; k < nzL; k++) { 1709 pc1 = rtmp1 + row; 1710 pc2 = rtmp2 + row; 1711 pc3 = rtmp3 + row; 1712 pc4 = rtmp4 + row; 1713 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1714 pv = b->a + bdiag[row]; 1715 mul1 = *pc1 * (*pv); 1716 mul2 = *pc2 * (*pv); 1717 mul3 = *pc3 * (*pv); 1718 mul4 = *pc4 * (*pv); 1719 *pc1 = mul1; 1720 *pc2 = mul2; 1721 *pc3 = mul3; 1722 *pc4 = mul4; 1723 1724 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1725 pv = b->a + bdiag[row + 1] + 1; 1726 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1727 for (j = 0; j < nz; j++) { 1728 col = pj[j]; 1729 rtmp1[col] -= mul1 * pv[j]; 1730 rtmp2[col] -= mul2 * pv[j]; 1731 rtmp3[col] -= mul3 * pv[j]; 1732 rtmp4[col] -= mul4 * pv[j]; 1733 } 1734 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1735 } 1736 row = *bjtmp++; 1737 } 1738 1739 /* finished row i; check zero pivot, then stick row i into b->a */ 1740 rs = 0.0; 1741 /* L part */ 1742 pc1 = b->a + bi[i]; 1743 pj = b->j + bi[i]; 1744 nz = bi[i + 1] - bi[i]; 1745 for (j = 0; j < nz; j++) { 1746 col = pj[j]; 1747 pc1[j] = rtmp1[col]; 1748 rs += PetscAbsScalar(pc1[j]); 1749 } 1750 /* U part */ 1751 pc1 = b->a + bdiag[i + 1] + 1; 1752 pj = b->j + bdiag[i + 1] + 1; 1753 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1754 for (j = 0; j < nz; j++) { 1755 col = pj[j]; 1756 pc1[j] = rtmp1[col]; 1757 rs += PetscAbsScalar(pc1[j]); 1758 } 1759 1760 sctx.rs = rs; 1761 sctx.pv = rtmp1[i]; 1762 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1763 if (sctx.newshift) break; 1764 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1765 *pc1 = 1.0 / sctx.pv; 1766 1767 /* Now take care of 1st column of diagonal 4x4 block. */ 1768 pc2 = rtmp2 + i; 1769 pc3 = rtmp3 + i; 1770 pc4 = rtmp4 + i; 1771 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1772 mul2 = (*pc2) * (*pc1); 1773 *pc2 = mul2; 1774 mul3 = (*pc3) * (*pc1); 1775 *pc3 = mul3; 1776 mul4 = (*pc4) * (*pc1); 1777 *pc4 = mul4; 1778 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1779 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1780 for (j = 0; j < nz; j++) { 1781 col = pj[j]; 1782 rtmp2[col] -= mul2 * rtmp1[col]; 1783 rtmp3[col] -= mul3 * rtmp1[col]; 1784 rtmp4[col] -= mul4 * rtmp1[col]; 1785 } 1786 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1787 } 1788 1789 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1790 rs = 0.0; 1791 /* L part */ 1792 pc2 = b->a + bi[i + 1]; 1793 pj = b->j + bi[i + 1]; 1794 nz = bi[i + 2] - bi[i + 1]; 1795 for (j = 0; j < nz; j++) { 1796 col = pj[j]; 1797 pc2[j] = rtmp2[col]; 1798 rs += PetscAbsScalar(pc2[j]); 1799 } 1800 /* U part */ 1801 pc2 = b->a + bdiag[i + 2] + 1; 1802 pj = b->j + bdiag[i + 2] + 1; 1803 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1804 for (j = 0; j < nz; j++) { 1805 col = pj[j]; 1806 pc2[j] = rtmp2[col]; 1807 rs += PetscAbsScalar(pc2[j]); 1808 } 1809 1810 sctx.rs = rs; 1811 sctx.pv = rtmp2[i + 1]; 1812 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1813 if (sctx.newshift) break; 1814 pc2 = b->a + bdiag[i + 1]; 1815 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1816 1817 /* Now take care of 2nd column of diagonal 4x4 block. */ 1818 pc3 = rtmp3 + i + 1; 1819 pc4 = rtmp4 + i + 1; 1820 if (*pc3 != 0.0 || *pc4 != 0.0) { 1821 mul3 = (*pc3) * (*pc2); 1822 *pc3 = mul3; 1823 mul4 = (*pc4) * (*pc2); 1824 *pc4 = mul4; 1825 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1826 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1827 for (j = 0; j < nz; j++) { 1828 col = pj[j]; 1829 rtmp3[col] -= mul3 * rtmp2[col]; 1830 rtmp4[col] -= mul4 * rtmp2[col]; 1831 } 1832 PetscCall(PetscLogFlops(4.0 * nz)); 1833 } 1834 1835 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1836 rs = 0.0; 1837 /* L part */ 1838 pc3 = b->a + bi[i + 2]; 1839 pj = b->j + bi[i + 2]; 1840 nz = bi[i + 3] - bi[i + 2]; 1841 for (j = 0; j < nz; j++) { 1842 col = pj[j]; 1843 pc3[j] = rtmp3[col]; 1844 rs += PetscAbsScalar(pc3[j]); 1845 } 1846 /* U part */ 1847 pc3 = b->a + bdiag[i + 3] + 1; 1848 pj = b->j + bdiag[i + 3] + 1; 1849 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1850 for (j = 0; j < nz; j++) { 1851 col = pj[j]; 1852 pc3[j] = rtmp3[col]; 1853 rs += PetscAbsScalar(pc3[j]); 1854 } 1855 1856 sctx.rs = rs; 1857 sctx.pv = rtmp3[i + 2]; 1858 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1859 if (sctx.newshift) break; 1860 pc3 = b->a + bdiag[i + 2]; 1861 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1862 1863 /* Now take care of 3rd column of diagonal 4x4 block. */ 1864 pc4 = rtmp4 + i + 2; 1865 if (*pc4 != 0.0) { 1866 mul4 = (*pc4) * (*pc3); 1867 *pc4 = mul4; 1868 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1869 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1870 for (j = 0; j < nz; j++) { 1871 col = pj[j]; 1872 rtmp4[col] -= mul4 * rtmp3[col]; 1873 } 1874 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1875 } 1876 1877 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1878 rs = 0.0; 1879 /* L part */ 1880 pc4 = b->a + bi[i + 3]; 1881 pj = b->j + bi[i + 3]; 1882 nz = bi[i + 4] - bi[i + 3]; 1883 for (j = 0; j < nz; j++) { 1884 col = pj[j]; 1885 pc4[j] = rtmp4[col]; 1886 rs += PetscAbsScalar(pc4[j]); 1887 } 1888 /* U part */ 1889 pc4 = b->a + bdiag[i + 4] + 1; 1890 pj = b->j + bdiag[i + 4] + 1; 1891 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1892 for (j = 0; j < nz; j++) { 1893 col = pj[j]; 1894 pc4[j] = rtmp4[col]; 1895 rs += PetscAbsScalar(pc4[j]); 1896 } 1897 1898 sctx.rs = rs; 1899 sctx.pv = rtmp4[i + 3]; 1900 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1901 if (sctx.newshift) break; 1902 pc4 = b->a + bdiag[i + 3]; 1903 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1904 break; 1905 1906 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1907 } 1908 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1909 i += nodesz; /* Update the row */ 1910 } 1911 1912 /* MatPivotRefine() */ 1913 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1914 /* 1915 * if no shift in this attempt & shifting & started shifting & can refine, 1916 * then try lower shift 1917 */ 1918 sctx.shift_hi = sctx.shift_fraction; 1919 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1920 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1921 sctx.newshift = PETSC_TRUE; 1922 sctx.nshift++; 1923 } 1924 } while (sctx.newshift); 1925 1926 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1927 PetscCall(PetscFree(tmp_vec2)); 1928 PetscCall(ISRestoreIndices(isicol, &ic)); 1929 PetscCall(ISRestoreIndices(isrow, &r)); 1930 1931 if (b->inode.size) { 1932 C->ops->solve = MatSolve_SeqAIJ_Inode; 1933 } else { 1934 C->ops->solve = MatSolve_SeqAIJ; 1935 } 1936 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1937 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1938 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1939 C->ops->matsolve = MatMatSolve_SeqAIJ; 1940 C->assembled = PETSC_TRUE; 1941 C->preallocated = PETSC_TRUE; 1942 1943 PetscCall(PetscLogFlops(C->cmap->n)); 1944 1945 /* MatShiftView(A,info,&sctx) */ 1946 if (sctx.nshift) { 1947 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1948 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1949 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1950 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1951 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1952 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1953 } 1954 } 1955 PetscFunctionReturn(0); 1956 } 1957 1958 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) { 1959 Mat C = B; 1960 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1961 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1962 const PetscInt *r, *ic, *c, *ics; 1963 PetscInt n = A->rmap->n, *bi = b->i; 1964 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1965 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1966 PetscInt *ai = a->i, *aj = a->j; 1967 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1968 PetscScalar mul1, mul2, mul3, tmp; 1969 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1970 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1971 PetscReal rs = 0.0; 1972 FactorShiftCtx sctx; 1973 1974 PetscFunctionBegin; 1975 sctx.shift_top = 0; 1976 sctx.nshift_max = 0; 1977 sctx.shift_lo = 0; 1978 sctx.shift_hi = 0; 1979 sctx.shift_fraction = 0; 1980 1981 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1982 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1983 sctx.shift_top = 0; 1984 for (i = 0; i < n; i++) { 1985 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1986 rs = 0.0; 1987 ajtmp = aj + ai[i]; 1988 rtmp1 = aa + ai[i]; 1989 nz = ai[i + 1] - ai[i]; 1990 for (j = 0; j < nz; j++) { 1991 if (*ajtmp != i) { 1992 rs += PetscAbsScalar(*rtmp1++); 1993 } else { 1994 rs -= PetscRealPart(*rtmp1++); 1995 } 1996 ajtmp++; 1997 } 1998 if (rs > sctx.shift_top) sctx.shift_top = rs; 1999 } 2000 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2001 sctx.shift_top *= 1.1; 2002 sctx.nshift_max = 5; 2003 sctx.shift_lo = 0.; 2004 sctx.shift_hi = 1.; 2005 } 2006 sctx.shift_amount = 0; 2007 sctx.nshift = 0; 2008 2009 PetscCall(ISGetIndices(isrow, &r)); 2010 PetscCall(ISGetIndices(iscol, &c)); 2011 PetscCall(ISGetIndices(isicol, &ic)); 2012 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2013 ics = ic; 2014 2015 node_max = a->inode.node_count; 2016 ns = a->inode.size; 2017 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2018 2019 /* If max inode size > 3, split it into two inodes.*/ 2020 /* also map the inode sizes according to the ordering */ 2021 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2022 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2023 if (ns[i] > 3) { 2024 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2025 ++j; 2026 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2027 } else { 2028 tmp_vec1[j] = ns[i]; 2029 } 2030 } 2031 /* Use the correct node_max */ 2032 node_max = j; 2033 2034 /* Now reorder the inode info based on mat re-ordering info */ 2035 /* First create a row -> inode_size_array_index map */ 2036 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2037 for (i = 0, row = 0; i < node_max; i++) { 2038 nodesz = tmp_vec1[i]; 2039 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2040 } 2041 /* Using nsmap, create a reordered ns structure */ 2042 for (i = 0, j = 0; i < node_max; i++) { 2043 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2044 tmp_vec2[i] = nodesz; 2045 j += nodesz; 2046 } 2047 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2048 /* Now use the correct ns */ 2049 ns = tmp_vec2; 2050 2051 do { 2052 sctx.newshift = PETSC_FALSE; 2053 /* Now loop over each block-row, and do the factorization */ 2054 for (i = 0, row = 0; i < node_max; i++) { 2055 nodesz = ns[i]; 2056 nz = bi[row + 1] - bi[row]; 2057 bjtmp = bj + bi[row]; 2058 2059 switch (nodesz) { 2060 case 1: 2061 for (j = 0; j < nz; j++) { 2062 idx = bjtmp[j]; 2063 rtmp11[idx] = 0.0; 2064 } 2065 2066 /* load in initial (unfactored row) */ 2067 idx = r[row]; 2068 nz_tmp = ai[idx + 1] - ai[idx]; 2069 ajtmp = aj + ai[idx]; 2070 v1 = aa + ai[idx]; 2071 2072 for (j = 0; j < nz_tmp; j++) { 2073 idx = ics[ajtmp[j]]; 2074 rtmp11[idx] = v1[j]; 2075 } 2076 rtmp11[ics[r[row]]] += sctx.shift_amount; 2077 2078 prow = *bjtmp++; 2079 while (prow < row) { 2080 pc1 = rtmp11 + prow; 2081 if (*pc1 != 0.0) { 2082 pv = ba + bd[prow]; 2083 pj = nbj + bd[prow]; 2084 mul1 = *pc1 * *pv++; 2085 *pc1 = mul1; 2086 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2087 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2088 for (j = 0; j < nz_tmp; j++) { 2089 tmp = pv[j]; 2090 idx = pj[j]; 2091 rtmp11[idx] -= mul1 * tmp; 2092 } 2093 } 2094 prow = *bjtmp++; 2095 } 2096 pj = bj + bi[row]; 2097 pc1 = ba + bi[row]; 2098 2099 sctx.pv = rtmp11[row]; 2100 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2101 rs = 0.0; 2102 for (j = 0; j < nz; j++) { 2103 idx = pj[j]; 2104 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2105 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2106 } 2107 sctx.rs = rs; 2108 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2109 if (sctx.newshift) goto endofwhile; 2110 break; 2111 2112 case 2: 2113 for (j = 0; j < nz; j++) { 2114 idx = bjtmp[j]; 2115 rtmp11[idx] = 0.0; 2116 rtmp22[idx] = 0.0; 2117 } 2118 2119 /* load in initial (unfactored row) */ 2120 idx = r[row]; 2121 nz_tmp = ai[idx + 1] - ai[idx]; 2122 ajtmp = aj + ai[idx]; 2123 v1 = aa + ai[idx]; 2124 v2 = aa + ai[idx + 1]; 2125 for (j = 0; j < nz_tmp; j++) { 2126 idx = ics[ajtmp[j]]; 2127 rtmp11[idx] = v1[j]; 2128 rtmp22[idx] = v2[j]; 2129 } 2130 rtmp11[ics[r[row]]] += sctx.shift_amount; 2131 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2132 2133 prow = *bjtmp++; 2134 while (prow < row) { 2135 pc1 = rtmp11 + prow; 2136 pc2 = rtmp22 + prow; 2137 if (*pc1 != 0.0 || *pc2 != 0.0) { 2138 pv = ba + bd[prow]; 2139 pj = nbj + bd[prow]; 2140 mul1 = *pc1 * *pv; 2141 mul2 = *pc2 * *pv; 2142 ++pv; 2143 *pc1 = mul1; 2144 *pc2 = mul2; 2145 2146 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2147 for (j = 0; j < nz_tmp; j++) { 2148 tmp = pv[j]; 2149 idx = pj[j]; 2150 rtmp11[idx] -= mul1 * tmp; 2151 rtmp22[idx] -= mul2 * tmp; 2152 } 2153 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2154 } 2155 prow = *bjtmp++; 2156 } 2157 2158 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2159 pc1 = rtmp11 + prow; 2160 pc2 = rtmp22 + prow; 2161 2162 sctx.pv = *pc1; 2163 pj = bj + bi[prow]; 2164 rs = 0.0; 2165 for (j = 0; j < nz; j++) { 2166 idx = pj[j]; 2167 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2168 } 2169 sctx.rs = rs; 2170 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2171 if (sctx.newshift) goto endofwhile; 2172 2173 if (*pc2 != 0.0) { 2174 pj = nbj + bd[prow]; 2175 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2176 *pc2 = mul2; 2177 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2178 for (j = 0; j < nz_tmp; j++) { 2179 idx = pj[j]; 2180 tmp = rtmp11[idx]; 2181 rtmp22[idx] -= mul2 * tmp; 2182 } 2183 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2184 } 2185 2186 pj = bj + bi[row]; 2187 pc1 = ba + bi[row]; 2188 pc2 = ba + bi[row + 1]; 2189 2190 sctx.pv = rtmp22[row + 1]; 2191 rs = 0.0; 2192 rtmp11[row] = 1.0 / rtmp11[row]; 2193 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2194 /* copy row entries from dense representation to sparse */ 2195 for (j = 0; j < nz; j++) { 2196 idx = pj[j]; 2197 pc1[j] = rtmp11[idx]; 2198 pc2[j] = rtmp22[idx]; 2199 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2200 } 2201 sctx.rs = rs; 2202 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2203 if (sctx.newshift) goto endofwhile; 2204 break; 2205 2206 case 3: 2207 for (j = 0; j < nz; j++) { 2208 idx = bjtmp[j]; 2209 rtmp11[idx] = 0.0; 2210 rtmp22[idx] = 0.0; 2211 rtmp33[idx] = 0.0; 2212 } 2213 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2214 idx = r[row]; 2215 nz_tmp = ai[idx + 1] - ai[idx]; 2216 ajtmp = aj + ai[idx]; 2217 v1 = aa + ai[idx]; 2218 v2 = aa + ai[idx + 1]; 2219 v3 = aa + ai[idx + 2]; 2220 for (j = 0; j < nz_tmp; j++) { 2221 idx = ics[ajtmp[j]]; 2222 rtmp11[idx] = v1[j]; 2223 rtmp22[idx] = v2[j]; 2224 rtmp33[idx] = v3[j]; 2225 } 2226 rtmp11[ics[r[row]]] += sctx.shift_amount; 2227 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2228 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2229 2230 /* loop over all pivot row blocks above this row block */ 2231 prow = *bjtmp++; 2232 while (prow < row) { 2233 pc1 = rtmp11 + prow; 2234 pc2 = rtmp22 + prow; 2235 pc3 = rtmp33 + prow; 2236 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2237 pv = ba + bd[prow]; 2238 pj = nbj + bd[prow]; 2239 mul1 = *pc1 * *pv; 2240 mul2 = *pc2 * *pv; 2241 mul3 = *pc3 * *pv; 2242 ++pv; 2243 *pc1 = mul1; 2244 *pc2 = mul2; 2245 *pc3 = mul3; 2246 2247 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2248 /* update this row based on pivot row */ 2249 for (j = 0; j < nz_tmp; j++) { 2250 tmp = pv[j]; 2251 idx = pj[j]; 2252 rtmp11[idx] -= mul1 * tmp; 2253 rtmp22[idx] -= mul2 * tmp; 2254 rtmp33[idx] -= mul3 * tmp; 2255 } 2256 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2257 } 2258 prow = *bjtmp++; 2259 } 2260 2261 /* Now take care of diagonal 3x3 block in this set of rows */ 2262 /* note: prow = row here */ 2263 pc1 = rtmp11 + prow; 2264 pc2 = rtmp22 + prow; 2265 pc3 = rtmp33 + prow; 2266 2267 sctx.pv = *pc1; 2268 pj = bj + bi[prow]; 2269 rs = 0.0; 2270 for (j = 0; j < nz; j++) { 2271 idx = pj[j]; 2272 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2273 } 2274 sctx.rs = rs; 2275 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2276 if (sctx.newshift) goto endofwhile; 2277 2278 if (*pc2 != 0.0 || *pc3 != 0.0) { 2279 mul2 = (*pc2) / (*pc1); 2280 mul3 = (*pc3) / (*pc1); 2281 *pc2 = mul2; 2282 *pc3 = mul3; 2283 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2284 pj = nbj + bd[prow]; 2285 for (j = 0; j < nz_tmp; j++) { 2286 idx = pj[j]; 2287 tmp = rtmp11[idx]; 2288 rtmp22[idx] -= mul2 * tmp; 2289 rtmp33[idx] -= mul3 * tmp; 2290 } 2291 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2292 } 2293 ++prow; 2294 2295 pc2 = rtmp22 + prow; 2296 pc3 = rtmp33 + prow; 2297 sctx.pv = *pc2; 2298 pj = bj + bi[prow]; 2299 rs = 0.0; 2300 for (j = 0; j < nz; j++) { 2301 idx = pj[j]; 2302 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2303 } 2304 sctx.rs = rs; 2305 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2306 if (sctx.newshift) goto endofwhile; 2307 2308 if (*pc3 != 0.0) { 2309 mul3 = (*pc3) / (*pc2); 2310 *pc3 = mul3; 2311 pj = nbj + bd[prow]; 2312 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2313 for (j = 0; j < nz_tmp; j++) { 2314 idx = pj[j]; 2315 tmp = rtmp22[idx]; 2316 rtmp33[idx] -= mul3 * tmp; 2317 } 2318 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2319 } 2320 2321 pj = bj + bi[row]; 2322 pc1 = ba + bi[row]; 2323 pc2 = ba + bi[row + 1]; 2324 pc3 = ba + bi[row + 2]; 2325 2326 sctx.pv = rtmp33[row + 2]; 2327 rs = 0.0; 2328 rtmp11[row] = 1.0 / rtmp11[row]; 2329 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2330 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2331 /* copy row entries from dense representation to sparse */ 2332 for (j = 0; j < nz; j++) { 2333 idx = pj[j]; 2334 pc1[j] = rtmp11[idx]; 2335 pc2[j] = rtmp22[idx]; 2336 pc3[j] = rtmp33[idx]; 2337 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2338 } 2339 2340 sctx.rs = rs; 2341 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2342 if (sctx.newshift) goto endofwhile; 2343 break; 2344 2345 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2346 } 2347 row += nodesz; /* Update the row */ 2348 } 2349 endofwhile:; 2350 } while (sctx.newshift); 2351 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2352 PetscCall(PetscFree(tmp_vec2)); 2353 PetscCall(ISRestoreIndices(isicol, &ic)); 2354 PetscCall(ISRestoreIndices(isrow, &r)); 2355 PetscCall(ISRestoreIndices(iscol, &c)); 2356 2357 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2358 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2359 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2360 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2361 C->assembled = PETSC_TRUE; 2362 C->preallocated = PETSC_TRUE; 2363 if (sctx.nshift) { 2364 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2365 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2366 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2367 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2368 } 2369 } 2370 PetscCall(PetscLogFlops(C->cmap->n)); 2371 PetscCall(MatSeqAIJCheckInode(C)); 2372 PetscFunctionReturn(0); 2373 } 2374 2375 /* ----------------------------------------------------------- */ 2376 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) { 2377 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2378 IS iscol = a->col, isrow = a->row; 2379 const PetscInt *r, *c, *rout, *cout; 2380 PetscInt i, j, n = A->rmap->n; 2381 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2382 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2383 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2384 PetscScalar sum1, sum2, sum3, sum4, sum5; 2385 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2386 const PetscScalar *b; 2387 2388 PetscFunctionBegin; 2389 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2390 node_max = a->inode.node_count; 2391 ns = a->inode.size; /* Node Size array */ 2392 2393 PetscCall(VecGetArrayRead(bb, &b)); 2394 PetscCall(VecGetArrayWrite(xx, &x)); 2395 tmp = a->solve_work; 2396 2397 PetscCall(ISGetIndices(isrow, &rout)); 2398 r = rout; 2399 PetscCall(ISGetIndices(iscol, &cout)); 2400 c = cout; 2401 2402 /* forward solve the lower triangular */ 2403 tmps = tmp; 2404 aa = a_a; 2405 aj = a_j; 2406 ad = a->diag; 2407 2408 for (i = 0, row = 0; i < node_max; ++i) { 2409 nsz = ns[i]; 2410 aii = ai[row]; 2411 v1 = aa + aii; 2412 vi = aj + aii; 2413 nz = ai[row + 1] - ai[row]; 2414 2415 if (i < node_max - 1) { 2416 /* Prefetch the indices for the next block */ 2417 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2418 /* Prefetch the data for the next block */ 2419 PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2420 } 2421 2422 switch (nsz) { /* Each loop in 'case' is unrolled */ 2423 case 1: 2424 sum1 = b[r[row]]; 2425 for (j = 0; j < nz - 1; j += 2) { 2426 i0 = vi[j]; 2427 i1 = vi[j + 1]; 2428 tmp0 = tmps[i0]; 2429 tmp1 = tmps[i1]; 2430 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2431 } 2432 if (j == nz - 1) { 2433 tmp0 = tmps[vi[j]]; 2434 sum1 -= v1[j] * tmp0; 2435 } 2436 tmp[row++] = sum1; 2437 break; 2438 case 2: 2439 sum1 = b[r[row]]; 2440 sum2 = b[r[row + 1]]; 2441 v2 = aa + ai[row + 1]; 2442 2443 for (j = 0; j < nz - 1; j += 2) { 2444 i0 = vi[j]; 2445 i1 = vi[j + 1]; 2446 tmp0 = tmps[i0]; 2447 tmp1 = tmps[i1]; 2448 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2449 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2450 } 2451 if (j == nz - 1) { 2452 tmp0 = tmps[vi[j]]; 2453 sum1 -= v1[j] * tmp0; 2454 sum2 -= v2[j] * tmp0; 2455 } 2456 sum2 -= v2[nz] * sum1; 2457 tmp[row++] = sum1; 2458 tmp[row++] = sum2; 2459 break; 2460 case 3: 2461 sum1 = b[r[row]]; 2462 sum2 = b[r[row + 1]]; 2463 sum3 = b[r[row + 2]]; 2464 v2 = aa + ai[row + 1]; 2465 v3 = aa + ai[row + 2]; 2466 2467 for (j = 0; j < nz - 1; j += 2) { 2468 i0 = vi[j]; 2469 i1 = vi[j + 1]; 2470 tmp0 = tmps[i0]; 2471 tmp1 = tmps[i1]; 2472 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2473 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2474 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2475 } 2476 if (j == nz - 1) { 2477 tmp0 = tmps[vi[j]]; 2478 sum1 -= v1[j] * tmp0; 2479 sum2 -= v2[j] * tmp0; 2480 sum3 -= v3[j] * tmp0; 2481 } 2482 sum2 -= v2[nz] * sum1; 2483 sum3 -= v3[nz] * sum1; 2484 sum3 -= v3[nz + 1] * sum2; 2485 tmp[row++] = sum1; 2486 tmp[row++] = sum2; 2487 tmp[row++] = sum3; 2488 break; 2489 2490 case 4: 2491 sum1 = b[r[row]]; 2492 sum2 = b[r[row + 1]]; 2493 sum3 = b[r[row + 2]]; 2494 sum4 = b[r[row + 3]]; 2495 v2 = aa + ai[row + 1]; 2496 v3 = aa + ai[row + 2]; 2497 v4 = aa + ai[row + 3]; 2498 2499 for (j = 0; j < nz - 1; j += 2) { 2500 i0 = vi[j]; 2501 i1 = vi[j + 1]; 2502 tmp0 = tmps[i0]; 2503 tmp1 = tmps[i1]; 2504 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2505 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2506 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2507 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2508 } 2509 if (j == nz - 1) { 2510 tmp0 = tmps[vi[j]]; 2511 sum1 -= v1[j] * tmp0; 2512 sum2 -= v2[j] * tmp0; 2513 sum3 -= v3[j] * tmp0; 2514 sum4 -= v4[j] * tmp0; 2515 } 2516 sum2 -= v2[nz] * sum1; 2517 sum3 -= v3[nz] * sum1; 2518 sum4 -= v4[nz] * sum1; 2519 sum3 -= v3[nz + 1] * sum2; 2520 sum4 -= v4[nz + 1] * sum2; 2521 sum4 -= v4[nz + 2] * sum3; 2522 2523 tmp[row++] = sum1; 2524 tmp[row++] = sum2; 2525 tmp[row++] = sum3; 2526 tmp[row++] = sum4; 2527 break; 2528 case 5: 2529 sum1 = b[r[row]]; 2530 sum2 = b[r[row + 1]]; 2531 sum3 = b[r[row + 2]]; 2532 sum4 = b[r[row + 3]]; 2533 sum5 = b[r[row + 4]]; 2534 v2 = aa + ai[row + 1]; 2535 v3 = aa + ai[row + 2]; 2536 v4 = aa + ai[row + 3]; 2537 v5 = aa + ai[row + 4]; 2538 2539 for (j = 0; j < nz - 1; j += 2) { 2540 i0 = vi[j]; 2541 i1 = vi[j + 1]; 2542 tmp0 = tmps[i0]; 2543 tmp1 = tmps[i1]; 2544 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2545 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2546 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2547 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2548 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2549 } 2550 if (j == nz - 1) { 2551 tmp0 = tmps[vi[j]]; 2552 sum1 -= v1[j] * tmp0; 2553 sum2 -= v2[j] * tmp0; 2554 sum3 -= v3[j] * tmp0; 2555 sum4 -= v4[j] * tmp0; 2556 sum5 -= v5[j] * tmp0; 2557 } 2558 2559 sum2 -= v2[nz] * sum1; 2560 sum3 -= v3[nz] * sum1; 2561 sum4 -= v4[nz] * sum1; 2562 sum5 -= v5[nz] * sum1; 2563 sum3 -= v3[nz + 1] * sum2; 2564 sum4 -= v4[nz + 1] * sum2; 2565 sum5 -= v5[nz + 1] * sum2; 2566 sum4 -= v4[nz + 2] * sum3; 2567 sum5 -= v5[nz + 2] * sum3; 2568 sum5 -= v5[nz + 3] * sum4; 2569 2570 tmp[row++] = sum1; 2571 tmp[row++] = sum2; 2572 tmp[row++] = sum3; 2573 tmp[row++] = sum4; 2574 tmp[row++] = sum5; 2575 break; 2576 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2577 } 2578 } 2579 /* backward solve the upper triangular */ 2580 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 2581 nsz = ns[i]; 2582 aii = ad[row + 1] + 1; 2583 v1 = aa + aii; 2584 vi = aj + aii; 2585 nz = ad[row] - ad[row + 1] - 1; 2586 2587 if (i > 0) { 2588 /* Prefetch the indices for the next block */ 2589 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2590 /* Prefetch the data for the next block */ 2591 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2592 } 2593 2594 switch (nsz) { /* Each loop in 'case' is unrolled */ 2595 case 1: 2596 sum1 = tmp[row]; 2597 2598 for (j = 0; j < nz - 1; j += 2) { 2599 i0 = vi[j]; 2600 i1 = vi[j + 1]; 2601 tmp0 = tmps[i0]; 2602 tmp1 = tmps[i1]; 2603 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2604 } 2605 if (j == nz - 1) { 2606 tmp0 = tmps[vi[j]]; 2607 sum1 -= v1[j] * tmp0; 2608 } 2609 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2610 row--; 2611 break; 2612 case 2: 2613 sum1 = tmp[row]; 2614 sum2 = tmp[row - 1]; 2615 v2 = aa + ad[row] + 1; 2616 for (j = 0; j < nz - 1; j += 2) { 2617 i0 = vi[j]; 2618 i1 = vi[j + 1]; 2619 tmp0 = tmps[i0]; 2620 tmp1 = tmps[i1]; 2621 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2622 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2623 } 2624 if (j == nz - 1) { 2625 tmp0 = tmps[vi[j]]; 2626 sum1 -= v1[j] * tmp0; 2627 sum2 -= v2[j + 1] * tmp0; 2628 } 2629 2630 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2631 row--; 2632 sum2 -= v2[0] * tmp0; 2633 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2634 row--; 2635 break; 2636 case 3: 2637 sum1 = tmp[row]; 2638 sum2 = tmp[row - 1]; 2639 sum3 = tmp[row - 2]; 2640 v2 = aa + ad[row] + 1; 2641 v3 = aa + ad[row - 1] + 1; 2642 for (j = 0; j < nz - 1; j += 2) { 2643 i0 = vi[j]; 2644 i1 = vi[j + 1]; 2645 tmp0 = tmps[i0]; 2646 tmp1 = tmps[i1]; 2647 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2648 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2649 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2650 } 2651 if (j == nz - 1) { 2652 tmp0 = tmps[vi[j]]; 2653 sum1 -= v1[j] * tmp0; 2654 sum2 -= v2[j + 1] * tmp0; 2655 sum3 -= v3[j + 2] * tmp0; 2656 } 2657 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2658 row--; 2659 sum2 -= v2[0] * tmp0; 2660 sum3 -= v3[1] * tmp0; 2661 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2662 row--; 2663 sum3 -= v3[0] * tmp0; 2664 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2665 row--; 2666 2667 break; 2668 case 4: 2669 sum1 = tmp[row]; 2670 sum2 = tmp[row - 1]; 2671 sum3 = tmp[row - 2]; 2672 sum4 = tmp[row - 3]; 2673 v2 = aa + ad[row] + 1; 2674 v3 = aa + ad[row - 1] + 1; 2675 v4 = aa + ad[row - 2] + 1; 2676 2677 for (j = 0; j < nz - 1; j += 2) { 2678 i0 = vi[j]; 2679 i1 = vi[j + 1]; 2680 tmp0 = tmps[i0]; 2681 tmp1 = tmps[i1]; 2682 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2683 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2684 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2685 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2686 } 2687 if (j == nz - 1) { 2688 tmp0 = tmps[vi[j]]; 2689 sum1 -= v1[j] * tmp0; 2690 sum2 -= v2[j + 1] * tmp0; 2691 sum3 -= v3[j + 2] * tmp0; 2692 sum4 -= v4[j + 3] * tmp0; 2693 } 2694 2695 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2696 row--; 2697 sum2 -= v2[0] * tmp0; 2698 sum3 -= v3[1] * tmp0; 2699 sum4 -= v4[2] * tmp0; 2700 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2701 row--; 2702 sum3 -= v3[0] * tmp0; 2703 sum4 -= v4[1] * tmp0; 2704 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2705 row--; 2706 sum4 -= v4[0] * tmp0; 2707 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2708 row--; 2709 break; 2710 case 5: 2711 sum1 = tmp[row]; 2712 sum2 = tmp[row - 1]; 2713 sum3 = tmp[row - 2]; 2714 sum4 = tmp[row - 3]; 2715 sum5 = tmp[row - 4]; 2716 v2 = aa + ad[row] + 1; 2717 v3 = aa + ad[row - 1] + 1; 2718 v4 = aa + ad[row - 2] + 1; 2719 v5 = aa + ad[row - 3] + 1; 2720 for (j = 0; j < nz - 1; j += 2) { 2721 i0 = vi[j]; 2722 i1 = vi[j + 1]; 2723 tmp0 = tmps[i0]; 2724 tmp1 = tmps[i1]; 2725 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2726 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2727 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2728 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2729 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2730 } 2731 if (j == nz - 1) { 2732 tmp0 = tmps[vi[j]]; 2733 sum1 -= v1[j] * tmp0; 2734 sum2 -= v2[j + 1] * tmp0; 2735 sum3 -= v3[j + 2] * tmp0; 2736 sum4 -= v4[j + 3] * tmp0; 2737 sum5 -= v5[j + 4] * tmp0; 2738 } 2739 2740 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2741 row--; 2742 sum2 -= v2[0] * tmp0; 2743 sum3 -= v3[1] * tmp0; 2744 sum4 -= v4[2] * tmp0; 2745 sum5 -= v5[3] * tmp0; 2746 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2747 row--; 2748 sum3 -= v3[0] * tmp0; 2749 sum4 -= v4[1] * tmp0; 2750 sum5 -= v5[2] * tmp0; 2751 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2752 row--; 2753 sum4 -= v4[0] * tmp0; 2754 sum5 -= v5[1] * tmp0; 2755 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2756 row--; 2757 sum5 -= v5[0] * tmp0; 2758 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2759 row--; 2760 break; 2761 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2762 } 2763 } 2764 PetscCall(ISRestoreIndices(isrow, &rout)); 2765 PetscCall(ISRestoreIndices(iscol, &cout)); 2766 PetscCall(VecRestoreArrayRead(bb, &b)); 2767 PetscCall(VecRestoreArrayWrite(xx, &x)); 2768 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2769 PetscFunctionReturn(0); 2770 } 2771 2772 /* 2773 Makes a longer coloring[] array and calls the usual code with that 2774 */ 2775 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) { 2776 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2777 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row; 2778 PetscInt *colorused, i; 2779 ISColoringValue *newcolor; 2780 2781 PetscFunctionBegin; 2782 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2783 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2784 /* loop over inodes, marking a color for each column*/ 2785 row = 0; 2786 for (i = 0; i < m; i++) { 2787 for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors; 2788 } 2789 2790 /* eliminate unneeded colors */ 2791 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2792 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2793 2794 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2795 ncolors = colorused[5 * ncolors - 1]; 2796 for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1; 2797 PetscCall(PetscFree(colorused)); 2798 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2799 PetscCall(PetscFree(coloring)); 2800 PetscFunctionReturn(0); 2801 } 2802 2803 #include <petsc/private/kernels/blockinvert.h> 2804 2805 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) { 2806 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2807 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2808 MatScalar *ibdiag, *bdiag, work[25], *t; 2809 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2810 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2811 const PetscScalar *xb, *b; 2812 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2813 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2; 2814 PetscInt sz, k, ipvt[5]; 2815 PetscBool allowzeropivot, zeropivotdetected; 2816 const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i; 2817 2818 PetscFunctionBegin; 2819 allowzeropivot = PetscNot(A->erroriffailure); 2820 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2821 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2822 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2823 2824 if (!a->inode.ibdiagvalid) { 2825 if (!a->inode.ibdiag) { 2826 /* calculate space needed for diagonal blocks */ 2827 for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i]; 2828 a->inode.bdiagsize = cnt; 2829 2830 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2831 } 2832 2833 /* copy over the diagonal blocks and invert them */ 2834 ibdiag = a->inode.ibdiag; 2835 bdiag = a->inode.bdiag; 2836 cnt = 0; 2837 for (i = 0, row = 0; i < m; i++) { 2838 for (j = 0; j < sizes[i]; j++) { 2839 for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k]; 2840 } 2841 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i])); 2842 2843 switch (sizes[i]) { 2844 case 1: 2845 /* Create matrix data structure */ 2846 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2847 if (allowzeropivot) { 2848 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2849 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2850 A->factorerror_zeropivot_row = row; 2851 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2852 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2853 } 2854 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2855 break; 2856 case 2: 2857 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2858 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2859 break; 2860 case 3: 2861 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2862 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2863 break; 2864 case 4: 2865 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2866 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2867 break; 2868 case 5: 2869 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2870 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2871 break; 2872 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 2873 } 2874 cnt += sizes[i] * sizes[i]; 2875 row += sizes[i]; 2876 } 2877 a->inode.ibdiagvalid = PETSC_TRUE; 2878 } 2879 ibdiag = a->inode.ibdiag; 2880 bdiag = a->inode.bdiag; 2881 t = a->inode.ssor_work; 2882 2883 PetscCall(VecGetArray(xx, &x)); 2884 PetscCall(VecGetArrayRead(bb, &b)); 2885 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2886 if (flag & SOR_ZERO_INITIAL_GUESS) { 2887 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2888 for (i = 0, row = 0; i < m; i++) { 2889 sz = diag[row] - ii[row]; 2890 v1 = a->a + ii[row]; 2891 idx = a->j + ii[row]; 2892 2893 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2894 switch (sizes[i]) { 2895 case 1: 2896 2897 sum1 = b[row]; 2898 for (n = 0; n < sz - 1; n += 2) { 2899 i1 = idx[0]; 2900 i2 = idx[1]; 2901 idx += 2; 2902 tmp0 = x[i1]; 2903 tmp1 = x[i2]; 2904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2905 v1 += 2; 2906 } 2907 2908 if (n == sz - 1) { 2909 tmp0 = x[*idx]; 2910 sum1 -= *v1 * tmp0; 2911 } 2912 t[row] = sum1; 2913 x[row++] = sum1 * (*ibdiag++); 2914 break; 2915 case 2: 2916 v2 = a->a + ii[row + 1]; 2917 sum1 = b[row]; 2918 sum2 = b[row + 1]; 2919 for (n = 0; n < sz - 1; n += 2) { 2920 i1 = idx[0]; 2921 i2 = idx[1]; 2922 idx += 2; 2923 tmp0 = x[i1]; 2924 tmp1 = x[i2]; 2925 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2926 v1 += 2; 2927 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2928 v2 += 2; 2929 } 2930 2931 if (n == sz - 1) { 2932 tmp0 = x[*idx]; 2933 sum1 -= v1[0] * tmp0; 2934 sum2 -= v2[0] * tmp0; 2935 } 2936 t[row] = sum1; 2937 t[row + 1] = sum2; 2938 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2939 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2940 ibdiag += 4; 2941 break; 2942 case 3: 2943 v2 = a->a + ii[row + 1]; 2944 v3 = a->a + ii[row + 2]; 2945 sum1 = b[row]; 2946 sum2 = b[row + 1]; 2947 sum3 = b[row + 2]; 2948 for (n = 0; n < sz - 1; n += 2) { 2949 i1 = idx[0]; 2950 i2 = idx[1]; 2951 idx += 2; 2952 tmp0 = x[i1]; 2953 tmp1 = x[i2]; 2954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2955 v1 += 2; 2956 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2957 v2 += 2; 2958 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2959 v3 += 2; 2960 } 2961 2962 if (n == sz - 1) { 2963 tmp0 = x[*idx]; 2964 sum1 -= v1[0] * tmp0; 2965 sum2 -= v2[0] * tmp0; 2966 sum3 -= v3[0] * tmp0; 2967 } 2968 t[row] = sum1; 2969 t[row + 1] = sum2; 2970 t[row + 2] = sum3; 2971 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 2972 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 2973 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 2974 ibdiag += 9; 2975 break; 2976 case 4: 2977 v2 = a->a + ii[row + 1]; 2978 v3 = a->a + ii[row + 2]; 2979 v4 = a->a + ii[row + 3]; 2980 sum1 = b[row]; 2981 sum2 = b[row + 1]; 2982 sum3 = b[row + 2]; 2983 sum4 = b[row + 3]; 2984 for (n = 0; n < sz - 1; n += 2) { 2985 i1 = idx[0]; 2986 i2 = idx[1]; 2987 idx += 2; 2988 tmp0 = x[i1]; 2989 tmp1 = x[i2]; 2990 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2991 v1 += 2; 2992 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2993 v2 += 2; 2994 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2995 v3 += 2; 2996 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 2997 v4 += 2; 2998 } 2999 3000 if (n == sz - 1) { 3001 tmp0 = x[*idx]; 3002 sum1 -= v1[0] * tmp0; 3003 sum2 -= v2[0] * tmp0; 3004 sum3 -= v3[0] * tmp0; 3005 sum4 -= v4[0] * tmp0; 3006 } 3007 t[row] = sum1; 3008 t[row + 1] = sum2; 3009 t[row + 2] = sum3; 3010 t[row + 3] = sum4; 3011 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3012 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3013 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3014 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3015 ibdiag += 16; 3016 break; 3017 case 5: 3018 v2 = a->a + ii[row + 1]; 3019 v3 = a->a + ii[row + 2]; 3020 v4 = a->a + ii[row + 3]; 3021 v5 = a->a + ii[row + 4]; 3022 sum1 = b[row]; 3023 sum2 = b[row + 1]; 3024 sum3 = b[row + 2]; 3025 sum4 = b[row + 3]; 3026 sum5 = b[row + 4]; 3027 for (n = 0; n < sz - 1; n += 2) { 3028 i1 = idx[0]; 3029 i2 = idx[1]; 3030 idx += 2; 3031 tmp0 = x[i1]; 3032 tmp1 = x[i2]; 3033 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3034 v1 += 2; 3035 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3036 v2 += 2; 3037 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3038 v3 += 2; 3039 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3040 v4 += 2; 3041 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3042 v5 += 2; 3043 } 3044 3045 if (n == sz - 1) { 3046 tmp0 = x[*idx]; 3047 sum1 -= v1[0] * tmp0; 3048 sum2 -= v2[0] * tmp0; 3049 sum3 -= v3[0] * tmp0; 3050 sum4 -= v4[0] * tmp0; 3051 sum5 -= v5[0] * tmp0; 3052 } 3053 t[row] = sum1; 3054 t[row + 1] = sum2; 3055 t[row + 2] = sum3; 3056 t[row + 3] = sum4; 3057 t[row + 4] = sum5; 3058 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3059 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3060 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3061 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3062 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3063 ibdiag += 25; 3064 break; 3065 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3066 } 3067 } 3068 3069 xb = t; 3070 PetscCall(PetscLogFlops(a->nz)); 3071 } else xb = b; 3072 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3073 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3074 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3075 ibdiag -= sizes[i] * sizes[i]; 3076 sz = ii[row + 1] - diag[row] - 1; 3077 v1 = a->a + diag[row] + 1; 3078 idx = a->j + diag[row] + 1; 3079 3080 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3081 switch (sizes[i]) { 3082 case 1: 3083 3084 sum1 = xb[row]; 3085 for (n = 0; n < sz - 1; n += 2) { 3086 i1 = idx[0]; 3087 i2 = idx[1]; 3088 idx += 2; 3089 tmp0 = x[i1]; 3090 tmp1 = x[i2]; 3091 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3092 v1 += 2; 3093 } 3094 3095 if (n == sz - 1) { 3096 tmp0 = x[*idx]; 3097 sum1 -= *v1 * tmp0; 3098 } 3099 x[row--] = sum1 * (*ibdiag); 3100 break; 3101 3102 case 2: 3103 3104 sum1 = xb[row]; 3105 sum2 = xb[row - 1]; 3106 /* note that sum1 is associated with the second of the two rows */ 3107 v2 = a->a + diag[row - 1] + 2; 3108 for (n = 0; n < sz - 1; n += 2) { 3109 i1 = idx[0]; 3110 i2 = idx[1]; 3111 idx += 2; 3112 tmp0 = x[i1]; 3113 tmp1 = x[i2]; 3114 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3115 v1 += 2; 3116 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3117 v2 += 2; 3118 } 3119 3120 if (n == sz - 1) { 3121 tmp0 = x[*idx]; 3122 sum1 -= *v1 * tmp0; 3123 sum2 -= *v2 * tmp0; 3124 } 3125 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3126 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3127 break; 3128 case 3: 3129 3130 sum1 = xb[row]; 3131 sum2 = xb[row - 1]; 3132 sum3 = xb[row - 2]; 3133 v2 = a->a + diag[row - 1] + 2; 3134 v3 = a->a + diag[row - 2] + 3; 3135 for (n = 0; n < sz - 1; n += 2) { 3136 i1 = idx[0]; 3137 i2 = idx[1]; 3138 idx += 2; 3139 tmp0 = x[i1]; 3140 tmp1 = x[i2]; 3141 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3142 v1 += 2; 3143 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3144 v2 += 2; 3145 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3146 v3 += 2; 3147 } 3148 3149 if (n == sz - 1) { 3150 tmp0 = x[*idx]; 3151 sum1 -= *v1 * tmp0; 3152 sum2 -= *v2 * tmp0; 3153 sum3 -= *v3 * tmp0; 3154 } 3155 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3156 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3157 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3158 break; 3159 case 4: 3160 3161 sum1 = xb[row]; 3162 sum2 = xb[row - 1]; 3163 sum3 = xb[row - 2]; 3164 sum4 = xb[row - 3]; 3165 v2 = a->a + diag[row - 1] + 2; 3166 v3 = a->a + diag[row - 2] + 3; 3167 v4 = a->a + diag[row - 3] + 4; 3168 for (n = 0; n < sz - 1; n += 2) { 3169 i1 = idx[0]; 3170 i2 = idx[1]; 3171 idx += 2; 3172 tmp0 = x[i1]; 3173 tmp1 = x[i2]; 3174 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3175 v1 += 2; 3176 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3177 v2 += 2; 3178 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3179 v3 += 2; 3180 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3181 v4 += 2; 3182 } 3183 3184 if (n == sz - 1) { 3185 tmp0 = x[*idx]; 3186 sum1 -= *v1 * tmp0; 3187 sum2 -= *v2 * tmp0; 3188 sum3 -= *v3 * tmp0; 3189 sum4 -= *v4 * tmp0; 3190 } 3191 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3192 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3193 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3194 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3195 break; 3196 case 5: 3197 3198 sum1 = xb[row]; 3199 sum2 = xb[row - 1]; 3200 sum3 = xb[row - 2]; 3201 sum4 = xb[row - 3]; 3202 sum5 = xb[row - 4]; 3203 v2 = a->a + diag[row - 1] + 2; 3204 v3 = a->a + diag[row - 2] + 3; 3205 v4 = a->a + diag[row - 3] + 4; 3206 v5 = a->a + diag[row - 4] + 5; 3207 for (n = 0; n < sz - 1; n += 2) { 3208 i1 = idx[0]; 3209 i2 = idx[1]; 3210 idx += 2; 3211 tmp0 = x[i1]; 3212 tmp1 = x[i2]; 3213 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3214 v1 += 2; 3215 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3216 v2 += 2; 3217 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3218 v3 += 2; 3219 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3220 v4 += 2; 3221 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3222 v5 += 2; 3223 } 3224 3225 if (n == sz - 1) { 3226 tmp0 = x[*idx]; 3227 sum1 -= *v1 * tmp0; 3228 sum2 -= *v2 * tmp0; 3229 sum3 -= *v3 * tmp0; 3230 sum4 -= *v4 * tmp0; 3231 sum5 -= *v5 * tmp0; 3232 } 3233 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3234 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3235 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3236 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3237 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3238 break; 3239 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3240 } 3241 } 3242 3243 PetscCall(PetscLogFlops(a->nz)); 3244 } 3245 its--; 3246 } 3247 while (its--) { 3248 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3249 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) { 3250 sz = diag[row] - ii[row]; 3251 v1 = a->a + ii[row]; 3252 idx = a->j + ii[row]; 3253 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3254 switch (sizes[i]) { 3255 case 1: 3256 sum1 = b[row]; 3257 for (n = 0; n < sz - 1; n += 2) { 3258 i1 = idx[0]; 3259 i2 = idx[1]; 3260 idx += 2; 3261 tmp0 = x[i1]; 3262 tmp1 = x[i2]; 3263 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3264 v1 += 2; 3265 } 3266 if (n == sz - 1) { 3267 tmp0 = x[*idx++]; 3268 sum1 -= *v1 * tmp0; 3269 v1++; 3270 } 3271 t[row] = sum1; 3272 sz = ii[row + 1] - diag[row] - 1; 3273 idx = a->j + diag[row] + 1; 3274 v1 += 1; 3275 for (n = 0; n < sz - 1; n += 2) { 3276 i1 = idx[0]; 3277 i2 = idx[1]; 3278 idx += 2; 3279 tmp0 = x[i1]; 3280 tmp1 = x[i2]; 3281 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3282 v1 += 2; 3283 } 3284 if (n == sz - 1) { 3285 tmp0 = x[*idx++]; 3286 sum1 -= *v1 * tmp0; 3287 } 3288 /* in MatSOR_SeqAIJ this line would be 3289 * 3290 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3291 * 3292 * but omega == 1, so this becomes 3293 * 3294 * x[row] = sum1*(*ibdiag++); 3295 * 3296 */ 3297 x[row] = sum1 * (*ibdiag); 3298 break; 3299 case 2: 3300 v2 = a->a + ii[row + 1]; 3301 sum1 = b[row]; 3302 sum2 = b[row + 1]; 3303 for (n = 0; n < sz - 1; n += 2) { 3304 i1 = idx[0]; 3305 i2 = idx[1]; 3306 idx += 2; 3307 tmp0 = x[i1]; 3308 tmp1 = x[i2]; 3309 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3310 v1 += 2; 3311 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3312 v2 += 2; 3313 } 3314 if (n == sz - 1) { 3315 tmp0 = x[*idx++]; 3316 sum1 -= v1[0] * tmp0; 3317 sum2 -= v2[0] * tmp0; 3318 v1++; 3319 v2++; 3320 } 3321 t[row] = sum1; 3322 t[row + 1] = sum2; 3323 sz = ii[row + 1] - diag[row] - 2; 3324 idx = a->j + diag[row] + 2; 3325 v1 += 2; 3326 v2 += 2; 3327 for (n = 0; n < sz - 1; n += 2) { 3328 i1 = idx[0]; 3329 i2 = idx[1]; 3330 idx += 2; 3331 tmp0 = x[i1]; 3332 tmp1 = x[i2]; 3333 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3334 v1 += 2; 3335 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3336 v2 += 2; 3337 } 3338 if (n == sz - 1) { 3339 tmp0 = x[*idx]; 3340 sum1 -= v1[0] * tmp0; 3341 sum2 -= v2[0] * tmp0; 3342 } 3343 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3344 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3345 break; 3346 case 3: 3347 v2 = a->a + ii[row + 1]; 3348 v3 = a->a + ii[row + 2]; 3349 sum1 = b[row]; 3350 sum2 = b[row + 1]; 3351 sum3 = b[row + 2]; 3352 for (n = 0; n < sz - 1; n += 2) { 3353 i1 = idx[0]; 3354 i2 = idx[1]; 3355 idx += 2; 3356 tmp0 = x[i1]; 3357 tmp1 = x[i2]; 3358 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3359 v1 += 2; 3360 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3361 v2 += 2; 3362 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3363 v3 += 2; 3364 } 3365 if (n == sz - 1) { 3366 tmp0 = x[*idx++]; 3367 sum1 -= v1[0] * tmp0; 3368 sum2 -= v2[0] * tmp0; 3369 sum3 -= v3[0] * tmp0; 3370 v1++; 3371 v2++; 3372 v3++; 3373 } 3374 t[row] = sum1; 3375 t[row + 1] = sum2; 3376 t[row + 2] = sum3; 3377 sz = ii[row + 1] - diag[row] - 3; 3378 idx = a->j + diag[row] + 3; 3379 v1 += 3; 3380 v2 += 3; 3381 v3 += 3; 3382 for (n = 0; n < sz - 1; n += 2) { 3383 i1 = idx[0]; 3384 i2 = idx[1]; 3385 idx += 2; 3386 tmp0 = x[i1]; 3387 tmp1 = x[i2]; 3388 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3389 v1 += 2; 3390 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3391 v2 += 2; 3392 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3393 v3 += 2; 3394 } 3395 if (n == sz - 1) { 3396 tmp0 = x[*idx]; 3397 sum1 -= v1[0] * tmp0; 3398 sum2 -= v2[0] * tmp0; 3399 sum3 -= v3[0] * tmp0; 3400 } 3401 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3402 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3403 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3404 break; 3405 case 4: 3406 v2 = a->a + ii[row + 1]; 3407 v3 = a->a + ii[row + 2]; 3408 v4 = a->a + ii[row + 3]; 3409 sum1 = b[row]; 3410 sum2 = b[row + 1]; 3411 sum3 = b[row + 2]; 3412 sum4 = b[row + 3]; 3413 for (n = 0; n < sz - 1; n += 2) { 3414 i1 = idx[0]; 3415 i2 = idx[1]; 3416 idx += 2; 3417 tmp0 = x[i1]; 3418 tmp1 = x[i2]; 3419 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3420 v1 += 2; 3421 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3422 v2 += 2; 3423 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3424 v3 += 2; 3425 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3426 v4 += 2; 3427 } 3428 if (n == sz - 1) { 3429 tmp0 = x[*idx++]; 3430 sum1 -= v1[0] * tmp0; 3431 sum2 -= v2[0] * tmp0; 3432 sum3 -= v3[0] * tmp0; 3433 sum4 -= v4[0] * tmp0; 3434 v1++; 3435 v2++; 3436 v3++; 3437 v4++; 3438 } 3439 t[row] = sum1; 3440 t[row + 1] = sum2; 3441 t[row + 2] = sum3; 3442 t[row + 3] = sum4; 3443 sz = ii[row + 1] - diag[row] - 4; 3444 idx = a->j + diag[row] + 4; 3445 v1 += 4; 3446 v2 += 4; 3447 v3 += 4; 3448 v4 += 4; 3449 for (n = 0; n < sz - 1; n += 2) { 3450 i1 = idx[0]; 3451 i2 = idx[1]; 3452 idx += 2; 3453 tmp0 = x[i1]; 3454 tmp1 = x[i2]; 3455 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3456 v1 += 2; 3457 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3458 v2 += 2; 3459 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3460 v3 += 2; 3461 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3462 v4 += 2; 3463 } 3464 if (n == sz - 1) { 3465 tmp0 = x[*idx]; 3466 sum1 -= v1[0] * tmp0; 3467 sum2 -= v2[0] * tmp0; 3468 sum3 -= v3[0] * tmp0; 3469 sum4 -= v4[0] * tmp0; 3470 } 3471 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3472 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3473 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3474 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3475 break; 3476 case 5: 3477 v2 = a->a + ii[row + 1]; 3478 v3 = a->a + ii[row + 2]; 3479 v4 = a->a + ii[row + 3]; 3480 v5 = a->a + ii[row + 4]; 3481 sum1 = b[row]; 3482 sum2 = b[row + 1]; 3483 sum3 = b[row + 2]; 3484 sum4 = b[row + 3]; 3485 sum5 = b[row + 4]; 3486 for (n = 0; n < sz - 1; n += 2) { 3487 i1 = idx[0]; 3488 i2 = idx[1]; 3489 idx += 2; 3490 tmp0 = x[i1]; 3491 tmp1 = x[i2]; 3492 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3493 v1 += 2; 3494 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3495 v2 += 2; 3496 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3497 v3 += 2; 3498 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3499 v4 += 2; 3500 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3501 v5 += 2; 3502 } 3503 if (n == sz - 1) { 3504 tmp0 = x[*idx++]; 3505 sum1 -= v1[0] * tmp0; 3506 sum2 -= v2[0] * tmp0; 3507 sum3 -= v3[0] * tmp0; 3508 sum4 -= v4[0] * tmp0; 3509 sum5 -= v5[0] * tmp0; 3510 v1++; 3511 v2++; 3512 v3++; 3513 v4++; 3514 v5++; 3515 } 3516 t[row] = sum1; 3517 t[row + 1] = sum2; 3518 t[row + 2] = sum3; 3519 t[row + 3] = sum4; 3520 t[row + 4] = sum5; 3521 sz = ii[row + 1] - diag[row] - 5; 3522 idx = a->j + diag[row] + 5; 3523 v1 += 5; 3524 v2 += 5; 3525 v3 += 5; 3526 v4 += 5; 3527 v5 += 5; 3528 for (n = 0; n < sz - 1; n += 2) { 3529 i1 = idx[0]; 3530 i2 = idx[1]; 3531 idx += 2; 3532 tmp0 = x[i1]; 3533 tmp1 = x[i2]; 3534 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3535 v1 += 2; 3536 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3537 v2 += 2; 3538 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3539 v3 += 2; 3540 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3541 v4 += 2; 3542 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3543 v5 += 2; 3544 } 3545 if (n == sz - 1) { 3546 tmp0 = x[*idx]; 3547 sum1 -= v1[0] * tmp0; 3548 sum2 -= v2[0] * tmp0; 3549 sum3 -= v3[0] * tmp0; 3550 sum4 -= v4[0] * tmp0; 3551 sum5 -= v5[0] * tmp0; 3552 } 3553 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3554 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3555 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3556 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3557 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3558 break; 3559 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3560 } 3561 } 3562 xb = t; 3563 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3564 } else xb = b; 3565 3566 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3567 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3568 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3569 ibdiag -= sizes[i] * sizes[i]; 3570 3571 /* set RHS */ 3572 if (xb == b) { 3573 /* whole (old way) */ 3574 sz = ii[row + 1] - ii[row]; 3575 idx = a->j + ii[row]; 3576 switch (sizes[i]) { 3577 case 5: v5 = a->a + ii[row - 4]; /* fall through */ 3578 case 4: v4 = a->a + ii[row - 3]; /* fall through */ 3579 case 3: v3 = a->a + ii[row - 2]; /* fall through */ 3580 case 2: v2 = a->a + ii[row - 1]; /* fall through */ 3581 case 1: v1 = a->a + ii[row]; break; 3582 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3583 } 3584 } else { 3585 /* upper, no diag */ 3586 sz = ii[row + 1] - diag[row] - 1; 3587 idx = a->j + diag[row] + 1; 3588 switch (sizes[i]) { 3589 case 5: v5 = a->a + diag[row - 4] + 5; /* fall through */ 3590 case 4: v4 = a->a + diag[row - 3] + 4; /* fall through */ 3591 case 3: v3 = a->a + diag[row - 2] + 3; /* fall through */ 3592 case 2: v2 = a->a + diag[row - 1] + 2; /* fall through */ 3593 case 1: v1 = a->a + diag[row] + 1; 3594 } 3595 } 3596 /* set sum */ 3597 switch (sizes[i]) { 3598 case 5: sum5 = xb[row - 4]; /* fall through */ 3599 case 4: sum4 = xb[row - 3]; /* fall through */ 3600 case 3: sum3 = xb[row - 2]; /* fall through */ 3601 case 2: sum2 = xb[row - 1]; /* fall through */ 3602 case 1: 3603 /* note that sum1 is associated with the last row */ 3604 sum1 = xb[row]; 3605 } 3606 /* do sums */ 3607 for (n = 0; n < sz - 1; n += 2) { 3608 i1 = idx[0]; 3609 i2 = idx[1]; 3610 idx += 2; 3611 tmp0 = x[i1]; 3612 tmp1 = x[i2]; 3613 switch (sizes[i]) { 3614 case 5: sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; /* fall through */ 3615 case 4: sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; /* fall through */ 3616 case 3: sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; /* fall through */ 3617 case 2: sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; /* fall through */ 3618 case 1: sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3619 } 3620 } 3621 /* ragged edge */ 3622 if (n == sz - 1) { 3623 tmp0 = x[*idx]; 3624 switch (sizes[i]) { 3625 case 5: sum5 -= *v5 * tmp0; /* fall through */ 3626 case 4: sum4 -= *v4 * tmp0; /* fall through */ 3627 case 3: sum3 -= *v3 * tmp0; /* fall through */ 3628 case 2: sum2 -= *v2 * tmp0; /* fall through */ 3629 case 1: sum1 -= *v1 * tmp0; 3630 } 3631 } 3632 /* update */ 3633 if (xb == b) { 3634 /* whole (old way) w/ diag */ 3635 switch (sizes[i]) { 3636 case 5: 3637 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3638 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3639 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3640 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3641 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3642 break; 3643 case 4: 3644 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3645 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3646 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3647 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3648 break; 3649 case 3: 3650 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3651 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3652 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3653 break; 3654 case 2: 3655 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3656 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3657 break; 3658 case 1: x[row--] += sum1 * (*ibdiag); break; 3659 } 3660 } else { 3661 /* no diag so set = */ 3662 switch (sizes[i]) { 3663 case 5: 3664 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3665 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3666 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3667 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3668 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3669 break; 3670 case 4: 3671 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3672 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3673 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3674 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3675 break; 3676 case 3: 3677 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3678 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3679 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3680 break; 3681 case 2: 3682 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3683 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3684 break; 3685 case 1: x[row--] = sum1 * (*ibdiag); break; 3686 } 3687 } 3688 } 3689 if (xb == b) { 3690 PetscCall(PetscLogFlops(2.0 * a->nz)); 3691 } else { 3692 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3693 } 3694 } 3695 } 3696 if (flag & SOR_EISENSTAT) { 3697 /* 3698 Apply (U + D)^-1 where D is now the block diagonal 3699 */ 3700 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3701 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3702 ibdiag -= sizes[i] * sizes[i]; 3703 sz = ii[row + 1] - diag[row] - 1; 3704 v1 = a->a + diag[row] + 1; 3705 idx = a->j + diag[row] + 1; 3706 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3707 switch (sizes[i]) { 3708 case 1: 3709 3710 sum1 = b[row]; 3711 for (n = 0; n < sz - 1; n += 2) { 3712 i1 = idx[0]; 3713 i2 = idx[1]; 3714 idx += 2; 3715 tmp0 = x[i1]; 3716 tmp1 = x[i2]; 3717 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3718 v1 += 2; 3719 } 3720 3721 if (n == sz - 1) { 3722 tmp0 = x[*idx]; 3723 sum1 -= *v1 * tmp0; 3724 } 3725 x[row] = sum1 * (*ibdiag); 3726 row--; 3727 break; 3728 3729 case 2: 3730 3731 sum1 = b[row]; 3732 sum2 = b[row - 1]; 3733 /* note that sum1 is associated with the second of the two rows */ 3734 v2 = a->a + diag[row - 1] + 2; 3735 for (n = 0; n < sz - 1; n += 2) { 3736 i1 = idx[0]; 3737 i2 = idx[1]; 3738 idx += 2; 3739 tmp0 = x[i1]; 3740 tmp1 = x[i2]; 3741 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3742 v1 += 2; 3743 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3744 v2 += 2; 3745 } 3746 3747 if (n == sz - 1) { 3748 tmp0 = x[*idx]; 3749 sum1 -= *v1 * tmp0; 3750 sum2 -= *v2 * tmp0; 3751 } 3752 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3753 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3754 row -= 2; 3755 break; 3756 case 3: 3757 3758 sum1 = b[row]; 3759 sum2 = b[row - 1]; 3760 sum3 = b[row - 2]; 3761 v2 = a->a + diag[row - 1] + 2; 3762 v3 = a->a + diag[row - 2] + 3; 3763 for (n = 0; n < sz - 1; n += 2) { 3764 i1 = idx[0]; 3765 i2 = idx[1]; 3766 idx += 2; 3767 tmp0 = x[i1]; 3768 tmp1 = x[i2]; 3769 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3770 v1 += 2; 3771 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3772 v2 += 2; 3773 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3774 v3 += 2; 3775 } 3776 3777 if (n == sz - 1) { 3778 tmp0 = x[*idx]; 3779 sum1 -= *v1 * tmp0; 3780 sum2 -= *v2 * tmp0; 3781 sum3 -= *v3 * tmp0; 3782 } 3783 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3784 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3785 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3786 row -= 3; 3787 break; 3788 case 4: 3789 3790 sum1 = b[row]; 3791 sum2 = b[row - 1]; 3792 sum3 = b[row - 2]; 3793 sum4 = b[row - 3]; 3794 v2 = a->a + diag[row - 1] + 2; 3795 v3 = a->a + diag[row - 2] + 3; 3796 v4 = a->a + diag[row - 3] + 4; 3797 for (n = 0; n < sz - 1; n += 2) { 3798 i1 = idx[0]; 3799 i2 = idx[1]; 3800 idx += 2; 3801 tmp0 = x[i1]; 3802 tmp1 = x[i2]; 3803 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3804 v1 += 2; 3805 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3806 v2 += 2; 3807 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3808 v3 += 2; 3809 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3810 v4 += 2; 3811 } 3812 3813 if (n == sz - 1) { 3814 tmp0 = x[*idx]; 3815 sum1 -= *v1 * tmp0; 3816 sum2 -= *v2 * tmp0; 3817 sum3 -= *v3 * tmp0; 3818 sum4 -= *v4 * tmp0; 3819 } 3820 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3821 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3822 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3823 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3824 row -= 4; 3825 break; 3826 case 5: 3827 3828 sum1 = b[row]; 3829 sum2 = b[row - 1]; 3830 sum3 = b[row - 2]; 3831 sum4 = b[row - 3]; 3832 sum5 = b[row - 4]; 3833 v2 = a->a + diag[row - 1] + 2; 3834 v3 = a->a + diag[row - 2] + 3; 3835 v4 = a->a + diag[row - 3] + 4; 3836 v5 = a->a + diag[row - 4] + 5; 3837 for (n = 0; n < sz - 1; n += 2) { 3838 i1 = idx[0]; 3839 i2 = idx[1]; 3840 idx += 2; 3841 tmp0 = x[i1]; 3842 tmp1 = x[i2]; 3843 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3844 v1 += 2; 3845 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3846 v2 += 2; 3847 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3848 v3 += 2; 3849 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3850 v4 += 2; 3851 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3852 v5 += 2; 3853 } 3854 3855 if (n == sz - 1) { 3856 tmp0 = x[*idx]; 3857 sum1 -= *v1 * tmp0; 3858 sum2 -= *v2 * tmp0; 3859 sum3 -= *v3 * tmp0; 3860 sum4 -= *v4 * tmp0; 3861 sum5 -= *v5 * tmp0; 3862 } 3863 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3864 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3865 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3866 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3867 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3868 row -= 5; 3869 break; 3870 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3871 } 3872 } 3873 PetscCall(PetscLogFlops(a->nz)); 3874 3875 /* 3876 t = b - D x where D is the block diagonal 3877 */ 3878 cnt = 0; 3879 for (i = 0, row = 0; i < m; i++) { 3880 switch (sizes[i]) { 3881 case 1: 3882 t[row] = b[row] - bdiag[cnt++] * x[row]; 3883 row++; 3884 break; 3885 case 2: 3886 x1 = x[row]; 3887 x2 = x[row + 1]; 3888 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3889 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3890 t[row] = b[row] - tmp1; 3891 t[row + 1] = b[row + 1] - tmp2; 3892 row += 2; 3893 cnt += 4; 3894 break; 3895 case 3: 3896 x1 = x[row]; 3897 x2 = x[row + 1]; 3898 x3 = x[row + 2]; 3899 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3900 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3901 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3902 t[row] = b[row] - tmp1; 3903 t[row + 1] = b[row + 1] - tmp2; 3904 t[row + 2] = b[row + 2] - tmp3; 3905 row += 3; 3906 cnt += 9; 3907 break; 3908 case 4: 3909 x1 = x[row]; 3910 x2 = x[row + 1]; 3911 x3 = x[row + 2]; 3912 x4 = x[row + 3]; 3913 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3914 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3915 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3916 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3917 t[row] = b[row] - tmp1; 3918 t[row + 1] = b[row + 1] - tmp2; 3919 t[row + 2] = b[row + 2] - tmp3; 3920 t[row + 3] = b[row + 3] - tmp4; 3921 row += 4; 3922 cnt += 16; 3923 break; 3924 case 5: 3925 x1 = x[row]; 3926 x2 = x[row + 1]; 3927 x3 = x[row + 2]; 3928 x4 = x[row + 3]; 3929 x5 = x[row + 4]; 3930 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 3931 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 3932 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 3933 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 3934 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 3935 t[row] = b[row] - tmp1; 3936 t[row + 1] = b[row + 1] - tmp2; 3937 t[row + 2] = b[row + 2] - tmp3; 3938 t[row + 3] = b[row + 3] - tmp4; 3939 t[row + 4] = b[row + 4] - tmp5; 3940 row += 5; 3941 cnt += 25; 3942 break; 3943 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3944 } 3945 } 3946 PetscCall(PetscLogFlops(m)); 3947 3948 /* 3949 Apply (L + D)^-1 where D is the block diagonal 3950 */ 3951 for (i = 0, row = 0; i < m; i++) { 3952 sz = diag[row] - ii[row]; 3953 v1 = a->a + ii[row]; 3954 idx = a->j + ii[row]; 3955 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3956 switch (sizes[i]) { 3957 case 1: 3958 3959 sum1 = t[row]; 3960 for (n = 0; n < sz - 1; n += 2) { 3961 i1 = idx[0]; 3962 i2 = idx[1]; 3963 idx += 2; 3964 tmp0 = t[i1]; 3965 tmp1 = t[i2]; 3966 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3967 v1 += 2; 3968 } 3969 3970 if (n == sz - 1) { 3971 tmp0 = t[*idx]; 3972 sum1 -= *v1 * tmp0; 3973 } 3974 x[row] += t[row] = sum1 * (*ibdiag++); 3975 row++; 3976 break; 3977 case 2: 3978 v2 = a->a + ii[row + 1]; 3979 sum1 = t[row]; 3980 sum2 = t[row + 1]; 3981 for (n = 0; n < sz - 1; n += 2) { 3982 i1 = idx[0]; 3983 i2 = idx[1]; 3984 idx += 2; 3985 tmp0 = t[i1]; 3986 tmp1 = t[i2]; 3987 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3988 v1 += 2; 3989 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3990 v2 += 2; 3991 } 3992 3993 if (n == sz - 1) { 3994 tmp0 = t[*idx]; 3995 sum1 -= v1[0] * tmp0; 3996 sum2 -= v2[0] * tmp0; 3997 } 3998 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3999 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4000 ibdiag += 4; 4001 row += 2; 4002 break; 4003 case 3: 4004 v2 = a->a + ii[row + 1]; 4005 v3 = a->a + ii[row + 2]; 4006 sum1 = t[row]; 4007 sum2 = t[row + 1]; 4008 sum3 = t[row + 2]; 4009 for (n = 0; n < sz - 1; n += 2) { 4010 i1 = idx[0]; 4011 i2 = idx[1]; 4012 idx += 2; 4013 tmp0 = t[i1]; 4014 tmp1 = t[i2]; 4015 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4016 v1 += 2; 4017 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4018 v2 += 2; 4019 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4020 v3 += 2; 4021 } 4022 4023 if (n == sz - 1) { 4024 tmp0 = t[*idx]; 4025 sum1 -= v1[0] * tmp0; 4026 sum2 -= v2[0] * tmp0; 4027 sum3 -= v3[0] * tmp0; 4028 } 4029 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4030 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4031 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4032 ibdiag += 9; 4033 row += 3; 4034 break; 4035 case 4: 4036 v2 = a->a + ii[row + 1]; 4037 v3 = a->a + ii[row + 2]; 4038 v4 = a->a + ii[row + 3]; 4039 sum1 = t[row]; 4040 sum2 = t[row + 1]; 4041 sum3 = t[row + 2]; 4042 sum4 = t[row + 3]; 4043 for (n = 0; n < sz - 1; n += 2) { 4044 i1 = idx[0]; 4045 i2 = idx[1]; 4046 idx += 2; 4047 tmp0 = t[i1]; 4048 tmp1 = t[i2]; 4049 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4050 v1 += 2; 4051 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4052 v2 += 2; 4053 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4054 v3 += 2; 4055 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4056 v4 += 2; 4057 } 4058 4059 if (n == sz - 1) { 4060 tmp0 = t[*idx]; 4061 sum1 -= v1[0] * tmp0; 4062 sum2 -= v2[0] * tmp0; 4063 sum3 -= v3[0] * tmp0; 4064 sum4 -= v4[0] * tmp0; 4065 } 4066 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4067 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4068 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4069 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4070 ibdiag += 16; 4071 row += 4; 4072 break; 4073 case 5: 4074 v2 = a->a + ii[row + 1]; 4075 v3 = a->a + ii[row + 2]; 4076 v4 = a->a + ii[row + 3]; 4077 v5 = a->a + ii[row + 4]; 4078 sum1 = t[row]; 4079 sum2 = t[row + 1]; 4080 sum3 = t[row + 2]; 4081 sum4 = t[row + 3]; 4082 sum5 = t[row + 4]; 4083 for (n = 0; n < sz - 1; n += 2) { 4084 i1 = idx[0]; 4085 i2 = idx[1]; 4086 idx += 2; 4087 tmp0 = t[i1]; 4088 tmp1 = t[i2]; 4089 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4090 v1 += 2; 4091 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4092 v2 += 2; 4093 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4094 v3 += 2; 4095 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4096 v4 += 2; 4097 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4098 v5 += 2; 4099 } 4100 4101 if (n == sz - 1) { 4102 tmp0 = t[*idx]; 4103 sum1 -= v1[0] * tmp0; 4104 sum2 -= v2[0] * tmp0; 4105 sum3 -= v3[0] * tmp0; 4106 sum4 -= v4[0] * tmp0; 4107 sum5 -= v5[0] * tmp0; 4108 } 4109 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4110 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4111 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4112 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4113 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4114 ibdiag += 25; 4115 row += 5; 4116 break; 4117 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4118 } 4119 } 4120 PetscCall(PetscLogFlops(a->nz)); 4121 } 4122 PetscCall(VecRestoreArray(xx, &x)); 4123 PetscCall(VecRestoreArrayRead(bb, &b)); 4124 PetscFunctionReturn(0); 4125 } 4126 4127 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) { 4128 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4129 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4130 const MatScalar *bdiag = a->inode.bdiag; 4131 const PetscScalar *b; 4132 PetscInt m = a->inode.node_count, cnt = 0, i, row; 4133 const PetscInt *sizes = a->inode.size; 4134 4135 PetscFunctionBegin; 4136 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4137 PetscCall(VecGetArray(xx, &x)); 4138 PetscCall(VecGetArrayRead(bb, &b)); 4139 cnt = 0; 4140 for (i = 0, row = 0; i < m; i++) { 4141 switch (sizes[i]) { 4142 case 1: 4143 x[row] = b[row] * bdiag[cnt++]; 4144 row++; 4145 break; 4146 case 2: 4147 x1 = b[row]; 4148 x2 = b[row + 1]; 4149 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4150 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4151 x[row++] = tmp1; 4152 x[row++] = tmp2; 4153 cnt += 4; 4154 break; 4155 case 3: 4156 x1 = b[row]; 4157 x2 = b[row + 1]; 4158 x3 = b[row + 2]; 4159 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4160 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4161 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4162 x[row++] = tmp1; 4163 x[row++] = tmp2; 4164 x[row++] = tmp3; 4165 cnt += 9; 4166 break; 4167 case 4: 4168 x1 = b[row]; 4169 x2 = b[row + 1]; 4170 x3 = b[row + 2]; 4171 x4 = b[row + 3]; 4172 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4173 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4174 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4175 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4176 x[row++] = tmp1; 4177 x[row++] = tmp2; 4178 x[row++] = tmp3; 4179 x[row++] = tmp4; 4180 cnt += 16; 4181 break; 4182 case 5: 4183 x1 = b[row]; 4184 x2 = b[row + 1]; 4185 x3 = b[row + 2]; 4186 x4 = b[row + 3]; 4187 x5 = b[row + 4]; 4188 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4189 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4190 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4191 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4192 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4193 x[row++] = tmp1; 4194 x[row++] = tmp2; 4195 x[row++] = tmp3; 4196 x[row++] = tmp4; 4197 x[row++] = tmp5; 4198 cnt += 25; 4199 break; 4200 default: SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4201 } 4202 } 4203 PetscCall(PetscLogFlops(2.0 * cnt)); 4204 PetscCall(VecRestoreArray(xx, &x)); 4205 PetscCall(VecRestoreArrayRead(bb, &b)); 4206 PetscFunctionReturn(0); 4207 } 4208 4209 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) { 4210 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4211 4212 PetscFunctionBegin; 4213 a->inode.node_count = 0; 4214 a->inode.use = PETSC_FALSE; 4215 a->inode.checked = PETSC_FALSE; 4216 a->inode.mat_nonzerostate = -1; 4217 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4218 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4219 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4220 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4221 A->ops->coloringpatch = NULL; 4222 A->ops->multdiagonalblock = NULL; 4223 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4224 PetscFunctionReturn(0); 4225 } 4226 4227 /* 4228 samestructure indicates that the matrix has not changed its nonzero structure so we 4229 do not need to recompute the inodes 4230 */ 4231 PetscErrorCode MatSeqAIJCheckInode(Mat A) { 4232 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4233 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4234 PetscBool flag; 4235 const PetscInt *idx, *idy, *ii; 4236 4237 PetscFunctionBegin; 4238 if (!a->inode.use) { 4239 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4240 PetscCall(PetscFree(a->inode.size)); 4241 PetscFunctionReturn(0); 4242 } 4243 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4244 4245 m = A->rmap->n; 4246 if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size)); 4247 ns = a->inode.size; 4248 4249 i = 0; 4250 node_count = 0; 4251 idx = a->j; 4252 ii = a->i; 4253 while (i < m) { /* For each row */ 4254 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4255 /* Limits the number of elements in a node to 'a->inode.limit' */ 4256 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4257 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4258 if (nzy != nzx) break; 4259 idy += nzx; /* Same nonzero pattern */ 4260 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4261 if (!flag) break; 4262 } 4263 ns[node_count++] = blk_size; 4264 idx += blk_size * nzx; 4265 i = j; 4266 } 4267 4268 /* If not enough inodes found,, do not use inode version of the routines */ 4269 if (!m || node_count > .8 * m) { 4270 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4271 PetscCall(PetscFree(a->inode.size)); 4272 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4273 } else { 4274 if (!A->factortype) { 4275 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4276 if (A->rmap->n == A->cmap->n) { 4277 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4278 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4279 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4280 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4281 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4282 } 4283 } else { 4284 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4285 } 4286 a->inode.node_count = node_count; 4287 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4288 } 4289 a->inode.checked = PETSC_TRUE; 4290 a->inode.mat_nonzerostate = A->nonzerostate; 4291 PetscFunctionReturn(0); 4292 } 4293 4294 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) { 4295 Mat B = *C; 4296 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4297 PetscInt m = A->rmap->n; 4298 4299 PetscFunctionBegin; 4300 c->inode.use = a->inode.use; 4301 c->inode.limit = a->inode.limit; 4302 c->inode.max_limit = a->inode.max_limit; 4303 c->inode.checked = PETSC_FALSE; 4304 c->inode.size = NULL; 4305 c->inode.node_count = 0; 4306 c->inode.ibdiagvalid = PETSC_FALSE; 4307 c->inode.ibdiag = NULL; 4308 c->inode.bdiag = NULL; 4309 c->inode.mat_nonzerostate = -1; 4310 if (a->inode.use) { 4311 if (a->inode.checked && a->inode.size) { 4312 PetscCall(PetscMalloc1(m + 1, &c->inode.size)); 4313 PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1)); 4314 4315 c->inode.checked = PETSC_TRUE; 4316 c->inode.node_count = a->inode.node_count; 4317 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4318 } 4319 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4320 if (!B->factortype) { 4321 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4322 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4323 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4324 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4325 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4326 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4327 } else { 4328 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4329 } 4330 } 4331 PetscFunctionReturn(0); 4332 } 4333 4334 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) { 4335 PetscInt k; 4336 const PetscInt *vi; 4337 4338 PetscFunctionBegin; 4339 vi = aj + ai[row]; 4340 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4341 vi = aj + adiag[row]; 4342 cols[nzl] = vi[0]; 4343 vi = aj + adiag[row + 1] + 1; 4344 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4345 PetscFunctionReturn(0); 4346 } 4347 /* 4348 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4349 Modified from MatSeqAIJCheckInode(). 4350 4351 Input Parameters: 4352 . Mat A - ILU or LU matrix factor 4353 4354 */ 4355 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) { 4356 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4357 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4358 PetscInt *cols1, *cols2, *ns; 4359 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4360 PetscBool flag; 4361 4362 PetscFunctionBegin; 4363 if (!a->inode.use) PetscFunctionReturn(0); 4364 if (a->inode.checked) PetscFunctionReturn(0); 4365 4366 m = A->rmap->n; 4367 if (a->inode.size) ns = a->inode.size; 4368 else PetscCall(PetscMalloc1(m + 1, &ns)); 4369 4370 i = 0; 4371 node_count = 0; 4372 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4373 while (i < m) { /* For each row */ 4374 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4375 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4376 nzx = nzl1 + nzu1 + 1; 4377 MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i); 4378 4379 /* Limits the number of elements in a node to 'a->inode.limit' */ 4380 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4381 nzl2 = ai[j + 1] - ai[j]; 4382 nzu2 = adiag[j] - adiag[j + 1] - 1; 4383 nzy = nzl2 + nzu2 + 1; 4384 if (nzy != nzx) break; 4385 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4386 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4387 if (!flag) break; 4388 } 4389 ns[node_count++] = blk_size; 4390 i = j; 4391 } 4392 PetscCall(PetscFree2(cols1, cols2)); 4393 /* If not enough inodes found,, do not use inode version of the routines */ 4394 if (!m || node_count > .8 * m) { 4395 PetscCall(PetscFree(ns)); 4396 4397 a->inode.node_count = 0; 4398 a->inode.size = NULL; 4399 a->inode.use = PETSC_FALSE; 4400 4401 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4402 } else { 4403 A->ops->mult = NULL; 4404 A->ops->sor = NULL; 4405 A->ops->multadd = NULL; 4406 A->ops->getrowij = NULL; 4407 A->ops->restorerowij = NULL; 4408 A->ops->getcolumnij = NULL; 4409 A->ops->restorecolumnij = NULL; 4410 A->ops->coloringpatch = NULL; 4411 A->ops->multdiagonalblock = NULL; 4412 a->inode.node_count = node_count; 4413 a->inode.size = ns; 4414 4415 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4416 } 4417 a->inode.checked = PETSC_TRUE; 4418 PetscFunctionReturn(0); 4419 } 4420 4421 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) { 4422 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4423 4424 PetscFunctionBegin; 4425 a->inode.ibdiagvalid = PETSC_FALSE; 4426 PetscFunctionReturn(0); 4427 } 4428 4429 /* 4430 This is really ugly. if inodes are used this replaces the 4431 permutations with ones that correspond to rows/cols of the matrix 4432 rather then inode blocks 4433 */ 4434 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) { 4435 PetscFunctionBegin; 4436 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4437 PetscFunctionReturn(0); 4438 } 4439 4440 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) { 4441 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4442 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4443 const PetscInt *ridx, *cidx; 4444 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx; 4445 PetscInt nslim_col, *ns_col; 4446 IS ris = *rperm, cis = *cperm; 4447 4448 PetscFunctionBegin; 4449 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4450 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4451 4452 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4453 PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns)); 4454 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4455 4456 PetscCall(ISGetIndices(ris, &ridx)); 4457 PetscCall(ISGetIndices(cis, &cidx)); 4458 4459 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4460 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i]; 4461 4462 /* Construct the permutations for rows*/ 4463 for (i = 0, row = 0; i < nslim_row; ++i) { 4464 indx = ridx[i]; 4465 start_val = tns[indx]; 4466 end_val = tns[indx + 1]; 4467 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4468 } 4469 4470 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4471 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i]; 4472 4473 /* Construct permutations for columns */ 4474 for (i = 0, col = 0; i < nslim_col; ++i) { 4475 indx = cidx[i]; 4476 start_val = tns[indx]; 4477 end_val = tns[indx + 1]; 4478 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4479 } 4480 4481 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4482 PetscCall(ISSetPermutation(*rperm)); 4483 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4484 PetscCall(ISSetPermutation(*cperm)); 4485 4486 PetscCall(ISRestoreIndices(ris, &ridx)); 4487 PetscCall(ISRestoreIndices(cis, &cidx)); 4488 4489 PetscCall(PetscFree(ns_col)); 4490 PetscCall(PetscFree2(permr, permc)); 4491 PetscCall(ISDestroy(&cis)); 4492 PetscCall(ISDestroy(&ris)); 4493 PetscCall(PetscFree(tns)); 4494 PetscFunctionReturn(0); 4495 } 4496 4497 /*@C 4498 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4499 4500 Not Collective 4501 4502 Input Parameter: 4503 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4504 4505 Output Parameters: 4506 + node_count - no of inodes present in the matrix. 4507 . sizes - an array of size node_count,with sizes of each inode. 4508 - limit - the max size used to generate the inodes. 4509 4510 Level: advanced 4511 4512 Note: 4513 This routine returns some internal storage information 4514 of the matrix, it is intended to be used by advanced users. 4515 It should be called after the matrix is assembled. 4516 The contents of the sizes[] array should not be changed. 4517 NULL may be passed for information not requested. 4518 4519 .seealso: `MatGetInfo()` 4520 @*/ 4521 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) { 4522 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4523 4524 PetscFunctionBegin; 4525 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4526 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4527 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4528 PetscFunctionReturn(0); 4529 } 4530 4531 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) { 4532 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4533 4534 PetscFunctionBegin; 4535 if (node_count) *node_count = a->inode.node_count; 4536 if (sizes) *sizes = a->inode.size; 4537 if (limit) *limit = a->inode.limit; 4538 PetscFunctionReturn(0); 4539 } 4540