1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 9 { 10 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 11 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 12 13 PetscFunctionBegin; 14 n = A->cmap->n; 15 m = A->rmap->n; 16 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 17 ns_row = a->inode.size; 18 19 min_mn = (m < n) ? m : n; 20 if (!ns) { 21 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) 22 ; 23 for (; count + 1 < n; count++, i++) 24 ; 25 if (count < n) i++; 26 *size = i; 27 PetscFunctionReturn(PETSC_SUCCESS); 28 } 29 PetscCall(PetscMalloc1(n + 1, &ns_col)); 30 31 /* Use the same row structure wherever feasible. */ 32 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i]; 33 34 /* if m < n; pad up the remainder with inode_limit */ 35 for (; count + 1 < n; count++, i++) ns_col[i] = 1; 36 /* The last node is the odd ball. padd it up with the remaining rows; */ 37 if (count < n) { 38 ns_col[i] = n - count; 39 i++; 40 } else if (count > n) { 41 /* Adjust for the over estimation */ 42 ns_col[i - 1] += n - count; 43 } 44 *size = i; 45 *ns = ns_col; 46 PetscFunctionReturn(PETSC_SUCCESS); 47 } 48 49 /* 50 This builds symmetric version of nonzero structure, 51 */ 52 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 53 { 54 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 55 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 56 PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2; 57 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 58 59 PetscFunctionBegin; 60 nslim_row = a->inode.node_count; 61 m = A->rmap->n; 62 n = A->cmap->n; 63 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 64 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 65 66 /* Use the row_inode as column_inode */ 67 nslim_col = nslim_row; 68 ns_col = ns_row; 69 70 /* allocate space for reformatted inode structure */ 71 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 72 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1]; 73 74 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 75 nsz = ns_col[i1]; 76 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 77 } 78 /* allocate space for row pointers */ 79 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 80 *iia = ia; 81 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 82 83 /* determine the number of columns in each row */ 84 ia[0] = oshift; 85 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 86 j = aj + ai[row] + ishift; 87 jmax = aj + ai[row + 1] + ishift; 88 if (j == jmax) continue; /* empty row */ 89 col = *j++ + ishift; 90 i2 = tvc[col]; 91 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 92 ia[i1 + 1]++; 93 ia[i2 + 1]++; 94 i2++; /* Start col of next node */ 95 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 96 i2 = tvc[col]; 97 } 98 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 99 } 100 101 /* shift ia[i] to point to next row */ 102 for (i1 = 1; i1 < nslim_row + 1; i1++) { 103 row = ia[i1 - 1]; 104 ia[i1] += row; 105 work[i1 - 1] = row - oshift; 106 } 107 108 /* allocate space for column pointers */ 109 nz = ia[nslim_row] + (!ishift); 110 PetscCall(PetscMalloc1(nz, &ja)); 111 *jja = ja; 112 113 /* loop over lower triangular part putting into ja */ 114 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 115 j = aj + ai[row] + ishift; 116 jmax = aj + ai[row + 1] + ishift; 117 if (j == jmax) continue; /* empty row */ 118 col = *j++ + ishift; 119 i2 = tvc[col]; 120 while (i2 < i1 && j < jmax) { 121 ja[work[i2]++] = i1 + oshift; 122 ja[work[i1]++] = i2 + oshift; 123 ++i2; 124 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 125 i2 = tvc[col]; 126 } 127 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 128 } 129 PetscCall(PetscFree(work)); 130 PetscCall(PetscFree2(tns, tvc)); 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 /* 135 This builds nonsymmetric version of nonzero structure, 136 */ 137 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 138 { 139 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 140 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 141 PetscInt *tns, *tvc, nsz, i1, i2; 142 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size; 143 144 PetscFunctionBegin; 145 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 146 nslim_row = a->inode.node_count; 147 n = A->cmap->n; 148 149 /* Create The column_inode for this matrix */ 150 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 151 152 /* allocate space for reformatted column_inode structure */ 153 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 154 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 155 156 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 157 nsz = ns_col[i1]; 158 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 159 } 160 /* allocate space for row pointers */ 161 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 162 *iia = ia; 163 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 164 165 /* determine the number of columns in each row */ 166 ia[0] = oshift; 167 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 168 j = aj + ai[row] + ishift; 169 nz = ai[row + 1] - ai[row]; 170 if (!nz) continue; /* empty row */ 171 col = *j++ + ishift; 172 i2 = tvc[col]; 173 while (nz-- > 0) { /* off-diagonal elements */ 174 ia[i1 + 1]++; 175 i2++; /* Start col of next node */ 176 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 177 if (nz > 0) i2 = tvc[col]; 178 } 179 } 180 181 /* shift ia[i] to point to next row */ 182 for (i1 = 1; i1 < nslim_row + 1; i1++) { 183 row = ia[i1 - 1]; 184 ia[i1] += row; 185 work[i1 - 1] = row - oshift; 186 } 187 188 /* allocate space for column pointers */ 189 nz = ia[nslim_row] + (!ishift); 190 PetscCall(PetscMalloc1(nz, &ja)); 191 *jja = ja; 192 193 /* loop over matrix putting into ja */ 194 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 195 j = aj + ai[row] + ishift; 196 nz = ai[row + 1] - ai[row]; 197 if (!nz) continue; /* empty row */ 198 col = *j++ + ishift; 199 i2 = tvc[col]; 200 while (nz-- > 0) { 201 ja[work[i1]++] = i2 + oshift; 202 ++i2; 203 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 204 if (nz > 0) i2 = tvc[col]; 205 } 206 } 207 PetscCall(PetscFree(ns_col)); 208 PetscCall(PetscFree(work)); 209 PetscCall(PetscFree2(tns, tvc)); 210 PetscFunctionReturn(PETSC_SUCCESS); 211 } 212 213 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 214 { 215 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 216 217 PetscFunctionBegin; 218 if (n) *n = a->inode.node_count; 219 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 220 if (!blockcompressed) { 221 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 222 } else if (symmetric) { 223 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 224 } else { 225 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 226 } 227 PetscFunctionReturn(PETSC_SUCCESS); 228 } 229 230 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 231 { 232 PetscFunctionBegin; 233 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 234 235 if (!blockcompressed) { 236 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 237 } else { 238 PetscCall(PetscFree(*ia)); 239 PetscCall(PetscFree(*ja)); 240 } 241 PetscFunctionReturn(PETSC_SUCCESS); 242 } 243 244 /* ----------------------------------------------------------- */ 245 246 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 247 { 248 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 249 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 250 PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j; 251 252 PetscFunctionBegin; 253 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 254 nslim_row = a->inode.node_count; 255 n = A->cmap->n; 256 257 /* Create The column_inode for this matrix */ 258 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 259 260 /* allocate space for reformatted column_inode structure */ 261 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 262 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 263 264 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 265 nsz = ns_col[i1]; 266 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 267 } 268 /* allocate space for column pointers */ 269 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 270 *iia = ia; 271 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 272 273 /* determine the number of columns in each row */ 274 ia[0] = oshift; 275 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 276 j = aj + ai[row] + ishift; 277 col = *j++ + ishift; 278 i2 = tvc[col]; 279 nz = ai[row + 1] - ai[row]; 280 while (nz-- > 0) { /* off-diagonal elements */ 281 /* ia[i1+1]++; */ 282 ia[i2 + 1]++; 283 i2++; 284 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 285 if (nz > 0) i2 = tvc[col]; 286 } 287 } 288 289 /* shift ia[i] to point to next col */ 290 for (i1 = 1; i1 < nslim_col + 1; i1++) { 291 col = ia[i1 - 1]; 292 ia[i1] += col; 293 work[i1 - 1] = col - oshift; 294 } 295 296 /* allocate space for column pointers */ 297 nz = ia[nslim_col] + (!ishift); 298 PetscCall(PetscMalloc1(nz, &ja)); 299 *jja = ja; 300 301 /* loop over matrix putting into ja */ 302 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 303 j = aj + ai[row] + ishift; 304 col = *j++ + ishift; 305 i2 = tvc[col]; 306 nz = ai[row + 1] - ai[row]; 307 while (nz-- > 0) { 308 /* ja[work[i1]++] = i2 + oshift; */ 309 ja[work[i2]++] = i1 + oshift; 310 i2++; 311 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 312 if (nz > 0) i2 = tvc[col]; 313 } 314 } 315 PetscCall(PetscFree(ns_col)); 316 PetscCall(PetscFree(work)); 317 PetscCall(PetscFree2(tns, tvc)); 318 PetscFunctionReturn(PETSC_SUCCESS); 319 } 320 321 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 322 { 323 PetscFunctionBegin; 324 PetscCall(MatCreateColInode_Private(A, n, NULL)); 325 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 326 327 if (!blockcompressed) { 328 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 329 } else if (symmetric) { 330 /* Since the indices are symmetric it doesn't matter */ 331 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 332 } else { 333 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 334 } 335 PetscFunctionReturn(PETSC_SUCCESS); 336 } 337 338 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 339 { 340 PetscFunctionBegin; 341 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 342 if (!blockcompressed) { 343 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 344 } else { 345 PetscCall(PetscFree(*ia)); 346 PetscCall(PetscFree(*ja)); 347 } 348 PetscFunctionReturn(PETSC_SUCCESS); 349 } 350 351 /* ----------------------------------------------------------- */ 352 353 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 354 { 355 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 356 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 357 PetscScalar *y; 358 const PetscScalar *x; 359 const MatScalar *v1, *v2, *v3, *v4, *v5; 360 PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 361 const PetscInt *idx, *ns, *ii; 362 363 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 364 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 365 #endif 366 367 PetscFunctionBegin; 368 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 369 node_max = a->inode.node_count; 370 ns = a->inode.size; /* Node Size array */ 371 PetscCall(VecGetArrayRead(xx, &x)); 372 PetscCall(VecGetArray(yy, &y)); 373 idx = a->j; 374 v1 = a->a; 375 ii = a->i; 376 377 for (i = 0, row = 0; i < node_max; ++i) { 378 nsz = ns[i]; 379 n = ii[1] - ii[0]; 380 nonzerorow += (n > 0) * nsz; 381 ii += nsz; 382 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 383 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 384 sz = n; /* No of non zeros in this row */ 385 /* Switch on the size of Node */ 386 switch (nsz) { /* Each loop in 'case' is unrolled */ 387 case 1: 388 sum1 = 0.; 389 390 for (n = 0; n < sz - 1; n += 2) { 391 i1 = idx[0]; /* The instructions are ordered to */ 392 i2 = idx[1]; /* make the compiler's job easy */ 393 idx += 2; 394 tmp0 = x[i1]; 395 tmp1 = x[i2]; 396 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 397 v1 += 2; 398 } 399 400 if (n == sz - 1) { /* Take care of the last nonzero */ 401 tmp0 = x[*idx++]; 402 sum1 += *v1++ * tmp0; 403 } 404 y[row++] = sum1; 405 break; 406 case 2: 407 sum1 = 0.; 408 sum2 = 0.; 409 v2 = v1 + n; 410 411 for (n = 0; n < sz - 1; n += 2) { 412 i1 = idx[0]; 413 i2 = idx[1]; 414 idx += 2; 415 tmp0 = x[i1]; 416 tmp1 = x[i2]; 417 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 418 v1 += 2; 419 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 420 v2 += 2; 421 } 422 if (n == sz - 1) { 423 tmp0 = x[*idx++]; 424 sum1 += *v1++ * tmp0; 425 sum2 += *v2++ * tmp0; 426 } 427 y[row++] = sum1; 428 y[row++] = sum2; 429 v1 = v2; /* Since the next block to be processed starts there*/ 430 idx += sz; 431 break; 432 case 3: 433 sum1 = 0.; 434 sum2 = 0.; 435 sum3 = 0.; 436 v2 = v1 + n; 437 v3 = v2 + n; 438 439 for (n = 0; n < sz - 1; n += 2) { 440 i1 = idx[0]; 441 i2 = idx[1]; 442 idx += 2; 443 tmp0 = x[i1]; 444 tmp1 = x[i2]; 445 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 446 v1 += 2; 447 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 448 v2 += 2; 449 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 450 v3 += 2; 451 } 452 if (n == sz - 1) { 453 tmp0 = x[*idx++]; 454 sum1 += *v1++ * tmp0; 455 sum2 += *v2++ * tmp0; 456 sum3 += *v3++ * tmp0; 457 } 458 y[row++] = sum1; 459 y[row++] = sum2; 460 y[row++] = sum3; 461 v1 = v3; /* Since the next block to be processed starts there*/ 462 idx += 2 * sz; 463 break; 464 case 4: 465 sum1 = 0.; 466 sum2 = 0.; 467 sum3 = 0.; 468 sum4 = 0.; 469 v2 = v1 + n; 470 v3 = v2 + n; 471 v4 = v3 + n; 472 473 for (n = 0; n < sz - 1; n += 2) { 474 i1 = idx[0]; 475 i2 = idx[1]; 476 idx += 2; 477 tmp0 = x[i1]; 478 tmp1 = x[i2]; 479 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 480 v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 482 v2 += 2; 483 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 484 v3 += 2; 485 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 486 v4 += 2; 487 } 488 if (n == sz - 1) { 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 sum4 += *v4++ * tmp0; 494 } 495 y[row++] = sum1; 496 y[row++] = sum2; 497 y[row++] = sum3; 498 y[row++] = sum4; 499 v1 = v4; /* Since the next block to be processed starts there*/ 500 idx += 3 * sz; 501 break; 502 case 5: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 sum5 = 0.; 508 v2 = v1 + n; 509 v3 = v2 + n; 510 v4 = v3 + n; 511 v5 = v4 + n; 512 513 for (n = 0; n < sz - 1; n += 2) { 514 i1 = idx[0]; 515 i2 = idx[1]; 516 idx += 2; 517 tmp0 = x[i1]; 518 tmp1 = x[i2]; 519 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 520 v1 += 2; 521 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 522 v2 += 2; 523 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 524 v3 += 2; 525 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 526 v4 += 2; 527 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 528 v5 += 2; 529 } 530 if (n == sz - 1) { 531 tmp0 = x[*idx++]; 532 sum1 += *v1++ * tmp0; 533 sum2 += *v2++ * tmp0; 534 sum3 += *v3++ * tmp0; 535 sum4 += *v4++ * tmp0; 536 sum5 += *v5++ * tmp0; 537 } 538 y[row++] = sum1; 539 y[row++] = sum2; 540 y[row++] = sum3; 541 y[row++] = sum4; 542 y[row++] = sum5; 543 v1 = v5; /* Since the next block to be processed starts there */ 544 idx += 4 * sz; 545 break; 546 default: 547 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 548 } 549 } 550 PetscCall(VecRestoreArrayRead(xx, &x)); 551 PetscCall(VecRestoreArray(yy, &y)); 552 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 553 PetscFunctionReturn(PETSC_SUCCESS); 554 } 555 /* ----------------------------------------------------------- */ 556 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 557 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 558 { 559 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 560 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 561 const MatScalar *v1, *v2, *v3, *v4, *v5; 562 const PetscScalar *x; 563 PetscScalar *y, *z, *zt; 564 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 565 const PetscInt *idx, *ns, *ii; 566 567 PetscFunctionBegin; 568 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 569 node_max = a->inode.node_count; 570 ns = a->inode.size; /* Node Size array */ 571 572 PetscCall(VecGetArrayRead(xx, &x)); 573 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 574 zt = z; 575 576 idx = a->j; 577 v1 = a->a; 578 ii = a->i; 579 580 for (i = 0, row = 0; i < node_max; ++i) { 581 nsz = ns[i]; 582 n = ii[1] - ii[0]; 583 ii += nsz; 584 sz = n; /* No of non zeros in this row */ 585 /* Switch on the size of Node */ 586 switch (nsz) { /* Each loop in 'case' is unrolled */ 587 case 1: 588 sum1 = *zt++; 589 590 for (n = 0; n < sz - 1; n += 2) { 591 i1 = idx[0]; /* The instructions are ordered to */ 592 i2 = idx[1]; /* make the compiler's job easy */ 593 idx += 2; 594 tmp0 = x[i1]; 595 tmp1 = x[i2]; 596 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 597 v1 += 2; 598 } 599 600 if (n == sz - 1) { /* Take care of the last nonzero */ 601 tmp0 = x[*idx++]; 602 sum1 += *v1++ * tmp0; 603 } 604 y[row++] = sum1; 605 break; 606 case 2: 607 sum1 = *zt++; 608 sum2 = *zt++; 609 v2 = v1 + n; 610 611 for (n = 0; n < sz - 1; n += 2) { 612 i1 = idx[0]; 613 i2 = idx[1]; 614 idx += 2; 615 tmp0 = x[i1]; 616 tmp1 = x[i2]; 617 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 618 v1 += 2; 619 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 620 v2 += 2; 621 } 622 if (n == sz - 1) { 623 tmp0 = x[*idx++]; 624 sum1 += *v1++ * tmp0; 625 sum2 += *v2++ * tmp0; 626 } 627 y[row++] = sum1; 628 y[row++] = sum2; 629 v1 = v2; /* Since the next block to be processed starts there*/ 630 idx += sz; 631 break; 632 case 3: 633 sum1 = *zt++; 634 sum2 = *zt++; 635 sum3 = *zt++; 636 v2 = v1 + n; 637 v3 = v2 + n; 638 639 for (n = 0; n < sz - 1; n += 2) { 640 i1 = idx[0]; 641 i2 = idx[1]; 642 idx += 2; 643 tmp0 = x[i1]; 644 tmp1 = x[i2]; 645 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 646 v1 += 2; 647 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 648 v2 += 2; 649 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 650 v3 += 2; 651 } 652 if (n == sz - 1) { 653 tmp0 = x[*idx++]; 654 sum1 += *v1++ * tmp0; 655 sum2 += *v2++ * tmp0; 656 sum3 += *v3++ * tmp0; 657 } 658 y[row++] = sum1; 659 y[row++] = sum2; 660 y[row++] = sum3; 661 v1 = v3; /* Since the next block to be processed starts there*/ 662 idx += 2 * sz; 663 break; 664 case 4: 665 sum1 = *zt++; 666 sum2 = *zt++; 667 sum3 = *zt++; 668 sum4 = *zt++; 669 v2 = v1 + n; 670 v3 = v2 + n; 671 v4 = v3 + n; 672 673 for (n = 0; n < sz - 1; n += 2) { 674 i1 = idx[0]; 675 i2 = idx[1]; 676 idx += 2; 677 tmp0 = x[i1]; 678 tmp1 = x[i2]; 679 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 680 v1 += 2; 681 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 682 v2 += 2; 683 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 684 v3 += 2; 685 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 686 v4 += 2; 687 } 688 if (n == sz - 1) { 689 tmp0 = x[*idx++]; 690 sum1 += *v1++ * tmp0; 691 sum2 += *v2++ * tmp0; 692 sum3 += *v3++ * tmp0; 693 sum4 += *v4++ * tmp0; 694 } 695 y[row++] = sum1; 696 y[row++] = sum2; 697 y[row++] = sum3; 698 y[row++] = sum4; 699 v1 = v4; /* Since the next block to be processed starts there*/ 700 idx += 3 * sz; 701 break; 702 case 5: 703 sum1 = *zt++; 704 sum2 = *zt++; 705 sum3 = *zt++; 706 sum4 = *zt++; 707 sum5 = *zt++; 708 v2 = v1 + n; 709 v3 = v2 + n; 710 v4 = v3 + n; 711 v5 = v4 + n; 712 713 for (n = 0; n < sz - 1; n += 2) { 714 i1 = idx[0]; 715 i2 = idx[1]; 716 idx += 2; 717 tmp0 = x[i1]; 718 tmp1 = x[i2]; 719 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 720 v1 += 2; 721 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 722 v2 += 2; 723 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 724 v3 += 2; 725 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 726 v4 += 2; 727 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 728 v5 += 2; 729 } 730 if (n == sz - 1) { 731 tmp0 = x[*idx++]; 732 sum1 += *v1++ * tmp0; 733 sum2 += *v2++ * tmp0; 734 sum3 += *v3++ * tmp0; 735 sum4 += *v4++ * tmp0; 736 sum5 += *v5++ * tmp0; 737 } 738 y[row++] = sum1; 739 y[row++] = sum2; 740 y[row++] = sum3; 741 y[row++] = sum4; 742 y[row++] = sum5; 743 v1 = v5; /* Since the next block to be processed starts there */ 744 idx += 4 * sz; 745 break; 746 default: 747 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 748 } 749 } 750 PetscCall(VecRestoreArrayRead(xx, &x)); 751 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 752 PetscCall(PetscLogFlops(2.0 * a->nz)); 753 PetscFunctionReturn(PETSC_SUCCESS); 754 } 755 756 /* ----------------------------------------------------------- */ 757 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 758 { 759 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 760 IS iscol = a->col, isrow = a->row; 761 const PetscInt *r, *c, *rout, *cout; 762 PetscInt i, j, n = A->rmap->n, nz; 763 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 764 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 765 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 766 PetscScalar sum1, sum2, sum3, sum4, sum5; 767 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 768 const PetscScalar *b; 769 770 PetscFunctionBegin; 771 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 772 node_max = a->inode.node_count; 773 ns = a->inode.size; /* Node Size array */ 774 775 PetscCall(VecGetArrayRead(bb, &b)); 776 PetscCall(VecGetArrayWrite(xx, &x)); 777 tmp = a->solve_work; 778 779 PetscCall(ISGetIndices(isrow, &rout)); 780 r = rout; 781 PetscCall(ISGetIndices(iscol, &cout)); 782 c = cout + (n - 1); 783 784 /* forward solve the lower triangular */ 785 tmps = tmp; 786 aa = a_a; 787 aj = a_j; 788 ad = a->diag; 789 790 for (i = 0, row = 0; i < node_max; ++i) { 791 nsz = ns[i]; 792 aii = ai[row]; 793 v1 = aa + aii; 794 vi = aj + aii; 795 nz = ad[row] - aii; 796 if (i < node_max - 1) { 797 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 798 * but our indexing to determine it's size could. */ 799 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 800 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 801 PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 802 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 803 } 804 805 switch (nsz) { /* Each loop in 'case' is unrolled */ 806 case 1: 807 sum1 = b[*r++]; 808 for (j = 0; j < nz - 1; j += 2) { 809 i0 = vi[0]; 810 i1 = vi[1]; 811 vi += 2; 812 tmp0 = tmps[i0]; 813 tmp1 = tmps[i1]; 814 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 815 v1 += 2; 816 } 817 if (j == nz - 1) { 818 tmp0 = tmps[*vi++]; 819 sum1 -= *v1++ * tmp0; 820 } 821 tmp[row++] = sum1; 822 break; 823 case 2: 824 sum1 = b[*r++]; 825 sum2 = b[*r++]; 826 v2 = aa + ai[row + 1]; 827 828 for (j = 0; j < nz - 1; j += 2) { 829 i0 = vi[0]; 830 i1 = vi[1]; 831 vi += 2; 832 tmp0 = tmps[i0]; 833 tmp1 = tmps[i1]; 834 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 835 v1 += 2; 836 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 837 v2 += 2; 838 } 839 if (j == nz - 1) { 840 tmp0 = tmps[*vi++]; 841 sum1 -= *v1++ * tmp0; 842 sum2 -= *v2++ * tmp0; 843 } 844 sum2 -= *v2++ * sum1; 845 tmp[row++] = sum1; 846 tmp[row++] = sum2; 847 break; 848 case 3: 849 sum1 = b[*r++]; 850 sum2 = b[*r++]; 851 sum3 = b[*r++]; 852 v2 = aa + ai[row + 1]; 853 v3 = aa + ai[row + 2]; 854 855 for (j = 0; j < nz - 1; j += 2) { 856 i0 = vi[0]; 857 i1 = vi[1]; 858 vi += 2; 859 tmp0 = tmps[i0]; 860 tmp1 = tmps[i1]; 861 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 862 v1 += 2; 863 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 864 v2 += 2; 865 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 866 v3 += 2; 867 } 868 if (j == nz - 1) { 869 tmp0 = tmps[*vi++]; 870 sum1 -= *v1++ * tmp0; 871 sum2 -= *v2++ * tmp0; 872 sum3 -= *v3++ * tmp0; 873 } 874 sum2 -= *v2++ * sum1; 875 sum3 -= *v3++ * sum1; 876 sum3 -= *v3++ * sum2; 877 878 tmp[row++] = sum1; 879 tmp[row++] = sum2; 880 tmp[row++] = sum3; 881 break; 882 883 case 4: 884 sum1 = b[*r++]; 885 sum2 = b[*r++]; 886 sum3 = b[*r++]; 887 sum4 = b[*r++]; 888 v2 = aa + ai[row + 1]; 889 v3 = aa + ai[row + 2]; 890 v4 = aa + ai[row + 3]; 891 892 for (j = 0; j < nz - 1; j += 2) { 893 i0 = vi[0]; 894 i1 = vi[1]; 895 vi += 2; 896 tmp0 = tmps[i0]; 897 tmp1 = tmps[i1]; 898 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 899 v1 += 2; 900 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 901 v2 += 2; 902 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 903 v3 += 2; 904 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 905 v4 += 2; 906 } 907 if (j == nz - 1) { 908 tmp0 = tmps[*vi++]; 909 sum1 -= *v1++ * tmp0; 910 sum2 -= *v2++ * tmp0; 911 sum3 -= *v3++ * tmp0; 912 sum4 -= *v4++ * tmp0; 913 } 914 sum2 -= *v2++ * sum1; 915 sum3 -= *v3++ * sum1; 916 sum4 -= *v4++ * sum1; 917 sum3 -= *v3++ * sum2; 918 sum4 -= *v4++ * sum2; 919 sum4 -= *v4++ * sum3; 920 921 tmp[row++] = sum1; 922 tmp[row++] = sum2; 923 tmp[row++] = sum3; 924 tmp[row++] = sum4; 925 break; 926 case 5: 927 sum1 = b[*r++]; 928 sum2 = b[*r++]; 929 sum3 = b[*r++]; 930 sum4 = b[*r++]; 931 sum5 = b[*r++]; 932 v2 = aa + ai[row + 1]; 933 v3 = aa + ai[row + 2]; 934 v4 = aa + ai[row + 3]; 935 v5 = aa + ai[row + 4]; 936 937 for (j = 0; j < nz - 1; j += 2) { 938 i0 = vi[0]; 939 i1 = vi[1]; 940 vi += 2; 941 tmp0 = tmps[i0]; 942 tmp1 = tmps[i1]; 943 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 944 v1 += 2; 945 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 946 v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 948 v3 += 2; 949 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 950 v4 += 2; 951 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 952 v5 += 2; 953 } 954 if (j == nz - 1) { 955 tmp0 = tmps[*vi++]; 956 sum1 -= *v1++ * tmp0; 957 sum2 -= *v2++ * tmp0; 958 sum3 -= *v3++ * tmp0; 959 sum4 -= *v4++ * tmp0; 960 sum5 -= *v5++ * tmp0; 961 } 962 963 sum2 -= *v2++ * sum1; 964 sum3 -= *v3++ * sum1; 965 sum4 -= *v4++ * sum1; 966 sum5 -= *v5++ * sum1; 967 sum3 -= *v3++ * sum2; 968 sum4 -= *v4++ * sum2; 969 sum5 -= *v5++ * sum2; 970 sum4 -= *v4++ * sum3; 971 sum5 -= *v5++ * sum3; 972 sum5 -= *v5++ * sum4; 973 974 tmp[row++] = sum1; 975 tmp[row++] = sum2; 976 tmp[row++] = sum3; 977 tmp[row++] = sum4; 978 tmp[row++] = sum5; 979 break; 980 default: 981 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 982 } 983 } 984 /* backward solve the upper triangular */ 985 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 986 nsz = ns[i]; 987 aii = ai[row + 1] - 1; 988 v1 = aa + aii; 989 vi = aj + aii; 990 nz = aii - ad[row]; 991 switch (nsz) { /* Each loop in 'case' is unrolled */ 992 case 1: 993 sum1 = tmp[row]; 994 995 for (j = nz; j > 1; j -= 2) { 996 vi -= 2; 997 i0 = vi[2]; 998 i1 = vi[1]; 999 tmp0 = tmps[i0]; 1000 tmp1 = tmps[i1]; 1001 v1 -= 2; 1002 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1003 } 1004 if (j == 1) { 1005 tmp0 = tmps[*vi--]; 1006 sum1 -= *v1-- * tmp0; 1007 } 1008 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1009 row--; 1010 break; 1011 case 2: 1012 sum1 = tmp[row]; 1013 sum2 = tmp[row - 1]; 1014 v2 = aa + ai[row] - 1; 1015 for (j = nz; j > 1; j -= 2) { 1016 vi -= 2; 1017 i0 = vi[2]; 1018 i1 = vi[1]; 1019 tmp0 = tmps[i0]; 1020 tmp1 = tmps[i1]; 1021 v1 -= 2; 1022 v2 -= 2; 1023 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1024 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1025 } 1026 if (j == 1) { 1027 tmp0 = tmps[*vi--]; 1028 sum1 -= *v1-- * tmp0; 1029 sum2 -= *v2-- * tmp0; 1030 } 1031 1032 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1033 row--; 1034 sum2 -= *v2-- * tmp0; 1035 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1036 row--; 1037 break; 1038 case 3: 1039 sum1 = tmp[row]; 1040 sum2 = tmp[row - 1]; 1041 sum3 = tmp[row - 2]; 1042 v2 = aa + ai[row] - 1; 1043 v3 = aa + ai[row - 1] - 1; 1044 for (j = nz; j > 1; j -= 2) { 1045 vi -= 2; 1046 i0 = vi[2]; 1047 i1 = vi[1]; 1048 tmp0 = tmps[i0]; 1049 tmp1 = tmps[i1]; 1050 v1 -= 2; 1051 v2 -= 2; 1052 v3 -= 2; 1053 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1054 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1055 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1056 } 1057 if (j == 1) { 1058 tmp0 = tmps[*vi--]; 1059 sum1 -= *v1-- * tmp0; 1060 sum2 -= *v2-- * tmp0; 1061 sum3 -= *v3-- * tmp0; 1062 } 1063 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1064 row--; 1065 sum2 -= *v2-- * tmp0; 1066 sum3 -= *v3-- * tmp0; 1067 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1068 row--; 1069 sum3 -= *v3-- * tmp0; 1070 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1071 row--; 1072 1073 break; 1074 case 4: 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row - 1]; 1077 sum3 = tmp[row - 2]; 1078 sum4 = tmp[row - 3]; 1079 v2 = aa + ai[row] - 1; 1080 v3 = aa + ai[row - 1] - 1; 1081 v4 = aa + ai[row - 2] - 1; 1082 1083 for (j = nz; j > 1; j -= 2) { 1084 vi -= 2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j == 1) { 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1107 row--; 1108 sum2 -= *v2-- * tmp0; 1109 sum3 -= *v3-- * tmp0; 1110 sum4 -= *v4-- * tmp0; 1111 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1112 row--; 1113 sum3 -= *v3-- * tmp0; 1114 sum4 -= *v4-- * tmp0; 1115 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1116 row--; 1117 sum4 -= *v4-- * tmp0; 1118 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1119 row--; 1120 break; 1121 case 5: 1122 sum1 = tmp[row]; 1123 sum2 = tmp[row - 1]; 1124 sum3 = tmp[row - 2]; 1125 sum4 = tmp[row - 3]; 1126 sum5 = tmp[row - 4]; 1127 v2 = aa + ai[row] - 1; 1128 v3 = aa + ai[row - 1] - 1; 1129 v4 = aa + ai[row - 2] - 1; 1130 v5 = aa + ai[row - 3] - 1; 1131 for (j = nz; j > 1; j -= 2) { 1132 vi -= 2; 1133 i0 = vi[2]; 1134 i1 = vi[1]; 1135 tmp0 = tmps[i0]; 1136 tmp1 = tmps[i1]; 1137 v1 -= 2; 1138 v2 -= 2; 1139 v3 -= 2; 1140 v4 -= 2; 1141 v5 -= 2; 1142 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1143 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1144 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1145 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1146 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1147 } 1148 if (j == 1) { 1149 tmp0 = tmps[*vi--]; 1150 sum1 -= *v1-- * tmp0; 1151 sum2 -= *v2-- * tmp0; 1152 sum3 -= *v3-- * tmp0; 1153 sum4 -= *v4-- * tmp0; 1154 sum5 -= *v5-- * tmp0; 1155 } 1156 1157 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1158 row--; 1159 sum2 -= *v2-- * tmp0; 1160 sum3 -= *v3-- * tmp0; 1161 sum4 -= *v4-- * tmp0; 1162 sum5 -= *v5-- * tmp0; 1163 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1164 row--; 1165 sum3 -= *v3-- * tmp0; 1166 sum4 -= *v4-- * tmp0; 1167 sum5 -= *v5-- * tmp0; 1168 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1169 row--; 1170 sum4 -= *v4-- * tmp0; 1171 sum5 -= *v5-- * tmp0; 1172 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1173 row--; 1174 sum5 -= *v5-- * tmp0; 1175 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1176 row--; 1177 break; 1178 default: 1179 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1180 } 1181 } 1182 PetscCall(ISRestoreIndices(isrow, &rout)); 1183 PetscCall(ISRestoreIndices(iscol, &cout)); 1184 PetscCall(VecRestoreArrayRead(bb, &b)); 1185 PetscCall(VecRestoreArrayWrite(xx, &x)); 1186 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1187 PetscFunctionReturn(PETSC_SUCCESS); 1188 } 1189 1190 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1191 { 1192 Mat C = B; 1193 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1194 IS isrow = b->row, isicol = b->icol; 1195 const PetscInt *r, *ic, *ics; 1196 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1197 PetscInt i, j, k, nz, nzL, row, *pj; 1198 const PetscInt *ajtmp, *bjtmp; 1199 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1200 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1201 FactorShiftCtx sctx; 1202 const PetscInt *ddiag; 1203 PetscReal rs; 1204 MatScalar d; 1205 PetscInt inod, nodesz, node_max, col; 1206 const PetscInt *ns; 1207 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1208 1209 PetscFunctionBegin; 1210 /* MatPivotSetUp(): initialize shift context sctx */ 1211 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1212 1213 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1214 ddiag = a->diag; 1215 sctx.shift_top = info->zeropivot; 1216 for (i = 0; i < n; i++) { 1217 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1218 d = (aa)[ddiag[i]]; 1219 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1220 v = aa + ai[i]; 1221 nz = ai[i + 1] - ai[i]; 1222 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1223 if (rs > sctx.shift_top) sctx.shift_top = rs; 1224 } 1225 sctx.shift_top *= 1.1; 1226 sctx.nshift_max = 5; 1227 sctx.shift_lo = 0.; 1228 sctx.shift_hi = 1.; 1229 } 1230 1231 PetscCall(ISGetIndices(isrow, &r)); 1232 PetscCall(ISGetIndices(isicol, &ic)); 1233 1234 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1235 ics = ic; 1236 1237 node_max = a->inode.node_count; 1238 ns = a->inode.size; 1239 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1240 1241 /* If max inode size > 4, split it into two inodes.*/ 1242 /* also map the inode sizes according to the ordering */ 1243 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1244 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1245 if (ns[i] > 4) { 1246 tmp_vec1[j] = 4; 1247 ++j; 1248 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 1249 } else { 1250 tmp_vec1[j] = ns[i]; 1251 } 1252 } 1253 /* Use the correct node_max */ 1254 node_max = j; 1255 1256 /* Now reorder the inode info based on mat re-ordering info */ 1257 /* First create a row -> inode_size_array_index map */ 1258 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1259 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1260 for (i = 0, row = 0; i < node_max; i++) { 1261 nodesz = tmp_vec1[i]; 1262 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1263 } 1264 /* Using nsmap, create a reordered ns structure */ 1265 for (i = 0, j = 0; i < node_max; i++) { 1266 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1267 tmp_vec2[i] = nodesz; 1268 j += nodesz; 1269 } 1270 PetscCall(PetscFree(nsmap)); 1271 PetscCall(PetscFree(tmp_vec1)); 1272 1273 /* Now use the correct ns */ 1274 ns = tmp_vec2; 1275 1276 do { 1277 sctx.newshift = PETSC_FALSE; 1278 /* Now loop over each block-row, and do the factorization */ 1279 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1280 nodesz = ns[inod]; 1281 1282 switch (nodesz) { 1283 case 1: 1284 /*----------*/ 1285 /* zero rtmp1 */ 1286 /* L part */ 1287 nz = bi[i + 1] - bi[i]; 1288 bjtmp = bj + bi[i]; 1289 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1290 1291 /* U part */ 1292 nz = bdiag[i] - bdiag[i + 1]; 1293 bjtmp = bj + bdiag[i + 1] + 1; 1294 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1295 1296 /* load in initial (unfactored row) */ 1297 nz = ai[r[i] + 1] - ai[r[i]]; 1298 ajtmp = aj + ai[r[i]]; 1299 v = aa + ai[r[i]]; 1300 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1301 1302 /* ZeropivotApply() */ 1303 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1304 1305 /* elimination */ 1306 bjtmp = bj + bi[i]; 1307 row = *bjtmp++; 1308 nzL = bi[i + 1] - bi[i]; 1309 for (k = 0; k < nzL; k++) { 1310 pc = rtmp1 + row; 1311 if (*pc != 0.0) { 1312 pv = b->a + bdiag[row]; 1313 mul1 = *pc * (*pv); 1314 *pc = mul1; 1315 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1316 pv = b->a + bdiag[row + 1] + 1; 1317 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1318 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1319 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1320 } 1321 row = *bjtmp++; 1322 } 1323 1324 /* finished row so stick it into b->a */ 1325 rs = 0.0; 1326 /* L part */ 1327 pv = b->a + bi[i]; 1328 pj = b->j + bi[i]; 1329 nz = bi[i + 1] - bi[i]; 1330 for (j = 0; j < nz; j++) { 1331 pv[j] = rtmp1[pj[j]]; 1332 rs += PetscAbsScalar(pv[j]); 1333 } 1334 1335 /* U part */ 1336 pv = b->a + bdiag[i + 1] + 1; 1337 pj = b->j + bdiag[i + 1] + 1; 1338 nz = bdiag[i] - bdiag[i + 1] - 1; 1339 for (j = 0; j < nz; j++) { 1340 pv[j] = rtmp1[pj[j]]; 1341 rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* Check zero pivot */ 1345 sctx.rs = rs; 1346 sctx.pv = rtmp1[i]; 1347 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1348 if (sctx.newshift) break; 1349 1350 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1351 pv = b->a + bdiag[i]; 1352 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1353 break; 1354 1355 case 2: 1356 /*----------*/ 1357 /* zero rtmp1 and rtmp2 */ 1358 /* L part */ 1359 nz = bi[i + 1] - bi[i]; 1360 bjtmp = bj + bi[i]; 1361 for (j = 0; j < nz; j++) { 1362 col = bjtmp[j]; 1363 rtmp1[col] = 0.0; 1364 rtmp2[col] = 0.0; 1365 } 1366 1367 /* U part */ 1368 nz = bdiag[i] - bdiag[i + 1]; 1369 bjtmp = bj + bdiag[i + 1] + 1; 1370 for (j = 0; j < nz; j++) { 1371 col = bjtmp[j]; 1372 rtmp1[col] = 0.0; 1373 rtmp2[col] = 0.0; 1374 } 1375 1376 /* load in initial (unfactored row) */ 1377 nz = ai[r[i] + 1] - ai[r[i]]; 1378 ajtmp = aj + ai[r[i]]; 1379 v1 = aa + ai[r[i]]; 1380 v2 = aa + ai[r[i] + 1]; 1381 for (j = 0; j < nz; j++) { 1382 col = ics[ajtmp[j]]; 1383 rtmp1[col] = v1[j]; 1384 rtmp2[col] = v2[j]; 1385 } 1386 /* ZeropivotApply(): shift the diagonal of the matrix */ 1387 rtmp1[i] += sctx.shift_amount; 1388 rtmp2[i + 1] += sctx.shift_amount; 1389 1390 /* elimination */ 1391 bjtmp = bj + bi[i]; 1392 row = *bjtmp++; /* pivot row */ 1393 nzL = bi[i + 1] - bi[i]; 1394 for (k = 0; k < nzL; k++) { 1395 pc1 = rtmp1 + row; 1396 pc2 = rtmp2 + row; 1397 if (*pc1 != 0.0 || *pc2 != 0.0) { 1398 pv = b->a + bdiag[row]; 1399 mul1 = *pc1 * (*pv); 1400 mul2 = *pc2 * (*pv); 1401 *pc1 = mul1; 1402 *pc2 = mul2; 1403 1404 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1405 pv = b->a + bdiag[row + 1] + 1; 1406 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1407 for (j = 0; j < nz; j++) { 1408 col = pj[j]; 1409 rtmp1[col] -= mul1 * pv[j]; 1410 rtmp2[col] -= mul2 * pv[j]; 1411 } 1412 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1413 } 1414 row = *bjtmp++; 1415 } 1416 1417 /* finished row i; check zero pivot, then stick row i into b->a */ 1418 rs = 0.0; 1419 /* L part */ 1420 pc1 = b->a + bi[i]; 1421 pj = b->j + bi[i]; 1422 nz = bi[i + 1] - bi[i]; 1423 for (j = 0; j < nz; j++) { 1424 col = pj[j]; 1425 pc1[j] = rtmp1[col]; 1426 rs += PetscAbsScalar(pc1[j]); 1427 } 1428 /* U part */ 1429 pc1 = b->a + bdiag[i + 1] + 1; 1430 pj = b->j + bdiag[i + 1] + 1; 1431 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1432 for (j = 0; j < nz; j++) { 1433 col = pj[j]; 1434 pc1[j] = rtmp1[col]; 1435 rs += PetscAbsScalar(pc1[j]); 1436 } 1437 1438 sctx.rs = rs; 1439 sctx.pv = rtmp1[i]; 1440 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1441 if (sctx.newshift) break; 1442 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1443 *pc1 = 1.0 / sctx.pv; 1444 1445 /* Now take care of diagonal 2x2 block. */ 1446 pc2 = rtmp2 + i; 1447 if (*pc2 != 0.0) { 1448 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1449 *pc2 = mul1; /* insert L entry */ 1450 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1451 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1452 for (j = 0; j < nz; j++) { 1453 col = pj[j]; 1454 rtmp2[col] -= mul1 * rtmp1[col]; 1455 } 1456 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1457 } 1458 1459 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1460 rs = 0.0; 1461 /* L part */ 1462 pc2 = b->a + bi[i + 1]; 1463 pj = b->j + bi[i + 1]; 1464 nz = bi[i + 2] - bi[i + 1]; 1465 for (j = 0; j < nz; j++) { 1466 col = pj[j]; 1467 pc2[j] = rtmp2[col]; 1468 rs += PetscAbsScalar(pc2[j]); 1469 } 1470 /* U part */ 1471 pc2 = b->a + bdiag[i + 2] + 1; 1472 pj = b->j + bdiag[i + 2] + 1; 1473 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1474 for (j = 0; j < nz; j++) { 1475 col = pj[j]; 1476 pc2[j] = rtmp2[col]; 1477 rs += PetscAbsScalar(pc2[j]); 1478 } 1479 1480 sctx.rs = rs; 1481 sctx.pv = rtmp2[i + 1]; 1482 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1483 if (sctx.newshift) break; 1484 pc2 = b->a + bdiag[i + 1]; 1485 *pc2 = 1.0 / sctx.pv; 1486 break; 1487 1488 case 3: 1489 /*----------*/ 1490 /* zero rtmp */ 1491 /* L part */ 1492 nz = bi[i + 1] - bi[i]; 1493 bjtmp = bj + bi[i]; 1494 for (j = 0; j < nz; j++) { 1495 col = bjtmp[j]; 1496 rtmp1[col] = 0.0; 1497 rtmp2[col] = 0.0; 1498 rtmp3[col] = 0.0; 1499 } 1500 1501 /* U part */ 1502 nz = bdiag[i] - bdiag[i + 1]; 1503 bjtmp = bj + bdiag[i + 1] + 1; 1504 for (j = 0; j < nz; j++) { 1505 col = bjtmp[j]; 1506 rtmp1[col] = 0.0; 1507 rtmp2[col] = 0.0; 1508 rtmp3[col] = 0.0; 1509 } 1510 1511 /* load in initial (unfactored row) */ 1512 nz = ai[r[i] + 1] - ai[r[i]]; 1513 ajtmp = aj + ai[r[i]]; 1514 v1 = aa + ai[r[i]]; 1515 v2 = aa + ai[r[i] + 1]; 1516 v3 = aa + ai[r[i] + 2]; 1517 for (j = 0; j < nz; j++) { 1518 col = ics[ajtmp[j]]; 1519 rtmp1[col] = v1[j]; 1520 rtmp2[col] = v2[j]; 1521 rtmp3[col] = v3[j]; 1522 } 1523 /* ZeropivotApply(): shift the diagonal of the matrix */ 1524 rtmp1[i] += sctx.shift_amount; 1525 rtmp2[i + 1] += sctx.shift_amount; 1526 rtmp3[i + 2] += sctx.shift_amount; 1527 1528 /* elimination */ 1529 bjtmp = bj + bi[i]; 1530 row = *bjtmp++; /* pivot row */ 1531 nzL = bi[i + 1] - bi[i]; 1532 for (k = 0; k < nzL; k++) { 1533 pc1 = rtmp1 + row; 1534 pc2 = rtmp2 + row; 1535 pc3 = rtmp3 + row; 1536 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1537 pv = b->a + bdiag[row]; 1538 mul1 = *pc1 * (*pv); 1539 mul2 = *pc2 * (*pv); 1540 mul3 = *pc3 * (*pv); 1541 *pc1 = mul1; 1542 *pc2 = mul2; 1543 *pc3 = mul3; 1544 1545 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1546 pv = b->a + bdiag[row + 1] + 1; 1547 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1548 for (j = 0; j < nz; j++) { 1549 col = pj[j]; 1550 rtmp1[col] -= mul1 * pv[j]; 1551 rtmp2[col] -= mul2 * pv[j]; 1552 rtmp3[col] -= mul3 * pv[j]; 1553 } 1554 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1555 } 1556 row = *bjtmp++; 1557 } 1558 1559 /* finished row i; check zero pivot, then stick row i into b->a */ 1560 rs = 0.0; 1561 /* L part */ 1562 pc1 = b->a + bi[i]; 1563 pj = b->j + bi[i]; 1564 nz = bi[i + 1] - bi[i]; 1565 for (j = 0; j < nz; j++) { 1566 col = pj[j]; 1567 pc1[j] = rtmp1[col]; 1568 rs += PetscAbsScalar(pc1[j]); 1569 } 1570 /* U part */ 1571 pc1 = b->a + bdiag[i + 1] + 1; 1572 pj = b->j + bdiag[i + 1] + 1; 1573 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1574 for (j = 0; j < nz; j++) { 1575 col = pj[j]; 1576 pc1[j] = rtmp1[col]; 1577 rs += PetscAbsScalar(pc1[j]); 1578 } 1579 1580 sctx.rs = rs; 1581 sctx.pv = rtmp1[i]; 1582 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1583 if (sctx.newshift) break; 1584 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1585 *pc1 = 1.0 / sctx.pv; 1586 1587 /* Now take care of 1st column of diagonal 3x3 block. */ 1588 pc2 = rtmp2 + i; 1589 pc3 = rtmp3 + i; 1590 if (*pc2 != 0.0 || *pc3 != 0.0) { 1591 mul2 = (*pc2) * (*pc1); 1592 *pc2 = mul2; 1593 mul3 = (*pc3) * (*pc1); 1594 *pc3 = mul3; 1595 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1596 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1597 for (j = 0; j < nz; j++) { 1598 col = pj[j]; 1599 rtmp2[col] -= mul2 * rtmp1[col]; 1600 rtmp3[col] -= mul3 * rtmp1[col]; 1601 } 1602 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1603 } 1604 1605 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1606 rs = 0.0; 1607 /* L part */ 1608 pc2 = b->a + bi[i + 1]; 1609 pj = b->j + bi[i + 1]; 1610 nz = bi[i + 2] - bi[i + 1]; 1611 for (j = 0; j < nz; j++) { 1612 col = pj[j]; 1613 pc2[j] = rtmp2[col]; 1614 rs += PetscAbsScalar(pc2[j]); 1615 } 1616 /* U part */ 1617 pc2 = b->a + bdiag[i + 2] + 1; 1618 pj = b->j + bdiag[i + 2] + 1; 1619 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1620 for (j = 0; j < nz; j++) { 1621 col = pj[j]; 1622 pc2[j] = rtmp2[col]; 1623 rs += PetscAbsScalar(pc2[j]); 1624 } 1625 1626 sctx.rs = rs; 1627 sctx.pv = rtmp2[i + 1]; 1628 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1629 if (sctx.newshift) break; 1630 pc2 = b->a + bdiag[i + 1]; 1631 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1632 1633 /* Now take care of 2nd column of diagonal 3x3 block. */ 1634 pc3 = rtmp3 + i + 1; 1635 if (*pc3 != 0.0) { 1636 mul3 = (*pc3) * (*pc2); 1637 *pc3 = mul3; 1638 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1639 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1640 for (j = 0; j < nz; j++) { 1641 col = pj[j]; 1642 rtmp3[col] -= mul3 * rtmp2[col]; 1643 } 1644 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1645 } 1646 1647 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1648 rs = 0.0; 1649 /* L part */ 1650 pc3 = b->a + bi[i + 2]; 1651 pj = b->j + bi[i + 2]; 1652 nz = bi[i + 3] - bi[i + 2]; 1653 for (j = 0; j < nz; j++) { 1654 col = pj[j]; 1655 pc3[j] = rtmp3[col]; 1656 rs += PetscAbsScalar(pc3[j]); 1657 } 1658 /* U part */ 1659 pc3 = b->a + bdiag[i + 3] + 1; 1660 pj = b->j + bdiag[i + 3] + 1; 1661 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1662 for (j = 0; j < nz; j++) { 1663 col = pj[j]; 1664 pc3[j] = rtmp3[col]; 1665 rs += PetscAbsScalar(pc3[j]); 1666 } 1667 1668 sctx.rs = rs; 1669 sctx.pv = rtmp3[i + 2]; 1670 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1671 if (sctx.newshift) break; 1672 pc3 = b->a + bdiag[i + 2]; 1673 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1674 break; 1675 case 4: 1676 /*----------*/ 1677 /* zero rtmp */ 1678 /* L part */ 1679 nz = bi[i + 1] - bi[i]; 1680 bjtmp = bj + bi[i]; 1681 for (j = 0; j < nz; j++) { 1682 col = bjtmp[j]; 1683 rtmp1[col] = 0.0; 1684 rtmp2[col] = 0.0; 1685 rtmp3[col] = 0.0; 1686 rtmp4[col] = 0.0; 1687 } 1688 1689 /* U part */ 1690 nz = bdiag[i] - bdiag[i + 1]; 1691 bjtmp = bj + bdiag[i + 1] + 1; 1692 for (j = 0; j < nz; j++) { 1693 col = bjtmp[j]; 1694 rtmp1[col] = 0.0; 1695 rtmp2[col] = 0.0; 1696 rtmp3[col] = 0.0; 1697 rtmp4[col] = 0.0; 1698 } 1699 1700 /* load in initial (unfactored row) */ 1701 nz = ai[r[i] + 1] - ai[r[i]]; 1702 ajtmp = aj + ai[r[i]]; 1703 v1 = aa + ai[r[i]]; 1704 v2 = aa + ai[r[i] + 1]; 1705 v3 = aa + ai[r[i] + 2]; 1706 v4 = aa + ai[r[i] + 3]; 1707 for (j = 0; j < nz; j++) { 1708 col = ics[ajtmp[j]]; 1709 rtmp1[col] = v1[j]; 1710 rtmp2[col] = v2[j]; 1711 rtmp3[col] = v3[j]; 1712 rtmp4[col] = v4[j]; 1713 } 1714 /* ZeropivotApply(): shift the diagonal of the matrix */ 1715 rtmp1[i] += sctx.shift_amount; 1716 rtmp2[i + 1] += sctx.shift_amount; 1717 rtmp3[i + 2] += sctx.shift_amount; 1718 rtmp4[i + 3] += sctx.shift_amount; 1719 1720 /* elimination */ 1721 bjtmp = bj + bi[i]; 1722 row = *bjtmp++; /* pivot row */ 1723 nzL = bi[i + 1] - bi[i]; 1724 for (k = 0; k < nzL; k++) { 1725 pc1 = rtmp1 + row; 1726 pc2 = rtmp2 + row; 1727 pc3 = rtmp3 + row; 1728 pc4 = rtmp4 + row; 1729 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1730 pv = b->a + bdiag[row]; 1731 mul1 = *pc1 * (*pv); 1732 mul2 = *pc2 * (*pv); 1733 mul3 = *pc3 * (*pv); 1734 mul4 = *pc4 * (*pv); 1735 *pc1 = mul1; 1736 *pc2 = mul2; 1737 *pc3 = mul3; 1738 *pc4 = mul4; 1739 1740 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1741 pv = b->a + bdiag[row + 1] + 1; 1742 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1743 for (j = 0; j < nz; j++) { 1744 col = pj[j]; 1745 rtmp1[col] -= mul1 * pv[j]; 1746 rtmp2[col] -= mul2 * pv[j]; 1747 rtmp3[col] -= mul3 * pv[j]; 1748 rtmp4[col] -= mul4 * pv[j]; 1749 } 1750 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1751 } 1752 row = *bjtmp++; 1753 } 1754 1755 /* finished row i; check zero pivot, then stick row i into b->a */ 1756 rs = 0.0; 1757 /* L part */ 1758 pc1 = b->a + bi[i]; 1759 pj = b->j + bi[i]; 1760 nz = bi[i + 1] - bi[i]; 1761 for (j = 0; j < nz; j++) { 1762 col = pj[j]; 1763 pc1[j] = rtmp1[col]; 1764 rs += PetscAbsScalar(pc1[j]); 1765 } 1766 /* U part */ 1767 pc1 = b->a + bdiag[i + 1] + 1; 1768 pj = b->j + bdiag[i + 1] + 1; 1769 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1770 for (j = 0; j < nz; j++) { 1771 col = pj[j]; 1772 pc1[j] = rtmp1[col]; 1773 rs += PetscAbsScalar(pc1[j]); 1774 } 1775 1776 sctx.rs = rs; 1777 sctx.pv = rtmp1[i]; 1778 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1779 if (sctx.newshift) break; 1780 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1781 *pc1 = 1.0 / sctx.pv; 1782 1783 /* Now take care of 1st column of diagonal 4x4 block. */ 1784 pc2 = rtmp2 + i; 1785 pc3 = rtmp3 + i; 1786 pc4 = rtmp4 + i; 1787 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1788 mul2 = (*pc2) * (*pc1); 1789 *pc2 = mul2; 1790 mul3 = (*pc3) * (*pc1); 1791 *pc3 = mul3; 1792 mul4 = (*pc4) * (*pc1); 1793 *pc4 = mul4; 1794 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1795 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1796 for (j = 0; j < nz; j++) { 1797 col = pj[j]; 1798 rtmp2[col] -= mul2 * rtmp1[col]; 1799 rtmp3[col] -= mul3 * rtmp1[col]; 1800 rtmp4[col] -= mul4 * rtmp1[col]; 1801 } 1802 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1803 } 1804 1805 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1806 rs = 0.0; 1807 /* L part */ 1808 pc2 = b->a + bi[i + 1]; 1809 pj = b->j + bi[i + 1]; 1810 nz = bi[i + 2] - bi[i + 1]; 1811 for (j = 0; j < nz; j++) { 1812 col = pj[j]; 1813 pc2[j] = rtmp2[col]; 1814 rs += PetscAbsScalar(pc2[j]); 1815 } 1816 /* U part */ 1817 pc2 = b->a + bdiag[i + 2] + 1; 1818 pj = b->j + bdiag[i + 2] + 1; 1819 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1820 for (j = 0; j < nz; j++) { 1821 col = pj[j]; 1822 pc2[j] = rtmp2[col]; 1823 rs += PetscAbsScalar(pc2[j]); 1824 } 1825 1826 sctx.rs = rs; 1827 sctx.pv = rtmp2[i + 1]; 1828 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1829 if (sctx.newshift) break; 1830 pc2 = b->a + bdiag[i + 1]; 1831 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1832 1833 /* Now take care of 2nd column of diagonal 4x4 block. */ 1834 pc3 = rtmp3 + i + 1; 1835 pc4 = rtmp4 + i + 1; 1836 if (*pc3 != 0.0 || *pc4 != 0.0) { 1837 mul3 = (*pc3) * (*pc2); 1838 *pc3 = mul3; 1839 mul4 = (*pc4) * (*pc2); 1840 *pc4 = mul4; 1841 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1842 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1843 for (j = 0; j < nz; j++) { 1844 col = pj[j]; 1845 rtmp3[col] -= mul3 * rtmp2[col]; 1846 rtmp4[col] -= mul4 * rtmp2[col]; 1847 } 1848 PetscCall(PetscLogFlops(4.0 * nz)); 1849 } 1850 1851 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1852 rs = 0.0; 1853 /* L part */ 1854 pc3 = b->a + bi[i + 2]; 1855 pj = b->j + bi[i + 2]; 1856 nz = bi[i + 3] - bi[i + 2]; 1857 for (j = 0; j < nz; j++) { 1858 col = pj[j]; 1859 pc3[j] = rtmp3[col]; 1860 rs += PetscAbsScalar(pc3[j]); 1861 } 1862 /* U part */ 1863 pc3 = b->a + bdiag[i + 3] + 1; 1864 pj = b->j + bdiag[i + 3] + 1; 1865 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1866 for (j = 0; j < nz; j++) { 1867 col = pj[j]; 1868 pc3[j] = rtmp3[col]; 1869 rs += PetscAbsScalar(pc3[j]); 1870 } 1871 1872 sctx.rs = rs; 1873 sctx.pv = rtmp3[i + 2]; 1874 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1875 if (sctx.newshift) break; 1876 pc3 = b->a + bdiag[i + 2]; 1877 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1878 1879 /* Now take care of 3rd column of diagonal 4x4 block. */ 1880 pc4 = rtmp4 + i + 2; 1881 if (*pc4 != 0.0) { 1882 mul4 = (*pc4) * (*pc3); 1883 *pc4 = mul4; 1884 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1885 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1886 for (j = 0; j < nz; j++) { 1887 col = pj[j]; 1888 rtmp4[col] -= mul4 * rtmp3[col]; 1889 } 1890 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1891 } 1892 1893 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1894 rs = 0.0; 1895 /* L part */ 1896 pc4 = b->a + bi[i + 3]; 1897 pj = b->j + bi[i + 3]; 1898 nz = bi[i + 4] - bi[i + 3]; 1899 for (j = 0; j < nz; j++) { 1900 col = pj[j]; 1901 pc4[j] = rtmp4[col]; 1902 rs += PetscAbsScalar(pc4[j]); 1903 } 1904 /* U part */ 1905 pc4 = b->a + bdiag[i + 4] + 1; 1906 pj = b->j + bdiag[i + 4] + 1; 1907 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1908 for (j = 0; j < nz; j++) { 1909 col = pj[j]; 1910 pc4[j] = rtmp4[col]; 1911 rs += PetscAbsScalar(pc4[j]); 1912 } 1913 1914 sctx.rs = rs; 1915 sctx.pv = rtmp4[i + 3]; 1916 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1917 if (sctx.newshift) break; 1918 pc4 = b->a + bdiag[i + 3]; 1919 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1920 break; 1921 1922 default: 1923 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1924 } 1925 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1926 i += nodesz; /* Update the row */ 1927 } 1928 1929 /* MatPivotRefine() */ 1930 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1931 /* 1932 * if no shift in this attempt & shifting & started shifting & can refine, 1933 * then try lower shift 1934 */ 1935 sctx.shift_hi = sctx.shift_fraction; 1936 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1937 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1938 sctx.newshift = PETSC_TRUE; 1939 sctx.nshift++; 1940 } 1941 } while (sctx.newshift); 1942 1943 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1944 PetscCall(PetscFree(tmp_vec2)); 1945 PetscCall(ISRestoreIndices(isicol, &ic)); 1946 PetscCall(ISRestoreIndices(isrow, &r)); 1947 1948 if (b->inode.size) { 1949 C->ops->solve = MatSolve_SeqAIJ_Inode; 1950 } else { 1951 C->ops->solve = MatSolve_SeqAIJ; 1952 } 1953 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1954 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1955 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1956 C->ops->matsolve = MatMatSolve_SeqAIJ; 1957 C->assembled = PETSC_TRUE; 1958 C->preallocated = PETSC_TRUE; 1959 1960 PetscCall(PetscLogFlops(C->cmap->n)); 1961 1962 /* MatShiftView(A,info,&sctx) */ 1963 if (sctx.nshift) { 1964 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1965 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1966 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1967 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1968 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1969 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1970 } 1971 } 1972 PetscFunctionReturn(PETSC_SUCCESS); 1973 } 1974 1975 #if 0 1976 // unused 1977 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1978 { 1979 Mat C = B; 1980 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1981 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1982 const PetscInt *r, *ic, *c, *ics; 1983 PetscInt n = A->rmap->n, *bi = b->i; 1984 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1985 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1986 PetscInt *ai = a->i, *aj = a->j; 1987 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1988 PetscScalar mul1, mul2, mul3, tmp; 1989 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1990 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1991 PetscReal rs = 0.0; 1992 FactorShiftCtx sctx; 1993 1994 PetscFunctionBegin; 1995 sctx.shift_top = 0; 1996 sctx.nshift_max = 0; 1997 sctx.shift_lo = 0; 1998 sctx.shift_hi = 0; 1999 sctx.shift_fraction = 0; 2000 2001 /* if both shift schemes are chosen by user, only use info->shiftpd */ 2002 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 2003 sctx.shift_top = 0; 2004 for (i = 0; i < n; i++) { 2005 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 2006 rs = 0.0; 2007 ajtmp = aj + ai[i]; 2008 rtmp1 = aa + ai[i]; 2009 nz = ai[i + 1] - ai[i]; 2010 for (j = 0; j < nz; j++) { 2011 if (*ajtmp != i) { 2012 rs += PetscAbsScalar(*rtmp1++); 2013 } else { 2014 rs -= PetscRealPart(*rtmp1++); 2015 } 2016 ajtmp++; 2017 } 2018 if (rs > sctx.shift_top) sctx.shift_top = rs; 2019 } 2020 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2021 sctx.shift_top *= 1.1; 2022 sctx.nshift_max = 5; 2023 sctx.shift_lo = 0.; 2024 sctx.shift_hi = 1.; 2025 } 2026 sctx.shift_amount = 0; 2027 sctx.nshift = 0; 2028 2029 PetscCall(ISGetIndices(isrow, &r)); 2030 PetscCall(ISGetIndices(iscol, &c)); 2031 PetscCall(ISGetIndices(isicol, &ic)); 2032 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2033 ics = ic; 2034 2035 node_max = a->inode.node_count; 2036 ns = a->inode.size; 2037 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2038 2039 /* If max inode size > 3, split it into two inodes.*/ 2040 /* also map the inode sizes according to the ordering */ 2041 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2042 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2043 if (ns[i] > 3) { 2044 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2045 ++j; 2046 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2047 } else { 2048 tmp_vec1[j] = ns[i]; 2049 } 2050 } 2051 /* Use the correct node_max */ 2052 node_max = j; 2053 2054 /* Now reorder the inode info based on mat re-ordering info */ 2055 /* First create a row -> inode_size_array_index map */ 2056 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2057 for (i = 0, row = 0; i < node_max; i++) { 2058 nodesz = tmp_vec1[i]; 2059 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2060 } 2061 /* Using nsmap, create a reordered ns structure */ 2062 for (i = 0, j = 0; i < node_max; i++) { 2063 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2064 tmp_vec2[i] = nodesz; 2065 j += nodesz; 2066 } 2067 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2068 /* Now use the correct ns */ 2069 ns = tmp_vec2; 2070 2071 do { 2072 sctx.newshift = PETSC_FALSE; 2073 /* Now loop over each block-row, and do the factorization */ 2074 for (i = 0, row = 0; i < node_max; i++) { 2075 nodesz = ns[i]; 2076 nz = bi[row + 1] - bi[row]; 2077 bjtmp = bj + bi[row]; 2078 2079 switch (nodesz) { 2080 case 1: 2081 for (j = 0; j < nz; j++) { 2082 idx = bjtmp[j]; 2083 rtmp11[idx] = 0.0; 2084 } 2085 2086 /* load in initial (unfactored row) */ 2087 idx = r[row]; 2088 nz_tmp = ai[idx + 1] - ai[idx]; 2089 ajtmp = aj + ai[idx]; 2090 v1 = aa + ai[idx]; 2091 2092 for (j = 0; j < nz_tmp; j++) { 2093 idx = ics[ajtmp[j]]; 2094 rtmp11[idx] = v1[j]; 2095 } 2096 rtmp11[ics[r[row]]] += sctx.shift_amount; 2097 2098 prow = *bjtmp++; 2099 while (prow < row) { 2100 pc1 = rtmp11 + prow; 2101 if (*pc1 != 0.0) { 2102 pv = ba + bd[prow]; 2103 pj = nbj + bd[prow]; 2104 mul1 = *pc1 * *pv++; 2105 *pc1 = mul1; 2106 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2107 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2108 for (j = 0; j < nz_tmp; j++) { 2109 tmp = pv[j]; 2110 idx = pj[j]; 2111 rtmp11[idx] -= mul1 * tmp; 2112 } 2113 } 2114 prow = *bjtmp++; 2115 } 2116 pj = bj + bi[row]; 2117 pc1 = ba + bi[row]; 2118 2119 sctx.pv = rtmp11[row]; 2120 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2121 rs = 0.0; 2122 for (j = 0; j < nz; j++) { 2123 idx = pj[j]; 2124 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2125 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2126 } 2127 sctx.rs = rs; 2128 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2129 if (sctx.newshift) goto endofwhile; 2130 break; 2131 2132 case 2: 2133 for (j = 0; j < nz; j++) { 2134 idx = bjtmp[j]; 2135 rtmp11[idx] = 0.0; 2136 rtmp22[idx] = 0.0; 2137 } 2138 2139 /* load in initial (unfactored row) */ 2140 idx = r[row]; 2141 nz_tmp = ai[idx + 1] - ai[idx]; 2142 ajtmp = aj + ai[idx]; 2143 v1 = aa + ai[idx]; 2144 v2 = aa + ai[idx + 1]; 2145 for (j = 0; j < nz_tmp; j++) { 2146 idx = ics[ajtmp[j]]; 2147 rtmp11[idx] = v1[j]; 2148 rtmp22[idx] = v2[j]; 2149 } 2150 rtmp11[ics[r[row]]] += sctx.shift_amount; 2151 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2152 2153 prow = *bjtmp++; 2154 while (prow < row) { 2155 pc1 = rtmp11 + prow; 2156 pc2 = rtmp22 + prow; 2157 if (*pc1 != 0.0 || *pc2 != 0.0) { 2158 pv = ba + bd[prow]; 2159 pj = nbj + bd[prow]; 2160 mul1 = *pc1 * *pv; 2161 mul2 = *pc2 * *pv; 2162 ++pv; 2163 *pc1 = mul1; 2164 *pc2 = mul2; 2165 2166 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2167 for (j = 0; j < nz_tmp; j++) { 2168 tmp = pv[j]; 2169 idx = pj[j]; 2170 rtmp11[idx] -= mul1 * tmp; 2171 rtmp22[idx] -= mul2 * tmp; 2172 } 2173 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2174 } 2175 prow = *bjtmp++; 2176 } 2177 2178 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2179 pc1 = rtmp11 + prow; 2180 pc2 = rtmp22 + prow; 2181 2182 sctx.pv = *pc1; 2183 pj = bj + bi[prow]; 2184 rs = 0.0; 2185 for (j = 0; j < nz; j++) { 2186 idx = pj[j]; 2187 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2188 } 2189 sctx.rs = rs; 2190 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2191 if (sctx.newshift) goto endofwhile; 2192 2193 if (*pc2 != 0.0) { 2194 pj = nbj + bd[prow]; 2195 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2196 *pc2 = mul2; 2197 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2198 for (j = 0; j < nz_tmp; j++) { 2199 idx = pj[j]; 2200 tmp = rtmp11[idx]; 2201 rtmp22[idx] -= mul2 * tmp; 2202 } 2203 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2204 } 2205 2206 pj = bj + bi[row]; 2207 pc1 = ba + bi[row]; 2208 pc2 = ba + bi[row + 1]; 2209 2210 sctx.pv = rtmp22[row + 1]; 2211 rs = 0.0; 2212 rtmp11[row] = 1.0 / rtmp11[row]; 2213 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2214 /* copy row entries from dense representation to sparse */ 2215 for (j = 0; j < nz; j++) { 2216 idx = pj[j]; 2217 pc1[j] = rtmp11[idx]; 2218 pc2[j] = rtmp22[idx]; 2219 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2220 } 2221 sctx.rs = rs; 2222 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2223 if (sctx.newshift) goto endofwhile; 2224 break; 2225 2226 case 3: 2227 for (j = 0; j < nz; j++) { 2228 idx = bjtmp[j]; 2229 rtmp11[idx] = 0.0; 2230 rtmp22[idx] = 0.0; 2231 rtmp33[idx] = 0.0; 2232 } 2233 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2234 idx = r[row]; 2235 nz_tmp = ai[idx + 1] - ai[idx]; 2236 ajtmp = aj + ai[idx]; 2237 v1 = aa + ai[idx]; 2238 v2 = aa + ai[idx + 1]; 2239 v3 = aa + ai[idx + 2]; 2240 for (j = 0; j < nz_tmp; j++) { 2241 idx = ics[ajtmp[j]]; 2242 rtmp11[idx] = v1[j]; 2243 rtmp22[idx] = v2[j]; 2244 rtmp33[idx] = v3[j]; 2245 } 2246 rtmp11[ics[r[row]]] += sctx.shift_amount; 2247 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2248 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2249 2250 /* loop over all pivot row blocks above this row block */ 2251 prow = *bjtmp++; 2252 while (prow < row) { 2253 pc1 = rtmp11 + prow; 2254 pc2 = rtmp22 + prow; 2255 pc3 = rtmp33 + prow; 2256 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2257 pv = ba + bd[prow]; 2258 pj = nbj + bd[prow]; 2259 mul1 = *pc1 * *pv; 2260 mul2 = *pc2 * *pv; 2261 mul3 = *pc3 * *pv; 2262 ++pv; 2263 *pc1 = mul1; 2264 *pc2 = mul2; 2265 *pc3 = mul3; 2266 2267 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2268 /* update this row based on pivot row */ 2269 for (j = 0; j < nz_tmp; j++) { 2270 tmp = pv[j]; 2271 idx = pj[j]; 2272 rtmp11[idx] -= mul1 * tmp; 2273 rtmp22[idx] -= mul2 * tmp; 2274 rtmp33[idx] -= mul3 * tmp; 2275 } 2276 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2277 } 2278 prow = *bjtmp++; 2279 } 2280 2281 /* Now take care of diagonal 3x3 block in this set of rows */ 2282 /* note: prow = row here */ 2283 pc1 = rtmp11 + prow; 2284 pc2 = rtmp22 + prow; 2285 pc3 = rtmp33 + prow; 2286 2287 sctx.pv = *pc1; 2288 pj = bj + bi[prow]; 2289 rs = 0.0; 2290 for (j = 0; j < nz; j++) { 2291 idx = pj[j]; 2292 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2293 } 2294 sctx.rs = rs; 2295 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2296 if (sctx.newshift) goto endofwhile; 2297 2298 if (*pc2 != 0.0 || *pc3 != 0.0) { 2299 mul2 = (*pc2) / (*pc1); 2300 mul3 = (*pc3) / (*pc1); 2301 *pc2 = mul2; 2302 *pc3 = mul3; 2303 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2304 pj = nbj + bd[prow]; 2305 for (j = 0; j < nz_tmp; j++) { 2306 idx = pj[j]; 2307 tmp = rtmp11[idx]; 2308 rtmp22[idx] -= mul2 * tmp; 2309 rtmp33[idx] -= mul3 * tmp; 2310 } 2311 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2312 } 2313 ++prow; 2314 2315 pc2 = rtmp22 + prow; 2316 pc3 = rtmp33 + prow; 2317 sctx.pv = *pc2; 2318 pj = bj + bi[prow]; 2319 rs = 0.0; 2320 for (j = 0; j < nz; j++) { 2321 idx = pj[j]; 2322 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2323 } 2324 sctx.rs = rs; 2325 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2326 if (sctx.newshift) goto endofwhile; 2327 2328 if (*pc3 != 0.0) { 2329 mul3 = (*pc3) / (*pc2); 2330 *pc3 = mul3; 2331 pj = nbj + bd[prow]; 2332 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2333 for (j = 0; j < nz_tmp; j++) { 2334 idx = pj[j]; 2335 tmp = rtmp22[idx]; 2336 rtmp33[idx] -= mul3 * tmp; 2337 } 2338 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2339 } 2340 2341 pj = bj + bi[row]; 2342 pc1 = ba + bi[row]; 2343 pc2 = ba + bi[row + 1]; 2344 pc3 = ba + bi[row + 2]; 2345 2346 sctx.pv = rtmp33[row + 2]; 2347 rs = 0.0; 2348 rtmp11[row] = 1.0 / rtmp11[row]; 2349 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2350 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2351 /* copy row entries from dense representation to sparse */ 2352 for (j = 0; j < nz; j++) { 2353 idx = pj[j]; 2354 pc1[j] = rtmp11[idx]; 2355 pc2[j] = rtmp22[idx]; 2356 pc3[j] = rtmp33[idx]; 2357 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2358 } 2359 2360 sctx.rs = rs; 2361 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2362 if (sctx.newshift) goto endofwhile; 2363 break; 2364 2365 default: 2366 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2367 } 2368 row += nodesz; /* Update the row */ 2369 } 2370 endofwhile:; 2371 } while (sctx.newshift); 2372 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2373 PetscCall(PetscFree(tmp_vec2)); 2374 PetscCall(ISRestoreIndices(isicol, &ic)); 2375 PetscCall(ISRestoreIndices(isrow, &r)); 2376 PetscCall(ISRestoreIndices(iscol, &c)); 2377 2378 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2379 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2380 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2381 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2382 C->assembled = PETSC_TRUE; 2383 C->preallocated = PETSC_TRUE; 2384 if (sctx.nshift) { 2385 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2386 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2387 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2388 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2389 } 2390 } 2391 PetscCall(PetscLogFlops(C->cmap->n)); 2392 PetscCall(MatSeqAIJCheckInode(C)); 2393 PetscFunctionReturn(PETSC_SUCCESS); 2394 } 2395 #endif 2396 2397 /* ----------------------------------------------------------- */ 2398 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2399 { 2400 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2401 IS iscol = a->col, isrow = a->row; 2402 const PetscInt *r, *c, *rout, *cout; 2403 PetscInt i, j, n = A->rmap->n; 2404 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2405 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2406 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2407 PetscScalar sum1, sum2, sum3, sum4, sum5; 2408 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2409 const PetscScalar *b; 2410 2411 PetscFunctionBegin; 2412 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2413 node_max = a->inode.node_count; 2414 ns = a->inode.size; /* Node Size array */ 2415 2416 PetscCall(VecGetArrayRead(bb, &b)); 2417 PetscCall(VecGetArrayWrite(xx, &x)); 2418 tmp = a->solve_work; 2419 2420 PetscCall(ISGetIndices(isrow, &rout)); 2421 r = rout; 2422 PetscCall(ISGetIndices(iscol, &cout)); 2423 c = cout; 2424 2425 /* forward solve the lower triangular */ 2426 tmps = tmp; 2427 aa = a_a; 2428 aj = a_j; 2429 ad = a->diag; 2430 2431 for (i = 0, row = 0; i < node_max; ++i) { 2432 nsz = ns[i]; 2433 aii = ai[row]; 2434 v1 = aa + aii; 2435 vi = aj + aii; 2436 nz = ai[row + 1] - ai[row]; 2437 2438 if (i < node_max - 1) { 2439 /* Prefetch the indices for the next block */ 2440 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2441 /* Prefetch the data for the next block */ 2442 PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2443 } 2444 2445 switch (nsz) { /* Each loop in 'case' is unrolled */ 2446 case 1: 2447 sum1 = b[r[row]]; 2448 for (j = 0; j < nz - 1; j += 2) { 2449 i0 = vi[j]; 2450 i1 = vi[j + 1]; 2451 tmp0 = tmps[i0]; 2452 tmp1 = tmps[i1]; 2453 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2454 } 2455 if (j == nz - 1) { 2456 tmp0 = tmps[vi[j]]; 2457 sum1 -= v1[j] * tmp0; 2458 } 2459 tmp[row++] = sum1; 2460 break; 2461 case 2: 2462 sum1 = b[r[row]]; 2463 sum2 = b[r[row + 1]]; 2464 v2 = aa + ai[row + 1]; 2465 2466 for (j = 0; j < nz - 1; j += 2) { 2467 i0 = vi[j]; 2468 i1 = vi[j + 1]; 2469 tmp0 = tmps[i0]; 2470 tmp1 = tmps[i1]; 2471 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2472 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2473 } 2474 if (j == nz - 1) { 2475 tmp0 = tmps[vi[j]]; 2476 sum1 -= v1[j] * tmp0; 2477 sum2 -= v2[j] * tmp0; 2478 } 2479 sum2 -= v2[nz] * sum1; 2480 tmp[row++] = sum1; 2481 tmp[row++] = sum2; 2482 break; 2483 case 3: 2484 sum1 = b[r[row]]; 2485 sum2 = b[r[row + 1]]; 2486 sum3 = b[r[row + 2]]; 2487 v2 = aa + ai[row + 1]; 2488 v3 = aa + ai[row + 2]; 2489 2490 for (j = 0; j < nz - 1; j += 2) { 2491 i0 = vi[j]; 2492 i1 = vi[j + 1]; 2493 tmp0 = tmps[i0]; 2494 tmp1 = tmps[i1]; 2495 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2496 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2497 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2498 } 2499 if (j == nz - 1) { 2500 tmp0 = tmps[vi[j]]; 2501 sum1 -= v1[j] * tmp0; 2502 sum2 -= v2[j] * tmp0; 2503 sum3 -= v3[j] * tmp0; 2504 } 2505 sum2 -= v2[nz] * sum1; 2506 sum3 -= v3[nz] * sum1; 2507 sum3 -= v3[nz + 1] * sum2; 2508 tmp[row++] = sum1; 2509 tmp[row++] = sum2; 2510 tmp[row++] = sum3; 2511 break; 2512 2513 case 4: 2514 sum1 = b[r[row]]; 2515 sum2 = b[r[row + 1]]; 2516 sum3 = b[r[row + 2]]; 2517 sum4 = b[r[row + 3]]; 2518 v2 = aa + ai[row + 1]; 2519 v3 = aa + ai[row + 2]; 2520 v4 = aa + ai[row + 3]; 2521 2522 for (j = 0; j < nz - 1; j += 2) { 2523 i0 = vi[j]; 2524 i1 = vi[j + 1]; 2525 tmp0 = tmps[i0]; 2526 tmp1 = tmps[i1]; 2527 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2528 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2529 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2530 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2531 } 2532 if (j == nz - 1) { 2533 tmp0 = tmps[vi[j]]; 2534 sum1 -= v1[j] * tmp0; 2535 sum2 -= v2[j] * tmp0; 2536 sum3 -= v3[j] * tmp0; 2537 sum4 -= v4[j] * tmp0; 2538 } 2539 sum2 -= v2[nz] * sum1; 2540 sum3 -= v3[nz] * sum1; 2541 sum4 -= v4[nz] * sum1; 2542 sum3 -= v3[nz + 1] * sum2; 2543 sum4 -= v4[nz + 1] * sum2; 2544 sum4 -= v4[nz + 2] * sum3; 2545 2546 tmp[row++] = sum1; 2547 tmp[row++] = sum2; 2548 tmp[row++] = sum3; 2549 tmp[row++] = sum4; 2550 break; 2551 case 5: 2552 sum1 = b[r[row]]; 2553 sum2 = b[r[row + 1]]; 2554 sum3 = b[r[row + 2]]; 2555 sum4 = b[r[row + 3]]; 2556 sum5 = b[r[row + 4]]; 2557 v2 = aa + ai[row + 1]; 2558 v3 = aa + ai[row + 2]; 2559 v4 = aa + ai[row + 3]; 2560 v5 = aa + ai[row + 4]; 2561 2562 for (j = 0; j < nz - 1; j += 2) { 2563 i0 = vi[j]; 2564 i1 = vi[j + 1]; 2565 tmp0 = tmps[i0]; 2566 tmp1 = tmps[i1]; 2567 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2568 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2569 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2570 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2571 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2572 } 2573 if (j == nz - 1) { 2574 tmp0 = tmps[vi[j]]; 2575 sum1 -= v1[j] * tmp0; 2576 sum2 -= v2[j] * tmp0; 2577 sum3 -= v3[j] * tmp0; 2578 sum4 -= v4[j] * tmp0; 2579 sum5 -= v5[j] * tmp0; 2580 } 2581 2582 sum2 -= v2[nz] * sum1; 2583 sum3 -= v3[nz] * sum1; 2584 sum4 -= v4[nz] * sum1; 2585 sum5 -= v5[nz] * sum1; 2586 sum3 -= v3[nz + 1] * sum2; 2587 sum4 -= v4[nz + 1] * sum2; 2588 sum5 -= v5[nz + 1] * sum2; 2589 sum4 -= v4[nz + 2] * sum3; 2590 sum5 -= v5[nz + 2] * sum3; 2591 sum5 -= v5[nz + 3] * sum4; 2592 2593 tmp[row++] = sum1; 2594 tmp[row++] = sum2; 2595 tmp[row++] = sum3; 2596 tmp[row++] = sum4; 2597 tmp[row++] = sum5; 2598 break; 2599 default: 2600 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2601 } 2602 } 2603 /* backward solve the upper triangular */ 2604 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 2605 nsz = ns[i]; 2606 aii = ad[row + 1] + 1; 2607 v1 = aa + aii; 2608 vi = aj + aii; 2609 nz = ad[row] - ad[row + 1] - 1; 2610 2611 if (i > 0) { 2612 /* Prefetch the indices for the next block */ 2613 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2614 /* Prefetch the data for the next block */ 2615 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2616 } 2617 2618 switch (nsz) { /* Each loop in 'case' is unrolled */ 2619 case 1: 2620 sum1 = tmp[row]; 2621 2622 for (j = 0; j < nz - 1; j += 2) { 2623 i0 = vi[j]; 2624 i1 = vi[j + 1]; 2625 tmp0 = tmps[i0]; 2626 tmp1 = tmps[i1]; 2627 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2628 } 2629 if (j == nz - 1) { 2630 tmp0 = tmps[vi[j]]; 2631 sum1 -= v1[j] * tmp0; 2632 } 2633 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2634 row--; 2635 break; 2636 case 2: 2637 sum1 = tmp[row]; 2638 sum2 = tmp[row - 1]; 2639 v2 = aa + ad[row] + 1; 2640 for (j = 0; j < nz - 1; j += 2) { 2641 i0 = vi[j]; 2642 i1 = vi[j + 1]; 2643 tmp0 = tmps[i0]; 2644 tmp1 = tmps[i1]; 2645 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2646 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2647 } 2648 if (j == nz - 1) { 2649 tmp0 = tmps[vi[j]]; 2650 sum1 -= v1[j] * tmp0; 2651 sum2 -= v2[j + 1] * tmp0; 2652 } 2653 2654 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2655 row--; 2656 sum2 -= v2[0] * tmp0; 2657 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2658 row--; 2659 break; 2660 case 3: 2661 sum1 = tmp[row]; 2662 sum2 = tmp[row - 1]; 2663 sum3 = tmp[row - 2]; 2664 v2 = aa + ad[row] + 1; 2665 v3 = aa + ad[row - 1] + 1; 2666 for (j = 0; j < nz - 1; j += 2) { 2667 i0 = vi[j]; 2668 i1 = vi[j + 1]; 2669 tmp0 = tmps[i0]; 2670 tmp1 = tmps[i1]; 2671 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2672 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2673 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2674 } 2675 if (j == nz - 1) { 2676 tmp0 = tmps[vi[j]]; 2677 sum1 -= v1[j] * tmp0; 2678 sum2 -= v2[j + 1] * tmp0; 2679 sum3 -= v3[j + 2] * tmp0; 2680 } 2681 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2682 row--; 2683 sum2 -= v2[0] * tmp0; 2684 sum3 -= v3[1] * tmp0; 2685 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2686 row--; 2687 sum3 -= v3[0] * tmp0; 2688 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2689 row--; 2690 2691 break; 2692 case 4: 2693 sum1 = tmp[row]; 2694 sum2 = tmp[row - 1]; 2695 sum3 = tmp[row - 2]; 2696 sum4 = tmp[row - 3]; 2697 v2 = aa + ad[row] + 1; 2698 v3 = aa + ad[row - 1] + 1; 2699 v4 = aa + ad[row - 2] + 1; 2700 2701 for (j = 0; j < nz - 1; j += 2) { 2702 i0 = vi[j]; 2703 i1 = vi[j + 1]; 2704 tmp0 = tmps[i0]; 2705 tmp1 = tmps[i1]; 2706 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2707 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2708 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2709 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2710 } 2711 if (j == nz - 1) { 2712 tmp0 = tmps[vi[j]]; 2713 sum1 -= v1[j] * tmp0; 2714 sum2 -= v2[j + 1] * tmp0; 2715 sum3 -= v3[j + 2] * tmp0; 2716 sum4 -= v4[j + 3] * tmp0; 2717 } 2718 2719 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2720 row--; 2721 sum2 -= v2[0] * tmp0; 2722 sum3 -= v3[1] * tmp0; 2723 sum4 -= v4[2] * tmp0; 2724 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2725 row--; 2726 sum3 -= v3[0] * tmp0; 2727 sum4 -= v4[1] * tmp0; 2728 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2729 row--; 2730 sum4 -= v4[0] * tmp0; 2731 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2732 row--; 2733 break; 2734 case 5: 2735 sum1 = tmp[row]; 2736 sum2 = tmp[row - 1]; 2737 sum3 = tmp[row - 2]; 2738 sum4 = tmp[row - 3]; 2739 sum5 = tmp[row - 4]; 2740 v2 = aa + ad[row] + 1; 2741 v3 = aa + ad[row - 1] + 1; 2742 v4 = aa + ad[row - 2] + 1; 2743 v5 = aa + ad[row - 3] + 1; 2744 for (j = 0; j < nz - 1; j += 2) { 2745 i0 = vi[j]; 2746 i1 = vi[j + 1]; 2747 tmp0 = tmps[i0]; 2748 tmp1 = tmps[i1]; 2749 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2750 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2751 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2752 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2753 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2754 } 2755 if (j == nz - 1) { 2756 tmp0 = tmps[vi[j]]; 2757 sum1 -= v1[j] * tmp0; 2758 sum2 -= v2[j + 1] * tmp0; 2759 sum3 -= v3[j + 2] * tmp0; 2760 sum4 -= v4[j + 3] * tmp0; 2761 sum5 -= v5[j + 4] * tmp0; 2762 } 2763 2764 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2765 row--; 2766 sum2 -= v2[0] * tmp0; 2767 sum3 -= v3[1] * tmp0; 2768 sum4 -= v4[2] * tmp0; 2769 sum5 -= v5[3] * tmp0; 2770 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2771 row--; 2772 sum3 -= v3[0] * tmp0; 2773 sum4 -= v4[1] * tmp0; 2774 sum5 -= v5[2] * tmp0; 2775 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2776 row--; 2777 sum4 -= v4[0] * tmp0; 2778 sum5 -= v5[1] * tmp0; 2779 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2780 row--; 2781 sum5 -= v5[0] * tmp0; 2782 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2783 row--; 2784 break; 2785 default: 2786 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2787 } 2788 } 2789 PetscCall(ISRestoreIndices(isrow, &rout)); 2790 PetscCall(ISRestoreIndices(iscol, &cout)); 2791 PetscCall(VecRestoreArrayRead(bb, &b)); 2792 PetscCall(VecRestoreArrayWrite(xx, &x)); 2793 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2794 PetscFunctionReturn(PETSC_SUCCESS); 2795 } 2796 2797 /* 2798 Makes a longer coloring[] array and calls the usual code with that 2799 */ 2800 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2801 { 2802 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2803 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row; 2804 PetscInt *colorused, i; 2805 ISColoringValue *newcolor; 2806 2807 PetscFunctionBegin; 2808 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2809 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2810 /* loop over inodes, marking a color for each column*/ 2811 row = 0; 2812 for (i = 0; i < m; i++) { 2813 for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors; 2814 } 2815 2816 /* eliminate unneeded colors */ 2817 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2818 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2819 2820 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2821 ncolors = colorused[5 * ncolors - 1]; 2822 for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1; 2823 PetscCall(PetscFree(colorused)); 2824 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2825 PetscCall(PetscFree(coloring)); 2826 PetscFunctionReturn(PETSC_SUCCESS); 2827 } 2828 2829 #include <petsc/private/kernels/blockinvert.h> 2830 2831 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2832 { 2833 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2834 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2835 MatScalar *ibdiag, *bdiag, work[25], *t; 2836 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2837 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2838 const PetscScalar *xb, *b; 2839 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2840 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2; 2841 PetscInt sz, k, ipvt[5]; 2842 PetscBool allowzeropivot, zeropivotdetected; 2843 const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i; 2844 2845 PetscFunctionBegin; 2846 allowzeropivot = PetscNot(A->erroriffailure); 2847 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2848 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2849 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2850 2851 if (!a->inode.ibdiagvalid) { 2852 if (!a->inode.ibdiag) { 2853 /* calculate space needed for diagonal blocks */ 2854 for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i]; 2855 a->inode.bdiagsize = cnt; 2856 2857 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2858 } 2859 2860 /* copy over the diagonal blocks and invert them */ 2861 ibdiag = a->inode.ibdiag; 2862 bdiag = a->inode.bdiag; 2863 cnt = 0; 2864 for (i = 0, row = 0; i < m; i++) { 2865 for (j = 0; j < sizes[i]; j++) { 2866 for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k]; 2867 } 2868 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i])); 2869 2870 switch (sizes[i]) { 2871 case 1: 2872 /* Create matrix data structure */ 2873 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2874 if (allowzeropivot) { 2875 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2876 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2877 A->factorerror_zeropivot_row = row; 2878 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2879 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2880 } 2881 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2882 break; 2883 case 2: 2884 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2885 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2886 break; 2887 case 3: 2888 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2889 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2890 break; 2891 case 4: 2892 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2893 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2894 break; 2895 case 5: 2896 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2897 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2898 break; 2899 default: 2900 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 2901 } 2902 cnt += sizes[i] * sizes[i]; 2903 row += sizes[i]; 2904 } 2905 a->inode.ibdiagvalid = PETSC_TRUE; 2906 } 2907 ibdiag = a->inode.ibdiag; 2908 bdiag = a->inode.bdiag; 2909 t = a->inode.ssor_work; 2910 2911 PetscCall(VecGetArray(xx, &x)); 2912 PetscCall(VecGetArrayRead(bb, &b)); 2913 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2914 if (flag & SOR_ZERO_INITIAL_GUESS) { 2915 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2916 for (i = 0, row = 0; i < m; i++) { 2917 sz = diag[row] - ii[row]; 2918 v1 = a->a + ii[row]; 2919 idx = a->j + ii[row]; 2920 2921 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2922 switch (sizes[i]) { 2923 case 1: 2924 2925 sum1 = b[row]; 2926 for (n = 0; n < sz - 1; n += 2) { 2927 i1 = idx[0]; 2928 i2 = idx[1]; 2929 idx += 2; 2930 tmp0 = x[i1]; 2931 tmp1 = x[i2]; 2932 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2933 v1 += 2; 2934 } 2935 2936 if (n == sz - 1) { 2937 tmp0 = x[*idx]; 2938 sum1 -= *v1 * tmp0; 2939 } 2940 t[row] = sum1; 2941 x[row++] = sum1 * (*ibdiag++); 2942 break; 2943 case 2: 2944 v2 = a->a + ii[row + 1]; 2945 sum1 = b[row]; 2946 sum2 = b[row + 1]; 2947 for (n = 0; n < sz - 1; n += 2) { 2948 i1 = idx[0]; 2949 i2 = idx[1]; 2950 idx += 2; 2951 tmp0 = x[i1]; 2952 tmp1 = x[i2]; 2953 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2954 v1 += 2; 2955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2956 v2 += 2; 2957 } 2958 2959 if (n == sz - 1) { 2960 tmp0 = x[*idx]; 2961 sum1 -= v1[0] * tmp0; 2962 sum2 -= v2[0] * tmp0; 2963 } 2964 t[row] = sum1; 2965 t[row + 1] = sum2; 2966 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2967 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2968 ibdiag += 4; 2969 break; 2970 case 3: 2971 v2 = a->a + ii[row + 1]; 2972 v3 = a->a + ii[row + 2]; 2973 sum1 = b[row]; 2974 sum2 = b[row + 1]; 2975 sum3 = b[row + 2]; 2976 for (n = 0; n < sz - 1; n += 2) { 2977 i1 = idx[0]; 2978 i2 = idx[1]; 2979 idx += 2; 2980 tmp0 = x[i1]; 2981 tmp1 = x[i2]; 2982 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2983 v1 += 2; 2984 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2985 v2 += 2; 2986 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2987 v3 += 2; 2988 } 2989 2990 if (n == sz - 1) { 2991 tmp0 = x[*idx]; 2992 sum1 -= v1[0] * tmp0; 2993 sum2 -= v2[0] * tmp0; 2994 sum3 -= v3[0] * tmp0; 2995 } 2996 t[row] = sum1; 2997 t[row + 1] = sum2; 2998 t[row + 2] = sum3; 2999 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3000 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3001 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3002 ibdiag += 9; 3003 break; 3004 case 4: 3005 v2 = a->a + ii[row + 1]; 3006 v3 = a->a + ii[row + 2]; 3007 v4 = a->a + ii[row + 3]; 3008 sum1 = b[row]; 3009 sum2 = b[row + 1]; 3010 sum3 = b[row + 2]; 3011 sum4 = b[row + 3]; 3012 for (n = 0; n < sz - 1; n += 2) { 3013 i1 = idx[0]; 3014 i2 = idx[1]; 3015 idx += 2; 3016 tmp0 = x[i1]; 3017 tmp1 = x[i2]; 3018 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3019 v1 += 2; 3020 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3021 v2 += 2; 3022 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3023 v3 += 2; 3024 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3025 v4 += 2; 3026 } 3027 3028 if (n == sz - 1) { 3029 tmp0 = x[*idx]; 3030 sum1 -= v1[0] * tmp0; 3031 sum2 -= v2[0] * tmp0; 3032 sum3 -= v3[0] * tmp0; 3033 sum4 -= v4[0] * tmp0; 3034 } 3035 t[row] = sum1; 3036 t[row + 1] = sum2; 3037 t[row + 2] = sum3; 3038 t[row + 3] = sum4; 3039 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3040 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3041 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3042 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3043 ibdiag += 16; 3044 break; 3045 case 5: 3046 v2 = a->a + ii[row + 1]; 3047 v3 = a->a + ii[row + 2]; 3048 v4 = a->a + ii[row + 3]; 3049 v5 = a->a + ii[row + 4]; 3050 sum1 = b[row]; 3051 sum2 = b[row + 1]; 3052 sum3 = b[row + 2]; 3053 sum4 = b[row + 3]; 3054 sum5 = b[row + 4]; 3055 for (n = 0; n < sz - 1; n += 2) { 3056 i1 = idx[0]; 3057 i2 = idx[1]; 3058 idx += 2; 3059 tmp0 = x[i1]; 3060 tmp1 = x[i2]; 3061 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3062 v1 += 2; 3063 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3064 v2 += 2; 3065 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3066 v3 += 2; 3067 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3068 v4 += 2; 3069 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3070 v5 += 2; 3071 } 3072 3073 if (n == sz - 1) { 3074 tmp0 = x[*idx]; 3075 sum1 -= v1[0] * tmp0; 3076 sum2 -= v2[0] * tmp0; 3077 sum3 -= v3[0] * tmp0; 3078 sum4 -= v4[0] * tmp0; 3079 sum5 -= v5[0] * tmp0; 3080 } 3081 t[row] = sum1; 3082 t[row + 1] = sum2; 3083 t[row + 2] = sum3; 3084 t[row + 3] = sum4; 3085 t[row + 4] = sum5; 3086 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3087 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3088 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3089 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3090 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3091 ibdiag += 25; 3092 break; 3093 default: 3094 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3095 } 3096 } 3097 3098 xb = t; 3099 PetscCall(PetscLogFlops(a->nz)); 3100 } else xb = b; 3101 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3102 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3103 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3104 ibdiag -= sizes[i] * sizes[i]; 3105 sz = ii[row + 1] - diag[row] - 1; 3106 v1 = a->a + diag[row] + 1; 3107 idx = a->j + diag[row] + 1; 3108 3109 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3110 switch (sizes[i]) { 3111 case 1: 3112 3113 sum1 = xb[row]; 3114 for (n = 0; n < sz - 1; n += 2) { 3115 i1 = idx[0]; 3116 i2 = idx[1]; 3117 idx += 2; 3118 tmp0 = x[i1]; 3119 tmp1 = x[i2]; 3120 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3121 v1 += 2; 3122 } 3123 3124 if (n == sz - 1) { 3125 tmp0 = x[*idx]; 3126 sum1 -= *v1 * tmp0; 3127 } 3128 x[row--] = sum1 * (*ibdiag); 3129 break; 3130 3131 case 2: 3132 3133 sum1 = xb[row]; 3134 sum2 = xb[row - 1]; 3135 /* note that sum1 is associated with the second of the two rows */ 3136 v2 = a->a + diag[row - 1] + 2; 3137 for (n = 0; n < sz - 1; n += 2) { 3138 i1 = idx[0]; 3139 i2 = idx[1]; 3140 idx += 2; 3141 tmp0 = x[i1]; 3142 tmp1 = x[i2]; 3143 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3144 v1 += 2; 3145 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3146 v2 += 2; 3147 } 3148 3149 if (n == sz - 1) { 3150 tmp0 = x[*idx]; 3151 sum1 -= *v1 * tmp0; 3152 sum2 -= *v2 * tmp0; 3153 } 3154 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3155 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3156 break; 3157 case 3: 3158 3159 sum1 = xb[row]; 3160 sum2 = xb[row - 1]; 3161 sum3 = xb[row - 2]; 3162 v2 = a->a + diag[row - 1] + 2; 3163 v3 = a->a + diag[row - 2] + 3; 3164 for (n = 0; n < sz - 1; n += 2) { 3165 i1 = idx[0]; 3166 i2 = idx[1]; 3167 idx += 2; 3168 tmp0 = x[i1]; 3169 tmp1 = x[i2]; 3170 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3171 v1 += 2; 3172 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3173 v2 += 2; 3174 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3175 v3 += 2; 3176 } 3177 3178 if (n == sz - 1) { 3179 tmp0 = x[*idx]; 3180 sum1 -= *v1 * tmp0; 3181 sum2 -= *v2 * tmp0; 3182 sum3 -= *v3 * tmp0; 3183 } 3184 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3185 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3186 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3187 break; 3188 case 4: 3189 3190 sum1 = xb[row]; 3191 sum2 = xb[row - 1]; 3192 sum3 = xb[row - 2]; 3193 sum4 = xb[row - 3]; 3194 v2 = a->a + diag[row - 1] + 2; 3195 v3 = a->a + diag[row - 2] + 3; 3196 v4 = a->a + diag[row - 3] + 4; 3197 for (n = 0; n < sz - 1; n += 2) { 3198 i1 = idx[0]; 3199 i2 = idx[1]; 3200 idx += 2; 3201 tmp0 = x[i1]; 3202 tmp1 = x[i2]; 3203 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3204 v1 += 2; 3205 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3206 v2 += 2; 3207 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3208 v3 += 2; 3209 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3210 v4 += 2; 3211 } 3212 3213 if (n == sz - 1) { 3214 tmp0 = x[*idx]; 3215 sum1 -= *v1 * tmp0; 3216 sum2 -= *v2 * tmp0; 3217 sum3 -= *v3 * tmp0; 3218 sum4 -= *v4 * tmp0; 3219 } 3220 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3221 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3222 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3223 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3224 break; 3225 case 5: 3226 3227 sum1 = xb[row]; 3228 sum2 = xb[row - 1]; 3229 sum3 = xb[row - 2]; 3230 sum4 = xb[row - 3]; 3231 sum5 = xb[row - 4]; 3232 v2 = a->a + diag[row - 1] + 2; 3233 v3 = a->a + diag[row - 2] + 3; 3234 v4 = a->a + diag[row - 3] + 4; 3235 v5 = a->a + diag[row - 4] + 5; 3236 for (n = 0; n < sz - 1; n += 2) { 3237 i1 = idx[0]; 3238 i2 = idx[1]; 3239 idx += 2; 3240 tmp0 = x[i1]; 3241 tmp1 = x[i2]; 3242 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3243 v1 += 2; 3244 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3245 v2 += 2; 3246 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3247 v3 += 2; 3248 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3249 v4 += 2; 3250 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3251 v5 += 2; 3252 } 3253 3254 if (n == sz - 1) { 3255 tmp0 = x[*idx]; 3256 sum1 -= *v1 * tmp0; 3257 sum2 -= *v2 * tmp0; 3258 sum3 -= *v3 * tmp0; 3259 sum4 -= *v4 * tmp0; 3260 sum5 -= *v5 * tmp0; 3261 } 3262 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3263 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3264 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3265 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3266 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3267 break; 3268 default: 3269 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3270 } 3271 } 3272 3273 PetscCall(PetscLogFlops(a->nz)); 3274 } 3275 its--; 3276 } 3277 while (its--) { 3278 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3279 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) { 3280 sz = diag[row] - ii[row]; 3281 v1 = a->a + ii[row]; 3282 idx = a->j + ii[row]; 3283 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3284 switch (sizes[i]) { 3285 case 1: 3286 sum1 = b[row]; 3287 for (n = 0; n < sz - 1; n += 2) { 3288 i1 = idx[0]; 3289 i2 = idx[1]; 3290 idx += 2; 3291 tmp0 = x[i1]; 3292 tmp1 = x[i2]; 3293 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3294 v1 += 2; 3295 } 3296 if (n == sz - 1) { 3297 tmp0 = x[*idx++]; 3298 sum1 -= *v1 * tmp0; 3299 v1++; 3300 } 3301 t[row] = sum1; 3302 sz = ii[row + 1] - diag[row] - 1; 3303 idx = a->j + diag[row] + 1; 3304 v1 += 1; 3305 for (n = 0; n < sz - 1; n += 2) { 3306 i1 = idx[0]; 3307 i2 = idx[1]; 3308 idx += 2; 3309 tmp0 = x[i1]; 3310 tmp1 = x[i2]; 3311 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3312 v1 += 2; 3313 } 3314 if (n == sz - 1) { 3315 tmp0 = x[*idx++]; 3316 sum1 -= *v1 * tmp0; 3317 } 3318 /* in MatSOR_SeqAIJ this line would be 3319 * 3320 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3321 * 3322 * but omega == 1, so this becomes 3323 * 3324 * x[row] = sum1*(*ibdiag++); 3325 * 3326 */ 3327 x[row] = sum1 * (*ibdiag); 3328 break; 3329 case 2: 3330 v2 = a->a + ii[row + 1]; 3331 sum1 = b[row]; 3332 sum2 = b[row + 1]; 3333 for (n = 0; n < sz - 1; n += 2) { 3334 i1 = idx[0]; 3335 i2 = idx[1]; 3336 idx += 2; 3337 tmp0 = x[i1]; 3338 tmp1 = x[i2]; 3339 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3340 v1 += 2; 3341 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3342 v2 += 2; 3343 } 3344 if (n == sz - 1) { 3345 tmp0 = x[*idx++]; 3346 sum1 -= v1[0] * tmp0; 3347 sum2 -= v2[0] * tmp0; 3348 v1++; 3349 v2++; 3350 } 3351 t[row] = sum1; 3352 t[row + 1] = sum2; 3353 sz = ii[row + 1] - diag[row] - 2; 3354 idx = a->j + diag[row] + 2; 3355 v1 += 2; 3356 v2 += 2; 3357 for (n = 0; n < sz - 1; n += 2) { 3358 i1 = idx[0]; 3359 i2 = idx[1]; 3360 idx += 2; 3361 tmp0 = x[i1]; 3362 tmp1 = x[i2]; 3363 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3364 v1 += 2; 3365 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3366 v2 += 2; 3367 } 3368 if (n == sz - 1) { 3369 tmp0 = x[*idx]; 3370 sum1 -= v1[0] * tmp0; 3371 sum2 -= v2[0] * tmp0; 3372 } 3373 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3374 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3375 break; 3376 case 3: 3377 v2 = a->a + ii[row + 1]; 3378 v3 = a->a + ii[row + 2]; 3379 sum1 = b[row]; 3380 sum2 = b[row + 1]; 3381 sum3 = b[row + 2]; 3382 for (n = 0; n < sz - 1; n += 2) { 3383 i1 = idx[0]; 3384 i2 = idx[1]; 3385 idx += 2; 3386 tmp0 = x[i1]; 3387 tmp1 = x[i2]; 3388 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3389 v1 += 2; 3390 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3391 v2 += 2; 3392 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3393 v3 += 2; 3394 } 3395 if (n == sz - 1) { 3396 tmp0 = x[*idx++]; 3397 sum1 -= v1[0] * tmp0; 3398 sum2 -= v2[0] * tmp0; 3399 sum3 -= v3[0] * tmp0; 3400 v1++; 3401 v2++; 3402 v3++; 3403 } 3404 t[row] = sum1; 3405 t[row + 1] = sum2; 3406 t[row + 2] = sum3; 3407 sz = ii[row + 1] - diag[row] - 3; 3408 idx = a->j + diag[row] + 3; 3409 v1 += 3; 3410 v2 += 3; 3411 v3 += 3; 3412 for (n = 0; n < sz - 1; n += 2) { 3413 i1 = idx[0]; 3414 i2 = idx[1]; 3415 idx += 2; 3416 tmp0 = x[i1]; 3417 tmp1 = x[i2]; 3418 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3419 v1 += 2; 3420 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3421 v2 += 2; 3422 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3423 v3 += 2; 3424 } 3425 if (n == sz - 1) { 3426 tmp0 = x[*idx]; 3427 sum1 -= v1[0] * tmp0; 3428 sum2 -= v2[0] * tmp0; 3429 sum3 -= v3[0] * tmp0; 3430 } 3431 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3432 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3433 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3434 break; 3435 case 4: 3436 v2 = a->a + ii[row + 1]; 3437 v3 = a->a + ii[row + 2]; 3438 v4 = a->a + ii[row + 3]; 3439 sum1 = b[row]; 3440 sum2 = b[row + 1]; 3441 sum3 = b[row + 2]; 3442 sum4 = b[row + 3]; 3443 for (n = 0; n < sz - 1; n += 2) { 3444 i1 = idx[0]; 3445 i2 = idx[1]; 3446 idx += 2; 3447 tmp0 = x[i1]; 3448 tmp1 = x[i2]; 3449 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3450 v1 += 2; 3451 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3452 v2 += 2; 3453 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3454 v3 += 2; 3455 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3456 v4 += 2; 3457 } 3458 if (n == sz - 1) { 3459 tmp0 = x[*idx++]; 3460 sum1 -= v1[0] * tmp0; 3461 sum2 -= v2[0] * tmp0; 3462 sum3 -= v3[0] * tmp0; 3463 sum4 -= v4[0] * tmp0; 3464 v1++; 3465 v2++; 3466 v3++; 3467 v4++; 3468 } 3469 t[row] = sum1; 3470 t[row + 1] = sum2; 3471 t[row + 2] = sum3; 3472 t[row + 3] = sum4; 3473 sz = ii[row + 1] - diag[row] - 4; 3474 idx = a->j + diag[row] + 4; 3475 v1 += 4; 3476 v2 += 4; 3477 v3 += 4; 3478 v4 += 4; 3479 for (n = 0; n < sz - 1; n += 2) { 3480 i1 = idx[0]; 3481 i2 = idx[1]; 3482 idx += 2; 3483 tmp0 = x[i1]; 3484 tmp1 = x[i2]; 3485 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3486 v1 += 2; 3487 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3488 v2 += 2; 3489 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3490 v3 += 2; 3491 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3492 v4 += 2; 3493 } 3494 if (n == sz - 1) { 3495 tmp0 = x[*idx]; 3496 sum1 -= v1[0] * tmp0; 3497 sum2 -= v2[0] * tmp0; 3498 sum3 -= v3[0] * tmp0; 3499 sum4 -= v4[0] * tmp0; 3500 } 3501 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3502 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3503 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3504 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3505 break; 3506 case 5: 3507 v2 = a->a + ii[row + 1]; 3508 v3 = a->a + ii[row + 2]; 3509 v4 = a->a + ii[row + 3]; 3510 v5 = a->a + ii[row + 4]; 3511 sum1 = b[row]; 3512 sum2 = b[row + 1]; 3513 sum3 = b[row + 2]; 3514 sum4 = b[row + 3]; 3515 sum5 = b[row + 4]; 3516 for (n = 0; n < sz - 1; n += 2) { 3517 i1 = idx[0]; 3518 i2 = idx[1]; 3519 idx += 2; 3520 tmp0 = x[i1]; 3521 tmp1 = x[i2]; 3522 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3523 v1 += 2; 3524 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3525 v2 += 2; 3526 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3527 v3 += 2; 3528 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3529 v4 += 2; 3530 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3531 v5 += 2; 3532 } 3533 if (n == sz - 1) { 3534 tmp0 = x[*idx++]; 3535 sum1 -= v1[0] * tmp0; 3536 sum2 -= v2[0] * tmp0; 3537 sum3 -= v3[0] * tmp0; 3538 sum4 -= v4[0] * tmp0; 3539 sum5 -= v5[0] * tmp0; 3540 v1++; 3541 v2++; 3542 v3++; 3543 v4++; 3544 v5++; 3545 } 3546 t[row] = sum1; 3547 t[row + 1] = sum2; 3548 t[row + 2] = sum3; 3549 t[row + 3] = sum4; 3550 t[row + 4] = sum5; 3551 sz = ii[row + 1] - diag[row] - 5; 3552 idx = a->j + diag[row] + 5; 3553 v1 += 5; 3554 v2 += 5; 3555 v3 += 5; 3556 v4 += 5; 3557 v5 += 5; 3558 for (n = 0; n < sz - 1; n += 2) { 3559 i1 = idx[0]; 3560 i2 = idx[1]; 3561 idx += 2; 3562 tmp0 = x[i1]; 3563 tmp1 = x[i2]; 3564 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3565 v1 += 2; 3566 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3567 v2 += 2; 3568 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3569 v3 += 2; 3570 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3571 v4 += 2; 3572 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3573 v5 += 2; 3574 } 3575 if (n == sz - 1) { 3576 tmp0 = x[*idx]; 3577 sum1 -= v1[0] * tmp0; 3578 sum2 -= v2[0] * tmp0; 3579 sum3 -= v3[0] * tmp0; 3580 sum4 -= v4[0] * tmp0; 3581 sum5 -= v5[0] * tmp0; 3582 } 3583 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3584 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3585 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3586 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3587 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3588 break; 3589 default: 3590 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3591 } 3592 } 3593 xb = t; 3594 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3595 } else xb = b; 3596 3597 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3598 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3599 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3600 ibdiag -= sizes[i] * sizes[i]; 3601 3602 /* set RHS */ 3603 if (xb == b) { 3604 /* whole (old way) */ 3605 sz = ii[row + 1] - ii[row]; 3606 idx = a->j + ii[row]; 3607 switch (sizes[i]) { 3608 case 5: 3609 v5 = a->a + ii[row - 4]; /* fall through */ 3610 case 4: 3611 v4 = a->a + ii[row - 3]; /* fall through */ 3612 case 3: 3613 v3 = a->a + ii[row - 2]; /* fall through */ 3614 case 2: 3615 v2 = a->a + ii[row - 1]; /* fall through */ 3616 case 1: 3617 v1 = a->a + ii[row]; 3618 break; 3619 default: 3620 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3621 } 3622 } else { 3623 /* upper, no diag */ 3624 sz = ii[row + 1] - diag[row] - 1; 3625 idx = a->j + diag[row] + 1; 3626 switch (sizes[i]) { 3627 case 5: 3628 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3629 case 4: 3630 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3631 case 3: 3632 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3633 case 2: 3634 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3635 case 1: 3636 v1 = a->a + diag[row] + 1; 3637 } 3638 } 3639 /* set sum */ 3640 switch (sizes[i]) { 3641 case 5: 3642 sum5 = xb[row - 4]; /* fall through */ 3643 case 4: 3644 sum4 = xb[row - 3]; /* fall through */ 3645 case 3: 3646 sum3 = xb[row - 2]; /* fall through */ 3647 case 2: 3648 sum2 = xb[row - 1]; /* fall through */ 3649 case 1: 3650 /* note that sum1 is associated with the last row */ 3651 sum1 = xb[row]; 3652 } 3653 /* do sums */ 3654 for (n = 0; n < sz - 1; n += 2) { 3655 i1 = idx[0]; 3656 i2 = idx[1]; 3657 idx += 2; 3658 tmp0 = x[i1]; 3659 tmp1 = x[i2]; 3660 switch (sizes[i]) { 3661 case 5: 3662 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3663 v5 += 2; /* fall through */ 3664 case 4: 3665 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3666 v4 += 2; /* fall through */ 3667 case 3: 3668 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3669 v3 += 2; /* fall through */ 3670 case 2: 3671 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3672 v2 += 2; /* fall through */ 3673 case 1: 3674 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3675 v1 += 2; 3676 } 3677 } 3678 /* ragged edge */ 3679 if (n == sz - 1) { 3680 tmp0 = x[*idx]; 3681 switch (sizes[i]) { 3682 case 5: 3683 sum5 -= *v5 * tmp0; /* fall through */ 3684 case 4: 3685 sum4 -= *v4 * tmp0; /* fall through */ 3686 case 3: 3687 sum3 -= *v3 * tmp0; /* fall through */ 3688 case 2: 3689 sum2 -= *v2 * tmp0; /* fall through */ 3690 case 1: 3691 sum1 -= *v1 * tmp0; 3692 } 3693 } 3694 /* update */ 3695 if (xb == b) { 3696 /* whole (old way) w/ diag */ 3697 switch (sizes[i]) { 3698 case 5: 3699 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3700 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3701 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3702 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3703 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3704 break; 3705 case 4: 3706 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3707 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3708 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3709 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3710 break; 3711 case 3: 3712 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3713 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3714 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3715 break; 3716 case 2: 3717 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3718 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3719 break; 3720 case 1: 3721 x[row--] += sum1 * (*ibdiag); 3722 break; 3723 } 3724 } else { 3725 /* no diag so set = */ 3726 switch (sizes[i]) { 3727 case 5: 3728 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3729 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3730 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3731 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3732 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3733 break; 3734 case 4: 3735 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3736 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3737 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3738 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3739 break; 3740 case 3: 3741 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3742 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3743 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3744 break; 3745 case 2: 3746 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3747 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3748 break; 3749 case 1: 3750 x[row--] = sum1 * (*ibdiag); 3751 break; 3752 } 3753 } 3754 } 3755 if (xb == b) { 3756 PetscCall(PetscLogFlops(2.0 * a->nz)); 3757 } else { 3758 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3759 } 3760 } 3761 } 3762 if (flag & SOR_EISENSTAT) { 3763 /* 3764 Apply (U + D)^-1 where D is now the block diagonal 3765 */ 3766 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3767 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3768 ibdiag -= sizes[i] * sizes[i]; 3769 sz = ii[row + 1] - diag[row] - 1; 3770 v1 = a->a + diag[row] + 1; 3771 idx = a->j + diag[row] + 1; 3772 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3773 switch (sizes[i]) { 3774 case 1: 3775 3776 sum1 = b[row]; 3777 for (n = 0; n < sz - 1; n += 2) { 3778 i1 = idx[0]; 3779 i2 = idx[1]; 3780 idx += 2; 3781 tmp0 = x[i1]; 3782 tmp1 = x[i2]; 3783 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3784 v1 += 2; 3785 } 3786 3787 if (n == sz - 1) { 3788 tmp0 = x[*idx]; 3789 sum1 -= *v1 * tmp0; 3790 } 3791 x[row] = sum1 * (*ibdiag); 3792 row--; 3793 break; 3794 3795 case 2: 3796 3797 sum1 = b[row]; 3798 sum2 = b[row - 1]; 3799 /* note that sum1 is associated with the second of the two rows */ 3800 v2 = a->a + diag[row - 1] + 2; 3801 for (n = 0; n < sz - 1; n += 2) { 3802 i1 = idx[0]; 3803 i2 = idx[1]; 3804 idx += 2; 3805 tmp0 = x[i1]; 3806 tmp1 = x[i2]; 3807 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3808 v1 += 2; 3809 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3810 v2 += 2; 3811 } 3812 3813 if (n == sz - 1) { 3814 tmp0 = x[*idx]; 3815 sum1 -= *v1 * tmp0; 3816 sum2 -= *v2 * tmp0; 3817 } 3818 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3819 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3820 row -= 2; 3821 break; 3822 case 3: 3823 3824 sum1 = b[row]; 3825 sum2 = b[row - 1]; 3826 sum3 = b[row - 2]; 3827 v2 = a->a + diag[row - 1] + 2; 3828 v3 = a->a + diag[row - 2] + 3; 3829 for (n = 0; n < sz - 1; n += 2) { 3830 i1 = idx[0]; 3831 i2 = idx[1]; 3832 idx += 2; 3833 tmp0 = x[i1]; 3834 tmp1 = x[i2]; 3835 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3836 v1 += 2; 3837 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3838 v2 += 2; 3839 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3840 v3 += 2; 3841 } 3842 3843 if (n == sz - 1) { 3844 tmp0 = x[*idx]; 3845 sum1 -= *v1 * tmp0; 3846 sum2 -= *v2 * tmp0; 3847 sum3 -= *v3 * tmp0; 3848 } 3849 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3850 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3851 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3852 row -= 3; 3853 break; 3854 case 4: 3855 3856 sum1 = b[row]; 3857 sum2 = b[row - 1]; 3858 sum3 = b[row - 2]; 3859 sum4 = b[row - 3]; 3860 v2 = a->a + diag[row - 1] + 2; 3861 v3 = a->a + diag[row - 2] + 3; 3862 v4 = a->a + diag[row - 3] + 4; 3863 for (n = 0; n < sz - 1; n += 2) { 3864 i1 = idx[0]; 3865 i2 = idx[1]; 3866 idx += 2; 3867 tmp0 = x[i1]; 3868 tmp1 = x[i2]; 3869 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3870 v1 += 2; 3871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3872 v2 += 2; 3873 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3874 v3 += 2; 3875 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3876 v4 += 2; 3877 } 3878 3879 if (n == sz - 1) { 3880 tmp0 = x[*idx]; 3881 sum1 -= *v1 * tmp0; 3882 sum2 -= *v2 * tmp0; 3883 sum3 -= *v3 * tmp0; 3884 sum4 -= *v4 * tmp0; 3885 } 3886 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3887 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3888 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3889 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3890 row -= 4; 3891 break; 3892 case 5: 3893 3894 sum1 = b[row]; 3895 sum2 = b[row - 1]; 3896 sum3 = b[row - 2]; 3897 sum4 = b[row - 3]; 3898 sum5 = b[row - 4]; 3899 v2 = a->a + diag[row - 1] + 2; 3900 v3 = a->a + diag[row - 2] + 3; 3901 v4 = a->a + diag[row - 3] + 4; 3902 v5 = a->a + diag[row - 4] + 5; 3903 for (n = 0; n < sz - 1; n += 2) { 3904 i1 = idx[0]; 3905 i2 = idx[1]; 3906 idx += 2; 3907 tmp0 = x[i1]; 3908 tmp1 = x[i2]; 3909 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3910 v1 += 2; 3911 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3912 v2 += 2; 3913 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3914 v3 += 2; 3915 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3916 v4 += 2; 3917 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3918 v5 += 2; 3919 } 3920 3921 if (n == sz - 1) { 3922 tmp0 = x[*idx]; 3923 sum1 -= *v1 * tmp0; 3924 sum2 -= *v2 * tmp0; 3925 sum3 -= *v3 * tmp0; 3926 sum4 -= *v4 * tmp0; 3927 sum5 -= *v5 * tmp0; 3928 } 3929 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3930 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3931 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3932 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3933 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3934 row -= 5; 3935 break; 3936 default: 3937 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3938 } 3939 } 3940 PetscCall(PetscLogFlops(a->nz)); 3941 3942 /* 3943 t = b - D x where D is the block diagonal 3944 */ 3945 cnt = 0; 3946 for (i = 0, row = 0; i < m; i++) { 3947 switch (sizes[i]) { 3948 case 1: 3949 t[row] = b[row] - bdiag[cnt++] * x[row]; 3950 row++; 3951 break; 3952 case 2: 3953 x1 = x[row]; 3954 x2 = x[row + 1]; 3955 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3956 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3957 t[row] = b[row] - tmp1; 3958 t[row + 1] = b[row + 1] - tmp2; 3959 row += 2; 3960 cnt += 4; 3961 break; 3962 case 3: 3963 x1 = x[row]; 3964 x2 = x[row + 1]; 3965 x3 = x[row + 2]; 3966 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3967 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3968 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3969 t[row] = b[row] - tmp1; 3970 t[row + 1] = b[row + 1] - tmp2; 3971 t[row + 2] = b[row + 2] - tmp3; 3972 row += 3; 3973 cnt += 9; 3974 break; 3975 case 4: 3976 x1 = x[row]; 3977 x2 = x[row + 1]; 3978 x3 = x[row + 2]; 3979 x4 = x[row + 3]; 3980 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3981 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3982 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3983 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3984 t[row] = b[row] - tmp1; 3985 t[row + 1] = b[row + 1] - tmp2; 3986 t[row + 2] = b[row + 2] - tmp3; 3987 t[row + 3] = b[row + 3] - tmp4; 3988 row += 4; 3989 cnt += 16; 3990 break; 3991 case 5: 3992 x1 = x[row]; 3993 x2 = x[row + 1]; 3994 x3 = x[row + 2]; 3995 x4 = x[row + 3]; 3996 x5 = x[row + 4]; 3997 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 3998 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 3999 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4000 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4001 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4002 t[row] = b[row] - tmp1; 4003 t[row + 1] = b[row + 1] - tmp2; 4004 t[row + 2] = b[row + 2] - tmp3; 4005 t[row + 3] = b[row + 3] - tmp4; 4006 t[row + 4] = b[row + 4] - tmp5; 4007 row += 5; 4008 cnt += 25; 4009 break; 4010 default: 4011 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4012 } 4013 } 4014 PetscCall(PetscLogFlops(m)); 4015 4016 /* 4017 Apply (L + D)^-1 where D is the block diagonal 4018 */ 4019 for (i = 0, row = 0; i < m; i++) { 4020 sz = diag[row] - ii[row]; 4021 v1 = a->a + ii[row]; 4022 idx = a->j + ii[row]; 4023 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4024 switch (sizes[i]) { 4025 case 1: 4026 4027 sum1 = t[row]; 4028 for (n = 0; n < sz - 1; n += 2) { 4029 i1 = idx[0]; 4030 i2 = idx[1]; 4031 idx += 2; 4032 tmp0 = t[i1]; 4033 tmp1 = t[i2]; 4034 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4035 v1 += 2; 4036 } 4037 4038 if (n == sz - 1) { 4039 tmp0 = t[*idx]; 4040 sum1 -= *v1 * tmp0; 4041 } 4042 x[row] += t[row] = sum1 * (*ibdiag++); 4043 row++; 4044 break; 4045 case 2: 4046 v2 = a->a + ii[row + 1]; 4047 sum1 = t[row]; 4048 sum2 = t[row + 1]; 4049 for (n = 0; n < sz - 1; n += 2) { 4050 i1 = idx[0]; 4051 i2 = idx[1]; 4052 idx += 2; 4053 tmp0 = t[i1]; 4054 tmp1 = t[i2]; 4055 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4056 v1 += 2; 4057 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4058 v2 += 2; 4059 } 4060 4061 if (n == sz - 1) { 4062 tmp0 = t[*idx]; 4063 sum1 -= v1[0] * tmp0; 4064 sum2 -= v2[0] * tmp0; 4065 } 4066 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4067 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4068 ibdiag += 4; 4069 row += 2; 4070 break; 4071 case 3: 4072 v2 = a->a + ii[row + 1]; 4073 v3 = a->a + ii[row + 2]; 4074 sum1 = t[row]; 4075 sum2 = t[row + 1]; 4076 sum3 = t[row + 2]; 4077 for (n = 0; n < sz - 1; n += 2) { 4078 i1 = idx[0]; 4079 i2 = idx[1]; 4080 idx += 2; 4081 tmp0 = t[i1]; 4082 tmp1 = t[i2]; 4083 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4084 v1 += 2; 4085 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4086 v2 += 2; 4087 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4088 v3 += 2; 4089 } 4090 4091 if (n == sz - 1) { 4092 tmp0 = t[*idx]; 4093 sum1 -= v1[0] * tmp0; 4094 sum2 -= v2[0] * tmp0; 4095 sum3 -= v3[0] * tmp0; 4096 } 4097 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4098 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4099 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4100 ibdiag += 9; 4101 row += 3; 4102 break; 4103 case 4: 4104 v2 = a->a + ii[row + 1]; 4105 v3 = a->a + ii[row + 2]; 4106 v4 = a->a + ii[row + 3]; 4107 sum1 = t[row]; 4108 sum2 = t[row + 1]; 4109 sum3 = t[row + 2]; 4110 sum4 = t[row + 3]; 4111 for (n = 0; n < sz - 1; n += 2) { 4112 i1 = idx[0]; 4113 i2 = idx[1]; 4114 idx += 2; 4115 tmp0 = t[i1]; 4116 tmp1 = t[i2]; 4117 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4118 v1 += 2; 4119 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4120 v2 += 2; 4121 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4122 v3 += 2; 4123 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4124 v4 += 2; 4125 } 4126 4127 if (n == sz - 1) { 4128 tmp0 = t[*idx]; 4129 sum1 -= v1[0] * tmp0; 4130 sum2 -= v2[0] * tmp0; 4131 sum3 -= v3[0] * tmp0; 4132 sum4 -= v4[0] * tmp0; 4133 } 4134 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4135 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4136 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4137 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4138 ibdiag += 16; 4139 row += 4; 4140 break; 4141 case 5: 4142 v2 = a->a + ii[row + 1]; 4143 v3 = a->a + ii[row + 2]; 4144 v4 = a->a + ii[row + 3]; 4145 v5 = a->a + ii[row + 4]; 4146 sum1 = t[row]; 4147 sum2 = t[row + 1]; 4148 sum3 = t[row + 2]; 4149 sum4 = t[row + 3]; 4150 sum5 = t[row + 4]; 4151 for (n = 0; n < sz - 1; n += 2) { 4152 i1 = idx[0]; 4153 i2 = idx[1]; 4154 idx += 2; 4155 tmp0 = t[i1]; 4156 tmp1 = t[i2]; 4157 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4158 v1 += 2; 4159 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4160 v2 += 2; 4161 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4162 v3 += 2; 4163 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4164 v4 += 2; 4165 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4166 v5 += 2; 4167 } 4168 4169 if (n == sz - 1) { 4170 tmp0 = t[*idx]; 4171 sum1 -= v1[0] * tmp0; 4172 sum2 -= v2[0] * tmp0; 4173 sum3 -= v3[0] * tmp0; 4174 sum4 -= v4[0] * tmp0; 4175 sum5 -= v5[0] * tmp0; 4176 } 4177 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4178 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4179 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4180 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4181 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4182 ibdiag += 25; 4183 row += 5; 4184 break; 4185 default: 4186 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4187 } 4188 } 4189 PetscCall(PetscLogFlops(a->nz)); 4190 } 4191 PetscCall(VecRestoreArray(xx, &x)); 4192 PetscCall(VecRestoreArrayRead(bb, &b)); 4193 PetscFunctionReturn(PETSC_SUCCESS); 4194 } 4195 4196 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4197 { 4198 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4199 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4200 const MatScalar *bdiag = a->inode.bdiag; 4201 const PetscScalar *b; 4202 PetscInt m = a->inode.node_count, cnt = 0, i, row; 4203 const PetscInt *sizes = a->inode.size; 4204 4205 PetscFunctionBegin; 4206 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4207 PetscCall(VecGetArray(xx, &x)); 4208 PetscCall(VecGetArrayRead(bb, &b)); 4209 cnt = 0; 4210 for (i = 0, row = 0; i < m; i++) { 4211 switch (sizes[i]) { 4212 case 1: 4213 x[row] = b[row] * bdiag[cnt++]; 4214 row++; 4215 break; 4216 case 2: 4217 x1 = b[row]; 4218 x2 = b[row + 1]; 4219 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4220 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4221 x[row++] = tmp1; 4222 x[row++] = tmp2; 4223 cnt += 4; 4224 break; 4225 case 3: 4226 x1 = b[row]; 4227 x2 = b[row + 1]; 4228 x3 = b[row + 2]; 4229 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4230 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4231 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4232 x[row++] = tmp1; 4233 x[row++] = tmp2; 4234 x[row++] = tmp3; 4235 cnt += 9; 4236 break; 4237 case 4: 4238 x1 = b[row]; 4239 x2 = b[row + 1]; 4240 x3 = b[row + 2]; 4241 x4 = b[row + 3]; 4242 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4243 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4244 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4245 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4246 x[row++] = tmp1; 4247 x[row++] = tmp2; 4248 x[row++] = tmp3; 4249 x[row++] = tmp4; 4250 cnt += 16; 4251 break; 4252 case 5: 4253 x1 = b[row]; 4254 x2 = b[row + 1]; 4255 x3 = b[row + 2]; 4256 x4 = b[row + 3]; 4257 x5 = b[row + 4]; 4258 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4259 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4260 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4261 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4262 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4263 x[row++] = tmp1; 4264 x[row++] = tmp2; 4265 x[row++] = tmp3; 4266 x[row++] = tmp4; 4267 x[row++] = tmp5; 4268 cnt += 25; 4269 break; 4270 default: 4271 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4272 } 4273 } 4274 PetscCall(PetscLogFlops(2.0 * cnt)); 4275 PetscCall(VecRestoreArray(xx, &x)); 4276 PetscCall(VecRestoreArrayRead(bb, &b)); 4277 PetscFunctionReturn(PETSC_SUCCESS); 4278 } 4279 4280 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4281 { 4282 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4283 4284 PetscFunctionBegin; 4285 a->inode.node_count = 0; 4286 a->inode.use = PETSC_FALSE; 4287 a->inode.checked = PETSC_FALSE; 4288 a->inode.mat_nonzerostate = -1; 4289 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4290 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4291 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4292 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4293 A->ops->coloringpatch = NULL; 4294 A->ops->multdiagonalblock = NULL; 4295 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4296 PetscFunctionReturn(PETSC_SUCCESS); 4297 } 4298 4299 /* 4300 samestructure indicates that the matrix has not changed its nonzero structure so we 4301 do not need to recompute the inodes 4302 */ 4303 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4304 { 4305 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4306 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4307 PetscBool flag; 4308 const PetscInt *idx, *idy, *ii; 4309 4310 PetscFunctionBegin; 4311 if (!a->inode.use) { 4312 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4313 PetscCall(PetscFree(a->inode.size)); 4314 PetscFunctionReturn(PETSC_SUCCESS); 4315 } 4316 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 4317 4318 m = A->rmap->n; 4319 if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size)); 4320 ns = a->inode.size; 4321 4322 i = 0; 4323 node_count = 0; 4324 idx = a->j; 4325 ii = a->i; 4326 while (i < m) { /* For each row */ 4327 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4328 /* Limits the number of elements in a node to 'a->inode.limit' */ 4329 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4330 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4331 if (nzy != nzx) break; 4332 idy += nzx; /* Same nonzero pattern */ 4333 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4334 if (!flag) break; 4335 } 4336 ns[node_count++] = blk_size; 4337 idx += blk_size * nzx; 4338 i = j; 4339 } 4340 4341 /* If not enough inodes found,, do not use inode version of the routines */ 4342 if (!m || node_count > .8 * m) { 4343 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4344 PetscCall(PetscFree(a->inode.size)); 4345 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4346 } else { 4347 if (!A->factortype) { 4348 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4349 if (A->rmap->n == A->cmap->n) { 4350 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4351 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4352 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4353 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4354 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4355 } 4356 } else { 4357 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4358 } 4359 a->inode.node_count = node_count; 4360 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4361 } 4362 a->inode.checked = PETSC_TRUE; 4363 a->inode.mat_nonzerostate = A->nonzerostate; 4364 PetscFunctionReturn(PETSC_SUCCESS); 4365 } 4366 4367 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4368 { 4369 Mat B = *C; 4370 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4371 PetscInt m = A->rmap->n; 4372 4373 PetscFunctionBegin; 4374 c->inode.use = a->inode.use; 4375 c->inode.limit = a->inode.limit; 4376 c->inode.max_limit = a->inode.max_limit; 4377 c->inode.checked = PETSC_FALSE; 4378 c->inode.size = NULL; 4379 c->inode.node_count = 0; 4380 c->inode.ibdiagvalid = PETSC_FALSE; 4381 c->inode.ibdiag = NULL; 4382 c->inode.bdiag = NULL; 4383 c->inode.mat_nonzerostate = -1; 4384 if (a->inode.use) { 4385 if (a->inode.checked && a->inode.size) { 4386 PetscCall(PetscMalloc1(m + 1, &c->inode.size)); 4387 PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1)); 4388 4389 c->inode.checked = PETSC_TRUE; 4390 c->inode.node_count = a->inode.node_count; 4391 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4392 } 4393 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4394 if (!B->factortype) { 4395 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4396 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4397 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4398 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4399 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4400 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4401 } else { 4402 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4403 } 4404 } 4405 PetscFunctionReturn(PETSC_SUCCESS); 4406 } 4407 4408 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4409 { 4410 PetscInt k; 4411 const PetscInt *vi; 4412 4413 PetscFunctionBegin; 4414 vi = aj + ai[row]; 4415 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4416 vi = aj + adiag[row]; 4417 cols[nzl] = vi[0]; 4418 vi = aj + adiag[row + 1] + 1; 4419 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4420 PetscFunctionReturn(PETSC_SUCCESS); 4421 } 4422 /* 4423 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4424 Modified from MatSeqAIJCheckInode(). 4425 4426 Input Parameters: 4427 . Mat A - ILU or LU matrix factor 4428 4429 */ 4430 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4431 { 4432 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4433 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4434 PetscInt *cols1, *cols2, *ns; 4435 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4436 PetscBool flag; 4437 4438 PetscFunctionBegin; 4439 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4440 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4441 4442 m = A->rmap->n; 4443 if (a->inode.size) ns = a->inode.size; 4444 else PetscCall(PetscMalloc1(m + 1, &ns)); 4445 4446 i = 0; 4447 node_count = 0; 4448 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4449 while (i < m) { /* For each row */ 4450 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4451 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4452 nzx = nzl1 + nzu1 + 1; 4453 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4454 4455 /* Limits the number of elements in a node to 'a->inode.limit' */ 4456 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4457 nzl2 = ai[j + 1] - ai[j]; 4458 nzu2 = adiag[j] - adiag[j + 1] - 1; 4459 nzy = nzl2 + nzu2 + 1; 4460 if (nzy != nzx) break; 4461 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4462 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4463 if (!flag) break; 4464 } 4465 ns[node_count++] = blk_size; 4466 i = j; 4467 } 4468 PetscCall(PetscFree2(cols1, cols2)); 4469 /* If not enough inodes found,, do not use inode version of the routines */ 4470 if (!m || node_count > .8 * m) { 4471 PetscCall(PetscFree(ns)); 4472 4473 a->inode.node_count = 0; 4474 a->inode.size = NULL; 4475 a->inode.use = PETSC_FALSE; 4476 4477 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4478 } else { 4479 A->ops->mult = NULL; 4480 A->ops->sor = NULL; 4481 A->ops->multadd = NULL; 4482 A->ops->getrowij = NULL; 4483 A->ops->restorerowij = NULL; 4484 A->ops->getcolumnij = NULL; 4485 A->ops->restorecolumnij = NULL; 4486 A->ops->coloringpatch = NULL; 4487 A->ops->multdiagonalblock = NULL; 4488 a->inode.node_count = node_count; 4489 a->inode.size = ns; 4490 4491 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4492 } 4493 a->inode.checked = PETSC_TRUE; 4494 PetscFunctionReturn(PETSC_SUCCESS); 4495 } 4496 4497 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4498 { 4499 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4500 4501 PetscFunctionBegin; 4502 a->inode.ibdiagvalid = PETSC_FALSE; 4503 PetscFunctionReturn(PETSC_SUCCESS); 4504 } 4505 4506 /* 4507 This is really ugly. if inodes are used this replaces the 4508 permutations with ones that correspond to rows/cols of the matrix 4509 rather then inode blocks 4510 */ 4511 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4512 { 4513 PetscFunctionBegin; 4514 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4515 PetscFunctionReturn(PETSC_SUCCESS); 4516 } 4517 4518 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4519 { 4520 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4521 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4522 const PetscInt *ridx, *cidx; 4523 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx; 4524 PetscInt nslim_col, *ns_col; 4525 IS ris = *rperm, cis = *cperm; 4526 4527 PetscFunctionBegin; 4528 if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4529 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4530 4531 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4532 PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns)); 4533 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4534 4535 PetscCall(ISGetIndices(ris, &ridx)); 4536 PetscCall(ISGetIndices(cis, &cidx)); 4537 4538 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4539 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i]; 4540 4541 /* Construct the permutations for rows*/ 4542 for (i = 0, row = 0; i < nslim_row; ++i) { 4543 indx = ridx[i]; 4544 start_val = tns[indx]; 4545 end_val = tns[indx + 1]; 4546 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4547 } 4548 4549 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4550 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i]; 4551 4552 /* Construct permutations for columns */ 4553 for (i = 0, col = 0; i < nslim_col; ++i) { 4554 indx = cidx[i]; 4555 start_val = tns[indx]; 4556 end_val = tns[indx + 1]; 4557 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4558 } 4559 4560 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4561 PetscCall(ISSetPermutation(*rperm)); 4562 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4563 PetscCall(ISSetPermutation(*cperm)); 4564 4565 PetscCall(ISRestoreIndices(ris, &ridx)); 4566 PetscCall(ISRestoreIndices(cis, &cidx)); 4567 4568 PetscCall(PetscFree(ns_col)); 4569 PetscCall(PetscFree2(permr, permc)); 4570 PetscCall(ISDestroy(&cis)); 4571 PetscCall(ISDestroy(&ris)); 4572 PetscCall(PetscFree(tns)); 4573 PetscFunctionReturn(PETSC_SUCCESS); 4574 } 4575 4576 /*@C 4577 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4578 4579 Not Collective 4580 4581 Input Parameter: 4582 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4583 4584 Output Parameters: 4585 + node_count - no of inodes present in the matrix. 4586 . sizes - an array of size node_count,with sizes of each inode. 4587 - limit - the max size used to generate the inodes. 4588 4589 Level: advanced 4590 4591 Note: 4592 This routine returns some internal storage information 4593 of the matrix, it is intended to be used by advanced users. 4594 It should be called after the matrix is assembled. 4595 The contents of the sizes[] array should not be changed. 4596 NULL may be passed for information not requested. 4597 4598 .seealso: `MatGetInfo()` 4599 @*/ 4600 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4601 { 4602 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4603 4604 PetscFunctionBegin; 4605 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4606 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4607 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4608 PetscFunctionReturn(PETSC_SUCCESS); 4609 } 4610 4611 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4612 { 4613 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4614 4615 PetscFunctionBegin; 4616 if (node_count) *node_count = a->inode.node_count; 4617 if (sizes) *sizes = a->inode.size; 4618 if (limit) *limit = a->inode.limit; 4619 PetscFunctionReturn(PETSC_SUCCESS); 4620 } 4621