1 /* 2 This file provides high performance routines for the Inode format (compressed sparse row) 3 by taking advantage of rows with identical nonzero structure (I-nodes). 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #if defined(PETSC_HAVE_XMMINTRIN_H) 7 #include <xmmintrin.h> 8 #endif 9 10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 13 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 14 15 PetscFunctionBegin; 16 n = A->cmap->n; 17 m = A->rmap->n; 18 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++); 24 for (; count + 1 < n; count++, i++); 25 if (count < n) i++; 26 *size = i; 27 PetscFunctionReturn(PETSC_SUCCESS); 28 } 29 PetscCall(PetscMalloc1(n + 1, &ns_col)); 30 31 /* Use the same row structure wherever feasible. */ 32 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i]; 33 34 /* if m < n; pad up the remainder with inode_limit */ 35 for (; count + 1 < n; count++, i++) ns_col[i] = 1; 36 /* The last node is the odd ball. pad it up with the remaining rows; */ 37 if (count < n) { 38 ns_col[i] = n - count; 39 i++; 40 } else if (count > n) { 41 /* Adjust for the over estimation */ 42 ns_col[i - 1] += n - count; 43 } 44 *size = i; 45 *ns = ns_col; 46 PetscFunctionReturn(PETSC_SUCCESS); 47 } 48 49 /* 50 This builds symmetric version of nonzero structure, 51 */ 52 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 53 { 54 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 55 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 56 PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2; 57 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 58 59 PetscFunctionBegin; 60 nslim_row = a->inode.node_count; 61 m = A->rmap->n; 62 n = A->cmap->n; 63 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 64 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 65 66 /* Use the row_inode as column_inode */ 67 nslim_col = nslim_row; 68 ns_col = ns_row; 69 70 /* allocate space for reformatted inode structure */ 71 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 72 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1]; 73 74 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 75 nsz = ns_col[i1]; 76 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 77 } 78 /* allocate space for row pointers */ 79 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 80 *iia = ia; 81 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 82 83 /* determine the number of columns in each row */ 84 ia[0] = oshift; 85 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 86 j = aj + ai[row] + ishift; 87 jmax = aj + ai[row + 1] + ishift; 88 if (j == jmax) continue; /* empty row */ 89 col = *j++ + ishift; 90 i2 = tvc[col]; 91 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 92 ia[i1 + 1]++; 93 ia[i2 + 1]++; 94 i2++; /* Start col of next node */ 95 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 96 i2 = tvc[col]; 97 } 98 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 99 } 100 101 /* shift ia[i] to point to next row */ 102 for (i1 = 1; i1 < nslim_row + 1; i1++) { 103 row = ia[i1 - 1]; 104 ia[i1] += row; 105 work[i1 - 1] = row - oshift; 106 } 107 108 /* allocate space for column pointers */ 109 nz = ia[nslim_row] + (!ishift); 110 PetscCall(PetscMalloc1(nz, &ja)); 111 *jja = ja; 112 113 /* loop over lower triangular part putting into ja */ 114 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 115 j = aj + ai[row] + ishift; 116 jmax = aj + ai[row + 1] + ishift; 117 if (j == jmax) continue; /* empty row */ 118 col = *j++ + ishift; 119 i2 = tvc[col]; 120 while (i2 < i1 && j < jmax) { 121 ja[work[i2]++] = i1 + oshift; 122 ja[work[i1]++] = i2 + oshift; 123 ++i2; 124 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 125 i2 = tvc[col]; 126 } 127 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 128 } 129 PetscCall(PetscFree(work)); 130 PetscCall(PetscFree2(tns, tvc)); 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 /* 135 This builds nonsymmetric version of nonzero structure, 136 */ 137 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 138 { 139 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 140 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 141 PetscInt *tns, *tvc, nsz, i1, i2; 142 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size; 143 144 PetscFunctionBegin; 145 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 146 nslim_row = a->inode.node_count; 147 n = A->cmap->n; 148 149 /* Create The column_inode for this matrix */ 150 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 151 152 /* allocate space for reformatted column_inode structure */ 153 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 154 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 155 156 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 157 nsz = ns_col[i1]; 158 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 159 } 160 /* allocate space for row pointers */ 161 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 162 *iia = ia; 163 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 164 165 /* determine the number of columns in each row */ 166 ia[0] = oshift; 167 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 168 j = aj + ai[row] + ishift; 169 nz = ai[row + 1] - ai[row]; 170 if (!nz) continue; /* empty row */ 171 col = *j++ + ishift; 172 i2 = tvc[col]; 173 while (nz-- > 0) { /* off-diagonal elements */ 174 ia[i1 + 1]++; 175 i2++; /* Start col of next node */ 176 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 177 if (nz > 0) i2 = tvc[col]; 178 } 179 } 180 181 /* shift ia[i] to point to next row */ 182 for (i1 = 1; i1 < nslim_row + 1; i1++) { 183 row = ia[i1 - 1]; 184 ia[i1] += row; 185 work[i1 - 1] = row - oshift; 186 } 187 188 /* allocate space for column pointers */ 189 nz = ia[nslim_row] + (!ishift); 190 PetscCall(PetscMalloc1(nz, &ja)); 191 *jja = ja; 192 193 /* loop over matrix putting into ja */ 194 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 195 j = aj + ai[row] + ishift; 196 nz = ai[row + 1] - ai[row]; 197 if (!nz) continue; /* empty row */ 198 col = *j++ + ishift; 199 i2 = tvc[col]; 200 while (nz-- > 0) { 201 ja[work[i1]++] = i2 + oshift; 202 ++i2; 203 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 204 if (nz > 0) i2 = tvc[col]; 205 } 206 } 207 PetscCall(PetscFree(ns_col)); 208 PetscCall(PetscFree(work)); 209 PetscCall(PetscFree2(tns, tvc)); 210 PetscFunctionReturn(PETSC_SUCCESS); 211 } 212 213 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 214 { 215 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 216 217 PetscFunctionBegin; 218 if (n) *n = a->inode.node_count; 219 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 220 if (!blockcompressed) { 221 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 222 } else if (symmetric) { 223 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 224 } else { 225 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 226 } 227 PetscFunctionReturn(PETSC_SUCCESS); 228 } 229 230 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 231 { 232 PetscFunctionBegin; 233 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 234 235 if (!blockcompressed) { 236 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 237 } else { 238 PetscCall(PetscFree(*ia)); 239 PetscCall(PetscFree(*ja)); 240 } 241 PetscFunctionReturn(PETSC_SUCCESS); 242 } 243 244 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 245 { 246 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 247 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 248 PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j; 249 250 PetscFunctionBegin; 251 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 252 nslim_row = a->inode.node_count; 253 n = A->cmap->n; 254 255 /* Create The column_inode for this matrix */ 256 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 257 258 /* allocate space for reformatted column_inode structure */ 259 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 260 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 261 262 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 263 nsz = ns_col[i1]; 264 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 265 } 266 /* allocate space for column pointers */ 267 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 268 *iia = ia; 269 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 270 271 /* determine the number of columns in each row */ 272 ia[0] = oshift; 273 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 274 j = aj + ai[row] + ishift; 275 col = *j++ + ishift; 276 i2 = tvc[col]; 277 nz = ai[row + 1] - ai[row]; 278 while (nz-- > 0) { /* off-diagonal elements */ 279 /* ia[i1+1]++; */ 280 ia[i2 + 1]++; 281 i2++; 282 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 283 if (nz > 0) i2 = tvc[col]; 284 } 285 } 286 287 /* shift ia[i] to point to next col */ 288 for (i1 = 1; i1 < nslim_col + 1; i1++) { 289 col = ia[i1 - 1]; 290 ia[i1] += col; 291 work[i1 - 1] = col - oshift; 292 } 293 294 /* allocate space for column pointers */ 295 nz = ia[nslim_col] + (!ishift); 296 PetscCall(PetscMalloc1(nz, &ja)); 297 *jja = ja; 298 299 /* loop over matrix putting into ja */ 300 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 301 j = aj + ai[row] + ishift; 302 col = *j++ + ishift; 303 i2 = tvc[col]; 304 nz = ai[row + 1] - ai[row]; 305 while (nz-- > 0) { 306 /* ja[work[i1]++] = i2 + oshift; */ 307 ja[work[i2]++] = i1 + oshift; 308 i2++; 309 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 310 if (nz > 0) i2 = tvc[col]; 311 } 312 } 313 PetscCall(PetscFree(ns_col)); 314 PetscCall(PetscFree(work)); 315 PetscCall(PetscFree2(tns, tvc)); 316 PetscFunctionReturn(PETSC_SUCCESS); 317 } 318 319 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 320 { 321 PetscFunctionBegin; 322 PetscCall(MatCreateColInode_Private(A, n, NULL)); 323 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 324 325 if (!blockcompressed) { 326 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 327 } else if (symmetric) { 328 /* Since the indices are symmetric it doesn't matter */ 329 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 330 } else { 331 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 332 } 333 PetscFunctionReturn(PETSC_SUCCESS); 334 } 335 336 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 337 { 338 PetscFunctionBegin; 339 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 340 if (!blockcompressed) { 341 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 342 } else { 343 PetscCall(PetscFree(*ia)); 344 PetscCall(PetscFree(*ja)); 345 } 346 PetscFunctionReturn(PETSC_SUCCESS); 347 } 348 349 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 350 { 351 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 352 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 353 PetscScalar *y; 354 const PetscScalar *x; 355 const MatScalar *v1, *v2, *v3, *v4, *v5; 356 PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 357 const PetscInt *idx, *ns, *ii; 358 359 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 360 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 361 #endif 362 363 PetscFunctionBegin; 364 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 365 node_max = a->inode.node_count; 366 ns = a->inode.size; /* Node Size array */ 367 PetscCall(VecGetArrayRead(xx, &x)); 368 PetscCall(VecGetArray(yy, &y)); 369 idx = a->j; 370 v1 = a->a; 371 ii = a->i; 372 373 for (i = 0, row = 0; i < node_max; ++i) { 374 nsz = ns[i]; 375 n = ii[1] - ii[0]; 376 nonzerorow += (n > 0) * nsz; 377 ii += nsz; 378 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 379 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 380 sz = n; /* No of non zeros in this row */ 381 /* Switch on the size of Node */ 382 switch (nsz) { /* Each loop in 'case' is unrolled */ 383 case 1: 384 sum1 = 0.; 385 386 for (n = 0; n < sz - 1; n += 2) { 387 i1 = idx[0]; /* The instructions are ordered to */ 388 i2 = idx[1]; /* make the compiler's job easy */ 389 idx += 2; 390 tmp0 = x[i1]; 391 tmp1 = x[i2]; 392 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 393 v1 += 2; 394 } 395 396 if (n == sz - 1) { /* Take care of the last nonzero */ 397 tmp0 = x[*idx++]; 398 sum1 += *v1++ * tmp0; 399 } 400 y[row++] = sum1; 401 break; 402 case 2: 403 sum1 = 0.; 404 sum2 = 0.; 405 v2 = v1 + n; 406 407 for (n = 0; n < sz - 1; n += 2) { 408 i1 = idx[0]; 409 i2 = idx[1]; 410 idx += 2; 411 tmp0 = x[i1]; 412 tmp1 = x[i2]; 413 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 414 v1 += 2; 415 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 416 v2 += 2; 417 } 418 if (n == sz - 1) { 419 tmp0 = x[*idx++]; 420 sum1 += *v1++ * tmp0; 421 sum2 += *v2++ * tmp0; 422 } 423 y[row++] = sum1; 424 y[row++] = sum2; 425 v1 = v2; /* Since the next block to be processed starts there*/ 426 idx += sz; 427 break; 428 case 3: 429 sum1 = 0.; 430 sum2 = 0.; 431 sum3 = 0.; 432 v2 = v1 + n; 433 v3 = v2 + n; 434 435 for (n = 0; n < sz - 1; n += 2) { 436 i1 = idx[0]; 437 i2 = idx[1]; 438 idx += 2; 439 tmp0 = x[i1]; 440 tmp1 = x[i2]; 441 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 442 v1 += 2; 443 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 444 v2 += 2; 445 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 446 v3 += 2; 447 } 448 if (n == sz - 1) { 449 tmp0 = x[*idx++]; 450 sum1 += *v1++ * tmp0; 451 sum2 += *v2++ * tmp0; 452 sum3 += *v3++ * tmp0; 453 } 454 y[row++] = sum1; 455 y[row++] = sum2; 456 y[row++] = sum3; 457 v1 = v3; /* Since the next block to be processed starts there*/ 458 idx += 2 * sz; 459 break; 460 case 4: 461 sum1 = 0.; 462 sum2 = 0.; 463 sum3 = 0.; 464 sum4 = 0.; 465 v2 = v1 + n; 466 v3 = v2 + n; 467 v4 = v3 + n; 468 469 for (n = 0; n < sz - 1; n += 2) { 470 i1 = idx[0]; 471 i2 = idx[1]; 472 idx += 2; 473 tmp0 = x[i1]; 474 tmp1 = x[i2]; 475 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 476 v1 += 2; 477 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 478 v2 += 2; 479 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 480 v3 += 2; 481 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 482 v4 += 2; 483 } 484 if (n == sz - 1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 sum4 += *v4++ * tmp0; 490 } 491 y[row++] = sum1; 492 y[row++] = sum2; 493 y[row++] = sum3; 494 y[row++] = sum4; 495 v1 = v4; /* Since the next block to be processed starts there*/ 496 idx += 3 * sz; 497 break; 498 case 5: 499 sum1 = 0.; 500 sum2 = 0.; 501 sum3 = 0.; 502 sum4 = 0.; 503 sum5 = 0.; 504 v2 = v1 + n; 505 v3 = v2 + n; 506 v4 = v3 + n; 507 v5 = v4 + n; 508 509 for (n = 0; n < sz - 1; n += 2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 516 v1 += 2; 517 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 518 v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 520 v3 += 2; 521 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 522 v4 += 2; 523 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 524 v5 += 2; 525 } 526 if (n == sz - 1) { 527 tmp0 = x[*idx++]; 528 sum1 += *v1++ * tmp0; 529 sum2 += *v2++ * tmp0; 530 sum3 += *v3++ * tmp0; 531 sum4 += *v4++ * tmp0; 532 sum5 += *v5++ * tmp0; 533 } 534 y[row++] = sum1; 535 y[row++] = sum2; 536 y[row++] = sum3; 537 y[row++] = sum4; 538 y[row++] = sum5; 539 v1 = v5; /* Since the next block to be processed starts there */ 540 idx += 4 * sz; 541 break; 542 default: 543 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 544 } 545 } 546 PetscCall(VecRestoreArrayRead(xx, &x)); 547 PetscCall(VecRestoreArray(yy, &y)); 548 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 549 PetscFunctionReturn(PETSC_SUCCESS); 550 } 551 552 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 553 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 554 { 555 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 556 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 557 const MatScalar *v1, *v2, *v3, *v4, *v5; 558 const PetscScalar *x; 559 PetscScalar *y, *z, *zt; 560 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 561 const PetscInt *idx, *ns, *ii; 562 563 PetscFunctionBegin; 564 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 565 node_max = a->inode.node_count; 566 ns = a->inode.size; /* Node Size array */ 567 568 PetscCall(VecGetArrayRead(xx, &x)); 569 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 570 zt = z; 571 572 idx = a->j; 573 v1 = a->a; 574 ii = a->i; 575 576 for (i = 0, row = 0; i < node_max; ++i) { 577 nsz = ns[i]; 578 n = ii[1] - ii[0]; 579 ii += nsz; 580 sz = n; /* No of non zeros in this row */ 581 /* Switch on the size of Node */ 582 switch (nsz) { /* Each loop in 'case' is unrolled */ 583 case 1: 584 sum1 = *zt++; 585 586 for (n = 0; n < sz - 1; n += 2) { 587 i1 = idx[0]; /* The instructions are ordered to */ 588 i2 = idx[1]; /* make the compiler's job easy */ 589 idx += 2; 590 tmp0 = x[i1]; 591 tmp1 = x[i2]; 592 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 593 v1 += 2; 594 } 595 596 if (n == sz - 1) { /* Take care of the last nonzero */ 597 tmp0 = x[*idx++]; 598 sum1 += *v1++ * tmp0; 599 } 600 y[row++] = sum1; 601 break; 602 case 2: 603 sum1 = *zt++; 604 sum2 = *zt++; 605 v2 = v1 + n; 606 607 for (n = 0; n < sz - 1; n += 2) { 608 i1 = idx[0]; 609 i2 = idx[1]; 610 idx += 2; 611 tmp0 = x[i1]; 612 tmp1 = x[i2]; 613 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 614 v1 += 2; 615 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 616 v2 += 2; 617 } 618 if (n == sz - 1) { 619 tmp0 = x[*idx++]; 620 sum1 += *v1++ * tmp0; 621 sum2 += *v2++ * tmp0; 622 } 623 y[row++] = sum1; 624 y[row++] = sum2; 625 v1 = v2; /* Since the next block to be processed starts there*/ 626 idx += sz; 627 break; 628 case 3: 629 sum1 = *zt++; 630 sum2 = *zt++; 631 sum3 = *zt++; 632 v2 = v1 + n; 633 v3 = v2 + n; 634 635 for (n = 0; n < sz - 1; n += 2) { 636 i1 = idx[0]; 637 i2 = idx[1]; 638 idx += 2; 639 tmp0 = x[i1]; 640 tmp1 = x[i2]; 641 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 642 v1 += 2; 643 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 644 v2 += 2; 645 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 646 v3 += 2; 647 } 648 if (n == sz - 1) { 649 tmp0 = x[*idx++]; 650 sum1 += *v1++ * tmp0; 651 sum2 += *v2++ * tmp0; 652 sum3 += *v3++ * tmp0; 653 } 654 y[row++] = sum1; 655 y[row++] = sum2; 656 y[row++] = sum3; 657 v1 = v3; /* Since the next block to be processed starts there*/ 658 idx += 2 * sz; 659 break; 660 case 4: 661 sum1 = *zt++; 662 sum2 = *zt++; 663 sum3 = *zt++; 664 sum4 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 v4 = v3 + n; 668 669 for (n = 0; n < sz - 1; n += 2) { 670 i1 = idx[0]; 671 i2 = idx[1]; 672 idx += 2; 673 tmp0 = x[i1]; 674 tmp1 = x[i2]; 675 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 676 v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 678 v2 += 2; 679 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 680 v3 += 2; 681 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 682 v4 += 2; 683 } 684 if (n == sz - 1) { 685 tmp0 = x[*idx++]; 686 sum1 += *v1++ * tmp0; 687 sum2 += *v2++ * tmp0; 688 sum3 += *v3++ * tmp0; 689 sum4 += *v4++ * tmp0; 690 } 691 y[row++] = sum1; 692 y[row++] = sum2; 693 y[row++] = sum3; 694 y[row++] = sum4; 695 v1 = v4; /* Since the next block to be processed starts there*/ 696 idx += 3 * sz; 697 break; 698 case 5: 699 sum1 = *zt++; 700 sum2 = *zt++; 701 sum3 = *zt++; 702 sum4 = *zt++; 703 sum5 = *zt++; 704 v2 = v1 + n; 705 v3 = v2 + n; 706 v4 = v3 + n; 707 v5 = v4 + n; 708 709 for (n = 0; n < sz - 1; n += 2) { 710 i1 = idx[0]; 711 i2 = idx[1]; 712 idx += 2; 713 tmp0 = x[i1]; 714 tmp1 = x[i2]; 715 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 716 v1 += 2; 717 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 718 v2 += 2; 719 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 720 v3 += 2; 721 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 722 v4 += 2; 723 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 724 v5 += 2; 725 } 726 if (n == sz - 1) { 727 tmp0 = x[*idx++]; 728 sum1 += *v1++ * tmp0; 729 sum2 += *v2++ * tmp0; 730 sum3 += *v3++ * tmp0; 731 sum4 += *v4++ * tmp0; 732 sum5 += *v5++ * tmp0; 733 } 734 y[row++] = sum1; 735 y[row++] = sum2; 736 y[row++] = sum3; 737 y[row++] = sum4; 738 y[row++] = sum5; 739 v1 = v5; /* Since the next block to be processed starts there */ 740 idx += 4 * sz; 741 break; 742 default: 743 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 744 } 745 } 746 PetscCall(VecRestoreArrayRead(xx, &x)); 747 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 748 PetscCall(PetscLogFlops(2.0 * a->nz)); 749 PetscFunctionReturn(PETSC_SUCCESS); 750 } 751 752 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 753 { 754 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 755 IS iscol = a->col, isrow = a->row; 756 const PetscInt *r, *c, *rout, *cout; 757 PetscInt i, j, n = A->rmap->n, nz; 758 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 759 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 760 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 761 PetscScalar sum1, sum2, sum3, sum4, sum5; 762 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 763 const PetscScalar *b; 764 765 PetscFunctionBegin; 766 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 767 node_max = a->inode.node_count; 768 ns = a->inode.size; /* Node Size array */ 769 770 PetscCall(VecGetArrayRead(bb, &b)); 771 PetscCall(VecGetArrayWrite(xx, &x)); 772 tmp = a->solve_work; 773 774 PetscCall(ISGetIndices(isrow, &rout)); 775 r = rout; 776 PetscCall(ISGetIndices(iscol, &cout)); 777 c = cout + (n - 1); 778 779 /* forward solve the lower triangular */ 780 tmps = tmp; 781 aa = a_a; 782 aj = a_j; 783 ad = a->diag; 784 785 for (i = 0, row = 0; i < node_max; ++i) { 786 nsz = ns[i]; 787 aii = ai[row]; 788 v1 = aa + aii; 789 vi = aj + aii; 790 nz = ad[row] - aii; 791 if (i < node_max - 1) { 792 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 793 * but our indexing to determine its size could. */ 794 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 795 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 796 PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 797 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 798 } 799 800 switch (nsz) { /* Each loop in 'case' is unrolled */ 801 case 1: 802 sum1 = b[*r++]; 803 for (j = 0; j < nz - 1; j += 2) { 804 i0 = vi[0]; 805 i1 = vi[1]; 806 vi += 2; 807 tmp0 = tmps[i0]; 808 tmp1 = tmps[i1]; 809 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 810 v1 += 2; 811 } 812 if (j == nz - 1) { 813 tmp0 = tmps[*vi++]; 814 sum1 -= *v1++ * tmp0; 815 } 816 tmp[row++] = sum1; 817 break; 818 case 2: 819 sum1 = b[*r++]; 820 sum2 = b[*r++]; 821 v2 = aa + ai[row + 1]; 822 823 for (j = 0; j < nz - 1; j += 2) { 824 i0 = vi[0]; 825 i1 = vi[1]; 826 vi += 2; 827 tmp0 = tmps[i0]; 828 tmp1 = tmps[i1]; 829 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 830 v1 += 2; 831 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 832 v2 += 2; 833 } 834 if (j == nz - 1) { 835 tmp0 = tmps[*vi++]; 836 sum1 -= *v1++ * tmp0; 837 sum2 -= *v2++ * tmp0; 838 } 839 sum2 -= *v2++ * sum1; 840 tmp[row++] = sum1; 841 tmp[row++] = sum2; 842 break; 843 case 3: 844 sum1 = b[*r++]; 845 sum2 = b[*r++]; 846 sum3 = b[*r++]; 847 v2 = aa + ai[row + 1]; 848 v3 = aa + ai[row + 2]; 849 850 for (j = 0; j < nz - 1; j += 2) { 851 i0 = vi[0]; 852 i1 = vi[1]; 853 vi += 2; 854 tmp0 = tmps[i0]; 855 tmp1 = tmps[i1]; 856 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 857 v1 += 2; 858 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 859 v2 += 2; 860 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 861 v3 += 2; 862 } 863 if (j == nz - 1) { 864 tmp0 = tmps[*vi++]; 865 sum1 -= *v1++ * tmp0; 866 sum2 -= *v2++ * tmp0; 867 sum3 -= *v3++ * tmp0; 868 } 869 sum2 -= *v2++ * sum1; 870 sum3 -= *v3++ * sum1; 871 sum3 -= *v3++ * sum2; 872 873 tmp[row++] = sum1; 874 tmp[row++] = sum2; 875 tmp[row++] = sum3; 876 break; 877 878 case 4: 879 sum1 = b[*r++]; 880 sum2 = b[*r++]; 881 sum3 = b[*r++]; 882 sum4 = b[*r++]; 883 v2 = aa + ai[row + 1]; 884 v3 = aa + ai[row + 2]; 885 v4 = aa + ai[row + 3]; 886 887 for (j = 0; j < nz - 1; j += 2) { 888 i0 = vi[0]; 889 i1 = vi[1]; 890 vi += 2; 891 tmp0 = tmps[i0]; 892 tmp1 = tmps[i1]; 893 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 894 v1 += 2; 895 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 896 v2 += 2; 897 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 898 v3 += 2; 899 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 900 v4 += 2; 901 } 902 if (j == nz - 1) { 903 tmp0 = tmps[*vi++]; 904 sum1 -= *v1++ * tmp0; 905 sum2 -= *v2++ * tmp0; 906 sum3 -= *v3++ * tmp0; 907 sum4 -= *v4++ * tmp0; 908 } 909 sum2 -= *v2++ * sum1; 910 sum3 -= *v3++ * sum1; 911 sum4 -= *v4++ * sum1; 912 sum3 -= *v3++ * sum2; 913 sum4 -= *v4++ * sum2; 914 sum4 -= *v4++ * sum3; 915 916 tmp[row++] = sum1; 917 tmp[row++] = sum2; 918 tmp[row++] = sum3; 919 tmp[row++] = sum4; 920 break; 921 case 5: 922 sum1 = b[*r++]; 923 sum2 = b[*r++]; 924 sum3 = b[*r++]; 925 sum4 = b[*r++]; 926 sum5 = b[*r++]; 927 v2 = aa + ai[row + 1]; 928 v3 = aa + ai[row + 2]; 929 v4 = aa + ai[row + 3]; 930 v5 = aa + ai[row + 4]; 931 932 for (j = 0; j < nz - 1; j += 2) { 933 i0 = vi[0]; 934 i1 = vi[1]; 935 vi += 2; 936 tmp0 = tmps[i0]; 937 tmp1 = tmps[i1]; 938 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 939 v1 += 2; 940 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 941 v2 += 2; 942 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 943 v3 += 2; 944 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 945 v4 += 2; 946 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 947 v5 += 2; 948 } 949 if (j == nz - 1) { 950 tmp0 = tmps[*vi++]; 951 sum1 -= *v1++ * tmp0; 952 sum2 -= *v2++ * tmp0; 953 sum3 -= *v3++ * tmp0; 954 sum4 -= *v4++ * tmp0; 955 sum5 -= *v5++ * tmp0; 956 } 957 958 sum2 -= *v2++ * sum1; 959 sum3 -= *v3++ * sum1; 960 sum4 -= *v4++ * sum1; 961 sum5 -= *v5++ * sum1; 962 sum3 -= *v3++ * sum2; 963 sum4 -= *v4++ * sum2; 964 sum5 -= *v5++ * sum2; 965 sum4 -= *v4++ * sum3; 966 sum5 -= *v5++ * sum3; 967 sum5 -= *v5++ * sum4; 968 969 tmp[row++] = sum1; 970 tmp[row++] = sum2; 971 tmp[row++] = sum3; 972 tmp[row++] = sum4; 973 tmp[row++] = sum5; 974 break; 975 default: 976 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 977 } 978 } 979 /* backward solve the upper triangular */ 980 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 981 nsz = ns[i]; 982 aii = ai[row + 1] - 1; 983 v1 = aa + aii; 984 vi = aj + aii; 985 nz = aii - ad[row]; 986 switch (nsz) { /* Each loop in 'case' is unrolled */ 987 case 1: 988 sum1 = tmp[row]; 989 990 for (j = nz; j > 1; j -= 2) { 991 vi -= 2; 992 i0 = vi[2]; 993 i1 = vi[1]; 994 tmp0 = tmps[i0]; 995 tmp1 = tmps[i1]; 996 v1 -= 2; 997 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 998 } 999 if (j == 1) { 1000 tmp0 = tmps[*vi--]; 1001 sum1 -= *v1-- * tmp0; 1002 } 1003 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1004 row--; 1005 break; 1006 case 2: 1007 sum1 = tmp[row]; 1008 sum2 = tmp[row - 1]; 1009 v2 = aa + ai[row] - 1; 1010 for (j = nz; j > 1; j -= 2) { 1011 vi -= 2; 1012 i0 = vi[2]; 1013 i1 = vi[1]; 1014 tmp0 = tmps[i0]; 1015 tmp1 = tmps[i1]; 1016 v1 -= 2; 1017 v2 -= 2; 1018 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1019 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1020 } 1021 if (j == 1) { 1022 tmp0 = tmps[*vi--]; 1023 sum1 -= *v1-- * tmp0; 1024 sum2 -= *v2-- * tmp0; 1025 } 1026 1027 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1028 row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1031 row--; 1032 break; 1033 case 3: 1034 sum1 = tmp[row]; 1035 sum2 = tmp[row - 1]; 1036 sum3 = tmp[row - 2]; 1037 v2 = aa + ai[row] - 1; 1038 v3 = aa + ai[row - 1] - 1; 1039 for (j = nz; j > 1; j -= 2) { 1040 vi -= 2; 1041 i0 = vi[2]; 1042 i1 = vi[1]; 1043 tmp0 = tmps[i0]; 1044 tmp1 = tmps[i1]; 1045 v1 -= 2; 1046 v2 -= 2; 1047 v3 -= 2; 1048 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1049 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1050 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1051 } 1052 if (j == 1) { 1053 tmp0 = tmps[*vi--]; 1054 sum1 -= *v1-- * tmp0; 1055 sum2 -= *v2-- * tmp0; 1056 sum3 -= *v3-- * tmp0; 1057 } 1058 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1059 row--; 1060 sum2 -= *v2-- * tmp0; 1061 sum3 -= *v3-- * tmp0; 1062 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1063 row--; 1064 sum3 -= *v3-- * tmp0; 1065 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1066 row--; 1067 1068 break; 1069 case 4: 1070 sum1 = tmp[row]; 1071 sum2 = tmp[row - 1]; 1072 sum3 = tmp[row - 2]; 1073 sum4 = tmp[row - 3]; 1074 v2 = aa + ai[row] - 1; 1075 v3 = aa + ai[row - 1] - 1; 1076 v4 = aa + ai[row - 2] - 1; 1077 1078 for (j = nz; j > 1; j -= 2) { 1079 vi -= 2; 1080 i0 = vi[2]; 1081 i1 = vi[1]; 1082 tmp0 = tmps[i0]; 1083 tmp1 = tmps[i1]; 1084 v1 -= 2; 1085 v2 -= 2; 1086 v3 -= 2; 1087 v4 -= 2; 1088 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1089 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1090 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1091 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1092 } 1093 if (j == 1) { 1094 tmp0 = tmps[*vi--]; 1095 sum1 -= *v1-- * tmp0; 1096 sum2 -= *v2-- * tmp0; 1097 sum3 -= *v3-- * tmp0; 1098 sum4 -= *v4-- * tmp0; 1099 } 1100 1101 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1102 row--; 1103 sum2 -= *v2-- * tmp0; 1104 sum3 -= *v3-- * tmp0; 1105 sum4 -= *v4-- * tmp0; 1106 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1107 row--; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1111 row--; 1112 sum4 -= *v4-- * tmp0; 1113 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1114 row--; 1115 break; 1116 case 5: 1117 sum1 = tmp[row]; 1118 sum2 = tmp[row - 1]; 1119 sum3 = tmp[row - 2]; 1120 sum4 = tmp[row - 3]; 1121 sum5 = tmp[row - 4]; 1122 v2 = aa + ai[row] - 1; 1123 v3 = aa + ai[row - 1] - 1; 1124 v4 = aa + ai[row - 2] - 1; 1125 v5 = aa + ai[row - 3] - 1; 1126 for (j = nz; j > 1; j -= 2) { 1127 vi -= 2; 1128 i0 = vi[2]; 1129 i1 = vi[1]; 1130 tmp0 = tmps[i0]; 1131 tmp1 = tmps[i1]; 1132 v1 -= 2; 1133 v2 -= 2; 1134 v3 -= 2; 1135 v4 -= 2; 1136 v5 -= 2; 1137 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1138 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1139 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1140 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1141 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1142 } 1143 if (j == 1) { 1144 tmp0 = tmps[*vi--]; 1145 sum1 -= *v1-- * tmp0; 1146 sum2 -= *v2-- * tmp0; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 } 1151 1152 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1153 row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1159 row--; 1160 sum3 -= *v3-- * tmp0; 1161 sum4 -= *v4-- * tmp0; 1162 sum5 -= *v5-- * tmp0; 1163 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1164 row--; 1165 sum4 -= *v4-- * tmp0; 1166 sum5 -= *v5-- * tmp0; 1167 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1168 row--; 1169 sum5 -= *v5-- * tmp0; 1170 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1171 row--; 1172 break; 1173 default: 1174 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1175 } 1176 } 1177 PetscCall(ISRestoreIndices(isrow, &rout)); 1178 PetscCall(ISRestoreIndices(iscol, &cout)); 1179 PetscCall(VecRestoreArrayRead(bb, &b)); 1180 PetscCall(VecRestoreArrayWrite(xx, &x)); 1181 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1182 PetscFunctionReturn(PETSC_SUCCESS); 1183 } 1184 1185 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1186 { 1187 Mat C = B; 1188 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1189 IS isrow = b->row, isicol = b->icol; 1190 const PetscInt *r, *ic, *ics; 1191 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1192 PetscInt i, j, k, nz, nzL, row, *pj; 1193 const PetscInt *ajtmp, *bjtmp; 1194 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1195 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1196 FactorShiftCtx sctx; 1197 const PetscInt *ddiag; 1198 PetscReal rs; 1199 MatScalar d; 1200 PetscInt inod, nodesz, node_max, col; 1201 const PetscInt *ns; 1202 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1203 1204 PetscFunctionBegin; 1205 /* MatPivotSetUp(): initialize shift context sctx */ 1206 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1207 1208 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1209 ddiag = a->diag; 1210 sctx.shift_top = info->zeropivot; 1211 for (i = 0; i < n; i++) { 1212 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1213 d = (aa)[ddiag[i]]; 1214 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1215 v = aa + ai[i]; 1216 nz = ai[i + 1] - ai[i]; 1217 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1218 if (rs > sctx.shift_top) sctx.shift_top = rs; 1219 } 1220 sctx.shift_top *= 1.1; 1221 sctx.nshift_max = 5; 1222 sctx.shift_lo = 0.; 1223 sctx.shift_hi = 1.; 1224 } 1225 1226 PetscCall(ISGetIndices(isrow, &r)); 1227 PetscCall(ISGetIndices(isicol, &ic)); 1228 1229 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1230 ics = ic; 1231 1232 node_max = a->inode.node_count; 1233 ns = a->inode.size; 1234 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1235 1236 /* If max inode size > 4, split it into two inodes.*/ 1237 /* also map the inode sizes according to the ordering */ 1238 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1239 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1240 if (ns[i] > 4) { 1241 tmp_vec1[j] = 4; 1242 ++j; 1243 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 1244 } else { 1245 tmp_vec1[j] = ns[i]; 1246 } 1247 } 1248 /* Use the correct node_max */ 1249 node_max = j; 1250 1251 /* Now reorder the inode info based on mat re-ordering info */ 1252 /* First create a row -> inode_size_array_index map */ 1253 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1254 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1255 for (i = 0, row = 0; i < node_max; i++) { 1256 nodesz = tmp_vec1[i]; 1257 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1258 } 1259 /* Using nsmap, create a reordered ns structure */ 1260 for (i = 0, j = 0; i < node_max; i++) { 1261 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1262 tmp_vec2[i] = nodesz; 1263 j += nodesz; 1264 } 1265 PetscCall(PetscFree(nsmap)); 1266 PetscCall(PetscFree(tmp_vec1)); 1267 1268 /* Now use the correct ns */ 1269 ns = tmp_vec2; 1270 1271 do { 1272 sctx.newshift = PETSC_FALSE; 1273 /* Now loop over each block-row, and do the factorization */ 1274 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1275 nodesz = ns[inod]; 1276 1277 switch (nodesz) { 1278 case 1: 1279 /* zero rtmp1 */ 1280 /* L part */ 1281 nz = bi[i + 1] - bi[i]; 1282 bjtmp = bj + bi[i]; 1283 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1284 1285 /* U part */ 1286 nz = bdiag[i] - bdiag[i + 1]; 1287 bjtmp = bj + bdiag[i + 1] + 1; 1288 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1289 1290 /* load in initial (unfactored row) */ 1291 nz = ai[r[i] + 1] - ai[r[i]]; 1292 ajtmp = aj + ai[r[i]]; 1293 v = aa + ai[r[i]]; 1294 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1295 1296 /* ZeropivotApply() */ 1297 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1298 1299 /* elimination */ 1300 bjtmp = bj + bi[i]; 1301 row = *bjtmp++; 1302 nzL = bi[i + 1] - bi[i]; 1303 for (k = 0; k < nzL; k++) { 1304 pc = rtmp1 + row; 1305 if (*pc != 0.0) { 1306 pv = b->a + bdiag[row]; 1307 mul1 = *pc * (*pv); 1308 *pc = mul1; 1309 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1310 pv = b->a + bdiag[row + 1] + 1; 1311 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1312 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1313 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1314 } 1315 row = *bjtmp++; 1316 } 1317 1318 /* finished row so stick it into b->a */ 1319 rs = 0.0; 1320 /* L part */ 1321 pv = b->a + bi[i]; 1322 pj = b->j + bi[i]; 1323 nz = bi[i + 1] - bi[i]; 1324 for (j = 0; j < nz; j++) { 1325 pv[j] = rtmp1[pj[j]]; 1326 rs += PetscAbsScalar(pv[j]); 1327 } 1328 1329 /* U part */ 1330 pv = b->a + bdiag[i + 1] + 1; 1331 pj = b->j + bdiag[i + 1] + 1; 1332 nz = bdiag[i] - bdiag[i + 1] - 1; 1333 for (j = 0; j < nz; j++) { 1334 pv[j] = rtmp1[pj[j]]; 1335 rs += PetscAbsScalar(pv[j]); 1336 } 1337 1338 /* Check zero pivot */ 1339 sctx.rs = rs; 1340 sctx.pv = rtmp1[i]; 1341 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1342 if (sctx.newshift) break; 1343 1344 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1345 pv = b->a + bdiag[i]; 1346 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1347 break; 1348 1349 case 2: 1350 /* zero rtmp1 and rtmp2 */ 1351 /* L part */ 1352 nz = bi[i + 1] - bi[i]; 1353 bjtmp = bj + bi[i]; 1354 for (j = 0; j < nz; j++) { 1355 col = bjtmp[j]; 1356 rtmp1[col] = 0.0; 1357 rtmp2[col] = 0.0; 1358 } 1359 1360 /* U part */ 1361 nz = bdiag[i] - bdiag[i + 1]; 1362 bjtmp = bj + bdiag[i + 1] + 1; 1363 for (j = 0; j < nz; j++) { 1364 col = bjtmp[j]; 1365 rtmp1[col] = 0.0; 1366 rtmp2[col] = 0.0; 1367 } 1368 1369 /* load in initial (unfactored row) */ 1370 nz = ai[r[i] + 1] - ai[r[i]]; 1371 ajtmp = aj + ai[r[i]]; 1372 v1 = aa + ai[r[i]]; 1373 v2 = aa + ai[r[i] + 1]; 1374 for (j = 0; j < nz; j++) { 1375 col = ics[ajtmp[j]]; 1376 rtmp1[col] = v1[j]; 1377 rtmp2[col] = v2[j]; 1378 } 1379 /* ZeropivotApply(): shift the diagonal of the matrix */ 1380 rtmp1[i] += sctx.shift_amount; 1381 rtmp2[i + 1] += sctx.shift_amount; 1382 1383 /* elimination */ 1384 bjtmp = bj + bi[i]; 1385 row = *bjtmp++; /* pivot row */ 1386 nzL = bi[i + 1] - bi[i]; 1387 for (k = 0; k < nzL; k++) { 1388 pc1 = rtmp1 + row; 1389 pc2 = rtmp2 + row; 1390 if (*pc1 != 0.0 || *pc2 != 0.0) { 1391 pv = b->a + bdiag[row]; 1392 mul1 = *pc1 * (*pv); 1393 mul2 = *pc2 * (*pv); 1394 *pc1 = mul1; 1395 *pc2 = mul2; 1396 1397 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1398 pv = b->a + bdiag[row + 1] + 1; 1399 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1400 for (j = 0; j < nz; j++) { 1401 col = pj[j]; 1402 rtmp1[col] -= mul1 * pv[j]; 1403 rtmp2[col] -= mul2 * pv[j]; 1404 } 1405 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1406 } 1407 row = *bjtmp++; 1408 } 1409 1410 /* finished row i; check zero pivot, then stick row i into b->a */ 1411 rs = 0.0; 1412 /* L part */ 1413 pc1 = b->a + bi[i]; 1414 pj = b->j + bi[i]; 1415 nz = bi[i + 1] - bi[i]; 1416 for (j = 0; j < nz; j++) { 1417 col = pj[j]; 1418 pc1[j] = rtmp1[col]; 1419 rs += PetscAbsScalar(pc1[j]); 1420 } 1421 /* U part */ 1422 pc1 = b->a + bdiag[i + 1] + 1; 1423 pj = b->j + bdiag[i + 1] + 1; 1424 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1425 for (j = 0; j < nz; j++) { 1426 col = pj[j]; 1427 pc1[j] = rtmp1[col]; 1428 rs += PetscAbsScalar(pc1[j]); 1429 } 1430 1431 sctx.rs = rs; 1432 sctx.pv = rtmp1[i]; 1433 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1434 if (sctx.newshift) break; 1435 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1436 *pc1 = 1.0 / sctx.pv; 1437 1438 /* Now take care of diagonal 2x2 block. */ 1439 pc2 = rtmp2 + i; 1440 if (*pc2 != 0.0) { 1441 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1442 *pc2 = mul1; /* insert L entry */ 1443 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1444 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1445 for (j = 0; j < nz; j++) { 1446 col = pj[j]; 1447 rtmp2[col] -= mul1 * rtmp1[col]; 1448 } 1449 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1450 } 1451 1452 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1453 rs = 0.0; 1454 /* L part */ 1455 pc2 = b->a + bi[i + 1]; 1456 pj = b->j + bi[i + 1]; 1457 nz = bi[i + 2] - bi[i + 1]; 1458 for (j = 0; j < nz; j++) { 1459 col = pj[j]; 1460 pc2[j] = rtmp2[col]; 1461 rs += PetscAbsScalar(pc2[j]); 1462 } 1463 /* U part */ 1464 pc2 = b->a + bdiag[i + 2] + 1; 1465 pj = b->j + bdiag[i + 2] + 1; 1466 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1467 for (j = 0; j < nz; j++) { 1468 col = pj[j]; 1469 pc2[j] = rtmp2[col]; 1470 rs += PetscAbsScalar(pc2[j]); 1471 } 1472 1473 sctx.rs = rs; 1474 sctx.pv = rtmp2[i + 1]; 1475 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1476 if (sctx.newshift) break; 1477 pc2 = b->a + bdiag[i + 1]; 1478 *pc2 = 1.0 / sctx.pv; 1479 break; 1480 1481 case 3: 1482 /* zero rtmp */ 1483 /* L part */ 1484 nz = bi[i + 1] - bi[i]; 1485 bjtmp = bj + bi[i]; 1486 for (j = 0; j < nz; j++) { 1487 col = bjtmp[j]; 1488 rtmp1[col] = 0.0; 1489 rtmp2[col] = 0.0; 1490 rtmp3[col] = 0.0; 1491 } 1492 1493 /* U part */ 1494 nz = bdiag[i] - bdiag[i + 1]; 1495 bjtmp = bj + bdiag[i + 1] + 1; 1496 for (j = 0; j < nz; j++) { 1497 col = bjtmp[j]; 1498 rtmp1[col] = 0.0; 1499 rtmp2[col] = 0.0; 1500 rtmp3[col] = 0.0; 1501 } 1502 1503 /* load in initial (unfactored row) */ 1504 nz = ai[r[i] + 1] - ai[r[i]]; 1505 ajtmp = aj + ai[r[i]]; 1506 v1 = aa + ai[r[i]]; 1507 v2 = aa + ai[r[i] + 1]; 1508 v3 = aa + ai[r[i] + 2]; 1509 for (j = 0; j < nz; j++) { 1510 col = ics[ajtmp[j]]; 1511 rtmp1[col] = v1[j]; 1512 rtmp2[col] = v2[j]; 1513 rtmp3[col] = v3[j]; 1514 } 1515 /* ZeropivotApply(): shift the diagonal of the matrix */ 1516 rtmp1[i] += sctx.shift_amount; 1517 rtmp2[i + 1] += sctx.shift_amount; 1518 rtmp3[i + 2] += sctx.shift_amount; 1519 1520 /* elimination */ 1521 bjtmp = bj + bi[i]; 1522 row = *bjtmp++; /* pivot row */ 1523 nzL = bi[i + 1] - bi[i]; 1524 for (k = 0; k < nzL; k++) { 1525 pc1 = rtmp1 + row; 1526 pc2 = rtmp2 + row; 1527 pc3 = rtmp3 + row; 1528 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1529 pv = b->a + bdiag[row]; 1530 mul1 = *pc1 * (*pv); 1531 mul2 = *pc2 * (*pv); 1532 mul3 = *pc3 * (*pv); 1533 *pc1 = mul1; 1534 *pc2 = mul2; 1535 *pc3 = mul3; 1536 1537 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1538 pv = b->a + bdiag[row + 1] + 1; 1539 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1540 for (j = 0; j < nz; j++) { 1541 col = pj[j]; 1542 rtmp1[col] -= mul1 * pv[j]; 1543 rtmp2[col] -= mul2 * pv[j]; 1544 rtmp3[col] -= mul3 * pv[j]; 1545 } 1546 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1547 } 1548 row = *bjtmp++; 1549 } 1550 1551 /* finished row i; check zero pivot, then stick row i into b->a */ 1552 rs = 0.0; 1553 /* L part */ 1554 pc1 = b->a + bi[i]; 1555 pj = b->j + bi[i]; 1556 nz = bi[i + 1] - bi[i]; 1557 for (j = 0; j < nz; j++) { 1558 col = pj[j]; 1559 pc1[j] = rtmp1[col]; 1560 rs += PetscAbsScalar(pc1[j]); 1561 } 1562 /* U part */ 1563 pc1 = b->a + bdiag[i + 1] + 1; 1564 pj = b->j + bdiag[i + 1] + 1; 1565 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1566 for (j = 0; j < nz; j++) { 1567 col = pj[j]; 1568 pc1[j] = rtmp1[col]; 1569 rs += PetscAbsScalar(pc1[j]); 1570 } 1571 1572 sctx.rs = rs; 1573 sctx.pv = rtmp1[i]; 1574 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1575 if (sctx.newshift) break; 1576 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1577 *pc1 = 1.0 / sctx.pv; 1578 1579 /* Now take care of 1st column of diagonal 3x3 block. */ 1580 pc2 = rtmp2 + i; 1581 pc3 = rtmp3 + i; 1582 if (*pc2 != 0.0 || *pc3 != 0.0) { 1583 mul2 = (*pc2) * (*pc1); 1584 *pc2 = mul2; 1585 mul3 = (*pc3) * (*pc1); 1586 *pc3 = mul3; 1587 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1588 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1589 for (j = 0; j < nz; j++) { 1590 col = pj[j]; 1591 rtmp2[col] -= mul2 * rtmp1[col]; 1592 rtmp3[col] -= mul3 * rtmp1[col]; 1593 } 1594 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1595 } 1596 1597 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1598 rs = 0.0; 1599 /* L part */ 1600 pc2 = b->a + bi[i + 1]; 1601 pj = b->j + bi[i + 1]; 1602 nz = bi[i + 2] - bi[i + 1]; 1603 for (j = 0; j < nz; j++) { 1604 col = pj[j]; 1605 pc2[j] = rtmp2[col]; 1606 rs += PetscAbsScalar(pc2[j]); 1607 } 1608 /* U part */ 1609 pc2 = b->a + bdiag[i + 2] + 1; 1610 pj = b->j + bdiag[i + 2] + 1; 1611 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1612 for (j = 0; j < nz; j++) { 1613 col = pj[j]; 1614 pc2[j] = rtmp2[col]; 1615 rs += PetscAbsScalar(pc2[j]); 1616 } 1617 1618 sctx.rs = rs; 1619 sctx.pv = rtmp2[i + 1]; 1620 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1621 if (sctx.newshift) break; 1622 pc2 = b->a + bdiag[i + 1]; 1623 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1624 1625 /* Now take care of 2nd column of diagonal 3x3 block. */ 1626 pc3 = rtmp3 + i + 1; 1627 if (*pc3 != 0.0) { 1628 mul3 = (*pc3) * (*pc2); 1629 *pc3 = mul3; 1630 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1631 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1632 for (j = 0; j < nz; j++) { 1633 col = pj[j]; 1634 rtmp3[col] -= mul3 * rtmp2[col]; 1635 } 1636 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1637 } 1638 1639 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1640 rs = 0.0; 1641 /* L part */ 1642 pc3 = b->a + bi[i + 2]; 1643 pj = b->j + bi[i + 2]; 1644 nz = bi[i + 3] - bi[i + 2]; 1645 for (j = 0; j < nz; j++) { 1646 col = pj[j]; 1647 pc3[j] = rtmp3[col]; 1648 rs += PetscAbsScalar(pc3[j]); 1649 } 1650 /* U part */ 1651 pc3 = b->a + bdiag[i + 3] + 1; 1652 pj = b->j + bdiag[i + 3] + 1; 1653 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1654 for (j = 0; j < nz; j++) { 1655 col = pj[j]; 1656 pc3[j] = rtmp3[col]; 1657 rs += PetscAbsScalar(pc3[j]); 1658 } 1659 1660 sctx.rs = rs; 1661 sctx.pv = rtmp3[i + 2]; 1662 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1663 if (sctx.newshift) break; 1664 pc3 = b->a + bdiag[i + 2]; 1665 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1666 break; 1667 case 4: 1668 /* zero rtmp */ 1669 /* L part */ 1670 nz = bi[i + 1] - bi[i]; 1671 bjtmp = bj + bi[i]; 1672 for (j = 0; j < nz; j++) { 1673 col = bjtmp[j]; 1674 rtmp1[col] = 0.0; 1675 rtmp2[col] = 0.0; 1676 rtmp3[col] = 0.0; 1677 rtmp4[col] = 0.0; 1678 } 1679 1680 /* U part */ 1681 nz = bdiag[i] - bdiag[i + 1]; 1682 bjtmp = bj + bdiag[i + 1] + 1; 1683 for (j = 0; j < nz; j++) { 1684 col = bjtmp[j]; 1685 rtmp1[col] = 0.0; 1686 rtmp2[col] = 0.0; 1687 rtmp3[col] = 0.0; 1688 rtmp4[col] = 0.0; 1689 } 1690 1691 /* load in initial (unfactored row) */ 1692 nz = ai[r[i] + 1] - ai[r[i]]; 1693 ajtmp = aj + ai[r[i]]; 1694 v1 = aa + ai[r[i]]; 1695 v2 = aa + ai[r[i] + 1]; 1696 v3 = aa + ai[r[i] + 2]; 1697 v4 = aa + ai[r[i] + 3]; 1698 for (j = 0; j < nz; j++) { 1699 col = ics[ajtmp[j]]; 1700 rtmp1[col] = v1[j]; 1701 rtmp2[col] = v2[j]; 1702 rtmp3[col] = v3[j]; 1703 rtmp4[col] = v4[j]; 1704 } 1705 /* ZeropivotApply(): shift the diagonal of the matrix */ 1706 rtmp1[i] += sctx.shift_amount; 1707 rtmp2[i + 1] += sctx.shift_amount; 1708 rtmp3[i + 2] += sctx.shift_amount; 1709 rtmp4[i + 3] += sctx.shift_amount; 1710 1711 /* elimination */ 1712 bjtmp = bj + bi[i]; 1713 row = *bjtmp++; /* pivot row */ 1714 nzL = bi[i + 1] - bi[i]; 1715 for (k = 0; k < nzL; k++) { 1716 pc1 = rtmp1 + row; 1717 pc2 = rtmp2 + row; 1718 pc3 = rtmp3 + row; 1719 pc4 = rtmp4 + row; 1720 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1721 pv = b->a + bdiag[row]; 1722 mul1 = *pc1 * (*pv); 1723 mul2 = *pc2 * (*pv); 1724 mul3 = *pc3 * (*pv); 1725 mul4 = *pc4 * (*pv); 1726 *pc1 = mul1; 1727 *pc2 = mul2; 1728 *pc3 = mul3; 1729 *pc4 = mul4; 1730 1731 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1732 pv = b->a + bdiag[row + 1] + 1; 1733 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1734 for (j = 0; j < nz; j++) { 1735 col = pj[j]; 1736 rtmp1[col] -= mul1 * pv[j]; 1737 rtmp2[col] -= mul2 * pv[j]; 1738 rtmp3[col] -= mul3 * pv[j]; 1739 rtmp4[col] -= mul4 * pv[j]; 1740 } 1741 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1742 } 1743 row = *bjtmp++; 1744 } 1745 1746 /* finished row i; check zero pivot, then stick row i into b->a */ 1747 rs = 0.0; 1748 /* L part */ 1749 pc1 = b->a + bi[i]; 1750 pj = b->j + bi[i]; 1751 nz = bi[i + 1] - bi[i]; 1752 for (j = 0; j < nz; j++) { 1753 col = pj[j]; 1754 pc1[j] = rtmp1[col]; 1755 rs += PetscAbsScalar(pc1[j]); 1756 } 1757 /* U part */ 1758 pc1 = b->a + bdiag[i + 1] + 1; 1759 pj = b->j + bdiag[i + 1] + 1; 1760 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1761 for (j = 0; j < nz; j++) { 1762 col = pj[j]; 1763 pc1[j] = rtmp1[col]; 1764 rs += PetscAbsScalar(pc1[j]); 1765 } 1766 1767 sctx.rs = rs; 1768 sctx.pv = rtmp1[i]; 1769 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1770 if (sctx.newshift) break; 1771 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1772 *pc1 = 1.0 / sctx.pv; 1773 1774 /* Now take care of 1st column of diagonal 4x4 block. */ 1775 pc2 = rtmp2 + i; 1776 pc3 = rtmp3 + i; 1777 pc4 = rtmp4 + i; 1778 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1779 mul2 = (*pc2) * (*pc1); 1780 *pc2 = mul2; 1781 mul3 = (*pc3) * (*pc1); 1782 *pc3 = mul3; 1783 mul4 = (*pc4) * (*pc1); 1784 *pc4 = mul4; 1785 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1786 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1787 for (j = 0; j < nz; j++) { 1788 col = pj[j]; 1789 rtmp2[col] -= mul2 * rtmp1[col]; 1790 rtmp3[col] -= mul3 * rtmp1[col]; 1791 rtmp4[col] -= mul4 * rtmp1[col]; 1792 } 1793 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1794 } 1795 1796 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1797 rs = 0.0; 1798 /* L part */ 1799 pc2 = b->a + bi[i + 1]; 1800 pj = b->j + bi[i + 1]; 1801 nz = bi[i + 2] - bi[i + 1]; 1802 for (j = 0; j < nz; j++) { 1803 col = pj[j]; 1804 pc2[j] = rtmp2[col]; 1805 rs += PetscAbsScalar(pc2[j]); 1806 } 1807 /* U part */ 1808 pc2 = b->a + bdiag[i + 2] + 1; 1809 pj = b->j + bdiag[i + 2] + 1; 1810 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1811 for (j = 0; j < nz; j++) { 1812 col = pj[j]; 1813 pc2[j] = rtmp2[col]; 1814 rs += PetscAbsScalar(pc2[j]); 1815 } 1816 1817 sctx.rs = rs; 1818 sctx.pv = rtmp2[i + 1]; 1819 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1820 if (sctx.newshift) break; 1821 pc2 = b->a + bdiag[i + 1]; 1822 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1823 1824 /* Now take care of 2nd column of diagonal 4x4 block. */ 1825 pc3 = rtmp3 + i + 1; 1826 pc4 = rtmp4 + i + 1; 1827 if (*pc3 != 0.0 || *pc4 != 0.0) { 1828 mul3 = (*pc3) * (*pc2); 1829 *pc3 = mul3; 1830 mul4 = (*pc4) * (*pc2); 1831 *pc4 = mul4; 1832 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1833 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1834 for (j = 0; j < nz; j++) { 1835 col = pj[j]; 1836 rtmp3[col] -= mul3 * rtmp2[col]; 1837 rtmp4[col] -= mul4 * rtmp2[col]; 1838 } 1839 PetscCall(PetscLogFlops(4.0 * nz)); 1840 } 1841 1842 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1843 rs = 0.0; 1844 /* L part */ 1845 pc3 = b->a + bi[i + 2]; 1846 pj = b->j + bi[i + 2]; 1847 nz = bi[i + 3] - bi[i + 2]; 1848 for (j = 0; j < nz; j++) { 1849 col = pj[j]; 1850 pc3[j] = rtmp3[col]; 1851 rs += PetscAbsScalar(pc3[j]); 1852 } 1853 /* U part */ 1854 pc3 = b->a + bdiag[i + 3] + 1; 1855 pj = b->j + bdiag[i + 3] + 1; 1856 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1857 for (j = 0; j < nz; j++) { 1858 col = pj[j]; 1859 pc3[j] = rtmp3[col]; 1860 rs += PetscAbsScalar(pc3[j]); 1861 } 1862 1863 sctx.rs = rs; 1864 sctx.pv = rtmp3[i + 2]; 1865 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1866 if (sctx.newshift) break; 1867 pc3 = b->a + bdiag[i + 2]; 1868 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1869 1870 /* Now take care of 3rd column of diagonal 4x4 block. */ 1871 pc4 = rtmp4 + i + 2; 1872 if (*pc4 != 0.0) { 1873 mul4 = (*pc4) * (*pc3); 1874 *pc4 = mul4; 1875 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1876 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1877 for (j = 0; j < nz; j++) { 1878 col = pj[j]; 1879 rtmp4[col] -= mul4 * rtmp3[col]; 1880 } 1881 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1882 } 1883 1884 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1885 rs = 0.0; 1886 /* L part */ 1887 pc4 = b->a + bi[i + 3]; 1888 pj = b->j + bi[i + 3]; 1889 nz = bi[i + 4] - bi[i + 3]; 1890 for (j = 0; j < nz; j++) { 1891 col = pj[j]; 1892 pc4[j] = rtmp4[col]; 1893 rs += PetscAbsScalar(pc4[j]); 1894 } 1895 /* U part */ 1896 pc4 = b->a + bdiag[i + 4] + 1; 1897 pj = b->j + bdiag[i + 4] + 1; 1898 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1899 for (j = 0; j < nz; j++) { 1900 col = pj[j]; 1901 pc4[j] = rtmp4[col]; 1902 rs += PetscAbsScalar(pc4[j]); 1903 } 1904 1905 sctx.rs = rs; 1906 sctx.pv = rtmp4[i + 3]; 1907 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1908 if (sctx.newshift) break; 1909 pc4 = b->a + bdiag[i + 3]; 1910 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1911 break; 1912 1913 default: 1914 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1915 } 1916 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1917 i += nodesz; /* Update the row */ 1918 } 1919 1920 /* MatPivotRefine() */ 1921 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1922 /* 1923 * if no shift in this attempt & shifting & started shifting & can refine, 1924 * then try lower shift 1925 */ 1926 sctx.shift_hi = sctx.shift_fraction; 1927 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1928 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1929 sctx.newshift = PETSC_TRUE; 1930 sctx.nshift++; 1931 } 1932 } while (sctx.newshift); 1933 1934 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1935 PetscCall(PetscFree(tmp_vec2)); 1936 PetscCall(ISRestoreIndices(isicol, &ic)); 1937 PetscCall(ISRestoreIndices(isrow, &r)); 1938 1939 if (b->inode.size) { 1940 C->ops->solve = MatSolve_SeqAIJ_Inode; 1941 } else { 1942 C->ops->solve = MatSolve_SeqAIJ; 1943 } 1944 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1945 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1946 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1947 C->ops->matsolve = MatMatSolve_SeqAIJ; 1948 C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ; 1949 C->assembled = PETSC_TRUE; 1950 C->preallocated = PETSC_TRUE; 1951 1952 PetscCall(PetscLogFlops(C->cmap->n)); 1953 1954 /* MatShiftView(A,info,&sctx) */ 1955 if (sctx.nshift) { 1956 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1957 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1958 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1959 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1960 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1961 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1962 } 1963 } 1964 PetscFunctionReturn(PETSC_SUCCESS); 1965 } 1966 1967 #if 0 1968 // unused 1969 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1970 { 1971 Mat C = B; 1972 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1973 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1974 const PetscInt *r, *ic, *c, *ics; 1975 PetscInt n = A->rmap->n, *bi = b->i; 1976 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1977 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1978 PetscInt *ai = a->i, *aj = a->j; 1979 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1980 PetscScalar mul1, mul2, mul3, tmp; 1981 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1982 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1983 PetscReal rs = 0.0; 1984 FactorShiftCtx sctx; 1985 1986 PetscFunctionBegin; 1987 sctx.shift_top = 0; 1988 sctx.nshift_max = 0; 1989 sctx.shift_lo = 0; 1990 sctx.shift_hi = 0; 1991 sctx.shift_fraction = 0; 1992 1993 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1994 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1995 sctx.shift_top = 0; 1996 for (i = 0; i < n; i++) { 1997 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1998 rs = 0.0; 1999 ajtmp = aj + ai[i]; 2000 rtmp1 = aa + ai[i]; 2001 nz = ai[i + 1] - ai[i]; 2002 for (j = 0; j < nz; j++) { 2003 if (*ajtmp != i) { 2004 rs += PetscAbsScalar(*rtmp1++); 2005 } else { 2006 rs -= PetscRealPart(*rtmp1++); 2007 } 2008 ajtmp++; 2009 } 2010 if (rs > sctx.shift_top) sctx.shift_top = rs; 2011 } 2012 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2013 sctx.shift_top *= 1.1; 2014 sctx.nshift_max = 5; 2015 sctx.shift_lo = 0.; 2016 sctx.shift_hi = 1.; 2017 } 2018 sctx.shift_amount = 0; 2019 sctx.nshift = 0; 2020 2021 PetscCall(ISGetIndices(isrow, &r)); 2022 PetscCall(ISGetIndices(iscol, &c)); 2023 PetscCall(ISGetIndices(isicol, &ic)); 2024 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2025 ics = ic; 2026 2027 node_max = a->inode.node_count; 2028 ns = a->inode.size; 2029 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2030 2031 /* If max inode size > 3, split it into two inodes.*/ 2032 /* also map the inode sizes according to the ordering */ 2033 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2034 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2035 if (ns[i] > 3) { 2036 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2037 ++j; 2038 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2039 } else { 2040 tmp_vec1[j] = ns[i]; 2041 } 2042 } 2043 /* Use the correct node_max */ 2044 node_max = j; 2045 2046 /* Now reorder the inode info based on mat re-ordering info */ 2047 /* First create a row -> inode_size_array_index map */ 2048 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2049 for (i = 0, row = 0; i < node_max; i++) { 2050 nodesz = tmp_vec1[i]; 2051 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2052 } 2053 /* Using nsmap, create a reordered ns structure */ 2054 for (i = 0, j = 0; i < node_max; i++) { 2055 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2056 tmp_vec2[i] = nodesz; 2057 j += nodesz; 2058 } 2059 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2060 /* Now use the correct ns */ 2061 ns = tmp_vec2; 2062 2063 do { 2064 sctx.newshift = PETSC_FALSE; 2065 /* Now loop over each block-row, and do the factorization */ 2066 for (i = 0, row = 0; i < node_max; i++) { 2067 nodesz = ns[i]; 2068 nz = bi[row + 1] - bi[row]; 2069 bjtmp = bj + bi[row]; 2070 2071 switch (nodesz) { 2072 case 1: 2073 for (j = 0; j < nz; j++) { 2074 idx = bjtmp[j]; 2075 rtmp11[idx] = 0.0; 2076 } 2077 2078 /* load in initial (unfactored row) */ 2079 idx = r[row]; 2080 nz_tmp = ai[idx + 1] - ai[idx]; 2081 ajtmp = aj + ai[idx]; 2082 v1 = aa + ai[idx]; 2083 2084 for (j = 0; j < nz_tmp; j++) { 2085 idx = ics[ajtmp[j]]; 2086 rtmp11[idx] = v1[j]; 2087 } 2088 rtmp11[ics[r[row]]] += sctx.shift_amount; 2089 2090 prow = *bjtmp++; 2091 while (prow < row) { 2092 pc1 = rtmp11 + prow; 2093 if (*pc1 != 0.0) { 2094 pv = ba + bd[prow]; 2095 pj = nbj + bd[prow]; 2096 mul1 = *pc1 * *pv++; 2097 *pc1 = mul1; 2098 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2099 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2100 for (j = 0; j < nz_tmp; j++) { 2101 tmp = pv[j]; 2102 idx = pj[j]; 2103 rtmp11[idx] -= mul1 * tmp; 2104 } 2105 } 2106 prow = *bjtmp++; 2107 } 2108 pj = bj + bi[row]; 2109 pc1 = ba + bi[row]; 2110 2111 sctx.pv = rtmp11[row]; 2112 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2113 rs = 0.0; 2114 for (j = 0; j < nz; j++) { 2115 idx = pj[j]; 2116 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2117 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2118 } 2119 sctx.rs = rs; 2120 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2121 if (sctx.newshift) goto endofwhile; 2122 break; 2123 2124 case 2: 2125 for (j = 0; j < nz; j++) { 2126 idx = bjtmp[j]; 2127 rtmp11[idx] = 0.0; 2128 rtmp22[idx] = 0.0; 2129 } 2130 2131 /* load in initial (unfactored row) */ 2132 idx = r[row]; 2133 nz_tmp = ai[idx + 1] - ai[idx]; 2134 ajtmp = aj + ai[idx]; 2135 v1 = aa + ai[idx]; 2136 v2 = aa + ai[idx + 1]; 2137 for (j = 0; j < nz_tmp; j++) { 2138 idx = ics[ajtmp[j]]; 2139 rtmp11[idx] = v1[j]; 2140 rtmp22[idx] = v2[j]; 2141 } 2142 rtmp11[ics[r[row]]] += sctx.shift_amount; 2143 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2144 2145 prow = *bjtmp++; 2146 while (prow < row) { 2147 pc1 = rtmp11 + prow; 2148 pc2 = rtmp22 + prow; 2149 if (*pc1 != 0.0 || *pc2 != 0.0) { 2150 pv = ba + bd[prow]; 2151 pj = nbj + bd[prow]; 2152 mul1 = *pc1 * *pv; 2153 mul2 = *pc2 * *pv; 2154 ++pv; 2155 *pc1 = mul1; 2156 *pc2 = mul2; 2157 2158 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2159 for (j = 0; j < nz_tmp; j++) { 2160 tmp = pv[j]; 2161 idx = pj[j]; 2162 rtmp11[idx] -= mul1 * tmp; 2163 rtmp22[idx] -= mul2 * tmp; 2164 } 2165 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2166 } 2167 prow = *bjtmp++; 2168 } 2169 2170 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2171 pc1 = rtmp11 + prow; 2172 pc2 = rtmp22 + prow; 2173 2174 sctx.pv = *pc1; 2175 pj = bj + bi[prow]; 2176 rs = 0.0; 2177 for (j = 0; j < nz; j++) { 2178 idx = pj[j]; 2179 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2180 } 2181 sctx.rs = rs; 2182 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2183 if (sctx.newshift) goto endofwhile; 2184 2185 if (*pc2 != 0.0) { 2186 pj = nbj + bd[prow]; 2187 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2188 *pc2 = mul2; 2189 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2190 for (j = 0; j < nz_tmp; j++) { 2191 idx = pj[j]; 2192 tmp = rtmp11[idx]; 2193 rtmp22[idx] -= mul2 * tmp; 2194 } 2195 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2196 } 2197 2198 pj = bj + bi[row]; 2199 pc1 = ba + bi[row]; 2200 pc2 = ba + bi[row + 1]; 2201 2202 sctx.pv = rtmp22[row + 1]; 2203 rs = 0.0; 2204 rtmp11[row] = 1.0 / rtmp11[row]; 2205 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2206 /* copy row entries from dense representation to sparse */ 2207 for (j = 0; j < nz; j++) { 2208 idx = pj[j]; 2209 pc1[j] = rtmp11[idx]; 2210 pc2[j] = rtmp22[idx]; 2211 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2212 } 2213 sctx.rs = rs; 2214 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2215 if (sctx.newshift) goto endofwhile; 2216 break; 2217 2218 case 3: 2219 for (j = 0; j < nz; j++) { 2220 idx = bjtmp[j]; 2221 rtmp11[idx] = 0.0; 2222 rtmp22[idx] = 0.0; 2223 rtmp33[idx] = 0.0; 2224 } 2225 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2226 idx = r[row]; 2227 nz_tmp = ai[idx + 1] - ai[idx]; 2228 ajtmp = aj + ai[idx]; 2229 v1 = aa + ai[idx]; 2230 v2 = aa + ai[idx + 1]; 2231 v3 = aa + ai[idx + 2]; 2232 for (j = 0; j < nz_tmp; j++) { 2233 idx = ics[ajtmp[j]]; 2234 rtmp11[idx] = v1[j]; 2235 rtmp22[idx] = v2[j]; 2236 rtmp33[idx] = v3[j]; 2237 } 2238 rtmp11[ics[r[row]]] += sctx.shift_amount; 2239 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2240 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2241 2242 /* loop over all pivot row blocks above this row block */ 2243 prow = *bjtmp++; 2244 while (prow < row) { 2245 pc1 = rtmp11 + prow; 2246 pc2 = rtmp22 + prow; 2247 pc3 = rtmp33 + prow; 2248 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2249 pv = ba + bd[prow]; 2250 pj = nbj + bd[prow]; 2251 mul1 = *pc1 * *pv; 2252 mul2 = *pc2 * *pv; 2253 mul3 = *pc3 * *pv; 2254 ++pv; 2255 *pc1 = mul1; 2256 *pc2 = mul2; 2257 *pc3 = mul3; 2258 2259 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2260 /* update this row based on pivot row */ 2261 for (j = 0; j < nz_tmp; j++) { 2262 tmp = pv[j]; 2263 idx = pj[j]; 2264 rtmp11[idx] -= mul1 * tmp; 2265 rtmp22[idx] -= mul2 * tmp; 2266 rtmp33[idx] -= mul3 * tmp; 2267 } 2268 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2269 } 2270 prow = *bjtmp++; 2271 } 2272 2273 /* Now take care of diagonal 3x3 block in this set of rows */ 2274 /* note: prow = row here */ 2275 pc1 = rtmp11 + prow; 2276 pc2 = rtmp22 + prow; 2277 pc3 = rtmp33 + prow; 2278 2279 sctx.pv = *pc1; 2280 pj = bj + bi[prow]; 2281 rs = 0.0; 2282 for (j = 0; j < nz; j++) { 2283 idx = pj[j]; 2284 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2285 } 2286 sctx.rs = rs; 2287 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2288 if (sctx.newshift) goto endofwhile; 2289 2290 if (*pc2 != 0.0 || *pc3 != 0.0) { 2291 mul2 = (*pc2) / (*pc1); 2292 mul3 = (*pc3) / (*pc1); 2293 *pc2 = mul2; 2294 *pc3 = mul3; 2295 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2296 pj = nbj + bd[prow]; 2297 for (j = 0; j < nz_tmp; j++) { 2298 idx = pj[j]; 2299 tmp = rtmp11[idx]; 2300 rtmp22[idx] -= mul2 * tmp; 2301 rtmp33[idx] -= mul3 * tmp; 2302 } 2303 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2304 } 2305 ++prow; 2306 2307 pc2 = rtmp22 + prow; 2308 pc3 = rtmp33 + prow; 2309 sctx.pv = *pc2; 2310 pj = bj + bi[prow]; 2311 rs = 0.0; 2312 for (j = 0; j < nz; j++) { 2313 idx = pj[j]; 2314 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2315 } 2316 sctx.rs = rs; 2317 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2318 if (sctx.newshift) goto endofwhile; 2319 2320 if (*pc3 != 0.0) { 2321 mul3 = (*pc3) / (*pc2); 2322 *pc3 = mul3; 2323 pj = nbj + bd[prow]; 2324 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2325 for (j = 0; j < nz_tmp; j++) { 2326 idx = pj[j]; 2327 tmp = rtmp22[idx]; 2328 rtmp33[idx] -= mul3 * tmp; 2329 } 2330 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2331 } 2332 2333 pj = bj + bi[row]; 2334 pc1 = ba + bi[row]; 2335 pc2 = ba + bi[row + 1]; 2336 pc3 = ba + bi[row + 2]; 2337 2338 sctx.pv = rtmp33[row + 2]; 2339 rs = 0.0; 2340 rtmp11[row] = 1.0 / rtmp11[row]; 2341 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2342 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2343 /* copy row entries from dense representation to sparse */ 2344 for (j = 0; j < nz; j++) { 2345 idx = pj[j]; 2346 pc1[j] = rtmp11[idx]; 2347 pc2[j] = rtmp22[idx]; 2348 pc3[j] = rtmp33[idx]; 2349 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2350 } 2351 2352 sctx.rs = rs; 2353 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2354 if (sctx.newshift) goto endofwhile; 2355 break; 2356 2357 default: 2358 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2359 } 2360 row += nodesz; /* Update the row */ 2361 } 2362 endofwhile:; 2363 } while (sctx.newshift); 2364 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2365 PetscCall(PetscFree(tmp_vec2)); 2366 PetscCall(ISRestoreIndices(isicol, &ic)); 2367 PetscCall(ISRestoreIndices(isrow, &r)); 2368 PetscCall(ISRestoreIndices(iscol, &c)); 2369 2370 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2371 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2372 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2373 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2374 C->assembled = PETSC_TRUE; 2375 C->preallocated = PETSC_TRUE; 2376 if (sctx.nshift) { 2377 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2378 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2379 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2380 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2381 } 2382 } 2383 PetscCall(PetscLogFlops(C->cmap->n)); 2384 PetscCall(MatSeqAIJCheckInode(C)); 2385 PetscFunctionReturn(PETSC_SUCCESS); 2386 } 2387 #endif 2388 2389 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2390 { 2391 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2392 IS iscol = a->col, isrow = a->row; 2393 const PetscInt *r, *c, *rout, *cout; 2394 PetscInt i, j, n = A->rmap->n; 2395 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2396 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2397 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2398 PetscScalar sum1, sum2, sum3, sum4, sum5; 2399 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2400 const PetscScalar *b; 2401 2402 PetscFunctionBegin; 2403 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2404 node_max = a->inode.node_count; 2405 ns = a->inode.size; /* Node Size array */ 2406 2407 PetscCall(VecGetArrayRead(bb, &b)); 2408 PetscCall(VecGetArrayWrite(xx, &x)); 2409 tmp = a->solve_work; 2410 2411 PetscCall(ISGetIndices(isrow, &rout)); 2412 r = rout; 2413 PetscCall(ISGetIndices(iscol, &cout)); 2414 c = cout; 2415 2416 /* forward solve the lower triangular */ 2417 tmps = tmp; 2418 aa = a_a; 2419 aj = a_j; 2420 ad = a->diag; 2421 2422 for (i = 0, row = 0; i < node_max; ++i) { 2423 nsz = ns[i]; 2424 aii = ai[row]; 2425 v1 = aa + aii; 2426 vi = aj + aii; 2427 nz = ai[row + 1] - ai[row]; 2428 2429 if (i < node_max - 1) { 2430 /* Prefetch the indices for the next block */ 2431 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2432 /* Prefetch the data for the next block */ 2433 PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2434 } 2435 2436 switch (nsz) { /* Each loop in 'case' is unrolled */ 2437 case 1: 2438 sum1 = b[r[row]]; 2439 for (j = 0; j < nz - 1; j += 2) { 2440 i0 = vi[j]; 2441 i1 = vi[j + 1]; 2442 tmp0 = tmps[i0]; 2443 tmp1 = tmps[i1]; 2444 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2445 } 2446 if (j == nz - 1) { 2447 tmp0 = tmps[vi[j]]; 2448 sum1 -= v1[j] * tmp0; 2449 } 2450 tmp[row++] = sum1; 2451 break; 2452 case 2: 2453 sum1 = b[r[row]]; 2454 sum2 = b[r[row + 1]]; 2455 v2 = aa + ai[row + 1]; 2456 2457 for (j = 0; j < nz - 1; j += 2) { 2458 i0 = vi[j]; 2459 i1 = vi[j + 1]; 2460 tmp0 = tmps[i0]; 2461 tmp1 = tmps[i1]; 2462 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2463 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2464 } 2465 if (j == nz - 1) { 2466 tmp0 = tmps[vi[j]]; 2467 sum1 -= v1[j] * tmp0; 2468 sum2 -= v2[j] * tmp0; 2469 } 2470 sum2 -= v2[nz] * sum1; 2471 tmp[row++] = sum1; 2472 tmp[row++] = sum2; 2473 break; 2474 case 3: 2475 sum1 = b[r[row]]; 2476 sum2 = b[r[row + 1]]; 2477 sum3 = b[r[row + 2]]; 2478 v2 = aa + ai[row + 1]; 2479 v3 = aa + ai[row + 2]; 2480 2481 for (j = 0; j < nz - 1; j += 2) { 2482 i0 = vi[j]; 2483 i1 = vi[j + 1]; 2484 tmp0 = tmps[i0]; 2485 tmp1 = tmps[i1]; 2486 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2487 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2488 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2489 } 2490 if (j == nz - 1) { 2491 tmp0 = tmps[vi[j]]; 2492 sum1 -= v1[j] * tmp0; 2493 sum2 -= v2[j] * tmp0; 2494 sum3 -= v3[j] * tmp0; 2495 } 2496 sum2 -= v2[nz] * sum1; 2497 sum3 -= v3[nz] * sum1; 2498 sum3 -= v3[nz + 1] * sum2; 2499 tmp[row++] = sum1; 2500 tmp[row++] = sum2; 2501 tmp[row++] = sum3; 2502 break; 2503 2504 case 4: 2505 sum1 = b[r[row]]; 2506 sum2 = b[r[row + 1]]; 2507 sum3 = b[r[row + 2]]; 2508 sum4 = b[r[row + 3]]; 2509 v2 = aa + ai[row + 1]; 2510 v3 = aa + ai[row + 2]; 2511 v4 = aa + ai[row + 3]; 2512 2513 for (j = 0; j < nz - 1; j += 2) { 2514 i0 = vi[j]; 2515 i1 = vi[j + 1]; 2516 tmp0 = tmps[i0]; 2517 tmp1 = tmps[i1]; 2518 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2519 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2520 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2521 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2522 } 2523 if (j == nz - 1) { 2524 tmp0 = tmps[vi[j]]; 2525 sum1 -= v1[j] * tmp0; 2526 sum2 -= v2[j] * tmp0; 2527 sum3 -= v3[j] * tmp0; 2528 sum4 -= v4[j] * tmp0; 2529 } 2530 sum2 -= v2[nz] * sum1; 2531 sum3 -= v3[nz] * sum1; 2532 sum4 -= v4[nz] * sum1; 2533 sum3 -= v3[nz + 1] * sum2; 2534 sum4 -= v4[nz + 1] * sum2; 2535 sum4 -= v4[nz + 2] * sum3; 2536 2537 tmp[row++] = sum1; 2538 tmp[row++] = sum2; 2539 tmp[row++] = sum3; 2540 tmp[row++] = sum4; 2541 break; 2542 case 5: 2543 sum1 = b[r[row]]; 2544 sum2 = b[r[row + 1]]; 2545 sum3 = b[r[row + 2]]; 2546 sum4 = b[r[row + 3]]; 2547 sum5 = b[r[row + 4]]; 2548 v2 = aa + ai[row + 1]; 2549 v3 = aa + ai[row + 2]; 2550 v4 = aa + ai[row + 3]; 2551 v5 = aa + ai[row + 4]; 2552 2553 for (j = 0; j < nz - 1; j += 2) { 2554 i0 = vi[j]; 2555 i1 = vi[j + 1]; 2556 tmp0 = tmps[i0]; 2557 tmp1 = tmps[i1]; 2558 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2559 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2560 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2561 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2562 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2563 } 2564 if (j == nz - 1) { 2565 tmp0 = tmps[vi[j]]; 2566 sum1 -= v1[j] * tmp0; 2567 sum2 -= v2[j] * tmp0; 2568 sum3 -= v3[j] * tmp0; 2569 sum4 -= v4[j] * tmp0; 2570 sum5 -= v5[j] * tmp0; 2571 } 2572 2573 sum2 -= v2[nz] * sum1; 2574 sum3 -= v3[nz] * sum1; 2575 sum4 -= v4[nz] * sum1; 2576 sum5 -= v5[nz] * sum1; 2577 sum3 -= v3[nz + 1] * sum2; 2578 sum4 -= v4[nz + 1] * sum2; 2579 sum5 -= v5[nz + 1] * sum2; 2580 sum4 -= v4[nz + 2] * sum3; 2581 sum5 -= v5[nz + 2] * sum3; 2582 sum5 -= v5[nz + 3] * sum4; 2583 2584 tmp[row++] = sum1; 2585 tmp[row++] = sum2; 2586 tmp[row++] = sum3; 2587 tmp[row++] = sum4; 2588 tmp[row++] = sum5; 2589 break; 2590 default: 2591 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2592 } 2593 } 2594 /* backward solve the upper triangular */ 2595 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 2596 nsz = ns[i]; 2597 aii = ad[row + 1] + 1; 2598 v1 = aa + aii; 2599 vi = aj + aii; 2600 nz = ad[row] - ad[row + 1] - 1; 2601 2602 if (i > 0) { 2603 /* Prefetch the indices for the next block */ 2604 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2605 /* Prefetch the data for the next block */ 2606 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2607 } 2608 2609 switch (nsz) { /* Each loop in 'case' is unrolled */ 2610 case 1: 2611 sum1 = tmp[row]; 2612 2613 for (j = 0; j < nz - 1; j += 2) { 2614 i0 = vi[j]; 2615 i1 = vi[j + 1]; 2616 tmp0 = tmps[i0]; 2617 tmp1 = tmps[i1]; 2618 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2619 } 2620 if (j == nz - 1) { 2621 tmp0 = tmps[vi[j]]; 2622 sum1 -= v1[j] * tmp0; 2623 } 2624 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2625 row--; 2626 break; 2627 case 2: 2628 sum1 = tmp[row]; 2629 sum2 = tmp[row - 1]; 2630 v2 = aa + ad[row] + 1; 2631 for (j = 0; j < nz - 1; j += 2) { 2632 i0 = vi[j]; 2633 i1 = vi[j + 1]; 2634 tmp0 = tmps[i0]; 2635 tmp1 = tmps[i1]; 2636 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2637 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2638 } 2639 if (j == nz - 1) { 2640 tmp0 = tmps[vi[j]]; 2641 sum1 -= v1[j] * tmp0; 2642 sum2 -= v2[j + 1] * tmp0; 2643 } 2644 2645 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2646 row--; 2647 sum2 -= v2[0] * tmp0; 2648 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2649 row--; 2650 break; 2651 case 3: 2652 sum1 = tmp[row]; 2653 sum2 = tmp[row - 1]; 2654 sum3 = tmp[row - 2]; 2655 v2 = aa + ad[row] + 1; 2656 v3 = aa + ad[row - 1] + 1; 2657 for (j = 0; j < nz - 1; j += 2) { 2658 i0 = vi[j]; 2659 i1 = vi[j + 1]; 2660 tmp0 = tmps[i0]; 2661 tmp1 = tmps[i1]; 2662 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2663 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2664 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2665 } 2666 if (j == nz - 1) { 2667 tmp0 = tmps[vi[j]]; 2668 sum1 -= v1[j] * tmp0; 2669 sum2 -= v2[j + 1] * tmp0; 2670 sum3 -= v3[j + 2] * tmp0; 2671 } 2672 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2673 row--; 2674 sum2 -= v2[0] * tmp0; 2675 sum3 -= v3[1] * tmp0; 2676 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2677 row--; 2678 sum3 -= v3[0] * tmp0; 2679 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2680 row--; 2681 2682 break; 2683 case 4: 2684 sum1 = tmp[row]; 2685 sum2 = tmp[row - 1]; 2686 sum3 = tmp[row - 2]; 2687 sum4 = tmp[row - 3]; 2688 v2 = aa + ad[row] + 1; 2689 v3 = aa + ad[row - 1] + 1; 2690 v4 = aa + ad[row - 2] + 1; 2691 2692 for (j = 0; j < nz - 1; j += 2) { 2693 i0 = vi[j]; 2694 i1 = vi[j + 1]; 2695 tmp0 = tmps[i0]; 2696 tmp1 = tmps[i1]; 2697 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2698 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2699 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2700 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2701 } 2702 if (j == nz - 1) { 2703 tmp0 = tmps[vi[j]]; 2704 sum1 -= v1[j] * tmp0; 2705 sum2 -= v2[j + 1] * tmp0; 2706 sum3 -= v3[j + 2] * tmp0; 2707 sum4 -= v4[j + 3] * tmp0; 2708 } 2709 2710 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2711 row--; 2712 sum2 -= v2[0] * tmp0; 2713 sum3 -= v3[1] * tmp0; 2714 sum4 -= v4[2] * tmp0; 2715 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2716 row--; 2717 sum3 -= v3[0] * tmp0; 2718 sum4 -= v4[1] * tmp0; 2719 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2720 row--; 2721 sum4 -= v4[0] * tmp0; 2722 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2723 row--; 2724 break; 2725 case 5: 2726 sum1 = tmp[row]; 2727 sum2 = tmp[row - 1]; 2728 sum3 = tmp[row - 2]; 2729 sum4 = tmp[row - 3]; 2730 sum5 = tmp[row - 4]; 2731 v2 = aa + ad[row] + 1; 2732 v3 = aa + ad[row - 1] + 1; 2733 v4 = aa + ad[row - 2] + 1; 2734 v5 = aa + ad[row - 3] + 1; 2735 for (j = 0; j < nz - 1; j += 2) { 2736 i0 = vi[j]; 2737 i1 = vi[j + 1]; 2738 tmp0 = tmps[i0]; 2739 tmp1 = tmps[i1]; 2740 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2741 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2742 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2743 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2744 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2745 } 2746 if (j == nz - 1) { 2747 tmp0 = tmps[vi[j]]; 2748 sum1 -= v1[j] * tmp0; 2749 sum2 -= v2[j + 1] * tmp0; 2750 sum3 -= v3[j + 2] * tmp0; 2751 sum4 -= v4[j + 3] * tmp0; 2752 sum5 -= v5[j + 4] * tmp0; 2753 } 2754 2755 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2756 row--; 2757 sum2 -= v2[0] * tmp0; 2758 sum3 -= v3[1] * tmp0; 2759 sum4 -= v4[2] * tmp0; 2760 sum5 -= v5[3] * tmp0; 2761 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2762 row--; 2763 sum3 -= v3[0] * tmp0; 2764 sum4 -= v4[1] * tmp0; 2765 sum5 -= v5[2] * tmp0; 2766 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2767 row--; 2768 sum4 -= v4[0] * tmp0; 2769 sum5 -= v5[1] * tmp0; 2770 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2771 row--; 2772 sum5 -= v5[0] * tmp0; 2773 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2774 row--; 2775 break; 2776 default: 2777 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2778 } 2779 } 2780 PetscCall(ISRestoreIndices(isrow, &rout)); 2781 PetscCall(ISRestoreIndices(iscol, &cout)); 2782 PetscCall(VecRestoreArrayRead(bb, &b)); 2783 PetscCall(VecRestoreArrayWrite(xx, &x)); 2784 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2785 PetscFunctionReturn(PETSC_SUCCESS); 2786 } 2787 2788 /* 2789 Makes a longer coloring[] array and calls the usual code with that 2790 */ 2791 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2792 { 2793 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2794 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row; 2795 PetscInt *colorused, i; 2796 ISColoringValue *newcolor; 2797 2798 PetscFunctionBegin; 2799 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2800 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2801 /* loop over inodes, marking a color for each column*/ 2802 row = 0; 2803 for (i = 0; i < m; i++) { 2804 for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors; 2805 } 2806 2807 /* eliminate unneeded colors */ 2808 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2809 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2810 2811 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2812 ncolors = colorused[5 * ncolors - 1]; 2813 for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1; 2814 PetscCall(PetscFree(colorused)); 2815 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2816 PetscCall(PetscFree(coloring)); 2817 PetscFunctionReturn(PETSC_SUCCESS); 2818 } 2819 2820 #include <petsc/private/kernels/blockinvert.h> 2821 2822 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2823 { 2824 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2825 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2826 MatScalar *ibdiag, *bdiag, work[25], *t; 2827 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2828 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2829 const PetscScalar *xb, *b; 2830 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2831 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2; 2832 PetscInt sz, k, ipvt[5]; 2833 PetscBool allowzeropivot, zeropivotdetected; 2834 const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i; 2835 2836 PetscFunctionBegin; 2837 allowzeropivot = PetscNot(A->erroriffailure); 2838 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2839 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2840 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2841 2842 if (!a->inode.ibdiagvalid) { 2843 if (!a->inode.ibdiag) { 2844 /* calculate space needed for diagonal blocks */ 2845 for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i]; 2846 a->inode.bdiagsize = cnt; 2847 2848 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2849 } 2850 2851 /* copy over the diagonal blocks and invert them */ 2852 ibdiag = a->inode.ibdiag; 2853 bdiag = a->inode.bdiag; 2854 cnt = 0; 2855 for (i = 0, row = 0; i < m; i++) { 2856 for (j = 0; j < sizes[i]; j++) { 2857 for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k]; 2858 } 2859 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i])); 2860 2861 switch (sizes[i]) { 2862 case 1: 2863 /* Create matrix data structure */ 2864 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2865 if (allowzeropivot) { 2866 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2867 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2868 A->factorerror_zeropivot_row = row; 2869 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2870 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2871 } 2872 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2873 break; 2874 case 2: 2875 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2876 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2877 break; 2878 case 3: 2879 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2880 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2881 break; 2882 case 4: 2883 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2884 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2885 break; 2886 case 5: 2887 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2888 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2889 break; 2890 default: 2891 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 2892 } 2893 cnt += sizes[i] * sizes[i]; 2894 row += sizes[i]; 2895 } 2896 a->inode.ibdiagvalid = PETSC_TRUE; 2897 } 2898 ibdiag = a->inode.ibdiag; 2899 bdiag = a->inode.bdiag; 2900 t = a->inode.ssor_work; 2901 2902 PetscCall(VecGetArray(xx, &x)); 2903 PetscCall(VecGetArrayRead(bb, &b)); 2904 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2905 if (flag & SOR_ZERO_INITIAL_GUESS) { 2906 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2907 for (i = 0, row = 0; i < m; i++) { 2908 sz = diag[row] - ii[row]; 2909 v1 = a->a + ii[row]; 2910 idx = a->j + ii[row]; 2911 2912 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2913 switch (sizes[i]) { 2914 case 1: 2915 2916 sum1 = b[row]; 2917 for (n = 0; n < sz - 1; n += 2) { 2918 i1 = idx[0]; 2919 i2 = idx[1]; 2920 idx += 2; 2921 tmp0 = x[i1]; 2922 tmp1 = x[i2]; 2923 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2924 v1 += 2; 2925 } 2926 2927 if (n == sz - 1) { 2928 tmp0 = x[*idx]; 2929 sum1 -= *v1 * tmp0; 2930 } 2931 t[row] = sum1; 2932 x[row++] = sum1 * (*ibdiag++); 2933 break; 2934 case 2: 2935 v2 = a->a + ii[row + 1]; 2936 sum1 = b[row]; 2937 sum2 = b[row + 1]; 2938 for (n = 0; n < sz - 1; n += 2) { 2939 i1 = idx[0]; 2940 i2 = idx[1]; 2941 idx += 2; 2942 tmp0 = x[i1]; 2943 tmp1 = x[i2]; 2944 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2945 v1 += 2; 2946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2947 v2 += 2; 2948 } 2949 2950 if (n == sz - 1) { 2951 tmp0 = x[*idx]; 2952 sum1 -= v1[0] * tmp0; 2953 sum2 -= v2[0] * tmp0; 2954 } 2955 t[row] = sum1; 2956 t[row + 1] = sum2; 2957 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2958 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2959 ibdiag += 4; 2960 break; 2961 case 3: 2962 v2 = a->a + ii[row + 1]; 2963 v3 = a->a + ii[row + 2]; 2964 sum1 = b[row]; 2965 sum2 = b[row + 1]; 2966 sum3 = b[row + 2]; 2967 for (n = 0; n < sz - 1; n += 2) { 2968 i1 = idx[0]; 2969 i2 = idx[1]; 2970 idx += 2; 2971 tmp0 = x[i1]; 2972 tmp1 = x[i2]; 2973 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2974 v1 += 2; 2975 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2976 v2 += 2; 2977 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2978 v3 += 2; 2979 } 2980 2981 if (n == sz - 1) { 2982 tmp0 = x[*idx]; 2983 sum1 -= v1[0] * tmp0; 2984 sum2 -= v2[0] * tmp0; 2985 sum3 -= v3[0] * tmp0; 2986 } 2987 t[row] = sum1; 2988 t[row + 1] = sum2; 2989 t[row + 2] = sum3; 2990 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 2991 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 2992 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 2993 ibdiag += 9; 2994 break; 2995 case 4: 2996 v2 = a->a + ii[row + 1]; 2997 v3 = a->a + ii[row + 2]; 2998 v4 = a->a + ii[row + 3]; 2999 sum1 = b[row]; 3000 sum2 = b[row + 1]; 3001 sum3 = b[row + 2]; 3002 sum4 = b[row + 3]; 3003 for (n = 0; n < sz - 1; n += 2) { 3004 i1 = idx[0]; 3005 i2 = idx[1]; 3006 idx += 2; 3007 tmp0 = x[i1]; 3008 tmp1 = x[i2]; 3009 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3010 v1 += 2; 3011 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3012 v2 += 2; 3013 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3014 v3 += 2; 3015 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3016 v4 += 2; 3017 } 3018 3019 if (n == sz - 1) { 3020 tmp0 = x[*idx]; 3021 sum1 -= v1[0] * tmp0; 3022 sum2 -= v2[0] * tmp0; 3023 sum3 -= v3[0] * tmp0; 3024 sum4 -= v4[0] * tmp0; 3025 } 3026 t[row] = sum1; 3027 t[row + 1] = sum2; 3028 t[row + 2] = sum3; 3029 t[row + 3] = sum4; 3030 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3031 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3032 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3033 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3034 ibdiag += 16; 3035 break; 3036 case 5: 3037 v2 = a->a + ii[row + 1]; 3038 v3 = a->a + ii[row + 2]; 3039 v4 = a->a + ii[row + 3]; 3040 v5 = a->a + ii[row + 4]; 3041 sum1 = b[row]; 3042 sum2 = b[row + 1]; 3043 sum3 = b[row + 2]; 3044 sum4 = b[row + 3]; 3045 sum5 = b[row + 4]; 3046 for (n = 0; n < sz - 1; n += 2) { 3047 i1 = idx[0]; 3048 i2 = idx[1]; 3049 idx += 2; 3050 tmp0 = x[i1]; 3051 tmp1 = x[i2]; 3052 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3053 v1 += 2; 3054 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3055 v2 += 2; 3056 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3057 v3 += 2; 3058 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3059 v4 += 2; 3060 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3061 v5 += 2; 3062 } 3063 3064 if (n == sz - 1) { 3065 tmp0 = x[*idx]; 3066 sum1 -= v1[0] * tmp0; 3067 sum2 -= v2[0] * tmp0; 3068 sum3 -= v3[0] * tmp0; 3069 sum4 -= v4[0] * tmp0; 3070 sum5 -= v5[0] * tmp0; 3071 } 3072 t[row] = sum1; 3073 t[row + 1] = sum2; 3074 t[row + 2] = sum3; 3075 t[row + 3] = sum4; 3076 t[row + 4] = sum5; 3077 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3078 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3079 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3080 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3081 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3082 ibdiag += 25; 3083 break; 3084 default: 3085 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3086 } 3087 } 3088 3089 xb = t; 3090 PetscCall(PetscLogFlops(a->nz)); 3091 } else xb = b; 3092 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3093 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3094 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3095 ibdiag -= sizes[i] * sizes[i]; 3096 sz = ii[row + 1] - diag[row] - 1; 3097 v1 = a->a + diag[row] + 1; 3098 idx = a->j + diag[row] + 1; 3099 3100 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3101 switch (sizes[i]) { 3102 case 1: 3103 3104 sum1 = xb[row]; 3105 for (n = 0; n < sz - 1; n += 2) { 3106 i1 = idx[0]; 3107 i2 = idx[1]; 3108 idx += 2; 3109 tmp0 = x[i1]; 3110 tmp1 = x[i2]; 3111 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3112 v1 += 2; 3113 } 3114 3115 if (n == sz - 1) { 3116 tmp0 = x[*idx]; 3117 sum1 -= *v1 * tmp0; 3118 } 3119 x[row--] = sum1 * (*ibdiag); 3120 break; 3121 3122 case 2: 3123 3124 sum1 = xb[row]; 3125 sum2 = xb[row - 1]; 3126 /* note that sum1 is associated with the second of the two rows */ 3127 v2 = a->a + diag[row - 1] + 2; 3128 for (n = 0; n < sz - 1; n += 2) { 3129 i1 = idx[0]; 3130 i2 = idx[1]; 3131 idx += 2; 3132 tmp0 = x[i1]; 3133 tmp1 = x[i2]; 3134 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3135 v1 += 2; 3136 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3137 v2 += 2; 3138 } 3139 3140 if (n == sz - 1) { 3141 tmp0 = x[*idx]; 3142 sum1 -= *v1 * tmp0; 3143 sum2 -= *v2 * tmp0; 3144 } 3145 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3146 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3147 break; 3148 case 3: 3149 3150 sum1 = xb[row]; 3151 sum2 = xb[row - 1]; 3152 sum3 = xb[row - 2]; 3153 v2 = a->a + diag[row - 1] + 2; 3154 v3 = a->a + diag[row - 2] + 3; 3155 for (n = 0; n < sz - 1; n += 2) { 3156 i1 = idx[0]; 3157 i2 = idx[1]; 3158 idx += 2; 3159 tmp0 = x[i1]; 3160 tmp1 = x[i2]; 3161 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3162 v1 += 2; 3163 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3164 v2 += 2; 3165 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3166 v3 += 2; 3167 } 3168 3169 if (n == sz - 1) { 3170 tmp0 = x[*idx]; 3171 sum1 -= *v1 * tmp0; 3172 sum2 -= *v2 * tmp0; 3173 sum3 -= *v3 * tmp0; 3174 } 3175 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3176 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3177 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3178 break; 3179 case 4: 3180 3181 sum1 = xb[row]; 3182 sum2 = xb[row - 1]; 3183 sum3 = xb[row - 2]; 3184 sum4 = xb[row - 3]; 3185 v2 = a->a + diag[row - 1] + 2; 3186 v3 = a->a + diag[row - 2] + 3; 3187 v4 = a->a + diag[row - 3] + 4; 3188 for (n = 0; n < sz - 1; n += 2) { 3189 i1 = idx[0]; 3190 i2 = idx[1]; 3191 idx += 2; 3192 tmp0 = x[i1]; 3193 tmp1 = x[i2]; 3194 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3195 v1 += 2; 3196 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3197 v2 += 2; 3198 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3199 v3 += 2; 3200 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3201 v4 += 2; 3202 } 3203 3204 if (n == sz - 1) { 3205 tmp0 = x[*idx]; 3206 sum1 -= *v1 * tmp0; 3207 sum2 -= *v2 * tmp0; 3208 sum3 -= *v3 * tmp0; 3209 sum4 -= *v4 * tmp0; 3210 } 3211 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3212 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3213 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3214 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3215 break; 3216 case 5: 3217 3218 sum1 = xb[row]; 3219 sum2 = xb[row - 1]; 3220 sum3 = xb[row - 2]; 3221 sum4 = xb[row - 3]; 3222 sum5 = xb[row - 4]; 3223 v2 = a->a + diag[row - 1] + 2; 3224 v3 = a->a + diag[row - 2] + 3; 3225 v4 = a->a + diag[row - 3] + 4; 3226 v5 = a->a + diag[row - 4] + 5; 3227 for (n = 0; n < sz - 1; n += 2) { 3228 i1 = idx[0]; 3229 i2 = idx[1]; 3230 idx += 2; 3231 tmp0 = x[i1]; 3232 tmp1 = x[i2]; 3233 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3234 v1 += 2; 3235 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3236 v2 += 2; 3237 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3238 v3 += 2; 3239 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3240 v4 += 2; 3241 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3242 v5 += 2; 3243 } 3244 3245 if (n == sz - 1) { 3246 tmp0 = x[*idx]; 3247 sum1 -= *v1 * tmp0; 3248 sum2 -= *v2 * tmp0; 3249 sum3 -= *v3 * tmp0; 3250 sum4 -= *v4 * tmp0; 3251 sum5 -= *v5 * tmp0; 3252 } 3253 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3254 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3255 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3256 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3257 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3258 break; 3259 default: 3260 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3261 } 3262 } 3263 3264 PetscCall(PetscLogFlops(a->nz)); 3265 } 3266 its--; 3267 } 3268 while (its--) { 3269 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3270 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) { 3271 sz = diag[row] - ii[row]; 3272 v1 = a->a + ii[row]; 3273 idx = a->j + ii[row]; 3274 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3275 switch (sizes[i]) { 3276 case 1: 3277 sum1 = b[row]; 3278 for (n = 0; n < sz - 1; n += 2) { 3279 i1 = idx[0]; 3280 i2 = idx[1]; 3281 idx += 2; 3282 tmp0 = x[i1]; 3283 tmp1 = x[i2]; 3284 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3285 v1 += 2; 3286 } 3287 if (n == sz - 1) { 3288 tmp0 = x[*idx++]; 3289 sum1 -= *v1 * tmp0; 3290 v1++; 3291 } 3292 t[row] = sum1; 3293 sz = ii[row + 1] - diag[row] - 1; 3294 idx = a->j + diag[row] + 1; 3295 v1 += 1; 3296 for (n = 0; n < sz - 1; n += 2) { 3297 i1 = idx[0]; 3298 i2 = idx[1]; 3299 idx += 2; 3300 tmp0 = x[i1]; 3301 tmp1 = x[i2]; 3302 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3303 v1 += 2; 3304 } 3305 if (n == sz - 1) { 3306 tmp0 = x[*idx++]; 3307 sum1 -= *v1 * tmp0; 3308 } 3309 /* in MatSOR_SeqAIJ this line would be 3310 * 3311 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3312 * 3313 * but omega == 1, so this becomes 3314 * 3315 * x[row] = sum1*(*ibdiag++); 3316 * 3317 */ 3318 x[row] = sum1 * (*ibdiag); 3319 break; 3320 case 2: 3321 v2 = a->a + ii[row + 1]; 3322 sum1 = b[row]; 3323 sum2 = b[row + 1]; 3324 for (n = 0; n < sz - 1; n += 2) { 3325 i1 = idx[0]; 3326 i2 = idx[1]; 3327 idx += 2; 3328 tmp0 = x[i1]; 3329 tmp1 = x[i2]; 3330 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3331 v1 += 2; 3332 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3333 v2 += 2; 3334 } 3335 if (n == sz - 1) { 3336 tmp0 = x[*idx++]; 3337 sum1 -= v1[0] * tmp0; 3338 sum2 -= v2[0] * tmp0; 3339 v1++; 3340 v2++; 3341 } 3342 t[row] = sum1; 3343 t[row + 1] = sum2; 3344 sz = ii[row + 1] - diag[row] - 2; 3345 idx = a->j + diag[row] + 2; 3346 v1 += 2; 3347 v2 += 2; 3348 for (n = 0; n < sz - 1; n += 2) { 3349 i1 = idx[0]; 3350 i2 = idx[1]; 3351 idx += 2; 3352 tmp0 = x[i1]; 3353 tmp1 = x[i2]; 3354 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3355 v1 += 2; 3356 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3357 v2 += 2; 3358 } 3359 if (n == sz - 1) { 3360 tmp0 = x[*idx]; 3361 sum1 -= v1[0] * tmp0; 3362 sum2 -= v2[0] * tmp0; 3363 } 3364 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3365 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3366 break; 3367 case 3: 3368 v2 = a->a + ii[row + 1]; 3369 v3 = a->a + ii[row + 2]; 3370 sum1 = b[row]; 3371 sum2 = b[row + 1]; 3372 sum3 = b[row + 2]; 3373 for (n = 0; n < sz - 1; n += 2) { 3374 i1 = idx[0]; 3375 i2 = idx[1]; 3376 idx += 2; 3377 tmp0 = x[i1]; 3378 tmp1 = x[i2]; 3379 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3380 v1 += 2; 3381 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3382 v2 += 2; 3383 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3384 v3 += 2; 3385 } 3386 if (n == sz - 1) { 3387 tmp0 = x[*idx++]; 3388 sum1 -= v1[0] * tmp0; 3389 sum2 -= v2[0] * tmp0; 3390 sum3 -= v3[0] * tmp0; 3391 v1++; 3392 v2++; 3393 v3++; 3394 } 3395 t[row] = sum1; 3396 t[row + 1] = sum2; 3397 t[row + 2] = sum3; 3398 sz = ii[row + 1] - diag[row] - 3; 3399 idx = a->j + diag[row] + 3; 3400 v1 += 3; 3401 v2 += 3; 3402 v3 += 3; 3403 for (n = 0; n < sz - 1; n += 2) { 3404 i1 = idx[0]; 3405 i2 = idx[1]; 3406 idx += 2; 3407 tmp0 = x[i1]; 3408 tmp1 = x[i2]; 3409 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3410 v1 += 2; 3411 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3412 v2 += 2; 3413 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3414 v3 += 2; 3415 } 3416 if (n == sz - 1) { 3417 tmp0 = x[*idx]; 3418 sum1 -= v1[0] * tmp0; 3419 sum2 -= v2[0] * tmp0; 3420 sum3 -= v3[0] * tmp0; 3421 } 3422 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3423 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3424 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3425 break; 3426 case 4: 3427 v2 = a->a + ii[row + 1]; 3428 v3 = a->a + ii[row + 2]; 3429 v4 = a->a + ii[row + 3]; 3430 sum1 = b[row]; 3431 sum2 = b[row + 1]; 3432 sum3 = b[row + 2]; 3433 sum4 = b[row + 3]; 3434 for (n = 0; n < sz - 1; n += 2) { 3435 i1 = idx[0]; 3436 i2 = idx[1]; 3437 idx += 2; 3438 tmp0 = x[i1]; 3439 tmp1 = x[i2]; 3440 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3441 v1 += 2; 3442 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3443 v2 += 2; 3444 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3445 v3 += 2; 3446 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3447 v4 += 2; 3448 } 3449 if (n == sz - 1) { 3450 tmp0 = x[*idx++]; 3451 sum1 -= v1[0] * tmp0; 3452 sum2 -= v2[0] * tmp0; 3453 sum3 -= v3[0] * tmp0; 3454 sum4 -= v4[0] * tmp0; 3455 v1++; 3456 v2++; 3457 v3++; 3458 v4++; 3459 } 3460 t[row] = sum1; 3461 t[row + 1] = sum2; 3462 t[row + 2] = sum3; 3463 t[row + 3] = sum4; 3464 sz = ii[row + 1] - diag[row] - 4; 3465 idx = a->j + diag[row] + 4; 3466 v1 += 4; 3467 v2 += 4; 3468 v3 += 4; 3469 v4 += 4; 3470 for (n = 0; n < sz - 1; n += 2) { 3471 i1 = idx[0]; 3472 i2 = idx[1]; 3473 idx += 2; 3474 tmp0 = x[i1]; 3475 tmp1 = x[i2]; 3476 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3477 v1 += 2; 3478 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3479 v2 += 2; 3480 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3481 v3 += 2; 3482 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3483 v4 += 2; 3484 } 3485 if (n == sz - 1) { 3486 tmp0 = x[*idx]; 3487 sum1 -= v1[0] * tmp0; 3488 sum2 -= v2[0] * tmp0; 3489 sum3 -= v3[0] * tmp0; 3490 sum4 -= v4[0] * tmp0; 3491 } 3492 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3493 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3494 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3495 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3496 break; 3497 case 5: 3498 v2 = a->a + ii[row + 1]; 3499 v3 = a->a + ii[row + 2]; 3500 v4 = a->a + ii[row + 3]; 3501 v5 = a->a + ii[row + 4]; 3502 sum1 = b[row]; 3503 sum2 = b[row + 1]; 3504 sum3 = b[row + 2]; 3505 sum4 = b[row + 3]; 3506 sum5 = b[row + 4]; 3507 for (n = 0; n < sz - 1; n += 2) { 3508 i1 = idx[0]; 3509 i2 = idx[1]; 3510 idx += 2; 3511 tmp0 = x[i1]; 3512 tmp1 = x[i2]; 3513 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3514 v1 += 2; 3515 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3516 v2 += 2; 3517 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3518 v3 += 2; 3519 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3520 v4 += 2; 3521 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3522 v5 += 2; 3523 } 3524 if (n == sz - 1) { 3525 tmp0 = x[*idx++]; 3526 sum1 -= v1[0] * tmp0; 3527 sum2 -= v2[0] * tmp0; 3528 sum3 -= v3[0] * tmp0; 3529 sum4 -= v4[0] * tmp0; 3530 sum5 -= v5[0] * tmp0; 3531 v1++; 3532 v2++; 3533 v3++; 3534 v4++; 3535 v5++; 3536 } 3537 t[row] = sum1; 3538 t[row + 1] = sum2; 3539 t[row + 2] = sum3; 3540 t[row + 3] = sum4; 3541 t[row + 4] = sum5; 3542 sz = ii[row + 1] - diag[row] - 5; 3543 idx = a->j + diag[row] + 5; 3544 v1 += 5; 3545 v2 += 5; 3546 v3 += 5; 3547 v4 += 5; 3548 v5 += 5; 3549 for (n = 0; n < sz - 1; n += 2) { 3550 i1 = idx[0]; 3551 i2 = idx[1]; 3552 idx += 2; 3553 tmp0 = x[i1]; 3554 tmp1 = x[i2]; 3555 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3556 v1 += 2; 3557 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3558 v2 += 2; 3559 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3560 v3 += 2; 3561 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3562 v4 += 2; 3563 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3564 v5 += 2; 3565 } 3566 if (n == sz - 1) { 3567 tmp0 = x[*idx]; 3568 sum1 -= v1[0] * tmp0; 3569 sum2 -= v2[0] * tmp0; 3570 sum3 -= v3[0] * tmp0; 3571 sum4 -= v4[0] * tmp0; 3572 sum5 -= v5[0] * tmp0; 3573 } 3574 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3575 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3576 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3577 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3578 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3579 break; 3580 default: 3581 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3582 } 3583 } 3584 xb = t; 3585 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3586 } else xb = b; 3587 3588 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3589 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3590 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3591 ibdiag -= sizes[i] * sizes[i]; 3592 3593 /* set RHS */ 3594 if (xb == b) { 3595 /* whole (old way) */ 3596 sz = ii[row + 1] - ii[row]; 3597 idx = a->j + ii[row]; 3598 switch (sizes[i]) { 3599 case 5: 3600 v5 = a->a + ii[row - 4]; /* fall through */ 3601 case 4: 3602 v4 = a->a + ii[row - 3]; /* fall through */ 3603 case 3: 3604 v3 = a->a + ii[row - 2]; /* fall through */ 3605 case 2: 3606 v2 = a->a + ii[row - 1]; /* fall through */ 3607 case 1: 3608 v1 = a->a + ii[row]; 3609 break; 3610 default: 3611 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3612 } 3613 } else { 3614 /* upper, no diag */ 3615 sz = ii[row + 1] - diag[row] - 1; 3616 idx = a->j + diag[row] + 1; 3617 switch (sizes[i]) { 3618 case 5: 3619 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3620 case 4: 3621 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3622 case 3: 3623 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3624 case 2: 3625 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3626 case 1: 3627 v1 = a->a + diag[row] + 1; 3628 } 3629 } 3630 /* set sum */ 3631 switch (sizes[i]) { 3632 case 5: 3633 sum5 = xb[row - 4]; /* fall through */ 3634 case 4: 3635 sum4 = xb[row - 3]; /* fall through */ 3636 case 3: 3637 sum3 = xb[row - 2]; /* fall through */ 3638 case 2: 3639 sum2 = xb[row - 1]; /* fall through */ 3640 case 1: 3641 /* note that sum1 is associated with the last row */ 3642 sum1 = xb[row]; 3643 } 3644 /* do sums */ 3645 for (n = 0; n < sz - 1; n += 2) { 3646 i1 = idx[0]; 3647 i2 = idx[1]; 3648 idx += 2; 3649 tmp0 = x[i1]; 3650 tmp1 = x[i2]; 3651 switch (sizes[i]) { 3652 case 5: 3653 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3654 v5 += 2; /* fall through */ 3655 case 4: 3656 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3657 v4 += 2; /* fall through */ 3658 case 3: 3659 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3660 v3 += 2; /* fall through */ 3661 case 2: 3662 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3663 v2 += 2; /* fall through */ 3664 case 1: 3665 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3666 v1 += 2; 3667 } 3668 } 3669 /* ragged edge */ 3670 if (n == sz - 1) { 3671 tmp0 = x[*idx]; 3672 switch (sizes[i]) { 3673 case 5: 3674 sum5 -= *v5 * tmp0; /* fall through */ 3675 case 4: 3676 sum4 -= *v4 * tmp0; /* fall through */ 3677 case 3: 3678 sum3 -= *v3 * tmp0; /* fall through */ 3679 case 2: 3680 sum2 -= *v2 * tmp0; /* fall through */ 3681 case 1: 3682 sum1 -= *v1 * tmp0; 3683 } 3684 } 3685 /* update */ 3686 if (xb == b) { 3687 /* whole (old way) w/ diag */ 3688 switch (sizes[i]) { 3689 case 5: 3690 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3691 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3692 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3693 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3694 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3695 break; 3696 case 4: 3697 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3698 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3699 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3700 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3701 break; 3702 case 3: 3703 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3704 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3705 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3706 break; 3707 case 2: 3708 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3709 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3710 break; 3711 case 1: 3712 x[row--] += sum1 * (*ibdiag); 3713 break; 3714 } 3715 } else { 3716 /* no diag so set = */ 3717 switch (sizes[i]) { 3718 case 5: 3719 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3720 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3721 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3722 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3723 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3724 break; 3725 case 4: 3726 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3727 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3728 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3729 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3730 break; 3731 case 3: 3732 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3733 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3734 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3735 break; 3736 case 2: 3737 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3738 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3739 break; 3740 case 1: 3741 x[row--] = sum1 * (*ibdiag); 3742 break; 3743 } 3744 } 3745 } 3746 if (xb == b) { 3747 PetscCall(PetscLogFlops(2.0 * a->nz)); 3748 } else { 3749 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3750 } 3751 } 3752 } 3753 if (flag & SOR_EISENSTAT) { 3754 /* 3755 Apply (U + D)^-1 where D is now the block diagonal 3756 */ 3757 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3758 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3759 ibdiag -= sizes[i] * sizes[i]; 3760 sz = ii[row + 1] - diag[row] - 1; 3761 v1 = a->a + diag[row] + 1; 3762 idx = a->j + diag[row] + 1; 3763 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3764 switch (sizes[i]) { 3765 case 1: 3766 3767 sum1 = b[row]; 3768 for (n = 0; n < sz - 1; n += 2) { 3769 i1 = idx[0]; 3770 i2 = idx[1]; 3771 idx += 2; 3772 tmp0 = x[i1]; 3773 tmp1 = x[i2]; 3774 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3775 v1 += 2; 3776 } 3777 3778 if (n == sz - 1) { 3779 tmp0 = x[*idx]; 3780 sum1 -= *v1 * tmp0; 3781 } 3782 x[row] = sum1 * (*ibdiag); 3783 row--; 3784 break; 3785 3786 case 2: 3787 3788 sum1 = b[row]; 3789 sum2 = b[row - 1]; 3790 /* note that sum1 is associated with the second of the two rows */ 3791 v2 = a->a + diag[row - 1] + 2; 3792 for (n = 0; n < sz - 1; n += 2) { 3793 i1 = idx[0]; 3794 i2 = idx[1]; 3795 idx += 2; 3796 tmp0 = x[i1]; 3797 tmp1 = x[i2]; 3798 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3799 v1 += 2; 3800 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3801 v2 += 2; 3802 } 3803 3804 if (n == sz - 1) { 3805 tmp0 = x[*idx]; 3806 sum1 -= *v1 * tmp0; 3807 sum2 -= *v2 * tmp0; 3808 } 3809 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3810 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3811 row -= 2; 3812 break; 3813 case 3: 3814 3815 sum1 = b[row]; 3816 sum2 = b[row - 1]; 3817 sum3 = b[row - 2]; 3818 v2 = a->a + diag[row - 1] + 2; 3819 v3 = a->a + diag[row - 2] + 3; 3820 for (n = 0; n < sz - 1; n += 2) { 3821 i1 = idx[0]; 3822 i2 = idx[1]; 3823 idx += 2; 3824 tmp0 = x[i1]; 3825 tmp1 = x[i2]; 3826 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3827 v1 += 2; 3828 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3829 v2 += 2; 3830 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3831 v3 += 2; 3832 } 3833 3834 if (n == sz - 1) { 3835 tmp0 = x[*idx]; 3836 sum1 -= *v1 * tmp0; 3837 sum2 -= *v2 * tmp0; 3838 sum3 -= *v3 * tmp0; 3839 } 3840 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3841 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3842 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3843 row -= 3; 3844 break; 3845 case 4: 3846 3847 sum1 = b[row]; 3848 sum2 = b[row - 1]; 3849 sum3 = b[row - 2]; 3850 sum4 = b[row - 3]; 3851 v2 = a->a + diag[row - 1] + 2; 3852 v3 = a->a + diag[row - 2] + 3; 3853 v4 = a->a + diag[row - 3] + 4; 3854 for (n = 0; n < sz - 1; n += 2) { 3855 i1 = idx[0]; 3856 i2 = idx[1]; 3857 idx += 2; 3858 tmp0 = x[i1]; 3859 tmp1 = x[i2]; 3860 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3861 v1 += 2; 3862 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3863 v2 += 2; 3864 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3865 v3 += 2; 3866 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3867 v4 += 2; 3868 } 3869 3870 if (n == sz - 1) { 3871 tmp0 = x[*idx]; 3872 sum1 -= *v1 * tmp0; 3873 sum2 -= *v2 * tmp0; 3874 sum3 -= *v3 * tmp0; 3875 sum4 -= *v4 * tmp0; 3876 } 3877 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3878 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3879 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3880 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3881 row -= 4; 3882 break; 3883 case 5: 3884 3885 sum1 = b[row]; 3886 sum2 = b[row - 1]; 3887 sum3 = b[row - 2]; 3888 sum4 = b[row - 3]; 3889 sum5 = b[row - 4]; 3890 v2 = a->a + diag[row - 1] + 2; 3891 v3 = a->a + diag[row - 2] + 3; 3892 v4 = a->a + diag[row - 3] + 4; 3893 v5 = a->a + diag[row - 4] + 5; 3894 for (n = 0; n < sz - 1; n += 2) { 3895 i1 = idx[0]; 3896 i2 = idx[1]; 3897 idx += 2; 3898 tmp0 = x[i1]; 3899 tmp1 = x[i2]; 3900 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3901 v1 += 2; 3902 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3903 v2 += 2; 3904 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3905 v3 += 2; 3906 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3907 v4 += 2; 3908 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3909 v5 += 2; 3910 } 3911 3912 if (n == sz - 1) { 3913 tmp0 = x[*idx]; 3914 sum1 -= *v1 * tmp0; 3915 sum2 -= *v2 * tmp0; 3916 sum3 -= *v3 * tmp0; 3917 sum4 -= *v4 * tmp0; 3918 sum5 -= *v5 * tmp0; 3919 } 3920 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3921 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3922 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3923 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3924 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3925 row -= 5; 3926 break; 3927 default: 3928 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3929 } 3930 } 3931 PetscCall(PetscLogFlops(a->nz)); 3932 3933 /* 3934 t = b - D x where D is the block diagonal 3935 */ 3936 cnt = 0; 3937 for (i = 0, row = 0; i < m; i++) { 3938 switch (sizes[i]) { 3939 case 1: 3940 t[row] = b[row] - bdiag[cnt++] * x[row]; 3941 row++; 3942 break; 3943 case 2: 3944 x1 = x[row]; 3945 x2 = x[row + 1]; 3946 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3947 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3948 t[row] = b[row] - tmp1; 3949 t[row + 1] = b[row + 1] - tmp2; 3950 row += 2; 3951 cnt += 4; 3952 break; 3953 case 3: 3954 x1 = x[row]; 3955 x2 = x[row + 1]; 3956 x3 = x[row + 2]; 3957 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3958 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3959 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3960 t[row] = b[row] - tmp1; 3961 t[row + 1] = b[row + 1] - tmp2; 3962 t[row + 2] = b[row + 2] - tmp3; 3963 row += 3; 3964 cnt += 9; 3965 break; 3966 case 4: 3967 x1 = x[row]; 3968 x2 = x[row + 1]; 3969 x3 = x[row + 2]; 3970 x4 = x[row + 3]; 3971 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3972 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3973 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3974 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3975 t[row] = b[row] - tmp1; 3976 t[row + 1] = b[row + 1] - tmp2; 3977 t[row + 2] = b[row + 2] - tmp3; 3978 t[row + 3] = b[row + 3] - tmp4; 3979 row += 4; 3980 cnt += 16; 3981 break; 3982 case 5: 3983 x1 = x[row]; 3984 x2 = x[row + 1]; 3985 x3 = x[row + 2]; 3986 x4 = x[row + 3]; 3987 x5 = x[row + 4]; 3988 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 3989 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 3990 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 3991 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 3992 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 3993 t[row] = b[row] - tmp1; 3994 t[row + 1] = b[row + 1] - tmp2; 3995 t[row + 2] = b[row + 2] - tmp3; 3996 t[row + 3] = b[row + 3] - tmp4; 3997 t[row + 4] = b[row + 4] - tmp5; 3998 row += 5; 3999 cnt += 25; 4000 break; 4001 default: 4002 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4003 } 4004 } 4005 PetscCall(PetscLogFlops(m)); 4006 4007 /* 4008 Apply (L + D)^-1 where D is the block diagonal 4009 */ 4010 for (i = 0, row = 0; i < m; i++) { 4011 sz = diag[row] - ii[row]; 4012 v1 = a->a + ii[row]; 4013 idx = a->j + ii[row]; 4014 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4015 switch (sizes[i]) { 4016 case 1: 4017 4018 sum1 = t[row]; 4019 for (n = 0; n < sz - 1; n += 2) { 4020 i1 = idx[0]; 4021 i2 = idx[1]; 4022 idx += 2; 4023 tmp0 = t[i1]; 4024 tmp1 = t[i2]; 4025 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4026 v1 += 2; 4027 } 4028 4029 if (n == sz - 1) { 4030 tmp0 = t[*idx]; 4031 sum1 -= *v1 * tmp0; 4032 } 4033 x[row] += t[row] = sum1 * (*ibdiag++); 4034 row++; 4035 break; 4036 case 2: 4037 v2 = a->a + ii[row + 1]; 4038 sum1 = t[row]; 4039 sum2 = t[row + 1]; 4040 for (n = 0; n < sz - 1; n += 2) { 4041 i1 = idx[0]; 4042 i2 = idx[1]; 4043 idx += 2; 4044 tmp0 = t[i1]; 4045 tmp1 = t[i2]; 4046 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4047 v1 += 2; 4048 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4049 v2 += 2; 4050 } 4051 4052 if (n == sz - 1) { 4053 tmp0 = t[*idx]; 4054 sum1 -= v1[0] * tmp0; 4055 sum2 -= v2[0] * tmp0; 4056 } 4057 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4058 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4059 ibdiag += 4; 4060 row += 2; 4061 break; 4062 case 3: 4063 v2 = a->a + ii[row + 1]; 4064 v3 = a->a + ii[row + 2]; 4065 sum1 = t[row]; 4066 sum2 = t[row + 1]; 4067 sum3 = t[row + 2]; 4068 for (n = 0; n < sz - 1; n += 2) { 4069 i1 = idx[0]; 4070 i2 = idx[1]; 4071 idx += 2; 4072 tmp0 = t[i1]; 4073 tmp1 = t[i2]; 4074 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4075 v1 += 2; 4076 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4077 v2 += 2; 4078 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4079 v3 += 2; 4080 } 4081 4082 if (n == sz - 1) { 4083 tmp0 = t[*idx]; 4084 sum1 -= v1[0] * tmp0; 4085 sum2 -= v2[0] * tmp0; 4086 sum3 -= v3[0] * tmp0; 4087 } 4088 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4089 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4090 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4091 ibdiag += 9; 4092 row += 3; 4093 break; 4094 case 4: 4095 v2 = a->a + ii[row + 1]; 4096 v3 = a->a + ii[row + 2]; 4097 v4 = a->a + ii[row + 3]; 4098 sum1 = t[row]; 4099 sum2 = t[row + 1]; 4100 sum3 = t[row + 2]; 4101 sum4 = t[row + 3]; 4102 for (n = 0; n < sz - 1; n += 2) { 4103 i1 = idx[0]; 4104 i2 = idx[1]; 4105 idx += 2; 4106 tmp0 = t[i1]; 4107 tmp1 = t[i2]; 4108 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4109 v1 += 2; 4110 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4111 v2 += 2; 4112 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4113 v3 += 2; 4114 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4115 v4 += 2; 4116 } 4117 4118 if (n == sz - 1) { 4119 tmp0 = t[*idx]; 4120 sum1 -= v1[0] * tmp0; 4121 sum2 -= v2[0] * tmp0; 4122 sum3 -= v3[0] * tmp0; 4123 sum4 -= v4[0] * tmp0; 4124 } 4125 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4126 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4127 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4128 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4129 ibdiag += 16; 4130 row += 4; 4131 break; 4132 case 5: 4133 v2 = a->a + ii[row + 1]; 4134 v3 = a->a + ii[row + 2]; 4135 v4 = a->a + ii[row + 3]; 4136 v5 = a->a + ii[row + 4]; 4137 sum1 = t[row]; 4138 sum2 = t[row + 1]; 4139 sum3 = t[row + 2]; 4140 sum4 = t[row + 3]; 4141 sum5 = t[row + 4]; 4142 for (n = 0; n < sz - 1; n += 2) { 4143 i1 = idx[0]; 4144 i2 = idx[1]; 4145 idx += 2; 4146 tmp0 = t[i1]; 4147 tmp1 = t[i2]; 4148 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4149 v1 += 2; 4150 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4151 v2 += 2; 4152 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4153 v3 += 2; 4154 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4155 v4 += 2; 4156 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4157 v5 += 2; 4158 } 4159 4160 if (n == sz - 1) { 4161 tmp0 = t[*idx]; 4162 sum1 -= v1[0] * tmp0; 4163 sum2 -= v2[0] * tmp0; 4164 sum3 -= v3[0] * tmp0; 4165 sum4 -= v4[0] * tmp0; 4166 sum5 -= v5[0] * tmp0; 4167 } 4168 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4169 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4170 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4171 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4172 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4173 ibdiag += 25; 4174 row += 5; 4175 break; 4176 default: 4177 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4178 } 4179 } 4180 PetscCall(PetscLogFlops(a->nz)); 4181 } 4182 PetscCall(VecRestoreArray(xx, &x)); 4183 PetscCall(VecRestoreArrayRead(bb, &b)); 4184 PetscFunctionReturn(PETSC_SUCCESS); 4185 } 4186 4187 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4188 { 4189 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4190 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4191 const MatScalar *bdiag = a->inode.bdiag; 4192 const PetscScalar *b; 4193 PetscInt m = a->inode.node_count, cnt = 0, i, row; 4194 const PetscInt *sizes = a->inode.size; 4195 4196 PetscFunctionBegin; 4197 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4198 PetscCall(VecGetArray(xx, &x)); 4199 PetscCall(VecGetArrayRead(bb, &b)); 4200 cnt = 0; 4201 for (i = 0, row = 0; i < m; i++) { 4202 switch (sizes[i]) { 4203 case 1: 4204 x[row] = b[row] * bdiag[cnt++]; 4205 row++; 4206 break; 4207 case 2: 4208 x1 = b[row]; 4209 x2 = b[row + 1]; 4210 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4211 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4212 x[row++] = tmp1; 4213 x[row++] = tmp2; 4214 cnt += 4; 4215 break; 4216 case 3: 4217 x1 = b[row]; 4218 x2 = b[row + 1]; 4219 x3 = b[row + 2]; 4220 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4221 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4222 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4223 x[row++] = tmp1; 4224 x[row++] = tmp2; 4225 x[row++] = tmp3; 4226 cnt += 9; 4227 break; 4228 case 4: 4229 x1 = b[row]; 4230 x2 = b[row + 1]; 4231 x3 = b[row + 2]; 4232 x4 = b[row + 3]; 4233 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4234 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4235 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4236 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4237 x[row++] = tmp1; 4238 x[row++] = tmp2; 4239 x[row++] = tmp3; 4240 x[row++] = tmp4; 4241 cnt += 16; 4242 break; 4243 case 5: 4244 x1 = b[row]; 4245 x2 = b[row + 1]; 4246 x3 = b[row + 2]; 4247 x4 = b[row + 3]; 4248 x5 = b[row + 4]; 4249 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4250 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4251 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4252 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4253 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4254 x[row++] = tmp1; 4255 x[row++] = tmp2; 4256 x[row++] = tmp3; 4257 x[row++] = tmp4; 4258 x[row++] = tmp5; 4259 cnt += 25; 4260 break; 4261 default: 4262 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4263 } 4264 } 4265 PetscCall(PetscLogFlops(2.0 * cnt)); 4266 PetscCall(VecRestoreArray(xx, &x)); 4267 PetscCall(VecRestoreArrayRead(bb, &b)); 4268 PetscFunctionReturn(PETSC_SUCCESS); 4269 } 4270 4271 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4272 { 4273 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4274 4275 PetscFunctionBegin; 4276 a->inode.node_count = 0; 4277 a->inode.use = PETSC_FALSE; 4278 a->inode.checked = PETSC_FALSE; 4279 a->inode.mat_nonzerostate = -1; 4280 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4281 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4282 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4283 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4284 A->ops->coloringpatch = NULL; 4285 A->ops->multdiagonalblock = NULL; 4286 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4287 PetscFunctionReturn(PETSC_SUCCESS); 4288 } 4289 4290 /* 4291 samestructure indicates that the matrix has not changed its nonzero structure so we 4292 do not need to recompute the inodes 4293 */ 4294 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4295 { 4296 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4297 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4298 PetscBool flag; 4299 const PetscInt *idx, *idy, *ii; 4300 4301 PetscFunctionBegin; 4302 if (!a->inode.use) { 4303 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4304 PetscCall(PetscFree(a->inode.size)); 4305 PetscFunctionReturn(PETSC_SUCCESS); 4306 } 4307 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 4308 4309 m = A->rmap->n; 4310 if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size)); 4311 ns = a->inode.size; 4312 4313 i = 0; 4314 node_count = 0; 4315 idx = a->j; 4316 ii = a->i; 4317 if (idx) { 4318 while (i < m) { /* For each row */ 4319 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4320 /* Limits the number of elements in a node to 'a->inode.limit' */ 4321 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4322 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4323 if (nzy != nzx) break; 4324 idy += nzx; /* Same nonzero pattern */ 4325 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4326 if (!flag) break; 4327 } 4328 ns[node_count++] = blk_size; 4329 idx += blk_size * nzx; 4330 i = j; 4331 } 4332 } 4333 /* If not enough inodes found,, do not use inode version of the routines */ 4334 if (!m || !idx || node_count > .8 * m) { 4335 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4336 PetscCall(PetscFree(a->inode.size)); 4337 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4338 } else { 4339 if (!A->factortype) { 4340 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4341 if (A->rmap->n == A->cmap->n) { 4342 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4343 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4344 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4345 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4346 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4347 } 4348 } else { 4349 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4350 } 4351 a->inode.node_count = node_count; 4352 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4353 } 4354 a->inode.checked = PETSC_TRUE; 4355 a->inode.mat_nonzerostate = A->nonzerostate; 4356 PetscFunctionReturn(PETSC_SUCCESS); 4357 } 4358 4359 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4360 { 4361 Mat B = *C; 4362 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4363 PetscInt m = A->rmap->n; 4364 4365 PetscFunctionBegin; 4366 c->inode.use = a->inode.use; 4367 c->inode.limit = a->inode.limit; 4368 c->inode.max_limit = a->inode.max_limit; 4369 c->inode.checked = PETSC_FALSE; 4370 c->inode.size = NULL; 4371 c->inode.node_count = 0; 4372 c->inode.ibdiagvalid = PETSC_FALSE; 4373 c->inode.ibdiag = NULL; 4374 c->inode.bdiag = NULL; 4375 c->inode.mat_nonzerostate = -1; 4376 if (a->inode.use) { 4377 if (a->inode.checked && a->inode.size) { 4378 PetscCall(PetscMalloc1(m + 1, &c->inode.size)); 4379 PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1)); 4380 4381 c->inode.checked = PETSC_TRUE; 4382 c->inode.node_count = a->inode.node_count; 4383 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4384 } 4385 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4386 if (!B->factortype) { 4387 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4388 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4389 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4390 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4391 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4392 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4393 } else { 4394 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4395 } 4396 } 4397 PetscFunctionReturn(PETSC_SUCCESS); 4398 } 4399 4400 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4401 { 4402 PetscInt k; 4403 const PetscInt *vi; 4404 4405 PetscFunctionBegin; 4406 vi = aj + ai[row]; 4407 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4408 vi = aj + adiag[row]; 4409 cols[nzl] = vi[0]; 4410 vi = aj + adiag[row + 1] + 1; 4411 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4412 PetscFunctionReturn(PETSC_SUCCESS); 4413 } 4414 /* 4415 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4416 Modified from MatSeqAIJCheckInode(). 4417 4418 Input Parameters: 4419 . Mat A - ILU or LU matrix factor 4420 4421 */ 4422 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4423 { 4424 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4425 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4426 PetscInt *cols1, *cols2, *ns; 4427 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4428 PetscBool flag; 4429 4430 PetscFunctionBegin; 4431 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4432 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4433 4434 m = A->rmap->n; 4435 if (a->inode.size) ns = a->inode.size; 4436 else PetscCall(PetscMalloc1(m + 1, &ns)); 4437 4438 i = 0; 4439 node_count = 0; 4440 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4441 while (i < m) { /* For each row */ 4442 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4443 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4444 nzx = nzl1 + nzu1 + 1; 4445 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4446 4447 /* Limits the number of elements in a node to 'a->inode.limit' */ 4448 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4449 nzl2 = ai[j + 1] - ai[j]; 4450 nzu2 = adiag[j] - adiag[j + 1] - 1; 4451 nzy = nzl2 + nzu2 + 1; 4452 if (nzy != nzx) break; 4453 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4454 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4455 if (!flag) break; 4456 } 4457 ns[node_count++] = blk_size; 4458 i = j; 4459 } 4460 PetscCall(PetscFree2(cols1, cols2)); 4461 /* If not enough inodes found,, do not use inode version of the routines */ 4462 if (!m || node_count > .8 * m) { 4463 PetscCall(PetscFree(ns)); 4464 4465 a->inode.node_count = 0; 4466 a->inode.size = NULL; 4467 a->inode.use = PETSC_FALSE; 4468 4469 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4470 } else { 4471 A->ops->mult = NULL; 4472 A->ops->sor = NULL; 4473 A->ops->multadd = NULL; 4474 A->ops->getrowij = NULL; 4475 A->ops->restorerowij = NULL; 4476 A->ops->getcolumnij = NULL; 4477 A->ops->restorecolumnij = NULL; 4478 A->ops->coloringpatch = NULL; 4479 A->ops->multdiagonalblock = NULL; 4480 a->inode.node_count = node_count; 4481 a->inode.size = ns; 4482 4483 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4484 } 4485 a->inode.checked = PETSC_TRUE; 4486 PetscFunctionReturn(PETSC_SUCCESS); 4487 } 4488 4489 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4490 { 4491 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4492 4493 PetscFunctionBegin; 4494 a->inode.ibdiagvalid = PETSC_FALSE; 4495 PetscFunctionReturn(PETSC_SUCCESS); 4496 } 4497 4498 /* 4499 This is really ugly. if inodes are used this replaces the 4500 permutations with ones that correspond to rows/cols of the matrix 4501 rather than inode blocks 4502 */ 4503 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4504 { 4505 PetscFunctionBegin; 4506 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4507 PetscFunctionReturn(PETSC_SUCCESS); 4508 } 4509 4510 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4511 { 4512 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4513 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4514 const PetscInt *ridx, *cidx; 4515 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx; 4516 PetscInt nslim_col, *ns_col; 4517 IS ris = *rperm, cis = *cperm; 4518 4519 PetscFunctionBegin; 4520 if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4521 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4522 4523 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4524 PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns)); 4525 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4526 4527 PetscCall(ISGetIndices(ris, &ridx)); 4528 PetscCall(ISGetIndices(cis, &cidx)); 4529 4530 /* Form the inode structure for the rows of permuted matrix using inv perm*/ 4531 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i]; 4532 4533 /* Construct the permutations for rows*/ 4534 for (i = 0, row = 0; i < nslim_row; ++i) { 4535 indx = ridx[i]; 4536 start_val = tns[indx]; 4537 end_val = tns[indx + 1]; 4538 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4539 } 4540 4541 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4542 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i]; 4543 4544 /* Construct permutations for columns */ 4545 for (i = 0, col = 0; i < nslim_col; ++i) { 4546 indx = cidx[i]; 4547 start_val = tns[indx]; 4548 end_val = tns[indx + 1]; 4549 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4550 } 4551 4552 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4553 PetscCall(ISSetPermutation(*rperm)); 4554 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4555 PetscCall(ISSetPermutation(*cperm)); 4556 4557 PetscCall(ISRestoreIndices(ris, &ridx)); 4558 PetscCall(ISRestoreIndices(cis, &cidx)); 4559 4560 PetscCall(PetscFree(ns_col)); 4561 PetscCall(PetscFree2(permr, permc)); 4562 PetscCall(ISDestroy(&cis)); 4563 PetscCall(ISDestroy(&ris)); 4564 PetscCall(PetscFree(tns)); 4565 PetscFunctionReturn(PETSC_SUCCESS); 4566 } 4567 4568 /*@C 4569 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4570 4571 Not Collective 4572 4573 Input Parameter: 4574 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4575 4576 Output Parameters: 4577 + node_count - no of inodes present in the matrix. 4578 . sizes - an array of size `node_count`, with the sizes of each inode. 4579 - limit - the max size used to generate the inodes. 4580 4581 Level: advanced 4582 4583 Note: 4584 It should be called after the matrix is assembled. 4585 The contents of the sizes[] array should not be changed. 4586 `NULL` may be passed for information not needed 4587 4588 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 4589 @*/ 4590 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4591 { 4592 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4593 4594 PetscFunctionBegin; 4595 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4596 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4597 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4598 PetscFunctionReturn(PETSC_SUCCESS); 4599 } 4600 4601 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4602 { 4603 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4604 4605 PetscFunctionBegin; 4606 if (node_count) *node_count = a->inode.node_count; 4607 if (sizes) *sizes = a->inode.size; 4608 if (limit) *limit = a->inode.limit; 4609 PetscFunctionReturn(PETSC_SUCCESS); 4610 } 4611