1 /* 2 This file provides high performance routines for the Inode format (compressed sparse row) 3 by taking advantage of rows with identical nonzero structure (I-nodes). 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #if defined(PETSC_HAVE_XMMINTRIN_H) 7 #include <xmmintrin.h> 8 #endif 9 10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 13 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 14 15 PetscFunctionBegin; 16 n = A->cmap->n; 17 m = A->rmap->n; 18 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) 24 ; 25 for (; count + 1 < n; count++, i++) 26 ; 27 if (count < n) i++; 28 *size = i; 29 PetscFunctionReturn(PETSC_SUCCESS); 30 } 31 PetscCall(PetscMalloc1(n + 1, &ns_col)); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i]; 35 36 /* if m < n; pad up the remainder with inode_limit */ 37 for (; count + 1 < n; count++, i++) ns_col[i] = 1; 38 /* The last node is the odd ball. pad it up with the remaining rows; */ 39 if (count < n) { 40 ns_col[i] = n - count; 41 i++; 42 } else if (count > n) { 43 /* Adjust for the over estimation */ 44 ns_col[i - 1] += n - count; 45 } 46 *size = i; 47 *ns = ns_col; 48 PetscFunctionReturn(PETSC_SUCCESS); 49 } 50 51 /* 52 This builds symmetric version of nonzero structure, 53 */ 54 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 55 { 56 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 57 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 58 PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2; 59 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 60 61 PetscFunctionBegin; 62 nslim_row = a->inode.node_count; 63 m = A->rmap->n; 64 n = A->cmap->n; 65 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 66 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 67 68 /* Use the row_inode as column_inode */ 69 nslim_col = nslim_row; 70 ns_col = ns_row; 71 72 /* allocate space for reformatted inode structure */ 73 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 74 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1]; 75 76 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 77 nsz = ns_col[i1]; 78 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 79 } 80 /* allocate space for row pointers */ 81 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 82 *iia = ia; 83 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 84 85 /* determine the number of columns in each row */ 86 ia[0] = oshift; 87 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 88 j = aj + ai[row] + ishift; 89 jmax = aj + ai[row + 1] + ishift; 90 if (j == jmax) continue; /* empty row */ 91 col = *j++ + ishift; 92 i2 = tvc[col]; 93 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 94 ia[i1 + 1]++; 95 ia[i2 + 1]++; 96 i2++; /* Start col of next node */ 97 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 98 i2 = tvc[col]; 99 } 100 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 101 } 102 103 /* shift ia[i] to point to next row */ 104 for (i1 = 1; i1 < nslim_row + 1; i1++) { 105 row = ia[i1 - 1]; 106 ia[i1] += row; 107 work[i1 - 1] = row - oshift; 108 } 109 110 /* allocate space for column pointers */ 111 nz = ia[nslim_row] + (!ishift); 112 PetscCall(PetscMalloc1(nz, &ja)); 113 *jja = ja; 114 115 /* loop over lower triangular part putting into ja */ 116 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 117 j = aj + ai[row] + ishift; 118 jmax = aj + ai[row + 1] + ishift; 119 if (j == jmax) continue; /* empty row */ 120 col = *j++ + ishift; 121 i2 = tvc[col]; 122 while (i2 < i1 && j < jmax) { 123 ja[work[i2]++] = i1 + oshift; 124 ja[work[i1]++] = i2 + oshift; 125 ++i2; 126 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 127 i2 = tvc[col]; 128 } 129 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 130 } 131 PetscCall(PetscFree(work)); 132 PetscCall(PetscFree2(tns, tvc)); 133 PetscFunctionReturn(PETSC_SUCCESS); 134 } 135 136 /* 137 This builds nonsymmetric version of nonzero structure, 138 */ 139 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 140 { 141 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 142 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 143 PetscInt *tns, *tvc, nsz, i1, i2; 144 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size; 145 146 PetscFunctionBegin; 147 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 148 nslim_row = a->inode.node_count; 149 n = A->cmap->n; 150 151 /* Create The column_inode for this matrix */ 152 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 153 154 /* allocate space for reformatted column_inode structure */ 155 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 156 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 157 158 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 159 nsz = ns_col[i1]; 160 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 161 } 162 /* allocate space for row pointers */ 163 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 164 *iia = ia; 165 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 166 167 /* determine the number of columns in each row */ 168 ia[0] = oshift; 169 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 170 j = aj + ai[row] + ishift; 171 nz = ai[row + 1] - ai[row]; 172 if (!nz) continue; /* empty row */ 173 col = *j++ + ishift; 174 i2 = tvc[col]; 175 while (nz-- > 0) { /* off-diagonal elements */ 176 ia[i1 + 1]++; 177 i2++; /* Start col of next node */ 178 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 179 if (nz > 0) i2 = tvc[col]; 180 } 181 } 182 183 /* shift ia[i] to point to next row */ 184 for (i1 = 1; i1 < nslim_row + 1; i1++) { 185 row = ia[i1 - 1]; 186 ia[i1] += row; 187 work[i1 - 1] = row - oshift; 188 } 189 190 /* allocate space for column pointers */ 191 nz = ia[nslim_row] + (!ishift); 192 PetscCall(PetscMalloc1(nz, &ja)); 193 *jja = ja; 194 195 /* loop over matrix putting into ja */ 196 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 197 j = aj + ai[row] + ishift; 198 nz = ai[row + 1] - ai[row]; 199 if (!nz) continue; /* empty row */ 200 col = *j++ + ishift; 201 i2 = tvc[col]; 202 while (nz-- > 0) { 203 ja[work[i1]++] = i2 + oshift; 204 ++i2; 205 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 206 if (nz > 0) i2 = tvc[col]; 207 } 208 } 209 PetscCall(PetscFree(ns_col)); 210 PetscCall(PetscFree(work)); 211 PetscCall(PetscFree2(tns, tvc)); 212 PetscFunctionReturn(PETSC_SUCCESS); 213 } 214 215 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 216 { 217 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 218 219 PetscFunctionBegin; 220 if (n) *n = a->inode.node_count; 221 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 222 if (!blockcompressed) { 223 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 224 } else if (symmetric) { 225 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 226 } else { 227 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 228 } 229 PetscFunctionReturn(PETSC_SUCCESS); 230 } 231 232 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 233 { 234 PetscFunctionBegin; 235 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 236 237 if (!blockcompressed) { 238 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 239 } else { 240 PetscCall(PetscFree(*ia)); 241 PetscCall(PetscFree(*ja)); 242 } 243 PetscFunctionReturn(PETSC_SUCCESS); 244 } 245 246 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 247 { 248 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 249 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 250 PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j; 251 252 PetscFunctionBegin; 253 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 254 nslim_row = a->inode.node_count; 255 n = A->cmap->n; 256 257 /* Create The column_inode for this matrix */ 258 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 259 260 /* allocate space for reformatted column_inode structure */ 261 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 262 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 263 264 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 265 nsz = ns_col[i1]; 266 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 267 } 268 /* allocate space for column pointers */ 269 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 270 *iia = ia; 271 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 272 273 /* determine the number of columns in each row */ 274 ia[0] = oshift; 275 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 276 j = aj + ai[row] + ishift; 277 col = *j++ + ishift; 278 i2 = tvc[col]; 279 nz = ai[row + 1] - ai[row]; 280 while (nz-- > 0) { /* off-diagonal elements */ 281 /* ia[i1+1]++; */ 282 ia[i2 + 1]++; 283 i2++; 284 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 285 if (nz > 0) i2 = tvc[col]; 286 } 287 } 288 289 /* shift ia[i] to point to next col */ 290 for (i1 = 1; i1 < nslim_col + 1; i1++) { 291 col = ia[i1 - 1]; 292 ia[i1] += col; 293 work[i1 - 1] = col - oshift; 294 } 295 296 /* allocate space for column pointers */ 297 nz = ia[nslim_col] + (!ishift); 298 PetscCall(PetscMalloc1(nz, &ja)); 299 *jja = ja; 300 301 /* loop over matrix putting into ja */ 302 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 303 j = aj + ai[row] + ishift; 304 col = *j++ + ishift; 305 i2 = tvc[col]; 306 nz = ai[row + 1] - ai[row]; 307 while (nz-- > 0) { 308 /* ja[work[i1]++] = i2 + oshift; */ 309 ja[work[i2]++] = i1 + oshift; 310 i2++; 311 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 312 if (nz > 0) i2 = tvc[col]; 313 } 314 } 315 PetscCall(PetscFree(ns_col)); 316 PetscCall(PetscFree(work)); 317 PetscCall(PetscFree2(tns, tvc)); 318 PetscFunctionReturn(PETSC_SUCCESS); 319 } 320 321 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 322 { 323 PetscFunctionBegin; 324 PetscCall(MatCreateColInode_Private(A, n, NULL)); 325 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 326 327 if (!blockcompressed) { 328 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 329 } else if (symmetric) { 330 /* Since the indices are symmetric it doesn't matter */ 331 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 332 } else { 333 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 334 } 335 PetscFunctionReturn(PETSC_SUCCESS); 336 } 337 338 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 339 { 340 PetscFunctionBegin; 341 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 342 if (!blockcompressed) { 343 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 344 } else { 345 PetscCall(PetscFree(*ia)); 346 PetscCall(PetscFree(*ja)); 347 } 348 PetscFunctionReturn(PETSC_SUCCESS); 349 } 350 351 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 352 { 353 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 354 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 355 PetscScalar *y; 356 const PetscScalar *x; 357 const MatScalar *v1, *v2, *v3, *v4, *v5; 358 PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 359 const PetscInt *idx, *ns, *ii; 360 361 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 362 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 363 #endif 364 365 PetscFunctionBegin; 366 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 367 node_max = a->inode.node_count; 368 ns = a->inode.size; /* Node Size array */ 369 PetscCall(VecGetArrayRead(xx, &x)); 370 PetscCall(VecGetArray(yy, &y)); 371 idx = a->j; 372 v1 = a->a; 373 ii = a->i; 374 375 for (i = 0, row = 0; i < node_max; ++i) { 376 nsz = ns[i]; 377 n = ii[1] - ii[0]; 378 nonzerorow += (n > 0) * nsz; 379 ii += nsz; 380 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 381 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 382 sz = n; /* No of non zeros in this row */ 383 /* Switch on the size of Node */ 384 switch (nsz) { /* Each loop in 'case' is unrolled */ 385 case 1: 386 sum1 = 0.; 387 388 for (n = 0; n < sz - 1; n += 2) { 389 i1 = idx[0]; /* The instructions are ordered to */ 390 i2 = idx[1]; /* make the compiler's job easy */ 391 idx += 2; 392 tmp0 = x[i1]; 393 tmp1 = x[i2]; 394 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 395 v1 += 2; 396 } 397 398 if (n == sz - 1) { /* Take care of the last nonzero */ 399 tmp0 = x[*idx++]; 400 sum1 += *v1++ * tmp0; 401 } 402 y[row++] = sum1; 403 break; 404 case 2: 405 sum1 = 0.; 406 sum2 = 0.; 407 v2 = v1 + n; 408 409 for (n = 0; n < sz - 1; n += 2) { 410 i1 = idx[0]; 411 i2 = idx[1]; 412 idx += 2; 413 tmp0 = x[i1]; 414 tmp1 = x[i2]; 415 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 416 v1 += 2; 417 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 418 v2 += 2; 419 } 420 if (n == sz - 1) { 421 tmp0 = x[*idx++]; 422 sum1 += *v1++ * tmp0; 423 sum2 += *v2++ * tmp0; 424 } 425 y[row++] = sum1; 426 y[row++] = sum2; 427 v1 = v2; /* Since the next block to be processed starts there*/ 428 idx += sz; 429 break; 430 case 3: 431 sum1 = 0.; 432 sum2 = 0.; 433 sum3 = 0.; 434 v2 = v1 + n; 435 v3 = v2 + n; 436 437 for (n = 0; n < sz - 1; n += 2) { 438 i1 = idx[0]; 439 i2 = idx[1]; 440 idx += 2; 441 tmp0 = x[i1]; 442 tmp1 = x[i2]; 443 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 444 v1 += 2; 445 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 446 v2 += 2; 447 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 448 v3 += 2; 449 } 450 if (n == sz - 1) { 451 tmp0 = x[*idx++]; 452 sum1 += *v1++ * tmp0; 453 sum2 += *v2++ * tmp0; 454 sum3 += *v3++ * tmp0; 455 } 456 y[row++] = sum1; 457 y[row++] = sum2; 458 y[row++] = sum3; 459 v1 = v3; /* Since the next block to be processed starts there*/ 460 idx += 2 * sz; 461 break; 462 case 4: 463 sum1 = 0.; 464 sum2 = 0.; 465 sum3 = 0.; 466 sum4 = 0.; 467 v2 = v1 + n; 468 v3 = v2 + n; 469 v4 = v3 + n; 470 471 for (n = 0; n < sz - 1; n += 2) { 472 i1 = idx[0]; 473 i2 = idx[1]; 474 idx += 2; 475 tmp0 = x[i1]; 476 tmp1 = x[i2]; 477 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 478 v1 += 2; 479 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 480 v2 += 2; 481 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 482 v3 += 2; 483 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 484 v4 += 2; 485 } 486 if (n == sz - 1) { 487 tmp0 = x[*idx++]; 488 sum1 += *v1++ * tmp0; 489 sum2 += *v2++ * tmp0; 490 sum3 += *v3++ * tmp0; 491 sum4 += *v4++ * tmp0; 492 } 493 y[row++] = sum1; 494 y[row++] = sum2; 495 y[row++] = sum3; 496 y[row++] = sum4; 497 v1 = v4; /* Since the next block to be processed starts there*/ 498 idx += 3 * sz; 499 break; 500 case 5: 501 sum1 = 0.; 502 sum2 = 0.; 503 sum3 = 0.; 504 sum4 = 0.; 505 sum5 = 0.; 506 v2 = v1 + n; 507 v3 = v2 + n; 508 v4 = v3 + n; 509 v5 = v4 + n; 510 511 for (n = 0; n < sz - 1; n += 2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 518 v1 += 2; 519 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 520 v2 += 2; 521 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 522 v3 += 2; 523 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 524 v4 += 2; 525 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 526 v5 += 2; 527 } 528 if (n == sz - 1) { 529 tmp0 = x[*idx++]; 530 sum1 += *v1++ * tmp0; 531 sum2 += *v2++ * tmp0; 532 sum3 += *v3++ * tmp0; 533 sum4 += *v4++ * tmp0; 534 sum5 += *v5++ * tmp0; 535 } 536 y[row++] = sum1; 537 y[row++] = sum2; 538 y[row++] = sum3; 539 y[row++] = sum4; 540 y[row++] = sum5; 541 v1 = v5; /* Since the next block to be processed starts there */ 542 idx += 4 * sz; 543 break; 544 default: 545 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 546 } 547 } 548 PetscCall(VecRestoreArrayRead(xx, &x)); 549 PetscCall(VecRestoreArray(yy, &y)); 550 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 551 PetscFunctionReturn(PETSC_SUCCESS); 552 } 553 554 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 555 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 556 { 557 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 558 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 559 const MatScalar *v1, *v2, *v3, *v4, *v5; 560 const PetscScalar *x; 561 PetscScalar *y, *z, *zt; 562 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 563 const PetscInt *idx, *ns, *ii; 564 565 PetscFunctionBegin; 566 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 567 node_max = a->inode.node_count; 568 ns = a->inode.size; /* Node Size array */ 569 570 PetscCall(VecGetArrayRead(xx, &x)); 571 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 572 zt = z; 573 574 idx = a->j; 575 v1 = a->a; 576 ii = a->i; 577 578 for (i = 0, row = 0; i < node_max; ++i) { 579 nsz = ns[i]; 580 n = ii[1] - ii[0]; 581 ii += nsz; 582 sz = n; /* No of non zeros in this row */ 583 /* Switch on the size of Node */ 584 switch (nsz) { /* Each loop in 'case' is unrolled */ 585 case 1: 586 sum1 = *zt++; 587 588 for (n = 0; n < sz - 1; n += 2) { 589 i1 = idx[0]; /* The instructions are ordered to */ 590 i2 = idx[1]; /* make the compiler's job easy */ 591 idx += 2; 592 tmp0 = x[i1]; 593 tmp1 = x[i2]; 594 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 595 v1 += 2; 596 } 597 598 if (n == sz - 1) { /* Take care of the last nonzero */ 599 tmp0 = x[*idx++]; 600 sum1 += *v1++ * tmp0; 601 } 602 y[row++] = sum1; 603 break; 604 case 2: 605 sum1 = *zt++; 606 sum2 = *zt++; 607 v2 = v1 + n; 608 609 for (n = 0; n < sz - 1; n += 2) { 610 i1 = idx[0]; 611 i2 = idx[1]; 612 idx += 2; 613 tmp0 = x[i1]; 614 tmp1 = x[i2]; 615 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 616 v1 += 2; 617 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 618 v2 += 2; 619 } 620 if (n == sz - 1) { 621 tmp0 = x[*idx++]; 622 sum1 += *v1++ * tmp0; 623 sum2 += *v2++ * tmp0; 624 } 625 y[row++] = sum1; 626 y[row++] = sum2; 627 v1 = v2; /* Since the next block to be processed starts there*/ 628 idx += sz; 629 break; 630 case 3: 631 sum1 = *zt++; 632 sum2 = *zt++; 633 sum3 = *zt++; 634 v2 = v1 + n; 635 v3 = v2 + n; 636 637 for (n = 0; n < sz - 1; n += 2) { 638 i1 = idx[0]; 639 i2 = idx[1]; 640 idx += 2; 641 tmp0 = x[i1]; 642 tmp1 = x[i2]; 643 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 644 v1 += 2; 645 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 646 v2 += 2; 647 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 648 v3 += 2; 649 } 650 if (n == sz - 1) { 651 tmp0 = x[*idx++]; 652 sum1 += *v1++ * tmp0; 653 sum2 += *v2++ * tmp0; 654 sum3 += *v3++ * tmp0; 655 } 656 y[row++] = sum1; 657 y[row++] = sum2; 658 y[row++] = sum3; 659 v1 = v3; /* Since the next block to be processed starts there*/ 660 idx += 2 * sz; 661 break; 662 case 4: 663 sum1 = *zt++; 664 sum2 = *zt++; 665 sum3 = *zt++; 666 sum4 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 v4 = v3 + n; 670 671 for (n = 0; n < sz - 1; n += 2) { 672 i1 = idx[0]; 673 i2 = idx[1]; 674 idx += 2; 675 tmp0 = x[i1]; 676 tmp1 = x[i2]; 677 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 678 v1 += 2; 679 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 680 v2 += 2; 681 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 682 v3 += 2; 683 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 684 v4 += 2; 685 } 686 if (n == sz - 1) { 687 tmp0 = x[*idx++]; 688 sum1 += *v1++ * tmp0; 689 sum2 += *v2++ * tmp0; 690 sum3 += *v3++ * tmp0; 691 sum4 += *v4++ * tmp0; 692 } 693 y[row++] = sum1; 694 y[row++] = sum2; 695 y[row++] = sum3; 696 y[row++] = sum4; 697 v1 = v4; /* Since the next block to be processed starts there*/ 698 idx += 3 * sz; 699 break; 700 case 5: 701 sum1 = *zt++; 702 sum2 = *zt++; 703 sum3 = *zt++; 704 sum4 = *zt++; 705 sum5 = *zt++; 706 v2 = v1 + n; 707 v3 = v2 + n; 708 v4 = v3 + n; 709 v5 = v4 + n; 710 711 for (n = 0; n < sz - 1; n += 2) { 712 i1 = idx[0]; 713 i2 = idx[1]; 714 idx += 2; 715 tmp0 = x[i1]; 716 tmp1 = x[i2]; 717 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 718 v1 += 2; 719 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 720 v2 += 2; 721 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 722 v3 += 2; 723 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 724 v4 += 2; 725 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 726 v5 += 2; 727 } 728 if (n == sz - 1) { 729 tmp0 = x[*idx++]; 730 sum1 += *v1++ * tmp0; 731 sum2 += *v2++ * tmp0; 732 sum3 += *v3++ * tmp0; 733 sum4 += *v4++ * tmp0; 734 sum5 += *v5++ * tmp0; 735 } 736 y[row++] = sum1; 737 y[row++] = sum2; 738 y[row++] = sum3; 739 y[row++] = sum4; 740 y[row++] = sum5; 741 v1 = v5; /* Since the next block to be processed starts there */ 742 idx += 4 * sz; 743 break; 744 default: 745 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 746 } 747 } 748 PetscCall(VecRestoreArrayRead(xx, &x)); 749 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 750 PetscCall(PetscLogFlops(2.0 * a->nz)); 751 PetscFunctionReturn(PETSC_SUCCESS); 752 } 753 754 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 755 { 756 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 757 IS iscol = a->col, isrow = a->row; 758 const PetscInt *r, *c, *rout, *cout; 759 PetscInt i, j, n = A->rmap->n, nz; 760 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 761 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 762 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 763 PetscScalar sum1, sum2, sum3, sum4, sum5; 764 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 765 const PetscScalar *b; 766 767 PetscFunctionBegin; 768 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 769 node_max = a->inode.node_count; 770 ns = a->inode.size; /* Node Size array */ 771 772 PetscCall(VecGetArrayRead(bb, &b)); 773 PetscCall(VecGetArrayWrite(xx, &x)); 774 tmp = a->solve_work; 775 776 PetscCall(ISGetIndices(isrow, &rout)); 777 r = rout; 778 PetscCall(ISGetIndices(iscol, &cout)); 779 c = cout + (n - 1); 780 781 /* forward solve the lower triangular */ 782 tmps = tmp; 783 aa = a_a; 784 aj = a_j; 785 ad = a->diag; 786 787 for (i = 0, row = 0; i < node_max; ++i) { 788 nsz = ns[i]; 789 aii = ai[row]; 790 v1 = aa + aii; 791 vi = aj + aii; 792 nz = ad[row] - aii; 793 if (i < node_max - 1) { 794 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 795 * but our indexing to determine its size could. */ 796 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 797 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 798 PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 799 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 800 } 801 802 switch (nsz) { /* Each loop in 'case' is unrolled */ 803 case 1: 804 sum1 = b[*r++]; 805 for (j = 0; j < nz - 1; j += 2) { 806 i0 = vi[0]; 807 i1 = vi[1]; 808 vi += 2; 809 tmp0 = tmps[i0]; 810 tmp1 = tmps[i1]; 811 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 812 v1 += 2; 813 } 814 if (j == nz - 1) { 815 tmp0 = tmps[*vi++]; 816 sum1 -= *v1++ * tmp0; 817 } 818 tmp[row++] = sum1; 819 break; 820 case 2: 821 sum1 = b[*r++]; 822 sum2 = b[*r++]; 823 v2 = aa + ai[row + 1]; 824 825 for (j = 0; j < nz - 1; j += 2) { 826 i0 = vi[0]; 827 i1 = vi[1]; 828 vi += 2; 829 tmp0 = tmps[i0]; 830 tmp1 = tmps[i1]; 831 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 832 v1 += 2; 833 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 834 v2 += 2; 835 } 836 if (j == nz - 1) { 837 tmp0 = tmps[*vi++]; 838 sum1 -= *v1++ * tmp0; 839 sum2 -= *v2++ * tmp0; 840 } 841 sum2 -= *v2++ * sum1; 842 tmp[row++] = sum1; 843 tmp[row++] = sum2; 844 break; 845 case 3: 846 sum1 = b[*r++]; 847 sum2 = b[*r++]; 848 sum3 = b[*r++]; 849 v2 = aa + ai[row + 1]; 850 v3 = aa + ai[row + 2]; 851 852 for (j = 0; j < nz - 1; j += 2) { 853 i0 = vi[0]; 854 i1 = vi[1]; 855 vi += 2; 856 tmp0 = tmps[i0]; 857 tmp1 = tmps[i1]; 858 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 859 v1 += 2; 860 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 861 v2 += 2; 862 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 863 v3 += 2; 864 } 865 if (j == nz - 1) { 866 tmp0 = tmps[*vi++]; 867 sum1 -= *v1++ * tmp0; 868 sum2 -= *v2++ * tmp0; 869 sum3 -= *v3++ * tmp0; 870 } 871 sum2 -= *v2++ * sum1; 872 sum3 -= *v3++ * sum1; 873 sum3 -= *v3++ * sum2; 874 875 tmp[row++] = sum1; 876 tmp[row++] = sum2; 877 tmp[row++] = sum3; 878 break; 879 880 case 4: 881 sum1 = b[*r++]; 882 sum2 = b[*r++]; 883 sum3 = b[*r++]; 884 sum4 = b[*r++]; 885 v2 = aa + ai[row + 1]; 886 v3 = aa + ai[row + 2]; 887 v4 = aa + ai[row + 3]; 888 889 for (j = 0; j < nz - 1; j += 2) { 890 i0 = vi[0]; 891 i1 = vi[1]; 892 vi += 2; 893 tmp0 = tmps[i0]; 894 tmp1 = tmps[i1]; 895 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 896 v1 += 2; 897 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 898 v2 += 2; 899 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 900 v3 += 2; 901 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 902 v4 += 2; 903 } 904 if (j == nz - 1) { 905 tmp0 = tmps[*vi++]; 906 sum1 -= *v1++ * tmp0; 907 sum2 -= *v2++ * tmp0; 908 sum3 -= *v3++ * tmp0; 909 sum4 -= *v4++ * tmp0; 910 } 911 sum2 -= *v2++ * sum1; 912 sum3 -= *v3++ * sum1; 913 sum4 -= *v4++ * sum1; 914 sum3 -= *v3++ * sum2; 915 sum4 -= *v4++ * sum2; 916 sum4 -= *v4++ * sum3; 917 918 tmp[row++] = sum1; 919 tmp[row++] = sum2; 920 tmp[row++] = sum3; 921 tmp[row++] = sum4; 922 break; 923 case 5: 924 sum1 = b[*r++]; 925 sum2 = b[*r++]; 926 sum3 = b[*r++]; 927 sum4 = b[*r++]; 928 sum5 = b[*r++]; 929 v2 = aa + ai[row + 1]; 930 v3 = aa + ai[row + 2]; 931 v4 = aa + ai[row + 3]; 932 v5 = aa + ai[row + 4]; 933 934 for (j = 0; j < nz - 1; j += 2) { 935 i0 = vi[0]; 936 i1 = vi[1]; 937 vi += 2; 938 tmp0 = tmps[i0]; 939 tmp1 = tmps[i1]; 940 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 941 v1 += 2; 942 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 943 v2 += 2; 944 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 945 v3 += 2; 946 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 947 v4 += 2; 948 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 949 v5 += 2; 950 } 951 if (j == nz - 1) { 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ * tmp0; 954 sum2 -= *v2++ * tmp0; 955 sum3 -= *v3++ * tmp0; 956 sum4 -= *v4++ * tmp0; 957 sum5 -= *v5++ * tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row++] = sum1; 972 tmp[row++] = sum2; 973 tmp[row++] = sum3; 974 tmp[row++] = sum4; 975 tmp[row++] = sum5; 976 break; 977 default: 978 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 983 nsz = ns[i]; 984 aii = ai[row + 1] - 1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii - ad[row]; 988 switch (nsz) { /* Each loop in 'case' is unrolled */ 989 case 1: 990 sum1 = tmp[row]; 991 992 for (j = nz; j > 1; j -= 2) { 993 vi -= 2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j == 1) { 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1006 row--; 1007 break; 1008 case 2: 1009 sum1 = tmp[row]; 1010 sum2 = tmp[row - 1]; 1011 v2 = aa + ai[row] - 1; 1012 for (j = nz; j > 1; j -= 2) { 1013 vi -= 2; 1014 i0 = vi[2]; 1015 i1 = vi[1]; 1016 tmp0 = tmps[i0]; 1017 tmp1 = tmps[i1]; 1018 v1 -= 2; 1019 v2 -= 2; 1020 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1021 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1022 } 1023 if (j == 1) { 1024 tmp0 = tmps[*vi--]; 1025 sum1 -= *v1-- * tmp0; 1026 sum2 -= *v2-- * tmp0; 1027 } 1028 1029 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1030 row--; 1031 sum2 -= *v2-- * tmp0; 1032 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1033 row--; 1034 break; 1035 case 3: 1036 sum1 = tmp[row]; 1037 sum2 = tmp[row - 1]; 1038 sum3 = tmp[row - 2]; 1039 v2 = aa + ai[row] - 1; 1040 v3 = aa + ai[row - 1] - 1; 1041 for (j = nz; j > 1; j -= 2) { 1042 vi -= 2; 1043 i0 = vi[2]; 1044 i1 = vi[1]; 1045 tmp0 = tmps[i0]; 1046 tmp1 = tmps[i1]; 1047 v1 -= 2; 1048 v2 -= 2; 1049 v3 -= 2; 1050 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1051 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1052 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1053 } 1054 if (j == 1) { 1055 tmp0 = tmps[*vi--]; 1056 sum1 -= *v1-- * tmp0; 1057 sum2 -= *v2-- * tmp0; 1058 sum3 -= *v3-- * tmp0; 1059 } 1060 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1061 row--; 1062 sum2 -= *v2-- * tmp0; 1063 sum3 -= *v3-- * tmp0; 1064 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1065 row--; 1066 sum3 -= *v3-- * tmp0; 1067 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1068 row--; 1069 1070 break; 1071 case 4: 1072 sum1 = tmp[row]; 1073 sum2 = tmp[row - 1]; 1074 sum3 = tmp[row - 2]; 1075 sum4 = tmp[row - 3]; 1076 v2 = aa + ai[row] - 1; 1077 v3 = aa + ai[row - 1] - 1; 1078 v4 = aa + ai[row - 2] - 1; 1079 1080 for (j = nz; j > 1; j -= 2) { 1081 vi -= 2; 1082 i0 = vi[2]; 1083 i1 = vi[1]; 1084 tmp0 = tmps[i0]; 1085 tmp1 = tmps[i1]; 1086 v1 -= 2; 1087 v2 -= 2; 1088 v3 -= 2; 1089 v4 -= 2; 1090 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1091 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1092 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1093 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1094 } 1095 if (j == 1) { 1096 tmp0 = tmps[*vi--]; 1097 sum1 -= *v1-- * tmp0; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 } 1102 1103 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1104 row--; 1105 sum2 -= *v2-- * tmp0; 1106 sum3 -= *v3-- * tmp0; 1107 sum4 -= *v4-- * tmp0; 1108 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1109 row--; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1113 row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1116 row--; 1117 break; 1118 case 5: 1119 sum1 = tmp[row]; 1120 sum2 = tmp[row - 1]; 1121 sum3 = tmp[row - 2]; 1122 sum4 = tmp[row - 3]; 1123 sum5 = tmp[row - 4]; 1124 v2 = aa + ai[row] - 1; 1125 v3 = aa + ai[row - 1] - 1; 1126 v4 = aa + ai[row - 2] - 1; 1127 v5 = aa + ai[row - 3] - 1; 1128 for (j = nz; j > 1; j -= 2) { 1129 vi -= 2; 1130 i0 = vi[2]; 1131 i1 = vi[1]; 1132 tmp0 = tmps[i0]; 1133 tmp1 = tmps[i1]; 1134 v1 -= 2; 1135 v2 -= 2; 1136 v3 -= 2; 1137 v4 -= 2; 1138 v5 -= 2; 1139 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1140 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1141 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1142 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1143 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1144 } 1145 if (j == 1) { 1146 tmp0 = tmps[*vi--]; 1147 sum1 -= *v1-- * tmp0; 1148 sum2 -= *v2-- * tmp0; 1149 sum3 -= *v3-- * tmp0; 1150 sum4 -= *v4-- * tmp0; 1151 sum5 -= *v5-- * tmp0; 1152 } 1153 1154 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1155 row--; 1156 sum2 -= *v2-- * tmp0; 1157 sum3 -= *v3-- * tmp0; 1158 sum4 -= *v4-- * tmp0; 1159 sum5 -= *v5-- * tmp0; 1160 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1161 row--; 1162 sum3 -= *v3-- * tmp0; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1166 row--; 1167 sum4 -= *v4-- * tmp0; 1168 sum5 -= *v5-- * tmp0; 1169 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1170 row--; 1171 sum5 -= *v5-- * tmp0; 1172 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1173 row--; 1174 break; 1175 default: 1176 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1177 } 1178 } 1179 PetscCall(ISRestoreIndices(isrow, &rout)); 1180 PetscCall(ISRestoreIndices(iscol, &cout)); 1181 PetscCall(VecRestoreArrayRead(bb, &b)); 1182 PetscCall(VecRestoreArrayWrite(xx, &x)); 1183 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1184 PetscFunctionReturn(PETSC_SUCCESS); 1185 } 1186 1187 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1188 { 1189 Mat C = B; 1190 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1191 IS isrow = b->row, isicol = b->icol; 1192 const PetscInt *r, *ic, *ics; 1193 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1194 PetscInt i, j, k, nz, nzL, row, *pj; 1195 const PetscInt *ajtmp, *bjtmp; 1196 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1197 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1198 FactorShiftCtx sctx; 1199 const PetscInt *ddiag; 1200 PetscReal rs; 1201 MatScalar d; 1202 PetscInt inod, nodesz, node_max, col; 1203 const PetscInt *ns; 1204 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1205 1206 PetscFunctionBegin; 1207 /* MatPivotSetUp(): initialize shift context sctx */ 1208 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1209 1210 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1211 ddiag = a->diag; 1212 sctx.shift_top = info->zeropivot; 1213 for (i = 0; i < n; i++) { 1214 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1215 d = (aa)[ddiag[i]]; 1216 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1217 v = aa + ai[i]; 1218 nz = ai[i + 1] - ai[i]; 1219 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1220 if (rs > sctx.shift_top) sctx.shift_top = rs; 1221 } 1222 sctx.shift_top *= 1.1; 1223 sctx.nshift_max = 5; 1224 sctx.shift_lo = 0.; 1225 sctx.shift_hi = 1.; 1226 } 1227 1228 PetscCall(ISGetIndices(isrow, &r)); 1229 PetscCall(ISGetIndices(isicol, &ic)); 1230 1231 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1232 ics = ic; 1233 1234 node_max = a->inode.node_count; 1235 ns = a->inode.size; 1236 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1237 1238 /* If max inode size > 4, split it into two inodes.*/ 1239 /* also map the inode sizes according to the ordering */ 1240 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1241 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1242 if (ns[i] > 4) { 1243 tmp_vec1[j] = 4; 1244 ++j; 1245 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 1246 } else { 1247 tmp_vec1[j] = ns[i]; 1248 } 1249 } 1250 /* Use the correct node_max */ 1251 node_max = j; 1252 1253 /* Now reorder the inode info based on mat re-ordering info */ 1254 /* First create a row -> inode_size_array_index map */ 1255 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1256 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1257 for (i = 0, row = 0; i < node_max; i++) { 1258 nodesz = tmp_vec1[i]; 1259 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1260 } 1261 /* Using nsmap, create a reordered ns structure */ 1262 for (i = 0, j = 0; i < node_max; i++) { 1263 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1264 tmp_vec2[i] = nodesz; 1265 j += nodesz; 1266 } 1267 PetscCall(PetscFree(nsmap)); 1268 PetscCall(PetscFree(tmp_vec1)); 1269 1270 /* Now use the correct ns */ 1271 ns = tmp_vec2; 1272 1273 do { 1274 sctx.newshift = PETSC_FALSE; 1275 /* Now loop over each block-row, and do the factorization */ 1276 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1277 nodesz = ns[inod]; 1278 1279 switch (nodesz) { 1280 case 1: 1281 /* zero rtmp1 */ 1282 /* L part */ 1283 nz = bi[i + 1] - bi[i]; 1284 bjtmp = bj + bi[i]; 1285 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1286 1287 /* U part */ 1288 nz = bdiag[i] - bdiag[i + 1]; 1289 bjtmp = bj + bdiag[i + 1] + 1; 1290 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1291 1292 /* load in initial (unfactored row) */ 1293 nz = ai[r[i] + 1] - ai[r[i]]; 1294 ajtmp = aj + ai[r[i]]; 1295 v = aa + ai[r[i]]; 1296 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1297 1298 /* ZeropivotApply() */ 1299 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1300 1301 /* elimination */ 1302 bjtmp = bj + bi[i]; 1303 row = *bjtmp++; 1304 nzL = bi[i + 1] - bi[i]; 1305 for (k = 0; k < nzL; k++) { 1306 pc = rtmp1 + row; 1307 if (*pc != 0.0) { 1308 pv = b->a + bdiag[row]; 1309 mul1 = *pc * (*pv); 1310 *pc = mul1; 1311 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1312 pv = b->a + bdiag[row + 1] + 1; 1313 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1314 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1315 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1316 } 1317 row = *bjtmp++; 1318 } 1319 1320 /* finished row so stick it into b->a */ 1321 rs = 0.0; 1322 /* L part */ 1323 pv = b->a + bi[i]; 1324 pj = b->j + bi[i]; 1325 nz = bi[i + 1] - bi[i]; 1326 for (j = 0; j < nz; j++) { 1327 pv[j] = rtmp1[pj[j]]; 1328 rs += PetscAbsScalar(pv[j]); 1329 } 1330 1331 /* U part */ 1332 pv = b->a + bdiag[i + 1] + 1; 1333 pj = b->j + bdiag[i + 1] + 1; 1334 nz = bdiag[i] - bdiag[i + 1] - 1; 1335 for (j = 0; j < nz; j++) { 1336 pv[j] = rtmp1[pj[j]]; 1337 rs += PetscAbsScalar(pv[j]); 1338 } 1339 1340 /* Check zero pivot */ 1341 sctx.rs = rs; 1342 sctx.pv = rtmp1[i]; 1343 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1344 if (sctx.newshift) break; 1345 1346 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1347 pv = b->a + bdiag[i]; 1348 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1349 break; 1350 1351 case 2: 1352 /* zero rtmp1 and rtmp2 */ 1353 /* L part */ 1354 nz = bi[i + 1] - bi[i]; 1355 bjtmp = bj + bi[i]; 1356 for (j = 0; j < nz; j++) { 1357 col = bjtmp[j]; 1358 rtmp1[col] = 0.0; 1359 rtmp2[col] = 0.0; 1360 } 1361 1362 /* U part */ 1363 nz = bdiag[i] - bdiag[i + 1]; 1364 bjtmp = bj + bdiag[i + 1] + 1; 1365 for (j = 0; j < nz; j++) { 1366 col = bjtmp[j]; 1367 rtmp1[col] = 0.0; 1368 rtmp2[col] = 0.0; 1369 } 1370 1371 /* load in initial (unfactored row) */ 1372 nz = ai[r[i] + 1] - ai[r[i]]; 1373 ajtmp = aj + ai[r[i]]; 1374 v1 = aa + ai[r[i]]; 1375 v2 = aa + ai[r[i] + 1]; 1376 for (j = 0; j < nz; j++) { 1377 col = ics[ajtmp[j]]; 1378 rtmp1[col] = v1[j]; 1379 rtmp2[col] = v2[j]; 1380 } 1381 /* ZeropivotApply(): shift the diagonal of the matrix */ 1382 rtmp1[i] += sctx.shift_amount; 1383 rtmp2[i + 1] += sctx.shift_amount; 1384 1385 /* elimination */ 1386 bjtmp = bj + bi[i]; 1387 row = *bjtmp++; /* pivot row */ 1388 nzL = bi[i + 1] - bi[i]; 1389 for (k = 0; k < nzL; k++) { 1390 pc1 = rtmp1 + row; 1391 pc2 = rtmp2 + row; 1392 if (*pc1 != 0.0 || *pc2 != 0.0) { 1393 pv = b->a + bdiag[row]; 1394 mul1 = *pc1 * (*pv); 1395 mul2 = *pc2 * (*pv); 1396 *pc1 = mul1; 1397 *pc2 = mul2; 1398 1399 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1400 pv = b->a + bdiag[row + 1] + 1; 1401 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1402 for (j = 0; j < nz; j++) { 1403 col = pj[j]; 1404 rtmp1[col] -= mul1 * pv[j]; 1405 rtmp2[col] -= mul2 * pv[j]; 1406 } 1407 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1408 } 1409 row = *bjtmp++; 1410 } 1411 1412 /* finished row i; check zero pivot, then stick row i into b->a */ 1413 rs = 0.0; 1414 /* L part */ 1415 pc1 = b->a + bi[i]; 1416 pj = b->j + bi[i]; 1417 nz = bi[i + 1] - bi[i]; 1418 for (j = 0; j < nz; j++) { 1419 col = pj[j]; 1420 pc1[j] = rtmp1[col]; 1421 rs += PetscAbsScalar(pc1[j]); 1422 } 1423 /* U part */ 1424 pc1 = b->a + bdiag[i + 1] + 1; 1425 pj = b->j + bdiag[i + 1] + 1; 1426 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1427 for (j = 0; j < nz; j++) { 1428 col = pj[j]; 1429 pc1[j] = rtmp1[col]; 1430 rs += PetscAbsScalar(pc1[j]); 1431 } 1432 1433 sctx.rs = rs; 1434 sctx.pv = rtmp1[i]; 1435 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1436 if (sctx.newshift) break; 1437 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1438 *pc1 = 1.0 / sctx.pv; 1439 1440 /* Now take care of diagonal 2x2 block. */ 1441 pc2 = rtmp2 + i; 1442 if (*pc2 != 0.0) { 1443 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1444 *pc2 = mul1; /* insert L entry */ 1445 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1446 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1447 for (j = 0; j < nz; j++) { 1448 col = pj[j]; 1449 rtmp2[col] -= mul1 * rtmp1[col]; 1450 } 1451 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1452 } 1453 1454 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1455 rs = 0.0; 1456 /* L part */ 1457 pc2 = b->a + bi[i + 1]; 1458 pj = b->j + bi[i + 1]; 1459 nz = bi[i + 2] - bi[i + 1]; 1460 for (j = 0; j < nz; j++) { 1461 col = pj[j]; 1462 pc2[j] = rtmp2[col]; 1463 rs += PetscAbsScalar(pc2[j]); 1464 } 1465 /* U part */ 1466 pc2 = b->a + bdiag[i + 2] + 1; 1467 pj = b->j + bdiag[i + 2] + 1; 1468 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1469 for (j = 0; j < nz; j++) { 1470 col = pj[j]; 1471 pc2[j] = rtmp2[col]; 1472 rs += PetscAbsScalar(pc2[j]); 1473 } 1474 1475 sctx.rs = rs; 1476 sctx.pv = rtmp2[i + 1]; 1477 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1478 if (sctx.newshift) break; 1479 pc2 = b->a + bdiag[i + 1]; 1480 *pc2 = 1.0 / sctx.pv; 1481 break; 1482 1483 case 3: 1484 /* zero rtmp */ 1485 /* L part */ 1486 nz = bi[i + 1] - bi[i]; 1487 bjtmp = bj + bi[i]; 1488 for (j = 0; j < nz; j++) { 1489 col = bjtmp[j]; 1490 rtmp1[col] = 0.0; 1491 rtmp2[col] = 0.0; 1492 rtmp3[col] = 0.0; 1493 } 1494 1495 /* U part */ 1496 nz = bdiag[i] - bdiag[i + 1]; 1497 bjtmp = bj + bdiag[i + 1] + 1; 1498 for (j = 0; j < nz; j++) { 1499 col = bjtmp[j]; 1500 rtmp1[col] = 0.0; 1501 rtmp2[col] = 0.0; 1502 rtmp3[col] = 0.0; 1503 } 1504 1505 /* load in initial (unfactored row) */ 1506 nz = ai[r[i] + 1] - ai[r[i]]; 1507 ajtmp = aj + ai[r[i]]; 1508 v1 = aa + ai[r[i]]; 1509 v2 = aa + ai[r[i] + 1]; 1510 v3 = aa + ai[r[i] + 2]; 1511 for (j = 0; j < nz; j++) { 1512 col = ics[ajtmp[j]]; 1513 rtmp1[col] = v1[j]; 1514 rtmp2[col] = v2[j]; 1515 rtmp3[col] = v3[j]; 1516 } 1517 /* ZeropivotApply(): shift the diagonal of the matrix */ 1518 rtmp1[i] += sctx.shift_amount; 1519 rtmp2[i + 1] += sctx.shift_amount; 1520 rtmp3[i + 2] += sctx.shift_amount; 1521 1522 /* elimination */ 1523 bjtmp = bj + bi[i]; 1524 row = *bjtmp++; /* pivot row */ 1525 nzL = bi[i + 1] - bi[i]; 1526 for (k = 0; k < nzL; k++) { 1527 pc1 = rtmp1 + row; 1528 pc2 = rtmp2 + row; 1529 pc3 = rtmp3 + row; 1530 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1531 pv = b->a + bdiag[row]; 1532 mul1 = *pc1 * (*pv); 1533 mul2 = *pc2 * (*pv); 1534 mul3 = *pc3 * (*pv); 1535 *pc1 = mul1; 1536 *pc2 = mul2; 1537 *pc3 = mul3; 1538 1539 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1540 pv = b->a + bdiag[row + 1] + 1; 1541 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1542 for (j = 0; j < nz; j++) { 1543 col = pj[j]; 1544 rtmp1[col] -= mul1 * pv[j]; 1545 rtmp2[col] -= mul2 * pv[j]; 1546 rtmp3[col] -= mul3 * pv[j]; 1547 } 1548 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1549 } 1550 row = *bjtmp++; 1551 } 1552 1553 /* finished row i; check zero pivot, then stick row i into b->a */ 1554 rs = 0.0; 1555 /* L part */ 1556 pc1 = b->a + bi[i]; 1557 pj = b->j + bi[i]; 1558 nz = bi[i + 1] - bi[i]; 1559 for (j = 0; j < nz; j++) { 1560 col = pj[j]; 1561 pc1[j] = rtmp1[col]; 1562 rs += PetscAbsScalar(pc1[j]); 1563 } 1564 /* U part */ 1565 pc1 = b->a + bdiag[i + 1] + 1; 1566 pj = b->j + bdiag[i + 1] + 1; 1567 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1568 for (j = 0; j < nz; j++) { 1569 col = pj[j]; 1570 pc1[j] = rtmp1[col]; 1571 rs += PetscAbsScalar(pc1[j]); 1572 } 1573 1574 sctx.rs = rs; 1575 sctx.pv = rtmp1[i]; 1576 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1577 if (sctx.newshift) break; 1578 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1579 *pc1 = 1.0 / sctx.pv; 1580 1581 /* Now take care of 1st column of diagonal 3x3 block. */ 1582 pc2 = rtmp2 + i; 1583 pc3 = rtmp3 + i; 1584 if (*pc2 != 0.0 || *pc3 != 0.0) { 1585 mul2 = (*pc2) * (*pc1); 1586 *pc2 = mul2; 1587 mul3 = (*pc3) * (*pc1); 1588 *pc3 = mul3; 1589 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1590 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1591 for (j = 0; j < nz; j++) { 1592 col = pj[j]; 1593 rtmp2[col] -= mul2 * rtmp1[col]; 1594 rtmp3[col] -= mul3 * rtmp1[col]; 1595 } 1596 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1597 } 1598 1599 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1600 rs = 0.0; 1601 /* L part */ 1602 pc2 = b->a + bi[i + 1]; 1603 pj = b->j + bi[i + 1]; 1604 nz = bi[i + 2] - bi[i + 1]; 1605 for (j = 0; j < nz; j++) { 1606 col = pj[j]; 1607 pc2[j] = rtmp2[col]; 1608 rs += PetscAbsScalar(pc2[j]); 1609 } 1610 /* U part */ 1611 pc2 = b->a + bdiag[i + 2] + 1; 1612 pj = b->j + bdiag[i + 2] + 1; 1613 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1614 for (j = 0; j < nz; j++) { 1615 col = pj[j]; 1616 pc2[j] = rtmp2[col]; 1617 rs += PetscAbsScalar(pc2[j]); 1618 } 1619 1620 sctx.rs = rs; 1621 sctx.pv = rtmp2[i + 1]; 1622 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1623 if (sctx.newshift) break; 1624 pc2 = b->a + bdiag[i + 1]; 1625 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1626 1627 /* Now take care of 2nd column of diagonal 3x3 block. */ 1628 pc3 = rtmp3 + i + 1; 1629 if (*pc3 != 0.0) { 1630 mul3 = (*pc3) * (*pc2); 1631 *pc3 = mul3; 1632 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1633 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1634 for (j = 0; j < nz; j++) { 1635 col = pj[j]; 1636 rtmp3[col] -= mul3 * rtmp2[col]; 1637 } 1638 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1639 } 1640 1641 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1642 rs = 0.0; 1643 /* L part */ 1644 pc3 = b->a + bi[i + 2]; 1645 pj = b->j + bi[i + 2]; 1646 nz = bi[i + 3] - bi[i + 2]; 1647 for (j = 0; j < nz; j++) { 1648 col = pj[j]; 1649 pc3[j] = rtmp3[col]; 1650 rs += PetscAbsScalar(pc3[j]); 1651 } 1652 /* U part */ 1653 pc3 = b->a + bdiag[i + 3] + 1; 1654 pj = b->j + bdiag[i + 3] + 1; 1655 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1656 for (j = 0; j < nz; j++) { 1657 col = pj[j]; 1658 pc3[j] = rtmp3[col]; 1659 rs += PetscAbsScalar(pc3[j]); 1660 } 1661 1662 sctx.rs = rs; 1663 sctx.pv = rtmp3[i + 2]; 1664 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1665 if (sctx.newshift) break; 1666 pc3 = b->a + bdiag[i + 2]; 1667 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1668 break; 1669 case 4: 1670 /* zero rtmp */ 1671 /* L part */ 1672 nz = bi[i + 1] - bi[i]; 1673 bjtmp = bj + bi[i]; 1674 for (j = 0; j < nz; j++) { 1675 col = bjtmp[j]; 1676 rtmp1[col] = 0.0; 1677 rtmp2[col] = 0.0; 1678 rtmp3[col] = 0.0; 1679 rtmp4[col] = 0.0; 1680 } 1681 1682 /* U part */ 1683 nz = bdiag[i] - bdiag[i + 1]; 1684 bjtmp = bj + bdiag[i + 1] + 1; 1685 for (j = 0; j < nz; j++) { 1686 col = bjtmp[j]; 1687 rtmp1[col] = 0.0; 1688 rtmp2[col] = 0.0; 1689 rtmp3[col] = 0.0; 1690 rtmp4[col] = 0.0; 1691 } 1692 1693 /* load in initial (unfactored row) */ 1694 nz = ai[r[i] + 1] - ai[r[i]]; 1695 ajtmp = aj + ai[r[i]]; 1696 v1 = aa + ai[r[i]]; 1697 v2 = aa + ai[r[i] + 1]; 1698 v3 = aa + ai[r[i] + 2]; 1699 v4 = aa + ai[r[i] + 3]; 1700 for (j = 0; j < nz; j++) { 1701 col = ics[ajtmp[j]]; 1702 rtmp1[col] = v1[j]; 1703 rtmp2[col] = v2[j]; 1704 rtmp3[col] = v3[j]; 1705 rtmp4[col] = v4[j]; 1706 } 1707 /* ZeropivotApply(): shift the diagonal of the matrix */ 1708 rtmp1[i] += sctx.shift_amount; 1709 rtmp2[i + 1] += sctx.shift_amount; 1710 rtmp3[i + 2] += sctx.shift_amount; 1711 rtmp4[i + 3] += sctx.shift_amount; 1712 1713 /* elimination */ 1714 bjtmp = bj + bi[i]; 1715 row = *bjtmp++; /* pivot row */ 1716 nzL = bi[i + 1] - bi[i]; 1717 for (k = 0; k < nzL; k++) { 1718 pc1 = rtmp1 + row; 1719 pc2 = rtmp2 + row; 1720 pc3 = rtmp3 + row; 1721 pc4 = rtmp4 + row; 1722 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1723 pv = b->a + bdiag[row]; 1724 mul1 = *pc1 * (*pv); 1725 mul2 = *pc2 * (*pv); 1726 mul3 = *pc3 * (*pv); 1727 mul4 = *pc4 * (*pv); 1728 *pc1 = mul1; 1729 *pc2 = mul2; 1730 *pc3 = mul3; 1731 *pc4 = mul4; 1732 1733 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1734 pv = b->a + bdiag[row + 1] + 1; 1735 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1736 for (j = 0; j < nz; j++) { 1737 col = pj[j]; 1738 rtmp1[col] -= mul1 * pv[j]; 1739 rtmp2[col] -= mul2 * pv[j]; 1740 rtmp3[col] -= mul3 * pv[j]; 1741 rtmp4[col] -= mul4 * pv[j]; 1742 } 1743 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1744 } 1745 row = *bjtmp++; 1746 } 1747 1748 /* finished row i; check zero pivot, then stick row i into b->a */ 1749 rs = 0.0; 1750 /* L part */ 1751 pc1 = b->a + bi[i]; 1752 pj = b->j + bi[i]; 1753 nz = bi[i + 1] - bi[i]; 1754 for (j = 0; j < nz; j++) { 1755 col = pj[j]; 1756 pc1[j] = rtmp1[col]; 1757 rs += PetscAbsScalar(pc1[j]); 1758 } 1759 /* U part */ 1760 pc1 = b->a + bdiag[i + 1] + 1; 1761 pj = b->j + bdiag[i + 1] + 1; 1762 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1763 for (j = 0; j < nz; j++) { 1764 col = pj[j]; 1765 pc1[j] = rtmp1[col]; 1766 rs += PetscAbsScalar(pc1[j]); 1767 } 1768 1769 sctx.rs = rs; 1770 sctx.pv = rtmp1[i]; 1771 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1772 if (sctx.newshift) break; 1773 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1774 *pc1 = 1.0 / sctx.pv; 1775 1776 /* Now take care of 1st column of diagonal 4x4 block. */ 1777 pc2 = rtmp2 + i; 1778 pc3 = rtmp3 + i; 1779 pc4 = rtmp4 + i; 1780 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1781 mul2 = (*pc2) * (*pc1); 1782 *pc2 = mul2; 1783 mul3 = (*pc3) * (*pc1); 1784 *pc3 = mul3; 1785 mul4 = (*pc4) * (*pc1); 1786 *pc4 = mul4; 1787 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1788 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1789 for (j = 0; j < nz; j++) { 1790 col = pj[j]; 1791 rtmp2[col] -= mul2 * rtmp1[col]; 1792 rtmp3[col] -= mul3 * rtmp1[col]; 1793 rtmp4[col] -= mul4 * rtmp1[col]; 1794 } 1795 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1796 } 1797 1798 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1799 rs = 0.0; 1800 /* L part */ 1801 pc2 = b->a + bi[i + 1]; 1802 pj = b->j + bi[i + 1]; 1803 nz = bi[i + 2] - bi[i + 1]; 1804 for (j = 0; j < nz; j++) { 1805 col = pj[j]; 1806 pc2[j] = rtmp2[col]; 1807 rs += PetscAbsScalar(pc2[j]); 1808 } 1809 /* U part */ 1810 pc2 = b->a + bdiag[i + 2] + 1; 1811 pj = b->j + bdiag[i + 2] + 1; 1812 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1813 for (j = 0; j < nz; j++) { 1814 col = pj[j]; 1815 pc2[j] = rtmp2[col]; 1816 rs += PetscAbsScalar(pc2[j]); 1817 } 1818 1819 sctx.rs = rs; 1820 sctx.pv = rtmp2[i + 1]; 1821 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1822 if (sctx.newshift) break; 1823 pc2 = b->a + bdiag[i + 1]; 1824 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1825 1826 /* Now take care of 2nd column of diagonal 4x4 block. */ 1827 pc3 = rtmp3 + i + 1; 1828 pc4 = rtmp4 + i + 1; 1829 if (*pc3 != 0.0 || *pc4 != 0.0) { 1830 mul3 = (*pc3) * (*pc2); 1831 *pc3 = mul3; 1832 mul4 = (*pc4) * (*pc2); 1833 *pc4 = mul4; 1834 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1835 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1836 for (j = 0; j < nz; j++) { 1837 col = pj[j]; 1838 rtmp3[col] -= mul3 * rtmp2[col]; 1839 rtmp4[col] -= mul4 * rtmp2[col]; 1840 } 1841 PetscCall(PetscLogFlops(4.0 * nz)); 1842 } 1843 1844 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1845 rs = 0.0; 1846 /* L part */ 1847 pc3 = b->a + bi[i + 2]; 1848 pj = b->j + bi[i + 2]; 1849 nz = bi[i + 3] - bi[i + 2]; 1850 for (j = 0; j < nz; j++) { 1851 col = pj[j]; 1852 pc3[j] = rtmp3[col]; 1853 rs += PetscAbsScalar(pc3[j]); 1854 } 1855 /* U part */ 1856 pc3 = b->a + bdiag[i + 3] + 1; 1857 pj = b->j + bdiag[i + 3] + 1; 1858 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1859 for (j = 0; j < nz; j++) { 1860 col = pj[j]; 1861 pc3[j] = rtmp3[col]; 1862 rs += PetscAbsScalar(pc3[j]); 1863 } 1864 1865 sctx.rs = rs; 1866 sctx.pv = rtmp3[i + 2]; 1867 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1868 if (sctx.newshift) break; 1869 pc3 = b->a + bdiag[i + 2]; 1870 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1871 1872 /* Now take care of 3rd column of diagonal 4x4 block. */ 1873 pc4 = rtmp4 + i + 2; 1874 if (*pc4 != 0.0) { 1875 mul4 = (*pc4) * (*pc3); 1876 *pc4 = mul4; 1877 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1878 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1879 for (j = 0; j < nz; j++) { 1880 col = pj[j]; 1881 rtmp4[col] -= mul4 * rtmp3[col]; 1882 } 1883 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1884 } 1885 1886 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1887 rs = 0.0; 1888 /* L part */ 1889 pc4 = b->a + bi[i + 3]; 1890 pj = b->j + bi[i + 3]; 1891 nz = bi[i + 4] - bi[i + 3]; 1892 for (j = 0; j < nz; j++) { 1893 col = pj[j]; 1894 pc4[j] = rtmp4[col]; 1895 rs += PetscAbsScalar(pc4[j]); 1896 } 1897 /* U part */ 1898 pc4 = b->a + bdiag[i + 4] + 1; 1899 pj = b->j + bdiag[i + 4] + 1; 1900 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1901 for (j = 0; j < nz; j++) { 1902 col = pj[j]; 1903 pc4[j] = rtmp4[col]; 1904 rs += PetscAbsScalar(pc4[j]); 1905 } 1906 1907 sctx.rs = rs; 1908 sctx.pv = rtmp4[i + 3]; 1909 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1910 if (sctx.newshift) break; 1911 pc4 = b->a + bdiag[i + 3]; 1912 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1913 break; 1914 1915 default: 1916 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1917 } 1918 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1919 i += nodesz; /* Update the row */ 1920 } 1921 1922 /* MatPivotRefine() */ 1923 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1924 /* 1925 * if no shift in this attempt & shifting & started shifting & can refine, 1926 * then try lower shift 1927 */ 1928 sctx.shift_hi = sctx.shift_fraction; 1929 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1930 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1931 sctx.newshift = PETSC_TRUE; 1932 sctx.nshift++; 1933 } 1934 } while (sctx.newshift); 1935 1936 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1937 PetscCall(PetscFree(tmp_vec2)); 1938 PetscCall(ISRestoreIndices(isicol, &ic)); 1939 PetscCall(ISRestoreIndices(isrow, &r)); 1940 1941 if (b->inode.size) { 1942 C->ops->solve = MatSolve_SeqAIJ_Inode; 1943 } else { 1944 C->ops->solve = MatSolve_SeqAIJ; 1945 } 1946 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1947 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1948 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1949 C->ops->matsolve = MatMatSolve_SeqAIJ; 1950 C->assembled = PETSC_TRUE; 1951 C->preallocated = PETSC_TRUE; 1952 1953 PetscCall(PetscLogFlops(C->cmap->n)); 1954 1955 /* MatShiftView(A,info,&sctx) */ 1956 if (sctx.nshift) { 1957 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1958 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1959 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1960 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1961 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1962 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1963 } 1964 } 1965 PetscFunctionReturn(PETSC_SUCCESS); 1966 } 1967 1968 #if 0 1969 // unused 1970 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1971 { 1972 Mat C = B; 1973 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1974 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1975 const PetscInt *r, *ic, *c, *ics; 1976 PetscInt n = A->rmap->n, *bi = b->i; 1977 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1978 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1979 PetscInt *ai = a->i, *aj = a->j; 1980 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1981 PetscScalar mul1, mul2, mul3, tmp; 1982 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1983 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1984 PetscReal rs = 0.0; 1985 FactorShiftCtx sctx; 1986 1987 PetscFunctionBegin; 1988 sctx.shift_top = 0; 1989 sctx.nshift_max = 0; 1990 sctx.shift_lo = 0; 1991 sctx.shift_hi = 0; 1992 sctx.shift_fraction = 0; 1993 1994 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1995 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1996 sctx.shift_top = 0; 1997 for (i = 0; i < n; i++) { 1998 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1999 rs = 0.0; 2000 ajtmp = aj + ai[i]; 2001 rtmp1 = aa + ai[i]; 2002 nz = ai[i + 1] - ai[i]; 2003 for (j = 0; j < nz; j++) { 2004 if (*ajtmp != i) { 2005 rs += PetscAbsScalar(*rtmp1++); 2006 } else { 2007 rs -= PetscRealPart(*rtmp1++); 2008 } 2009 ajtmp++; 2010 } 2011 if (rs > sctx.shift_top) sctx.shift_top = rs; 2012 } 2013 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2014 sctx.shift_top *= 1.1; 2015 sctx.nshift_max = 5; 2016 sctx.shift_lo = 0.; 2017 sctx.shift_hi = 1.; 2018 } 2019 sctx.shift_amount = 0; 2020 sctx.nshift = 0; 2021 2022 PetscCall(ISGetIndices(isrow, &r)); 2023 PetscCall(ISGetIndices(iscol, &c)); 2024 PetscCall(ISGetIndices(isicol, &ic)); 2025 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2026 ics = ic; 2027 2028 node_max = a->inode.node_count; 2029 ns = a->inode.size; 2030 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2031 2032 /* If max inode size > 3, split it into two inodes.*/ 2033 /* also map the inode sizes according to the ordering */ 2034 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2035 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2036 if (ns[i] > 3) { 2037 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2038 ++j; 2039 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2040 } else { 2041 tmp_vec1[j] = ns[i]; 2042 } 2043 } 2044 /* Use the correct node_max */ 2045 node_max = j; 2046 2047 /* Now reorder the inode info based on mat re-ordering info */ 2048 /* First create a row -> inode_size_array_index map */ 2049 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2050 for (i = 0, row = 0; i < node_max; i++) { 2051 nodesz = tmp_vec1[i]; 2052 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2053 } 2054 /* Using nsmap, create a reordered ns structure */ 2055 for (i = 0, j = 0; i < node_max; i++) { 2056 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2057 tmp_vec2[i] = nodesz; 2058 j += nodesz; 2059 } 2060 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2061 /* Now use the correct ns */ 2062 ns = tmp_vec2; 2063 2064 do { 2065 sctx.newshift = PETSC_FALSE; 2066 /* Now loop over each block-row, and do the factorization */ 2067 for (i = 0, row = 0; i < node_max; i++) { 2068 nodesz = ns[i]; 2069 nz = bi[row + 1] - bi[row]; 2070 bjtmp = bj + bi[row]; 2071 2072 switch (nodesz) { 2073 case 1: 2074 for (j = 0; j < nz; j++) { 2075 idx = bjtmp[j]; 2076 rtmp11[idx] = 0.0; 2077 } 2078 2079 /* load in initial (unfactored row) */ 2080 idx = r[row]; 2081 nz_tmp = ai[idx + 1] - ai[idx]; 2082 ajtmp = aj + ai[idx]; 2083 v1 = aa + ai[idx]; 2084 2085 for (j = 0; j < nz_tmp; j++) { 2086 idx = ics[ajtmp[j]]; 2087 rtmp11[idx] = v1[j]; 2088 } 2089 rtmp11[ics[r[row]]] += sctx.shift_amount; 2090 2091 prow = *bjtmp++; 2092 while (prow < row) { 2093 pc1 = rtmp11 + prow; 2094 if (*pc1 != 0.0) { 2095 pv = ba + bd[prow]; 2096 pj = nbj + bd[prow]; 2097 mul1 = *pc1 * *pv++; 2098 *pc1 = mul1; 2099 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2100 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2101 for (j = 0; j < nz_tmp; j++) { 2102 tmp = pv[j]; 2103 idx = pj[j]; 2104 rtmp11[idx] -= mul1 * tmp; 2105 } 2106 } 2107 prow = *bjtmp++; 2108 } 2109 pj = bj + bi[row]; 2110 pc1 = ba + bi[row]; 2111 2112 sctx.pv = rtmp11[row]; 2113 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2114 rs = 0.0; 2115 for (j = 0; j < nz; j++) { 2116 idx = pj[j]; 2117 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2118 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2119 } 2120 sctx.rs = rs; 2121 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2122 if (sctx.newshift) goto endofwhile; 2123 break; 2124 2125 case 2: 2126 for (j = 0; j < nz; j++) { 2127 idx = bjtmp[j]; 2128 rtmp11[idx] = 0.0; 2129 rtmp22[idx] = 0.0; 2130 } 2131 2132 /* load in initial (unfactored row) */ 2133 idx = r[row]; 2134 nz_tmp = ai[idx + 1] - ai[idx]; 2135 ajtmp = aj + ai[idx]; 2136 v1 = aa + ai[idx]; 2137 v2 = aa + ai[idx + 1]; 2138 for (j = 0; j < nz_tmp; j++) { 2139 idx = ics[ajtmp[j]]; 2140 rtmp11[idx] = v1[j]; 2141 rtmp22[idx] = v2[j]; 2142 } 2143 rtmp11[ics[r[row]]] += sctx.shift_amount; 2144 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2145 2146 prow = *bjtmp++; 2147 while (prow < row) { 2148 pc1 = rtmp11 + prow; 2149 pc2 = rtmp22 + prow; 2150 if (*pc1 != 0.0 || *pc2 != 0.0) { 2151 pv = ba + bd[prow]; 2152 pj = nbj + bd[prow]; 2153 mul1 = *pc1 * *pv; 2154 mul2 = *pc2 * *pv; 2155 ++pv; 2156 *pc1 = mul1; 2157 *pc2 = mul2; 2158 2159 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2160 for (j = 0; j < nz_tmp; j++) { 2161 tmp = pv[j]; 2162 idx = pj[j]; 2163 rtmp11[idx] -= mul1 * tmp; 2164 rtmp22[idx] -= mul2 * tmp; 2165 } 2166 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2167 } 2168 prow = *bjtmp++; 2169 } 2170 2171 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2172 pc1 = rtmp11 + prow; 2173 pc2 = rtmp22 + prow; 2174 2175 sctx.pv = *pc1; 2176 pj = bj + bi[prow]; 2177 rs = 0.0; 2178 for (j = 0; j < nz; j++) { 2179 idx = pj[j]; 2180 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2181 } 2182 sctx.rs = rs; 2183 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2184 if (sctx.newshift) goto endofwhile; 2185 2186 if (*pc2 != 0.0) { 2187 pj = nbj + bd[prow]; 2188 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2189 *pc2 = mul2; 2190 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2191 for (j = 0; j < nz_tmp; j++) { 2192 idx = pj[j]; 2193 tmp = rtmp11[idx]; 2194 rtmp22[idx] -= mul2 * tmp; 2195 } 2196 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2197 } 2198 2199 pj = bj + bi[row]; 2200 pc1 = ba + bi[row]; 2201 pc2 = ba + bi[row + 1]; 2202 2203 sctx.pv = rtmp22[row + 1]; 2204 rs = 0.0; 2205 rtmp11[row] = 1.0 / rtmp11[row]; 2206 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2207 /* copy row entries from dense representation to sparse */ 2208 for (j = 0; j < nz; j++) { 2209 idx = pj[j]; 2210 pc1[j] = rtmp11[idx]; 2211 pc2[j] = rtmp22[idx]; 2212 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2213 } 2214 sctx.rs = rs; 2215 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2216 if (sctx.newshift) goto endofwhile; 2217 break; 2218 2219 case 3: 2220 for (j = 0; j < nz; j++) { 2221 idx = bjtmp[j]; 2222 rtmp11[idx] = 0.0; 2223 rtmp22[idx] = 0.0; 2224 rtmp33[idx] = 0.0; 2225 } 2226 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2227 idx = r[row]; 2228 nz_tmp = ai[idx + 1] - ai[idx]; 2229 ajtmp = aj + ai[idx]; 2230 v1 = aa + ai[idx]; 2231 v2 = aa + ai[idx + 1]; 2232 v3 = aa + ai[idx + 2]; 2233 for (j = 0; j < nz_tmp; j++) { 2234 idx = ics[ajtmp[j]]; 2235 rtmp11[idx] = v1[j]; 2236 rtmp22[idx] = v2[j]; 2237 rtmp33[idx] = v3[j]; 2238 } 2239 rtmp11[ics[r[row]]] += sctx.shift_amount; 2240 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2241 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2242 2243 /* loop over all pivot row blocks above this row block */ 2244 prow = *bjtmp++; 2245 while (prow < row) { 2246 pc1 = rtmp11 + prow; 2247 pc2 = rtmp22 + prow; 2248 pc3 = rtmp33 + prow; 2249 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2250 pv = ba + bd[prow]; 2251 pj = nbj + bd[prow]; 2252 mul1 = *pc1 * *pv; 2253 mul2 = *pc2 * *pv; 2254 mul3 = *pc3 * *pv; 2255 ++pv; 2256 *pc1 = mul1; 2257 *pc2 = mul2; 2258 *pc3 = mul3; 2259 2260 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2261 /* update this row based on pivot row */ 2262 for (j = 0; j < nz_tmp; j++) { 2263 tmp = pv[j]; 2264 idx = pj[j]; 2265 rtmp11[idx] -= mul1 * tmp; 2266 rtmp22[idx] -= mul2 * tmp; 2267 rtmp33[idx] -= mul3 * tmp; 2268 } 2269 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2270 } 2271 prow = *bjtmp++; 2272 } 2273 2274 /* Now take care of diagonal 3x3 block in this set of rows */ 2275 /* note: prow = row here */ 2276 pc1 = rtmp11 + prow; 2277 pc2 = rtmp22 + prow; 2278 pc3 = rtmp33 + prow; 2279 2280 sctx.pv = *pc1; 2281 pj = bj + bi[prow]; 2282 rs = 0.0; 2283 for (j = 0; j < nz; j++) { 2284 idx = pj[j]; 2285 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2286 } 2287 sctx.rs = rs; 2288 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2289 if (sctx.newshift) goto endofwhile; 2290 2291 if (*pc2 != 0.0 || *pc3 != 0.0) { 2292 mul2 = (*pc2) / (*pc1); 2293 mul3 = (*pc3) / (*pc1); 2294 *pc2 = mul2; 2295 *pc3 = mul3; 2296 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2297 pj = nbj + bd[prow]; 2298 for (j = 0; j < nz_tmp; j++) { 2299 idx = pj[j]; 2300 tmp = rtmp11[idx]; 2301 rtmp22[idx] -= mul2 * tmp; 2302 rtmp33[idx] -= mul3 * tmp; 2303 } 2304 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2305 } 2306 ++prow; 2307 2308 pc2 = rtmp22 + prow; 2309 pc3 = rtmp33 + prow; 2310 sctx.pv = *pc2; 2311 pj = bj + bi[prow]; 2312 rs = 0.0; 2313 for (j = 0; j < nz; j++) { 2314 idx = pj[j]; 2315 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2316 } 2317 sctx.rs = rs; 2318 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2319 if (sctx.newshift) goto endofwhile; 2320 2321 if (*pc3 != 0.0) { 2322 mul3 = (*pc3) / (*pc2); 2323 *pc3 = mul3; 2324 pj = nbj + bd[prow]; 2325 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2326 for (j = 0; j < nz_tmp; j++) { 2327 idx = pj[j]; 2328 tmp = rtmp22[idx]; 2329 rtmp33[idx] -= mul3 * tmp; 2330 } 2331 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2332 } 2333 2334 pj = bj + bi[row]; 2335 pc1 = ba + bi[row]; 2336 pc2 = ba + bi[row + 1]; 2337 pc3 = ba + bi[row + 2]; 2338 2339 sctx.pv = rtmp33[row + 2]; 2340 rs = 0.0; 2341 rtmp11[row] = 1.0 / rtmp11[row]; 2342 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2343 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2344 /* copy row entries from dense representation to sparse */ 2345 for (j = 0; j < nz; j++) { 2346 idx = pj[j]; 2347 pc1[j] = rtmp11[idx]; 2348 pc2[j] = rtmp22[idx]; 2349 pc3[j] = rtmp33[idx]; 2350 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2351 } 2352 2353 sctx.rs = rs; 2354 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2355 if (sctx.newshift) goto endofwhile; 2356 break; 2357 2358 default: 2359 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2360 } 2361 row += nodesz; /* Update the row */ 2362 } 2363 endofwhile:; 2364 } while (sctx.newshift); 2365 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2366 PetscCall(PetscFree(tmp_vec2)); 2367 PetscCall(ISRestoreIndices(isicol, &ic)); 2368 PetscCall(ISRestoreIndices(isrow, &r)); 2369 PetscCall(ISRestoreIndices(iscol, &c)); 2370 2371 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2372 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2373 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2374 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2375 C->assembled = PETSC_TRUE; 2376 C->preallocated = PETSC_TRUE; 2377 if (sctx.nshift) { 2378 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2379 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2380 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2381 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2382 } 2383 } 2384 PetscCall(PetscLogFlops(C->cmap->n)); 2385 PetscCall(MatSeqAIJCheckInode(C)); 2386 PetscFunctionReturn(PETSC_SUCCESS); 2387 } 2388 #endif 2389 2390 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2391 { 2392 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2393 IS iscol = a->col, isrow = a->row; 2394 const PetscInt *r, *c, *rout, *cout; 2395 PetscInt i, j, n = A->rmap->n; 2396 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2397 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2398 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2399 PetscScalar sum1, sum2, sum3, sum4, sum5; 2400 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2401 const PetscScalar *b; 2402 2403 PetscFunctionBegin; 2404 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2405 node_max = a->inode.node_count; 2406 ns = a->inode.size; /* Node Size array */ 2407 2408 PetscCall(VecGetArrayRead(bb, &b)); 2409 PetscCall(VecGetArrayWrite(xx, &x)); 2410 tmp = a->solve_work; 2411 2412 PetscCall(ISGetIndices(isrow, &rout)); 2413 r = rout; 2414 PetscCall(ISGetIndices(iscol, &cout)); 2415 c = cout; 2416 2417 /* forward solve the lower triangular */ 2418 tmps = tmp; 2419 aa = a_a; 2420 aj = a_j; 2421 ad = a->diag; 2422 2423 for (i = 0, row = 0; i < node_max; ++i) { 2424 nsz = ns[i]; 2425 aii = ai[row]; 2426 v1 = aa + aii; 2427 vi = aj + aii; 2428 nz = ai[row + 1] - ai[row]; 2429 2430 if (i < node_max - 1) { 2431 /* Prefetch the indices for the next block */ 2432 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2433 /* Prefetch the data for the next block */ 2434 PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2435 } 2436 2437 switch (nsz) { /* Each loop in 'case' is unrolled */ 2438 case 1: 2439 sum1 = b[r[row]]; 2440 for (j = 0; j < nz - 1; j += 2) { 2441 i0 = vi[j]; 2442 i1 = vi[j + 1]; 2443 tmp0 = tmps[i0]; 2444 tmp1 = tmps[i1]; 2445 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2446 } 2447 if (j == nz - 1) { 2448 tmp0 = tmps[vi[j]]; 2449 sum1 -= v1[j] * tmp0; 2450 } 2451 tmp[row++] = sum1; 2452 break; 2453 case 2: 2454 sum1 = b[r[row]]; 2455 sum2 = b[r[row + 1]]; 2456 v2 = aa + ai[row + 1]; 2457 2458 for (j = 0; j < nz - 1; j += 2) { 2459 i0 = vi[j]; 2460 i1 = vi[j + 1]; 2461 tmp0 = tmps[i0]; 2462 tmp1 = tmps[i1]; 2463 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2464 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2465 } 2466 if (j == nz - 1) { 2467 tmp0 = tmps[vi[j]]; 2468 sum1 -= v1[j] * tmp0; 2469 sum2 -= v2[j] * tmp0; 2470 } 2471 sum2 -= v2[nz] * sum1; 2472 tmp[row++] = sum1; 2473 tmp[row++] = sum2; 2474 break; 2475 case 3: 2476 sum1 = b[r[row]]; 2477 sum2 = b[r[row + 1]]; 2478 sum3 = b[r[row + 2]]; 2479 v2 = aa + ai[row + 1]; 2480 v3 = aa + ai[row + 2]; 2481 2482 for (j = 0; j < nz - 1; j += 2) { 2483 i0 = vi[j]; 2484 i1 = vi[j + 1]; 2485 tmp0 = tmps[i0]; 2486 tmp1 = tmps[i1]; 2487 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2488 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2489 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2490 } 2491 if (j == nz - 1) { 2492 tmp0 = tmps[vi[j]]; 2493 sum1 -= v1[j] * tmp0; 2494 sum2 -= v2[j] * tmp0; 2495 sum3 -= v3[j] * tmp0; 2496 } 2497 sum2 -= v2[nz] * sum1; 2498 sum3 -= v3[nz] * sum1; 2499 sum3 -= v3[nz + 1] * sum2; 2500 tmp[row++] = sum1; 2501 tmp[row++] = sum2; 2502 tmp[row++] = sum3; 2503 break; 2504 2505 case 4: 2506 sum1 = b[r[row]]; 2507 sum2 = b[r[row + 1]]; 2508 sum3 = b[r[row + 2]]; 2509 sum4 = b[r[row + 3]]; 2510 v2 = aa + ai[row + 1]; 2511 v3 = aa + ai[row + 2]; 2512 v4 = aa + ai[row + 3]; 2513 2514 for (j = 0; j < nz - 1; j += 2) { 2515 i0 = vi[j]; 2516 i1 = vi[j + 1]; 2517 tmp0 = tmps[i0]; 2518 tmp1 = tmps[i1]; 2519 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2520 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2521 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2522 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2523 } 2524 if (j == nz - 1) { 2525 tmp0 = tmps[vi[j]]; 2526 sum1 -= v1[j] * tmp0; 2527 sum2 -= v2[j] * tmp0; 2528 sum3 -= v3[j] * tmp0; 2529 sum4 -= v4[j] * tmp0; 2530 } 2531 sum2 -= v2[nz] * sum1; 2532 sum3 -= v3[nz] * sum1; 2533 sum4 -= v4[nz] * sum1; 2534 sum3 -= v3[nz + 1] * sum2; 2535 sum4 -= v4[nz + 1] * sum2; 2536 sum4 -= v4[nz + 2] * sum3; 2537 2538 tmp[row++] = sum1; 2539 tmp[row++] = sum2; 2540 tmp[row++] = sum3; 2541 tmp[row++] = sum4; 2542 break; 2543 case 5: 2544 sum1 = b[r[row]]; 2545 sum2 = b[r[row + 1]]; 2546 sum3 = b[r[row + 2]]; 2547 sum4 = b[r[row + 3]]; 2548 sum5 = b[r[row + 4]]; 2549 v2 = aa + ai[row + 1]; 2550 v3 = aa + ai[row + 2]; 2551 v4 = aa + ai[row + 3]; 2552 v5 = aa + ai[row + 4]; 2553 2554 for (j = 0; j < nz - 1; j += 2) { 2555 i0 = vi[j]; 2556 i1 = vi[j + 1]; 2557 tmp0 = tmps[i0]; 2558 tmp1 = tmps[i1]; 2559 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2560 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2561 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2562 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2563 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2564 } 2565 if (j == nz - 1) { 2566 tmp0 = tmps[vi[j]]; 2567 sum1 -= v1[j] * tmp0; 2568 sum2 -= v2[j] * tmp0; 2569 sum3 -= v3[j] * tmp0; 2570 sum4 -= v4[j] * tmp0; 2571 sum5 -= v5[j] * tmp0; 2572 } 2573 2574 sum2 -= v2[nz] * sum1; 2575 sum3 -= v3[nz] * sum1; 2576 sum4 -= v4[nz] * sum1; 2577 sum5 -= v5[nz] * sum1; 2578 sum3 -= v3[nz + 1] * sum2; 2579 sum4 -= v4[nz + 1] * sum2; 2580 sum5 -= v5[nz + 1] * sum2; 2581 sum4 -= v4[nz + 2] * sum3; 2582 sum5 -= v5[nz + 2] * sum3; 2583 sum5 -= v5[nz + 3] * sum4; 2584 2585 tmp[row++] = sum1; 2586 tmp[row++] = sum2; 2587 tmp[row++] = sum3; 2588 tmp[row++] = sum4; 2589 tmp[row++] = sum5; 2590 break; 2591 default: 2592 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2593 } 2594 } 2595 /* backward solve the upper triangular */ 2596 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 2597 nsz = ns[i]; 2598 aii = ad[row + 1] + 1; 2599 v1 = aa + aii; 2600 vi = aj + aii; 2601 nz = ad[row] - ad[row + 1] - 1; 2602 2603 if (i > 0) { 2604 /* Prefetch the indices for the next block */ 2605 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2606 /* Prefetch the data for the next block */ 2607 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2608 } 2609 2610 switch (nsz) { /* Each loop in 'case' is unrolled */ 2611 case 1: 2612 sum1 = tmp[row]; 2613 2614 for (j = 0; j < nz - 1; j += 2) { 2615 i0 = vi[j]; 2616 i1 = vi[j + 1]; 2617 tmp0 = tmps[i0]; 2618 tmp1 = tmps[i1]; 2619 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2620 } 2621 if (j == nz - 1) { 2622 tmp0 = tmps[vi[j]]; 2623 sum1 -= v1[j] * tmp0; 2624 } 2625 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2626 row--; 2627 break; 2628 case 2: 2629 sum1 = tmp[row]; 2630 sum2 = tmp[row - 1]; 2631 v2 = aa + ad[row] + 1; 2632 for (j = 0; j < nz - 1; j += 2) { 2633 i0 = vi[j]; 2634 i1 = vi[j + 1]; 2635 tmp0 = tmps[i0]; 2636 tmp1 = tmps[i1]; 2637 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2638 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2639 } 2640 if (j == nz - 1) { 2641 tmp0 = tmps[vi[j]]; 2642 sum1 -= v1[j] * tmp0; 2643 sum2 -= v2[j + 1] * tmp0; 2644 } 2645 2646 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2647 row--; 2648 sum2 -= v2[0] * tmp0; 2649 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2650 row--; 2651 break; 2652 case 3: 2653 sum1 = tmp[row]; 2654 sum2 = tmp[row - 1]; 2655 sum3 = tmp[row - 2]; 2656 v2 = aa + ad[row] + 1; 2657 v3 = aa + ad[row - 1] + 1; 2658 for (j = 0; j < nz - 1; j += 2) { 2659 i0 = vi[j]; 2660 i1 = vi[j + 1]; 2661 tmp0 = tmps[i0]; 2662 tmp1 = tmps[i1]; 2663 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2664 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2665 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2666 } 2667 if (j == nz - 1) { 2668 tmp0 = tmps[vi[j]]; 2669 sum1 -= v1[j] * tmp0; 2670 sum2 -= v2[j + 1] * tmp0; 2671 sum3 -= v3[j + 2] * tmp0; 2672 } 2673 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2674 row--; 2675 sum2 -= v2[0] * tmp0; 2676 sum3 -= v3[1] * tmp0; 2677 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2678 row--; 2679 sum3 -= v3[0] * tmp0; 2680 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2681 row--; 2682 2683 break; 2684 case 4: 2685 sum1 = tmp[row]; 2686 sum2 = tmp[row - 1]; 2687 sum3 = tmp[row - 2]; 2688 sum4 = tmp[row - 3]; 2689 v2 = aa + ad[row] + 1; 2690 v3 = aa + ad[row - 1] + 1; 2691 v4 = aa + ad[row - 2] + 1; 2692 2693 for (j = 0; j < nz - 1; j += 2) { 2694 i0 = vi[j]; 2695 i1 = vi[j + 1]; 2696 tmp0 = tmps[i0]; 2697 tmp1 = tmps[i1]; 2698 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2699 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2700 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2701 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2702 } 2703 if (j == nz - 1) { 2704 tmp0 = tmps[vi[j]]; 2705 sum1 -= v1[j] * tmp0; 2706 sum2 -= v2[j + 1] * tmp0; 2707 sum3 -= v3[j + 2] * tmp0; 2708 sum4 -= v4[j + 3] * tmp0; 2709 } 2710 2711 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2712 row--; 2713 sum2 -= v2[0] * tmp0; 2714 sum3 -= v3[1] * tmp0; 2715 sum4 -= v4[2] * tmp0; 2716 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2717 row--; 2718 sum3 -= v3[0] * tmp0; 2719 sum4 -= v4[1] * tmp0; 2720 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2721 row--; 2722 sum4 -= v4[0] * tmp0; 2723 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2724 row--; 2725 break; 2726 case 5: 2727 sum1 = tmp[row]; 2728 sum2 = tmp[row - 1]; 2729 sum3 = tmp[row - 2]; 2730 sum4 = tmp[row - 3]; 2731 sum5 = tmp[row - 4]; 2732 v2 = aa + ad[row] + 1; 2733 v3 = aa + ad[row - 1] + 1; 2734 v4 = aa + ad[row - 2] + 1; 2735 v5 = aa + ad[row - 3] + 1; 2736 for (j = 0; j < nz - 1; j += 2) { 2737 i0 = vi[j]; 2738 i1 = vi[j + 1]; 2739 tmp0 = tmps[i0]; 2740 tmp1 = tmps[i1]; 2741 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2742 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2743 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2744 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2745 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2746 } 2747 if (j == nz - 1) { 2748 tmp0 = tmps[vi[j]]; 2749 sum1 -= v1[j] * tmp0; 2750 sum2 -= v2[j + 1] * tmp0; 2751 sum3 -= v3[j + 2] * tmp0; 2752 sum4 -= v4[j + 3] * tmp0; 2753 sum5 -= v5[j + 4] * tmp0; 2754 } 2755 2756 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2757 row--; 2758 sum2 -= v2[0] * tmp0; 2759 sum3 -= v3[1] * tmp0; 2760 sum4 -= v4[2] * tmp0; 2761 sum5 -= v5[3] * tmp0; 2762 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2763 row--; 2764 sum3 -= v3[0] * tmp0; 2765 sum4 -= v4[1] * tmp0; 2766 sum5 -= v5[2] * tmp0; 2767 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2768 row--; 2769 sum4 -= v4[0] * tmp0; 2770 sum5 -= v5[1] * tmp0; 2771 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2772 row--; 2773 sum5 -= v5[0] * tmp0; 2774 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2775 row--; 2776 break; 2777 default: 2778 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2779 } 2780 } 2781 PetscCall(ISRestoreIndices(isrow, &rout)); 2782 PetscCall(ISRestoreIndices(iscol, &cout)); 2783 PetscCall(VecRestoreArrayRead(bb, &b)); 2784 PetscCall(VecRestoreArrayWrite(xx, &x)); 2785 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2786 PetscFunctionReturn(PETSC_SUCCESS); 2787 } 2788 2789 /* 2790 Makes a longer coloring[] array and calls the usual code with that 2791 */ 2792 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2793 { 2794 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2795 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row; 2796 PetscInt *colorused, i; 2797 ISColoringValue *newcolor; 2798 2799 PetscFunctionBegin; 2800 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2801 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2802 /* loop over inodes, marking a color for each column*/ 2803 row = 0; 2804 for (i = 0; i < m; i++) { 2805 for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors; 2806 } 2807 2808 /* eliminate unneeded colors */ 2809 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2810 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2811 2812 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2813 ncolors = colorused[5 * ncolors - 1]; 2814 for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1; 2815 PetscCall(PetscFree(colorused)); 2816 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2817 PetscCall(PetscFree(coloring)); 2818 PetscFunctionReturn(PETSC_SUCCESS); 2819 } 2820 2821 #include <petsc/private/kernels/blockinvert.h> 2822 2823 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2824 { 2825 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2826 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2827 MatScalar *ibdiag, *bdiag, work[25], *t; 2828 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2829 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2830 const PetscScalar *xb, *b; 2831 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2832 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2; 2833 PetscInt sz, k, ipvt[5]; 2834 PetscBool allowzeropivot, zeropivotdetected; 2835 const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i; 2836 2837 PetscFunctionBegin; 2838 allowzeropivot = PetscNot(A->erroriffailure); 2839 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2840 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2841 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2842 2843 if (!a->inode.ibdiagvalid) { 2844 if (!a->inode.ibdiag) { 2845 /* calculate space needed for diagonal blocks */ 2846 for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i]; 2847 a->inode.bdiagsize = cnt; 2848 2849 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2850 } 2851 2852 /* copy over the diagonal blocks and invert them */ 2853 ibdiag = a->inode.ibdiag; 2854 bdiag = a->inode.bdiag; 2855 cnt = 0; 2856 for (i = 0, row = 0; i < m; i++) { 2857 for (j = 0; j < sizes[i]; j++) { 2858 for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k]; 2859 } 2860 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i])); 2861 2862 switch (sizes[i]) { 2863 case 1: 2864 /* Create matrix data structure */ 2865 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2866 if (allowzeropivot) { 2867 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2868 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2869 A->factorerror_zeropivot_row = row; 2870 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2871 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2872 } 2873 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2874 break; 2875 case 2: 2876 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2877 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2878 break; 2879 case 3: 2880 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2881 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2882 break; 2883 case 4: 2884 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2885 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2886 break; 2887 case 5: 2888 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2889 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2890 break; 2891 default: 2892 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 2893 } 2894 cnt += sizes[i] * sizes[i]; 2895 row += sizes[i]; 2896 } 2897 a->inode.ibdiagvalid = PETSC_TRUE; 2898 } 2899 ibdiag = a->inode.ibdiag; 2900 bdiag = a->inode.bdiag; 2901 t = a->inode.ssor_work; 2902 2903 PetscCall(VecGetArray(xx, &x)); 2904 PetscCall(VecGetArrayRead(bb, &b)); 2905 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2906 if (flag & SOR_ZERO_INITIAL_GUESS) { 2907 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2908 for (i = 0, row = 0; i < m; i++) { 2909 sz = diag[row] - ii[row]; 2910 v1 = a->a + ii[row]; 2911 idx = a->j + ii[row]; 2912 2913 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2914 switch (sizes[i]) { 2915 case 1: 2916 2917 sum1 = b[row]; 2918 for (n = 0; n < sz - 1; n += 2) { 2919 i1 = idx[0]; 2920 i2 = idx[1]; 2921 idx += 2; 2922 tmp0 = x[i1]; 2923 tmp1 = x[i2]; 2924 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2925 v1 += 2; 2926 } 2927 2928 if (n == sz - 1) { 2929 tmp0 = x[*idx]; 2930 sum1 -= *v1 * tmp0; 2931 } 2932 t[row] = sum1; 2933 x[row++] = sum1 * (*ibdiag++); 2934 break; 2935 case 2: 2936 v2 = a->a + ii[row + 1]; 2937 sum1 = b[row]; 2938 sum2 = b[row + 1]; 2939 for (n = 0; n < sz - 1; n += 2) { 2940 i1 = idx[0]; 2941 i2 = idx[1]; 2942 idx += 2; 2943 tmp0 = x[i1]; 2944 tmp1 = x[i2]; 2945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2946 v1 += 2; 2947 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2948 v2 += 2; 2949 } 2950 2951 if (n == sz - 1) { 2952 tmp0 = x[*idx]; 2953 sum1 -= v1[0] * tmp0; 2954 sum2 -= v2[0] * tmp0; 2955 } 2956 t[row] = sum1; 2957 t[row + 1] = sum2; 2958 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2959 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2960 ibdiag += 4; 2961 break; 2962 case 3: 2963 v2 = a->a + ii[row + 1]; 2964 v3 = a->a + ii[row + 2]; 2965 sum1 = b[row]; 2966 sum2 = b[row + 1]; 2967 sum3 = b[row + 2]; 2968 for (n = 0; n < sz - 1; n += 2) { 2969 i1 = idx[0]; 2970 i2 = idx[1]; 2971 idx += 2; 2972 tmp0 = x[i1]; 2973 tmp1 = x[i2]; 2974 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2975 v1 += 2; 2976 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2977 v2 += 2; 2978 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2979 v3 += 2; 2980 } 2981 2982 if (n == sz - 1) { 2983 tmp0 = x[*idx]; 2984 sum1 -= v1[0] * tmp0; 2985 sum2 -= v2[0] * tmp0; 2986 sum3 -= v3[0] * tmp0; 2987 } 2988 t[row] = sum1; 2989 t[row + 1] = sum2; 2990 t[row + 2] = sum3; 2991 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 2992 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 2993 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 2994 ibdiag += 9; 2995 break; 2996 case 4: 2997 v2 = a->a + ii[row + 1]; 2998 v3 = a->a + ii[row + 2]; 2999 v4 = a->a + ii[row + 3]; 3000 sum1 = b[row]; 3001 sum2 = b[row + 1]; 3002 sum3 = b[row + 2]; 3003 sum4 = b[row + 3]; 3004 for (n = 0; n < sz - 1; n += 2) { 3005 i1 = idx[0]; 3006 i2 = idx[1]; 3007 idx += 2; 3008 tmp0 = x[i1]; 3009 tmp1 = x[i2]; 3010 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3011 v1 += 2; 3012 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3013 v2 += 2; 3014 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3015 v3 += 2; 3016 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3017 v4 += 2; 3018 } 3019 3020 if (n == sz - 1) { 3021 tmp0 = x[*idx]; 3022 sum1 -= v1[0] * tmp0; 3023 sum2 -= v2[0] * tmp0; 3024 sum3 -= v3[0] * tmp0; 3025 sum4 -= v4[0] * tmp0; 3026 } 3027 t[row] = sum1; 3028 t[row + 1] = sum2; 3029 t[row + 2] = sum3; 3030 t[row + 3] = sum4; 3031 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3032 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3033 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3034 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3035 ibdiag += 16; 3036 break; 3037 case 5: 3038 v2 = a->a + ii[row + 1]; 3039 v3 = a->a + ii[row + 2]; 3040 v4 = a->a + ii[row + 3]; 3041 v5 = a->a + ii[row + 4]; 3042 sum1 = b[row]; 3043 sum2 = b[row + 1]; 3044 sum3 = b[row + 2]; 3045 sum4 = b[row + 3]; 3046 sum5 = b[row + 4]; 3047 for (n = 0; n < sz - 1; n += 2) { 3048 i1 = idx[0]; 3049 i2 = idx[1]; 3050 idx += 2; 3051 tmp0 = x[i1]; 3052 tmp1 = x[i2]; 3053 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3054 v1 += 2; 3055 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3056 v2 += 2; 3057 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3058 v3 += 2; 3059 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3060 v4 += 2; 3061 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3062 v5 += 2; 3063 } 3064 3065 if (n == sz - 1) { 3066 tmp0 = x[*idx]; 3067 sum1 -= v1[0] * tmp0; 3068 sum2 -= v2[0] * tmp0; 3069 sum3 -= v3[0] * tmp0; 3070 sum4 -= v4[0] * tmp0; 3071 sum5 -= v5[0] * tmp0; 3072 } 3073 t[row] = sum1; 3074 t[row + 1] = sum2; 3075 t[row + 2] = sum3; 3076 t[row + 3] = sum4; 3077 t[row + 4] = sum5; 3078 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3079 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3080 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3081 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3082 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3083 ibdiag += 25; 3084 break; 3085 default: 3086 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3087 } 3088 } 3089 3090 xb = t; 3091 PetscCall(PetscLogFlops(a->nz)); 3092 } else xb = b; 3093 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3094 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3095 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3096 ibdiag -= sizes[i] * sizes[i]; 3097 sz = ii[row + 1] - diag[row] - 1; 3098 v1 = a->a + diag[row] + 1; 3099 idx = a->j + diag[row] + 1; 3100 3101 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3102 switch (sizes[i]) { 3103 case 1: 3104 3105 sum1 = xb[row]; 3106 for (n = 0; n < sz - 1; n += 2) { 3107 i1 = idx[0]; 3108 i2 = idx[1]; 3109 idx += 2; 3110 tmp0 = x[i1]; 3111 tmp1 = x[i2]; 3112 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3113 v1 += 2; 3114 } 3115 3116 if (n == sz - 1) { 3117 tmp0 = x[*idx]; 3118 sum1 -= *v1 * tmp0; 3119 } 3120 x[row--] = sum1 * (*ibdiag); 3121 break; 3122 3123 case 2: 3124 3125 sum1 = xb[row]; 3126 sum2 = xb[row - 1]; 3127 /* note that sum1 is associated with the second of the two rows */ 3128 v2 = a->a + diag[row - 1] + 2; 3129 for (n = 0; n < sz - 1; n += 2) { 3130 i1 = idx[0]; 3131 i2 = idx[1]; 3132 idx += 2; 3133 tmp0 = x[i1]; 3134 tmp1 = x[i2]; 3135 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3136 v1 += 2; 3137 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3138 v2 += 2; 3139 } 3140 3141 if (n == sz - 1) { 3142 tmp0 = x[*idx]; 3143 sum1 -= *v1 * tmp0; 3144 sum2 -= *v2 * tmp0; 3145 } 3146 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3147 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3148 break; 3149 case 3: 3150 3151 sum1 = xb[row]; 3152 sum2 = xb[row - 1]; 3153 sum3 = xb[row - 2]; 3154 v2 = a->a + diag[row - 1] + 2; 3155 v3 = a->a + diag[row - 2] + 3; 3156 for (n = 0; n < sz - 1; n += 2) { 3157 i1 = idx[0]; 3158 i2 = idx[1]; 3159 idx += 2; 3160 tmp0 = x[i1]; 3161 tmp1 = x[i2]; 3162 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3163 v1 += 2; 3164 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3165 v2 += 2; 3166 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3167 v3 += 2; 3168 } 3169 3170 if (n == sz - 1) { 3171 tmp0 = x[*idx]; 3172 sum1 -= *v1 * tmp0; 3173 sum2 -= *v2 * tmp0; 3174 sum3 -= *v3 * tmp0; 3175 } 3176 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3177 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3178 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3179 break; 3180 case 4: 3181 3182 sum1 = xb[row]; 3183 sum2 = xb[row - 1]; 3184 sum3 = xb[row - 2]; 3185 sum4 = xb[row - 3]; 3186 v2 = a->a + diag[row - 1] + 2; 3187 v3 = a->a + diag[row - 2] + 3; 3188 v4 = a->a + diag[row - 3] + 4; 3189 for (n = 0; n < sz - 1; n += 2) { 3190 i1 = idx[0]; 3191 i2 = idx[1]; 3192 idx += 2; 3193 tmp0 = x[i1]; 3194 tmp1 = x[i2]; 3195 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3196 v1 += 2; 3197 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3198 v2 += 2; 3199 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3200 v3 += 2; 3201 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3202 v4 += 2; 3203 } 3204 3205 if (n == sz - 1) { 3206 tmp0 = x[*idx]; 3207 sum1 -= *v1 * tmp0; 3208 sum2 -= *v2 * tmp0; 3209 sum3 -= *v3 * tmp0; 3210 sum4 -= *v4 * tmp0; 3211 } 3212 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3213 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3214 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3215 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3216 break; 3217 case 5: 3218 3219 sum1 = xb[row]; 3220 sum2 = xb[row - 1]; 3221 sum3 = xb[row - 2]; 3222 sum4 = xb[row - 3]; 3223 sum5 = xb[row - 4]; 3224 v2 = a->a + diag[row - 1] + 2; 3225 v3 = a->a + diag[row - 2] + 3; 3226 v4 = a->a + diag[row - 3] + 4; 3227 v5 = a->a + diag[row - 4] + 5; 3228 for (n = 0; n < sz - 1; n += 2) { 3229 i1 = idx[0]; 3230 i2 = idx[1]; 3231 idx += 2; 3232 tmp0 = x[i1]; 3233 tmp1 = x[i2]; 3234 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3235 v1 += 2; 3236 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3237 v2 += 2; 3238 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3239 v3 += 2; 3240 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3241 v4 += 2; 3242 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3243 v5 += 2; 3244 } 3245 3246 if (n == sz - 1) { 3247 tmp0 = x[*idx]; 3248 sum1 -= *v1 * tmp0; 3249 sum2 -= *v2 * tmp0; 3250 sum3 -= *v3 * tmp0; 3251 sum4 -= *v4 * tmp0; 3252 sum5 -= *v5 * tmp0; 3253 } 3254 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3255 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3256 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3257 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3258 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3259 break; 3260 default: 3261 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3262 } 3263 } 3264 3265 PetscCall(PetscLogFlops(a->nz)); 3266 } 3267 its--; 3268 } 3269 while (its--) { 3270 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3271 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) { 3272 sz = diag[row] - ii[row]; 3273 v1 = a->a + ii[row]; 3274 idx = a->j + ii[row]; 3275 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3276 switch (sizes[i]) { 3277 case 1: 3278 sum1 = b[row]; 3279 for (n = 0; n < sz - 1; n += 2) { 3280 i1 = idx[0]; 3281 i2 = idx[1]; 3282 idx += 2; 3283 tmp0 = x[i1]; 3284 tmp1 = x[i2]; 3285 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3286 v1 += 2; 3287 } 3288 if (n == sz - 1) { 3289 tmp0 = x[*idx++]; 3290 sum1 -= *v1 * tmp0; 3291 v1++; 3292 } 3293 t[row] = sum1; 3294 sz = ii[row + 1] - diag[row] - 1; 3295 idx = a->j + diag[row] + 1; 3296 v1 += 1; 3297 for (n = 0; n < sz - 1; n += 2) { 3298 i1 = idx[0]; 3299 i2 = idx[1]; 3300 idx += 2; 3301 tmp0 = x[i1]; 3302 tmp1 = x[i2]; 3303 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3304 v1 += 2; 3305 } 3306 if (n == sz - 1) { 3307 tmp0 = x[*idx++]; 3308 sum1 -= *v1 * tmp0; 3309 } 3310 /* in MatSOR_SeqAIJ this line would be 3311 * 3312 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3313 * 3314 * but omega == 1, so this becomes 3315 * 3316 * x[row] = sum1*(*ibdiag++); 3317 * 3318 */ 3319 x[row] = sum1 * (*ibdiag); 3320 break; 3321 case 2: 3322 v2 = a->a + ii[row + 1]; 3323 sum1 = b[row]; 3324 sum2 = b[row + 1]; 3325 for (n = 0; n < sz - 1; n += 2) { 3326 i1 = idx[0]; 3327 i2 = idx[1]; 3328 idx += 2; 3329 tmp0 = x[i1]; 3330 tmp1 = x[i2]; 3331 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3332 v1 += 2; 3333 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3334 v2 += 2; 3335 } 3336 if (n == sz - 1) { 3337 tmp0 = x[*idx++]; 3338 sum1 -= v1[0] * tmp0; 3339 sum2 -= v2[0] * tmp0; 3340 v1++; 3341 v2++; 3342 } 3343 t[row] = sum1; 3344 t[row + 1] = sum2; 3345 sz = ii[row + 1] - diag[row] - 2; 3346 idx = a->j + diag[row] + 2; 3347 v1 += 2; 3348 v2 += 2; 3349 for (n = 0; n < sz - 1; n += 2) { 3350 i1 = idx[0]; 3351 i2 = idx[1]; 3352 idx += 2; 3353 tmp0 = x[i1]; 3354 tmp1 = x[i2]; 3355 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3356 v1 += 2; 3357 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3358 v2 += 2; 3359 } 3360 if (n == sz - 1) { 3361 tmp0 = x[*idx]; 3362 sum1 -= v1[0] * tmp0; 3363 sum2 -= v2[0] * tmp0; 3364 } 3365 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3366 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3367 break; 3368 case 3: 3369 v2 = a->a + ii[row + 1]; 3370 v3 = a->a + ii[row + 2]; 3371 sum1 = b[row]; 3372 sum2 = b[row + 1]; 3373 sum3 = b[row + 2]; 3374 for (n = 0; n < sz - 1; n += 2) { 3375 i1 = idx[0]; 3376 i2 = idx[1]; 3377 idx += 2; 3378 tmp0 = x[i1]; 3379 tmp1 = x[i2]; 3380 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3381 v1 += 2; 3382 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3383 v2 += 2; 3384 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3385 v3 += 2; 3386 } 3387 if (n == sz - 1) { 3388 tmp0 = x[*idx++]; 3389 sum1 -= v1[0] * tmp0; 3390 sum2 -= v2[0] * tmp0; 3391 sum3 -= v3[0] * tmp0; 3392 v1++; 3393 v2++; 3394 v3++; 3395 } 3396 t[row] = sum1; 3397 t[row + 1] = sum2; 3398 t[row + 2] = sum3; 3399 sz = ii[row + 1] - diag[row] - 3; 3400 idx = a->j + diag[row] + 3; 3401 v1 += 3; 3402 v2 += 3; 3403 v3 += 3; 3404 for (n = 0; n < sz - 1; n += 2) { 3405 i1 = idx[0]; 3406 i2 = idx[1]; 3407 idx += 2; 3408 tmp0 = x[i1]; 3409 tmp1 = x[i2]; 3410 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3411 v1 += 2; 3412 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3413 v2 += 2; 3414 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3415 v3 += 2; 3416 } 3417 if (n == sz - 1) { 3418 tmp0 = x[*idx]; 3419 sum1 -= v1[0] * tmp0; 3420 sum2 -= v2[0] * tmp0; 3421 sum3 -= v3[0] * tmp0; 3422 } 3423 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3424 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3425 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3426 break; 3427 case 4: 3428 v2 = a->a + ii[row + 1]; 3429 v3 = a->a + ii[row + 2]; 3430 v4 = a->a + ii[row + 3]; 3431 sum1 = b[row]; 3432 sum2 = b[row + 1]; 3433 sum3 = b[row + 2]; 3434 sum4 = b[row + 3]; 3435 for (n = 0; n < sz - 1; n += 2) { 3436 i1 = idx[0]; 3437 i2 = idx[1]; 3438 idx += 2; 3439 tmp0 = x[i1]; 3440 tmp1 = x[i2]; 3441 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3442 v1 += 2; 3443 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3444 v2 += 2; 3445 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3446 v3 += 2; 3447 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3448 v4 += 2; 3449 } 3450 if (n == sz - 1) { 3451 tmp0 = x[*idx++]; 3452 sum1 -= v1[0] * tmp0; 3453 sum2 -= v2[0] * tmp0; 3454 sum3 -= v3[0] * tmp0; 3455 sum4 -= v4[0] * tmp0; 3456 v1++; 3457 v2++; 3458 v3++; 3459 v4++; 3460 } 3461 t[row] = sum1; 3462 t[row + 1] = sum2; 3463 t[row + 2] = sum3; 3464 t[row + 3] = sum4; 3465 sz = ii[row + 1] - diag[row] - 4; 3466 idx = a->j + diag[row] + 4; 3467 v1 += 4; 3468 v2 += 4; 3469 v3 += 4; 3470 v4 += 4; 3471 for (n = 0; n < sz - 1; n += 2) { 3472 i1 = idx[0]; 3473 i2 = idx[1]; 3474 idx += 2; 3475 tmp0 = x[i1]; 3476 tmp1 = x[i2]; 3477 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3478 v1 += 2; 3479 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3480 v2 += 2; 3481 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3482 v3 += 2; 3483 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3484 v4 += 2; 3485 } 3486 if (n == sz - 1) { 3487 tmp0 = x[*idx]; 3488 sum1 -= v1[0] * tmp0; 3489 sum2 -= v2[0] * tmp0; 3490 sum3 -= v3[0] * tmp0; 3491 sum4 -= v4[0] * tmp0; 3492 } 3493 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3494 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3495 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3496 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3497 break; 3498 case 5: 3499 v2 = a->a + ii[row + 1]; 3500 v3 = a->a + ii[row + 2]; 3501 v4 = a->a + ii[row + 3]; 3502 v5 = a->a + ii[row + 4]; 3503 sum1 = b[row]; 3504 sum2 = b[row + 1]; 3505 sum3 = b[row + 2]; 3506 sum4 = b[row + 3]; 3507 sum5 = b[row + 4]; 3508 for (n = 0; n < sz - 1; n += 2) { 3509 i1 = idx[0]; 3510 i2 = idx[1]; 3511 idx += 2; 3512 tmp0 = x[i1]; 3513 tmp1 = x[i2]; 3514 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3515 v1 += 2; 3516 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3517 v2 += 2; 3518 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3519 v3 += 2; 3520 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3521 v4 += 2; 3522 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3523 v5 += 2; 3524 } 3525 if (n == sz - 1) { 3526 tmp0 = x[*idx++]; 3527 sum1 -= v1[0] * tmp0; 3528 sum2 -= v2[0] * tmp0; 3529 sum3 -= v3[0] * tmp0; 3530 sum4 -= v4[0] * tmp0; 3531 sum5 -= v5[0] * tmp0; 3532 v1++; 3533 v2++; 3534 v3++; 3535 v4++; 3536 v5++; 3537 } 3538 t[row] = sum1; 3539 t[row + 1] = sum2; 3540 t[row + 2] = sum3; 3541 t[row + 3] = sum4; 3542 t[row + 4] = sum5; 3543 sz = ii[row + 1] - diag[row] - 5; 3544 idx = a->j + diag[row] + 5; 3545 v1 += 5; 3546 v2 += 5; 3547 v3 += 5; 3548 v4 += 5; 3549 v5 += 5; 3550 for (n = 0; n < sz - 1; n += 2) { 3551 i1 = idx[0]; 3552 i2 = idx[1]; 3553 idx += 2; 3554 tmp0 = x[i1]; 3555 tmp1 = x[i2]; 3556 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3557 v1 += 2; 3558 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3559 v2 += 2; 3560 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3561 v3 += 2; 3562 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3563 v4 += 2; 3564 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3565 v5 += 2; 3566 } 3567 if (n == sz - 1) { 3568 tmp0 = x[*idx]; 3569 sum1 -= v1[0] * tmp0; 3570 sum2 -= v2[0] * tmp0; 3571 sum3 -= v3[0] * tmp0; 3572 sum4 -= v4[0] * tmp0; 3573 sum5 -= v5[0] * tmp0; 3574 } 3575 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3576 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3577 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3578 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3579 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3580 break; 3581 default: 3582 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3583 } 3584 } 3585 xb = t; 3586 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3587 } else xb = b; 3588 3589 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3590 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3591 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3592 ibdiag -= sizes[i] * sizes[i]; 3593 3594 /* set RHS */ 3595 if (xb == b) { 3596 /* whole (old way) */ 3597 sz = ii[row + 1] - ii[row]; 3598 idx = a->j + ii[row]; 3599 switch (sizes[i]) { 3600 case 5: 3601 v5 = a->a + ii[row - 4]; /* fall through */ 3602 case 4: 3603 v4 = a->a + ii[row - 3]; /* fall through */ 3604 case 3: 3605 v3 = a->a + ii[row - 2]; /* fall through */ 3606 case 2: 3607 v2 = a->a + ii[row - 1]; /* fall through */ 3608 case 1: 3609 v1 = a->a + ii[row]; 3610 break; 3611 default: 3612 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3613 } 3614 } else { 3615 /* upper, no diag */ 3616 sz = ii[row + 1] - diag[row] - 1; 3617 idx = a->j + diag[row] + 1; 3618 switch (sizes[i]) { 3619 case 5: 3620 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3621 case 4: 3622 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3623 case 3: 3624 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3625 case 2: 3626 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3627 case 1: 3628 v1 = a->a + diag[row] + 1; 3629 } 3630 } 3631 /* set sum */ 3632 switch (sizes[i]) { 3633 case 5: 3634 sum5 = xb[row - 4]; /* fall through */ 3635 case 4: 3636 sum4 = xb[row - 3]; /* fall through */ 3637 case 3: 3638 sum3 = xb[row - 2]; /* fall through */ 3639 case 2: 3640 sum2 = xb[row - 1]; /* fall through */ 3641 case 1: 3642 /* note that sum1 is associated with the last row */ 3643 sum1 = xb[row]; 3644 } 3645 /* do sums */ 3646 for (n = 0; n < sz - 1; n += 2) { 3647 i1 = idx[0]; 3648 i2 = idx[1]; 3649 idx += 2; 3650 tmp0 = x[i1]; 3651 tmp1 = x[i2]; 3652 switch (sizes[i]) { 3653 case 5: 3654 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3655 v5 += 2; /* fall through */ 3656 case 4: 3657 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3658 v4 += 2; /* fall through */ 3659 case 3: 3660 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3661 v3 += 2; /* fall through */ 3662 case 2: 3663 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3664 v2 += 2; /* fall through */ 3665 case 1: 3666 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3667 v1 += 2; 3668 } 3669 } 3670 /* ragged edge */ 3671 if (n == sz - 1) { 3672 tmp0 = x[*idx]; 3673 switch (sizes[i]) { 3674 case 5: 3675 sum5 -= *v5 * tmp0; /* fall through */ 3676 case 4: 3677 sum4 -= *v4 * tmp0; /* fall through */ 3678 case 3: 3679 sum3 -= *v3 * tmp0; /* fall through */ 3680 case 2: 3681 sum2 -= *v2 * tmp0; /* fall through */ 3682 case 1: 3683 sum1 -= *v1 * tmp0; 3684 } 3685 } 3686 /* update */ 3687 if (xb == b) { 3688 /* whole (old way) w/ diag */ 3689 switch (sizes[i]) { 3690 case 5: 3691 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3692 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3693 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3694 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3695 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3696 break; 3697 case 4: 3698 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3699 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3700 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3701 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3702 break; 3703 case 3: 3704 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3705 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3706 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3707 break; 3708 case 2: 3709 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3710 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3711 break; 3712 case 1: 3713 x[row--] += sum1 * (*ibdiag); 3714 break; 3715 } 3716 } else { 3717 /* no diag so set = */ 3718 switch (sizes[i]) { 3719 case 5: 3720 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3721 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3722 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3723 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3724 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3725 break; 3726 case 4: 3727 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3728 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3729 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3730 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3731 break; 3732 case 3: 3733 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3734 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3735 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3736 break; 3737 case 2: 3738 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3739 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3740 break; 3741 case 1: 3742 x[row--] = sum1 * (*ibdiag); 3743 break; 3744 } 3745 } 3746 } 3747 if (xb == b) { 3748 PetscCall(PetscLogFlops(2.0 * a->nz)); 3749 } else { 3750 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3751 } 3752 } 3753 } 3754 if (flag & SOR_EISENSTAT) { 3755 /* 3756 Apply (U + D)^-1 where D is now the block diagonal 3757 */ 3758 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3759 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3760 ibdiag -= sizes[i] * sizes[i]; 3761 sz = ii[row + 1] - diag[row] - 1; 3762 v1 = a->a + diag[row] + 1; 3763 idx = a->j + diag[row] + 1; 3764 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3765 switch (sizes[i]) { 3766 case 1: 3767 3768 sum1 = b[row]; 3769 for (n = 0; n < sz - 1; n += 2) { 3770 i1 = idx[0]; 3771 i2 = idx[1]; 3772 idx += 2; 3773 tmp0 = x[i1]; 3774 tmp1 = x[i2]; 3775 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3776 v1 += 2; 3777 } 3778 3779 if (n == sz - 1) { 3780 tmp0 = x[*idx]; 3781 sum1 -= *v1 * tmp0; 3782 } 3783 x[row] = sum1 * (*ibdiag); 3784 row--; 3785 break; 3786 3787 case 2: 3788 3789 sum1 = b[row]; 3790 sum2 = b[row - 1]; 3791 /* note that sum1 is associated with the second of the two rows */ 3792 v2 = a->a + diag[row - 1] + 2; 3793 for (n = 0; n < sz - 1; n += 2) { 3794 i1 = idx[0]; 3795 i2 = idx[1]; 3796 idx += 2; 3797 tmp0 = x[i1]; 3798 tmp1 = x[i2]; 3799 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3800 v1 += 2; 3801 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3802 v2 += 2; 3803 } 3804 3805 if (n == sz - 1) { 3806 tmp0 = x[*idx]; 3807 sum1 -= *v1 * tmp0; 3808 sum2 -= *v2 * tmp0; 3809 } 3810 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3811 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3812 row -= 2; 3813 break; 3814 case 3: 3815 3816 sum1 = b[row]; 3817 sum2 = b[row - 1]; 3818 sum3 = b[row - 2]; 3819 v2 = a->a + diag[row - 1] + 2; 3820 v3 = a->a + diag[row - 2] + 3; 3821 for (n = 0; n < sz - 1; n += 2) { 3822 i1 = idx[0]; 3823 i2 = idx[1]; 3824 idx += 2; 3825 tmp0 = x[i1]; 3826 tmp1 = x[i2]; 3827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3828 v1 += 2; 3829 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3830 v2 += 2; 3831 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3832 v3 += 2; 3833 } 3834 3835 if (n == sz - 1) { 3836 tmp0 = x[*idx]; 3837 sum1 -= *v1 * tmp0; 3838 sum2 -= *v2 * tmp0; 3839 sum3 -= *v3 * tmp0; 3840 } 3841 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3842 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3843 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3844 row -= 3; 3845 break; 3846 case 4: 3847 3848 sum1 = b[row]; 3849 sum2 = b[row - 1]; 3850 sum3 = b[row - 2]; 3851 sum4 = b[row - 3]; 3852 v2 = a->a + diag[row - 1] + 2; 3853 v3 = a->a + diag[row - 2] + 3; 3854 v4 = a->a + diag[row - 3] + 4; 3855 for (n = 0; n < sz - 1; n += 2) { 3856 i1 = idx[0]; 3857 i2 = idx[1]; 3858 idx += 2; 3859 tmp0 = x[i1]; 3860 tmp1 = x[i2]; 3861 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3862 v1 += 2; 3863 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3864 v2 += 2; 3865 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3866 v3 += 2; 3867 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3868 v4 += 2; 3869 } 3870 3871 if (n == sz - 1) { 3872 tmp0 = x[*idx]; 3873 sum1 -= *v1 * tmp0; 3874 sum2 -= *v2 * tmp0; 3875 sum3 -= *v3 * tmp0; 3876 sum4 -= *v4 * tmp0; 3877 } 3878 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3879 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3880 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3881 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3882 row -= 4; 3883 break; 3884 case 5: 3885 3886 sum1 = b[row]; 3887 sum2 = b[row - 1]; 3888 sum3 = b[row - 2]; 3889 sum4 = b[row - 3]; 3890 sum5 = b[row - 4]; 3891 v2 = a->a + diag[row - 1] + 2; 3892 v3 = a->a + diag[row - 2] + 3; 3893 v4 = a->a + diag[row - 3] + 4; 3894 v5 = a->a + diag[row - 4] + 5; 3895 for (n = 0; n < sz - 1; n += 2) { 3896 i1 = idx[0]; 3897 i2 = idx[1]; 3898 idx += 2; 3899 tmp0 = x[i1]; 3900 tmp1 = x[i2]; 3901 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3902 v1 += 2; 3903 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3904 v2 += 2; 3905 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3906 v3 += 2; 3907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3908 v4 += 2; 3909 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3910 v5 += 2; 3911 } 3912 3913 if (n == sz - 1) { 3914 tmp0 = x[*idx]; 3915 sum1 -= *v1 * tmp0; 3916 sum2 -= *v2 * tmp0; 3917 sum3 -= *v3 * tmp0; 3918 sum4 -= *v4 * tmp0; 3919 sum5 -= *v5 * tmp0; 3920 } 3921 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3922 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3923 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3924 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3925 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3926 row -= 5; 3927 break; 3928 default: 3929 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3930 } 3931 } 3932 PetscCall(PetscLogFlops(a->nz)); 3933 3934 /* 3935 t = b - D x where D is the block diagonal 3936 */ 3937 cnt = 0; 3938 for (i = 0, row = 0; i < m; i++) { 3939 switch (sizes[i]) { 3940 case 1: 3941 t[row] = b[row] - bdiag[cnt++] * x[row]; 3942 row++; 3943 break; 3944 case 2: 3945 x1 = x[row]; 3946 x2 = x[row + 1]; 3947 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3948 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3949 t[row] = b[row] - tmp1; 3950 t[row + 1] = b[row + 1] - tmp2; 3951 row += 2; 3952 cnt += 4; 3953 break; 3954 case 3: 3955 x1 = x[row]; 3956 x2 = x[row + 1]; 3957 x3 = x[row + 2]; 3958 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3959 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3960 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3961 t[row] = b[row] - tmp1; 3962 t[row + 1] = b[row + 1] - tmp2; 3963 t[row + 2] = b[row + 2] - tmp3; 3964 row += 3; 3965 cnt += 9; 3966 break; 3967 case 4: 3968 x1 = x[row]; 3969 x2 = x[row + 1]; 3970 x3 = x[row + 2]; 3971 x4 = x[row + 3]; 3972 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3973 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3974 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3975 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3976 t[row] = b[row] - tmp1; 3977 t[row + 1] = b[row + 1] - tmp2; 3978 t[row + 2] = b[row + 2] - tmp3; 3979 t[row + 3] = b[row + 3] - tmp4; 3980 row += 4; 3981 cnt += 16; 3982 break; 3983 case 5: 3984 x1 = x[row]; 3985 x2 = x[row + 1]; 3986 x3 = x[row + 2]; 3987 x4 = x[row + 3]; 3988 x5 = x[row + 4]; 3989 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 3990 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 3991 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 3992 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 3993 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 3994 t[row] = b[row] - tmp1; 3995 t[row + 1] = b[row + 1] - tmp2; 3996 t[row + 2] = b[row + 2] - tmp3; 3997 t[row + 3] = b[row + 3] - tmp4; 3998 t[row + 4] = b[row + 4] - tmp5; 3999 row += 5; 4000 cnt += 25; 4001 break; 4002 default: 4003 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4004 } 4005 } 4006 PetscCall(PetscLogFlops(m)); 4007 4008 /* 4009 Apply (L + D)^-1 where D is the block diagonal 4010 */ 4011 for (i = 0, row = 0; i < m; i++) { 4012 sz = diag[row] - ii[row]; 4013 v1 = a->a + ii[row]; 4014 idx = a->j + ii[row]; 4015 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4016 switch (sizes[i]) { 4017 case 1: 4018 4019 sum1 = t[row]; 4020 for (n = 0; n < sz - 1; n += 2) { 4021 i1 = idx[0]; 4022 i2 = idx[1]; 4023 idx += 2; 4024 tmp0 = t[i1]; 4025 tmp1 = t[i2]; 4026 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4027 v1 += 2; 4028 } 4029 4030 if (n == sz - 1) { 4031 tmp0 = t[*idx]; 4032 sum1 -= *v1 * tmp0; 4033 } 4034 x[row] += t[row] = sum1 * (*ibdiag++); 4035 row++; 4036 break; 4037 case 2: 4038 v2 = a->a + ii[row + 1]; 4039 sum1 = t[row]; 4040 sum2 = t[row + 1]; 4041 for (n = 0; n < sz - 1; n += 2) { 4042 i1 = idx[0]; 4043 i2 = idx[1]; 4044 idx += 2; 4045 tmp0 = t[i1]; 4046 tmp1 = t[i2]; 4047 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4048 v1 += 2; 4049 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4050 v2 += 2; 4051 } 4052 4053 if (n == sz - 1) { 4054 tmp0 = t[*idx]; 4055 sum1 -= v1[0] * tmp0; 4056 sum2 -= v2[0] * tmp0; 4057 } 4058 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4059 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4060 ibdiag += 4; 4061 row += 2; 4062 break; 4063 case 3: 4064 v2 = a->a + ii[row + 1]; 4065 v3 = a->a + ii[row + 2]; 4066 sum1 = t[row]; 4067 sum2 = t[row + 1]; 4068 sum3 = t[row + 2]; 4069 for (n = 0; n < sz - 1; n += 2) { 4070 i1 = idx[0]; 4071 i2 = idx[1]; 4072 idx += 2; 4073 tmp0 = t[i1]; 4074 tmp1 = t[i2]; 4075 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4076 v1 += 2; 4077 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4078 v2 += 2; 4079 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4080 v3 += 2; 4081 } 4082 4083 if (n == sz - 1) { 4084 tmp0 = t[*idx]; 4085 sum1 -= v1[0] * tmp0; 4086 sum2 -= v2[0] * tmp0; 4087 sum3 -= v3[0] * tmp0; 4088 } 4089 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4090 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4091 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4092 ibdiag += 9; 4093 row += 3; 4094 break; 4095 case 4: 4096 v2 = a->a + ii[row + 1]; 4097 v3 = a->a + ii[row + 2]; 4098 v4 = a->a + ii[row + 3]; 4099 sum1 = t[row]; 4100 sum2 = t[row + 1]; 4101 sum3 = t[row + 2]; 4102 sum4 = t[row + 3]; 4103 for (n = 0; n < sz - 1; n += 2) { 4104 i1 = idx[0]; 4105 i2 = idx[1]; 4106 idx += 2; 4107 tmp0 = t[i1]; 4108 tmp1 = t[i2]; 4109 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4110 v1 += 2; 4111 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4112 v2 += 2; 4113 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4114 v3 += 2; 4115 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4116 v4 += 2; 4117 } 4118 4119 if (n == sz - 1) { 4120 tmp0 = t[*idx]; 4121 sum1 -= v1[0] * tmp0; 4122 sum2 -= v2[0] * tmp0; 4123 sum3 -= v3[0] * tmp0; 4124 sum4 -= v4[0] * tmp0; 4125 } 4126 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4127 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4128 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4129 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4130 ibdiag += 16; 4131 row += 4; 4132 break; 4133 case 5: 4134 v2 = a->a + ii[row + 1]; 4135 v3 = a->a + ii[row + 2]; 4136 v4 = a->a + ii[row + 3]; 4137 v5 = a->a + ii[row + 4]; 4138 sum1 = t[row]; 4139 sum2 = t[row + 1]; 4140 sum3 = t[row + 2]; 4141 sum4 = t[row + 3]; 4142 sum5 = t[row + 4]; 4143 for (n = 0; n < sz - 1; n += 2) { 4144 i1 = idx[0]; 4145 i2 = idx[1]; 4146 idx += 2; 4147 tmp0 = t[i1]; 4148 tmp1 = t[i2]; 4149 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4150 v1 += 2; 4151 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4152 v2 += 2; 4153 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4154 v3 += 2; 4155 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4156 v4 += 2; 4157 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4158 v5 += 2; 4159 } 4160 4161 if (n == sz - 1) { 4162 tmp0 = t[*idx]; 4163 sum1 -= v1[0] * tmp0; 4164 sum2 -= v2[0] * tmp0; 4165 sum3 -= v3[0] * tmp0; 4166 sum4 -= v4[0] * tmp0; 4167 sum5 -= v5[0] * tmp0; 4168 } 4169 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4170 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4171 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4172 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4173 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4174 ibdiag += 25; 4175 row += 5; 4176 break; 4177 default: 4178 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4179 } 4180 } 4181 PetscCall(PetscLogFlops(a->nz)); 4182 } 4183 PetscCall(VecRestoreArray(xx, &x)); 4184 PetscCall(VecRestoreArrayRead(bb, &b)); 4185 PetscFunctionReturn(PETSC_SUCCESS); 4186 } 4187 4188 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4189 { 4190 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4191 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4192 const MatScalar *bdiag = a->inode.bdiag; 4193 const PetscScalar *b; 4194 PetscInt m = a->inode.node_count, cnt = 0, i, row; 4195 const PetscInt *sizes = a->inode.size; 4196 4197 PetscFunctionBegin; 4198 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4199 PetscCall(VecGetArray(xx, &x)); 4200 PetscCall(VecGetArrayRead(bb, &b)); 4201 cnt = 0; 4202 for (i = 0, row = 0; i < m; i++) { 4203 switch (sizes[i]) { 4204 case 1: 4205 x[row] = b[row] * bdiag[cnt++]; 4206 row++; 4207 break; 4208 case 2: 4209 x1 = b[row]; 4210 x2 = b[row + 1]; 4211 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4212 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4213 x[row++] = tmp1; 4214 x[row++] = tmp2; 4215 cnt += 4; 4216 break; 4217 case 3: 4218 x1 = b[row]; 4219 x2 = b[row + 1]; 4220 x3 = b[row + 2]; 4221 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4222 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4223 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4224 x[row++] = tmp1; 4225 x[row++] = tmp2; 4226 x[row++] = tmp3; 4227 cnt += 9; 4228 break; 4229 case 4: 4230 x1 = b[row]; 4231 x2 = b[row + 1]; 4232 x3 = b[row + 2]; 4233 x4 = b[row + 3]; 4234 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4235 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4236 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4237 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4238 x[row++] = tmp1; 4239 x[row++] = tmp2; 4240 x[row++] = tmp3; 4241 x[row++] = tmp4; 4242 cnt += 16; 4243 break; 4244 case 5: 4245 x1 = b[row]; 4246 x2 = b[row + 1]; 4247 x3 = b[row + 2]; 4248 x4 = b[row + 3]; 4249 x5 = b[row + 4]; 4250 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4251 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4252 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4253 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4254 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4255 x[row++] = tmp1; 4256 x[row++] = tmp2; 4257 x[row++] = tmp3; 4258 x[row++] = tmp4; 4259 x[row++] = tmp5; 4260 cnt += 25; 4261 break; 4262 default: 4263 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4264 } 4265 } 4266 PetscCall(PetscLogFlops(2.0 * cnt)); 4267 PetscCall(VecRestoreArray(xx, &x)); 4268 PetscCall(VecRestoreArrayRead(bb, &b)); 4269 PetscFunctionReturn(PETSC_SUCCESS); 4270 } 4271 4272 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4273 { 4274 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4275 4276 PetscFunctionBegin; 4277 a->inode.node_count = 0; 4278 a->inode.use = PETSC_FALSE; 4279 a->inode.checked = PETSC_FALSE; 4280 a->inode.mat_nonzerostate = -1; 4281 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4282 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4283 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4284 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4285 A->ops->coloringpatch = NULL; 4286 A->ops->multdiagonalblock = NULL; 4287 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4288 PetscFunctionReturn(PETSC_SUCCESS); 4289 } 4290 4291 /* 4292 samestructure indicates that the matrix has not changed its nonzero structure so we 4293 do not need to recompute the inodes 4294 */ 4295 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4296 { 4297 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4298 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4299 PetscBool flag; 4300 const PetscInt *idx, *idy, *ii; 4301 4302 PetscFunctionBegin; 4303 if (!a->inode.use) { 4304 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4305 PetscCall(PetscFree(a->inode.size)); 4306 PetscFunctionReturn(PETSC_SUCCESS); 4307 } 4308 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 4309 4310 m = A->rmap->n; 4311 if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size)); 4312 ns = a->inode.size; 4313 4314 i = 0; 4315 node_count = 0; 4316 idx = a->j; 4317 ii = a->i; 4318 if (idx) { 4319 while (i < m) { /* For each row */ 4320 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4321 /* Limits the number of elements in a node to 'a->inode.limit' */ 4322 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4323 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4324 if (nzy != nzx) break; 4325 idy += nzx; /* Same nonzero pattern */ 4326 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4327 if (!flag) break; 4328 } 4329 ns[node_count++] = blk_size; 4330 idx += blk_size * nzx; 4331 i = j; 4332 } 4333 } 4334 /* If not enough inodes found,, do not use inode version of the routines */ 4335 if (!m || !idx || node_count > .8 * m) { 4336 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4337 PetscCall(PetscFree(a->inode.size)); 4338 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4339 } else { 4340 if (!A->factortype) { 4341 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4342 if (A->rmap->n == A->cmap->n) { 4343 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4344 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4345 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4346 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4347 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4348 } 4349 } else { 4350 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4351 } 4352 a->inode.node_count = node_count; 4353 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4354 } 4355 a->inode.checked = PETSC_TRUE; 4356 a->inode.mat_nonzerostate = A->nonzerostate; 4357 PetscFunctionReturn(PETSC_SUCCESS); 4358 } 4359 4360 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4361 { 4362 Mat B = *C; 4363 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4364 PetscInt m = A->rmap->n; 4365 4366 PetscFunctionBegin; 4367 c->inode.use = a->inode.use; 4368 c->inode.limit = a->inode.limit; 4369 c->inode.max_limit = a->inode.max_limit; 4370 c->inode.checked = PETSC_FALSE; 4371 c->inode.size = NULL; 4372 c->inode.node_count = 0; 4373 c->inode.ibdiagvalid = PETSC_FALSE; 4374 c->inode.ibdiag = NULL; 4375 c->inode.bdiag = NULL; 4376 c->inode.mat_nonzerostate = -1; 4377 if (a->inode.use) { 4378 if (a->inode.checked && a->inode.size) { 4379 PetscCall(PetscMalloc1(m + 1, &c->inode.size)); 4380 PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1)); 4381 4382 c->inode.checked = PETSC_TRUE; 4383 c->inode.node_count = a->inode.node_count; 4384 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4385 } 4386 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4387 if (!B->factortype) { 4388 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4389 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4390 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4391 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4392 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4393 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4394 } else { 4395 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4396 } 4397 } 4398 PetscFunctionReturn(PETSC_SUCCESS); 4399 } 4400 4401 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4402 { 4403 PetscInt k; 4404 const PetscInt *vi; 4405 4406 PetscFunctionBegin; 4407 vi = aj + ai[row]; 4408 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4409 vi = aj + adiag[row]; 4410 cols[nzl] = vi[0]; 4411 vi = aj + adiag[row + 1] + 1; 4412 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4413 PetscFunctionReturn(PETSC_SUCCESS); 4414 } 4415 /* 4416 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4417 Modified from MatSeqAIJCheckInode(). 4418 4419 Input Parameters: 4420 . Mat A - ILU or LU matrix factor 4421 4422 */ 4423 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4424 { 4425 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4426 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4427 PetscInt *cols1, *cols2, *ns; 4428 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4429 PetscBool flag; 4430 4431 PetscFunctionBegin; 4432 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4433 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4434 4435 m = A->rmap->n; 4436 if (a->inode.size) ns = a->inode.size; 4437 else PetscCall(PetscMalloc1(m + 1, &ns)); 4438 4439 i = 0; 4440 node_count = 0; 4441 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4442 while (i < m) { /* For each row */ 4443 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4444 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4445 nzx = nzl1 + nzu1 + 1; 4446 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4447 4448 /* Limits the number of elements in a node to 'a->inode.limit' */ 4449 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4450 nzl2 = ai[j + 1] - ai[j]; 4451 nzu2 = adiag[j] - adiag[j + 1] - 1; 4452 nzy = nzl2 + nzu2 + 1; 4453 if (nzy != nzx) break; 4454 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4455 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4456 if (!flag) break; 4457 } 4458 ns[node_count++] = blk_size; 4459 i = j; 4460 } 4461 PetscCall(PetscFree2(cols1, cols2)); 4462 /* If not enough inodes found,, do not use inode version of the routines */ 4463 if (!m || node_count > .8 * m) { 4464 PetscCall(PetscFree(ns)); 4465 4466 a->inode.node_count = 0; 4467 a->inode.size = NULL; 4468 a->inode.use = PETSC_FALSE; 4469 4470 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4471 } else { 4472 A->ops->mult = NULL; 4473 A->ops->sor = NULL; 4474 A->ops->multadd = NULL; 4475 A->ops->getrowij = NULL; 4476 A->ops->restorerowij = NULL; 4477 A->ops->getcolumnij = NULL; 4478 A->ops->restorecolumnij = NULL; 4479 A->ops->coloringpatch = NULL; 4480 A->ops->multdiagonalblock = NULL; 4481 a->inode.node_count = node_count; 4482 a->inode.size = ns; 4483 4484 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4485 } 4486 a->inode.checked = PETSC_TRUE; 4487 PetscFunctionReturn(PETSC_SUCCESS); 4488 } 4489 4490 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4491 { 4492 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4493 4494 PetscFunctionBegin; 4495 a->inode.ibdiagvalid = PETSC_FALSE; 4496 PetscFunctionReturn(PETSC_SUCCESS); 4497 } 4498 4499 /* 4500 This is really ugly. if inodes are used this replaces the 4501 permutations with ones that correspond to rows/cols of the matrix 4502 rather then inode blocks 4503 */ 4504 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4505 { 4506 PetscFunctionBegin; 4507 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4508 PetscFunctionReturn(PETSC_SUCCESS); 4509 } 4510 4511 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4512 { 4513 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4514 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4515 const PetscInt *ridx, *cidx; 4516 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx; 4517 PetscInt nslim_col, *ns_col; 4518 IS ris = *rperm, cis = *cperm; 4519 4520 PetscFunctionBegin; 4521 if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4522 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4523 4524 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4525 PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns)); 4526 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4527 4528 PetscCall(ISGetIndices(ris, &ridx)); 4529 PetscCall(ISGetIndices(cis, &cidx)); 4530 4531 /* Form the inode structure for the rows of permuted matrix using inv perm*/ 4532 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i]; 4533 4534 /* Construct the permutations for rows*/ 4535 for (i = 0, row = 0; i < nslim_row; ++i) { 4536 indx = ridx[i]; 4537 start_val = tns[indx]; 4538 end_val = tns[indx + 1]; 4539 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4540 } 4541 4542 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4543 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i]; 4544 4545 /* Construct permutations for columns */ 4546 for (i = 0, col = 0; i < nslim_col; ++i) { 4547 indx = cidx[i]; 4548 start_val = tns[indx]; 4549 end_val = tns[indx + 1]; 4550 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4551 } 4552 4553 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4554 PetscCall(ISSetPermutation(*rperm)); 4555 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4556 PetscCall(ISSetPermutation(*cperm)); 4557 4558 PetscCall(ISRestoreIndices(ris, &ridx)); 4559 PetscCall(ISRestoreIndices(cis, &cidx)); 4560 4561 PetscCall(PetscFree(ns_col)); 4562 PetscCall(PetscFree2(permr, permc)); 4563 PetscCall(ISDestroy(&cis)); 4564 PetscCall(ISDestroy(&ris)); 4565 PetscCall(PetscFree(tns)); 4566 PetscFunctionReturn(PETSC_SUCCESS); 4567 } 4568 4569 /*@C 4570 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4571 4572 Not Collective 4573 4574 Input Parameter: 4575 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4576 4577 Output Parameters: 4578 + node_count - no of inodes present in the matrix. 4579 . sizes - an array of size `node_count`, with the sizes of each inode. 4580 - limit - the max size used to generate the inodes. 4581 4582 Level: advanced 4583 4584 Note: 4585 It should be called after the matrix is assembled. 4586 The contents of the sizes[] array should not be changed. 4587 `NULL` may be passed for information not needed 4588 4589 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 4590 @*/ 4591 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4592 { 4593 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4594 4595 PetscFunctionBegin; 4596 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4597 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4598 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4599 PetscFunctionReturn(PETSC_SUCCESS); 4600 } 4601 4602 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4603 { 4604 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4605 4606 PetscFunctionBegin; 4607 if (node_count) *node_count = a->inode.node_count; 4608 if (sizes) *sizes = a->inode.size; 4609 if (limit) *limit = a->inode.limit; 4610 PetscFunctionReturn(PETSC_SUCCESS); 4611 } 4612