1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 #if defined(PETSC_HAVE_XMMINTRIN_H) 8 #include <xmmintrin.h> 9 #endif 10 11 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 14 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) 25 ; 26 for (; count + 1 < n; count++, i++) 27 ; 28 if (count < n) i++; 29 *size = i; 30 PetscFunctionReturn(PETSC_SUCCESS); 31 } 32 PetscCall(PetscMalloc1(n + 1, &ns_col)); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i]; 36 37 /* if m < n; pad up the remainder with inode_limit */ 38 for (; count + 1 < n; count++, i++) ns_col[i] = 1; 39 /* The last node is the odd ball. pad it up with the remaining rows; */ 40 if (count < n) { 41 ns_col[i] = n - count; 42 i++; 43 } else if (count > n) { 44 /* Adjust for the over estimation */ 45 ns_col[i - 1] += n - count; 46 } 47 *size = i; 48 *ns = ns_col; 49 PetscFunctionReturn(PETSC_SUCCESS); 50 } 51 52 /* 53 This builds symmetric version of nonzero structure, 54 */ 55 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 56 { 57 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 58 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 59 PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2; 60 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 61 62 PetscFunctionBegin; 63 nslim_row = a->inode.node_count; 64 m = A->rmap->n; 65 n = A->cmap->n; 66 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 67 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 68 69 /* Use the row_inode as column_inode */ 70 nslim_col = nslim_row; 71 ns_col = ns_row; 72 73 /* allocate space for reformatted inode structure */ 74 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 75 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1]; 76 77 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 78 nsz = ns_col[i1]; 79 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 80 } 81 /* allocate space for row pointers */ 82 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 83 *iia = ia; 84 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 85 86 /* determine the number of columns in each row */ 87 ia[0] = oshift; 88 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 89 j = aj + ai[row] + ishift; 90 jmax = aj + ai[row + 1] + ishift; 91 if (j == jmax) continue; /* empty row */ 92 col = *j++ + ishift; 93 i2 = tvc[col]; 94 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 95 ia[i1 + 1]++; 96 ia[i2 + 1]++; 97 i2++; /* Start col of next node */ 98 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 99 i2 = tvc[col]; 100 } 101 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 102 } 103 104 /* shift ia[i] to point to next row */ 105 for (i1 = 1; i1 < nslim_row + 1; i1++) { 106 row = ia[i1 - 1]; 107 ia[i1] += row; 108 work[i1 - 1] = row - oshift; 109 } 110 111 /* allocate space for column pointers */ 112 nz = ia[nslim_row] + (!ishift); 113 PetscCall(PetscMalloc1(nz, &ja)); 114 *jja = ja; 115 116 /* loop over lower triangular part putting into ja */ 117 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 118 j = aj + ai[row] + ishift; 119 jmax = aj + ai[row + 1] + ishift; 120 if (j == jmax) continue; /* empty row */ 121 col = *j++ + ishift; 122 i2 = tvc[col]; 123 while (i2 < i1 && j < jmax) { 124 ja[work[i2]++] = i1 + oshift; 125 ja[work[i1]++] = i2 + oshift; 126 ++i2; 127 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 128 i2 = tvc[col]; 129 } 130 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 131 } 132 PetscCall(PetscFree(work)); 133 PetscCall(PetscFree2(tns, tvc)); 134 PetscFunctionReturn(PETSC_SUCCESS); 135 } 136 137 /* 138 This builds nonsymmetric version of nonzero structure, 139 */ 140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 141 { 142 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 143 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 144 PetscInt *tns, *tvc, nsz, i1, i2; 145 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size; 146 147 PetscFunctionBegin; 148 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 149 nslim_row = a->inode.node_count; 150 n = A->cmap->n; 151 152 /* Create The column_inode for this matrix */ 153 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 154 155 /* allocate space for reformatted column_inode structure */ 156 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 157 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 158 159 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 160 nsz = ns_col[i1]; 161 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 162 } 163 /* allocate space for row pointers */ 164 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 165 *iia = ia; 166 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 167 168 /* determine the number of columns in each row */ 169 ia[0] = oshift; 170 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 171 j = aj + ai[row] + ishift; 172 nz = ai[row + 1] - ai[row]; 173 if (!nz) continue; /* empty row */ 174 col = *j++ + ishift; 175 i2 = tvc[col]; 176 while (nz-- > 0) { /* off-diagonal elements */ 177 ia[i1 + 1]++; 178 i2++; /* Start col of next node */ 179 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 180 if (nz > 0) i2 = tvc[col]; 181 } 182 } 183 184 /* shift ia[i] to point to next row */ 185 for (i1 = 1; i1 < nslim_row + 1; i1++) { 186 row = ia[i1 - 1]; 187 ia[i1] += row; 188 work[i1 - 1] = row - oshift; 189 } 190 191 /* allocate space for column pointers */ 192 nz = ia[nslim_row] + (!ishift); 193 PetscCall(PetscMalloc1(nz, &ja)); 194 *jja = ja; 195 196 /* loop over matrix putting into ja */ 197 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 198 j = aj + ai[row] + ishift; 199 nz = ai[row + 1] - ai[row]; 200 if (!nz) continue; /* empty row */ 201 col = *j++ + ishift; 202 i2 = tvc[col]; 203 while (nz-- > 0) { 204 ja[work[i1]++] = i2 + oshift; 205 ++i2; 206 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 207 if (nz > 0) i2 = tvc[col]; 208 } 209 } 210 PetscCall(PetscFree(ns_col)); 211 PetscCall(PetscFree(work)); 212 PetscCall(PetscFree2(tns, tvc)); 213 PetscFunctionReturn(PETSC_SUCCESS); 214 } 215 216 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 217 { 218 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 219 220 PetscFunctionBegin; 221 if (n) *n = a->inode.node_count; 222 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 223 if (!blockcompressed) { 224 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 225 } else if (symmetric) { 226 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 227 } else { 228 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 229 } 230 PetscFunctionReturn(PETSC_SUCCESS); 231 } 232 233 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 234 { 235 PetscFunctionBegin; 236 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 237 238 if (!blockcompressed) { 239 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 240 } else { 241 PetscCall(PetscFree(*ia)); 242 PetscCall(PetscFree(*ja)); 243 } 244 PetscFunctionReturn(PETSC_SUCCESS); 245 } 246 247 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 248 { 249 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 250 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 251 PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j; 252 253 PetscFunctionBegin; 254 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 255 nslim_row = a->inode.node_count; 256 n = A->cmap->n; 257 258 /* Create The column_inode for this matrix */ 259 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 260 261 /* allocate space for reformatted column_inode structure */ 262 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 263 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 264 265 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 266 nsz = ns_col[i1]; 267 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 268 } 269 /* allocate space for column pointers */ 270 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 271 *iia = ia; 272 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 273 274 /* determine the number of columns in each row */ 275 ia[0] = oshift; 276 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 277 j = aj + ai[row] + ishift; 278 col = *j++ + ishift; 279 i2 = tvc[col]; 280 nz = ai[row + 1] - ai[row]; 281 while (nz-- > 0) { /* off-diagonal elements */ 282 /* ia[i1+1]++; */ 283 ia[i2 + 1]++; 284 i2++; 285 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 286 if (nz > 0) i2 = tvc[col]; 287 } 288 } 289 290 /* shift ia[i] to point to next col */ 291 for (i1 = 1; i1 < nslim_col + 1; i1++) { 292 col = ia[i1 - 1]; 293 ia[i1] += col; 294 work[i1 - 1] = col - oshift; 295 } 296 297 /* allocate space for column pointers */ 298 nz = ia[nslim_col] + (!ishift); 299 PetscCall(PetscMalloc1(nz, &ja)); 300 *jja = ja; 301 302 /* loop over matrix putting into ja */ 303 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 304 j = aj + ai[row] + ishift; 305 col = *j++ + ishift; 306 i2 = tvc[col]; 307 nz = ai[row + 1] - ai[row]; 308 while (nz-- > 0) { 309 /* ja[work[i1]++] = i2 + oshift; */ 310 ja[work[i2]++] = i1 + oshift; 311 i2++; 312 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 313 if (nz > 0) i2 = tvc[col]; 314 } 315 } 316 PetscCall(PetscFree(ns_col)); 317 PetscCall(PetscFree(work)); 318 PetscCall(PetscFree2(tns, tvc)); 319 PetscFunctionReturn(PETSC_SUCCESS); 320 } 321 322 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 323 { 324 PetscFunctionBegin; 325 PetscCall(MatCreateColInode_Private(A, n, NULL)); 326 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 327 328 if (!blockcompressed) { 329 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 330 } else if (symmetric) { 331 /* Since the indices are symmetric it doesn't matter */ 332 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 333 } else { 334 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 335 } 336 PetscFunctionReturn(PETSC_SUCCESS); 337 } 338 339 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 340 { 341 PetscFunctionBegin; 342 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 343 if (!blockcompressed) { 344 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 345 } else { 346 PetscCall(PetscFree(*ia)); 347 PetscCall(PetscFree(*ja)); 348 } 349 PetscFunctionReturn(PETSC_SUCCESS); 350 } 351 352 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 353 { 354 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 355 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 356 PetscScalar *y; 357 const PetscScalar *x; 358 const MatScalar *v1, *v2, *v3, *v4, *v5; 359 PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 360 const PetscInt *idx, *ns, *ii; 361 362 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 363 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 364 #endif 365 366 PetscFunctionBegin; 367 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 368 node_max = a->inode.node_count; 369 ns = a->inode.size; /* Node Size array */ 370 PetscCall(VecGetArrayRead(xx, &x)); 371 PetscCall(VecGetArray(yy, &y)); 372 idx = a->j; 373 v1 = a->a; 374 ii = a->i; 375 376 for (i = 0, row = 0; i < node_max; ++i) { 377 nsz = ns[i]; 378 n = ii[1] - ii[0]; 379 nonzerorow += (n > 0) * nsz; 380 ii += nsz; 381 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 382 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 383 sz = n; /* No of non zeros in this row */ 384 /* Switch on the size of Node */ 385 switch (nsz) { /* Each loop in 'case' is unrolled */ 386 case 1: 387 sum1 = 0.; 388 389 for (n = 0; n < sz - 1; n += 2) { 390 i1 = idx[0]; /* The instructions are ordered to */ 391 i2 = idx[1]; /* make the compiler's job easy */ 392 idx += 2; 393 tmp0 = x[i1]; 394 tmp1 = x[i2]; 395 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 396 v1 += 2; 397 } 398 399 if (n == sz - 1) { /* Take care of the last nonzero */ 400 tmp0 = x[*idx++]; 401 sum1 += *v1++ * tmp0; 402 } 403 y[row++] = sum1; 404 break; 405 case 2: 406 sum1 = 0.; 407 sum2 = 0.; 408 v2 = v1 + n; 409 410 for (n = 0; n < sz - 1; n += 2) { 411 i1 = idx[0]; 412 i2 = idx[1]; 413 idx += 2; 414 tmp0 = x[i1]; 415 tmp1 = x[i2]; 416 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 417 v1 += 2; 418 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 419 v2 += 2; 420 } 421 if (n == sz - 1) { 422 tmp0 = x[*idx++]; 423 sum1 += *v1++ * tmp0; 424 sum2 += *v2++ * tmp0; 425 } 426 y[row++] = sum1; 427 y[row++] = sum2; 428 v1 = v2; /* Since the next block to be processed starts there*/ 429 idx += sz; 430 break; 431 case 3: 432 sum1 = 0.; 433 sum2 = 0.; 434 sum3 = 0.; 435 v2 = v1 + n; 436 v3 = v2 + n; 437 438 for (n = 0; n < sz - 1; n += 2) { 439 i1 = idx[0]; 440 i2 = idx[1]; 441 idx += 2; 442 tmp0 = x[i1]; 443 tmp1 = x[i2]; 444 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 445 v1 += 2; 446 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 447 v2 += 2; 448 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 449 v3 += 2; 450 } 451 if (n == sz - 1) { 452 tmp0 = x[*idx++]; 453 sum1 += *v1++ * tmp0; 454 sum2 += *v2++ * tmp0; 455 sum3 += *v3++ * tmp0; 456 } 457 y[row++] = sum1; 458 y[row++] = sum2; 459 y[row++] = sum3; 460 v1 = v3; /* Since the next block to be processed starts there*/ 461 idx += 2 * sz; 462 break; 463 case 4: 464 sum1 = 0.; 465 sum2 = 0.; 466 sum3 = 0.; 467 sum4 = 0.; 468 v2 = v1 + n; 469 v3 = v2 + n; 470 v4 = v3 + n; 471 472 for (n = 0; n < sz - 1; n += 2) { 473 i1 = idx[0]; 474 i2 = idx[1]; 475 idx += 2; 476 tmp0 = x[i1]; 477 tmp1 = x[i2]; 478 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 479 v1 += 2; 480 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 481 v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 483 v3 += 2; 484 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 485 v4 += 2; 486 } 487 if (n == sz - 1) { 488 tmp0 = x[*idx++]; 489 sum1 += *v1++ * tmp0; 490 sum2 += *v2++ * tmp0; 491 sum3 += *v3++ * tmp0; 492 sum4 += *v4++ * tmp0; 493 } 494 y[row++] = sum1; 495 y[row++] = sum2; 496 y[row++] = sum3; 497 y[row++] = sum4; 498 v1 = v4; /* Since the next block to be processed starts there*/ 499 idx += 3 * sz; 500 break; 501 case 5: 502 sum1 = 0.; 503 sum2 = 0.; 504 sum3 = 0.; 505 sum4 = 0.; 506 sum5 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 v5 = v4 + n; 511 512 for (n = 0; n < sz - 1; n += 2) { 513 i1 = idx[0]; 514 i2 = idx[1]; 515 idx += 2; 516 tmp0 = x[i1]; 517 tmp1 = x[i2]; 518 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 519 v1 += 2; 520 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 521 v2 += 2; 522 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 523 v3 += 2; 524 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 525 v4 += 2; 526 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 527 v5 += 2; 528 } 529 if (n == sz - 1) { 530 tmp0 = x[*idx++]; 531 sum1 += *v1++ * tmp0; 532 sum2 += *v2++ * tmp0; 533 sum3 += *v3++ * tmp0; 534 sum4 += *v4++ * tmp0; 535 sum5 += *v5++ * tmp0; 536 } 537 y[row++] = sum1; 538 y[row++] = sum2; 539 y[row++] = sum3; 540 y[row++] = sum4; 541 y[row++] = sum5; 542 v1 = v5; /* Since the next block to be processed starts there */ 543 idx += 4 * sz; 544 break; 545 default: 546 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 547 } 548 } 549 PetscCall(VecRestoreArrayRead(xx, &x)); 550 PetscCall(VecRestoreArray(yy, &y)); 551 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 552 PetscFunctionReturn(PETSC_SUCCESS); 553 } 554 555 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 556 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 557 { 558 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 559 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 560 const MatScalar *v1, *v2, *v3, *v4, *v5; 561 const PetscScalar *x; 562 PetscScalar *y, *z, *zt; 563 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 564 const PetscInt *idx, *ns, *ii; 565 566 PetscFunctionBegin; 567 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 568 node_max = a->inode.node_count; 569 ns = a->inode.size; /* Node Size array */ 570 571 PetscCall(VecGetArrayRead(xx, &x)); 572 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 573 zt = z; 574 575 idx = a->j; 576 v1 = a->a; 577 ii = a->i; 578 579 for (i = 0, row = 0; i < node_max; ++i) { 580 nsz = ns[i]; 581 n = ii[1] - ii[0]; 582 ii += nsz; 583 sz = n; /* No of non zeros in this row */ 584 /* Switch on the size of Node */ 585 switch (nsz) { /* Each loop in 'case' is unrolled */ 586 case 1: 587 sum1 = *zt++; 588 589 for (n = 0; n < sz - 1; n += 2) { 590 i1 = idx[0]; /* The instructions are ordered to */ 591 i2 = idx[1]; /* make the compiler's job easy */ 592 idx += 2; 593 tmp0 = x[i1]; 594 tmp1 = x[i2]; 595 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 596 v1 += 2; 597 } 598 599 if (n == sz - 1) { /* Take care of the last nonzero */ 600 tmp0 = x[*idx++]; 601 sum1 += *v1++ * tmp0; 602 } 603 y[row++] = sum1; 604 break; 605 case 2: 606 sum1 = *zt++; 607 sum2 = *zt++; 608 v2 = v1 + n; 609 610 for (n = 0; n < sz - 1; n += 2) { 611 i1 = idx[0]; 612 i2 = idx[1]; 613 idx += 2; 614 tmp0 = x[i1]; 615 tmp1 = x[i2]; 616 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 617 v1 += 2; 618 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 619 v2 += 2; 620 } 621 if (n == sz - 1) { 622 tmp0 = x[*idx++]; 623 sum1 += *v1++ * tmp0; 624 sum2 += *v2++ * tmp0; 625 } 626 y[row++] = sum1; 627 y[row++] = sum2; 628 v1 = v2; /* Since the next block to be processed starts there*/ 629 idx += sz; 630 break; 631 case 3: 632 sum1 = *zt++; 633 sum2 = *zt++; 634 sum3 = *zt++; 635 v2 = v1 + n; 636 v3 = v2 + n; 637 638 for (n = 0; n < sz - 1; n += 2) { 639 i1 = idx[0]; 640 i2 = idx[1]; 641 idx += 2; 642 tmp0 = x[i1]; 643 tmp1 = x[i2]; 644 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 645 v1 += 2; 646 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 647 v2 += 2; 648 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 649 v3 += 2; 650 } 651 if (n == sz - 1) { 652 tmp0 = x[*idx++]; 653 sum1 += *v1++ * tmp0; 654 sum2 += *v2++ * tmp0; 655 sum3 += *v3++ * tmp0; 656 } 657 y[row++] = sum1; 658 y[row++] = sum2; 659 y[row++] = sum3; 660 v1 = v3; /* Since the next block to be processed starts there*/ 661 idx += 2 * sz; 662 break; 663 case 4: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 sum4 = *zt++; 668 v2 = v1 + n; 669 v3 = v2 + n; 670 v4 = v3 + n; 671 672 for (n = 0; n < sz - 1; n += 2) { 673 i1 = idx[0]; 674 i2 = idx[1]; 675 idx += 2; 676 tmp0 = x[i1]; 677 tmp1 = x[i2]; 678 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 679 v1 += 2; 680 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 681 v2 += 2; 682 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 683 v3 += 2; 684 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 685 v4 += 2; 686 } 687 if (n == sz - 1) { 688 tmp0 = x[*idx++]; 689 sum1 += *v1++ * tmp0; 690 sum2 += *v2++ * tmp0; 691 sum3 += *v3++ * tmp0; 692 sum4 += *v4++ * tmp0; 693 } 694 y[row++] = sum1; 695 y[row++] = sum2; 696 y[row++] = sum3; 697 y[row++] = sum4; 698 v1 = v4; /* Since the next block to be processed starts there*/ 699 idx += 3 * sz; 700 break; 701 case 5: 702 sum1 = *zt++; 703 sum2 = *zt++; 704 sum3 = *zt++; 705 sum4 = *zt++; 706 sum5 = *zt++; 707 v2 = v1 + n; 708 v3 = v2 + n; 709 v4 = v3 + n; 710 v5 = v4 + n; 711 712 for (n = 0; n < sz - 1; n += 2) { 713 i1 = idx[0]; 714 i2 = idx[1]; 715 idx += 2; 716 tmp0 = x[i1]; 717 tmp1 = x[i2]; 718 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 719 v1 += 2; 720 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 721 v2 += 2; 722 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 723 v3 += 2; 724 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 725 v4 += 2; 726 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 727 v5 += 2; 728 } 729 if (n == sz - 1) { 730 tmp0 = x[*idx++]; 731 sum1 += *v1++ * tmp0; 732 sum2 += *v2++ * tmp0; 733 sum3 += *v3++ * tmp0; 734 sum4 += *v4++ * tmp0; 735 sum5 += *v5++ * tmp0; 736 } 737 y[row++] = sum1; 738 y[row++] = sum2; 739 y[row++] = sum3; 740 y[row++] = sum4; 741 y[row++] = sum5; 742 v1 = v5; /* Since the next block to be processed starts there */ 743 idx += 4 * sz; 744 break; 745 default: 746 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 747 } 748 } 749 PetscCall(VecRestoreArrayRead(xx, &x)); 750 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 751 PetscCall(PetscLogFlops(2.0 * a->nz)); 752 PetscFunctionReturn(PETSC_SUCCESS); 753 } 754 755 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 756 { 757 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 758 IS iscol = a->col, isrow = a->row; 759 const PetscInt *r, *c, *rout, *cout; 760 PetscInt i, j, n = A->rmap->n, nz; 761 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 762 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 763 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 764 PetscScalar sum1, sum2, sum3, sum4, sum5; 765 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 766 const PetscScalar *b; 767 768 PetscFunctionBegin; 769 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 770 node_max = a->inode.node_count; 771 ns = a->inode.size; /* Node Size array */ 772 773 PetscCall(VecGetArrayRead(bb, &b)); 774 PetscCall(VecGetArrayWrite(xx, &x)); 775 tmp = a->solve_work; 776 777 PetscCall(ISGetIndices(isrow, &rout)); 778 r = rout; 779 PetscCall(ISGetIndices(iscol, &cout)); 780 c = cout + (n - 1); 781 782 /* forward solve the lower triangular */ 783 tmps = tmp; 784 aa = a_a; 785 aj = a_j; 786 ad = a->diag; 787 788 for (i = 0, row = 0; i < node_max; ++i) { 789 nsz = ns[i]; 790 aii = ai[row]; 791 v1 = aa + aii; 792 vi = aj + aii; 793 nz = ad[row] - aii; 794 if (i < node_max - 1) { 795 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 796 * but our indexing to determine its size could. */ 797 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 798 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 799 PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 800 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 801 } 802 803 switch (nsz) { /* Each loop in 'case' is unrolled */ 804 case 1: 805 sum1 = b[*r++]; 806 for (j = 0; j < nz - 1; j += 2) { 807 i0 = vi[0]; 808 i1 = vi[1]; 809 vi += 2; 810 tmp0 = tmps[i0]; 811 tmp1 = tmps[i1]; 812 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 813 v1 += 2; 814 } 815 if (j == nz - 1) { 816 tmp0 = tmps[*vi++]; 817 sum1 -= *v1++ * tmp0; 818 } 819 tmp[row++] = sum1; 820 break; 821 case 2: 822 sum1 = b[*r++]; 823 sum2 = b[*r++]; 824 v2 = aa + ai[row + 1]; 825 826 for (j = 0; j < nz - 1; j += 2) { 827 i0 = vi[0]; 828 i1 = vi[1]; 829 vi += 2; 830 tmp0 = tmps[i0]; 831 tmp1 = tmps[i1]; 832 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 833 v1 += 2; 834 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 835 v2 += 2; 836 } 837 if (j == nz - 1) { 838 tmp0 = tmps[*vi++]; 839 sum1 -= *v1++ * tmp0; 840 sum2 -= *v2++ * tmp0; 841 } 842 sum2 -= *v2++ * sum1; 843 tmp[row++] = sum1; 844 tmp[row++] = sum2; 845 break; 846 case 3: 847 sum1 = b[*r++]; 848 sum2 = b[*r++]; 849 sum3 = b[*r++]; 850 v2 = aa + ai[row + 1]; 851 v3 = aa + ai[row + 2]; 852 853 for (j = 0; j < nz - 1; j += 2) { 854 i0 = vi[0]; 855 i1 = vi[1]; 856 vi += 2; 857 tmp0 = tmps[i0]; 858 tmp1 = tmps[i1]; 859 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 860 v1 += 2; 861 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 862 v2 += 2; 863 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 864 v3 += 2; 865 } 866 if (j == nz - 1) { 867 tmp0 = tmps[*vi++]; 868 sum1 -= *v1++ * tmp0; 869 sum2 -= *v2++ * tmp0; 870 sum3 -= *v3++ * tmp0; 871 } 872 sum2 -= *v2++ * sum1; 873 sum3 -= *v3++ * sum1; 874 sum3 -= *v3++ * sum2; 875 876 tmp[row++] = sum1; 877 tmp[row++] = sum2; 878 tmp[row++] = sum3; 879 break; 880 881 case 4: 882 sum1 = b[*r++]; 883 sum2 = b[*r++]; 884 sum3 = b[*r++]; 885 sum4 = b[*r++]; 886 v2 = aa + ai[row + 1]; 887 v3 = aa + ai[row + 2]; 888 v4 = aa + ai[row + 3]; 889 890 for (j = 0; j < nz - 1; j += 2) { 891 i0 = vi[0]; 892 i1 = vi[1]; 893 vi += 2; 894 tmp0 = tmps[i0]; 895 tmp1 = tmps[i1]; 896 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 897 v1 += 2; 898 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 899 v2 += 2; 900 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 901 v3 += 2; 902 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 903 v4 += 2; 904 } 905 if (j == nz - 1) { 906 tmp0 = tmps[*vi++]; 907 sum1 -= *v1++ * tmp0; 908 sum2 -= *v2++ * tmp0; 909 sum3 -= *v3++ * tmp0; 910 sum4 -= *v4++ * tmp0; 911 } 912 sum2 -= *v2++ * sum1; 913 sum3 -= *v3++ * sum1; 914 sum4 -= *v4++ * sum1; 915 sum3 -= *v3++ * sum2; 916 sum4 -= *v4++ * sum2; 917 sum4 -= *v4++ * sum3; 918 919 tmp[row++] = sum1; 920 tmp[row++] = sum2; 921 tmp[row++] = sum3; 922 tmp[row++] = sum4; 923 break; 924 case 5: 925 sum1 = b[*r++]; 926 sum2 = b[*r++]; 927 sum3 = b[*r++]; 928 sum4 = b[*r++]; 929 sum5 = b[*r++]; 930 v2 = aa + ai[row + 1]; 931 v3 = aa + ai[row + 2]; 932 v4 = aa + ai[row + 3]; 933 v5 = aa + ai[row + 4]; 934 935 for (j = 0; j < nz - 1; j += 2) { 936 i0 = vi[0]; 937 i1 = vi[1]; 938 vi += 2; 939 tmp0 = tmps[i0]; 940 tmp1 = tmps[i1]; 941 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 942 v1 += 2; 943 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 944 v2 += 2; 945 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 946 v3 += 2; 947 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 948 v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 950 v5 += 2; 951 } 952 if (j == nz - 1) { 953 tmp0 = tmps[*vi++]; 954 sum1 -= *v1++ * tmp0; 955 sum2 -= *v2++ * tmp0; 956 sum3 -= *v3++ * tmp0; 957 sum4 -= *v4++ * tmp0; 958 sum5 -= *v5++ * tmp0; 959 } 960 961 sum2 -= *v2++ * sum1; 962 sum3 -= *v3++ * sum1; 963 sum4 -= *v4++ * sum1; 964 sum5 -= *v5++ * sum1; 965 sum3 -= *v3++ * sum2; 966 sum4 -= *v4++ * sum2; 967 sum5 -= *v5++ * sum2; 968 sum4 -= *v4++ * sum3; 969 sum5 -= *v5++ * sum3; 970 sum5 -= *v5++ * sum4; 971 972 tmp[row++] = sum1; 973 tmp[row++] = sum2; 974 tmp[row++] = sum3; 975 tmp[row++] = sum4; 976 tmp[row++] = sum5; 977 break; 978 default: 979 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 980 } 981 } 982 /* backward solve the upper triangular */ 983 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 984 nsz = ns[i]; 985 aii = ai[row + 1] - 1; 986 v1 = aa + aii; 987 vi = aj + aii; 988 nz = aii - ad[row]; 989 switch (nsz) { /* Each loop in 'case' is unrolled */ 990 case 1: 991 sum1 = tmp[row]; 992 993 for (j = nz; j > 1; j -= 2) { 994 vi -= 2; 995 i0 = vi[2]; 996 i1 = vi[1]; 997 tmp0 = tmps[i0]; 998 tmp1 = tmps[i1]; 999 v1 -= 2; 1000 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1001 } 1002 if (j == 1) { 1003 tmp0 = tmps[*vi--]; 1004 sum1 -= *v1-- * tmp0; 1005 } 1006 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1007 row--; 1008 break; 1009 case 2: 1010 sum1 = tmp[row]; 1011 sum2 = tmp[row - 1]; 1012 v2 = aa + ai[row] - 1; 1013 for (j = nz; j > 1; j -= 2) { 1014 vi -= 2; 1015 i0 = vi[2]; 1016 i1 = vi[1]; 1017 tmp0 = tmps[i0]; 1018 tmp1 = tmps[i1]; 1019 v1 -= 2; 1020 v2 -= 2; 1021 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1022 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1023 } 1024 if (j == 1) { 1025 tmp0 = tmps[*vi--]; 1026 sum1 -= *v1-- * tmp0; 1027 sum2 -= *v2-- * tmp0; 1028 } 1029 1030 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1031 row--; 1032 sum2 -= *v2-- * tmp0; 1033 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1034 row--; 1035 break; 1036 case 3: 1037 sum1 = tmp[row]; 1038 sum2 = tmp[row - 1]; 1039 sum3 = tmp[row - 2]; 1040 v2 = aa + ai[row] - 1; 1041 v3 = aa + ai[row - 1] - 1; 1042 for (j = nz; j > 1; j -= 2) { 1043 vi -= 2; 1044 i0 = vi[2]; 1045 i1 = vi[1]; 1046 tmp0 = tmps[i0]; 1047 tmp1 = tmps[i1]; 1048 v1 -= 2; 1049 v2 -= 2; 1050 v3 -= 2; 1051 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1052 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1053 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1054 } 1055 if (j == 1) { 1056 tmp0 = tmps[*vi--]; 1057 sum1 -= *v1-- * tmp0; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 } 1061 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1062 row--; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1066 row--; 1067 sum3 -= *v3-- * tmp0; 1068 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1069 row--; 1070 1071 break; 1072 case 4: 1073 sum1 = tmp[row]; 1074 sum2 = tmp[row - 1]; 1075 sum3 = tmp[row - 2]; 1076 sum4 = tmp[row - 3]; 1077 v2 = aa + ai[row] - 1; 1078 v3 = aa + ai[row - 1] - 1; 1079 v4 = aa + ai[row - 2] - 1; 1080 1081 for (j = nz; j > 1; j -= 2) { 1082 vi -= 2; 1083 i0 = vi[2]; 1084 i1 = vi[1]; 1085 tmp0 = tmps[i0]; 1086 tmp1 = tmps[i1]; 1087 v1 -= 2; 1088 v2 -= 2; 1089 v3 -= 2; 1090 v4 -= 2; 1091 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1092 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1093 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1094 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1095 } 1096 if (j == 1) { 1097 tmp0 = tmps[*vi--]; 1098 sum1 -= *v1-- * tmp0; 1099 sum2 -= *v2-- * tmp0; 1100 sum3 -= *v3-- * tmp0; 1101 sum4 -= *v4-- * tmp0; 1102 } 1103 1104 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1105 row--; 1106 sum2 -= *v2-- * tmp0; 1107 sum3 -= *v3-- * tmp0; 1108 sum4 -= *v4-- * tmp0; 1109 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1110 row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1114 row--; 1115 sum4 -= *v4-- * tmp0; 1116 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1117 row--; 1118 break; 1119 case 5: 1120 sum1 = tmp[row]; 1121 sum2 = tmp[row - 1]; 1122 sum3 = tmp[row - 2]; 1123 sum4 = tmp[row - 3]; 1124 sum5 = tmp[row - 4]; 1125 v2 = aa + ai[row] - 1; 1126 v3 = aa + ai[row - 1] - 1; 1127 v4 = aa + ai[row - 2] - 1; 1128 v5 = aa + ai[row - 3] - 1; 1129 for (j = nz; j > 1; j -= 2) { 1130 vi -= 2; 1131 i0 = vi[2]; 1132 i1 = vi[1]; 1133 tmp0 = tmps[i0]; 1134 tmp1 = tmps[i1]; 1135 v1 -= 2; 1136 v2 -= 2; 1137 v3 -= 2; 1138 v4 -= 2; 1139 v5 -= 2; 1140 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1141 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1142 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1143 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1144 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1145 } 1146 if (j == 1) { 1147 tmp0 = tmps[*vi--]; 1148 sum1 -= *v1-- * tmp0; 1149 sum2 -= *v2-- * tmp0; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 } 1154 1155 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1156 row--; 1157 sum2 -= *v2-- * tmp0; 1158 sum3 -= *v3-- * tmp0; 1159 sum4 -= *v4-- * tmp0; 1160 sum5 -= *v5-- * tmp0; 1161 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1162 row--; 1163 sum3 -= *v3-- * tmp0; 1164 sum4 -= *v4-- * tmp0; 1165 sum5 -= *v5-- * tmp0; 1166 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1167 row--; 1168 sum4 -= *v4-- * tmp0; 1169 sum5 -= *v5-- * tmp0; 1170 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1171 row--; 1172 sum5 -= *v5-- * tmp0; 1173 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1174 row--; 1175 break; 1176 default: 1177 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1178 } 1179 } 1180 PetscCall(ISRestoreIndices(isrow, &rout)); 1181 PetscCall(ISRestoreIndices(iscol, &cout)); 1182 PetscCall(VecRestoreArrayRead(bb, &b)); 1183 PetscCall(VecRestoreArrayWrite(xx, &x)); 1184 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1185 PetscFunctionReturn(PETSC_SUCCESS); 1186 } 1187 1188 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1189 { 1190 Mat C = B; 1191 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1192 IS isrow = b->row, isicol = b->icol; 1193 const PetscInt *r, *ic, *ics; 1194 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1195 PetscInt i, j, k, nz, nzL, row, *pj; 1196 const PetscInt *ajtmp, *bjtmp; 1197 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1198 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1199 FactorShiftCtx sctx; 1200 const PetscInt *ddiag; 1201 PetscReal rs; 1202 MatScalar d; 1203 PetscInt inod, nodesz, node_max, col; 1204 const PetscInt *ns; 1205 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1206 1207 PetscFunctionBegin; 1208 /* MatPivotSetUp(): initialize shift context sctx */ 1209 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1210 1211 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1212 ddiag = a->diag; 1213 sctx.shift_top = info->zeropivot; 1214 for (i = 0; i < n; i++) { 1215 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1216 d = (aa)[ddiag[i]]; 1217 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1218 v = aa + ai[i]; 1219 nz = ai[i + 1] - ai[i]; 1220 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1221 if (rs > sctx.shift_top) sctx.shift_top = rs; 1222 } 1223 sctx.shift_top *= 1.1; 1224 sctx.nshift_max = 5; 1225 sctx.shift_lo = 0.; 1226 sctx.shift_hi = 1.; 1227 } 1228 1229 PetscCall(ISGetIndices(isrow, &r)); 1230 PetscCall(ISGetIndices(isicol, &ic)); 1231 1232 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1233 ics = ic; 1234 1235 node_max = a->inode.node_count; 1236 ns = a->inode.size; 1237 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1238 1239 /* If max inode size > 4, split it into two inodes.*/ 1240 /* also map the inode sizes according to the ordering */ 1241 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1242 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1243 if (ns[i] > 4) { 1244 tmp_vec1[j] = 4; 1245 ++j; 1246 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 1247 } else { 1248 tmp_vec1[j] = ns[i]; 1249 } 1250 } 1251 /* Use the correct node_max */ 1252 node_max = j; 1253 1254 /* Now reorder the inode info based on mat re-ordering info */ 1255 /* First create a row -> inode_size_array_index map */ 1256 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1257 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1258 for (i = 0, row = 0; i < node_max; i++) { 1259 nodesz = tmp_vec1[i]; 1260 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1261 } 1262 /* Using nsmap, create a reordered ns structure */ 1263 for (i = 0, j = 0; i < node_max; i++) { 1264 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1265 tmp_vec2[i] = nodesz; 1266 j += nodesz; 1267 } 1268 PetscCall(PetscFree(nsmap)); 1269 PetscCall(PetscFree(tmp_vec1)); 1270 1271 /* Now use the correct ns */ 1272 ns = tmp_vec2; 1273 1274 do { 1275 sctx.newshift = PETSC_FALSE; 1276 /* Now loop over each block-row, and do the factorization */ 1277 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1278 nodesz = ns[inod]; 1279 1280 switch (nodesz) { 1281 case 1: 1282 /* zero rtmp1 */ 1283 /* L part */ 1284 nz = bi[i + 1] - bi[i]; 1285 bjtmp = bj + bi[i]; 1286 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1287 1288 /* U part */ 1289 nz = bdiag[i] - bdiag[i + 1]; 1290 bjtmp = bj + bdiag[i + 1] + 1; 1291 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1292 1293 /* load in initial (unfactored row) */ 1294 nz = ai[r[i] + 1] - ai[r[i]]; 1295 ajtmp = aj + ai[r[i]]; 1296 v = aa + ai[r[i]]; 1297 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1298 1299 /* ZeropivotApply() */ 1300 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1301 1302 /* elimination */ 1303 bjtmp = bj + bi[i]; 1304 row = *bjtmp++; 1305 nzL = bi[i + 1] - bi[i]; 1306 for (k = 0; k < nzL; k++) { 1307 pc = rtmp1 + row; 1308 if (*pc != 0.0) { 1309 pv = b->a + bdiag[row]; 1310 mul1 = *pc * (*pv); 1311 *pc = mul1; 1312 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1313 pv = b->a + bdiag[row + 1] + 1; 1314 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1315 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1316 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1317 } 1318 row = *bjtmp++; 1319 } 1320 1321 /* finished row so stick it into b->a */ 1322 rs = 0.0; 1323 /* L part */ 1324 pv = b->a + bi[i]; 1325 pj = b->j + bi[i]; 1326 nz = bi[i + 1] - bi[i]; 1327 for (j = 0; j < nz; j++) { 1328 pv[j] = rtmp1[pj[j]]; 1329 rs += PetscAbsScalar(pv[j]); 1330 } 1331 1332 /* U part */ 1333 pv = b->a + bdiag[i + 1] + 1; 1334 pj = b->j + bdiag[i + 1] + 1; 1335 nz = bdiag[i] - bdiag[i + 1] - 1; 1336 for (j = 0; j < nz; j++) { 1337 pv[j] = rtmp1[pj[j]]; 1338 rs += PetscAbsScalar(pv[j]); 1339 } 1340 1341 /* Check zero pivot */ 1342 sctx.rs = rs; 1343 sctx.pv = rtmp1[i]; 1344 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1345 if (sctx.newshift) break; 1346 1347 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1348 pv = b->a + bdiag[i]; 1349 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1350 break; 1351 1352 case 2: 1353 /* zero rtmp1 and rtmp2 */ 1354 /* L part */ 1355 nz = bi[i + 1] - bi[i]; 1356 bjtmp = bj + bi[i]; 1357 for (j = 0; j < nz; j++) { 1358 col = bjtmp[j]; 1359 rtmp1[col] = 0.0; 1360 rtmp2[col] = 0.0; 1361 } 1362 1363 /* U part */ 1364 nz = bdiag[i] - bdiag[i + 1]; 1365 bjtmp = bj + bdiag[i + 1] + 1; 1366 for (j = 0; j < nz; j++) { 1367 col = bjtmp[j]; 1368 rtmp1[col] = 0.0; 1369 rtmp2[col] = 0.0; 1370 } 1371 1372 /* load in initial (unfactored row) */ 1373 nz = ai[r[i] + 1] - ai[r[i]]; 1374 ajtmp = aj + ai[r[i]]; 1375 v1 = aa + ai[r[i]]; 1376 v2 = aa + ai[r[i] + 1]; 1377 for (j = 0; j < nz; j++) { 1378 col = ics[ajtmp[j]]; 1379 rtmp1[col] = v1[j]; 1380 rtmp2[col] = v2[j]; 1381 } 1382 /* ZeropivotApply(): shift the diagonal of the matrix */ 1383 rtmp1[i] += sctx.shift_amount; 1384 rtmp2[i + 1] += sctx.shift_amount; 1385 1386 /* elimination */ 1387 bjtmp = bj + bi[i]; 1388 row = *bjtmp++; /* pivot row */ 1389 nzL = bi[i + 1] - bi[i]; 1390 for (k = 0; k < nzL; k++) { 1391 pc1 = rtmp1 + row; 1392 pc2 = rtmp2 + row; 1393 if (*pc1 != 0.0 || *pc2 != 0.0) { 1394 pv = b->a + bdiag[row]; 1395 mul1 = *pc1 * (*pv); 1396 mul2 = *pc2 * (*pv); 1397 *pc1 = mul1; 1398 *pc2 = mul2; 1399 1400 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1401 pv = b->a + bdiag[row + 1] + 1; 1402 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1403 for (j = 0; j < nz; j++) { 1404 col = pj[j]; 1405 rtmp1[col] -= mul1 * pv[j]; 1406 rtmp2[col] -= mul2 * pv[j]; 1407 } 1408 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1409 } 1410 row = *bjtmp++; 1411 } 1412 1413 /* finished row i; check zero pivot, then stick row i into b->a */ 1414 rs = 0.0; 1415 /* L part */ 1416 pc1 = b->a + bi[i]; 1417 pj = b->j + bi[i]; 1418 nz = bi[i + 1] - bi[i]; 1419 for (j = 0; j < nz; j++) { 1420 col = pj[j]; 1421 pc1[j] = rtmp1[col]; 1422 rs += PetscAbsScalar(pc1[j]); 1423 } 1424 /* U part */ 1425 pc1 = b->a + bdiag[i + 1] + 1; 1426 pj = b->j + bdiag[i + 1] + 1; 1427 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1428 for (j = 0; j < nz; j++) { 1429 col = pj[j]; 1430 pc1[j] = rtmp1[col]; 1431 rs += PetscAbsScalar(pc1[j]); 1432 } 1433 1434 sctx.rs = rs; 1435 sctx.pv = rtmp1[i]; 1436 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1437 if (sctx.newshift) break; 1438 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1439 *pc1 = 1.0 / sctx.pv; 1440 1441 /* Now take care of diagonal 2x2 block. */ 1442 pc2 = rtmp2 + i; 1443 if (*pc2 != 0.0) { 1444 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1445 *pc2 = mul1; /* insert L entry */ 1446 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1447 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1448 for (j = 0; j < nz; j++) { 1449 col = pj[j]; 1450 rtmp2[col] -= mul1 * rtmp1[col]; 1451 } 1452 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1453 } 1454 1455 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1456 rs = 0.0; 1457 /* L part */ 1458 pc2 = b->a + bi[i + 1]; 1459 pj = b->j + bi[i + 1]; 1460 nz = bi[i + 2] - bi[i + 1]; 1461 for (j = 0; j < nz; j++) { 1462 col = pj[j]; 1463 pc2[j] = rtmp2[col]; 1464 rs += PetscAbsScalar(pc2[j]); 1465 } 1466 /* U part */ 1467 pc2 = b->a + bdiag[i + 2] + 1; 1468 pj = b->j + bdiag[i + 2] + 1; 1469 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1470 for (j = 0; j < nz; j++) { 1471 col = pj[j]; 1472 pc2[j] = rtmp2[col]; 1473 rs += PetscAbsScalar(pc2[j]); 1474 } 1475 1476 sctx.rs = rs; 1477 sctx.pv = rtmp2[i + 1]; 1478 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1479 if (sctx.newshift) break; 1480 pc2 = b->a + bdiag[i + 1]; 1481 *pc2 = 1.0 / sctx.pv; 1482 break; 1483 1484 case 3: 1485 /* zero rtmp */ 1486 /* L part */ 1487 nz = bi[i + 1] - bi[i]; 1488 bjtmp = bj + bi[i]; 1489 for (j = 0; j < nz; j++) { 1490 col = bjtmp[j]; 1491 rtmp1[col] = 0.0; 1492 rtmp2[col] = 0.0; 1493 rtmp3[col] = 0.0; 1494 } 1495 1496 /* U part */ 1497 nz = bdiag[i] - bdiag[i + 1]; 1498 bjtmp = bj + bdiag[i + 1] + 1; 1499 for (j = 0; j < nz; j++) { 1500 col = bjtmp[j]; 1501 rtmp1[col] = 0.0; 1502 rtmp2[col] = 0.0; 1503 rtmp3[col] = 0.0; 1504 } 1505 1506 /* load in initial (unfactored row) */ 1507 nz = ai[r[i] + 1] - ai[r[i]]; 1508 ajtmp = aj + ai[r[i]]; 1509 v1 = aa + ai[r[i]]; 1510 v2 = aa + ai[r[i] + 1]; 1511 v3 = aa + ai[r[i] + 2]; 1512 for (j = 0; j < nz; j++) { 1513 col = ics[ajtmp[j]]; 1514 rtmp1[col] = v1[j]; 1515 rtmp2[col] = v2[j]; 1516 rtmp3[col] = v3[j]; 1517 } 1518 /* ZeropivotApply(): shift the diagonal of the matrix */ 1519 rtmp1[i] += sctx.shift_amount; 1520 rtmp2[i + 1] += sctx.shift_amount; 1521 rtmp3[i + 2] += sctx.shift_amount; 1522 1523 /* elimination */ 1524 bjtmp = bj + bi[i]; 1525 row = *bjtmp++; /* pivot row */ 1526 nzL = bi[i + 1] - bi[i]; 1527 for (k = 0; k < nzL; k++) { 1528 pc1 = rtmp1 + row; 1529 pc2 = rtmp2 + row; 1530 pc3 = rtmp3 + row; 1531 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1532 pv = b->a + bdiag[row]; 1533 mul1 = *pc1 * (*pv); 1534 mul2 = *pc2 * (*pv); 1535 mul3 = *pc3 * (*pv); 1536 *pc1 = mul1; 1537 *pc2 = mul2; 1538 *pc3 = mul3; 1539 1540 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1541 pv = b->a + bdiag[row + 1] + 1; 1542 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1543 for (j = 0; j < nz; j++) { 1544 col = pj[j]; 1545 rtmp1[col] -= mul1 * pv[j]; 1546 rtmp2[col] -= mul2 * pv[j]; 1547 rtmp3[col] -= mul3 * pv[j]; 1548 } 1549 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1550 } 1551 row = *bjtmp++; 1552 } 1553 1554 /* finished row i; check zero pivot, then stick row i into b->a */ 1555 rs = 0.0; 1556 /* L part */ 1557 pc1 = b->a + bi[i]; 1558 pj = b->j + bi[i]; 1559 nz = bi[i + 1] - bi[i]; 1560 for (j = 0; j < nz; j++) { 1561 col = pj[j]; 1562 pc1[j] = rtmp1[col]; 1563 rs += PetscAbsScalar(pc1[j]); 1564 } 1565 /* U part */ 1566 pc1 = b->a + bdiag[i + 1] + 1; 1567 pj = b->j + bdiag[i + 1] + 1; 1568 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1569 for (j = 0; j < nz; j++) { 1570 col = pj[j]; 1571 pc1[j] = rtmp1[col]; 1572 rs += PetscAbsScalar(pc1[j]); 1573 } 1574 1575 sctx.rs = rs; 1576 sctx.pv = rtmp1[i]; 1577 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1578 if (sctx.newshift) break; 1579 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1580 *pc1 = 1.0 / sctx.pv; 1581 1582 /* Now take care of 1st column of diagonal 3x3 block. */ 1583 pc2 = rtmp2 + i; 1584 pc3 = rtmp3 + i; 1585 if (*pc2 != 0.0 || *pc3 != 0.0) { 1586 mul2 = (*pc2) * (*pc1); 1587 *pc2 = mul2; 1588 mul3 = (*pc3) * (*pc1); 1589 *pc3 = mul3; 1590 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1591 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1592 for (j = 0; j < nz; j++) { 1593 col = pj[j]; 1594 rtmp2[col] -= mul2 * rtmp1[col]; 1595 rtmp3[col] -= mul3 * rtmp1[col]; 1596 } 1597 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1598 } 1599 1600 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1601 rs = 0.0; 1602 /* L part */ 1603 pc2 = b->a + bi[i + 1]; 1604 pj = b->j + bi[i + 1]; 1605 nz = bi[i + 2] - bi[i + 1]; 1606 for (j = 0; j < nz; j++) { 1607 col = pj[j]; 1608 pc2[j] = rtmp2[col]; 1609 rs += PetscAbsScalar(pc2[j]); 1610 } 1611 /* U part */ 1612 pc2 = b->a + bdiag[i + 2] + 1; 1613 pj = b->j + bdiag[i + 2] + 1; 1614 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1615 for (j = 0; j < nz; j++) { 1616 col = pj[j]; 1617 pc2[j] = rtmp2[col]; 1618 rs += PetscAbsScalar(pc2[j]); 1619 } 1620 1621 sctx.rs = rs; 1622 sctx.pv = rtmp2[i + 1]; 1623 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1624 if (sctx.newshift) break; 1625 pc2 = b->a + bdiag[i + 1]; 1626 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1627 1628 /* Now take care of 2nd column of diagonal 3x3 block. */ 1629 pc3 = rtmp3 + i + 1; 1630 if (*pc3 != 0.0) { 1631 mul3 = (*pc3) * (*pc2); 1632 *pc3 = mul3; 1633 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1634 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1635 for (j = 0; j < nz; j++) { 1636 col = pj[j]; 1637 rtmp3[col] -= mul3 * rtmp2[col]; 1638 } 1639 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1640 } 1641 1642 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1643 rs = 0.0; 1644 /* L part */ 1645 pc3 = b->a + bi[i + 2]; 1646 pj = b->j + bi[i + 2]; 1647 nz = bi[i + 3] - bi[i + 2]; 1648 for (j = 0; j < nz; j++) { 1649 col = pj[j]; 1650 pc3[j] = rtmp3[col]; 1651 rs += PetscAbsScalar(pc3[j]); 1652 } 1653 /* U part */ 1654 pc3 = b->a + bdiag[i + 3] + 1; 1655 pj = b->j + bdiag[i + 3] + 1; 1656 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1657 for (j = 0; j < nz; j++) { 1658 col = pj[j]; 1659 pc3[j] = rtmp3[col]; 1660 rs += PetscAbsScalar(pc3[j]); 1661 } 1662 1663 sctx.rs = rs; 1664 sctx.pv = rtmp3[i + 2]; 1665 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1666 if (sctx.newshift) break; 1667 pc3 = b->a + bdiag[i + 2]; 1668 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1669 break; 1670 case 4: 1671 /* zero rtmp */ 1672 /* L part */ 1673 nz = bi[i + 1] - bi[i]; 1674 bjtmp = bj + bi[i]; 1675 for (j = 0; j < nz; j++) { 1676 col = bjtmp[j]; 1677 rtmp1[col] = 0.0; 1678 rtmp2[col] = 0.0; 1679 rtmp3[col] = 0.0; 1680 rtmp4[col] = 0.0; 1681 } 1682 1683 /* U part */ 1684 nz = bdiag[i] - bdiag[i + 1]; 1685 bjtmp = bj + bdiag[i + 1] + 1; 1686 for (j = 0; j < nz; j++) { 1687 col = bjtmp[j]; 1688 rtmp1[col] = 0.0; 1689 rtmp2[col] = 0.0; 1690 rtmp3[col] = 0.0; 1691 rtmp4[col] = 0.0; 1692 } 1693 1694 /* load in initial (unfactored row) */ 1695 nz = ai[r[i] + 1] - ai[r[i]]; 1696 ajtmp = aj + ai[r[i]]; 1697 v1 = aa + ai[r[i]]; 1698 v2 = aa + ai[r[i] + 1]; 1699 v3 = aa + ai[r[i] + 2]; 1700 v4 = aa + ai[r[i] + 3]; 1701 for (j = 0; j < nz; j++) { 1702 col = ics[ajtmp[j]]; 1703 rtmp1[col] = v1[j]; 1704 rtmp2[col] = v2[j]; 1705 rtmp3[col] = v3[j]; 1706 rtmp4[col] = v4[j]; 1707 } 1708 /* ZeropivotApply(): shift the diagonal of the matrix */ 1709 rtmp1[i] += sctx.shift_amount; 1710 rtmp2[i + 1] += sctx.shift_amount; 1711 rtmp3[i + 2] += sctx.shift_amount; 1712 rtmp4[i + 3] += sctx.shift_amount; 1713 1714 /* elimination */ 1715 bjtmp = bj + bi[i]; 1716 row = *bjtmp++; /* pivot row */ 1717 nzL = bi[i + 1] - bi[i]; 1718 for (k = 0; k < nzL; k++) { 1719 pc1 = rtmp1 + row; 1720 pc2 = rtmp2 + row; 1721 pc3 = rtmp3 + row; 1722 pc4 = rtmp4 + row; 1723 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1724 pv = b->a + bdiag[row]; 1725 mul1 = *pc1 * (*pv); 1726 mul2 = *pc2 * (*pv); 1727 mul3 = *pc3 * (*pv); 1728 mul4 = *pc4 * (*pv); 1729 *pc1 = mul1; 1730 *pc2 = mul2; 1731 *pc3 = mul3; 1732 *pc4 = mul4; 1733 1734 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1735 pv = b->a + bdiag[row + 1] + 1; 1736 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1737 for (j = 0; j < nz; j++) { 1738 col = pj[j]; 1739 rtmp1[col] -= mul1 * pv[j]; 1740 rtmp2[col] -= mul2 * pv[j]; 1741 rtmp3[col] -= mul3 * pv[j]; 1742 rtmp4[col] -= mul4 * pv[j]; 1743 } 1744 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1745 } 1746 row = *bjtmp++; 1747 } 1748 1749 /* finished row i; check zero pivot, then stick row i into b->a */ 1750 rs = 0.0; 1751 /* L part */ 1752 pc1 = b->a + bi[i]; 1753 pj = b->j + bi[i]; 1754 nz = bi[i + 1] - bi[i]; 1755 for (j = 0; j < nz; j++) { 1756 col = pj[j]; 1757 pc1[j] = rtmp1[col]; 1758 rs += PetscAbsScalar(pc1[j]); 1759 } 1760 /* U part */ 1761 pc1 = b->a + bdiag[i + 1] + 1; 1762 pj = b->j + bdiag[i + 1] + 1; 1763 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1764 for (j = 0; j < nz; j++) { 1765 col = pj[j]; 1766 pc1[j] = rtmp1[col]; 1767 rs += PetscAbsScalar(pc1[j]); 1768 } 1769 1770 sctx.rs = rs; 1771 sctx.pv = rtmp1[i]; 1772 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1773 if (sctx.newshift) break; 1774 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1775 *pc1 = 1.0 / sctx.pv; 1776 1777 /* Now take care of 1st column of diagonal 4x4 block. */ 1778 pc2 = rtmp2 + i; 1779 pc3 = rtmp3 + i; 1780 pc4 = rtmp4 + i; 1781 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1782 mul2 = (*pc2) * (*pc1); 1783 *pc2 = mul2; 1784 mul3 = (*pc3) * (*pc1); 1785 *pc3 = mul3; 1786 mul4 = (*pc4) * (*pc1); 1787 *pc4 = mul4; 1788 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1789 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1790 for (j = 0; j < nz; j++) { 1791 col = pj[j]; 1792 rtmp2[col] -= mul2 * rtmp1[col]; 1793 rtmp3[col] -= mul3 * rtmp1[col]; 1794 rtmp4[col] -= mul4 * rtmp1[col]; 1795 } 1796 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1797 } 1798 1799 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1800 rs = 0.0; 1801 /* L part */ 1802 pc2 = b->a + bi[i + 1]; 1803 pj = b->j + bi[i + 1]; 1804 nz = bi[i + 2] - bi[i + 1]; 1805 for (j = 0; j < nz; j++) { 1806 col = pj[j]; 1807 pc2[j] = rtmp2[col]; 1808 rs += PetscAbsScalar(pc2[j]); 1809 } 1810 /* U part */ 1811 pc2 = b->a + bdiag[i + 2] + 1; 1812 pj = b->j + bdiag[i + 2] + 1; 1813 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1814 for (j = 0; j < nz; j++) { 1815 col = pj[j]; 1816 pc2[j] = rtmp2[col]; 1817 rs += PetscAbsScalar(pc2[j]); 1818 } 1819 1820 sctx.rs = rs; 1821 sctx.pv = rtmp2[i + 1]; 1822 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1823 if (sctx.newshift) break; 1824 pc2 = b->a + bdiag[i + 1]; 1825 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1826 1827 /* Now take care of 2nd column of diagonal 4x4 block. */ 1828 pc3 = rtmp3 + i + 1; 1829 pc4 = rtmp4 + i + 1; 1830 if (*pc3 != 0.0 || *pc4 != 0.0) { 1831 mul3 = (*pc3) * (*pc2); 1832 *pc3 = mul3; 1833 mul4 = (*pc4) * (*pc2); 1834 *pc4 = mul4; 1835 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1836 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1837 for (j = 0; j < nz; j++) { 1838 col = pj[j]; 1839 rtmp3[col] -= mul3 * rtmp2[col]; 1840 rtmp4[col] -= mul4 * rtmp2[col]; 1841 } 1842 PetscCall(PetscLogFlops(4.0 * nz)); 1843 } 1844 1845 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1846 rs = 0.0; 1847 /* L part */ 1848 pc3 = b->a + bi[i + 2]; 1849 pj = b->j + bi[i + 2]; 1850 nz = bi[i + 3] - bi[i + 2]; 1851 for (j = 0; j < nz; j++) { 1852 col = pj[j]; 1853 pc3[j] = rtmp3[col]; 1854 rs += PetscAbsScalar(pc3[j]); 1855 } 1856 /* U part */ 1857 pc3 = b->a + bdiag[i + 3] + 1; 1858 pj = b->j + bdiag[i + 3] + 1; 1859 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1860 for (j = 0; j < nz; j++) { 1861 col = pj[j]; 1862 pc3[j] = rtmp3[col]; 1863 rs += PetscAbsScalar(pc3[j]); 1864 } 1865 1866 sctx.rs = rs; 1867 sctx.pv = rtmp3[i + 2]; 1868 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1869 if (sctx.newshift) break; 1870 pc3 = b->a + bdiag[i + 2]; 1871 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1872 1873 /* Now take care of 3rd column of diagonal 4x4 block. */ 1874 pc4 = rtmp4 + i + 2; 1875 if (*pc4 != 0.0) { 1876 mul4 = (*pc4) * (*pc3); 1877 *pc4 = mul4; 1878 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1879 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1880 for (j = 0; j < nz; j++) { 1881 col = pj[j]; 1882 rtmp4[col] -= mul4 * rtmp3[col]; 1883 } 1884 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1885 } 1886 1887 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1888 rs = 0.0; 1889 /* L part */ 1890 pc4 = b->a + bi[i + 3]; 1891 pj = b->j + bi[i + 3]; 1892 nz = bi[i + 4] - bi[i + 3]; 1893 for (j = 0; j < nz; j++) { 1894 col = pj[j]; 1895 pc4[j] = rtmp4[col]; 1896 rs += PetscAbsScalar(pc4[j]); 1897 } 1898 /* U part */ 1899 pc4 = b->a + bdiag[i + 4] + 1; 1900 pj = b->j + bdiag[i + 4] + 1; 1901 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1902 for (j = 0; j < nz; j++) { 1903 col = pj[j]; 1904 pc4[j] = rtmp4[col]; 1905 rs += PetscAbsScalar(pc4[j]); 1906 } 1907 1908 sctx.rs = rs; 1909 sctx.pv = rtmp4[i + 3]; 1910 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1911 if (sctx.newshift) break; 1912 pc4 = b->a + bdiag[i + 3]; 1913 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1914 break; 1915 1916 default: 1917 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1918 } 1919 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1920 i += nodesz; /* Update the row */ 1921 } 1922 1923 /* MatPivotRefine() */ 1924 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1925 /* 1926 * if no shift in this attempt & shifting & started shifting & can refine, 1927 * then try lower shift 1928 */ 1929 sctx.shift_hi = sctx.shift_fraction; 1930 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1931 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1932 sctx.newshift = PETSC_TRUE; 1933 sctx.nshift++; 1934 } 1935 } while (sctx.newshift); 1936 1937 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1938 PetscCall(PetscFree(tmp_vec2)); 1939 PetscCall(ISRestoreIndices(isicol, &ic)); 1940 PetscCall(ISRestoreIndices(isrow, &r)); 1941 1942 if (b->inode.size) { 1943 C->ops->solve = MatSolve_SeqAIJ_Inode; 1944 } else { 1945 C->ops->solve = MatSolve_SeqAIJ; 1946 } 1947 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1948 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1949 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1950 C->ops->matsolve = MatMatSolve_SeqAIJ; 1951 C->assembled = PETSC_TRUE; 1952 C->preallocated = PETSC_TRUE; 1953 1954 PetscCall(PetscLogFlops(C->cmap->n)); 1955 1956 /* MatShiftView(A,info,&sctx) */ 1957 if (sctx.nshift) { 1958 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1959 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1960 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1961 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1962 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1963 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1964 } 1965 } 1966 PetscFunctionReturn(PETSC_SUCCESS); 1967 } 1968 1969 #if 0 1970 // unused 1971 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1972 { 1973 Mat C = B; 1974 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1975 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1976 const PetscInt *r, *ic, *c, *ics; 1977 PetscInt n = A->rmap->n, *bi = b->i; 1978 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1979 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1980 PetscInt *ai = a->i, *aj = a->j; 1981 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1982 PetscScalar mul1, mul2, mul3, tmp; 1983 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1984 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1985 PetscReal rs = 0.0; 1986 FactorShiftCtx sctx; 1987 1988 PetscFunctionBegin; 1989 sctx.shift_top = 0; 1990 sctx.nshift_max = 0; 1991 sctx.shift_lo = 0; 1992 sctx.shift_hi = 0; 1993 sctx.shift_fraction = 0; 1994 1995 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1996 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1997 sctx.shift_top = 0; 1998 for (i = 0; i < n; i++) { 1999 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 2000 rs = 0.0; 2001 ajtmp = aj + ai[i]; 2002 rtmp1 = aa + ai[i]; 2003 nz = ai[i + 1] - ai[i]; 2004 for (j = 0; j < nz; j++) { 2005 if (*ajtmp != i) { 2006 rs += PetscAbsScalar(*rtmp1++); 2007 } else { 2008 rs -= PetscRealPart(*rtmp1++); 2009 } 2010 ajtmp++; 2011 } 2012 if (rs > sctx.shift_top) sctx.shift_top = rs; 2013 } 2014 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2015 sctx.shift_top *= 1.1; 2016 sctx.nshift_max = 5; 2017 sctx.shift_lo = 0.; 2018 sctx.shift_hi = 1.; 2019 } 2020 sctx.shift_amount = 0; 2021 sctx.nshift = 0; 2022 2023 PetscCall(ISGetIndices(isrow, &r)); 2024 PetscCall(ISGetIndices(iscol, &c)); 2025 PetscCall(ISGetIndices(isicol, &ic)); 2026 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2027 ics = ic; 2028 2029 node_max = a->inode.node_count; 2030 ns = a->inode.size; 2031 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2032 2033 /* If max inode size > 3, split it into two inodes.*/ 2034 /* also map the inode sizes according to the ordering */ 2035 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2036 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2037 if (ns[i] > 3) { 2038 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2039 ++j; 2040 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2041 } else { 2042 tmp_vec1[j] = ns[i]; 2043 } 2044 } 2045 /* Use the correct node_max */ 2046 node_max = j; 2047 2048 /* Now reorder the inode info based on mat re-ordering info */ 2049 /* First create a row -> inode_size_array_index map */ 2050 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2051 for (i = 0, row = 0; i < node_max; i++) { 2052 nodesz = tmp_vec1[i]; 2053 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2054 } 2055 /* Using nsmap, create a reordered ns structure */ 2056 for (i = 0, j = 0; i < node_max; i++) { 2057 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2058 tmp_vec2[i] = nodesz; 2059 j += nodesz; 2060 } 2061 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2062 /* Now use the correct ns */ 2063 ns = tmp_vec2; 2064 2065 do { 2066 sctx.newshift = PETSC_FALSE; 2067 /* Now loop over each block-row, and do the factorization */ 2068 for (i = 0, row = 0; i < node_max; i++) { 2069 nodesz = ns[i]; 2070 nz = bi[row + 1] - bi[row]; 2071 bjtmp = bj + bi[row]; 2072 2073 switch (nodesz) { 2074 case 1: 2075 for (j = 0; j < nz; j++) { 2076 idx = bjtmp[j]; 2077 rtmp11[idx] = 0.0; 2078 } 2079 2080 /* load in initial (unfactored row) */ 2081 idx = r[row]; 2082 nz_tmp = ai[idx + 1] - ai[idx]; 2083 ajtmp = aj + ai[idx]; 2084 v1 = aa + ai[idx]; 2085 2086 for (j = 0; j < nz_tmp; j++) { 2087 idx = ics[ajtmp[j]]; 2088 rtmp11[idx] = v1[j]; 2089 } 2090 rtmp11[ics[r[row]]] += sctx.shift_amount; 2091 2092 prow = *bjtmp++; 2093 while (prow < row) { 2094 pc1 = rtmp11 + prow; 2095 if (*pc1 != 0.0) { 2096 pv = ba + bd[prow]; 2097 pj = nbj + bd[prow]; 2098 mul1 = *pc1 * *pv++; 2099 *pc1 = mul1; 2100 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2101 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2102 for (j = 0; j < nz_tmp; j++) { 2103 tmp = pv[j]; 2104 idx = pj[j]; 2105 rtmp11[idx] -= mul1 * tmp; 2106 } 2107 } 2108 prow = *bjtmp++; 2109 } 2110 pj = bj + bi[row]; 2111 pc1 = ba + bi[row]; 2112 2113 sctx.pv = rtmp11[row]; 2114 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2115 rs = 0.0; 2116 for (j = 0; j < nz; j++) { 2117 idx = pj[j]; 2118 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2119 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2120 } 2121 sctx.rs = rs; 2122 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2123 if (sctx.newshift) goto endofwhile; 2124 break; 2125 2126 case 2: 2127 for (j = 0; j < nz; j++) { 2128 idx = bjtmp[j]; 2129 rtmp11[idx] = 0.0; 2130 rtmp22[idx] = 0.0; 2131 } 2132 2133 /* load in initial (unfactored row) */ 2134 idx = r[row]; 2135 nz_tmp = ai[idx + 1] - ai[idx]; 2136 ajtmp = aj + ai[idx]; 2137 v1 = aa + ai[idx]; 2138 v2 = aa + ai[idx + 1]; 2139 for (j = 0; j < nz_tmp; j++) { 2140 idx = ics[ajtmp[j]]; 2141 rtmp11[idx] = v1[j]; 2142 rtmp22[idx] = v2[j]; 2143 } 2144 rtmp11[ics[r[row]]] += sctx.shift_amount; 2145 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2146 2147 prow = *bjtmp++; 2148 while (prow < row) { 2149 pc1 = rtmp11 + prow; 2150 pc2 = rtmp22 + prow; 2151 if (*pc1 != 0.0 || *pc2 != 0.0) { 2152 pv = ba + bd[prow]; 2153 pj = nbj + bd[prow]; 2154 mul1 = *pc1 * *pv; 2155 mul2 = *pc2 * *pv; 2156 ++pv; 2157 *pc1 = mul1; 2158 *pc2 = mul2; 2159 2160 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2161 for (j = 0; j < nz_tmp; j++) { 2162 tmp = pv[j]; 2163 idx = pj[j]; 2164 rtmp11[idx] -= mul1 * tmp; 2165 rtmp22[idx] -= mul2 * tmp; 2166 } 2167 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2168 } 2169 prow = *bjtmp++; 2170 } 2171 2172 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2173 pc1 = rtmp11 + prow; 2174 pc2 = rtmp22 + prow; 2175 2176 sctx.pv = *pc1; 2177 pj = bj + bi[prow]; 2178 rs = 0.0; 2179 for (j = 0; j < nz; j++) { 2180 idx = pj[j]; 2181 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2182 } 2183 sctx.rs = rs; 2184 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2185 if (sctx.newshift) goto endofwhile; 2186 2187 if (*pc2 != 0.0) { 2188 pj = nbj + bd[prow]; 2189 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2190 *pc2 = mul2; 2191 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2192 for (j = 0; j < nz_tmp; j++) { 2193 idx = pj[j]; 2194 tmp = rtmp11[idx]; 2195 rtmp22[idx] -= mul2 * tmp; 2196 } 2197 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2198 } 2199 2200 pj = bj + bi[row]; 2201 pc1 = ba + bi[row]; 2202 pc2 = ba + bi[row + 1]; 2203 2204 sctx.pv = rtmp22[row + 1]; 2205 rs = 0.0; 2206 rtmp11[row] = 1.0 / rtmp11[row]; 2207 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2208 /* copy row entries from dense representation to sparse */ 2209 for (j = 0; j < nz; j++) { 2210 idx = pj[j]; 2211 pc1[j] = rtmp11[idx]; 2212 pc2[j] = rtmp22[idx]; 2213 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2214 } 2215 sctx.rs = rs; 2216 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2217 if (sctx.newshift) goto endofwhile; 2218 break; 2219 2220 case 3: 2221 for (j = 0; j < nz; j++) { 2222 idx = bjtmp[j]; 2223 rtmp11[idx] = 0.0; 2224 rtmp22[idx] = 0.0; 2225 rtmp33[idx] = 0.0; 2226 } 2227 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2228 idx = r[row]; 2229 nz_tmp = ai[idx + 1] - ai[idx]; 2230 ajtmp = aj + ai[idx]; 2231 v1 = aa + ai[idx]; 2232 v2 = aa + ai[idx + 1]; 2233 v3 = aa + ai[idx + 2]; 2234 for (j = 0; j < nz_tmp; j++) { 2235 idx = ics[ajtmp[j]]; 2236 rtmp11[idx] = v1[j]; 2237 rtmp22[idx] = v2[j]; 2238 rtmp33[idx] = v3[j]; 2239 } 2240 rtmp11[ics[r[row]]] += sctx.shift_amount; 2241 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2242 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2243 2244 /* loop over all pivot row blocks above this row block */ 2245 prow = *bjtmp++; 2246 while (prow < row) { 2247 pc1 = rtmp11 + prow; 2248 pc2 = rtmp22 + prow; 2249 pc3 = rtmp33 + prow; 2250 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2251 pv = ba + bd[prow]; 2252 pj = nbj + bd[prow]; 2253 mul1 = *pc1 * *pv; 2254 mul2 = *pc2 * *pv; 2255 mul3 = *pc3 * *pv; 2256 ++pv; 2257 *pc1 = mul1; 2258 *pc2 = mul2; 2259 *pc3 = mul3; 2260 2261 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2262 /* update this row based on pivot row */ 2263 for (j = 0; j < nz_tmp; j++) { 2264 tmp = pv[j]; 2265 idx = pj[j]; 2266 rtmp11[idx] -= mul1 * tmp; 2267 rtmp22[idx] -= mul2 * tmp; 2268 rtmp33[idx] -= mul3 * tmp; 2269 } 2270 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2271 } 2272 prow = *bjtmp++; 2273 } 2274 2275 /* Now take care of diagonal 3x3 block in this set of rows */ 2276 /* note: prow = row here */ 2277 pc1 = rtmp11 + prow; 2278 pc2 = rtmp22 + prow; 2279 pc3 = rtmp33 + prow; 2280 2281 sctx.pv = *pc1; 2282 pj = bj + bi[prow]; 2283 rs = 0.0; 2284 for (j = 0; j < nz; j++) { 2285 idx = pj[j]; 2286 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2287 } 2288 sctx.rs = rs; 2289 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2290 if (sctx.newshift) goto endofwhile; 2291 2292 if (*pc2 != 0.0 || *pc3 != 0.0) { 2293 mul2 = (*pc2) / (*pc1); 2294 mul3 = (*pc3) / (*pc1); 2295 *pc2 = mul2; 2296 *pc3 = mul3; 2297 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2298 pj = nbj + bd[prow]; 2299 for (j = 0; j < nz_tmp; j++) { 2300 idx = pj[j]; 2301 tmp = rtmp11[idx]; 2302 rtmp22[idx] -= mul2 * tmp; 2303 rtmp33[idx] -= mul3 * tmp; 2304 } 2305 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2306 } 2307 ++prow; 2308 2309 pc2 = rtmp22 + prow; 2310 pc3 = rtmp33 + prow; 2311 sctx.pv = *pc2; 2312 pj = bj + bi[prow]; 2313 rs = 0.0; 2314 for (j = 0; j < nz; j++) { 2315 idx = pj[j]; 2316 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2317 } 2318 sctx.rs = rs; 2319 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2320 if (sctx.newshift) goto endofwhile; 2321 2322 if (*pc3 != 0.0) { 2323 mul3 = (*pc3) / (*pc2); 2324 *pc3 = mul3; 2325 pj = nbj + bd[prow]; 2326 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2327 for (j = 0; j < nz_tmp; j++) { 2328 idx = pj[j]; 2329 tmp = rtmp22[idx]; 2330 rtmp33[idx] -= mul3 * tmp; 2331 } 2332 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2333 } 2334 2335 pj = bj + bi[row]; 2336 pc1 = ba + bi[row]; 2337 pc2 = ba + bi[row + 1]; 2338 pc3 = ba + bi[row + 2]; 2339 2340 sctx.pv = rtmp33[row + 2]; 2341 rs = 0.0; 2342 rtmp11[row] = 1.0 / rtmp11[row]; 2343 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2344 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2345 /* copy row entries from dense representation to sparse */ 2346 for (j = 0; j < nz; j++) { 2347 idx = pj[j]; 2348 pc1[j] = rtmp11[idx]; 2349 pc2[j] = rtmp22[idx]; 2350 pc3[j] = rtmp33[idx]; 2351 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2352 } 2353 2354 sctx.rs = rs; 2355 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2356 if (sctx.newshift) goto endofwhile; 2357 break; 2358 2359 default: 2360 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2361 } 2362 row += nodesz; /* Update the row */ 2363 } 2364 endofwhile:; 2365 } while (sctx.newshift); 2366 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2367 PetscCall(PetscFree(tmp_vec2)); 2368 PetscCall(ISRestoreIndices(isicol, &ic)); 2369 PetscCall(ISRestoreIndices(isrow, &r)); 2370 PetscCall(ISRestoreIndices(iscol, &c)); 2371 2372 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2373 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2374 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2375 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2376 C->assembled = PETSC_TRUE; 2377 C->preallocated = PETSC_TRUE; 2378 if (sctx.nshift) { 2379 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2380 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2381 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2382 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2383 } 2384 } 2385 PetscCall(PetscLogFlops(C->cmap->n)); 2386 PetscCall(MatSeqAIJCheckInode(C)); 2387 PetscFunctionReturn(PETSC_SUCCESS); 2388 } 2389 #endif 2390 2391 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2392 { 2393 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2394 IS iscol = a->col, isrow = a->row; 2395 const PetscInt *r, *c, *rout, *cout; 2396 PetscInt i, j, n = A->rmap->n; 2397 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2398 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2399 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2400 PetscScalar sum1, sum2, sum3, sum4, sum5; 2401 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2402 const PetscScalar *b; 2403 2404 PetscFunctionBegin; 2405 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2406 node_max = a->inode.node_count; 2407 ns = a->inode.size; /* Node Size array */ 2408 2409 PetscCall(VecGetArrayRead(bb, &b)); 2410 PetscCall(VecGetArrayWrite(xx, &x)); 2411 tmp = a->solve_work; 2412 2413 PetscCall(ISGetIndices(isrow, &rout)); 2414 r = rout; 2415 PetscCall(ISGetIndices(iscol, &cout)); 2416 c = cout; 2417 2418 /* forward solve the lower triangular */ 2419 tmps = tmp; 2420 aa = a_a; 2421 aj = a_j; 2422 ad = a->diag; 2423 2424 for (i = 0, row = 0; i < node_max; ++i) { 2425 nsz = ns[i]; 2426 aii = ai[row]; 2427 v1 = aa + aii; 2428 vi = aj + aii; 2429 nz = ai[row + 1] - ai[row]; 2430 2431 if (i < node_max - 1) { 2432 /* Prefetch the indices for the next block */ 2433 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2434 /* Prefetch the data for the next block */ 2435 PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2436 } 2437 2438 switch (nsz) { /* Each loop in 'case' is unrolled */ 2439 case 1: 2440 sum1 = b[r[row]]; 2441 for (j = 0; j < nz - 1; j += 2) { 2442 i0 = vi[j]; 2443 i1 = vi[j + 1]; 2444 tmp0 = tmps[i0]; 2445 tmp1 = tmps[i1]; 2446 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2447 } 2448 if (j == nz - 1) { 2449 tmp0 = tmps[vi[j]]; 2450 sum1 -= v1[j] * tmp0; 2451 } 2452 tmp[row++] = sum1; 2453 break; 2454 case 2: 2455 sum1 = b[r[row]]; 2456 sum2 = b[r[row + 1]]; 2457 v2 = aa + ai[row + 1]; 2458 2459 for (j = 0; j < nz - 1; j += 2) { 2460 i0 = vi[j]; 2461 i1 = vi[j + 1]; 2462 tmp0 = tmps[i0]; 2463 tmp1 = tmps[i1]; 2464 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2465 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2466 } 2467 if (j == nz - 1) { 2468 tmp0 = tmps[vi[j]]; 2469 sum1 -= v1[j] * tmp0; 2470 sum2 -= v2[j] * tmp0; 2471 } 2472 sum2 -= v2[nz] * sum1; 2473 tmp[row++] = sum1; 2474 tmp[row++] = sum2; 2475 break; 2476 case 3: 2477 sum1 = b[r[row]]; 2478 sum2 = b[r[row + 1]]; 2479 sum3 = b[r[row + 2]]; 2480 v2 = aa + ai[row + 1]; 2481 v3 = aa + ai[row + 2]; 2482 2483 for (j = 0; j < nz - 1; j += 2) { 2484 i0 = vi[j]; 2485 i1 = vi[j + 1]; 2486 tmp0 = tmps[i0]; 2487 tmp1 = tmps[i1]; 2488 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2489 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2490 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2491 } 2492 if (j == nz - 1) { 2493 tmp0 = tmps[vi[j]]; 2494 sum1 -= v1[j] * tmp0; 2495 sum2 -= v2[j] * tmp0; 2496 sum3 -= v3[j] * tmp0; 2497 } 2498 sum2 -= v2[nz] * sum1; 2499 sum3 -= v3[nz] * sum1; 2500 sum3 -= v3[nz + 1] * sum2; 2501 tmp[row++] = sum1; 2502 tmp[row++] = sum2; 2503 tmp[row++] = sum3; 2504 break; 2505 2506 case 4: 2507 sum1 = b[r[row]]; 2508 sum2 = b[r[row + 1]]; 2509 sum3 = b[r[row + 2]]; 2510 sum4 = b[r[row + 3]]; 2511 v2 = aa + ai[row + 1]; 2512 v3 = aa + ai[row + 2]; 2513 v4 = aa + ai[row + 3]; 2514 2515 for (j = 0; j < nz - 1; j += 2) { 2516 i0 = vi[j]; 2517 i1 = vi[j + 1]; 2518 tmp0 = tmps[i0]; 2519 tmp1 = tmps[i1]; 2520 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2521 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2522 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2523 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2524 } 2525 if (j == nz - 1) { 2526 tmp0 = tmps[vi[j]]; 2527 sum1 -= v1[j] * tmp0; 2528 sum2 -= v2[j] * tmp0; 2529 sum3 -= v3[j] * tmp0; 2530 sum4 -= v4[j] * tmp0; 2531 } 2532 sum2 -= v2[nz] * sum1; 2533 sum3 -= v3[nz] * sum1; 2534 sum4 -= v4[nz] * sum1; 2535 sum3 -= v3[nz + 1] * sum2; 2536 sum4 -= v4[nz + 1] * sum2; 2537 sum4 -= v4[nz + 2] * sum3; 2538 2539 tmp[row++] = sum1; 2540 tmp[row++] = sum2; 2541 tmp[row++] = sum3; 2542 tmp[row++] = sum4; 2543 break; 2544 case 5: 2545 sum1 = b[r[row]]; 2546 sum2 = b[r[row + 1]]; 2547 sum3 = b[r[row + 2]]; 2548 sum4 = b[r[row + 3]]; 2549 sum5 = b[r[row + 4]]; 2550 v2 = aa + ai[row + 1]; 2551 v3 = aa + ai[row + 2]; 2552 v4 = aa + ai[row + 3]; 2553 v5 = aa + ai[row + 4]; 2554 2555 for (j = 0; j < nz - 1; j += 2) { 2556 i0 = vi[j]; 2557 i1 = vi[j + 1]; 2558 tmp0 = tmps[i0]; 2559 tmp1 = tmps[i1]; 2560 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2561 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2562 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2563 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2564 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2565 } 2566 if (j == nz - 1) { 2567 tmp0 = tmps[vi[j]]; 2568 sum1 -= v1[j] * tmp0; 2569 sum2 -= v2[j] * tmp0; 2570 sum3 -= v3[j] * tmp0; 2571 sum4 -= v4[j] * tmp0; 2572 sum5 -= v5[j] * tmp0; 2573 } 2574 2575 sum2 -= v2[nz] * sum1; 2576 sum3 -= v3[nz] * sum1; 2577 sum4 -= v4[nz] * sum1; 2578 sum5 -= v5[nz] * sum1; 2579 sum3 -= v3[nz + 1] * sum2; 2580 sum4 -= v4[nz + 1] * sum2; 2581 sum5 -= v5[nz + 1] * sum2; 2582 sum4 -= v4[nz + 2] * sum3; 2583 sum5 -= v5[nz + 2] * sum3; 2584 sum5 -= v5[nz + 3] * sum4; 2585 2586 tmp[row++] = sum1; 2587 tmp[row++] = sum2; 2588 tmp[row++] = sum3; 2589 tmp[row++] = sum4; 2590 tmp[row++] = sum5; 2591 break; 2592 default: 2593 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2594 } 2595 } 2596 /* backward solve the upper triangular */ 2597 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 2598 nsz = ns[i]; 2599 aii = ad[row + 1] + 1; 2600 v1 = aa + aii; 2601 vi = aj + aii; 2602 nz = ad[row] - ad[row + 1] - 1; 2603 2604 if (i > 0) { 2605 /* Prefetch the indices for the next block */ 2606 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2607 /* Prefetch the data for the next block */ 2608 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2609 } 2610 2611 switch (nsz) { /* Each loop in 'case' is unrolled */ 2612 case 1: 2613 sum1 = tmp[row]; 2614 2615 for (j = 0; j < nz - 1; j += 2) { 2616 i0 = vi[j]; 2617 i1 = vi[j + 1]; 2618 tmp0 = tmps[i0]; 2619 tmp1 = tmps[i1]; 2620 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2621 } 2622 if (j == nz - 1) { 2623 tmp0 = tmps[vi[j]]; 2624 sum1 -= v1[j] * tmp0; 2625 } 2626 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2627 row--; 2628 break; 2629 case 2: 2630 sum1 = tmp[row]; 2631 sum2 = tmp[row - 1]; 2632 v2 = aa + ad[row] + 1; 2633 for (j = 0; j < nz - 1; j += 2) { 2634 i0 = vi[j]; 2635 i1 = vi[j + 1]; 2636 tmp0 = tmps[i0]; 2637 tmp1 = tmps[i1]; 2638 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2639 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2640 } 2641 if (j == nz - 1) { 2642 tmp0 = tmps[vi[j]]; 2643 sum1 -= v1[j] * tmp0; 2644 sum2 -= v2[j + 1] * tmp0; 2645 } 2646 2647 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2648 row--; 2649 sum2 -= v2[0] * tmp0; 2650 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2651 row--; 2652 break; 2653 case 3: 2654 sum1 = tmp[row]; 2655 sum2 = tmp[row - 1]; 2656 sum3 = tmp[row - 2]; 2657 v2 = aa + ad[row] + 1; 2658 v3 = aa + ad[row - 1] + 1; 2659 for (j = 0; j < nz - 1; j += 2) { 2660 i0 = vi[j]; 2661 i1 = vi[j + 1]; 2662 tmp0 = tmps[i0]; 2663 tmp1 = tmps[i1]; 2664 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2665 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2666 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2667 } 2668 if (j == nz - 1) { 2669 tmp0 = tmps[vi[j]]; 2670 sum1 -= v1[j] * tmp0; 2671 sum2 -= v2[j + 1] * tmp0; 2672 sum3 -= v3[j + 2] * tmp0; 2673 } 2674 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2675 row--; 2676 sum2 -= v2[0] * tmp0; 2677 sum3 -= v3[1] * tmp0; 2678 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2679 row--; 2680 sum3 -= v3[0] * tmp0; 2681 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2682 row--; 2683 2684 break; 2685 case 4: 2686 sum1 = tmp[row]; 2687 sum2 = tmp[row - 1]; 2688 sum3 = tmp[row - 2]; 2689 sum4 = tmp[row - 3]; 2690 v2 = aa + ad[row] + 1; 2691 v3 = aa + ad[row - 1] + 1; 2692 v4 = aa + ad[row - 2] + 1; 2693 2694 for (j = 0; j < nz - 1; j += 2) { 2695 i0 = vi[j]; 2696 i1 = vi[j + 1]; 2697 tmp0 = tmps[i0]; 2698 tmp1 = tmps[i1]; 2699 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2700 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2701 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2702 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2703 } 2704 if (j == nz - 1) { 2705 tmp0 = tmps[vi[j]]; 2706 sum1 -= v1[j] * tmp0; 2707 sum2 -= v2[j + 1] * tmp0; 2708 sum3 -= v3[j + 2] * tmp0; 2709 sum4 -= v4[j + 3] * tmp0; 2710 } 2711 2712 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2713 row--; 2714 sum2 -= v2[0] * tmp0; 2715 sum3 -= v3[1] * tmp0; 2716 sum4 -= v4[2] * tmp0; 2717 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2718 row--; 2719 sum3 -= v3[0] * tmp0; 2720 sum4 -= v4[1] * tmp0; 2721 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2722 row--; 2723 sum4 -= v4[0] * tmp0; 2724 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2725 row--; 2726 break; 2727 case 5: 2728 sum1 = tmp[row]; 2729 sum2 = tmp[row - 1]; 2730 sum3 = tmp[row - 2]; 2731 sum4 = tmp[row - 3]; 2732 sum5 = tmp[row - 4]; 2733 v2 = aa + ad[row] + 1; 2734 v3 = aa + ad[row - 1] + 1; 2735 v4 = aa + ad[row - 2] + 1; 2736 v5 = aa + ad[row - 3] + 1; 2737 for (j = 0; j < nz - 1; j += 2) { 2738 i0 = vi[j]; 2739 i1 = vi[j + 1]; 2740 tmp0 = tmps[i0]; 2741 tmp1 = tmps[i1]; 2742 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2743 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2744 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2745 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2746 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2747 } 2748 if (j == nz - 1) { 2749 tmp0 = tmps[vi[j]]; 2750 sum1 -= v1[j] * tmp0; 2751 sum2 -= v2[j + 1] * tmp0; 2752 sum3 -= v3[j + 2] * tmp0; 2753 sum4 -= v4[j + 3] * tmp0; 2754 sum5 -= v5[j + 4] * tmp0; 2755 } 2756 2757 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2758 row--; 2759 sum2 -= v2[0] * tmp0; 2760 sum3 -= v3[1] * tmp0; 2761 sum4 -= v4[2] * tmp0; 2762 sum5 -= v5[3] * tmp0; 2763 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2764 row--; 2765 sum3 -= v3[0] * tmp0; 2766 sum4 -= v4[1] * tmp0; 2767 sum5 -= v5[2] * tmp0; 2768 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2769 row--; 2770 sum4 -= v4[0] * tmp0; 2771 sum5 -= v5[1] * tmp0; 2772 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2773 row--; 2774 sum5 -= v5[0] * tmp0; 2775 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2776 row--; 2777 break; 2778 default: 2779 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2780 } 2781 } 2782 PetscCall(ISRestoreIndices(isrow, &rout)); 2783 PetscCall(ISRestoreIndices(iscol, &cout)); 2784 PetscCall(VecRestoreArrayRead(bb, &b)); 2785 PetscCall(VecRestoreArrayWrite(xx, &x)); 2786 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2787 PetscFunctionReturn(PETSC_SUCCESS); 2788 } 2789 2790 /* 2791 Makes a longer coloring[] array and calls the usual code with that 2792 */ 2793 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2794 { 2795 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2796 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row; 2797 PetscInt *colorused, i; 2798 ISColoringValue *newcolor; 2799 2800 PetscFunctionBegin; 2801 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2802 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2803 /* loop over inodes, marking a color for each column*/ 2804 row = 0; 2805 for (i = 0; i < m; i++) { 2806 for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors; 2807 } 2808 2809 /* eliminate unneeded colors */ 2810 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2811 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2812 2813 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2814 ncolors = colorused[5 * ncolors - 1]; 2815 for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1; 2816 PetscCall(PetscFree(colorused)); 2817 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2818 PetscCall(PetscFree(coloring)); 2819 PetscFunctionReturn(PETSC_SUCCESS); 2820 } 2821 2822 #include <petsc/private/kernels/blockinvert.h> 2823 2824 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2825 { 2826 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2827 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2828 MatScalar *ibdiag, *bdiag, work[25], *t; 2829 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2830 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2831 const PetscScalar *xb, *b; 2832 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2833 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2; 2834 PetscInt sz, k, ipvt[5]; 2835 PetscBool allowzeropivot, zeropivotdetected; 2836 const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i; 2837 2838 PetscFunctionBegin; 2839 allowzeropivot = PetscNot(A->erroriffailure); 2840 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2841 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2842 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2843 2844 if (!a->inode.ibdiagvalid) { 2845 if (!a->inode.ibdiag) { 2846 /* calculate space needed for diagonal blocks */ 2847 for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i]; 2848 a->inode.bdiagsize = cnt; 2849 2850 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2851 } 2852 2853 /* copy over the diagonal blocks and invert them */ 2854 ibdiag = a->inode.ibdiag; 2855 bdiag = a->inode.bdiag; 2856 cnt = 0; 2857 for (i = 0, row = 0; i < m; i++) { 2858 for (j = 0; j < sizes[i]; j++) { 2859 for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k]; 2860 } 2861 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i])); 2862 2863 switch (sizes[i]) { 2864 case 1: 2865 /* Create matrix data structure */ 2866 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2867 if (allowzeropivot) { 2868 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2869 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2870 A->factorerror_zeropivot_row = row; 2871 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2872 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2873 } 2874 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2875 break; 2876 case 2: 2877 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2878 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2879 break; 2880 case 3: 2881 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2882 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2883 break; 2884 case 4: 2885 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2886 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2887 break; 2888 case 5: 2889 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2890 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2891 break; 2892 default: 2893 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 2894 } 2895 cnt += sizes[i] * sizes[i]; 2896 row += sizes[i]; 2897 } 2898 a->inode.ibdiagvalid = PETSC_TRUE; 2899 } 2900 ibdiag = a->inode.ibdiag; 2901 bdiag = a->inode.bdiag; 2902 t = a->inode.ssor_work; 2903 2904 PetscCall(VecGetArray(xx, &x)); 2905 PetscCall(VecGetArrayRead(bb, &b)); 2906 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2907 if (flag & SOR_ZERO_INITIAL_GUESS) { 2908 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2909 for (i = 0, row = 0; i < m; i++) { 2910 sz = diag[row] - ii[row]; 2911 v1 = a->a + ii[row]; 2912 idx = a->j + ii[row]; 2913 2914 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2915 switch (sizes[i]) { 2916 case 1: 2917 2918 sum1 = b[row]; 2919 for (n = 0; n < sz - 1; n += 2) { 2920 i1 = idx[0]; 2921 i2 = idx[1]; 2922 idx += 2; 2923 tmp0 = x[i1]; 2924 tmp1 = x[i2]; 2925 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2926 v1 += 2; 2927 } 2928 2929 if (n == sz - 1) { 2930 tmp0 = x[*idx]; 2931 sum1 -= *v1 * tmp0; 2932 } 2933 t[row] = sum1; 2934 x[row++] = sum1 * (*ibdiag++); 2935 break; 2936 case 2: 2937 v2 = a->a + ii[row + 1]; 2938 sum1 = b[row]; 2939 sum2 = b[row + 1]; 2940 for (n = 0; n < sz - 1; n += 2) { 2941 i1 = idx[0]; 2942 i2 = idx[1]; 2943 idx += 2; 2944 tmp0 = x[i1]; 2945 tmp1 = x[i2]; 2946 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2947 v1 += 2; 2948 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2949 v2 += 2; 2950 } 2951 2952 if (n == sz - 1) { 2953 tmp0 = x[*idx]; 2954 sum1 -= v1[0] * tmp0; 2955 sum2 -= v2[0] * tmp0; 2956 } 2957 t[row] = sum1; 2958 t[row + 1] = sum2; 2959 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2960 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2961 ibdiag += 4; 2962 break; 2963 case 3: 2964 v2 = a->a + ii[row + 1]; 2965 v3 = a->a + ii[row + 2]; 2966 sum1 = b[row]; 2967 sum2 = b[row + 1]; 2968 sum3 = b[row + 2]; 2969 for (n = 0; n < sz - 1; n += 2) { 2970 i1 = idx[0]; 2971 i2 = idx[1]; 2972 idx += 2; 2973 tmp0 = x[i1]; 2974 tmp1 = x[i2]; 2975 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2976 v1 += 2; 2977 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2978 v2 += 2; 2979 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2980 v3 += 2; 2981 } 2982 2983 if (n == sz - 1) { 2984 tmp0 = x[*idx]; 2985 sum1 -= v1[0] * tmp0; 2986 sum2 -= v2[0] * tmp0; 2987 sum3 -= v3[0] * tmp0; 2988 } 2989 t[row] = sum1; 2990 t[row + 1] = sum2; 2991 t[row + 2] = sum3; 2992 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 2993 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 2994 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 2995 ibdiag += 9; 2996 break; 2997 case 4: 2998 v2 = a->a + ii[row + 1]; 2999 v3 = a->a + ii[row + 2]; 3000 v4 = a->a + ii[row + 3]; 3001 sum1 = b[row]; 3002 sum2 = b[row + 1]; 3003 sum3 = b[row + 2]; 3004 sum4 = b[row + 3]; 3005 for (n = 0; n < sz - 1; n += 2) { 3006 i1 = idx[0]; 3007 i2 = idx[1]; 3008 idx += 2; 3009 tmp0 = x[i1]; 3010 tmp1 = x[i2]; 3011 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3012 v1 += 2; 3013 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3014 v2 += 2; 3015 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3016 v3 += 2; 3017 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3018 v4 += 2; 3019 } 3020 3021 if (n == sz - 1) { 3022 tmp0 = x[*idx]; 3023 sum1 -= v1[0] * tmp0; 3024 sum2 -= v2[0] * tmp0; 3025 sum3 -= v3[0] * tmp0; 3026 sum4 -= v4[0] * tmp0; 3027 } 3028 t[row] = sum1; 3029 t[row + 1] = sum2; 3030 t[row + 2] = sum3; 3031 t[row + 3] = sum4; 3032 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3033 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3034 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3035 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3036 ibdiag += 16; 3037 break; 3038 case 5: 3039 v2 = a->a + ii[row + 1]; 3040 v3 = a->a + ii[row + 2]; 3041 v4 = a->a + ii[row + 3]; 3042 v5 = a->a + ii[row + 4]; 3043 sum1 = b[row]; 3044 sum2 = b[row + 1]; 3045 sum3 = b[row + 2]; 3046 sum4 = b[row + 3]; 3047 sum5 = b[row + 4]; 3048 for (n = 0; n < sz - 1; n += 2) { 3049 i1 = idx[0]; 3050 i2 = idx[1]; 3051 idx += 2; 3052 tmp0 = x[i1]; 3053 tmp1 = x[i2]; 3054 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3055 v1 += 2; 3056 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3057 v2 += 2; 3058 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3059 v3 += 2; 3060 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3061 v4 += 2; 3062 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3063 v5 += 2; 3064 } 3065 3066 if (n == sz - 1) { 3067 tmp0 = x[*idx]; 3068 sum1 -= v1[0] * tmp0; 3069 sum2 -= v2[0] * tmp0; 3070 sum3 -= v3[0] * tmp0; 3071 sum4 -= v4[0] * tmp0; 3072 sum5 -= v5[0] * tmp0; 3073 } 3074 t[row] = sum1; 3075 t[row + 1] = sum2; 3076 t[row + 2] = sum3; 3077 t[row + 3] = sum4; 3078 t[row + 4] = sum5; 3079 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3080 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3081 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3082 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3083 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3084 ibdiag += 25; 3085 break; 3086 default: 3087 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3088 } 3089 } 3090 3091 xb = t; 3092 PetscCall(PetscLogFlops(a->nz)); 3093 } else xb = b; 3094 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3095 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3096 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3097 ibdiag -= sizes[i] * sizes[i]; 3098 sz = ii[row + 1] - diag[row] - 1; 3099 v1 = a->a + diag[row] + 1; 3100 idx = a->j + diag[row] + 1; 3101 3102 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3103 switch (sizes[i]) { 3104 case 1: 3105 3106 sum1 = xb[row]; 3107 for (n = 0; n < sz - 1; n += 2) { 3108 i1 = idx[0]; 3109 i2 = idx[1]; 3110 idx += 2; 3111 tmp0 = x[i1]; 3112 tmp1 = x[i2]; 3113 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3114 v1 += 2; 3115 } 3116 3117 if (n == sz - 1) { 3118 tmp0 = x[*idx]; 3119 sum1 -= *v1 * tmp0; 3120 } 3121 x[row--] = sum1 * (*ibdiag); 3122 break; 3123 3124 case 2: 3125 3126 sum1 = xb[row]; 3127 sum2 = xb[row - 1]; 3128 /* note that sum1 is associated with the second of the two rows */ 3129 v2 = a->a + diag[row - 1] + 2; 3130 for (n = 0; n < sz - 1; n += 2) { 3131 i1 = idx[0]; 3132 i2 = idx[1]; 3133 idx += 2; 3134 tmp0 = x[i1]; 3135 tmp1 = x[i2]; 3136 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3137 v1 += 2; 3138 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3139 v2 += 2; 3140 } 3141 3142 if (n == sz - 1) { 3143 tmp0 = x[*idx]; 3144 sum1 -= *v1 * tmp0; 3145 sum2 -= *v2 * tmp0; 3146 } 3147 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3148 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3149 break; 3150 case 3: 3151 3152 sum1 = xb[row]; 3153 sum2 = xb[row - 1]; 3154 sum3 = xb[row - 2]; 3155 v2 = a->a + diag[row - 1] + 2; 3156 v3 = a->a + diag[row - 2] + 3; 3157 for (n = 0; n < sz - 1; n += 2) { 3158 i1 = idx[0]; 3159 i2 = idx[1]; 3160 idx += 2; 3161 tmp0 = x[i1]; 3162 tmp1 = x[i2]; 3163 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3164 v1 += 2; 3165 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3166 v2 += 2; 3167 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3168 v3 += 2; 3169 } 3170 3171 if (n == sz - 1) { 3172 tmp0 = x[*idx]; 3173 sum1 -= *v1 * tmp0; 3174 sum2 -= *v2 * tmp0; 3175 sum3 -= *v3 * tmp0; 3176 } 3177 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3178 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3179 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3180 break; 3181 case 4: 3182 3183 sum1 = xb[row]; 3184 sum2 = xb[row - 1]; 3185 sum3 = xb[row - 2]; 3186 sum4 = xb[row - 3]; 3187 v2 = a->a + diag[row - 1] + 2; 3188 v3 = a->a + diag[row - 2] + 3; 3189 v4 = a->a + diag[row - 3] + 4; 3190 for (n = 0; n < sz - 1; n += 2) { 3191 i1 = idx[0]; 3192 i2 = idx[1]; 3193 idx += 2; 3194 tmp0 = x[i1]; 3195 tmp1 = x[i2]; 3196 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3197 v1 += 2; 3198 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3199 v2 += 2; 3200 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3201 v3 += 2; 3202 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3203 v4 += 2; 3204 } 3205 3206 if (n == sz - 1) { 3207 tmp0 = x[*idx]; 3208 sum1 -= *v1 * tmp0; 3209 sum2 -= *v2 * tmp0; 3210 sum3 -= *v3 * tmp0; 3211 sum4 -= *v4 * tmp0; 3212 } 3213 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3214 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3215 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3216 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3217 break; 3218 case 5: 3219 3220 sum1 = xb[row]; 3221 sum2 = xb[row - 1]; 3222 sum3 = xb[row - 2]; 3223 sum4 = xb[row - 3]; 3224 sum5 = xb[row - 4]; 3225 v2 = a->a + diag[row - 1] + 2; 3226 v3 = a->a + diag[row - 2] + 3; 3227 v4 = a->a + diag[row - 3] + 4; 3228 v5 = a->a + diag[row - 4] + 5; 3229 for (n = 0; n < sz - 1; n += 2) { 3230 i1 = idx[0]; 3231 i2 = idx[1]; 3232 idx += 2; 3233 tmp0 = x[i1]; 3234 tmp1 = x[i2]; 3235 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3236 v1 += 2; 3237 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3238 v2 += 2; 3239 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3240 v3 += 2; 3241 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3242 v4 += 2; 3243 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3244 v5 += 2; 3245 } 3246 3247 if (n == sz - 1) { 3248 tmp0 = x[*idx]; 3249 sum1 -= *v1 * tmp0; 3250 sum2 -= *v2 * tmp0; 3251 sum3 -= *v3 * tmp0; 3252 sum4 -= *v4 * tmp0; 3253 sum5 -= *v5 * tmp0; 3254 } 3255 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3256 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3257 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3258 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3259 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3260 break; 3261 default: 3262 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3263 } 3264 } 3265 3266 PetscCall(PetscLogFlops(a->nz)); 3267 } 3268 its--; 3269 } 3270 while (its--) { 3271 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3272 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) { 3273 sz = diag[row] - ii[row]; 3274 v1 = a->a + ii[row]; 3275 idx = a->j + ii[row]; 3276 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3277 switch (sizes[i]) { 3278 case 1: 3279 sum1 = b[row]; 3280 for (n = 0; n < sz - 1; n += 2) { 3281 i1 = idx[0]; 3282 i2 = idx[1]; 3283 idx += 2; 3284 tmp0 = x[i1]; 3285 tmp1 = x[i2]; 3286 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3287 v1 += 2; 3288 } 3289 if (n == sz - 1) { 3290 tmp0 = x[*idx++]; 3291 sum1 -= *v1 * tmp0; 3292 v1++; 3293 } 3294 t[row] = sum1; 3295 sz = ii[row + 1] - diag[row] - 1; 3296 idx = a->j + diag[row] + 1; 3297 v1 += 1; 3298 for (n = 0; n < sz - 1; n += 2) { 3299 i1 = idx[0]; 3300 i2 = idx[1]; 3301 idx += 2; 3302 tmp0 = x[i1]; 3303 tmp1 = x[i2]; 3304 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3305 v1 += 2; 3306 } 3307 if (n == sz - 1) { 3308 tmp0 = x[*idx++]; 3309 sum1 -= *v1 * tmp0; 3310 } 3311 /* in MatSOR_SeqAIJ this line would be 3312 * 3313 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3314 * 3315 * but omega == 1, so this becomes 3316 * 3317 * x[row] = sum1*(*ibdiag++); 3318 * 3319 */ 3320 x[row] = sum1 * (*ibdiag); 3321 break; 3322 case 2: 3323 v2 = a->a + ii[row + 1]; 3324 sum1 = b[row]; 3325 sum2 = b[row + 1]; 3326 for (n = 0; n < sz - 1; n += 2) { 3327 i1 = idx[0]; 3328 i2 = idx[1]; 3329 idx += 2; 3330 tmp0 = x[i1]; 3331 tmp1 = x[i2]; 3332 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3333 v1 += 2; 3334 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3335 v2 += 2; 3336 } 3337 if (n == sz - 1) { 3338 tmp0 = x[*idx++]; 3339 sum1 -= v1[0] * tmp0; 3340 sum2 -= v2[0] * tmp0; 3341 v1++; 3342 v2++; 3343 } 3344 t[row] = sum1; 3345 t[row + 1] = sum2; 3346 sz = ii[row + 1] - diag[row] - 2; 3347 idx = a->j + diag[row] + 2; 3348 v1 += 2; 3349 v2 += 2; 3350 for (n = 0; n < sz - 1; n += 2) { 3351 i1 = idx[0]; 3352 i2 = idx[1]; 3353 idx += 2; 3354 tmp0 = x[i1]; 3355 tmp1 = x[i2]; 3356 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3357 v1 += 2; 3358 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3359 v2 += 2; 3360 } 3361 if (n == sz - 1) { 3362 tmp0 = x[*idx]; 3363 sum1 -= v1[0] * tmp0; 3364 sum2 -= v2[0] * tmp0; 3365 } 3366 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3367 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3368 break; 3369 case 3: 3370 v2 = a->a + ii[row + 1]; 3371 v3 = a->a + ii[row + 2]; 3372 sum1 = b[row]; 3373 sum2 = b[row + 1]; 3374 sum3 = b[row + 2]; 3375 for (n = 0; n < sz - 1; n += 2) { 3376 i1 = idx[0]; 3377 i2 = idx[1]; 3378 idx += 2; 3379 tmp0 = x[i1]; 3380 tmp1 = x[i2]; 3381 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3382 v1 += 2; 3383 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3384 v2 += 2; 3385 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3386 v3 += 2; 3387 } 3388 if (n == sz - 1) { 3389 tmp0 = x[*idx++]; 3390 sum1 -= v1[0] * tmp0; 3391 sum2 -= v2[0] * tmp0; 3392 sum3 -= v3[0] * tmp0; 3393 v1++; 3394 v2++; 3395 v3++; 3396 } 3397 t[row] = sum1; 3398 t[row + 1] = sum2; 3399 t[row + 2] = sum3; 3400 sz = ii[row + 1] - diag[row] - 3; 3401 idx = a->j + diag[row] + 3; 3402 v1 += 3; 3403 v2 += 3; 3404 v3 += 3; 3405 for (n = 0; n < sz - 1; n += 2) { 3406 i1 = idx[0]; 3407 i2 = idx[1]; 3408 idx += 2; 3409 tmp0 = x[i1]; 3410 tmp1 = x[i2]; 3411 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3412 v1 += 2; 3413 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3414 v2 += 2; 3415 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3416 v3 += 2; 3417 } 3418 if (n == sz - 1) { 3419 tmp0 = x[*idx]; 3420 sum1 -= v1[0] * tmp0; 3421 sum2 -= v2[0] * tmp0; 3422 sum3 -= v3[0] * tmp0; 3423 } 3424 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3425 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3426 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3427 break; 3428 case 4: 3429 v2 = a->a + ii[row + 1]; 3430 v3 = a->a + ii[row + 2]; 3431 v4 = a->a + ii[row + 3]; 3432 sum1 = b[row]; 3433 sum2 = b[row + 1]; 3434 sum3 = b[row + 2]; 3435 sum4 = b[row + 3]; 3436 for (n = 0; n < sz - 1; n += 2) { 3437 i1 = idx[0]; 3438 i2 = idx[1]; 3439 idx += 2; 3440 tmp0 = x[i1]; 3441 tmp1 = x[i2]; 3442 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3443 v1 += 2; 3444 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3445 v2 += 2; 3446 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3447 v3 += 2; 3448 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3449 v4 += 2; 3450 } 3451 if (n == sz - 1) { 3452 tmp0 = x[*idx++]; 3453 sum1 -= v1[0] * tmp0; 3454 sum2 -= v2[0] * tmp0; 3455 sum3 -= v3[0] * tmp0; 3456 sum4 -= v4[0] * tmp0; 3457 v1++; 3458 v2++; 3459 v3++; 3460 v4++; 3461 } 3462 t[row] = sum1; 3463 t[row + 1] = sum2; 3464 t[row + 2] = sum3; 3465 t[row + 3] = sum4; 3466 sz = ii[row + 1] - diag[row] - 4; 3467 idx = a->j + diag[row] + 4; 3468 v1 += 4; 3469 v2 += 4; 3470 v3 += 4; 3471 v4 += 4; 3472 for (n = 0; n < sz - 1; n += 2) { 3473 i1 = idx[0]; 3474 i2 = idx[1]; 3475 idx += 2; 3476 tmp0 = x[i1]; 3477 tmp1 = x[i2]; 3478 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3479 v1 += 2; 3480 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3481 v2 += 2; 3482 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3483 v3 += 2; 3484 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3485 v4 += 2; 3486 } 3487 if (n == sz - 1) { 3488 tmp0 = x[*idx]; 3489 sum1 -= v1[0] * tmp0; 3490 sum2 -= v2[0] * tmp0; 3491 sum3 -= v3[0] * tmp0; 3492 sum4 -= v4[0] * tmp0; 3493 } 3494 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3495 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3496 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3497 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3498 break; 3499 case 5: 3500 v2 = a->a + ii[row + 1]; 3501 v3 = a->a + ii[row + 2]; 3502 v4 = a->a + ii[row + 3]; 3503 v5 = a->a + ii[row + 4]; 3504 sum1 = b[row]; 3505 sum2 = b[row + 1]; 3506 sum3 = b[row + 2]; 3507 sum4 = b[row + 3]; 3508 sum5 = b[row + 4]; 3509 for (n = 0; n < sz - 1; n += 2) { 3510 i1 = idx[0]; 3511 i2 = idx[1]; 3512 idx += 2; 3513 tmp0 = x[i1]; 3514 tmp1 = x[i2]; 3515 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3516 v1 += 2; 3517 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3518 v2 += 2; 3519 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3520 v3 += 2; 3521 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3522 v4 += 2; 3523 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3524 v5 += 2; 3525 } 3526 if (n == sz - 1) { 3527 tmp0 = x[*idx++]; 3528 sum1 -= v1[0] * tmp0; 3529 sum2 -= v2[0] * tmp0; 3530 sum3 -= v3[0] * tmp0; 3531 sum4 -= v4[0] * tmp0; 3532 sum5 -= v5[0] * tmp0; 3533 v1++; 3534 v2++; 3535 v3++; 3536 v4++; 3537 v5++; 3538 } 3539 t[row] = sum1; 3540 t[row + 1] = sum2; 3541 t[row + 2] = sum3; 3542 t[row + 3] = sum4; 3543 t[row + 4] = sum5; 3544 sz = ii[row + 1] - diag[row] - 5; 3545 idx = a->j + diag[row] + 5; 3546 v1 += 5; 3547 v2 += 5; 3548 v3 += 5; 3549 v4 += 5; 3550 v5 += 5; 3551 for (n = 0; n < sz - 1; n += 2) { 3552 i1 = idx[0]; 3553 i2 = idx[1]; 3554 idx += 2; 3555 tmp0 = x[i1]; 3556 tmp1 = x[i2]; 3557 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3558 v1 += 2; 3559 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3560 v2 += 2; 3561 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3562 v3 += 2; 3563 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3564 v4 += 2; 3565 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3566 v5 += 2; 3567 } 3568 if (n == sz - 1) { 3569 tmp0 = x[*idx]; 3570 sum1 -= v1[0] * tmp0; 3571 sum2 -= v2[0] * tmp0; 3572 sum3 -= v3[0] * tmp0; 3573 sum4 -= v4[0] * tmp0; 3574 sum5 -= v5[0] * tmp0; 3575 } 3576 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3577 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3578 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3579 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3580 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3581 break; 3582 default: 3583 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3584 } 3585 } 3586 xb = t; 3587 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3588 } else xb = b; 3589 3590 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3591 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3592 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3593 ibdiag -= sizes[i] * sizes[i]; 3594 3595 /* set RHS */ 3596 if (xb == b) { 3597 /* whole (old way) */ 3598 sz = ii[row + 1] - ii[row]; 3599 idx = a->j + ii[row]; 3600 switch (sizes[i]) { 3601 case 5: 3602 v5 = a->a + ii[row - 4]; /* fall through */ 3603 case 4: 3604 v4 = a->a + ii[row - 3]; /* fall through */ 3605 case 3: 3606 v3 = a->a + ii[row - 2]; /* fall through */ 3607 case 2: 3608 v2 = a->a + ii[row - 1]; /* fall through */ 3609 case 1: 3610 v1 = a->a + ii[row]; 3611 break; 3612 default: 3613 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3614 } 3615 } else { 3616 /* upper, no diag */ 3617 sz = ii[row + 1] - diag[row] - 1; 3618 idx = a->j + diag[row] + 1; 3619 switch (sizes[i]) { 3620 case 5: 3621 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3622 case 4: 3623 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3624 case 3: 3625 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3626 case 2: 3627 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3628 case 1: 3629 v1 = a->a + diag[row] + 1; 3630 } 3631 } 3632 /* set sum */ 3633 switch (sizes[i]) { 3634 case 5: 3635 sum5 = xb[row - 4]; /* fall through */ 3636 case 4: 3637 sum4 = xb[row - 3]; /* fall through */ 3638 case 3: 3639 sum3 = xb[row - 2]; /* fall through */ 3640 case 2: 3641 sum2 = xb[row - 1]; /* fall through */ 3642 case 1: 3643 /* note that sum1 is associated with the last row */ 3644 sum1 = xb[row]; 3645 } 3646 /* do sums */ 3647 for (n = 0; n < sz - 1; n += 2) { 3648 i1 = idx[0]; 3649 i2 = idx[1]; 3650 idx += 2; 3651 tmp0 = x[i1]; 3652 tmp1 = x[i2]; 3653 switch (sizes[i]) { 3654 case 5: 3655 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3656 v5 += 2; /* fall through */ 3657 case 4: 3658 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3659 v4 += 2; /* fall through */ 3660 case 3: 3661 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3662 v3 += 2; /* fall through */ 3663 case 2: 3664 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3665 v2 += 2; /* fall through */ 3666 case 1: 3667 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3668 v1 += 2; 3669 } 3670 } 3671 /* ragged edge */ 3672 if (n == sz - 1) { 3673 tmp0 = x[*idx]; 3674 switch (sizes[i]) { 3675 case 5: 3676 sum5 -= *v5 * tmp0; /* fall through */ 3677 case 4: 3678 sum4 -= *v4 * tmp0; /* fall through */ 3679 case 3: 3680 sum3 -= *v3 * tmp0; /* fall through */ 3681 case 2: 3682 sum2 -= *v2 * tmp0; /* fall through */ 3683 case 1: 3684 sum1 -= *v1 * tmp0; 3685 } 3686 } 3687 /* update */ 3688 if (xb == b) { 3689 /* whole (old way) w/ diag */ 3690 switch (sizes[i]) { 3691 case 5: 3692 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3693 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3694 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3695 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3696 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3697 break; 3698 case 4: 3699 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3700 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3701 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3702 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3703 break; 3704 case 3: 3705 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3706 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3707 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3708 break; 3709 case 2: 3710 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3711 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3712 break; 3713 case 1: 3714 x[row--] += sum1 * (*ibdiag); 3715 break; 3716 } 3717 } else { 3718 /* no diag so set = */ 3719 switch (sizes[i]) { 3720 case 5: 3721 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3722 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3723 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3724 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3725 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3726 break; 3727 case 4: 3728 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3729 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3730 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3731 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3732 break; 3733 case 3: 3734 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3735 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3736 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3737 break; 3738 case 2: 3739 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3740 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3741 break; 3742 case 1: 3743 x[row--] = sum1 * (*ibdiag); 3744 break; 3745 } 3746 } 3747 } 3748 if (xb == b) { 3749 PetscCall(PetscLogFlops(2.0 * a->nz)); 3750 } else { 3751 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3752 } 3753 } 3754 } 3755 if (flag & SOR_EISENSTAT) { 3756 /* 3757 Apply (U + D)^-1 where D is now the block diagonal 3758 */ 3759 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3760 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3761 ibdiag -= sizes[i] * sizes[i]; 3762 sz = ii[row + 1] - diag[row] - 1; 3763 v1 = a->a + diag[row] + 1; 3764 idx = a->j + diag[row] + 1; 3765 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3766 switch (sizes[i]) { 3767 case 1: 3768 3769 sum1 = b[row]; 3770 for (n = 0; n < sz - 1; n += 2) { 3771 i1 = idx[0]; 3772 i2 = idx[1]; 3773 idx += 2; 3774 tmp0 = x[i1]; 3775 tmp1 = x[i2]; 3776 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3777 v1 += 2; 3778 } 3779 3780 if (n == sz - 1) { 3781 tmp0 = x[*idx]; 3782 sum1 -= *v1 * tmp0; 3783 } 3784 x[row] = sum1 * (*ibdiag); 3785 row--; 3786 break; 3787 3788 case 2: 3789 3790 sum1 = b[row]; 3791 sum2 = b[row - 1]; 3792 /* note that sum1 is associated with the second of the two rows */ 3793 v2 = a->a + diag[row - 1] + 2; 3794 for (n = 0; n < sz - 1; n += 2) { 3795 i1 = idx[0]; 3796 i2 = idx[1]; 3797 idx += 2; 3798 tmp0 = x[i1]; 3799 tmp1 = x[i2]; 3800 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3801 v1 += 2; 3802 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3803 v2 += 2; 3804 } 3805 3806 if (n == sz - 1) { 3807 tmp0 = x[*idx]; 3808 sum1 -= *v1 * tmp0; 3809 sum2 -= *v2 * tmp0; 3810 } 3811 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3812 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3813 row -= 2; 3814 break; 3815 case 3: 3816 3817 sum1 = b[row]; 3818 sum2 = b[row - 1]; 3819 sum3 = b[row - 2]; 3820 v2 = a->a + diag[row - 1] + 2; 3821 v3 = a->a + diag[row - 2] + 3; 3822 for (n = 0; n < sz - 1; n += 2) { 3823 i1 = idx[0]; 3824 i2 = idx[1]; 3825 idx += 2; 3826 tmp0 = x[i1]; 3827 tmp1 = x[i2]; 3828 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3829 v1 += 2; 3830 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3831 v2 += 2; 3832 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3833 v3 += 2; 3834 } 3835 3836 if (n == sz - 1) { 3837 tmp0 = x[*idx]; 3838 sum1 -= *v1 * tmp0; 3839 sum2 -= *v2 * tmp0; 3840 sum3 -= *v3 * tmp0; 3841 } 3842 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3843 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3844 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3845 row -= 3; 3846 break; 3847 case 4: 3848 3849 sum1 = b[row]; 3850 sum2 = b[row - 1]; 3851 sum3 = b[row - 2]; 3852 sum4 = b[row - 3]; 3853 v2 = a->a + diag[row - 1] + 2; 3854 v3 = a->a + diag[row - 2] + 3; 3855 v4 = a->a + diag[row - 3] + 4; 3856 for (n = 0; n < sz - 1; n += 2) { 3857 i1 = idx[0]; 3858 i2 = idx[1]; 3859 idx += 2; 3860 tmp0 = x[i1]; 3861 tmp1 = x[i2]; 3862 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3863 v1 += 2; 3864 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3865 v2 += 2; 3866 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3867 v3 += 2; 3868 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3869 v4 += 2; 3870 } 3871 3872 if (n == sz - 1) { 3873 tmp0 = x[*idx]; 3874 sum1 -= *v1 * tmp0; 3875 sum2 -= *v2 * tmp0; 3876 sum3 -= *v3 * tmp0; 3877 sum4 -= *v4 * tmp0; 3878 } 3879 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3880 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3881 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3882 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3883 row -= 4; 3884 break; 3885 case 5: 3886 3887 sum1 = b[row]; 3888 sum2 = b[row - 1]; 3889 sum3 = b[row - 2]; 3890 sum4 = b[row - 3]; 3891 sum5 = b[row - 4]; 3892 v2 = a->a + diag[row - 1] + 2; 3893 v3 = a->a + diag[row - 2] + 3; 3894 v4 = a->a + diag[row - 3] + 4; 3895 v5 = a->a + diag[row - 4] + 5; 3896 for (n = 0; n < sz - 1; n += 2) { 3897 i1 = idx[0]; 3898 i2 = idx[1]; 3899 idx += 2; 3900 tmp0 = x[i1]; 3901 tmp1 = x[i2]; 3902 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3903 v1 += 2; 3904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3905 v2 += 2; 3906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3907 v3 += 2; 3908 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3909 v4 += 2; 3910 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3911 v5 += 2; 3912 } 3913 3914 if (n == sz - 1) { 3915 tmp0 = x[*idx]; 3916 sum1 -= *v1 * tmp0; 3917 sum2 -= *v2 * tmp0; 3918 sum3 -= *v3 * tmp0; 3919 sum4 -= *v4 * tmp0; 3920 sum5 -= *v5 * tmp0; 3921 } 3922 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3923 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3924 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3925 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3926 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3927 row -= 5; 3928 break; 3929 default: 3930 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3931 } 3932 } 3933 PetscCall(PetscLogFlops(a->nz)); 3934 3935 /* 3936 t = b - D x where D is the block diagonal 3937 */ 3938 cnt = 0; 3939 for (i = 0, row = 0; i < m; i++) { 3940 switch (sizes[i]) { 3941 case 1: 3942 t[row] = b[row] - bdiag[cnt++] * x[row]; 3943 row++; 3944 break; 3945 case 2: 3946 x1 = x[row]; 3947 x2 = x[row + 1]; 3948 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3949 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3950 t[row] = b[row] - tmp1; 3951 t[row + 1] = b[row + 1] - tmp2; 3952 row += 2; 3953 cnt += 4; 3954 break; 3955 case 3: 3956 x1 = x[row]; 3957 x2 = x[row + 1]; 3958 x3 = x[row + 2]; 3959 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3960 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3961 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3962 t[row] = b[row] - tmp1; 3963 t[row + 1] = b[row + 1] - tmp2; 3964 t[row + 2] = b[row + 2] - tmp3; 3965 row += 3; 3966 cnt += 9; 3967 break; 3968 case 4: 3969 x1 = x[row]; 3970 x2 = x[row + 1]; 3971 x3 = x[row + 2]; 3972 x4 = x[row + 3]; 3973 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3974 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3975 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3976 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3977 t[row] = b[row] - tmp1; 3978 t[row + 1] = b[row + 1] - tmp2; 3979 t[row + 2] = b[row + 2] - tmp3; 3980 t[row + 3] = b[row + 3] - tmp4; 3981 row += 4; 3982 cnt += 16; 3983 break; 3984 case 5: 3985 x1 = x[row]; 3986 x2 = x[row + 1]; 3987 x3 = x[row + 2]; 3988 x4 = x[row + 3]; 3989 x5 = x[row + 4]; 3990 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 3991 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 3992 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 3993 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 3994 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 3995 t[row] = b[row] - tmp1; 3996 t[row + 1] = b[row + 1] - tmp2; 3997 t[row + 2] = b[row + 2] - tmp3; 3998 t[row + 3] = b[row + 3] - tmp4; 3999 t[row + 4] = b[row + 4] - tmp5; 4000 row += 5; 4001 cnt += 25; 4002 break; 4003 default: 4004 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4005 } 4006 } 4007 PetscCall(PetscLogFlops(m)); 4008 4009 /* 4010 Apply (L + D)^-1 where D is the block diagonal 4011 */ 4012 for (i = 0, row = 0; i < m; i++) { 4013 sz = diag[row] - ii[row]; 4014 v1 = a->a + ii[row]; 4015 idx = a->j + ii[row]; 4016 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4017 switch (sizes[i]) { 4018 case 1: 4019 4020 sum1 = t[row]; 4021 for (n = 0; n < sz - 1; n += 2) { 4022 i1 = idx[0]; 4023 i2 = idx[1]; 4024 idx += 2; 4025 tmp0 = t[i1]; 4026 tmp1 = t[i2]; 4027 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4028 v1 += 2; 4029 } 4030 4031 if (n == sz - 1) { 4032 tmp0 = t[*idx]; 4033 sum1 -= *v1 * tmp0; 4034 } 4035 x[row] += t[row] = sum1 * (*ibdiag++); 4036 row++; 4037 break; 4038 case 2: 4039 v2 = a->a + ii[row + 1]; 4040 sum1 = t[row]; 4041 sum2 = t[row + 1]; 4042 for (n = 0; n < sz - 1; n += 2) { 4043 i1 = idx[0]; 4044 i2 = idx[1]; 4045 idx += 2; 4046 tmp0 = t[i1]; 4047 tmp1 = t[i2]; 4048 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4049 v1 += 2; 4050 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4051 v2 += 2; 4052 } 4053 4054 if (n == sz - 1) { 4055 tmp0 = t[*idx]; 4056 sum1 -= v1[0] * tmp0; 4057 sum2 -= v2[0] * tmp0; 4058 } 4059 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4060 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4061 ibdiag += 4; 4062 row += 2; 4063 break; 4064 case 3: 4065 v2 = a->a + ii[row + 1]; 4066 v3 = a->a + ii[row + 2]; 4067 sum1 = t[row]; 4068 sum2 = t[row + 1]; 4069 sum3 = t[row + 2]; 4070 for (n = 0; n < sz - 1; n += 2) { 4071 i1 = idx[0]; 4072 i2 = idx[1]; 4073 idx += 2; 4074 tmp0 = t[i1]; 4075 tmp1 = t[i2]; 4076 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4077 v1 += 2; 4078 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4079 v2 += 2; 4080 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4081 v3 += 2; 4082 } 4083 4084 if (n == sz - 1) { 4085 tmp0 = t[*idx]; 4086 sum1 -= v1[0] * tmp0; 4087 sum2 -= v2[0] * tmp0; 4088 sum3 -= v3[0] * tmp0; 4089 } 4090 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4091 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4092 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4093 ibdiag += 9; 4094 row += 3; 4095 break; 4096 case 4: 4097 v2 = a->a + ii[row + 1]; 4098 v3 = a->a + ii[row + 2]; 4099 v4 = a->a + ii[row + 3]; 4100 sum1 = t[row]; 4101 sum2 = t[row + 1]; 4102 sum3 = t[row + 2]; 4103 sum4 = t[row + 3]; 4104 for (n = 0; n < sz - 1; n += 2) { 4105 i1 = idx[0]; 4106 i2 = idx[1]; 4107 idx += 2; 4108 tmp0 = t[i1]; 4109 tmp1 = t[i2]; 4110 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4111 v1 += 2; 4112 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4113 v2 += 2; 4114 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4115 v3 += 2; 4116 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4117 v4 += 2; 4118 } 4119 4120 if (n == sz - 1) { 4121 tmp0 = t[*idx]; 4122 sum1 -= v1[0] * tmp0; 4123 sum2 -= v2[0] * tmp0; 4124 sum3 -= v3[0] * tmp0; 4125 sum4 -= v4[0] * tmp0; 4126 } 4127 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4128 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4129 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4130 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4131 ibdiag += 16; 4132 row += 4; 4133 break; 4134 case 5: 4135 v2 = a->a + ii[row + 1]; 4136 v3 = a->a + ii[row + 2]; 4137 v4 = a->a + ii[row + 3]; 4138 v5 = a->a + ii[row + 4]; 4139 sum1 = t[row]; 4140 sum2 = t[row + 1]; 4141 sum3 = t[row + 2]; 4142 sum4 = t[row + 3]; 4143 sum5 = t[row + 4]; 4144 for (n = 0; n < sz - 1; n += 2) { 4145 i1 = idx[0]; 4146 i2 = idx[1]; 4147 idx += 2; 4148 tmp0 = t[i1]; 4149 tmp1 = t[i2]; 4150 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4151 v1 += 2; 4152 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4153 v2 += 2; 4154 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4155 v3 += 2; 4156 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4157 v4 += 2; 4158 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4159 v5 += 2; 4160 } 4161 4162 if (n == sz - 1) { 4163 tmp0 = t[*idx]; 4164 sum1 -= v1[0] * tmp0; 4165 sum2 -= v2[0] * tmp0; 4166 sum3 -= v3[0] * tmp0; 4167 sum4 -= v4[0] * tmp0; 4168 sum5 -= v5[0] * tmp0; 4169 } 4170 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4171 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4172 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4173 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4174 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4175 ibdiag += 25; 4176 row += 5; 4177 break; 4178 default: 4179 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4180 } 4181 } 4182 PetscCall(PetscLogFlops(a->nz)); 4183 } 4184 PetscCall(VecRestoreArray(xx, &x)); 4185 PetscCall(VecRestoreArrayRead(bb, &b)); 4186 PetscFunctionReturn(PETSC_SUCCESS); 4187 } 4188 4189 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4190 { 4191 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4192 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4193 const MatScalar *bdiag = a->inode.bdiag; 4194 const PetscScalar *b; 4195 PetscInt m = a->inode.node_count, cnt = 0, i, row; 4196 const PetscInt *sizes = a->inode.size; 4197 4198 PetscFunctionBegin; 4199 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4200 PetscCall(VecGetArray(xx, &x)); 4201 PetscCall(VecGetArrayRead(bb, &b)); 4202 cnt = 0; 4203 for (i = 0, row = 0; i < m; i++) { 4204 switch (sizes[i]) { 4205 case 1: 4206 x[row] = b[row] * bdiag[cnt++]; 4207 row++; 4208 break; 4209 case 2: 4210 x1 = b[row]; 4211 x2 = b[row + 1]; 4212 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4213 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4214 x[row++] = tmp1; 4215 x[row++] = tmp2; 4216 cnt += 4; 4217 break; 4218 case 3: 4219 x1 = b[row]; 4220 x2 = b[row + 1]; 4221 x3 = b[row + 2]; 4222 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4223 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4224 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4225 x[row++] = tmp1; 4226 x[row++] = tmp2; 4227 x[row++] = tmp3; 4228 cnt += 9; 4229 break; 4230 case 4: 4231 x1 = b[row]; 4232 x2 = b[row + 1]; 4233 x3 = b[row + 2]; 4234 x4 = b[row + 3]; 4235 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4236 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4237 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4238 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4239 x[row++] = tmp1; 4240 x[row++] = tmp2; 4241 x[row++] = tmp3; 4242 x[row++] = tmp4; 4243 cnt += 16; 4244 break; 4245 case 5: 4246 x1 = b[row]; 4247 x2 = b[row + 1]; 4248 x3 = b[row + 2]; 4249 x4 = b[row + 3]; 4250 x5 = b[row + 4]; 4251 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4252 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4253 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4254 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4255 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4256 x[row++] = tmp1; 4257 x[row++] = tmp2; 4258 x[row++] = tmp3; 4259 x[row++] = tmp4; 4260 x[row++] = tmp5; 4261 cnt += 25; 4262 break; 4263 default: 4264 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4265 } 4266 } 4267 PetscCall(PetscLogFlops(2.0 * cnt)); 4268 PetscCall(VecRestoreArray(xx, &x)); 4269 PetscCall(VecRestoreArrayRead(bb, &b)); 4270 PetscFunctionReturn(PETSC_SUCCESS); 4271 } 4272 4273 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4274 { 4275 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4276 4277 PetscFunctionBegin; 4278 a->inode.node_count = 0; 4279 a->inode.use = PETSC_FALSE; 4280 a->inode.checked = PETSC_FALSE; 4281 a->inode.mat_nonzerostate = -1; 4282 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4283 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4284 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4285 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4286 A->ops->coloringpatch = NULL; 4287 A->ops->multdiagonalblock = NULL; 4288 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4289 PetscFunctionReturn(PETSC_SUCCESS); 4290 } 4291 4292 /* 4293 samestructure indicates that the matrix has not changed its nonzero structure so we 4294 do not need to recompute the inodes 4295 */ 4296 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4297 { 4298 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4299 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4300 PetscBool flag; 4301 const PetscInt *idx, *idy, *ii; 4302 4303 PetscFunctionBegin; 4304 if (!a->inode.use) { 4305 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4306 PetscCall(PetscFree(a->inode.size)); 4307 PetscFunctionReturn(PETSC_SUCCESS); 4308 } 4309 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 4310 4311 m = A->rmap->n; 4312 if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size)); 4313 ns = a->inode.size; 4314 4315 i = 0; 4316 node_count = 0; 4317 idx = a->j; 4318 ii = a->i; 4319 if (idx) { 4320 while (i < m) { /* For each row */ 4321 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4322 /* Limits the number of elements in a node to 'a->inode.limit' */ 4323 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4324 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4325 if (nzy != nzx) break; 4326 idy += nzx; /* Same nonzero pattern */ 4327 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4328 if (!flag) break; 4329 } 4330 ns[node_count++] = blk_size; 4331 idx += blk_size * nzx; 4332 i = j; 4333 } 4334 } 4335 /* If not enough inodes found,, do not use inode version of the routines */ 4336 if (!m || !idx || node_count > .8 * m) { 4337 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4338 PetscCall(PetscFree(a->inode.size)); 4339 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4340 } else { 4341 if (!A->factortype) { 4342 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4343 if (A->rmap->n == A->cmap->n) { 4344 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4345 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4346 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4347 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4348 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4349 } 4350 } else { 4351 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4352 } 4353 a->inode.node_count = node_count; 4354 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4355 } 4356 a->inode.checked = PETSC_TRUE; 4357 a->inode.mat_nonzerostate = A->nonzerostate; 4358 PetscFunctionReturn(PETSC_SUCCESS); 4359 } 4360 4361 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4362 { 4363 Mat B = *C; 4364 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4365 PetscInt m = A->rmap->n; 4366 4367 PetscFunctionBegin; 4368 c->inode.use = a->inode.use; 4369 c->inode.limit = a->inode.limit; 4370 c->inode.max_limit = a->inode.max_limit; 4371 c->inode.checked = PETSC_FALSE; 4372 c->inode.size = NULL; 4373 c->inode.node_count = 0; 4374 c->inode.ibdiagvalid = PETSC_FALSE; 4375 c->inode.ibdiag = NULL; 4376 c->inode.bdiag = NULL; 4377 c->inode.mat_nonzerostate = -1; 4378 if (a->inode.use) { 4379 if (a->inode.checked && a->inode.size) { 4380 PetscCall(PetscMalloc1(m + 1, &c->inode.size)); 4381 PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1)); 4382 4383 c->inode.checked = PETSC_TRUE; 4384 c->inode.node_count = a->inode.node_count; 4385 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4386 } 4387 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4388 if (!B->factortype) { 4389 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4390 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4391 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4392 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4393 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4394 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4395 } else { 4396 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4397 } 4398 } 4399 PetscFunctionReturn(PETSC_SUCCESS); 4400 } 4401 4402 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4403 { 4404 PetscInt k; 4405 const PetscInt *vi; 4406 4407 PetscFunctionBegin; 4408 vi = aj + ai[row]; 4409 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4410 vi = aj + adiag[row]; 4411 cols[nzl] = vi[0]; 4412 vi = aj + adiag[row + 1] + 1; 4413 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4414 PetscFunctionReturn(PETSC_SUCCESS); 4415 } 4416 /* 4417 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4418 Modified from MatSeqAIJCheckInode(). 4419 4420 Input Parameters: 4421 . Mat A - ILU or LU matrix factor 4422 4423 */ 4424 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4425 { 4426 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4427 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4428 PetscInt *cols1, *cols2, *ns; 4429 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4430 PetscBool flag; 4431 4432 PetscFunctionBegin; 4433 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4434 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4435 4436 m = A->rmap->n; 4437 if (a->inode.size) ns = a->inode.size; 4438 else PetscCall(PetscMalloc1(m + 1, &ns)); 4439 4440 i = 0; 4441 node_count = 0; 4442 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4443 while (i < m) { /* For each row */ 4444 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4445 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4446 nzx = nzl1 + nzu1 + 1; 4447 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4448 4449 /* Limits the number of elements in a node to 'a->inode.limit' */ 4450 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4451 nzl2 = ai[j + 1] - ai[j]; 4452 nzu2 = adiag[j] - adiag[j + 1] - 1; 4453 nzy = nzl2 + nzu2 + 1; 4454 if (nzy != nzx) break; 4455 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4456 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4457 if (!flag) break; 4458 } 4459 ns[node_count++] = blk_size; 4460 i = j; 4461 } 4462 PetscCall(PetscFree2(cols1, cols2)); 4463 /* If not enough inodes found,, do not use inode version of the routines */ 4464 if (!m || node_count > .8 * m) { 4465 PetscCall(PetscFree(ns)); 4466 4467 a->inode.node_count = 0; 4468 a->inode.size = NULL; 4469 a->inode.use = PETSC_FALSE; 4470 4471 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4472 } else { 4473 A->ops->mult = NULL; 4474 A->ops->sor = NULL; 4475 A->ops->multadd = NULL; 4476 A->ops->getrowij = NULL; 4477 A->ops->restorerowij = NULL; 4478 A->ops->getcolumnij = NULL; 4479 A->ops->restorecolumnij = NULL; 4480 A->ops->coloringpatch = NULL; 4481 A->ops->multdiagonalblock = NULL; 4482 a->inode.node_count = node_count; 4483 a->inode.size = ns; 4484 4485 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4486 } 4487 a->inode.checked = PETSC_TRUE; 4488 PetscFunctionReturn(PETSC_SUCCESS); 4489 } 4490 4491 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4492 { 4493 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4494 4495 PetscFunctionBegin; 4496 a->inode.ibdiagvalid = PETSC_FALSE; 4497 PetscFunctionReturn(PETSC_SUCCESS); 4498 } 4499 4500 /* 4501 This is really ugly. if inodes are used this replaces the 4502 permutations with ones that correspond to rows/cols of the matrix 4503 rather then inode blocks 4504 */ 4505 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4506 { 4507 PetscFunctionBegin; 4508 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4509 PetscFunctionReturn(PETSC_SUCCESS); 4510 } 4511 4512 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4513 { 4514 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4515 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4516 const PetscInt *ridx, *cidx; 4517 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx; 4518 PetscInt nslim_col, *ns_col; 4519 IS ris = *rperm, cis = *cperm; 4520 4521 PetscFunctionBegin; 4522 if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4523 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4524 4525 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4526 PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns)); 4527 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4528 4529 PetscCall(ISGetIndices(ris, &ridx)); 4530 PetscCall(ISGetIndices(cis, &cidx)); 4531 4532 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4533 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i]; 4534 4535 /* Construct the permutations for rows*/ 4536 for (i = 0, row = 0; i < nslim_row; ++i) { 4537 indx = ridx[i]; 4538 start_val = tns[indx]; 4539 end_val = tns[indx + 1]; 4540 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4541 } 4542 4543 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4544 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i]; 4545 4546 /* Construct permutations for columns */ 4547 for (i = 0, col = 0; i < nslim_col; ++i) { 4548 indx = cidx[i]; 4549 start_val = tns[indx]; 4550 end_val = tns[indx + 1]; 4551 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4552 } 4553 4554 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4555 PetscCall(ISSetPermutation(*rperm)); 4556 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4557 PetscCall(ISSetPermutation(*cperm)); 4558 4559 PetscCall(ISRestoreIndices(ris, &ridx)); 4560 PetscCall(ISRestoreIndices(cis, &cidx)); 4561 4562 PetscCall(PetscFree(ns_col)); 4563 PetscCall(PetscFree2(permr, permc)); 4564 PetscCall(ISDestroy(&cis)); 4565 PetscCall(ISDestroy(&ris)); 4566 PetscCall(PetscFree(tns)); 4567 PetscFunctionReturn(PETSC_SUCCESS); 4568 } 4569 4570 /*@C 4571 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4572 4573 Not Collective 4574 4575 Input Parameter: 4576 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4577 4578 Output Parameters: 4579 + node_count - no of inodes present in the matrix. 4580 . sizes - an array of size `node_count`, with the sizes of each inode. 4581 - limit - the max size used to generate the inodes. 4582 4583 Level: advanced 4584 4585 Note: 4586 It should be called after the matrix is assembled. 4587 The contents of the sizes[] array should not be changed. 4588 `NULL` may be passed for information not needed 4589 4590 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 4591 @*/ 4592 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4593 { 4594 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4595 4596 PetscFunctionBegin; 4597 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4598 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4599 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4600 PetscFunctionReturn(PETSC_SUCCESS); 4601 } 4602 4603 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4604 { 4605 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4606 4607 PetscFunctionBegin; 4608 if (node_count) *node_count = a->inode.node_count; 4609 if (sizes) *sizes = a->inode.size; 4610 if (limit) *limit = a->inode.limit; 4611 PetscFunctionReturn(PETSC_SUCCESS); 4612 } 4613