1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 9 { 10 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 11 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 12 13 PetscFunctionBegin; 14 n = A->cmap->n; 15 m = A->rmap->n; 16 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 17 ns_row = a->inode.size; 18 19 min_mn = (m < n) ? m : n; 20 if (!ns) { 21 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) 22 ; 23 for (; count + 1 < n; count++, i++) 24 ; 25 if (count < n) i++; 26 *size = i; 27 PetscFunctionReturn(0); 28 } 29 PetscCall(PetscMalloc1(n + 1, &ns_col)); 30 31 /* Use the same row structure wherever feasible. */ 32 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i]; 33 34 /* if m < n; pad up the remainder with inode_limit */ 35 for (; count + 1 < n; count++, i++) ns_col[i] = 1; 36 /* The last node is the odd ball. padd it up with the remaining rows; */ 37 if (count < n) { 38 ns_col[i] = n - count; 39 i++; 40 } else if (count > n) { 41 /* Adjust for the over estimation */ 42 ns_col[i - 1] += n - count; 43 } 44 *size = i; 45 *ns = ns_col; 46 PetscFunctionReturn(0); 47 } 48 49 /* 50 This builds symmetric version of nonzero structure, 51 */ 52 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 53 { 54 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 55 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 56 PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2; 57 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 58 59 PetscFunctionBegin; 60 nslim_row = a->inode.node_count; 61 m = A->rmap->n; 62 n = A->cmap->n; 63 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 64 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 65 66 /* Use the row_inode as column_inode */ 67 nslim_col = nslim_row; 68 ns_col = ns_row; 69 70 /* allocate space for reformated inode structure */ 71 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 72 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1]; 73 74 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 75 nsz = ns_col[i1]; 76 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 77 } 78 /* allocate space for row pointers */ 79 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 80 *iia = ia; 81 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 82 83 /* determine the number of columns in each row */ 84 ia[0] = oshift; 85 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 86 j = aj + ai[row] + ishift; 87 jmax = aj + ai[row + 1] + ishift; 88 if (j == jmax) continue; /* empty row */ 89 col = *j++ + ishift; 90 i2 = tvc[col]; 91 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 92 ia[i1 + 1]++; 93 ia[i2 + 1]++; 94 i2++; /* Start col of next node */ 95 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 96 i2 = tvc[col]; 97 } 98 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 99 } 100 101 /* shift ia[i] to point to next row */ 102 for (i1 = 1; i1 < nslim_row + 1; i1++) { 103 row = ia[i1 - 1]; 104 ia[i1] += row; 105 work[i1 - 1] = row - oshift; 106 } 107 108 /* allocate space for column pointers */ 109 nz = ia[nslim_row] + (!ishift); 110 PetscCall(PetscMalloc1(nz, &ja)); 111 *jja = ja; 112 113 /* loop over lower triangular part putting into ja */ 114 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 115 j = aj + ai[row] + ishift; 116 jmax = aj + ai[row + 1] + ishift; 117 if (j == jmax) continue; /* empty row */ 118 col = *j++ + ishift; 119 i2 = tvc[col]; 120 while (i2 < i1 && j < jmax) { 121 ja[work[i2]++] = i1 + oshift; 122 ja[work[i1]++] = i2 + oshift; 123 ++i2; 124 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 125 i2 = tvc[col]; 126 } 127 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 128 } 129 PetscCall(PetscFree(work)); 130 PetscCall(PetscFree2(tns, tvc)); 131 PetscFunctionReturn(0); 132 } 133 134 /* 135 This builds nonsymmetric version of nonzero structure, 136 */ 137 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 138 { 139 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 140 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 141 PetscInt *tns, *tvc, nsz, i1, i2; 142 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size; 143 144 PetscFunctionBegin; 145 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 146 nslim_row = a->inode.node_count; 147 n = A->cmap->n; 148 149 /* Create The column_inode for this matrix */ 150 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 151 152 /* allocate space for reformated column_inode structure */ 153 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 154 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 155 156 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 157 nsz = ns_col[i1]; 158 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 159 } 160 /* allocate space for row pointers */ 161 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 162 *iia = ia; 163 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 164 165 /* determine the number of columns in each row */ 166 ia[0] = oshift; 167 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 168 j = aj + ai[row] + ishift; 169 nz = ai[row + 1] - ai[row]; 170 if (!nz) continue; /* empty row */ 171 col = *j++ + ishift; 172 i2 = tvc[col]; 173 while (nz-- > 0) { /* off-diagonal elements */ 174 ia[i1 + 1]++; 175 i2++; /* Start col of next node */ 176 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 177 if (nz > 0) i2 = tvc[col]; 178 } 179 } 180 181 /* shift ia[i] to point to next row */ 182 for (i1 = 1; i1 < nslim_row + 1; i1++) { 183 row = ia[i1 - 1]; 184 ia[i1] += row; 185 work[i1 - 1] = row - oshift; 186 } 187 188 /* allocate space for column pointers */ 189 nz = ia[nslim_row] + (!ishift); 190 PetscCall(PetscMalloc1(nz, &ja)); 191 *jja = ja; 192 193 /* loop over matrix putting into ja */ 194 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 195 j = aj + ai[row] + ishift; 196 nz = ai[row + 1] - ai[row]; 197 if (!nz) continue; /* empty row */ 198 col = *j++ + ishift; 199 i2 = tvc[col]; 200 while (nz-- > 0) { 201 ja[work[i1]++] = i2 + oshift; 202 ++i2; 203 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 204 if (nz > 0) i2 = tvc[col]; 205 } 206 } 207 PetscCall(PetscFree(ns_col)); 208 PetscCall(PetscFree(work)); 209 PetscCall(PetscFree2(tns, tvc)); 210 PetscFunctionReturn(0); 211 } 212 213 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 214 { 215 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 216 217 PetscFunctionBegin; 218 if (n) *n = a->inode.node_count; 219 if (!ia) PetscFunctionReturn(0); 220 if (!blockcompressed) { 221 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 222 } else if (symmetric) { 223 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 224 } else { 225 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 226 } 227 PetscFunctionReturn(0); 228 } 229 230 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 231 { 232 PetscFunctionBegin; 233 if (!ia) PetscFunctionReturn(0); 234 235 if (!blockcompressed) { 236 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 237 } else { 238 PetscCall(PetscFree(*ia)); 239 PetscCall(PetscFree(*ja)); 240 } 241 PetscFunctionReturn(0); 242 } 243 244 /* ----------------------------------------------------------- */ 245 246 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 247 { 248 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 249 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 250 PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j; 251 252 PetscFunctionBegin; 253 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 254 nslim_row = a->inode.node_count; 255 n = A->cmap->n; 256 257 /* Create The column_inode for this matrix */ 258 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 259 260 /* allocate space for reformated column_inode structure */ 261 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 262 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 263 264 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 265 nsz = ns_col[i1]; 266 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 267 } 268 /* allocate space for column pointers */ 269 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 270 *iia = ia; 271 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 272 273 /* determine the number of columns in each row */ 274 ia[0] = oshift; 275 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 276 j = aj + ai[row] + ishift; 277 col = *j++ + ishift; 278 i2 = tvc[col]; 279 nz = ai[row + 1] - ai[row]; 280 while (nz-- > 0) { /* off-diagonal elements */ 281 /* ia[i1+1]++; */ 282 ia[i2 + 1]++; 283 i2++; 284 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 285 if (nz > 0) i2 = tvc[col]; 286 } 287 } 288 289 /* shift ia[i] to point to next col */ 290 for (i1 = 1; i1 < nslim_col + 1; i1++) { 291 col = ia[i1 - 1]; 292 ia[i1] += col; 293 work[i1 - 1] = col - oshift; 294 } 295 296 /* allocate space for column pointers */ 297 nz = ia[nslim_col] + (!ishift); 298 PetscCall(PetscMalloc1(nz, &ja)); 299 *jja = ja; 300 301 /* loop over matrix putting into ja */ 302 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 303 j = aj + ai[row] + ishift; 304 col = *j++ + ishift; 305 i2 = tvc[col]; 306 nz = ai[row + 1] - ai[row]; 307 while (nz-- > 0) { 308 /* ja[work[i1]++] = i2 + oshift; */ 309 ja[work[i2]++] = i1 + oshift; 310 i2++; 311 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 312 if (nz > 0) i2 = tvc[col]; 313 } 314 } 315 PetscCall(PetscFree(ns_col)); 316 PetscCall(PetscFree(work)); 317 PetscCall(PetscFree2(tns, tvc)); 318 PetscFunctionReturn(0); 319 } 320 321 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 322 { 323 PetscFunctionBegin; 324 PetscCall(MatCreateColInode_Private(A, n, NULL)); 325 if (!ia) PetscFunctionReturn(0); 326 327 if (!blockcompressed) { 328 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 329 } else if (symmetric) { 330 /* Since the indices are symmetric it doesn't matter */ 331 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 332 } else { 333 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 334 } 335 PetscFunctionReturn(0); 336 } 337 338 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 339 { 340 PetscFunctionBegin; 341 if (!ia) PetscFunctionReturn(0); 342 if (!blockcompressed) { 343 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 344 } else { 345 PetscCall(PetscFree(*ia)); 346 PetscCall(PetscFree(*ja)); 347 } 348 PetscFunctionReturn(0); 349 } 350 351 /* ----------------------------------------------------------- */ 352 353 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 354 { 355 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 356 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 357 PetscScalar *y; 358 const PetscScalar *x; 359 const MatScalar *v1, *v2, *v3, *v4, *v5; 360 PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 361 const PetscInt *idx, *ns, *ii; 362 363 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 364 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 365 #endif 366 367 PetscFunctionBegin; 368 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 369 node_max = a->inode.node_count; 370 ns = a->inode.size; /* Node Size array */ 371 PetscCall(VecGetArrayRead(xx, &x)); 372 PetscCall(VecGetArray(yy, &y)); 373 idx = a->j; 374 v1 = a->a; 375 ii = a->i; 376 377 for (i = 0, row = 0; i < node_max; ++i) { 378 nsz = ns[i]; 379 n = ii[1] - ii[0]; 380 nonzerorow += (n > 0) * nsz; 381 ii += nsz; 382 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 383 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 384 sz = n; /* No of non zeros in this row */ 385 /* Switch on the size of Node */ 386 switch (nsz) { /* Each loop in 'case' is unrolled */ 387 case 1: 388 sum1 = 0.; 389 390 for (n = 0; n < sz - 1; n += 2) { 391 i1 = idx[0]; /* The instructions are ordered to */ 392 i2 = idx[1]; /* make the compiler's job easy */ 393 idx += 2; 394 tmp0 = x[i1]; 395 tmp1 = x[i2]; 396 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 397 v1 += 2; 398 } 399 400 if (n == sz - 1) { /* Take care of the last nonzero */ 401 tmp0 = x[*idx++]; 402 sum1 += *v1++ * tmp0; 403 } 404 y[row++] = sum1; 405 break; 406 case 2: 407 sum1 = 0.; 408 sum2 = 0.; 409 v2 = v1 + n; 410 411 for (n = 0; n < sz - 1; n += 2) { 412 i1 = idx[0]; 413 i2 = idx[1]; 414 idx += 2; 415 tmp0 = x[i1]; 416 tmp1 = x[i2]; 417 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 418 v1 += 2; 419 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 420 v2 += 2; 421 } 422 if (n == sz - 1) { 423 tmp0 = x[*idx++]; 424 sum1 += *v1++ * tmp0; 425 sum2 += *v2++ * tmp0; 426 } 427 y[row++] = sum1; 428 y[row++] = sum2; 429 v1 = v2; /* Since the next block to be processed starts there*/ 430 idx += sz; 431 break; 432 case 3: 433 sum1 = 0.; 434 sum2 = 0.; 435 sum3 = 0.; 436 v2 = v1 + n; 437 v3 = v2 + n; 438 439 for (n = 0; n < sz - 1; n += 2) { 440 i1 = idx[0]; 441 i2 = idx[1]; 442 idx += 2; 443 tmp0 = x[i1]; 444 tmp1 = x[i2]; 445 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 446 v1 += 2; 447 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 448 v2 += 2; 449 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 450 v3 += 2; 451 } 452 if (n == sz - 1) { 453 tmp0 = x[*idx++]; 454 sum1 += *v1++ * tmp0; 455 sum2 += *v2++ * tmp0; 456 sum3 += *v3++ * tmp0; 457 } 458 y[row++] = sum1; 459 y[row++] = sum2; 460 y[row++] = sum3; 461 v1 = v3; /* Since the next block to be processed starts there*/ 462 idx += 2 * sz; 463 break; 464 case 4: 465 sum1 = 0.; 466 sum2 = 0.; 467 sum3 = 0.; 468 sum4 = 0.; 469 v2 = v1 + n; 470 v3 = v2 + n; 471 v4 = v3 + n; 472 473 for (n = 0; n < sz - 1; n += 2) { 474 i1 = idx[0]; 475 i2 = idx[1]; 476 idx += 2; 477 tmp0 = x[i1]; 478 tmp1 = x[i2]; 479 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 480 v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 482 v2 += 2; 483 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 484 v3 += 2; 485 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 486 v4 += 2; 487 } 488 if (n == sz - 1) { 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 sum4 += *v4++ * tmp0; 494 } 495 y[row++] = sum1; 496 y[row++] = sum2; 497 y[row++] = sum3; 498 y[row++] = sum4; 499 v1 = v4; /* Since the next block to be processed starts there*/ 500 idx += 3 * sz; 501 break; 502 case 5: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 sum5 = 0.; 508 v2 = v1 + n; 509 v3 = v2 + n; 510 v4 = v3 + n; 511 v5 = v4 + n; 512 513 for (n = 0; n < sz - 1; n += 2) { 514 i1 = idx[0]; 515 i2 = idx[1]; 516 idx += 2; 517 tmp0 = x[i1]; 518 tmp1 = x[i2]; 519 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 520 v1 += 2; 521 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 522 v2 += 2; 523 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 524 v3 += 2; 525 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 526 v4 += 2; 527 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 528 v5 += 2; 529 } 530 if (n == sz - 1) { 531 tmp0 = x[*idx++]; 532 sum1 += *v1++ * tmp0; 533 sum2 += *v2++ * tmp0; 534 sum3 += *v3++ * tmp0; 535 sum4 += *v4++ * tmp0; 536 sum5 += *v5++ * tmp0; 537 } 538 y[row++] = sum1; 539 y[row++] = sum2; 540 y[row++] = sum3; 541 y[row++] = sum4; 542 y[row++] = sum5; 543 v1 = v5; /* Since the next block to be processed starts there */ 544 idx += 4 * sz; 545 break; 546 default: 547 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 548 } 549 } 550 PetscCall(VecRestoreArrayRead(xx, &x)); 551 PetscCall(VecRestoreArray(yy, &y)); 552 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 553 PetscFunctionReturn(0); 554 } 555 /* ----------------------------------------------------------- */ 556 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 557 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 558 { 559 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 560 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 561 const MatScalar *v1, *v2, *v3, *v4, *v5; 562 const PetscScalar *x; 563 PetscScalar *y, *z, *zt; 564 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 565 const PetscInt *idx, *ns, *ii; 566 567 PetscFunctionBegin; 568 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 569 node_max = a->inode.node_count; 570 ns = a->inode.size; /* Node Size array */ 571 572 PetscCall(VecGetArrayRead(xx, &x)); 573 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 574 zt = z; 575 576 idx = a->j; 577 v1 = a->a; 578 ii = a->i; 579 580 for (i = 0, row = 0; i < node_max; ++i) { 581 nsz = ns[i]; 582 n = ii[1] - ii[0]; 583 ii += nsz; 584 sz = n; /* No of non zeros in this row */ 585 /* Switch on the size of Node */ 586 switch (nsz) { /* Each loop in 'case' is unrolled */ 587 case 1: 588 sum1 = *zt++; 589 590 for (n = 0; n < sz - 1; n += 2) { 591 i1 = idx[0]; /* The instructions are ordered to */ 592 i2 = idx[1]; /* make the compiler's job easy */ 593 idx += 2; 594 tmp0 = x[i1]; 595 tmp1 = x[i2]; 596 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 597 v1 += 2; 598 } 599 600 if (n == sz - 1) { /* Take care of the last nonzero */ 601 tmp0 = x[*idx++]; 602 sum1 += *v1++ * tmp0; 603 } 604 y[row++] = sum1; 605 break; 606 case 2: 607 sum1 = *zt++; 608 sum2 = *zt++; 609 v2 = v1 + n; 610 611 for (n = 0; n < sz - 1; n += 2) { 612 i1 = idx[0]; 613 i2 = idx[1]; 614 idx += 2; 615 tmp0 = x[i1]; 616 tmp1 = x[i2]; 617 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 618 v1 += 2; 619 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 620 v2 += 2; 621 } 622 if (n == sz - 1) { 623 tmp0 = x[*idx++]; 624 sum1 += *v1++ * tmp0; 625 sum2 += *v2++ * tmp0; 626 } 627 y[row++] = sum1; 628 y[row++] = sum2; 629 v1 = v2; /* Since the next block to be processed starts there*/ 630 idx += sz; 631 break; 632 case 3: 633 sum1 = *zt++; 634 sum2 = *zt++; 635 sum3 = *zt++; 636 v2 = v1 + n; 637 v3 = v2 + n; 638 639 for (n = 0; n < sz - 1; n += 2) { 640 i1 = idx[0]; 641 i2 = idx[1]; 642 idx += 2; 643 tmp0 = x[i1]; 644 tmp1 = x[i2]; 645 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 646 v1 += 2; 647 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 648 v2 += 2; 649 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 650 v3 += 2; 651 } 652 if (n == sz - 1) { 653 tmp0 = x[*idx++]; 654 sum1 += *v1++ * tmp0; 655 sum2 += *v2++ * tmp0; 656 sum3 += *v3++ * tmp0; 657 } 658 y[row++] = sum1; 659 y[row++] = sum2; 660 y[row++] = sum3; 661 v1 = v3; /* Since the next block to be processed starts there*/ 662 idx += 2 * sz; 663 break; 664 case 4: 665 sum1 = *zt++; 666 sum2 = *zt++; 667 sum3 = *zt++; 668 sum4 = *zt++; 669 v2 = v1 + n; 670 v3 = v2 + n; 671 v4 = v3 + n; 672 673 for (n = 0; n < sz - 1; n += 2) { 674 i1 = idx[0]; 675 i2 = idx[1]; 676 idx += 2; 677 tmp0 = x[i1]; 678 tmp1 = x[i2]; 679 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 680 v1 += 2; 681 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 682 v2 += 2; 683 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 684 v3 += 2; 685 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 686 v4 += 2; 687 } 688 if (n == sz - 1) { 689 tmp0 = x[*idx++]; 690 sum1 += *v1++ * tmp0; 691 sum2 += *v2++ * tmp0; 692 sum3 += *v3++ * tmp0; 693 sum4 += *v4++ * tmp0; 694 } 695 y[row++] = sum1; 696 y[row++] = sum2; 697 y[row++] = sum3; 698 y[row++] = sum4; 699 v1 = v4; /* Since the next block to be processed starts there*/ 700 idx += 3 * sz; 701 break; 702 case 5: 703 sum1 = *zt++; 704 sum2 = *zt++; 705 sum3 = *zt++; 706 sum4 = *zt++; 707 sum5 = *zt++; 708 v2 = v1 + n; 709 v3 = v2 + n; 710 v4 = v3 + n; 711 v5 = v4 + n; 712 713 for (n = 0; n < sz - 1; n += 2) { 714 i1 = idx[0]; 715 i2 = idx[1]; 716 idx += 2; 717 tmp0 = x[i1]; 718 tmp1 = x[i2]; 719 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 720 v1 += 2; 721 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 722 v2 += 2; 723 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 724 v3 += 2; 725 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 726 v4 += 2; 727 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 728 v5 += 2; 729 } 730 if (n == sz - 1) { 731 tmp0 = x[*idx++]; 732 sum1 += *v1++ * tmp0; 733 sum2 += *v2++ * tmp0; 734 sum3 += *v3++ * tmp0; 735 sum4 += *v4++ * tmp0; 736 sum5 += *v5++ * tmp0; 737 } 738 y[row++] = sum1; 739 y[row++] = sum2; 740 y[row++] = sum3; 741 y[row++] = sum4; 742 y[row++] = sum5; 743 v1 = v5; /* Since the next block to be processed starts there */ 744 idx += 4 * sz; 745 break; 746 default: 747 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 748 } 749 } 750 PetscCall(VecRestoreArrayRead(xx, &x)); 751 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 752 PetscCall(PetscLogFlops(2.0 * a->nz)); 753 PetscFunctionReturn(0); 754 } 755 756 /* ----------------------------------------------------------- */ 757 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 758 { 759 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 760 IS iscol = a->col, isrow = a->row; 761 const PetscInt *r, *c, *rout, *cout; 762 PetscInt i, j, n = A->rmap->n, nz; 763 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 764 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 765 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 766 PetscScalar sum1, sum2, sum3, sum4, sum5; 767 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 768 const PetscScalar *b; 769 770 PetscFunctionBegin; 771 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 772 node_max = a->inode.node_count; 773 ns = a->inode.size; /* Node Size array */ 774 775 PetscCall(VecGetArrayRead(bb, &b)); 776 PetscCall(VecGetArrayWrite(xx, &x)); 777 tmp = a->solve_work; 778 779 PetscCall(ISGetIndices(isrow, &rout)); 780 r = rout; 781 PetscCall(ISGetIndices(iscol, &cout)); 782 c = cout + (n - 1); 783 784 /* forward solve the lower triangular */ 785 tmps = tmp; 786 aa = a_a; 787 aj = a_j; 788 ad = a->diag; 789 790 for (i = 0, row = 0; i < node_max; ++i) { 791 nsz = ns[i]; 792 aii = ai[row]; 793 v1 = aa + aii; 794 vi = aj + aii; 795 nz = ad[row] - aii; 796 if (i < node_max - 1) { 797 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 798 * but our indexing to determine it's size could. */ 799 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 800 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 801 PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 802 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 803 } 804 805 switch (nsz) { /* Each loop in 'case' is unrolled */ 806 case 1: 807 sum1 = b[*r++]; 808 for (j = 0; j < nz - 1; j += 2) { 809 i0 = vi[0]; 810 i1 = vi[1]; 811 vi += 2; 812 tmp0 = tmps[i0]; 813 tmp1 = tmps[i1]; 814 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 815 v1 += 2; 816 } 817 if (j == nz - 1) { 818 tmp0 = tmps[*vi++]; 819 sum1 -= *v1++ * tmp0; 820 } 821 tmp[row++] = sum1; 822 break; 823 case 2: 824 sum1 = b[*r++]; 825 sum2 = b[*r++]; 826 v2 = aa + ai[row + 1]; 827 828 for (j = 0; j < nz - 1; j += 2) { 829 i0 = vi[0]; 830 i1 = vi[1]; 831 vi += 2; 832 tmp0 = tmps[i0]; 833 tmp1 = tmps[i1]; 834 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 835 v1 += 2; 836 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 837 v2 += 2; 838 } 839 if (j == nz - 1) { 840 tmp0 = tmps[*vi++]; 841 sum1 -= *v1++ * tmp0; 842 sum2 -= *v2++ * tmp0; 843 } 844 sum2 -= *v2++ * sum1; 845 tmp[row++] = sum1; 846 tmp[row++] = sum2; 847 break; 848 case 3: 849 sum1 = b[*r++]; 850 sum2 = b[*r++]; 851 sum3 = b[*r++]; 852 v2 = aa + ai[row + 1]; 853 v3 = aa + ai[row + 2]; 854 855 for (j = 0; j < nz - 1; j += 2) { 856 i0 = vi[0]; 857 i1 = vi[1]; 858 vi += 2; 859 tmp0 = tmps[i0]; 860 tmp1 = tmps[i1]; 861 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 862 v1 += 2; 863 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 864 v2 += 2; 865 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 866 v3 += 2; 867 } 868 if (j == nz - 1) { 869 tmp0 = tmps[*vi++]; 870 sum1 -= *v1++ * tmp0; 871 sum2 -= *v2++ * tmp0; 872 sum3 -= *v3++ * tmp0; 873 } 874 sum2 -= *v2++ * sum1; 875 sum3 -= *v3++ * sum1; 876 sum3 -= *v3++ * sum2; 877 878 tmp[row++] = sum1; 879 tmp[row++] = sum2; 880 tmp[row++] = sum3; 881 break; 882 883 case 4: 884 sum1 = b[*r++]; 885 sum2 = b[*r++]; 886 sum3 = b[*r++]; 887 sum4 = b[*r++]; 888 v2 = aa + ai[row + 1]; 889 v3 = aa + ai[row + 2]; 890 v4 = aa + ai[row + 3]; 891 892 for (j = 0; j < nz - 1; j += 2) { 893 i0 = vi[0]; 894 i1 = vi[1]; 895 vi += 2; 896 tmp0 = tmps[i0]; 897 tmp1 = tmps[i1]; 898 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 899 v1 += 2; 900 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 901 v2 += 2; 902 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 903 v3 += 2; 904 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 905 v4 += 2; 906 } 907 if (j == nz - 1) { 908 tmp0 = tmps[*vi++]; 909 sum1 -= *v1++ * tmp0; 910 sum2 -= *v2++ * tmp0; 911 sum3 -= *v3++ * tmp0; 912 sum4 -= *v4++ * tmp0; 913 } 914 sum2 -= *v2++ * sum1; 915 sum3 -= *v3++ * sum1; 916 sum4 -= *v4++ * sum1; 917 sum3 -= *v3++ * sum2; 918 sum4 -= *v4++ * sum2; 919 sum4 -= *v4++ * sum3; 920 921 tmp[row++] = sum1; 922 tmp[row++] = sum2; 923 tmp[row++] = sum3; 924 tmp[row++] = sum4; 925 break; 926 case 5: 927 sum1 = b[*r++]; 928 sum2 = b[*r++]; 929 sum3 = b[*r++]; 930 sum4 = b[*r++]; 931 sum5 = b[*r++]; 932 v2 = aa + ai[row + 1]; 933 v3 = aa + ai[row + 2]; 934 v4 = aa + ai[row + 3]; 935 v5 = aa + ai[row + 4]; 936 937 for (j = 0; j < nz - 1; j += 2) { 938 i0 = vi[0]; 939 i1 = vi[1]; 940 vi += 2; 941 tmp0 = tmps[i0]; 942 tmp1 = tmps[i1]; 943 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 944 v1 += 2; 945 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 946 v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 948 v3 += 2; 949 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 950 v4 += 2; 951 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 952 v5 += 2; 953 } 954 if (j == nz - 1) { 955 tmp0 = tmps[*vi++]; 956 sum1 -= *v1++ * tmp0; 957 sum2 -= *v2++ * tmp0; 958 sum3 -= *v3++ * tmp0; 959 sum4 -= *v4++ * tmp0; 960 sum5 -= *v5++ * tmp0; 961 } 962 963 sum2 -= *v2++ * sum1; 964 sum3 -= *v3++ * sum1; 965 sum4 -= *v4++ * sum1; 966 sum5 -= *v5++ * sum1; 967 sum3 -= *v3++ * sum2; 968 sum4 -= *v4++ * sum2; 969 sum5 -= *v5++ * sum2; 970 sum4 -= *v4++ * sum3; 971 sum5 -= *v5++ * sum3; 972 sum5 -= *v5++ * sum4; 973 974 tmp[row++] = sum1; 975 tmp[row++] = sum2; 976 tmp[row++] = sum3; 977 tmp[row++] = sum4; 978 tmp[row++] = sum5; 979 break; 980 default: 981 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 982 } 983 } 984 /* backward solve the upper triangular */ 985 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 986 nsz = ns[i]; 987 aii = ai[row + 1] - 1; 988 v1 = aa + aii; 989 vi = aj + aii; 990 nz = aii - ad[row]; 991 switch (nsz) { /* Each loop in 'case' is unrolled */ 992 case 1: 993 sum1 = tmp[row]; 994 995 for (j = nz; j > 1; j -= 2) { 996 vi -= 2; 997 i0 = vi[2]; 998 i1 = vi[1]; 999 tmp0 = tmps[i0]; 1000 tmp1 = tmps[i1]; 1001 v1 -= 2; 1002 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1003 } 1004 if (j == 1) { 1005 tmp0 = tmps[*vi--]; 1006 sum1 -= *v1-- * tmp0; 1007 } 1008 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1009 row--; 1010 break; 1011 case 2: 1012 sum1 = tmp[row]; 1013 sum2 = tmp[row - 1]; 1014 v2 = aa + ai[row] - 1; 1015 for (j = nz; j > 1; j -= 2) { 1016 vi -= 2; 1017 i0 = vi[2]; 1018 i1 = vi[1]; 1019 tmp0 = tmps[i0]; 1020 tmp1 = tmps[i1]; 1021 v1 -= 2; 1022 v2 -= 2; 1023 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1024 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1025 } 1026 if (j == 1) { 1027 tmp0 = tmps[*vi--]; 1028 sum1 -= *v1-- * tmp0; 1029 sum2 -= *v2-- * tmp0; 1030 } 1031 1032 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1033 row--; 1034 sum2 -= *v2-- * tmp0; 1035 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1036 row--; 1037 break; 1038 case 3: 1039 sum1 = tmp[row]; 1040 sum2 = tmp[row - 1]; 1041 sum3 = tmp[row - 2]; 1042 v2 = aa + ai[row] - 1; 1043 v3 = aa + ai[row - 1] - 1; 1044 for (j = nz; j > 1; j -= 2) { 1045 vi -= 2; 1046 i0 = vi[2]; 1047 i1 = vi[1]; 1048 tmp0 = tmps[i0]; 1049 tmp1 = tmps[i1]; 1050 v1 -= 2; 1051 v2 -= 2; 1052 v3 -= 2; 1053 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1054 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1055 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1056 } 1057 if (j == 1) { 1058 tmp0 = tmps[*vi--]; 1059 sum1 -= *v1-- * tmp0; 1060 sum2 -= *v2-- * tmp0; 1061 sum3 -= *v3-- * tmp0; 1062 } 1063 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1064 row--; 1065 sum2 -= *v2-- * tmp0; 1066 sum3 -= *v3-- * tmp0; 1067 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1068 row--; 1069 sum3 -= *v3-- * tmp0; 1070 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1071 row--; 1072 1073 break; 1074 case 4: 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row - 1]; 1077 sum3 = tmp[row - 2]; 1078 sum4 = tmp[row - 3]; 1079 v2 = aa + ai[row] - 1; 1080 v3 = aa + ai[row - 1] - 1; 1081 v4 = aa + ai[row - 2] - 1; 1082 1083 for (j = nz; j > 1; j -= 2) { 1084 vi -= 2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j == 1) { 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1107 row--; 1108 sum2 -= *v2-- * tmp0; 1109 sum3 -= *v3-- * tmp0; 1110 sum4 -= *v4-- * tmp0; 1111 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1112 row--; 1113 sum3 -= *v3-- * tmp0; 1114 sum4 -= *v4-- * tmp0; 1115 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1116 row--; 1117 sum4 -= *v4-- * tmp0; 1118 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1119 row--; 1120 break; 1121 case 5: 1122 sum1 = tmp[row]; 1123 sum2 = tmp[row - 1]; 1124 sum3 = tmp[row - 2]; 1125 sum4 = tmp[row - 3]; 1126 sum5 = tmp[row - 4]; 1127 v2 = aa + ai[row] - 1; 1128 v3 = aa + ai[row - 1] - 1; 1129 v4 = aa + ai[row - 2] - 1; 1130 v5 = aa + ai[row - 3] - 1; 1131 for (j = nz; j > 1; j -= 2) { 1132 vi -= 2; 1133 i0 = vi[2]; 1134 i1 = vi[1]; 1135 tmp0 = tmps[i0]; 1136 tmp1 = tmps[i1]; 1137 v1 -= 2; 1138 v2 -= 2; 1139 v3 -= 2; 1140 v4 -= 2; 1141 v5 -= 2; 1142 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1143 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1144 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1145 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1146 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1147 } 1148 if (j == 1) { 1149 tmp0 = tmps[*vi--]; 1150 sum1 -= *v1-- * tmp0; 1151 sum2 -= *v2-- * tmp0; 1152 sum3 -= *v3-- * tmp0; 1153 sum4 -= *v4-- * tmp0; 1154 sum5 -= *v5-- * tmp0; 1155 } 1156 1157 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1158 row--; 1159 sum2 -= *v2-- * tmp0; 1160 sum3 -= *v3-- * tmp0; 1161 sum4 -= *v4-- * tmp0; 1162 sum5 -= *v5-- * tmp0; 1163 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1164 row--; 1165 sum3 -= *v3-- * tmp0; 1166 sum4 -= *v4-- * tmp0; 1167 sum5 -= *v5-- * tmp0; 1168 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1169 row--; 1170 sum4 -= *v4-- * tmp0; 1171 sum5 -= *v5-- * tmp0; 1172 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1173 row--; 1174 sum5 -= *v5-- * tmp0; 1175 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1176 row--; 1177 break; 1178 default: 1179 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1180 } 1181 } 1182 PetscCall(ISRestoreIndices(isrow, &rout)); 1183 PetscCall(ISRestoreIndices(iscol, &cout)); 1184 PetscCall(VecRestoreArrayRead(bb, &b)); 1185 PetscCall(VecRestoreArrayWrite(xx, &x)); 1186 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1187 PetscFunctionReturn(0); 1188 } 1189 1190 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1191 { 1192 Mat C = B; 1193 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1194 IS isrow = b->row, isicol = b->icol; 1195 const PetscInt *r, *ic, *ics; 1196 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1197 PetscInt i, j, k, nz, nzL, row, *pj; 1198 const PetscInt *ajtmp, *bjtmp; 1199 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1200 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1201 FactorShiftCtx sctx; 1202 const PetscInt *ddiag; 1203 PetscReal rs; 1204 MatScalar d; 1205 PetscInt inod, nodesz, node_max, col; 1206 const PetscInt *ns; 1207 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1208 1209 PetscFunctionBegin; 1210 /* MatPivotSetUp(): initialize shift context sctx */ 1211 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1212 1213 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1214 ddiag = a->diag; 1215 sctx.shift_top = info->zeropivot; 1216 for (i = 0; i < n; i++) { 1217 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1218 d = (aa)[ddiag[i]]; 1219 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1220 v = aa + ai[i]; 1221 nz = ai[i + 1] - ai[i]; 1222 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1223 if (rs > sctx.shift_top) sctx.shift_top = rs; 1224 } 1225 sctx.shift_top *= 1.1; 1226 sctx.nshift_max = 5; 1227 sctx.shift_lo = 0.; 1228 sctx.shift_hi = 1.; 1229 } 1230 1231 PetscCall(ISGetIndices(isrow, &r)); 1232 PetscCall(ISGetIndices(isicol, &ic)); 1233 1234 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1235 ics = ic; 1236 1237 node_max = a->inode.node_count; 1238 ns = a->inode.size; 1239 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1240 1241 /* If max inode size > 4, split it into two inodes.*/ 1242 /* also map the inode sizes according to the ordering */ 1243 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1244 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1245 if (ns[i] > 4) { 1246 tmp_vec1[j] = 4; 1247 ++j; 1248 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 1249 } else { 1250 tmp_vec1[j] = ns[i]; 1251 } 1252 } 1253 /* Use the correct node_max */ 1254 node_max = j; 1255 1256 /* Now reorder the inode info based on mat re-ordering info */ 1257 /* First create a row -> inode_size_array_index map */ 1258 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1259 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1260 for (i = 0, row = 0; i < node_max; i++) { 1261 nodesz = tmp_vec1[i]; 1262 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1263 } 1264 /* Using nsmap, create a reordered ns structure */ 1265 for (i = 0, j = 0; i < node_max; i++) { 1266 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1267 tmp_vec2[i] = nodesz; 1268 j += nodesz; 1269 } 1270 PetscCall(PetscFree(nsmap)); 1271 PetscCall(PetscFree(tmp_vec1)); 1272 1273 /* Now use the correct ns */ 1274 ns = tmp_vec2; 1275 1276 do { 1277 sctx.newshift = PETSC_FALSE; 1278 /* Now loop over each block-row, and do the factorization */ 1279 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1280 nodesz = ns[inod]; 1281 1282 switch (nodesz) { 1283 case 1: 1284 /*----------*/ 1285 /* zero rtmp1 */ 1286 /* L part */ 1287 nz = bi[i + 1] - bi[i]; 1288 bjtmp = bj + bi[i]; 1289 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1290 1291 /* U part */ 1292 nz = bdiag[i] - bdiag[i + 1]; 1293 bjtmp = bj + bdiag[i + 1] + 1; 1294 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1295 1296 /* load in initial (unfactored row) */ 1297 nz = ai[r[i] + 1] - ai[r[i]]; 1298 ajtmp = aj + ai[r[i]]; 1299 v = aa + ai[r[i]]; 1300 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1301 1302 /* ZeropivotApply() */ 1303 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1304 1305 /* elimination */ 1306 bjtmp = bj + bi[i]; 1307 row = *bjtmp++; 1308 nzL = bi[i + 1] - bi[i]; 1309 for (k = 0; k < nzL; k++) { 1310 pc = rtmp1 + row; 1311 if (*pc != 0.0) { 1312 pv = b->a + bdiag[row]; 1313 mul1 = *pc * (*pv); 1314 *pc = mul1; 1315 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1316 pv = b->a + bdiag[row + 1] + 1; 1317 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1318 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1319 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1320 } 1321 row = *bjtmp++; 1322 } 1323 1324 /* finished row so stick it into b->a */ 1325 rs = 0.0; 1326 /* L part */ 1327 pv = b->a + bi[i]; 1328 pj = b->j + bi[i]; 1329 nz = bi[i + 1] - bi[i]; 1330 for (j = 0; j < nz; j++) { 1331 pv[j] = rtmp1[pj[j]]; 1332 rs += PetscAbsScalar(pv[j]); 1333 } 1334 1335 /* U part */ 1336 pv = b->a + bdiag[i + 1] + 1; 1337 pj = b->j + bdiag[i + 1] + 1; 1338 nz = bdiag[i] - bdiag[i + 1] - 1; 1339 for (j = 0; j < nz; j++) { 1340 pv[j] = rtmp1[pj[j]]; 1341 rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* Check zero pivot */ 1345 sctx.rs = rs; 1346 sctx.pv = rtmp1[i]; 1347 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1348 if (sctx.newshift) break; 1349 1350 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1351 pv = b->a + bdiag[i]; 1352 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1353 break; 1354 1355 case 2: 1356 /*----------*/ 1357 /* zero rtmp1 and rtmp2 */ 1358 /* L part */ 1359 nz = bi[i + 1] - bi[i]; 1360 bjtmp = bj + bi[i]; 1361 for (j = 0; j < nz; j++) { 1362 col = bjtmp[j]; 1363 rtmp1[col] = 0.0; 1364 rtmp2[col] = 0.0; 1365 } 1366 1367 /* U part */ 1368 nz = bdiag[i] - bdiag[i + 1]; 1369 bjtmp = bj + bdiag[i + 1] + 1; 1370 for (j = 0; j < nz; j++) { 1371 col = bjtmp[j]; 1372 rtmp1[col] = 0.0; 1373 rtmp2[col] = 0.0; 1374 } 1375 1376 /* load in initial (unfactored row) */ 1377 nz = ai[r[i] + 1] - ai[r[i]]; 1378 ajtmp = aj + ai[r[i]]; 1379 v1 = aa + ai[r[i]]; 1380 v2 = aa + ai[r[i] + 1]; 1381 for (j = 0; j < nz; j++) { 1382 col = ics[ajtmp[j]]; 1383 rtmp1[col] = v1[j]; 1384 rtmp2[col] = v2[j]; 1385 } 1386 /* ZeropivotApply(): shift the diagonal of the matrix */ 1387 rtmp1[i] += sctx.shift_amount; 1388 rtmp2[i + 1] += sctx.shift_amount; 1389 1390 /* elimination */ 1391 bjtmp = bj + bi[i]; 1392 row = *bjtmp++; /* pivot row */ 1393 nzL = bi[i + 1] - bi[i]; 1394 for (k = 0; k < nzL; k++) { 1395 pc1 = rtmp1 + row; 1396 pc2 = rtmp2 + row; 1397 if (*pc1 != 0.0 || *pc2 != 0.0) { 1398 pv = b->a + bdiag[row]; 1399 mul1 = *pc1 * (*pv); 1400 mul2 = *pc2 * (*pv); 1401 *pc1 = mul1; 1402 *pc2 = mul2; 1403 1404 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1405 pv = b->a + bdiag[row + 1] + 1; 1406 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1407 for (j = 0; j < nz; j++) { 1408 col = pj[j]; 1409 rtmp1[col] -= mul1 * pv[j]; 1410 rtmp2[col] -= mul2 * pv[j]; 1411 } 1412 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1413 } 1414 row = *bjtmp++; 1415 } 1416 1417 /* finished row i; check zero pivot, then stick row i into b->a */ 1418 rs = 0.0; 1419 /* L part */ 1420 pc1 = b->a + bi[i]; 1421 pj = b->j + bi[i]; 1422 nz = bi[i + 1] - bi[i]; 1423 for (j = 0; j < nz; j++) { 1424 col = pj[j]; 1425 pc1[j] = rtmp1[col]; 1426 rs += PetscAbsScalar(pc1[j]); 1427 } 1428 /* U part */ 1429 pc1 = b->a + bdiag[i + 1] + 1; 1430 pj = b->j + bdiag[i + 1] + 1; 1431 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1432 for (j = 0; j < nz; j++) { 1433 col = pj[j]; 1434 pc1[j] = rtmp1[col]; 1435 rs += PetscAbsScalar(pc1[j]); 1436 } 1437 1438 sctx.rs = rs; 1439 sctx.pv = rtmp1[i]; 1440 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1441 if (sctx.newshift) break; 1442 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1443 *pc1 = 1.0 / sctx.pv; 1444 1445 /* Now take care of diagonal 2x2 block. */ 1446 pc2 = rtmp2 + i; 1447 if (*pc2 != 0.0) { 1448 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1449 *pc2 = mul1; /* insert L entry */ 1450 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1451 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1452 for (j = 0; j < nz; j++) { 1453 col = pj[j]; 1454 rtmp2[col] -= mul1 * rtmp1[col]; 1455 } 1456 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1457 } 1458 1459 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1460 rs = 0.0; 1461 /* L part */ 1462 pc2 = b->a + bi[i + 1]; 1463 pj = b->j + bi[i + 1]; 1464 nz = bi[i + 2] - bi[i + 1]; 1465 for (j = 0; j < nz; j++) { 1466 col = pj[j]; 1467 pc2[j] = rtmp2[col]; 1468 rs += PetscAbsScalar(pc2[j]); 1469 } 1470 /* U part */ 1471 pc2 = b->a + bdiag[i + 2] + 1; 1472 pj = b->j + bdiag[i + 2] + 1; 1473 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1474 for (j = 0; j < nz; j++) { 1475 col = pj[j]; 1476 pc2[j] = rtmp2[col]; 1477 rs += PetscAbsScalar(pc2[j]); 1478 } 1479 1480 sctx.rs = rs; 1481 sctx.pv = rtmp2[i + 1]; 1482 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1483 if (sctx.newshift) break; 1484 pc2 = b->a + bdiag[i + 1]; 1485 *pc2 = 1.0 / sctx.pv; 1486 break; 1487 1488 case 3: 1489 /*----------*/ 1490 /* zero rtmp */ 1491 /* L part */ 1492 nz = bi[i + 1] - bi[i]; 1493 bjtmp = bj + bi[i]; 1494 for (j = 0; j < nz; j++) { 1495 col = bjtmp[j]; 1496 rtmp1[col] = 0.0; 1497 rtmp2[col] = 0.0; 1498 rtmp3[col] = 0.0; 1499 } 1500 1501 /* U part */ 1502 nz = bdiag[i] - bdiag[i + 1]; 1503 bjtmp = bj + bdiag[i + 1] + 1; 1504 for (j = 0; j < nz; j++) { 1505 col = bjtmp[j]; 1506 rtmp1[col] = 0.0; 1507 rtmp2[col] = 0.0; 1508 rtmp3[col] = 0.0; 1509 } 1510 1511 /* load in initial (unfactored row) */ 1512 nz = ai[r[i] + 1] - ai[r[i]]; 1513 ajtmp = aj + ai[r[i]]; 1514 v1 = aa + ai[r[i]]; 1515 v2 = aa + ai[r[i] + 1]; 1516 v3 = aa + ai[r[i] + 2]; 1517 for (j = 0; j < nz; j++) { 1518 col = ics[ajtmp[j]]; 1519 rtmp1[col] = v1[j]; 1520 rtmp2[col] = v2[j]; 1521 rtmp3[col] = v3[j]; 1522 } 1523 /* ZeropivotApply(): shift the diagonal of the matrix */ 1524 rtmp1[i] += sctx.shift_amount; 1525 rtmp2[i + 1] += sctx.shift_amount; 1526 rtmp3[i + 2] += sctx.shift_amount; 1527 1528 /* elimination */ 1529 bjtmp = bj + bi[i]; 1530 row = *bjtmp++; /* pivot row */ 1531 nzL = bi[i + 1] - bi[i]; 1532 for (k = 0; k < nzL; k++) { 1533 pc1 = rtmp1 + row; 1534 pc2 = rtmp2 + row; 1535 pc3 = rtmp3 + row; 1536 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1537 pv = b->a + bdiag[row]; 1538 mul1 = *pc1 * (*pv); 1539 mul2 = *pc2 * (*pv); 1540 mul3 = *pc3 * (*pv); 1541 *pc1 = mul1; 1542 *pc2 = mul2; 1543 *pc3 = mul3; 1544 1545 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1546 pv = b->a + bdiag[row + 1] + 1; 1547 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1548 for (j = 0; j < nz; j++) { 1549 col = pj[j]; 1550 rtmp1[col] -= mul1 * pv[j]; 1551 rtmp2[col] -= mul2 * pv[j]; 1552 rtmp3[col] -= mul3 * pv[j]; 1553 } 1554 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1555 } 1556 row = *bjtmp++; 1557 } 1558 1559 /* finished row i; check zero pivot, then stick row i into b->a */ 1560 rs = 0.0; 1561 /* L part */ 1562 pc1 = b->a + bi[i]; 1563 pj = b->j + bi[i]; 1564 nz = bi[i + 1] - bi[i]; 1565 for (j = 0; j < nz; j++) { 1566 col = pj[j]; 1567 pc1[j] = rtmp1[col]; 1568 rs += PetscAbsScalar(pc1[j]); 1569 } 1570 /* U part */ 1571 pc1 = b->a + bdiag[i + 1] + 1; 1572 pj = b->j + bdiag[i + 1] + 1; 1573 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1574 for (j = 0; j < nz; j++) { 1575 col = pj[j]; 1576 pc1[j] = rtmp1[col]; 1577 rs += PetscAbsScalar(pc1[j]); 1578 } 1579 1580 sctx.rs = rs; 1581 sctx.pv = rtmp1[i]; 1582 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1583 if (sctx.newshift) break; 1584 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1585 *pc1 = 1.0 / sctx.pv; 1586 1587 /* Now take care of 1st column of diagonal 3x3 block. */ 1588 pc2 = rtmp2 + i; 1589 pc3 = rtmp3 + i; 1590 if (*pc2 != 0.0 || *pc3 != 0.0) { 1591 mul2 = (*pc2) * (*pc1); 1592 *pc2 = mul2; 1593 mul3 = (*pc3) * (*pc1); 1594 *pc3 = mul3; 1595 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1596 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1597 for (j = 0; j < nz; j++) { 1598 col = pj[j]; 1599 rtmp2[col] -= mul2 * rtmp1[col]; 1600 rtmp3[col] -= mul3 * rtmp1[col]; 1601 } 1602 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1603 } 1604 1605 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1606 rs = 0.0; 1607 /* L part */ 1608 pc2 = b->a + bi[i + 1]; 1609 pj = b->j + bi[i + 1]; 1610 nz = bi[i + 2] - bi[i + 1]; 1611 for (j = 0; j < nz; j++) { 1612 col = pj[j]; 1613 pc2[j] = rtmp2[col]; 1614 rs += PetscAbsScalar(pc2[j]); 1615 } 1616 /* U part */ 1617 pc2 = b->a + bdiag[i + 2] + 1; 1618 pj = b->j + bdiag[i + 2] + 1; 1619 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1620 for (j = 0; j < nz; j++) { 1621 col = pj[j]; 1622 pc2[j] = rtmp2[col]; 1623 rs += PetscAbsScalar(pc2[j]); 1624 } 1625 1626 sctx.rs = rs; 1627 sctx.pv = rtmp2[i + 1]; 1628 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1629 if (sctx.newshift) break; 1630 pc2 = b->a + bdiag[i + 1]; 1631 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1632 1633 /* Now take care of 2nd column of diagonal 3x3 block. */ 1634 pc3 = rtmp3 + i + 1; 1635 if (*pc3 != 0.0) { 1636 mul3 = (*pc3) * (*pc2); 1637 *pc3 = mul3; 1638 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1639 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1640 for (j = 0; j < nz; j++) { 1641 col = pj[j]; 1642 rtmp3[col] -= mul3 * rtmp2[col]; 1643 } 1644 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1645 } 1646 1647 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1648 rs = 0.0; 1649 /* L part */ 1650 pc3 = b->a + bi[i + 2]; 1651 pj = b->j + bi[i + 2]; 1652 nz = bi[i + 3] - bi[i + 2]; 1653 for (j = 0; j < nz; j++) { 1654 col = pj[j]; 1655 pc3[j] = rtmp3[col]; 1656 rs += PetscAbsScalar(pc3[j]); 1657 } 1658 /* U part */ 1659 pc3 = b->a + bdiag[i + 3] + 1; 1660 pj = b->j + bdiag[i + 3] + 1; 1661 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1662 for (j = 0; j < nz; j++) { 1663 col = pj[j]; 1664 pc3[j] = rtmp3[col]; 1665 rs += PetscAbsScalar(pc3[j]); 1666 } 1667 1668 sctx.rs = rs; 1669 sctx.pv = rtmp3[i + 2]; 1670 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1671 if (sctx.newshift) break; 1672 pc3 = b->a + bdiag[i + 2]; 1673 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1674 break; 1675 case 4: 1676 /*----------*/ 1677 /* zero rtmp */ 1678 /* L part */ 1679 nz = bi[i + 1] - bi[i]; 1680 bjtmp = bj + bi[i]; 1681 for (j = 0; j < nz; j++) { 1682 col = bjtmp[j]; 1683 rtmp1[col] = 0.0; 1684 rtmp2[col] = 0.0; 1685 rtmp3[col] = 0.0; 1686 rtmp4[col] = 0.0; 1687 } 1688 1689 /* U part */ 1690 nz = bdiag[i] - bdiag[i + 1]; 1691 bjtmp = bj + bdiag[i + 1] + 1; 1692 for (j = 0; j < nz; j++) { 1693 col = bjtmp[j]; 1694 rtmp1[col] = 0.0; 1695 rtmp2[col] = 0.0; 1696 rtmp3[col] = 0.0; 1697 rtmp4[col] = 0.0; 1698 } 1699 1700 /* load in initial (unfactored row) */ 1701 nz = ai[r[i] + 1] - ai[r[i]]; 1702 ajtmp = aj + ai[r[i]]; 1703 v1 = aa + ai[r[i]]; 1704 v2 = aa + ai[r[i] + 1]; 1705 v3 = aa + ai[r[i] + 2]; 1706 v4 = aa + ai[r[i] + 3]; 1707 for (j = 0; j < nz; j++) { 1708 col = ics[ajtmp[j]]; 1709 rtmp1[col] = v1[j]; 1710 rtmp2[col] = v2[j]; 1711 rtmp3[col] = v3[j]; 1712 rtmp4[col] = v4[j]; 1713 } 1714 /* ZeropivotApply(): shift the diagonal of the matrix */ 1715 rtmp1[i] += sctx.shift_amount; 1716 rtmp2[i + 1] += sctx.shift_amount; 1717 rtmp3[i + 2] += sctx.shift_amount; 1718 rtmp4[i + 3] += sctx.shift_amount; 1719 1720 /* elimination */ 1721 bjtmp = bj + bi[i]; 1722 row = *bjtmp++; /* pivot row */ 1723 nzL = bi[i + 1] - bi[i]; 1724 for (k = 0; k < nzL; k++) { 1725 pc1 = rtmp1 + row; 1726 pc2 = rtmp2 + row; 1727 pc3 = rtmp3 + row; 1728 pc4 = rtmp4 + row; 1729 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1730 pv = b->a + bdiag[row]; 1731 mul1 = *pc1 * (*pv); 1732 mul2 = *pc2 * (*pv); 1733 mul3 = *pc3 * (*pv); 1734 mul4 = *pc4 * (*pv); 1735 *pc1 = mul1; 1736 *pc2 = mul2; 1737 *pc3 = mul3; 1738 *pc4 = mul4; 1739 1740 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1741 pv = b->a + bdiag[row + 1] + 1; 1742 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1743 for (j = 0; j < nz; j++) { 1744 col = pj[j]; 1745 rtmp1[col] -= mul1 * pv[j]; 1746 rtmp2[col] -= mul2 * pv[j]; 1747 rtmp3[col] -= mul3 * pv[j]; 1748 rtmp4[col] -= mul4 * pv[j]; 1749 } 1750 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1751 } 1752 row = *bjtmp++; 1753 } 1754 1755 /* finished row i; check zero pivot, then stick row i into b->a */ 1756 rs = 0.0; 1757 /* L part */ 1758 pc1 = b->a + bi[i]; 1759 pj = b->j + bi[i]; 1760 nz = bi[i + 1] - bi[i]; 1761 for (j = 0; j < nz; j++) { 1762 col = pj[j]; 1763 pc1[j] = rtmp1[col]; 1764 rs += PetscAbsScalar(pc1[j]); 1765 } 1766 /* U part */ 1767 pc1 = b->a + bdiag[i + 1] + 1; 1768 pj = b->j + bdiag[i + 1] + 1; 1769 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1770 for (j = 0; j < nz; j++) { 1771 col = pj[j]; 1772 pc1[j] = rtmp1[col]; 1773 rs += PetscAbsScalar(pc1[j]); 1774 } 1775 1776 sctx.rs = rs; 1777 sctx.pv = rtmp1[i]; 1778 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1779 if (sctx.newshift) break; 1780 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1781 *pc1 = 1.0 / sctx.pv; 1782 1783 /* Now take care of 1st column of diagonal 4x4 block. */ 1784 pc2 = rtmp2 + i; 1785 pc3 = rtmp3 + i; 1786 pc4 = rtmp4 + i; 1787 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1788 mul2 = (*pc2) * (*pc1); 1789 *pc2 = mul2; 1790 mul3 = (*pc3) * (*pc1); 1791 *pc3 = mul3; 1792 mul4 = (*pc4) * (*pc1); 1793 *pc4 = mul4; 1794 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1795 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1796 for (j = 0; j < nz; j++) { 1797 col = pj[j]; 1798 rtmp2[col] -= mul2 * rtmp1[col]; 1799 rtmp3[col] -= mul3 * rtmp1[col]; 1800 rtmp4[col] -= mul4 * rtmp1[col]; 1801 } 1802 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1803 } 1804 1805 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1806 rs = 0.0; 1807 /* L part */ 1808 pc2 = b->a + bi[i + 1]; 1809 pj = b->j + bi[i + 1]; 1810 nz = bi[i + 2] - bi[i + 1]; 1811 for (j = 0; j < nz; j++) { 1812 col = pj[j]; 1813 pc2[j] = rtmp2[col]; 1814 rs += PetscAbsScalar(pc2[j]); 1815 } 1816 /* U part */ 1817 pc2 = b->a + bdiag[i + 2] + 1; 1818 pj = b->j + bdiag[i + 2] + 1; 1819 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1820 for (j = 0; j < nz; j++) { 1821 col = pj[j]; 1822 pc2[j] = rtmp2[col]; 1823 rs += PetscAbsScalar(pc2[j]); 1824 } 1825 1826 sctx.rs = rs; 1827 sctx.pv = rtmp2[i + 1]; 1828 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1829 if (sctx.newshift) break; 1830 pc2 = b->a + bdiag[i + 1]; 1831 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1832 1833 /* Now take care of 2nd column of diagonal 4x4 block. */ 1834 pc3 = rtmp3 + i + 1; 1835 pc4 = rtmp4 + i + 1; 1836 if (*pc3 != 0.0 || *pc4 != 0.0) { 1837 mul3 = (*pc3) * (*pc2); 1838 *pc3 = mul3; 1839 mul4 = (*pc4) * (*pc2); 1840 *pc4 = mul4; 1841 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1842 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1843 for (j = 0; j < nz; j++) { 1844 col = pj[j]; 1845 rtmp3[col] -= mul3 * rtmp2[col]; 1846 rtmp4[col] -= mul4 * rtmp2[col]; 1847 } 1848 PetscCall(PetscLogFlops(4.0 * nz)); 1849 } 1850 1851 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1852 rs = 0.0; 1853 /* L part */ 1854 pc3 = b->a + bi[i + 2]; 1855 pj = b->j + bi[i + 2]; 1856 nz = bi[i + 3] - bi[i + 2]; 1857 for (j = 0; j < nz; j++) { 1858 col = pj[j]; 1859 pc3[j] = rtmp3[col]; 1860 rs += PetscAbsScalar(pc3[j]); 1861 } 1862 /* U part */ 1863 pc3 = b->a + bdiag[i + 3] + 1; 1864 pj = b->j + bdiag[i + 3] + 1; 1865 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1866 for (j = 0; j < nz; j++) { 1867 col = pj[j]; 1868 pc3[j] = rtmp3[col]; 1869 rs += PetscAbsScalar(pc3[j]); 1870 } 1871 1872 sctx.rs = rs; 1873 sctx.pv = rtmp3[i + 2]; 1874 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1875 if (sctx.newshift) break; 1876 pc3 = b->a + bdiag[i + 2]; 1877 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1878 1879 /* Now take care of 3rd column of diagonal 4x4 block. */ 1880 pc4 = rtmp4 + i + 2; 1881 if (*pc4 != 0.0) { 1882 mul4 = (*pc4) * (*pc3); 1883 *pc4 = mul4; 1884 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1885 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1886 for (j = 0; j < nz; j++) { 1887 col = pj[j]; 1888 rtmp4[col] -= mul4 * rtmp3[col]; 1889 } 1890 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1891 } 1892 1893 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1894 rs = 0.0; 1895 /* L part */ 1896 pc4 = b->a + bi[i + 3]; 1897 pj = b->j + bi[i + 3]; 1898 nz = bi[i + 4] - bi[i + 3]; 1899 for (j = 0; j < nz; j++) { 1900 col = pj[j]; 1901 pc4[j] = rtmp4[col]; 1902 rs += PetscAbsScalar(pc4[j]); 1903 } 1904 /* U part */ 1905 pc4 = b->a + bdiag[i + 4] + 1; 1906 pj = b->j + bdiag[i + 4] + 1; 1907 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1908 for (j = 0; j < nz; j++) { 1909 col = pj[j]; 1910 pc4[j] = rtmp4[col]; 1911 rs += PetscAbsScalar(pc4[j]); 1912 } 1913 1914 sctx.rs = rs; 1915 sctx.pv = rtmp4[i + 3]; 1916 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1917 if (sctx.newshift) break; 1918 pc4 = b->a + bdiag[i + 3]; 1919 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1920 break; 1921 1922 default: 1923 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1924 } 1925 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1926 i += nodesz; /* Update the row */ 1927 } 1928 1929 /* MatPivotRefine() */ 1930 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1931 /* 1932 * if no shift in this attempt & shifting & started shifting & can refine, 1933 * then try lower shift 1934 */ 1935 sctx.shift_hi = sctx.shift_fraction; 1936 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1937 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1938 sctx.newshift = PETSC_TRUE; 1939 sctx.nshift++; 1940 } 1941 } while (sctx.newshift); 1942 1943 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1944 PetscCall(PetscFree(tmp_vec2)); 1945 PetscCall(ISRestoreIndices(isicol, &ic)); 1946 PetscCall(ISRestoreIndices(isrow, &r)); 1947 1948 if (b->inode.size) { 1949 C->ops->solve = MatSolve_SeqAIJ_Inode; 1950 } else { 1951 C->ops->solve = MatSolve_SeqAIJ; 1952 } 1953 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1954 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1955 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1956 C->ops->matsolve = MatMatSolve_SeqAIJ; 1957 C->assembled = PETSC_TRUE; 1958 C->preallocated = PETSC_TRUE; 1959 1960 PetscCall(PetscLogFlops(C->cmap->n)); 1961 1962 /* MatShiftView(A,info,&sctx) */ 1963 if (sctx.nshift) { 1964 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1965 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1966 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1967 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1968 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1969 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1970 } 1971 } 1972 PetscFunctionReturn(0); 1973 } 1974 1975 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1976 { 1977 Mat C = B; 1978 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1979 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1980 const PetscInt *r, *ic, *c, *ics; 1981 PetscInt n = A->rmap->n, *bi = b->i; 1982 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1983 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1984 PetscInt *ai = a->i, *aj = a->j; 1985 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1986 PetscScalar mul1, mul2, mul3, tmp; 1987 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1988 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1989 PetscReal rs = 0.0; 1990 FactorShiftCtx sctx; 1991 1992 PetscFunctionBegin; 1993 sctx.shift_top = 0; 1994 sctx.nshift_max = 0; 1995 sctx.shift_lo = 0; 1996 sctx.shift_hi = 0; 1997 sctx.shift_fraction = 0; 1998 1999 /* if both shift schemes are chosen by user, only use info->shiftpd */ 2000 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 2001 sctx.shift_top = 0; 2002 for (i = 0; i < n; i++) { 2003 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 2004 rs = 0.0; 2005 ajtmp = aj + ai[i]; 2006 rtmp1 = aa + ai[i]; 2007 nz = ai[i + 1] - ai[i]; 2008 for (j = 0; j < nz; j++) { 2009 if (*ajtmp != i) { 2010 rs += PetscAbsScalar(*rtmp1++); 2011 } else { 2012 rs -= PetscRealPart(*rtmp1++); 2013 } 2014 ajtmp++; 2015 } 2016 if (rs > sctx.shift_top) sctx.shift_top = rs; 2017 } 2018 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2019 sctx.shift_top *= 1.1; 2020 sctx.nshift_max = 5; 2021 sctx.shift_lo = 0.; 2022 sctx.shift_hi = 1.; 2023 } 2024 sctx.shift_amount = 0; 2025 sctx.nshift = 0; 2026 2027 PetscCall(ISGetIndices(isrow, &r)); 2028 PetscCall(ISGetIndices(iscol, &c)); 2029 PetscCall(ISGetIndices(isicol, &ic)); 2030 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2031 ics = ic; 2032 2033 node_max = a->inode.node_count; 2034 ns = a->inode.size; 2035 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2036 2037 /* If max inode size > 3, split it into two inodes.*/ 2038 /* also map the inode sizes according to the ordering */ 2039 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2040 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2041 if (ns[i] > 3) { 2042 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2043 ++j; 2044 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2045 } else { 2046 tmp_vec1[j] = ns[i]; 2047 } 2048 } 2049 /* Use the correct node_max */ 2050 node_max = j; 2051 2052 /* Now reorder the inode info based on mat re-ordering info */ 2053 /* First create a row -> inode_size_array_index map */ 2054 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2055 for (i = 0, row = 0; i < node_max; i++) { 2056 nodesz = tmp_vec1[i]; 2057 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2058 } 2059 /* Using nsmap, create a reordered ns structure */ 2060 for (i = 0, j = 0; i < node_max; i++) { 2061 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2062 tmp_vec2[i] = nodesz; 2063 j += nodesz; 2064 } 2065 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2066 /* Now use the correct ns */ 2067 ns = tmp_vec2; 2068 2069 do { 2070 sctx.newshift = PETSC_FALSE; 2071 /* Now loop over each block-row, and do the factorization */ 2072 for (i = 0, row = 0; i < node_max; i++) { 2073 nodesz = ns[i]; 2074 nz = bi[row + 1] - bi[row]; 2075 bjtmp = bj + bi[row]; 2076 2077 switch (nodesz) { 2078 case 1: 2079 for (j = 0; j < nz; j++) { 2080 idx = bjtmp[j]; 2081 rtmp11[idx] = 0.0; 2082 } 2083 2084 /* load in initial (unfactored row) */ 2085 idx = r[row]; 2086 nz_tmp = ai[idx + 1] - ai[idx]; 2087 ajtmp = aj + ai[idx]; 2088 v1 = aa + ai[idx]; 2089 2090 for (j = 0; j < nz_tmp; j++) { 2091 idx = ics[ajtmp[j]]; 2092 rtmp11[idx] = v1[j]; 2093 } 2094 rtmp11[ics[r[row]]] += sctx.shift_amount; 2095 2096 prow = *bjtmp++; 2097 while (prow < row) { 2098 pc1 = rtmp11 + prow; 2099 if (*pc1 != 0.0) { 2100 pv = ba + bd[prow]; 2101 pj = nbj + bd[prow]; 2102 mul1 = *pc1 * *pv++; 2103 *pc1 = mul1; 2104 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2105 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2106 for (j = 0; j < nz_tmp; j++) { 2107 tmp = pv[j]; 2108 idx = pj[j]; 2109 rtmp11[idx] -= mul1 * tmp; 2110 } 2111 } 2112 prow = *bjtmp++; 2113 } 2114 pj = bj + bi[row]; 2115 pc1 = ba + bi[row]; 2116 2117 sctx.pv = rtmp11[row]; 2118 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2119 rs = 0.0; 2120 for (j = 0; j < nz; j++) { 2121 idx = pj[j]; 2122 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2123 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2124 } 2125 sctx.rs = rs; 2126 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2127 if (sctx.newshift) goto endofwhile; 2128 break; 2129 2130 case 2: 2131 for (j = 0; j < nz; j++) { 2132 idx = bjtmp[j]; 2133 rtmp11[idx] = 0.0; 2134 rtmp22[idx] = 0.0; 2135 } 2136 2137 /* load in initial (unfactored row) */ 2138 idx = r[row]; 2139 nz_tmp = ai[idx + 1] - ai[idx]; 2140 ajtmp = aj + ai[idx]; 2141 v1 = aa + ai[idx]; 2142 v2 = aa + ai[idx + 1]; 2143 for (j = 0; j < nz_tmp; j++) { 2144 idx = ics[ajtmp[j]]; 2145 rtmp11[idx] = v1[j]; 2146 rtmp22[idx] = v2[j]; 2147 } 2148 rtmp11[ics[r[row]]] += sctx.shift_amount; 2149 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2150 2151 prow = *bjtmp++; 2152 while (prow < row) { 2153 pc1 = rtmp11 + prow; 2154 pc2 = rtmp22 + prow; 2155 if (*pc1 != 0.0 || *pc2 != 0.0) { 2156 pv = ba + bd[prow]; 2157 pj = nbj + bd[prow]; 2158 mul1 = *pc1 * *pv; 2159 mul2 = *pc2 * *pv; 2160 ++pv; 2161 *pc1 = mul1; 2162 *pc2 = mul2; 2163 2164 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2165 for (j = 0; j < nz_tmp; j++) { 2166 tmp = pv[j]; 2167 idx = pj[j]; 2168 rtmp11[idx] -= mul1 * tmp; 2169 rtmp22[idx] -= mul2 * tmp; 2170 } 2171 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2172 } 2173 prow = *bjtmp++; 2174 } 2175 2176 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2177 pc1 = rtmp11 + prow; 2178 pc2 = rtmp22 + prow; 2179 2180 sctx.pv = *pc1; 2181 pj = bj + bi[prow]; 2182 rs = 0.0; 2183 for (j = 0; j < nz; j++) { 2184 idx = pj[j]; 2185 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2186 } 2187 sctx.rs = rs; 2188 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2189 if (sctx.newshift) goto endofwhile; 2190 2191 if (*pc2 != 0.0) { 2192 pj = nbj + bd[prow]; 2193 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2194 *pc2 = mul2; 2195 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2196 for (j = 0; j < nz_tmp; j++) { 2197 idx = pj[j]; 2198 tmp = rtmp11[idx]; 2199 rtmp22[idx] -= mul2 * tmp; 2200 } 2201 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2202 } 2203 2204 pj = bj + bi[row]; 2205 pc1 = ba + bi[row]; 2206 pc2 = ba + bi[row + 1]; 2207 2208 sctx.pv = rtmp22[row + 1]; 2209 rs = 0.0; 2210 rtmp11[row] = 1.0 / rtmp11[row]; 2211 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2212 /* copy row entries from dense representation to sparse */ 2213 for (j = 0; j < nz; j++) { 2214 idx = pj[j]; 2215 pc1[j] = rtmp11[idx]; 2216 pc2[j] = rtmp22[idx]; 2217 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2218 } 2219 sctx.rs = rs; 2220 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2221 if (sctx.newshift) goto endofwhile; 2222 break; 2223 2224 case 3: 2225 for (j = 0; j < nz; j++) { 2226 idx = bjtmp[j]; 2227 rtmp11[idx] = 0.0; 2228 rtmp22[idx] = 0.0; 2229 rtmp33[idx] = 0.0; 2230 } 2231 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2232 idx = r[row]; 2233 nz_tmp = ai[idx + 1] - ai[idx]; 2234 ajtmp = aj + ai[idx]; 2235 v1 = aa + ai[idx]; 2236 v2 = aa + ai[idx + 1]; 2237 v3 = aa + ai[idx + 2]; 2238 for (j = 0; j < nz_tmp; j++) { 2239 idx = ics[ajtmp[j]]; 2240 rtmp11[idx] = v1[j]; 2241 rtmp22[idx] = v2[j]; 2242 rtmp33[idx] = v3[j]; 2243 } 2244 rtmp11[ics[r[row]]] += sctx.shift_amount; 2245 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2246 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2247 2248 /* loop over all pivot row blocks above this row block */ 2249 prow = *bjtmp++; 2250 while (prow < row) { 2251 pc1 = rtmp11 + prow; 2252 pc2 = rtmp22 + prow; 2253 pc3 = rtmp33 + prow; 2254 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2255 pv = ba + bd[prow]; 2256 pj = nbj + bd[prow]; 2257 mul1 = *pc1 * *pv; 2258 mul2 = *pc2 * *pv; 2259 mul3 = *pc3 * *pv; 2260 ++pv; 2261 *pc1 = mul1; 2262 *pc2 = mul2; 2263 *pc3 = mul3; 2264 2265 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2266 /* update this row based on pivot row */ 2267 for (j = 0; j < nz_tmp; j++) { 2268 tmp = pv[j]; 2269 idx = pj[j]; 2270 rtmp11[idx] -= mul1 * tmp; 2271 rtmp22[idx] -= mul2 * tmp; 2272 rtmp33[idx] -= mul3 * tmp; 2273 } 2274 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2275 } 2276 prow = *bjtmp++; 2277 } 2278 2279 /* Now take care of diagonal 3x3 block in this set of rows */ 2280 /* note: prow = row here */ 2281 pc1 = rtmp11 + prow; 2282 pc2 = rtmp22 + prow; 2283 pc3 = rtmp33 + prow; 2284 2285 sctx.pv = *pc1; 2286 pj = bj + bi[prow]; 2287 rs = 0.0; 2288 for (j = 0; j < nz; j++) { 2289 idx = pj[j]; 2290 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2291 } 2292 sctx.rs = rs; 2293 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2294 if (sctx.newshift) goto endofwhile; 2295 2296 if (*pc2 != 0.0 || *pc3 != 0.0) { 2297 mul2 = (*pc2) / (*pc1); 2298 mul3 = (*pc3) / (*pc1); 2299 *pc2 = mul2; 2300 *pc3 = mul3; 2301 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2302 pj = nbj + bd[prow]; 2303 for (j = 0; j < nz_tmp; j++) { 2304 idx = pj[j]; 2305 tmp = rtmp11[idx]; 2306 rtmp22[idx] -= mul2 * tmp; 2307 rtmp33[idx] -= mul3 * tmp; 2308 } 2309 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2310 } 2311 ++prow; 2312 2313 pc2 = rtmp22 + prow; 2314 pc3 = rtmp33 + prow; 2315 sctx.pv = *pc2; 2316 pj = bj + bi[prow]; 2317 rs = 0.0; 2318 for (j = 0; j < nz; j++) { 2319 idx = pj[j]; 2320 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2321 } 2322 sctx.rs = rs; 2323 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2324 if (sctx.newshift) goto endofwhile; 2325 2326 if (*pc3 != 0.0) { 2327 mul3 = (*pc3) / (*pc2); 2328 *pc3 = mul3; 2329 pj = nbj + bd[prow]; 2330 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2331 for (j = 0; j < nz_tmp; j++) { 2332 idx = pj[j]; 2333 tmp = rtmp22[idx]; 2334 rtmp33[idx] -= mul3 * tmp; 2335 } 2336 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2337 } 2338 2339 pj = bj + bi[row]; 2340 pc1 = ba + bi[row]; 2341 pc2 = ba + bi[row + 1]; 2342 pc3 = ba + bi[row + 2]; 2343 2344 sctx.pv = rtmp33[row + 2]; 2345 rs = 0.0; 2346 rtmp11[row] = 1.0 / rtmp11[row]; 2347 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2348 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2349 /* copy row entries from dense representation to sparse */ 2350 for (j = 0; j < nz; j++) { 2351 idx = pj[j]; 2352 pc1[j] = rtmp11[idx]; 2353 pc2[j] = rtmp22[idx]; 2354 pc3[j] = rtmp33[idx]; 2355 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2356 } 2357 2358 sctx.rs = rs; 2359 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2360 if (sctx.newshift) goto endofwhile; 2361 break; 2362 2363 default: 2364 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2365 } 2366 row += nodesz; /* Update the row */ 2367 } 2368 endofwhile:; 2369 } while (sctx.newshift); 2370 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2371 PetscCall(PetscFree(tmp_vec2)); 2372 PetscCall(ISRestoreIndices(isicol, &ic)); 2373 PetscCall(ISRestoreIndices(isrow, &r)); 2374 PetscCall(ISRestoreIndices(iscol, &c)); 2375 2376 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2377 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2378 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2379 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2380 C->assembled = PETSC_TRUE; 2381 C->preallocated = PETSC_TRUE; 2382 if (sctx.nshift) { 2383 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2384 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2385 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2386 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2387 } 2388 } 2389 PetscCall(PetscLogFlops(C->cmap->n)); 2390 PetscCall(MatSeqAIJCheckInode(C)); 2391 PetscFunctionReturn(0); 2392 } 2393 2394 /* ----------------------------------------------------------- */ 2395 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2396 { 2397 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2398 IS iscol = a->col, isrow = a->row; 2399 const PetscInt *r, *c, *rout, *cout; 2400 PetscInt i, j, n = A->rmap->n; 2401 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2402 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2403 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2404 PetscScalar sum1, sum2, sum3, sum4, sum5; 2405 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2406 const PetscScalar *b; 2407 2408 PetscFunctionBegin; 2409 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2410 node_max = a->inode.node_count; 2411 ns = a->inode.size; /* Node Size array */ 2412 2413 PetscCall(VecGetArrayRead(bb, &b)); 2414 PetscCall(VecGetArrayWrite(xx, &x)); 2415 tmp = a->solve_work; 2416 2417 PetscCall(ISGetIndices(isrow, &rout)); 2418 r = rout; 2419 PetscCall(ISGetIndices(iscol, &cout)); 2420 c = cout; 2421 2422 /* forward solve the lower triangular */ 2423 tmps = tmp; 2424 aa = a_a; 2425 aj = a_j; 2426 ad = a->diag; 2427 2428 for (i = 0, row = 0; i < node_max; ++i) { 2429 nsz = ns[i]; 2430 aii = ai[row]; 2431 v1 = aa + aii; 2432 vi = aj + aii; 2433 nz = ai[row + 1] - ai[row]; 2434 2435 if (i < node_max - 1) { 2436 /* Prefetch the indices for the next block */ 2437 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2438 /* Prefetch the data for the next block */ 2439 PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2440 } 2441 2442 switch (nsz) { /* Each loop in 'case' is unrolled */ 2443 case 1: 2444 sum1 = b[r[row]]; 2445 for (j = 0; j < nz - 1; j += 2) { 2446 i0 = vi[j]; 2447 i1 = vi[j + 1]; 2448 tmp0 = tmps[i0]; 2449 tmp1 = tmps[i1]; 2450 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2451 } 2452 if (j == nz - 1) { 2453 tmp0 = tmps[vi[j]]; 2454 sum1 -= v1[j] * tmp0; 2455 } 2456 tmp[row++] = sum1; 2457 break; 2458 case 2: 2459 sum1 = b[r[row]]; 2460 sum2 = b[r[row + 1]]; 2461 v2 = aa + ai[row + 1]; 2462 2463 for (j = 0; j < nz - 1; j += 2) { 2464 i0 = vi[j]; 2465 i1 = vi[j + 1]; 2466 tmp0 = tmps[i0]; 2467 tmp1 = tmps[i1]; 2468 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2469 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2470 } 2471 if (j == nz - 1) { 2472 tmp0 = tmps[vi[j]]; 2473 sum1 -= v1[j] * tmp0; 2474 sum2 -= v2[j] * tmp0; 2475 } 2476 sum2 -= v2[nz] * sum1; 2477 tmp[row++] = sum1; 2478 tmp[row++] = sum2; 2479 break; 2480 case 3: 2481 sum1 = b[r[row]]; 2482 sum2 = b[r[row + 1]]; 2483 sum3 = b[r[row + 2]]; 2484 v2 = aa + ai[row + 1]; 2485 v3 = aa + ai[row + 2]; 2486 2487 for (j = 0; j < nz - 1; j += 2) { 2488 i0 = vi[j]; 2489 i1 = vi[j + 1]; 2490 tmp0 = tmps[i0]; 2491 tmp1 = tmps[i1]; 2492 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2493 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2494 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2495 } 2496 if (j == nz - 1) { 2497 tmp0 = tmps[vi[j]]; 2498 sum1 -= v1[j] * tmp0; 2499 sum2 -= v2[j] * tmp0; 2500 sum3 -= v3[j] * tmp0; 2501 } 2502 sum2 -= v2[nz] * sum1; 2503 sum3 -= v3[nz] * sum1; 2504 sum3 -= v3[nz + 1] * sum2; 2505 tmp[row++] = sum1; 2506 tmp[row++] = sum2; 2507 tmp[row++] = sum3; 2508 break; 2509 2510 case 4: 2511 sum1 = b[r[row]]; 2512 sum2 = b[r[row + 1]]; 2513 sum3 = b[r[row + 2]]; 2514 sum4 = b[r[row + 3]]; 2515 v2 = aa + ai[row + 1]; 2516 v3 = aa + ai[row + 2]; 2517 v4 = aa + ai[row + 3]; 2518 2519 for (j = 0; j < nz - 1; j += 2) { 2520 i0 = vi[j]; 2521 i1 = vi[j + 1]; 2522 tmp0 = tmps[i0]; 2523 tmp1 = tmps[i1]; 2524 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2525 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2526 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2527 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2528 } 2529 if (j == nz - 1) { 2530 tmp0 = tmps[vi[j]]; 2531 sum1 -= v1[j] * tmp0; 2532 sum2 -= v2[j] * tmp0; 2533 sum3 -= v3[j] * tmp0; 2534 sum4 -= v4[j] * tmp0; 2535 } 2536 sum2 -= v2[nz] * sum1; 2537 sum3 -= v3[nz] * sum1; 2538 sum4 -= v4[nz] * sum1; 2539 sum3 -= v3[nz + 1] * sum2; 2540 sum4 -= v4[nz + 1] * sum2; 2541 sum4 -= v4[nz + 2] * sum3; 2542 2543 tmp[row++] = sum1; 2544 tmp[row++] = sum2; 2545 tmp[row++] = sum3; 2546 tmp[row++] = sum4; 2547 break; 2548 case 5: 2549 sum1 = b[r[row]]; 2550 sum2 = b[r[row + 1]]; 2551 sum3 = b[r[row + 2]]; 2552 sum4 = b[r[row + 3]]; 2553 sum5 = b[r[row + 4]]; 2554 v2 = aa + ai[row + 1]; 2555 v3 = aa + ai[row + 2]; 2556 v4 = aa + ai[row + 3]; 2557 v5 = aa + ai[row + 4]; 2558 2559 for (j = 0; j < nz - 1; j += 2) { 2560 i0 = vi[j]; 2561 i1 = vi[j + 1]; 2562 tmp0 = tmps[i0]; 2563 tmp1 = tmps[i1]; 2564 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2565 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2566 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2567 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2568 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2569 } 2570 if (j == nz - 1) { 2571 tmp0 = tmps[vi[j]]; 2572 sum1 -= v1[j] * tmp0; 2573 sum2 -= v2[j] * tmp0; 2574 sum3 -= v3[j] * tmp0; 2575 sum4 -= v4[j] * tmp0; 2576 sum5 -= v5[j] * tmp0; 2577 } 2578 2579 sum2 -= v2[nz] * sum1; 2580 sum3 -= v3[nz] * sum1; 2581 sum4 -= v4[nz] * sum1; 2582 sum5 -= v5[nz] * sum1; 2583 sum3 -= v3[nz + 1] * sum2; 2584 sum4 -= v4[nz + 1] * sum2; 2585 sum5 -= v5[nz + 1] * sum2; 2586 sum4 -= v4[nz + 2] * sum3; 2587 sum5 -= v5[nz + 2] * sum3; 2588 sum5 -= v5[nz + 3] * sum4; 2589 2590 tmp[row++] = sum1; 2591 tmp[row++] = sum2; 2592 tmp[row++] = sum3; 2593 tmp[row++] = sum4; 2594 tmp[row++] = sum5; 2595 break; 2596 default: 2597 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2598 } 2599 } 2600 /* backward solve the upper triangular */ 2601 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 2602 nsz = ns[i]; 2603 aii = ad[row + 1] + 1; 2604 v1 = aa + aii; 2605 vi = aj + aii; 2606 nz = ad[row] - ad[row + 1] - 1; 2607 2608 if (i > 0) { 2609 /* Prefetch the indices for the next block */ 2610 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2611 /* Prefetch the data for the next block */ 2612 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2613 } 2614 2615 switch (nsz) { /* Each loop in 'case' is unrolled */ 2616 case 1: 2617 sum1 = tmp[row]; 2618 2619 for (j = 0; j < nz - 1; j += 2) { 2620 i0 = vi[j]; 2621 i1 = vi[j + 1]; 2622 tmp0 = tmps[i0]; 2623 tmp1 = tmps[i1]; 2624 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2625 } 2626 if (j == nz - 1) { 2627 tmp0 = tmps[vi[j]]; 2628 sum1 -= v1[j] * tmp0; 2629 } 2630 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2631 row--; 2632 break; 2633 case 2: 2634 sum1 = tmp[row]; 2635 sum2 = tmp[row - 1]; 2636 v2 = aa + ad[row] + 1; 2637 for (j = 0; j < nz - 1; j += 2) { 2638 i0 = vi[j]; 2639 i1 = vi[j + 1]; 2640 tmp0 = tmps[i0]; 2641 tmp1 = tmps[i1]; 2642 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2643 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2644 } 2645 if (j == nz - 1) { 2646 tmp0 = tmps[vi[j]]; 2647 sum1 -= v1[j] * tmp0; 2648 sum2 -= v2[j + 1] * tmp0; 2649 } 2650 2651 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2652 row--; 2653 sum2 -= v2[0] * tmp0; 2654 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2655 row--; 2656 break; 2657 case 3: 2658 sum1 = tmp[row]; 2659 sum2 = tmp[row - 1]; 2660 sum3 = tmp[row - 2]; 2661 v2 = aa + ad[row] + 1; 2662 v3 = aa + ad[row - 1] + 1; 2663 for (j = 0; j < nz - 1; j += 2) { 2664 i0 = vi[j]; 2665 i1 = vi[j + 1]; 2666 tmp0 = tmps[i0]; 2667 tmp1 = tmps[i1]; 2668 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2669 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2670 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2671 } 2672 if (j == nz - 1) { 2673 tmp0 = tmps[vi[j]]; 2674 sum1 -= v1[j] * tmp0; 2675 sum2 -= v2[j + 1] * tmp0; 2676 sum3 -= v3[j + 2] * tmp0; 2677 } 2678 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2679 row--; 2680 sum2 -= v2[0] * tmp0; 2681 sum3 -= v3[1] * tmp0; 2682 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2683 row--; 2684 sum3 -= v3[0] * tmp0; 2685 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2686 row--; 2687 2688 break; 2689 case 4: 2690 sum1 = tmp[row]; 2691 sum2 = tmp[row - 1]; 2692 sum3 = tmp[row - 2]; 2693 sum4 = tmp[row - 3]; 2694 v2 = aa + ad[row] + 1; 2695 v3 = aa + ad[row - 1] + 1; 2696 v4 = aa + ad[row - 2] + 1; 2697 2698 for (j = 0; j < nz - 1; j += 2) { 2699 i0 = vi[j]; 2700 i1 = vi[j + 1]; 2701 tmp0 = tmps[i0]; 2702 tmp1 = tmps[i1]; 2703 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2704 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2705 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2706 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2707 } 2708 if (j == nz - 1) { 2709 tmp0 = tmps[vi[j]]; 2710 sum1 -= v1[j] * tmp0; 2711 sum2 -= v2[j + 1] * tmp0; 2712 sum3 -= v3[j + 2] * tmp0; 2713 sum4 -= v4[j + 3] * tmp0; 2714 } 2715 2716 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2717 row--; 2718 sum2 -= v2[0] * tmp0; 2719 sum3 -= v3[1] * tmp0; 2720 sum4 -= v4[2] * tmp0; 2721 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2722 row--; 2723 sum3 -= v3[0] * tmp0; 2724 sum4 -= v4[1] * tmp0; 2725 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2726 row--; 2727 sum4 -= v4[0] * tmp0; 2728 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2729 row--; 2730 break; 2731 case 5: 2732 sum1 = tmp[row]; 2733 sum2 = tmp[row - 1]; 2734 sum3 = tmp[row - 2]; 2735 sum4 = tmp[row - 3]; 2736 sum5 = tmp[row - 4]; 2737 v2 = aa + ad[row] + 1; 2738 v3 = aa + ad[row - 1] + 1; 2739 v4 = aa + ad[row - 2] + 1; 2740 v5 = aa + ad[row - 3] + 1; 2741 for (j = 0; j < nz - 1; j += 2) { 2742 i0 = vi[j]; 2743 i1 = vi[j + 1]; 2744 tmp0 = tmps[i0]; 2745 tmp1 = tmps[i1]; 2746 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2747 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2748 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2749 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2750 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2751 } 2752 if (j == nz - 1) { 2753 tmp0 = tmps[vi[j]]; 2754 sum1 -= v1[j] * tmp0; 2755 sum2 -= v2[j + 1] * tmp0; 2756 sum3 -= v3[j + 2] * tmp0; 2757 sum4 -= v4[j + 3] * tmp0; 2758 sum5 -= v5[j + 4] * tmp0; 2759 } 2760 2761 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2762 row--; 2763 sum2 -= v2[0] * tmp0; 2764 sum3 -= v3[1] * tmp0; 2765 sum4 -= v4[2] * tmp0; 2766 sum5 -= v5[3] * tmp0; 2767 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2768 row--; 2769 sum3 -= v3[0] * tmp0; 2770 sum4 -= v4[1] * tmp0; 2771 sum5 -= v5[2] * tmp0; 2772 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2773 row--; 2774 sum4 -= v4[0] * tmp0; 2775 sum5 -= v5[1] * tmp0; 2776 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2777 row--; 2778 sum5 -= v5[0] * tmp0; 2779 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2780 row--; 2781 break; 2782 default: 2783 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2784 } 2785 } 2786 PetscCall(ISRestoreIndices(isrow, &rout)); 2787 PetscCall(ISRestoreIndices(iscol, &cout)); 2788 PetscCall(VecRestoreArrayRead(bb, &b)); 2789 PetscCall(VecRestoreArrayWrite(xx, &x)); 2790 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2791 PetscFunctionReturn(0); 2792 } 2793 2794 /* 2795 Makes a longer coloring[] array and calls the usual code with that 2796 */ 2797 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2798 { 2799 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2800 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row; 2801 PetscInt *colorused, i; 2802 ISColoringValue *newcolor; 2803 2804 PetscFunctionBegin; 2805 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2806 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2807 /* loop over inodes, marking a color for each column*/ 2808 row = 0; 2809 for (i = 0; i < m; i++) { 2810 for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors; 2811 } 2812 2813 /* eliminate unneeded colors */ 2814 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2815 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2816 2817 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2818 ncolors = colorused[5 * ncolors - 1]; 2819 for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1; 2820 PetscCall(PetscFree(colorused)); 2821 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2822 PetscCall(PetscFree(coloring)); 2823 PetscFunctionReturn(0); 2824 } 2825 2826 #include <petsc/private/kernels/blockinvert.h> 2827 2828 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2829 { 2830 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2831 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2832 MatScalar *ibdiag, *bdiag, work[25], *t; 2833 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2834 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2835 const PetscScalar *xb, *b; 2836 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2837 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2; 2838 PetscInt sz, k, ipvt[5]; 2839 PetscBool allowzeropivot, zeropivotdetected; 2840 const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i; 2841 2842 PetscFunctionBegin; 2843 allowzeropivot = PetscNot(A->erroriffailure); 2844 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2845 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2846 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2847 2848 if (!a->inode.ibdiagvalid) { 2849 if (!a->inode.ibdiag) { 2850 /* calculate space needed for diagonal blocks */ 2851 for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i]; 2852 a->inode.bdiagsize = cnt; 2853 2854 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2855 } 2856 2857 /* copy over the diagonal blocks and invert them */ 2858 ibdiag = a->inode.ibdiag; 2859 bdiag = a->inode.bdiag; 2860 cnt = 0; 2861 for (i = 0, row = 0; i < m; i++) { 2862 for (j = 0; j < sizes[i]; j++) { 2863 for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k]; 2864 } 2865 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i])); 2866 2867 switch (sizes[i]) { 2868 case 1: 2869 /* Create matrix data structure */ 2870 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2871 if (allowzeropivot) { 2872 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2873 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2874 A->factorerror_zeropivot_row = row; 2875 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2876 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2877 } 2878 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2879 break; 2880 case 2: 2881 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2882 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2883 break; 2884 case 3: 2885 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2886 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2887 break; 2888 case 4: 2889 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2890 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2891 break; 2892 case 5: 2893 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2894 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2895 break; 2896 default: 2897 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 2898 } 2899 cnt += sizes[i] * sizes[i]; 2900 row += sizes[i]; 2901 } 2902 a->inode.ibdiagvalid = PETSC_TRUE; 2903 } 2904 ibdiag = a->inode.ibdiag; 2905 bdiag = a->inode.bdiag; 2906 t = a->inode.ssor_work; 2907 2908 PetscCall(VecGetArray(xx, &x)); 2909 PetscCall(VecGetArrayRead(bb, &b)); 2910 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2911 if (flag & SOR_ZERO_INITIAL_GUESS) { 2912 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2913 for (i = 0, row = 0; i < m; i++) { 2914 sz = diag[row] - ii[row]; 2915 v1 = a->a + ii[row]; 2916 idx = a->j + ii[row]; 2917 2918 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2919 switch (sizes[i]) { 2920 case 1: 2921 2922 sum1 = b[row]; 2923 for (n = 0; n < sz - 1; n += 2) { 2924 i1 = idx[0]; 2925 i2 = idx[1]; 2926 idx += 2; 2927 tmp0 = x[i1]; 2928 tmp1 = x[i2]; 2929 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2930 v1 += 2; 2931 } 2932 2933 if (n == sz - 1) { 2934 tmp0 = x[*idx]; 2935 sum1 -= *v1 * tmp0; 2936 } 2937 t[row] = sum1; 2938 x[row++] = sum1 * (*ibdiag++); 2939 break; 2940 case 2: 2941 v2 = a->a + ii[row + 1]; 2942 sum1 = b[row]; 2943 sum2 = b[row + 1]; 2944 for (n = 0; n < sz - 1; n += 2) { 2945 i1 = idx[0]; 2946 i2 = idx[1]; 2947 idx += 2; 2948 tmp0 = x[i1]; 2949 tmp1 = x[i2]; 2950 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2951 v1 += 2; 2952 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2953 v2 += 2; 2954 } 2955 2956 if (n == sz - 1) { 2957 tmp0 = x[*idx]; 2958 sum1 -= v1[0] * tmp0; 2959 sum2 -= v2[0] * tmp0; 2960 } 2961 t[row] = sum1; 2962 t[row + 1] = sum2; 2963 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2964 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2965 ibdiag += 4; 2966 break; 2967 case 3: 2968 v2 = a->a + ii[row + 1]; 2969 v3 = a->a + ii[row + 2]; 2970 sum1 = b[row]; 2971 sum2 = b[row + 1]; 2972 sum3 = b[row + 2]; 2973 for (n = 0; n < sz - 1; n += 2) { 2974 i1 = idx[0]; 2975 i2 = idx[1]; 2976 idx += 2; 2977 tmp0 = x[i1]; 2978 tmp1 = x[i2]; 2979 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2980 v1 += 2; 2981 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2982 v2 += 2; 2983 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2984 v3 += 2; 2985 } 2986 2987 if (n == sz - 1) { 2988 tmp0 = x[*idx]; 2989 sum1 -= v1[0] * tmp0; 2990 sum2 -= v2[0] * tmp0; 2991 sum3 -= v3[0] * tmp0; 2992 } 2993 t[row] = sum1; 2994 t[row + 1] = sum2; 2995 t[row + 2] = sum3; 2996 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 2997 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 2998 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 2999 ibdiag += 9; 3000 break; 3001 case 4: 3002 v2 = a->a + ii[row + 1]; 3003 v3 = a->a + ii[row + 2]; 3004 v4 = a->a + ii[row + 3]; 3005 sum1 = b[row]; 3006 sum2 = b[row + 1]; 3007 sum3 = b[row + 2]; 3008 sum4 = b[row + 3]; 3009 for (n = 0; n < sz - 1; n += 2) { 3010 i1 = idx[0]; 3011 i2 = idx[1]; 3012 idx += 2; 3013 tmp0 = x[i1]; 3014 tmp1 = x[i2]; 3015 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3016 v1 += 2; 3017 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3018 v2 += 2; 3019 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3020 v3 += 2; 3021 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3022 v4 += 2; 3023 } 3024 3025 if (n == sz - 1) { 3026 tmp0 = x[*idx]; 3027 sum1 -= v1[0] * tmp0; 3028 sum2 -= v2[0] * tmp0; 3029 sum3 -= v3[0] * tmp0; 3030 sum4 -= v4[0] * tmp0; 3031 } 3032 t[row] = sum1; 3033 t[row + 1] = sum2; 3034 t[row + 2] = sum3; 3035 t[row + 3] = sum4; 3036 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3037 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3038 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3039 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3040 ibdiag += 16; 3041 break; 3042 case 5: 3043 v2 = a->a + ii[row + 1]; 3044 v3 = a->a + ii[row + 2]; 3045 v4 = a->a + ii[row + 3]; 3046 v5 = a->a + ii[row + 4]; 3047 sum1 = b[row]; 3048 sum2 = b[row + 1]; 3049 sum3 = b[row + 2]; 3050 sum4 = b[row + 3]; 3051 sum5 = b[row + 4]; 3052 for (n = 0; n < sz - 1; n += 2) { 3053 i1 = idx[0]; 3054 i2 = idx[1]; 3055 idx += 2; 3056 tmp0 = x[i1]; 3057 tmp1 = x[i2]; 3058 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3059 v1 += 2; 3060 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3061 v2 += 2; 3062 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3063 v3 += 2; 3064 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3065 v4 += 2; 3066 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3067 v5 += 2; 3068 } 3069 3070 if (n == sz - 1) { 3071 tmp0 = x[*idx]; 3072 sum1 -= v1[0] * tmp0; 3073 sum2 -= v2[0] * tmp0; 3074 sum3 -= v3[0] * tmp0; 3075 sum4 -= v4[0] * tmp0; 3076 sum5 -= v5[0] * tmp0; 3077 } 3078 t[row] = sum1; 3079 t[row + 1] = sum2; 3080 t[row + 2] = sum3; 3081 t[row + 3] = sum4; 3082 t[row + 4] = sum5; 3083 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3084 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3085 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3086 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3087 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3088 ibdiag += 25; 3089 break; 3090 default: 3091 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3092 } 3093 } 3094 3095 xb = t; 3096 PetscCall(PetscLogFlops(a->nz)); 3097 } else xb = b; 3098 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3099 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3100 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3101 ibdiag -= sizes[i] * sizes[i]; 3102 sz = ii[row + 1] - diag[row] - 1; 3103 v1 = a->a + diag[row] + 1; 3104 idx = a->j + diag[row] + 1; 3105 3106 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3107 switch (sizes[i]) { 3108 case 1: 3109 3110 sum1 = xb[row]; 3111 for (n = 0; n < sz - 1; n += 2) { 3112 i1 = idx[0]; 3113 i2 = idx[1]; 3114 idx += 2; 3115 tmp0 = x[i1]; 3116 tmp1 = x[i2]; 3117 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3118 v1 += 2; 3119 } 3120 3121 if (n == sz - 1) { 3122 tmp0 = x[*idx]; 3123 sum1 -= *v1 * tmp0; 3124 } 3125 x[row--] = sum1 * (*ibdiag); 3126 break; 3127 3128 case 2: 3129 3130 sum1 = xb[row]; 3131 sum2 = xb[row - 1]; 3132 /* note that sum1 is associated with the second of the two rows */ 3133 v2 = a->a + diag[row - 1] + 2; 3134 for (n = 0; n < sz - 1; n += 2) { 3135 i1 = idx[0]; 3136 i2 = idx[1]; 3137 idx += 2; 3138 tmp0 = x[i1]; 3139 tmp1 = x[i2]; 3140 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3141 v1 += 2; 3142 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3143 v2 += 2; 3144 } 3145 3146 if (n == sz - 1) { 3147 tmp0 = x[*idx]; 3148 sum1 -= *v1 * tmp0; 3149 sum2 -= *v2 * tmp0; 3150 } 3151 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3152 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3153 break; 3154 case 3: 3155 3156 sum1 = xb[row]; 3157 sum2 = xb[row - 1]; 3158 sum3 = xb[row - 2]; 3159 v2 = a->a + diag[row - 1] + 2; 3160 v3 = a->a + diag[row - 2] + 3; 3161 for (n = 0; n < sz - 1; n += 2) { 3162 i1 = idx[0]; 3163 i2 = idx[1]; 3164 idx += 2; 3165 tmp0 = x[i1]; 3166 tmp1 = x[i2]; 3167 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3168 v1 += 2; 3169 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3170 v2 += 2; 3171 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3172 v3 += 2; 3173 } 3174 3175 if (n == sz - 1) { 3176 tmp0 = x[*idx]; 3177 sum1 -= *v1 * tmp0; 3178 sum2 -= *v2 * tmp0; 3179 sum3 -= *v3 * tmp0; 3180 } 3181 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3182 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3183 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3184 break; 3185 case 4: 3186 3187 sum1 = xb[row]; 3188 sum2 = xb[row - 1]; 3189 sum3 = xb[row - 2]; 3190 sum4 = xb[row - 3]; 3191 v2 = a->a + diag[row - 1] + 2; 3192 v3 = a->a + diag[row - 2] + 3; 3193 v4 = a->a + diag[row - 3] + 4; 3194 for (n = 0; n < sz - 1; n += 2) { 3195 i1 = idx[0]; 3196 i2 = idx[1]; 3197 idx += 2; 3198 tmp0 = x[i1]; 3199 tmp1 = x[i2]; 3200 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3201 v1 += 2; 3202 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3203 v2 += 2; 3204 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3205 v3 += 2; 3206 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3207 v4 += 2; 3208 } 3209 3210 if (n == sz - 1) { 3211 tmp0 = x[*idx]; 3212 sum1 -= *v1 * tmp0; 3213 sum2 -= *v2 * tmp0; 3214 sum3 -= *v3 * tmp0; 3215 sum4 -= *v4 * tmp0; 3216 } 3217 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3218 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3219 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3220 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3221 break; 3222 case 5: 3223 3224 sum1 = xb[row]; 3225 sum2 = xb[row - 1]; 3226 sum3 = xb[row - 2]; 3227 sum4 = xb[row - 3]; 3228 sum5 = xb[row - 4]; 3229 v2 = a->a + diag[row - 1] + 2; 3230 v3 = a->a + diag[row - 2] + 3; 3231 v4 = a->a + diag[row - 3] + 4; 3232 v5 = a->a + diag[row - 4] + 5; 3233 for (n = 0; n < sz - 1; n += 2) { 3234 i1 = idx[0]; 3235 i2 = idx[1]; 3236 idx += 2; 3237 tmp0 = x[i1]; 3238 tmp1 = x[i2]; 3239 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3240 v1 += 2; 3241 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3242 v2 += 2; 3243 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3244 v3 += 2; 3245 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3246 v4 += 2; 3247 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3248 v5 += 2; 3249 } 3250 3251 if (n == sz - 1) { 3252 tmp0 = x[*idx]; 3253 sum1 -= *v1 * tmp0; 3254 sum2 -= *v2 * tmp0; 3255 sum3 -= *v3 * tmp0; 3256 sum4 -= *v4 * tmp0; 3257 sum5 -= *v5 * tmp0; 3258 } 3259 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3260 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3261 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3262 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3263 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3264 break; 3265 default: 3266 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3267 } 3268 } 3269 3270 PetscCall(PetscLogFlops(a->nz)); 3271 } 3272 its--; 3273 } 3274 while (its--) { 3275 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3276 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) { 3277 sz = diag[row] - ii[row]; 3278 v1 = a->a + ii[row]; 3279 idx = a->j + ii[row]; 3280 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3281 switch (sizes[i]) { 3282 case 1: 3283 sum1 = b[row]; 3284 for (n = 0; n < sz - 1; n += 2) { 3285 i1 = idx[0]; 3286 i2 = idx[1]; 3287 idx += 2; 3288 tmp0 = x[i1]; 3289 tmp1 = x[i2]; 3290 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3291 v1 += 2; 3292 } 3293 if (n == sz - 1) { 3294 tmp0 = x[*idx++]; 3295 sum1 -= *v1 * tmp0; 3296 v1++; 3297 } 3298 t[row] = sum1; 3299 sz = ii[row + 1] - diag[row] - 1; 3300 idx = a->j + diag[row] + 1; 3301 v1 += 1; 3302 for (n = 0; n < sz - 1; n += 2) { 3303 i1 = idx[0]; 3304 i2 = idx[1]; 3305 idx += 2; 3306 tmp0 = x[i1]; 3307 tmp1 = x[i2]; 3308 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3309 v1 += 2; 3310 } 3311 if (n == sz - 1) { 3312 tmp0 = x[*idx++]; 3313 sum1 -= *v1 * tmp0; 3314 } 3315 /* in MatSOR_SeqAIJ this line would be 3316 * 3317 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3318 * 3319 * but omega == 1, so this becomes 3320 * 3321 * x[row] = sum1*(*ibdiag++); 3322 * 3323 */ 3324 x[row] = sum1 * (*ibdiag); 3325 break; 3326 case 2: 3327 v2 = a->a + ii[row + 1]; 3328 sum1 = b[row]; 3329 sum2 = b[row + 1]; 3330 for (n = 0; n < sz - 1; n += 2) { 3331 i1 = idx[0]; 3332 i2 = idx[1]; 3333 idx += 2; 3334 tmp0 = x[i1]; 3335 tmp1 = x[i2]; 3336 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3337 v1 += 2; 3338 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3339 v2 += 2; 3340 } 3341 if (n == sz - 1) { 3342 tmp0 = x[*idx++]; 3343 sum1 -= v1[0] * tmp0; 3344 sum2 -= v2[0] * tmp0; 3345 v1++; 3346 v2++; 3347 } 3348 t[row] = sum1; 3349 t[row + 1] = sum2; 3350 sz = ii[row + 1] - diag[row] - 2; 3351 idx = a->j + diag[row] + 2; 3352 v1 += 2; 3353 v2 += 2; 3354 for (n = 0; n < sz - 1; n += 2) { 3355 i1 = idx[0]; 3356 i2 = idx[1]; 3357 idx += 2; 3358 tmp0 = x[i1]; 3359 tmp1 = x[i2]; 3360 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3361 v1 += 2; 3362 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3363 v2 += 2; 3364 } 3365 if (n == sz - 1) { 3366 tmp0 = x[*idx]; 3367 sum1 -= v1[0] * tmp0; 3368 sum2 -= v2[0] * tmp0; 3369 } 3370 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3371 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3372 break; 3373 case 3: 3374 v2 = a->a + ii[row + 1]; 3375 v3 = a->a + ii[row + 2]; 3376 sum1 = b[row]; 3377 sum2 = b[row + 1]; 3378 sum3 = b[row + 2]; 3379 for (n = 0; n < sz - 1; n += 2) { 3380 i1 = idx[0]; 3381 i2 = idx[1]; 3382 idx += 2; 3383 tmp0 = x[i1]; 3384 tmp1 = x[i2]; 3385 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3386 v1 += 2; 3387 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3388 v2 += 2; 3389 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3390 v3 += 2; 3391 } 3392 if (n == sz - 1) { 3393 tmp0 = x[*idx++]; 3394 sum1 -= v1[0] * tmp0; 3395 sum2 -= v2[0] * tmp0; 3396 sum3 -= v3[0] * tmp0; 3397 v1++; 3398 v2++; 3399 v3++; 3400 } 3401 t[row] = sum1; 3402 t[row + 1] = sum2; 3403 t[row + 2] = sum3; 3404 sz = ii[row + 1] - diag[row] - 3; 3405 idx = a->j + diag[row] + 3; 3406 v1 += 3; 3407 v2 += 3; 3408 v3 += 3; 3409 for (n = 0; n < sz - 1; n += 2) { 3410 i1 = idx[0]; 3411 i2 = idx[1]; 3412 idx += 2; 3413 tmp0 = x[i1]; 3414 tmp1 = x[i2]; 3415 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3416 v1 += 2; 3417 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3418 v2 += 2; 3419 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3420 v3 += 2; 3421 } 3422 if (n == sz - 1) { 3423 tmp0 = x[*idx]; 3424 sum1 -= v1[0] * tmp0; 3425 sum2 -= v2[0] * tmp0; 3426 sum3 -= v3[0] * tmp0; 3427 } 3428 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3429 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3430 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3431 break; 3432 case 4: 3433 v2 = a->a + ii[row + 1]; 3434 v3 = a->a + ii[row + 2]; 3435 v4 = a->a + ii[row + 3]; 3436 sum1 = b[row]; 3437 sum2 = b[row + 1]; 3438 sum3 = b[row + 2]; 3439 sum4 = b[row + 3]; 3440 for (n = 0; n < sz - 1; n += 2) { 3441 i1 = idx[0]; 3442 i2 = idx[1]; 3443 idx += 2; 3444 tmp0 = x[i1]; 3445 tmp1 = x[i2]; 3446 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3447 v1 += 2; 3448 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3449 v2 += 2; 3450 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3451 v3 += 2; 3452 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3453 v4 += 2; 3454 } 3455 if (n == sz - 1) { 3456 tmp0 = x[*idx++]; 3457 sum1 -= v1[0] * tmp0; 3458 sum2 -= v2[0] * tmp0; 3459 sum3 -= v3[0] * tmp0; 3460 sum4 -= v4[0] * tmp0; 3461 v1++; 3462 v2++; 3463 v3++; 3464 v4++; 3465 } 3466 t[row] = sum1; 3467 t[row + 1] = sum2; 3468 t[row + 2] = sum3; 3469 t[row + 3] = sum4; 3470 sz = ii[row + 1] - diag[row] - 4; 3471 idx = a->j + diag[row] + 4; 3472 v1 += 4; 3473 v2 += 4; 3474 v3 += 4; 3475 v4 += 4; 3476 for (n = 0; n < sz - 1; n += 2) { 3477 i1 = idx[0]; 3478 i2 = idx[1]; 3479 idx += 2; 3480 tmp0 = x[i1]; 3481 tmp1 = x[i2]; 3482 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3483 v1 += 2; 3484 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3485 v2 += 2; 3486 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3487 v3 += 2; 3488 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3489 v4 += 2; 3490 } 3491 if (n == sz - 1) { 3492 tmp0 = x[*idx]; 3493 sum1 -= v1[0] * tmp0; 3494 sum2 -= v2[0] * tmp0; 3495 sum3 -= v3[0] * tmp0; 3496 sum4 -= v4[0] * tmp0; 3497 } 3498 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3499 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3500 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3501 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3502 break; 3503 case 5: 3504 v2 = a->a + ii[row + 1]; 3505 v3 = a->a + ii[row + 2]; 3506 v4 = a->a + ii[row + 3]; 3507 v5 = a->a + ii[row + 4]; 3508 sum1 = b[row]; 3509 sum2 = b[row + 1]; 3510 sum3 = b[row + 2]; 3511 sum4 = b[row + 3]; 3512 sum5 = b[row + 4]; 3513 for (n = 0; n < sz - 1; n += 2) { 3514 i1 = idx[0]; 3515 i2 = idx[1]; 3516 idx += 2; 3517 tmp0 = x[i1]; 3518 tmp1 = x[i2]; 3519 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3520 v1 += 2; 3521 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3522 v2 += 2; 3523 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3524 v3 += 2; 3525 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3526 v4 += 2; 3527 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3528 v5 += 2; 3529 } 3530 if (n == sz - 1) { 3531 tmp0 = x[*idx++]; 3532 sum1 -= v1[0] * tmp0; 3533 sum2 -= v2[0] * tmp0; 3534 sum3 -= v3[0] * tmp0; 3535 sum4 -= v4[0] * tmp0; 3536 sum5 -= v5[0] * tmp0; 3537 v1++; 3538 v2++; 3539 v3++; 3540 v4++; 3541 v5++; 3542 } 3543 t[row] = sum1; 3544 t[row + 1] = sum2; 3545 t[row + 2] = sum3; 3546 t[row + 3] = sum4; 3547 t[row + 4] = sum5; 3548 sz = ii[row + 1] - diag[row] - 5; 3549 idx = a->j + diag[row] + 5; 3550 v1 += 5; 3551 v2 += 5; 3552 v3 += 5; 3553 v4 += 5; 3554 v5 += 5; 3555 for (n = 0; n < sz - 1; n += 2) { 3556 i1 = idx[0]; 3557 i2 = idx[1]; 3558 idx += 2; 3559 tmp0 = x[i1]; 3560 tmp1 = x[i2]; 3561 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3562 v1 += 2; 3563 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3564 v2 += 2; 3565 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3566 v3 += 2; 3567 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3568 v4 += 2; 3569 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3570 v5 += 2; 3571 } 3572 if (n == sz - 1) { 3573 tmp0 = x[*idx]; 3574 sum1 -= v1[0] * tmp0; 3575 sum2 -= v2[0] * tmp0; 3576 sum3 -= v3[0] * tmp0; 3577 sum4 -= v4[0] * tmp0; 3578 sum5 -= v5[0] * tmp0; 3579 } 3580 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3581 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3582 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3583 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3584 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3585 break; 3586 default: 3587 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3588 } 3589 } 3590 xb = t; 3591 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3592 } else xb = b; 3593 3594 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3595 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3596 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3597 ibdiag -= sizes[i] * sizes[i]; 3598 3599 /* set RHS */ 3600 if (xb == b) { 3601 /* whole (old way) */ 3602 sz = ii[row + 1] - ii[row]; 3603 idx = a->j + ii[row]; 3604 switch (sizes[i]) { 3605 case 5: 3606 v5 = a->a + ii[row - 4]; /* fall through */ 3607 case 4: 3608 v4 = a->a + ii[row - 3]; /* fall through */ 3609 case 3: 3610 v3 = a->a + ii[row - 2]; /* fall through */ 3611 case 2: 3612 v2 = a->a + ii[row - 1]; /* fall through */ 3613 case 1: 3614 v1 = a->a + ii[row]; 3615 break; 3616 default: 3617 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3618 } 3619 } else { 3620 /* upper, no diag */ 3621 sz = ii[row + 1] - diag[row] - 1; 3622 idx = a->j + diag[row] + 1; 3623 switch (sizes[i]) { 3624 case 5: 3625 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3626 case 4: 3627 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3628 case 3: 3629 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3630 case 2: 3631 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3632 case 1: 3633 v1 = a->a + diag[row] + 1; 3634 } 3635 } 3636 /* set sum */ 3637 switch (sizes[i]) { 3638 case 5: 3639 sum5 = xb[row - 4]; /* fall through */ 3640 case 4: 3641 sum4 = xb[row - 3]; /* fall through */ 3642 case 3: 3643 sum3 = xb[row - 2]; /* fall through */ 3644 case 2: 3645 sum2 = xb[row - 1]; /* fall through */ 3646 case 1: 3647 /* note that sum1 is associated with the last row */ 3648 sum1 = xb[row]; 3649 } 3650 /* do sums */ 3651 for (n = 0; n < sz - 1; n += 2) { 3652 i1 = idx[0]; 3653 i2 = idx[1]; 3654 idx += 2; 3655 tmp0 = x[i1]; 3656 tmp1 = x[i2]; 3657 switch (sizes[i]) { 3658 case 5: 3659 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3660 v5 += 2; /* fall through */ 3661 case 4: 3662 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3663 v4 += 2; /* fall through */ 3664 case 3: 3665 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3666 v3 += 2; /* fall through */ 3667 case 2: 3668 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3669 v2 += 2; /* fall through */ 3670 case 1: 3671 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3672 v1 += 2; 3673 } 3674 } 3675 /* ragged edge */ 3676 if (n == sz - 1) { 3677 tmp0 = x[*idx]; 3678 switch (sizes[i]) { 3679 case 5: 3680 sum5 -= *v5 * tmp0; /* fall through */ 3681 case 4: 3682 sum4 -= *v4 * tmp0; /* fall through */ 3683 case 3: 3684 sum3 -= *v3 * tmp0; /* fall through */ 3685 case 2: 3686 sum2 -= *v2 * tmp0; /* fall through */ 3687 case 1: 3688 sum1 -= *v1 * tmp0; 3689 } 3690 } 3691 /* update */ 3692 if (xb == b) { 3693 /* whole (old way) w/ diag */ 3694 switch (sizes[i]) { 3695 case 5: 3696 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3697 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3698 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3699 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3700 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3701 break; 3702 case 4: 3703 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3704 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3705 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3706 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3707 break; 3708 case 3: 3709 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3710 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3711 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3712 break; 3713 case 2: 3714 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3715 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3716 break; 3717 case 1: 3718 x[row--] += sum1 * (*ibdiag); 3719 break; 3720 } 3721 } else { 3722 /* no diag so set = */ 3723 switch (sizes[i]) { 3724 case 5: 3725 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3726 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3727 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3728 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3729 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3730 break; 3731 case 4: 3732 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3733 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3734 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3735 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3736 break; 3737 case 3: 3738 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3739 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3740 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3741 break; 3742 case 2: 3743 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3744 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3745 break; 3746 case 1: 3747 x[row--] = sum1 * (*ibdiag); 3748 break; 3749 } 3750 } 3751 } 3752 if (xb == b) { 3753 PetscCall(PetscLogFlops(2.0 * a->nz)); 3754 } else { 3755 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3756 } 3757 } 3758 } 3759 if (flag & SOR_EISENSTAT) { 3760 /* 3761 Apply (U + D)^-1 where D is now the block diagonal 3762 */ 3763 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3764 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3765 ibdiag -= sizes[i] * sizes[i]; 3766 sz = ii[row + 1] - diag[row] - 1; 3767 v1 = a->a + diag[row] + 1; 3768 idx = a->j + diag[row] + 1; 3769 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3770 switch (sizes[i]) { 3771 case 1: 3772 3773 sum1 = b[row]; 3774 for (n = 0; n < sz - 1; n += 2) { 3775 i1 = idx[0]; 3776 i2 = idx[1]; 3777 idx += 2; 3778 tmp0 = x[i1]; 3779 tmp1 = x[i2]; 3780 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3781 v1 += 2; 3782 } 3783 3784 if (n == sz - 1) { 3785 tmp0 = x[*idx]; 3786 sum1 -= *v1 * tmp0; 3787 } 3788 x[row] = sum1 * (*ibdiag); 3789 row--; 3790 break; 3791 3792 case 2: 3793 3794 sum1 = b[row]; 3795 sum2 = b[row - 1]; 3796 /* note that sum1 is associated with the second of the two rows */ 3797 v2 = a->a + diag[row - 1] + 2; 3798 for (n = 0; n < sz - 1; n += 2) { 3799 i1 = idx[0]; 3800 i2 = idx[1]; 3801 idx += 2; 3802 tmp0 = x[i1]; 3803 tmp1 = x[i2]; 3804 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3805 v1 += 2; 3806 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3807 v2 += 2; 3808 } 3809 3810 if (n == sz - 1) { 3811 tmp0 = x[*idx]; 3812 sum1 -= *v1 * tmp0; 3813 sum2 -= *v2 * tmp0; 3814 } 3815 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3816 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3817 row -= 2; 3818 break; 3819 case 3: 3820 3821 sum1 = b[row]; 3822 sum2 = b[row - 1]; 3823 sum3 = b[row - 2]; 3824 v2 = a->a + diag[row - 1] + 2; 3825 v3 = a->a + diag[row - 2] + 3; 3826 for (n = 0; n < sz - 1; n += 2) { 3827 i1 = idx[0]; 3828 i2 = idx[1]; 3829 idx += 2; 3830 tmp0 = x[i1]; 3831 tmp1 = x[i2]; 3832 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3833 v1 += 2; 3834 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3835 v2 += 2; 3836 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3837 v3 += 2; 3838 } 3839 3840 if (n == sz - 1) { 3841 tmp0 = x[*idx]; 3842 sum1 -= *v1 * tmp0; 3843 sum2 -= *v2 * tmp0; 3844 sum3 -= *v3 * tmp0; 3845 } 3846 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3847 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3848 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3849 row -= 3; 3850 break; 3851 case 4: 3852 3853 sum1 = b[row]; 3854 sum2 = b[row - 1]; 3855 sum3 = b[row - 2]; 3856 sum4 = b[row - 3]; 3857 v2 = a->a + diag[row - 1] + 2; 3858 v3 = a->a + diag[row - 2] + 3; 3859 v4 = a->a + diag[row - 3] + 4; 3860 for (n = 0; n < sz - 1; n += 2) { 3861 i1 = idx[0]; 3862 i2 = idx[1]; 3863 idx += 2; 3864 tmp0 = x[i1]; 3865 tmp1 = x[i2]; 3866 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3867 v1 += 2; 3868 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3869 v2 += 2; 3870 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3871 v3 += 2; 3872 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3873 v4 += 2; 3874 } 3875 3876 if (n == sz - 1) { 3877 tmp0 = x[*idx]; 3878 sum1 -= *v1 * tmp0; 3879 sum2 -= *v2 * tmp0; 3880 sum3 -= *v3 * tmp0; 3881 sum4 -= *v4 * tmp0; 3882 } 3883 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3884 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3885 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3886 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3887 row -= 4; 3888 break; 3889 case 5: 3890 3891 sum1 = b[row]; 3892 sum2 = b[row - 1]; 3893 sum3 = b[row - 2]; 3894 sum4 = b[row - 3]; 3895 sum5 = b[row - 4]; 3896 v2 = a->a + diag[row - 1] + 2; 3897 v3 = a->a + diag[row - 2] + 3; 3898 v4 = a->a + diag[row - 3] + 4; 3899 v5 = a->a + diag[row - 4] + 5; 3900 for (n = 0; n < sz - 1; n += 2) { 3901 i1 = idx[0]; 3902 i2 = idx[1]; 3903 idx += 2; 3904 tmp0 = x[i1]; 3905 tmp1 = x[i2]; 3906 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3907 v1 += 2; 3908 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3909 v2 += 2; 3910 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3911 v3 += 2; 3912 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3913 v4 += 2; 3914 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3915 v5 += 2; 3916 } 3917 3918 if (n == sz - 1) { 3919 tmp0 = x[*idx]; 3920 sum1 -= *v1 * tmp0; 3921 sum2 -= *v2 * tmp0; 3922 sum3 -= *v3 * tmp0; 3923 sum4 -= *v4 * tmp0; 3924 sum5 -= *v5 * tmp0; 3925 } 3926 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3927 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3928 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3929 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3930 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3931 row -= 5; 3932 break; 3933 default: 3934 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3935 } 3936 } 3937 PetscCall(PetscLogFlops(a->nz)); 3938 3939 /* 3940 t = b - D x where D is the block diagonal 3941 */ 3942 cnt = 0; 3943 for (i = 0, row = 0; i < m; i++) { 3944 switch (sizes[i]) { 3945 case 1: 3946 t[row] = b[row] - bdiag[cnt++] * x[row]; 3947 row++; 3948 break; 3949 case 2: 3950 x1 = x[row]; 3951 x2 = x[row + 1]; 3952 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3953 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3954 t[row] = b[row] - tmp1; 3955 t[row + 1] = b[row + 1] - tmp2; 3956 row += 2; 3957 cnt += 4; 3958 break; 3959 case 3: 3960 x1 = x[row]; 3961 x2 = x[row + 1]; 3962 x3 = x[row + 2]; 3963 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3964 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3965 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3966 t[row] = b[row] - tmp1; 3967 t[row + 1] = b[row + 1] - tmp2; 3968 t[row + 2] = b[row + 2] - tmp3; 3969 row += 3; 3970 cnt += 9; 3971 break; 3972 case 4: 3973 x1 = x[row]; 3974 x2 = x[row + 1]; 3975 x3 = x[row + 2]; 3976 x4 = x[row + 3]; 3977 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3978 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3979 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3980 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3981 t[row] = b[row] - tmp1; 3982 t[row + 1] = b[row + 1] - tmp2; 3983 t[row + 2] = b[row + 2] - tmp3; 3984 t[row + 3] = b[row + 3] - tmp4; 3985 row += 4; 3986 cnt += 16; 3987 break; 3988 case 5: 3989 x1 = x[row]; 3990 x2 = x[row + 1]; 3991 x3 = x[row + 2]; 3992 x4 = x[row + 3]; 3993 x5 = x[row + 4]; 3994 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 3995 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 3996 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 3997 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 3998 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 3999 t[row] = b[row] - tmp1; 4000 t[row + 1] = b[row + 1] - tmp2; 4001 t[row + 2] = b[row + 2] - tmp3; 4002 t[row + 3] = b[row + 3] - tmp4; 4003 t[row + 4] = b[row + 4] - tmp5; 4004 row += 5; 4005 cnt += 25; 4006 break; 4007 default: 4008 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4009 } 4010 } 4011 PetscCall(PetscLogFlops(m)); 4012 4013 /* 4014 Apply (L + D)^-1 where D is the block diagonal 4015 */ 4016 for (i = 0, row = 0; i < m; i++) { 4017 sz = diag[row] - ii[row]; 4018 v1 = a->a + ii[row]; 4019 idx = a->j + ii[row]; 4020 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4021 switch (sizes[i]) { 4022 case 1: 4023 4024 sum1 = t[row]; 4025 for (n = 0; n < sz - 1; n += 2) { 4026 i1 = idx[0]; 4027 i2 = idx[1]; 4028 idx += 2; 4029 tmp0 = t[i1]; 4030 tmp1 = t[i2]; 4031 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4032 v1 += 2; 4033 } 4034 4035 if (n == sz - 1) { 4036 tmp0 = t[*idx]; 4037 sum1 -= *v1 * tmp0; 4038 } 4039 x[row] += t[row] = sum1 * (*ibdiag++); 4040 row++; 4041 break; 4042 case 2: 4043 v2 = a->a + ii[row + 1]; 4044 sum1 = t[row]; 4045 sum2 = t[row + 1]; 4046 for (n = 0; n < sz - 1; n += 2) { 4047 i1 = idx[0]; 4048 i2 = idx[1]; 4049 idx += 2; 4050 tmp0 = t[i1]; 4051 tmp1 = t[i2]; 4052 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4053 v1 += 2; 4054 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4055 v2 += 2; 4056 } 4057 4058 if (n == sz - 1) { 4059 tmp0 = t[*idx]; 4060 sum1 -= v1[0] * tmp0; 4061 sum2 -= v2[0] * tmp0; 4062 } 4063 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4064 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4065 ibdiag += 4; 4066 row += 2; 4067 break; 4068 case 3: 4069 v2 = a->a + ii[row + 1]; 4070 v3 = a->a + ii[row + 2]; 4071 sum1 = t[row]; 4072 sum2 = t[row + 1]; 4073 sum3 = t[row + 2]; 4074 for (n = 0; n < sz - 1; n += 2) { 4075 i1 = idx[0]; 4076 i2 = idx[1]; 4077 idx += 2; 4078 tmp0 = t[i1]; 4079 tmp1 = t[i2]; 4080 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4081 v1 += 2; 4082 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4083 v2 += 2; 4084 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4085 v3 += 2; 4086 } 4087 4088 if (n == sz - 1) { 4089 tmp0 = t[*idx]; 4090 sum1 -= v1[0] * tmp0; 4091 sum2 -= v2[0] * tmp0; 4092 sum3 -= v3[0] * tmp0; 4093 } 4094 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4095 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4096 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4097 ibdiag += 9; 4098 row += 3; 4099 break; 4100 case 4: 4101 v2 = a->a + ii[row + 1]; 4102 v3 = a->a + ii[row + 2]; 4103 v4 = a->a + ii[row + 3]; 4104 sum1 = t[row]; 4105 sum2 = t[row + 1]; 4106 sum3 = t[row + 2]; 4107 sum4 = t[row + 3]; 4108 for (n = 0; n < sz - 1; n += 2) { 4109 i1 = idx[0]; 4110 i2 = idx[1]; 4111 idx += 2; 4112 tmp0 = t[i1]; 4113 tmp1 = t[i2]; 4114 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4115 v1 += 2; 4116 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4117 v2 += 2; 4118 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4119 v3 += 2; 4120 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4121 v4 += 2; 4122 } 4123 4124 if (n == sz - 1) { 4125 tmp0 = t[*idx]; 4126 sum1 -= v1[0] * tmp0; 4127 sum2 -= v2[0] * tmp0; 4128 sum3 -= v3[0] * tmp0; 4129 sum4 -= v4[0] * tmp0; 4130 } 4131 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4132 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4133 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4134 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4135 ibdiag += 16; 4136 row += 4; 4137 break; 4138 case 5: 4139 v2 = a->a + ii[row + 1]; 4140 v3 = a->a + ii[row + 2]; 4141 v4 = a->a + ii[row + 3]; 4142 v5 = a->a + ii[row + 4]; 4143 sum1 = t[row]; 4144 sum2 = t[row + 1]; 4145 sum3 = t[row + 2]; 4146 sum4 = t[row + 3]; 4147 sum5 = t[row + 4]; 4148 for (n = 0; n < sz - 1; n += 2) { 4149 i1 = idx[0]; 4150 i2 = idx[1]; 4151 idx += 2; 4152 tmp0 = t[i1]; 4153 tmp1 = t[i2]; 4154 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4155 v1 += 2; 4156 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4157 v2 += 2; 4158 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4159 v3 += 2; 4160 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4161 v4 += 2; 4162 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4163 v5 += 2; 4164 } 4165 4166 if (n == sz - 1) { 4167 tmp0 = t[*idx]; 4168 sum1 -= v1[0] * tmp0; 4169 sum2 -= v2[0] * tmp0; 4170 sum3 -= v3[0] * tmp0; 4171 sum4 -= v4[0] * tmp0; 4172 sum5 -= v5[0] * tmp0; 4173 } 4174 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4175 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4176 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4177 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4178 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4179 ibdiag += 25; 4180 row += 5; 4181 break; 4182 default: 4183 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4184 } 4185 } 4186 PetscCall(PetscLogFlops(a->nz)); 4187 } 4188 PetscCall(VecRestoreArray(xx, &x)); 4189 PetscCall(VecRestoreArrayRead(bb, &b)); 4190 PetscFunctionReturn(0); 4191 } 4192 4193 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4194 { 4195 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4196 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4197 const MatScalar *bdiag = a->inode.bdiag; 4198 const PetscScalar *b; 4199 PetscInt m = a->inode.node_count, cnt = 0, i, row; 4200 const PetscInt *sizes = a->inode.size; 4201 4202 PetscFunctionBegin; 4203 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4204 PetscCall(VecGetArray(xx, &x)); 4205 PetscCall(VecGetArrayRead(bb, &b)); 4206 cnt = 0; 4207 for (i = 0, row = 0; i < m; i++) { 4208 switch (sizes[i]) { 4209 case 1: 4210 x[row] = b[row] * bdiag[cnt++]; 4211 row++; 4212 break; 4213 case 2: 4214 x1 = b[row]; 4215 x2 = b[row + 1]; 4216 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4217 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4218 x[row++] = tmp1; 4219 x[row++] = tmp2; 4220 cnt += 4; 4221 break; 4222 case 3: 4223 x1 = b[row]; 4224 x2 = b[row + 1]; 4225 x3 = b[row + 2]; 4226 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4227 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4228 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4229 x[row++] = tmp1; 4230 x[row++] = tmp2; 4231 x[row++] = tmp3; 4232 cnt += 9; 4233 break; 4234 case 4: 4235 x1 = b[row]; 4236 x2 = b[row + 1]; 4237 x3 = b[row + 2]; 4238 x4 = b[row + 3]; 4239 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4240 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4241 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4242 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4243 x[row++] = tmp1; 4244 x[row++] = tmp2; 4245 x[row++] = tmp3; 4246 x[row++] = tmp4; 4247 cnt += 16; 4248 break; 4249 case 5: 4250 x1 = b[row]; 4251 x2 = b[row + 1]; 4252 x3 = b[row + 2]; 4253 x4 = b[row + 3]; 4254 x5 = b[row + 4]; 4255 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4256 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4257 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4258 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4259 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4260 x[row++] = tmp1; 4261 x[row++] = tmp2; 4262 x[row++] = tmp3; 4263 x[row++] = tmp4; 4264 x[row++] = tmp5; 4265 cnt += 25; 4266 break; 4267 default: 4268 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4269 } 4270 } 4271 PetscCall(PetscLogFlops(2.0 * cnt)); 4272 PetscCall(VecRestoreArray(xx, &x)); 4273 PetscCall(VecRestoreArrayRead(bb, &b)); 4274 PetscFunctionReturn(0); 4275 } 4276 4277 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4278 { 4279 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4280 4281 PetscFunctionBegin; 4282 a->inode.node_count = 0; 4283 a->inode.use = PETSC_FALSE; 4284 a->inode.checked = PETSC_FALSE; 4285 a->inode.mat_nonzerostate = -1; 4286 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4287 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4288 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4289 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4290 A->ops->coloringpatch = NULL; 4291 A->ops->multdiagonalblock = NULL; 4292 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4293 PetscFunctionReturn(0); 4294 } 4295 4296 /* 4297 samestructure indicates that the matrix has not changed its nonzero structure so we 4298 do not need to recompute the inodes 4299 */ 4300 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4301 { 4302 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4303 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4304 PetscBool flag; 4305 const PetscInt *idx, *idy, *ii; 4306 4307 PetscFunctionBegin; 4308 if (!a->inode.use) { 4309 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4310 PetscCall(PetscFree(a->inode.size)); 4311 PetscFunctionReturn(0); 4312 } 4313 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4314 4315 m = A->rmap->n; 4316 if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size)); 4317 ns = a->inode.size; 4318 4319 i = 0; 4320 node_count = 0; 4321 idx = a->j; 4322 ii = a->i; 4323 while (i < m) { /* For each row */ 4324 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4325 /* Limits the number of elements in a node to 'a->inode.limit' */ 4326 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4327 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4328 if (nzy != nzx) break; 4329 idy += nzx; /* Same nonzero pattern */ 4330 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4331 if (!flag) break; 4332 } 4333 ns[node_count++] = blk_size; 4334 idx += blk_size * nzx; 4335 i = j; 4336 } 4337 4338 /* If not enough inodes found,, do not use inode version of the routines */ 4339 if (!m || node_count > .8 * m) { 4340 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4341 PetscCall(PetscFree(a->inode.size)); 4342 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4343 } else { 4344 if (!A->factortype) { 4345 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4346 if (A->rmap->n == A->cmap->n) { 4347 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4348 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4349 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4350 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4351 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4352 } 4353 } else { 4354 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4355 } 4356 a->inode.node_count = node_count; 4357 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4358 } 4359 a->inode.checked = PETSC_TRUE; 4360 a->inode.mat_nonzerostate = A->nonzerostate; 4361 PetscFunctionReturn(0); 4362 } 4363 4364 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4365 { 4366 Mat B = *C; 4367 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4368 PetscInt m = A->rmap->n; 4369 4370 PetscFunctionBegin; 4371 c->inode.use = a->inode.use; 4372 c->inode.limit = a->inode.limit; 4373 c->inode.max_limit = a->inode.max_limit; 4374 c->inode.checked = PETSC_FALSE; 4375 c->inode.size = NULL; 4376 c->inode.node_count = 0; 4377 c->inode.ibdiagvalid = PETSC_FALSE; 4378 c->inode.ibdiag = NULL; 4379 c->inode.bdiag = NULL; 4380 c->inode.mat_nonzerostate = -1; 4381 if (a->inode.use) { 4382 if (a->inode.checked && a->inode.size) { 4383 PetscCall(PetscMalloc1(m + 1, &c->inode.size)); 4384 PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1)); 4385 4386 c->inode.checked = PETSC_TRUE; 4387 c->inode.node_count = a->inode.node_count; 4388 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4389 } 4390 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4391 if (!B->factortype) { 4392 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4393 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4394 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4395 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4396 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4397 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4398 } else { 4399 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4400 } 4401 } 4402 PetscFunctionReturn(0); 4403 } 4404 4405 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4406 { 4407 PetscInt k; 4408 const PetscInt *vi; 4409 4410 PetscFunctionBegin; 4411 vi = aj + ai[row]; 4412 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4413 vi = aj + adiag[row]; 4414 cols[nzl] = vi[0]; 4415 vi = aj + adiag[row + 1] + 1; 4416 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4417 PetscFunctionReturn(0); 4418 } 4419 /* 4420 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4421 Modified from MatSeqAIJCheckInode(). 4422 4423 Input Parameters: 4424 . Mat A - ILU or LU matrix factor 4425 4426 */ 4427 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4428 { 4429 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4430 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4431 PetscInt *cols1, *cols2, *ns; 4432 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4433 PetscBool flag; 4434 4435 PetscFunctionBegin; 4436 if (!a->inode.use) PetscFunctionReturn(0); 4437 if (a->inode.checked) PetscFunctionReturn(0); 4438 4439 m = A->rmap->n; 4440 if (a->inode.size) ns = a->inode.size; 4441 else PetscCall(PetscMalloc1(m + 1, &ns)); 4442 4443 i = 0; 4444 node_count = 0; 4445 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4446 while (i < m) { /* For each row */ 4447 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4448 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4449 nzx = nzl1 + nzu1 + 1; 4450 MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i); 4451 4452 /* Limits the number of elements in a node to 'a->inode.limit' */ 4453 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4454 nzl2 = ai[j + 1] - ai[j]; 4455 nzu2 = adiag[j] - adiag[j + 1] - 1; 4456 nzy = nzl2 + nzu2 + 1; 4457 if (nzy != nzx) break; 4458 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4459 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4460 if (!flag) break; 4461 } 4462 ns[node_count++] = blk_size; 4463 i = j; 4464 } 4465 PetscCall(PetscFree2(cols1, cols2)); 4466 /* If not enough inodes found,, do not use inode version of the routines */ 4467 if (!m || node_count > .8 * m) { 4468 PetscCall(PetscFree(ns)); 4469 4470 a->inode.node_count = 0; 4471 a->inode.size = NULL; 4472 a->inode.use = PETSC_FALSE; 4473 4474 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4475 } else { 4476 A->ops->mult = NULL; 4477 A->ops->sor = NULL; 4478 A->ops->multadd = NULL; 4479 A->ops->getrowij = NULL; 4480 A->ops->restorerowij = NULL; 4481 A->ops->getcolumnij = NULL; 4482 A->ops->restorecolumnij = NULL; 4483 A->ops->coloringpatch = NULL; 4484 A->ops->multdiagonalblock = NULL; 4485 a->inode.node_count = node_count; 4486 a->inode.size = ns; 4487 4488 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4489 } 4490 a->inode.checked = PETSC_TRUE; 4491 PetscFunctionReturn(0); 4492 } 4493 4494 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4495 { 4496 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4497 4498 PetscFunctionBegin; 4499 a->inode.ibdiagvalid = PETSC_FALSE; 4500 PetscFunctionReturn(0); 4501 } 4502 4503 /* 4504 This is really ugly. if inodes are used this replaces the 4505 permutations with ones that correspond to rows/cols of the matrix 4506 rather then inode blocks 4507 */ 4508 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4509 { 4510 PetscFunctionBegin; 4511 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4512 PetscFunctionReturn(0); 4513 } 4514 4515 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4516 { 4517 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4518 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4519 const PetscInt *ridx, *cidx; 4520 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx; 4521 PetscInt nslim_col, *ns_col; 4522 IS ris = *rperm, cis = *cperm; 4523 4524 PetscFunctionBegin; 4525 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4526 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4527 4528 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4529 PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns)); 4530 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4531 4532 PetscCall(ISGetIndices(ris, &ridx)); 4533 PetscCall(ISGetIndices(cis, &cidx)); 4534 4535 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4536 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i]; 4537 4538 /* Construct the permutations for rows*/ 4539 for (i = 0, row = 0; i < nslim_row; ++i) { 4540 indx = ridx[i]; 4541 start_val = tns[indx]; 4542 end_val = tns[indx + 1]; 4543 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4544 } 4545 4546 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4547 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i]; 4548 4549 /* Construct permutations for columns */ 4550 for (i = 0, col = 0; i < nslim_col; ++i) { 4551 indx = cidx[i]; 4552 start_val = tns[indx]; 4553 end_val = tns[indx + 1]; 4554 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4555 } 4556 4557 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4558 PetscCall(ISSetPermutation(*rperm)); 4559 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4560 PetscCall(ISSetPermutation(*cperm)); 4561 4562 PetscCall(ISRestoreIndices(ris, &ridx)); 4563 PetscCall(ISRestoreIndices(cis, &cidx)); 4564 4565 PetscCall(PetscFree(ns_col)); 4566 PetscCall(PetscFree2(permr, permc)); 4567 PetscCall(ISDestroy(&cis)); 4568 PetscCall(ISDestroy(&ris)); 4569 PetscCall(PetscFree(tns)); 4570 PetscFunctionReturn(0); 4571 } 4572 4573 /*@C 4574 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4575 4576 Not Collective 4577 4578 Input Parameter: 4579 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4580 4581 Output Parameters: 4582 + node_count - no of inodes present in the matrix. 4583 . sizes - an array of size node_count,with sizes of each inode. 4584 - limit - the max size used to generate the inodes. 4585 4586 Level: advanced 4587 4588 Note: 4589 This routine returns some internal storage information 4590 of the matrix, it is intended to be used by advanced users. 4591 It should be called after the matrix is assembled. 4592 The contents of the sizes[] array should not be changed. 4593 NULL may be passed for information not requested. 4594 4595 .seealso: `MatGetInfo()` 4596 @*/ 4597 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4598 { 4599 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4600 4601 PetscFunctionBegin; 4602 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4603 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4604 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4605 PetscFunctionReturn(0); 4606 } 4607 4608 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4609 { 4610 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4611 4612 PetscFunctionBegin; 4613 if (node_count) *node_count = a->inode.node_count; 4614 if (sizes) *sizes = a->inode.size; 4615 if (limit) *limit = a->inode.limit; 4616 PetscFunctionReturn(0); 4617 } 4618