1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 9 { 10 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 11 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 12 13 PetscFunctionBegin; 14 n = A->cmap->n; 15 m = A->rmap->n; 16 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 17 ns_row = a->inode.size; 18 19 min_mn = (m < n) ? m : n; 20 if (!ns) { 21 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) 22 ; 23 for (; count + 1 < n; count++, i++) 24 ; 25 if (count < n) i++; 26 *size = i; 27 PetscFunctionReturn(PETSC_SUCCESS); 28 } 29 PetscCall(PetscMalloc1(n + 1, &ns_col)); 30 31 /* Use the same row structure wherever feasible. */ 32 for (count = 0, i = 0; count < min_mn; count += ns_row[i], i++) ns_col[i] = ns_row[i]; 33 34 /* if m < n; pad up the remainder with inode_limit */ 35 for (; count + 1 < n; count++, i++) ns_col[i] = 1; 36 /* The last node is the odd ball. pad it up with the remaining rows; */ 37 if (count < n) { 38 ns_col[i] = n - count; 39 i++; 40 } else if (count > n) { 41 /* Adjust for the over estimation */ 42 ns_col[i - 1] += n - count; 43 } 44 *size = i; 45 *ns = ns_col; 46 PetscFunctionReturn(PETSC_SUCCESS); 47 } 48 49 /* 50 This builds symmetric version of nonzero structure, 51 */ 52 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 53 { 54 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 55 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 56 PetscInt *tns, *tvc, *ns_row = a->inode.size, *ns_col, nsz, i1, i2; 57 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 58 59 PetscFunctionBegin; 60 nslim_row = a->inode.node_count; 61 m = A->rmap->n; 62 n = A->cmap->n; 63 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 64 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 65 66 /* Use the row_inode as column_inode */ 67 nslim_col = nslim_row; 68 ns_col = ns_row; 69 70 /* allocate space for reformatted inode structure */ 71 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 72 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_row[i1]; 73 74 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 75 nsz = ns_col[i1]; 76 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 77 } 78 /* allocate space for row pointers */ 79 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 80 *iia = ia; 81 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 82 83 /* determine the number of columns in each row */ 84 ia[0] = oshift; 85 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 86 j = aj + ai[row] + ishift; 87 jmax = aj + ai[row + 1] + ishift; 88 if (j == jmax) continue; /* empty row */ 89 col = *j++ + ishift; 90 i2 = tvc[col]; 91 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 92 ia[i1 + 1]++; 93 ia[i2 + 1]++; 94 i2++; /* Start col of next node */ 95 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 96 i2 = tvc[col]; 97 } 98 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 99 } 100 101 /* shift ia[i] to point to next row */ 102 for (i1 = 1; i1 < nslim_row + 1; i1++) { 103 row = ia[i1 - 1]; 104 ia[i1] += row; 105 work[i1 - 1] = row - oshift; 106 } 107 108 /* allocate space for column pointers */ 109 nz = ia[nslim_row] + (!ishift); 110 PetscCall(PetscMalloc1(nz, &ja)); 111 *jja = ja; 112 113 /* loop over lower triangular part putting into ja */ 114 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 115 j = aj + ai[row] + ishift; 116 jmax = aj + ai[row + 1] + ishift; 117 if (j == jmax) continue; /* empty row */ 118 col = *j++ + ishift; 119 i2 = tvc[col]; 120 while (i2 < i1 && j < jmax) { 121 ja[work[i2]++] = i1 + oshift; 122 ja[work[i1]++] = i2 + oshift; 123 ++i2; 124 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 125 i2 = tvc[col]; 126 } 127 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 128 } 129 PetscCall(PetscFree(work)); 130 PetscCall(PetscFree2(tns, tvc)); 131 PetscFunctionReturn(PETSC_SUCCESS); 132 } 133 134 /* 135 This builds nonsymmetric version of nonzero structure, 136 */ 137 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 138 { 139 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 140 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 141 PetscInt *tns, *tvc, nsz, i1, i2; 142 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size; 143 144 PetscFunctionBegin; 145 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 146 nslim_row = a->inode.node_count; 147 n = A->cmap->n; 148 149 /* Create The column_inode for this matrix */ 150 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 151 152 /* allocate space for reformatted column_inode structure */ 153 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 154 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 155 156 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 157 nsz = ns_col[i1]; 158 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 159 } 160 /* allocate space for row pointers */ 161 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 162 *iia = ia; 163 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 164 165 /* determine the number of columns in each row */ 166 ia[0] = oshift; 167 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 168 j = aj + ai[row] + ishift; 169 nz = ai[row + 1] - ai[row]; 170 if (!nz) continue; /* empty row */ 171 col = *j++ + ishift; 172 i2 = tvc[col]; 173 while (nz-- > 0) { /* off-diagonal elements */ 174 ia[i1 + 1]++; 175 i2++; /* Start col of next node */ 176 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 177 if (nz > 0) i2 = tvc[col]; 178 } 179 } 180 181 /* shift ia[i] to point to next row */ 182 for (i1 = 1; i1 < nslim_row + 1; i1++) { 183 row = ia[i1 - 1]; 184 ia[i1] += row; 185 work[i1 - 1] = row - oshift; 186 } 187 188 /* allocate space for column pointers */ 189 nz = ia[nslim_row] + (!ishift); 190 PetscCall(PetscMalloc1(nz, &ja)); 191 *jja = ja; 192 193 /* loop over matrix putting into ja */ 194 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 195 j = aj + ai[row] + ishift; 196 nz = ai[row + 1] - ai[row]; 197 if (!nz) continue; /* empty row */ 198 col = *j++ + ishift; 199 i2 = tvc[col]; 200 while (nz-- > 0) { 201 ja[work[i1]++] = i2 + oshift; 202 ++i2; 203 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 204 if (nz > 0) i2 = tvc[col]; 205 } 206 } 207 PetscCall(PetscFree(ns_col)); 208 PetscCall(PetscFree(work)); 209 PetscCall(PetscFree2(tns, tvc)); 210 PetscFunctionReturn(PETSC_SUCCESS); 211 } 212 213 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 214 { 215 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 216 217 PetscFunctionBegin; 218 if (n) *n = a->inode.node_count; 219 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 220 if (!blockcompressed) { 221 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 222 } else if (symmetric) { 223 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 224 } else { 225 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 226 } 227 PetscFunctionReturn(PETSC_SUCCESS); 228 } 229 230 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 231 { 232 PetscFunctionBegin; 233 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 234 235 if (!blockcompressed) { 236 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 237 } else { 238 PetscCall(PetscFree(*ia)); 239 PetscCall(PetscFree(*ja)); 240 } 241 PetscFunctionReturn(PETSC_SUCCESS); 242 } 243 244 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 245 { 246 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 247 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 248 PetscInt *tns, *tvc, *ns_row = a->inode.size, nsz, i1, i2, *ai = a->i, *aj = a->j; 249 250 PetscFunctionBegin; 251 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 252 nslim_row = a->inode.node_count; 253 n = A->cmap->n; 254 255 /* Create The column_inode for this matrix */ 256 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 257 258 /* allocate space for reformatted column_inode structure */ 259 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 260 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + ns_col[i1]; 261 262 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 263 nsz = ns_col[i1]; 264 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 265 } 266 /* allocate space for column pointers */ 267 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 268 *iia = ia; 269 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 270 271 /* determine the number of columns in each row */ 272 ia[0] = oshift; 273 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 274 j = aj + ai[row] + ishift; 275 col = *j++ + ishift; 276 i2 = tvc[col]; 277 nz = ai[row + 1] - ai[row]; 278 while (nz-- > 0) { /* off-diagonal elements */ 279 /* ia[i1+1]++; */ 280 ia[i2 + 1]++; 281 i2++; 282 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 283 if (nz > 0) i2 = tvc[col]; 284 } 285 } 286 287 /* shift ia[i] to point to next col */ 288 for (i1 = 1; i1 < nslim_col + 1; i1++) { 289 col = ia[i1 - 1]; 290 ia[i1] += col; 291 work[i1 - 1] = col - oshift; 292 } 293 294 /* allocate space for column pointers */ 295 nz = ia[nslim_col] + (!ishift); 296 PetscCall(PetscMalloc1(nz, &ja)); 297 *jja = ja; 298 299 /* loop over matrix putting into ja */ 300 for (i1 = 0, row = 0; i1 < nslim_row; row += ns_row[i1], i1++) { 301 j = aj + ai[row] + ishift; 302 col = *j++ + ishift; 303 i2 = tvc[col]; 304 nz = ai[row + 1] - ai[row]; 305 while (nz-- > 0) { 306 /* ja[work[i1]++] = i2 + oshift; */ 307 ja[work[i2]++] = i1 + oshift; 308 i2++; 309 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 310 if (nz > 0) i2 = tvc[col]; 311 } 312 } 313 PetscCall(PetscFree(ns_col)); 314 PetscCall(PetscFree(work)); 315 PetscCall(PetscFree2(tns, tvc)); 316 PetscFunctionReturn(PETSC_SUCCESS); 317 } 318 319 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 320 { 321 PetscFunctionBegin; 322 PetscCall(MatCreateColInode_Private(A, n, NULL)); 323 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 324 325 if (!blockcompressed) { 326 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 327 } else if (symmetric) { 328 /* Since the indices are symmetric it doesn't matter */ 329 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 330 } else { 331 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 332 } 333 PetscFunctionReturn(PETSC_SUCCESS); 334 } 335 336 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 337 { 338 PetscFunctionBegin; 339 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 340 if (!blockcompressed) { 341 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 342 } else { 343 PetscCall(PetscFree(*ia)); 344 PetscCall(PetscFree(*ja)); 345 } 346 PetscFunctionReturn(PETSC_SUCCESS); 347 } 348 349 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 350 { 351 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 352 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 353 PetscScalar *y; 354 const PetscScalar *x; 355 const MatScalar *v1, *v2, *v3, *v4, *v5; 356 PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 357 const PetscInt *idx, *ns, *ii; 358 359 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 360 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 361 #endif 362 363 PetscFunctionBegin; 364 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 365 node_max = a->inode.node_count; 366 ns = a->inode.size; /* Node Size array */ 367 PetscCall(VecGetArrayRead(xx, &x)); 368 PetscCall(VecGetArray(yy, &y)); 369 idx = a->j; 370 v1 = a->a; 371 ii = a->i; 372 373 for (i = 0, row = 0; i < node_max; ++i) { 374 nsz = ns[i]; 375 n = ii[1] - ii[0]; 376 nonzerorow += (n > 0) * nsz; 377 ii += nsz; 378 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 379 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 380 sz = n; /* No of non zeros in this row */ 381 /* Switch on the size of Node */ 382 switch (nsz) { /* Each loop in 'case' is unrolled */ 383 case 1: 384 sum1 = 0.; 385 386 for (n = 0; n < sz - 1; n += 2) { 387 i1 = idx[0]; /* The instructions are ordered to */ 388 i2 = idx[1]; /* make the compiler's job easy */ 389 idx += 2; 390 tmp0 = x[i1]; 391 tmp1 = x[i2]; 392 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 393 v1 += 2; 394 } 395 396 if (n == sz - 1) { /* Take care of the last nonzero */ 397 tmp0 = x[*idx++]; 398 sum1 += *v1++ * tmp0; 399 } 400 y[row++] = sum1; 401 break; 402 case 2: 403 sum1 = 0.; 404 sum2 = 0.; 405 v2 = v1 + n; 406 407 for (n = 0; n < sz - 1; n += 2) { 408 i1 = idx[0]; 409 i2 = idx[1]; 410 idx += 2; 411 tmp0 = x[i1]; 412 tmp1 = x[i2]; 413 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 414 v1 += 2; 415 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 416 v2 += 2; 417 } 418 if (n == sz - 1) { 419 tmp0 = x[*idx++]; 420 sum1 += *v1++ * tmp0; 421 sum2 += *v2++ * tmp0; 422 } 423 y[row++] = sum1; 424 y[row++] = sum2; 425 v1 = v2; /* Since the next block to be processed starts there*/ 426 idx += sz; 427 break; 428 case 3: 429 sum1 = 0.; 430 sum2 = 0.; 431 sum3 = 0.; 432 v2 = v1 + n; 433 v3 = v2 + n; 434 435 for (n = 0; n < sz - 1; n += 2) { 436 i1 = idx[0]; 437 i2 = idx[1]; 438 idx += 2; 439 tmp0 = x[i1]; 440 tmp1 = x[i2]; 441 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 442 v1 += 2; 443 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 444 v2 += 2; 445 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 446 v3 += 2; 447 } 448 if (n == sz - 1) { 449 tmp0 = x[*idx++]; 450 sum1 += *v1++ * tmp0; 451 sum2 += *v2++ * tmp0; 452 sum3 += *v3++ * tmp0; 453 } 454 y[row++] = sum1; 455 y[row++] = sum2; 456 y[row++] = sum3; 457 v1 = v3; /* Since the next block to be processed starts there*/ 458 idx += 2 * sz; 459 break; 460 case 4: 461 sum1 = 0.; 462 sum2 = 0.; 463 sum3 = 0.; 464 sum4 = 0.; 465 v2 = v1 + n; 466 v3 = v2 + n; 467 v4 = v3 + n; 468 469 for (n = 0; n < sz - 1; n += 2) { 470 i1 = idx[0]; 471 i2 = idx[1]; 472 idx += 2; 473 tmp0 = x[i1]; 474 tmp1 = x[i2]; 475 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 476 v1 += 2; 477 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 478 v2 += 2; 479 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 480 v3 += 2; 481 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 482 v4 += 2; 483 } 484 if (n == sz - 1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 sum4 += *v4++ * tmp0; 490 } 491 y[row++] = sum1; 492 y[row++] = sum2; 493 y[row++] = sum3; 494 y[row++] = sum4; 495 v1 = v4; /* Since the next block to be processed starts there*/ 496 idx += 3 * sz; 497 break; 498 case 5: 499 sum1 = 0.; 500 sum2 = 0.; 501 sum3 = 0.; 502 sum4 = 0.; 503 sum5 = 0.; 504 v2 = v1 + n; 505 v3 = v2 + n; 506 v4 = v3 + n; 507 v5 = v4 + n; 508 509 for (n = 0; n < sz - 1; n += 2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 516 v1 += 2; 517 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 518 v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 520 v3 += 2; 521 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 522 v4 += 2; 523 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 524 v5 += 2; 525 } 526 if (n == sz - 1) { 527 tmp0 = x[*idx++]; 528 sum1 += *v1++ * tmp0; 529 sum2 += *v2++ * tmp0; 530 sum3 += *v3++ * tmp0; 531 sum4 += *v4++ * tmp0; 532 sum5 += *v5++ * tmp0; 533 } 534 y[row++] = sum1; 535 y[row++] = sum2; 536 y[row++] = sum3; 537 y[row++] = sum4; 538 y[row++] = sum5; 539 v1 = v5; /* Since the next block to be processed starts there */ 540 idx += 4 * sz; 541 break; 542 default: 543 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 544 } 545 } 546 PetscCall(VecRestoreArrayRead(xx, &x)); 547 PetscCall(VecRestoreArray(yy, &y)); 548 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 549 PetscFunctionReturn(PETSC_SUCCESS); 550 } 551 552 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 553 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 554 { 555 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 556 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 557 const MatScalar *v1, *v2, *v3, *v4, *v5; 558 const PetscScalar *x; 559 PetscScalar *y, *z, *zt; 560 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 561 const PetscInt *idx, *ns, *ii; 562 563 PetscFunctionBegin; 564 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 565 node_max = a->inode.node_count; 566 ns = a->inode.size; /* Node Size array */ 567 568 PetscCall(VecGetArrayRead(xx, &x)); 569 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 570 zt = z; 571 572 idx = a->j; 573 v1 = a->a; 574 ii = a->i; 575 576 for (i = 0, row = 0; i < node_max; ++i) { 577 nsz = ns[i]; 578 n = ii[1] - ii[0]; 579 ii += nsz; 580 sz = n; /* No of non zeros in this row */ 581 /* Switch on the size of Node */ 582 switch (nsz) { /* Each loop in 'case' is unrolled */ 583 case 1: 584 sum1 = *zt++; 585 586 for (n = 0; n < sz - 1; n += 2) { 587 i1 = idx[0]; /* The instructions are ordered to */ 588 i2 = idx[1]; /* make the compiler's job easy */ 589 idx += 2; 590 tmp0 = x[i1]; 591 tmp1 = x[i2]; 592 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 593 v1 += 2; 594 } 595 596 if (n == sz - 1) { /* Take care of the last nonzero */ 597 tmp0 = x[*idx++]; 598 sum1 += *v1++ * tmp0; 599 } 600 y[row++] = sum1; 601 break; 602 case 2: 603 sum1 = *zt++; 604 sum2 = *zt++; 605 v2 = v1 + n; 606 607 for (n = 0; n < sz - 1; n += 2) { 608 i1 = idx[0]; 609 i2 = idx[1]; 610 idx += 2; 611 tmp0 = x[i1]; 612 tmp1 = x[i2]; 613 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 614 v1 += 2; 615 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 616 v2 += 2; 617 } 618 if (n == sz - 1) { 619 tmp0 = x[*idx++]; 620 sum1 += *v1++ * tmp0; 621 sum2 += *v2++ * tmp0; 622 } 623 y[row++] = sum1; 624 y[row++] = sum2; 625 v1 = v2; /* Since the next block to be processed starts there*/ 626 idx += sz; 627 break; 628 case 3: 629 sum1 = *zt++; 630 sum2 = *zt++; 631 sum3 = *zt++; 632 v2 = v1 + n; 633 v3 = v2 + n; 634 635 for (n = 0; n < sz - 1; n += 2) { 636 i1 = idx[0]; 637 i2 = idx[1]; 638 idx += 2; 639 tmp0 = x[i1]; 640 tmp1 = x[i2]; 641 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 642 v1 += 2; 643 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 644 v2 += 2; 645 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 646 v3 += 2; 647 } 648 if (n == sz - 1) { 649 tmp0 = x[*idx++]; 650 sum1 += *v1++ * tmp0; 651 sum2 += *v2++ * tmp0; 652 sum3 += *v3++ * tmp0; 653 } 654 y[row++] = sum1; 655 y[row++] = sum2; 656 y[row++] = sum3; 657 v1 = v3; /* Since the next block to be processed starts there*/ 658 idx += 2 * sz; 659 break; 660 case 4: 661 sum1 = *zt++; 662 sum2 = *zt++; 663 sum3 = *zt++; 664 sum4 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 v4 = v3 + n; 668 669 for (n = 0; n < sz - 1; n += 2) { 670 i1 = idx[0]; 671 i2 = idx[1]; 672 idx += 2; 673 tmp0 = x[i1]; 674 tmp1 = x[i2]; 675 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 676 v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 678 v2 += 2; 679 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 680 v3 += 2; 681 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 682 v4 += 2; 683 } 684 if (n == sz - 1) { 685 tmp0 = x[*idx++]; 686 sum1 += *v1++ * tmp0; 687 sum2 += *v2++ * tmp0; 688 sum3 += *v3++ * tmp0; 689 sum4 += *v4++ * tmp0; 690 } 691 y[row++] = sum1; 692 y[row++] = sum2; 693 y[row++] = sum3; 694 y[row++] = sum4; 695 v1 = v4; /* Since the next block to be processed starts there*/ 696 idx += 3 * sz; 697 break; 698 case 5: 699 sum1 = *zt++; 700 sum2 = *zt++; 701 sum3 = *zt++; 702 sum4 = *zt++; 703 sum5 = *zt++; 704 v2 = v1 + n; 705 v3 = v2 + n; 706 v4 = v3 + n; 707 v5 = v4 + n; 708 709 for (n = 0; n < sz - 1; n += 2) { 710 i1 = idx[0]; 711 i2 = idx[1]; 712 idx += 2; 713 tmp0 = x[i1]; 714 tmp1 = x[i2]; 715 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 716 v1 += 2; 717 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 718 v2 += 2; 719 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 720 v3 += 2; 721 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 722 v4 += 2; 723 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 724 v5 += 2; 725 } 726 if (n == sz - 1) { 727 tmp0 = x[*idx++]; 728 sum1 += *v1++ * tmp0; 729 sum2 += *v2++ * tmp0; 730 sum3 += *v3++ * tmp0; 731 sum4 += *v4++ * tmp0; 732 sum5 += *v5++ * tmp0; 733 } 734 y[row++] = sum1; 735 y[row++] = sum2; 736 y[row++] = sum3; 737 y[row++] = sum4; 738 y[row++] = sum5; 739 v1 = v5; /* Since the next block to be processed starts there */ 740 idx += 4 * sz; 741 break; 742 default: 743 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 744 } 745 } 746 PetscCall(VecRestoreArrayRead(xx, &x)); 747 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 748 PetscCall(PetscLogFlops(2.0 * a->nz)); 749 PetscFunctionReturn(PETSC_SUCCESS); 750 } 751 752 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 753 { 754 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 755 IS iscol = a->col, isrow = a->row; 756 const PetscInt *r, *c, *rout, *cout; 757 PetscInt i, j, n = A->rmap->n, nz; 758 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 759 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 760 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 761 PetscScalar sum1, sum2, sum3, sum4, sum5; 762 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 763 const PetscScalar *b; 764 765 PetscFunctionBegin; 766 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 767 node_max = a->inode.node_count; 768 ns = a->inode.size; /* Node Size array */ 769 770 PetscCall(VecGetArrayRead(bb, &b)); 771 PetscCall(VecGetArrayWrite(xx, &x)); 772 tmp = a->solve_work; 773 774 PetscCall(ISGetIndices(isrow, &rout)); 775 r = rout; 776 PetscCall(ISGetIndices(iscol, &cout)); 777 c = cout + (n - 1); 778 779 /* forward solve the lower triangular */ 780 tmps = tmp; 781 aa = a_a; 782 aj = a_j; 783 ad = a->diag; 784 785 for (i = 0, row = 0; i < node_max; ++i) { 786 nsz = ns[i]; 787 aii = ai[row]; 788 v1 = aa + aii; 789 vi = aj + aii; 790 nz = ad[row] - aii; 791 if (i < node_max - 1) { 792 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 793 * but our indexing to determine its size could. */ 794 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 795 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 796 PetscPrefetchBlock(aa + ai[row + nsz], ad[row + nsz + ns[i + 1] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 797 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 798 } 799 800 switch (nsz) { /* Each loop in 'case' is unrolled */ 801 case 1: 802 sum1 = b[*r++]; 803 for (j = 0; j < nz - 1; j += 2) { 804 i0 = vi[0]; 805 i1 = vi[1]; 806 vi += 2; 807 tmp0 = tmps[i0]; 808 tmp1 = tmps[i1]; 809 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 810 v1 += 2; 811 } 812 if (j == nz - 1) { 813 tmp0 = tmps[*vi++]; 814 sum1 -= *v1++ * tmp0; 815 } 816 tmp[row++] = sum1; 817 break; 818 case 2: 819 sum1 = b[*r++]; 820 sum2 = b[*r++]; 821 v2 = aa + ai[row + 1]; 822 823 for (j = 0; j < nz - 1; j += 2) { 824 i0 = vi[0]; 825 i1 = vi[1]; 826 vi += 2; 827 tmp0 = tmps[i0]; 828 tmp1 = tmps[i1]; 829 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 830 v1 += 2; 831 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 832 v2 += 2; 833 } 834 if (j == nz - 1) { 835 tmp0 = tmps[*vi++]; 836 sum1 -= *v1++ * tmp0; 837 sum2 -= *v2++ * tmp0; 838 } 839 sum2 -= *v2++ * sum1; 840 tmp[row++] = sum1; 841 tmp[row++] = sum2; 842 break; 843 case 3: 844 sum1 = b[*r++]; 845 sum2 = b[*r++]; 846 sum3 = b[*r++]; 847 v2 = aa + ai[row + 1]; 848 v3 = aa + ai[row + 2]; 849 850 for (j = 0; j < nz - 1; j += 2) { 851 i0 = vi[0]; 852 i1 = vi[1]; 853 vi += 2; 854 tmp0 = tmps[i0]; 855 tmp1 = tmps[i1]; 856 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 857 v1 += 2; 858 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 859 v2 += 2; 860 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 861 v3 += 2; 862 } 863 if (j == nz - 1) { 864 tmp0 = tmps[*vi++]; 865 sum1 -= *v1++ * tmp0; 866 sum2 -= *v2++ * tmp0; 867 sum3 -= *v3++ * tmp0; 868 } 869 sum2 -= *v2++ * sum1; 870 sum3 -= *v3++ * sum1; 871 sum3 -= *v3++ * sum2; 872 873 tmp[row++] = sum1; 874 tmp[row++] = sum2; 875 tmp[row++] = sum3; 876 break; 877 878 case 4: 879 sum1 = b[*r++]; 880 sum2 = b[*r++]; 881 sum3 = b[*r++]; 882 sum4 = b[*r++]; 883 v2 = aa + ai[row + 1]; 884 v3 = aa + ai[row + 2]; 885 v4 = aa + ai[row + 3]; 886 887 for (j = 0; j < nz - 1; j += 2) { 888 i0 = vi[0]; 889 i1 = vi[1]; 890 vi += 2; 891 tmp0 = tmps[i0]; 892 tmp1 = tmps[i1]; 893 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 894 v1 += 2; 895 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 896 v2 += 2; 897 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 898 v3 += 2; 899 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 900 v4 += 2; 901 } 902 if (j == nz - 1) { 903 tmp0 = tmps[*vi++]; 904 sum1 -= *v1++ * tmp0; 905 sum2 -= *v2++ * tmp0; 906 sum3 -= *v3++ * tmp0; 907 sum4 -= *v4++ * tmp0; 908 } 909 sum2 -= *v2++ * sum1; 910 sum3 -= *v3++ * sum1; 911 sum4 -= *v4++ * sum1; 912 sum3 -= *v3++ * sum2; 913 sum4 -= *v4++ * sum2; 914 sum4 -= *v4++ * sum3; 915 916 tmp[row++] = sum1; 917 tmp[row++] = sum2; 918 tmp[row++] = sum3; 919 tmp[row++] = sum4; 920 break; 921 case 5: 922 sum1 = b[*r++]; 923 sum2 = b[*r++]; 924 sum3 = b[*r++]; 925 sum4 = b[*r++]; 926 sum5 = b[*r++]; 927 v2 = aa + ai[row + 1]; 928 v3 = aa + ai[row + 2]; 929 v4 = aa + ai[row + 3]; 930 v5 = aa + ai[row + 4]; 931 932 for (j = 0; j < nz - 1; j += 2) { 933 i0 = vi[0]; 934 i1 = vi[1]; 935 vi += 2; 936 tmp0 = tmps[i0]; 937 tmp1 = tmps[i1]; 938 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 939 v1 += 2; 940 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 941 v2 += 2; 942 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 943 v3 += 2; 944 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 945 v4 += 2; 946 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 947 v5 += 2; 948 } 949 if (j == nz - 1) { 950 tmp0 = tmps[*vi++]; 951 sum1 -= *v1++ * tmp0; 952 sum2 -= *v2++ * tmp0; 953 sum3 -= *v3++ * tmp0; 954 sum4 -= *v4++ * tmp0; 955 sum5 -= *v5++ * tmp0; 956 } 957 958 sum2 -= *v2++ * sum1; 959 sum3 -= *v3++ * sum1; 960 sum4 -= *v4++ * sum1; 961 sum5 -= *v5++ * sum1; 962 sum3 -= *v3++ * sum2; 963 sum4 -= *v4++ * sum2; 964 sum5 -= *v5++ * sum2; 965 sum4 -= *v4++ * sum3; 966 sum5 -= *v5++ * sum3; 967 sum5 -= *v5++ * sum4; 968 969 tmp[row++] = sum1; 970 tmp[row++] = sum2; 971 tmp[row++] = sum3; 972 tmp[row++] = sum4; 973 tmp[row++] = sum5; 974 break; 975 default: 976 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 977 } 978 } 979 /* backward solve the upper triangular */ 980 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 981 nsz = ns[i]; 982 aii = ai[row + 1] - 1; 983 v1 = aa + aii; 984 vi = aj + aii; 985 nz = aii - ad[row]; 986 switch (nsz) { /* Each loop in 'case' is unrolled */ 987 case 1: 988 sum1 = tmp[row]; 989 990 for (j = nz; j > 1; j -= 2) { 991 vi -= 2; 992 i0 = vi[2]; 993 i1 = vi[1]; 994 tmp0 = tmps[i0]; 995 tmp1 = tmps[i1]; 996 v1 -= 2; 997 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 998 } 999 if (j == 1) { 1000 tmp0 = tmps[*vi--]; 1001 sum1 -= *v1-- * tmp0; 1002 } 1003 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1004 row--; 1005 break; 1006 case 2: 1007 sum1 = tmp[row]; 1008 sum2 = tmp[row - 1]; 1009 v2 = aa + ai[row] - 1; 1010 for (j = nz; j > 1; j -= 2) { 1011 vi -= 2; 1012 i0 = vi[2]; 1013 i1 = vi[1]; 1014 tmp0 = tmps[i0]; 1015 tmp1 = tmps[i1]; 1016 v1 -= 2; 1017 v2 -= 2; 1018 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1019 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1020 } 1021 if (j == 1) { 1022 tmp0 = tmps[*vi--]; 1023 sum1 -= *v1-- * tmp0; 1024 sum2 -= *v2-- * tmp0; 1025 } 1026 1027 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1028 row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1031 row--; 1032 break; 1033 case 3: 1034 sum1 = tmp[row]; 1035 sum2 = tmp[row - 1]; 1036 sum3 = tmp[row - 2]; 1037 v2 = aa + ai[row] - 1; 1038 v3 = aa + ai[row - 1] - 1; 1039 for (j = nz; j > 1; j -= 2) { 1040 vi -= 2; 1041 i0 = vi[2]; 1042 i1 = vi[1]; 1043 tmp0 = tmps[i0]; 1044 tmp1 = tmps[i1]; 1045 v1 -= 2; 1046 v2 -= 2; 1047 v3 -= 2; 1048 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1049 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1050 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1051 } 1052 if (j == 1) { 1053 tmp0 = tmps[*vi--]; 1054 sum1 -= *v1-- * tmp0; 1055 sum2 -= *v2-- * tmp0; 1056 sum3 -= *v3-- * tmp0; 1057 } 1058 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1059 row--; 1060 sum2 -= *v2-- * tmp0; 1061 sum3 -= *v3-- * tmp0; 1062 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1063 row--; 1064 sum3 -= *v3-- * tmp0; 1065 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1066 row--; 1067 1068 break; 1069 case 4: 1070 sum1 = tmp[row]; 1071 sum2 = tmp[row - 1]; 1072 sum3 = tmp[row - 2]; 1073 sum4 = tmp[row - 3]; 1074 v2 = aa + ai[row] - 1; 1075 v3 = aa + ai[row - 1] - 1; 1076 v4 = aa + ai[row - 2] - 1; 1077 1078 for (j = nz; j > 1; j -= 2) { 1079 vi -= 2; 1080 i0 = vi[2]; 1081 i1 = vi[1]; 1082 tmp0 = tmps[i0]; 1083 tmp1 = tmps[i1]; 1084 v1 -= 2; 1085 v2 -= 2; 1086 v3 -= 2; 1087 v4 -= 2; 1088 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1089 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1090 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1091 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1092 } 1093 if (j == 1) { 1094 tmp0 = tmps[*vi--]; 1095 sum1 -= *v1-- * tmp0; 1096 sum2 -= *v2-- * tmp0; 1097 sum3 -= *v3-- * tmp0; 1098 sum4 -= *v4-- * tmp0; 1099 } 1100 1101 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1102 row--; 1103 sum2 -= *v2-- * tmp0; 1104 sum3 -= *v3-- * tmp0; 1105 sum4 -= *v4-- * tmp0; 1106 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1107 row--; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1111 row--; 1112 sum4 -= *v4-- * tmp0; 1113 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1114 row--; 1115 break; 1116 case 5: 1117 sum1 = tmp[row]; 1118 sum2 = tmp[row - 1]; 1119 sum3 = tmp[row - 2]; 1120 sum4 = tmp[row - 3]; 1121 sum5 = tmp[row - 4]; 1122 v2 = aa + ai[row] - 1; 1123 v3 = aa + ai[row - 1] - 1; 1124 v4 = aa + ai[row - 2] - 1; 1125 v5 = aa + ai[row - 3] - 1; 1126 for (j = nz; j > 1; j -= 2) { 1127 vi -= 2; 1128 i0 = vi[2]; 1129 i1 = vi[1]; 1130 tmp0 = tmps[i0]; 1131 tmp1 = tmps[i1]; 1132 v1 -= 2; 1133 v2 -= 2; 1134 v3 -= 2; 1135 v4 -= 2; 1136 v5 -= 2; 1137 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1138 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1139 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1140 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1141 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1142 } 1143 if (j == 1) { 1144 tmp0 = tmps[*vi--]; 1145 sum1 -= *v1-- * tmp0; 1146 sum2 -= *v2-- * tmp0; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 } 1151 1152 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1153 row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1159 row--; 1160 sum3 -= *v3-- * tmp0; 1161 sum4 -= *v4-- * tmp0; 1162 sum5 -= *v5-- * tmp0; 1163 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1164 row--; 1165 sum4 -= *v4-- * tmp0; 1166 sum5 -= *v5-- * tmp0; 1167 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1168 row--; 1169 sum5 -= *v5-- * tmp0; 1170 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1171 row--; 1172 break; 1173 default: 1174 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1175 } 1176 } 1177 PetscCall(ISRestoreIndices(isrow, &rout)); 1178 PetscCall(ISRestoreIndices(iscol, &cout)); 1179 PetscCall(VecRestoreArrayRead(bb, &b)); 1180 PetscCall(VecRestoreArrayWrite(xx, &x)); 1181 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1182 PetscFunctionReturn(PETSC_SUCCESS); 1183 } 1184 1185 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1186 { 1187 Mat C = B; 1188 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1189 IS isrow = b->row, isicol = b->icol; 1190 const PetscInt *r, *ic, *ics; 1191 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1192 PetscInt i, j, k, nz, nzL, row, *pj; 1193 const PetscInt *ajtmp, *bjtmp; 1194 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1195 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1196 FactorShiftCtx sctx; 1197 const PetscInt *ddiag; 1198 PetscReal rs; 1199 MatScalar d; 1200 PetscInt inod, nodesz, node_max, col; 1201 const PetscInt *ns; 1202 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1203 1204 PetscFunctionBegin; 1205 /* MatPivotSetUp(): initialize shift context sctx */ 1206 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1207 1208 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1209 ddiag = a->diag; 1210 sctx.shift_top = info->zeropivot; 1211 for (i = 0; i < n; i++) { 1212 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1213 d = (aa)[ddiag[i]]; 1214 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1215 v = aa + ai[i]; 1216 nz = ai[i + 1] - ai[i]; 1217 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1218 if (rs > sctx.shift_top) sctx.shift_top = rs; 1219 } 1220 sctx.shift_top *= 1.1; 1221 sctx.nshift_max = 5; 1222 sctx.shift_lo = 0.; 1223 sctx.shift_hi = 1.; 1224 } 1225 1226 PetscCall(ISGetIndices(isrow, &r)); 1227 PetscCall(ISGetIndices(isicol, &ic)); 1228 1229 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1230 ics = ic; 1231 1232 node_max = a->inode.node_count; 1233 ns = a->inode.size; 1234 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1235 1236 /* If max inode size > 4, split it into two inodes.*/ 1237 /* also map the inode sizes according to the ordering */ 1238 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1239 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1240 if (ns[i] > 4) { 1241 tmp_vec1[j] = 4; 1242 ++j; 1243 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 1244 } else { 1245 tmp_vec1[j] = ns[i]; 1246 } 1247 } 1248 /* Use the correct node_max */ 1249 node_max = j; 1250 1251 /* Now reorder the inode info based on mat re-ordering info */ 1252 /* First create a row -> inode_size_array_index map */ 1253 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1254 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1255 for (i = 0, row = 0; i < node_max; i++) { 1256 nodesz = tmp_vec1[i]; 1257 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1258 } 1259 /* Using nsmap, create a reordered ns structure */ 1260 for (i = 0, j = 0; i < node_max; i++) { 1261 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1262 tmp_vec2[i] = nodesz; 1263 j += nodesz; 1264 } 1265 PetscCall(PetscFree(nsmap)); 1266 PetscCall(PetscFree(tmp_vec1)); 1267 1268 /* Now use the correct ns */ 1269 ns = tmp_vec2; 1270 1271 do { 1272 sctx.newshift = PETSC_FALSE; 1273 /* Now loop over each block-row, and do the factorization */ 1274 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1275 nodesz = ns[inod]; 1276 1277 switch (nodesz) { 1278 case 1: 1279 /* zero rtmp1 */ 1280 /* L part */ 1281 nz = bi[i + 1] - bi[i]; 1282 bjtmp = bj + bi[i]; 1283 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1284 1285 /* U part */ 1286 nz = bdiag[i] - bdiag[i + 1]; 1287 bjtmp = bj + bdiag[i + 1] + 1; 1288 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1289 1290 /* load in initial (unfactored row) */ 1291 nz = ai[r[i] + 1] - ai[r[i]]; 1292 ajtmp = aj + ai[r[i]]; 1293 v = aa + ai[r[i]]; 1294 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1295 1296 /* ZeropivotApply() */ 1297 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1298 1299 /* elimination */ 1300 bjtmp = bj + bi[i]; 1301 row = *bjtmp++; 1302 nzL = bi[i + 1] - bi[i]; 1303 for (k = 0; k < nzL; k++) { 1304 pc = rtmp1 + row; 1305 if (*pc != 0.0) { 1306 pv = b->a + bdiag[row]; 1307 mul1 = *pc * (*pv); 1308 *pc = mul1; 1309 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1310 pv = b->a + bdiag[row + 1] + 1; 1311 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1312 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1313 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1314 } 1315 row = *bjtmp++; 1316 } 1317 1318 /* finished row so stick it into b->a */ 1319 rs = 0.0; 1320 /* L part */ 1321 pv = b->a + bi[i]; 1322 pj = b->j + bi[i]; 1323 nz = bi[i + 1] - bi[i]; 1324 for (j = 0; j < nz; j++) { 1325 pv[j] = rtmp1[pj[j]]; 1326 rs += PetscAbsScalar(pv[j]); 1327 } 1328 1329 /* U part */ 1330 pv = b->a + bdiag[i + 1] + 1; 1331 pj = b->j + bdiag[i + 1] + 1; 1332 nz = bdiag[i] - bdiag[i + 1] - 1; 1333 for (j = 0; j < nz; j++) { 1334 pv[j] = rtmp1[pj[j]]; 1335 rs += PetscAbsScalar(pv[j]); 1336 } 1337 1338 /* Check zero pivot */ 1339 sctx.rs = rs; 1340 sctx.pv = rtmp1[i]; 1341 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1342 if (sctx.newshift) break; 1343 1344 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1345 pv = b->a + bdiag[i]; 1346 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1347 break; 1348 1349 case 2: 1350 /* zero rtmp1 and rtmp2 */ 1351 /* L part */ 1352 nz = bi[i + 1] - bi[i]; 1353 bjtmp = bj + bi[i]; 1354 for (j = 0; j < nz; j++) { 1355 col = bjtmp[j]; 1356 rtmp1[col] = 0.0; 1357 rtmp2[col] = 0.0; 1358 } 1359 1360 /* U part */ 1361 nz = bdiag[i] - bdiag[i + 1]; 1362 bjtmp = bj + bdiag[i + 1] + 1; 1363 for (j = 0; j < nz; j++) { 1364 col = bjtmp[j]; 1365 rtmp1[col] = 0.0; 1366 rtmp2[col] = 0.0; 1367 } 1368 1369 /* load in initial (unfactored row) */ 1370 nz = ai[r[i] + 1] - ai[r[i]]; 1371 ajtmp = aj + ai[r[i]]; 1372 v1 = aa + ai[r[i]]; 1373 v2 = aa + ai[r[i] + 1]; 1374 for (j = 0; j < nz; j++) { 1375 col = ics[ajtmp[j]]; 1376 rtmp1[col] = v1[j]; 1377 rtmp2[col] = v2[j]; 1378 } 1379 /* ZeropivotApply(): shift the diagonal of the matrix */ 1380 rtmp1[i] += sctx.shift_amount; 1381 rtmp2[i + 1] += sctx.shift_amount; 1382 1383 /* elimination */ 1384 bjtmp = bj + bi[i]; 1385 row = *bjtmp++; /* pivot row */ 1386 nzL = bi[i + 1] - bi[i]; 1387 for (k = 0; k < nzL; k++) { 1388 pc1 = rtmp1 + row; 1389 pc2 = rtmp2 + row; 1390 if (*pc1 != 0.0 || *pc2 != 0.0) { 1391 pv = b->a + bdiag[row]; 1392 mul1 = *pc1 * (*pv); 1393 mul2 = *pc2 * (*pv); 1394 *pc1 = mul1; 1395 *pc2 = mul2; 1396 1397 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1398 pv = b->a + bdiag[row + 1] + 1; 1399 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1400 for (j = 0; j < nz; j++) { 1401 col = pj[j]; 1402 rtmp1[col] -= mul1 * pv[j]; 1403 rtmp2[col] -= mul2 * pv[j]; 1404 } 1405 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1406 } 1407 row = *bjtmp++; 1408 } 1409 1410 /* finished row i; check zero pivot, then stick row i into b->a */ 1411 rs = 0.0; 1412 /* L part */ 1413 pc1 = b->a + bi[i]; 1414 pj = b->j + bi[i]; 1415 nz = bi[i + 1] - bi[i]; 1416 for (j = 0; j < nz; j++) { 1417 col = pj[j]; 1418 pc1[j] = rtmp1[col]; 1419 rs += PetscAbsScalar(pc1[j]); 1420 } 1421 /* U part */ 1422 pc1 = b->a + bdiag[i + 1] + 1; 1423 pj = b->j + bdiag[i + 1] + 1; 1424 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1425 for (j = 0; j < nz; j++) { 1426 col = pj[j]; 1427 pc1[j] = rtmp1[col]; 1428 rs += PetscAbsScalar(pc1[j]); 1429 } 1430 1431 sctx.rs = rs; 1432 sctx.pv = rtmp1[i]; 1433 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1434 if (sctx.newshift) break; 1435 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1436 *pc1 = 1.0 / sctx.pv; 1437 1438 /* Now take care of diagonal 2x2 block. */ 1439 pc2 = rtmp2 + i; 1440 if (*pc2 != 0.0) { 1441 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1442 *pc2 = mul1; /* insert L entry */ 1443 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1444 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1445 for (j = 0; j < nz; j++) { 1446 col = pj[j]; 1447 rtmp2[col] -= mul1 * rtmp1[col]; 1448 } 1449 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1450 } 1451 1452 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1453 rs = 0.0; 1454 /* L part */ 1455 pc2 = b->a + bi[i + 1]; 1456 pj = b->j + bi[i + 1]; 1457 nz = bi[i + 2] - bi[i + 1]; 1458 for (j = 0; j < nz; j++) { 1459 col = pj[j]; 1460 pc2[j] = rtmp2[col]; 1461 rs += PetscAbsScalar(pc2[j]); 1462 } 1463 /* U part */ 1464 pc2 = b->a + bdiag[i + 2] + 1; 1465 pj = b->j + bdiag[i + 2] + 1; 1466 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1467 for (j = 0; j < nz; j++) { 1468 col = pj[j]; 1469 pc2[j] = rtmp2[col]; 1470 rs += PetscAbsScalar(pc2[j]); 1471 } 1472 1473 sctx.rs = rs; 1474 sctx.pv = rtmp2[i + 1]; 1475 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1476 if (sctx.newshift) break; 1477 pc2 = b->a + bdiag[i + 1]; 1478 *pc2 = 1.0 / sctx.pv; 1479 break; 1480 1481 case 3: 1482 /* zero rtmp */ 1483 /* L part */ 1484 nz = bi[i + 1] - bi[i]; 1485 bjtmp = bj + bi[i]; 1486 for (j = 0; j < nz; j++) { 1487 col = bjtmp[j]; 1488 rtmp1[col] = 0.0; 1489 rtmp2[col] = 0.0; 1490 rtmp3[col] = 0.0; 1491 } 1492 1493 /* U part */ 1494 nz = bdiag[i] - bdiag[i + 1]; 1495 bjtmp = bj + bdiag[i + 1] + 1; 1496 for (j = 0; j < nz; j++) { 1497 col = bjtmp[j]; 1498 rtmp1[col] = 0.0; 1499 rtmp2[col] = 0.0; 1500 rtmp3[col] = 0.0; 1501 } 1502 1503 /* load in initial (unfactored row) */ 1504 nz = ai[r[i] + 1] - ai[r[i]]; 1505 ajtmp = aj + ai[r[i]]; 1506 v1 = aa + ai[r[i]]; 1507 v2 = aa + ai[r[i] + 1]; 1508 v3 = aa + ai[r[i] + 2]; 1509 for (j = 0; j < nz; j++) { 1510 col = ics[ajtmp[j]]; 1511 rtmp1[col] = v1[j]; 1512 rtmp2[col] = v2[j]; 1513 rtmp3[col] = v3[j]; 1514 } 1515 /* ZeropivotApply(): shift the diagonal of the matrix */ 1516 rtmp1[i] += sctx.shift_amount; 1517 rtmp2[i + 1] += sctx.shift_amount; 1518 rtmp3[i + 2] += sctx.shift_amount; 1519 1520 /* elimination */ 1521 bjtmp = bj + bi[i]; 1522 row = *bjtmp++; /* pivot row */ 1523 nzL = bi[i + 1] - bi[i]; 1524 for (k = 0; k < nzL; k++) { 1525 pc1 = rtmp1 + row; 1526 pc2 = rtmp2 + row; 1527 pc3 = rtmp3 + row; 1528 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1529 pv = b->a + bdiag[row]; 1530 mul1 = *pc1 * (*pv); 1531 mul2 = *pc2 * (*pv); 1532 mul3 = *pc3 * (*pv); 1533 *pc1 = mul1; 1534 *pc2 = mul2; 1535 *pc3 = mul3; 1536 1537 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1538 pv = b->a + bdiag[row + 1] + 1; 1539 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1540 for (j = 0; j < nz; j++) { 1541 col = pj[j]; 1542 rtmp1[col] -= mul1 * pv[j]; 1543 rtmp2[col] -= mul2 * pv[j]; 1544 rtmp3[col] -= mul3 * pv[j]; 1545 } 1546 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1547 } 1548 row = *bjtmp++; 1549 } 1550 1551 /* finished row i; check zero pivot, then stick row i into b->a */ 1552 rs = 0.0; 1553 /* L part */ 1554 pc1 = b->a + bi[i]; 1555 pj = b->j + bi[i]; 1556 nz = bi[i + 1] - bi[i]; 1557 for (j = 0; j < nz; j++) { 1558 col = pj[j]; 1559 pc1[j] = rtmp1[col]; 1560 rs += PetscAbsScalar(pc1[j]); 1561 } 1562 /* U part */ 1563 pc1 = b->a + bdiag[i + 1] + 1; 1564 pj = b->j + bdiag[i + 1] + 1; 1565 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1566 for (j = 0; j < nz; j++) { 1567 col = pj[j]; 1568 pc1[j] = rtmp1[col]; 1569 rs += PetscAbsScalar(pc1[j]); 1570 } 1571 1572 sctx.rs = rs; 1573 sctx.pv = rtmp1[i]; 1574 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1575 if (sctx.newshift) break; 1576 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1577 *pc1 = 1.0 / sctx.pv; 1578 1579 /* Now take care of 1st column of diagonal 3x3 block. */ 1580 pc2 = rtmp2 + i; 1581 pc3 = rtmp3 + i; 1582 if (*pc2 != 0.0 || *pc3 != 0.0) { 1583 mul2 = (*pc2) * (*pc1); 1584 *pc2 = mul2; 1585 mul3 = (*pc3) * (*pc1); 1586 *pc3 = mul3; 1587 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1588 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1589 for (j = 0; j < nz; j++) { 1590 col = pj[j]; 1591 rtmp2[col] -= mul2 * rtmp1[col]; 1592 rtmp3[col] -= mul3 * rtmp1[col]; 1593 } 1594 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1595 } 1596 1597 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1598 rs = 0.0; 1599 /* L part */ 1600 pc2 = b->a + bi[i + 1]; 1601 pj = b->j + bi[i + 1]; 1602 nz = bi[i + 2] - bi[i + 1]; 1603 for (j = 0; j < nz; j++) { 1604 col = pj[j]; 1605 pc2[j] = rtmp2[col]; 1606 rs += PetscAbsScalar(pc2[j]); 1607 } 1608 /* U part */ 1609 pc2 = b->a + bdiag[i + 2] + 1; 1610 pj = b->j + bdiag[i + 2] + 1; 1611 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1612 for (j = 0; j < nz; j++) { 1613 col = pj[j]; 1614 pc2[j] = rtmp2[col]; 1615 rs += PetscAbsScalar(pc2[j]); 1616 } 1617 1618 sctx.rs = rs; 1619 sctx.pv = rtmp2[i + 1]; 1620 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1621 if (sctx.newshift) break; 1622 pc2 = b->a + bdiag[i + 1]; 1623 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1624 1625 /* Now take care of 2nd column of diagonal 3x3 block. */ 1626 pc3 = rtmp3 + i + 1; 1627 if (*pc3 != 0.0) { 1628 mul3 = (*pc3) * (*pc2); 1629 *pc3 = mul3; 1630 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1631 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1632 for (j = 0; j < nz; j++) { 1633 col = pj[j]; 1634 rtmp3[col] -= mul3 * rtmp2[col]; 1635 } 1636 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1637 } 1638 1639 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1640 rs = 0.0; 1641 /* L part */ 1642 pc3 = b->a + bi[i + 2]; 1643 pj = b->j + bi[i + 2]; 1644 nz = bi[i + 3] - bi[i + 2]; 1645 for (j = 0; j < nz; j++) { 1646 col = pj[j]; 1647 pc3[j] = rtmp3[col]; 1648 rs += PetscAbsScalar(pc3[j]); 1649 } 1650 /* U part */ 1651 pc3 = b->a + bdiag[i + 3] + 1; 1652 pj = b->j + bdiag[i + 3] + 1; 1653 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1654 for (j = 0; j < nz; j++) { 1655 col = pj[j]; 1656 pc3[j] = rtmp3[col]; 1657 rs += PetscAbsScalar(pc3[j]); 1658 } 1659 1660 sctx.rs = rs; 1661 sctx.pv = rtmp3[i + 2]; 1662 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1663 if (sctx.newshift) break; 1664 pc3 = b->a + bdiag[i + 2]; 1665 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1666 break; 1667 case 4: 1668 /* zero rtmp */ 1669 /* L part */ 1670 nz = bi[i + 1] - bi[i]; 1671 bjtmp = bj + bi[i]; 1672 for (j = 0; j < nz; j++) { 1673 col = bjtmp[j]; 1674 rtmp1[col] = 0.0; 1675 rtmp2[col] = 0.0; 1676 rtmp3[col] = 0.0; 1677 rtmp4[col] = 0.0; 1678 } 1679 1680 /* U part */ 1681 nz = bdiag[i] - bdiag[i + 1]; 1682 bjtmp = bj + bdiag[i + 1] + 1; 1683 for (j = 0; j < nz; j++) { 1684 col = bjtmp[j]; 1685 rtmp1[col] = 0.0; 1686 rtmp2[col] = 0.0; 1687 rtmp3[col] = 0.0; 1688 rtmp4[col] = 0.0; 1689 } 1690 1691 /* load in initial (unfactored row) */ 1692 nz = ai[r[i] + 1] - ai[r[i]]; 1693 ajtmp = aj + ai[r[i]]; 1694 v1 = aa + ai[r[i]]; 1695 v2 = aa + ai[r[i] + 1]; 1696 v3 = aa + ai[r[i] + 2]; 1697 v4 = aa + ai[r[i] + 3]; 1698 for (j = 0; j < nz; j++) { 1699 col = ics[ajtmp[j]]; 1700 rtmp1[col] = v1[j]; 1701 rtmp2[col] = v2[j]; 1702 rtmp3[col] = v3[j]; 1703 rtmp4[col] = v4[j]; 1704 } 1705 /* ZeropivotApply(): shift the diagonal of the matrix */ 1706 rtmp1[i] += sctx.shift_amount; 1707 rtmp2[i + 1] += sctx.shift_amount; 1708 rtmp3[i + 2] += sctx.shift_amount; 1709 rtmp4[i + 3] += sctx.shift_amount; 1710 1711 /* elimination */ 1712 bjtmp = bj + bi[i]; 1713 row = *bjtmp++; /* pivot row */ 1714 nzL = bi[i + 1] - bi[i]; 1715 for (k = 0; k < nzL; k++) { 1716 pc1 = rtmp1 + row; 1717 pc2 = rtmp2 + row; 1718 pc3 = rtmp3 + row; 1719 pc4 = rtmp4 + row; 1720 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1721 pv = b->a + bdiag[row]; 1722 mul1 = *pc1 * (*pv); 1723 mul2 = *pc2 * (*pv); 1724 mul3 = *pc3 * (*pv); 1725 mul4 = *pc4 * (*pv); 1726 *pc1 = mul1; 1727 *pc2 = mul2; 1728 *pc3 = mul3; 1729 *pc4 = mul4; 1730 1731 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1732 pv = b->a + bdiag[row + 1] + 1; 1733 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1734 for (j = 0; j < nz; j++) { 1735 col = pj[j]; 1736 rtmp1[col] -= mul1 * pv[j]; 1737 rtmp2[col] -= mul2 * pv[j]; 1738 rtmp3[col] -= mul3 * pv[j]; 1739 rtmp4[col] -= mul4 * pv[j]; 1740 } 1741 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1742 } 1743 row = *bjtmp++; 1744 } 1745 1746 /* finished row i; check zero pivot, then stick row i into b->a */ 1747 rs = 0.0; 1748 /* L part */ 1749 pc1 = b->a + bi[i]; 1750 pj = b->j + bi[i]; 1751 nz = bi[i + 1] - bi[i]; 1752 for (j = 0; j < nz; j++) { 1753 col = pj[j]; 1754 pc1[j] = rtmp1[col]; 1755 rs += PetscAbsScalar(pc1[j]); 1756 } 1757 /* U part */ 1758 pc1 = b->a + bdiag[i + 1] + 1; 1759 pj = b->j + bdiag[i + 1] + 1; 1760 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1761 for (j = 0; j < nz; j++) { 1762 col = pj[j]; 1763 pc1[j] = rtmp1[col]; 1764 rs += PetscAbsScalar(pc1[j]); 1765 } 1766 1767 sctx.rs = rs; 1768 sctx.pv = rtmp1[i]; 1769 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1770 if (sctx.newshift) break; 1771 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1772 *pc1 = 1.0 / sctx.pv; 1773 1774 /* Now take care of 1st column of diagonal 4x4 block. */ 1775 pc2 = rtmp2 + i; 1776 pc3 = rtmp3 + i; 1777 pc4 = rtmp4 + i; 1778 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1779 mul2 = (*pc2) * (*pc1); 1780 *pc2 = mul2; 1781 mul3 = (*pc3) * (*pc1); 1782 *pc3 = mul3; 1783 mul4 = (*pc4) * (*pc1); 1784 *pc4 = mul4; 1785 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1786 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1787 for (j = 0; j < nz; j++) { 1788 col = pj[j]; 1789 rtmp2[col] -= mul2 * rtmp1[col]; 1790 rtmp3[col] -= mul3 * rtmp1[col]; 1791 rtmp4[col] -= mul4 * rtmp1[col]; 1792 } 1793 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1794 } 1795 1796 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1797 rs = 0.0; 1798 /* L part */ 1799 pc2 = b->a + bi[i + 1]; 1800 pj = b->j + bi[i + 1]; 1801 nz = bi[i + 2] - bi[i + 1]; 1802 for (j = 0; j < nz; j++) { 1803 col = pj[j]; 1804 pc2[j] = rtmp2[col]; 1805 rs += PetscAbsScalar(pc2[j]); 1806 } 1807 /* U part */ 1808 pc2 = b->a + bdiag[i + 2] + 1; 1809 pj = b->j + bdiag[i + 2] + 1; 1810 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1811 for (j = 0; j < nz; j++) { 1812 col = pj[j]; 1813 pc2[j] = rtmp2[col]; 1814 rs += PetscAbsScalar(pc2[j]); 1815 } 1816 1817 sctx.rs = rs; 1818 sctx.pv = rtmp2[i + 1]; 1819 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1820 if (sctx.newshift) break; 1821 pc2 = b->a + bdiag[i + 1]; 1822 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1823 1824 /* Now take care of 2nd column of diagonal 4x4 block. */ 1825 pc3 = rtmp3 + i + 1; 1826 pc4 = rtmp4 + i + 1; 1827 if (*pc3 != 0.0 || *pc4 != 0.0) { 1828 mul3 = (*pc3) * (*pc2); 1829 *pc3 = mul3; 1830 mul4 = (*pc4) * (*pc2); 1831 *pc4 = mul4; 1832 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1833 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1834 for (j = 0; j < nz; j++) { 1835 col = pj[j]; 1836 rtmp3[col] -= mul3 * rtmp2[col]; 1837 rtmp4[col] -= mul4 * rtmp2[col]; 1838 } 1839 PetscCall(PetscLogFlops(4.0 * nz)); 1840 } 1841 1842 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1843 rs = 0.0; 1844 /* L part */ 1845 pc3 = b->a + bi[i + 2]; 1846 pj = b->j + bi[i + 2]; 1847 nz = bi[i + 3] - bi[i + 2]; 1848 for (j = 0; j < nz; j++) { 1849 col = pj[j]; 1850 pc3[j] = rtmp3[col]; 1851 rs += PetscAbsScalar(pc3[j]); 1852 } 1853 /* U part */ 1854 pc3 = b->a + bdiag[i + 3] + 1; 1855 pj = b->j + bdiag[i + 3] + 1; 1856 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1857 for (j = 0; j < nz; j++) { 1858 col = pj[j]; 1859 pc3[j] = rtmp3[col]; 1860 rs += PetscAbsScalar(pc3[j]); 1861 } 1862 1863 sctx.rs = rs; 1864 sctx.pv = rtmp3[i + 2]; 1865 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1866 if (sctx.newshift) break; 1867 pc3 = b->a + bdiag[i + 2]; 1868 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1869 1870 /* Now take care of 3rd column of diagonal 4x4 block. */ 1871 pc4 = rtmp4 + i + 2; 1872 if (*pc4 != 0.0) { 1873 mul4 = (*pc4) * (*pc3); 1874 *pc4 = mul4; 1875 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1876 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1877 for (j = 0; j < nz; j++) { 1878 col = pj[j]; 1879 rtmp4[col] -= mul4 * rtmp3[col]; 1880 } 1881 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1882 } 1883 1884 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1885 rs = 0.0; 1886 /* L part */ 1887 pc4 = b->a + bi[i + 3]; 1888 pj = b->j + bi[i + 3]; 1889 nz = bi[i + 4] - bi[i + 3]; 1890 for (j = 0; j < nz; j++) { 1891 col = pj[j]; 1892 pc4[j] = rtmp4[col]; 1893 rs += PetscAbsScalar(pc4[j]); 1894 } 1895 /* U part */ 1896 pc4 = b->a + bdiag[i + 4] + 1; 1897 pj = b->j + bdiag[i + 4] + 1; 1898 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1899 for (j = 0; j < nz; j++) { 1900 col = pj[j]; 1901 pc4[j] = rtmp4[col]; 1902 rs += PetscAbsScalar(pc4[j]); 1903 } 1904 1905 sctx.rs = rs; 1906 sctx.pv = rtmp4[i + 3]; 1907 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1908 if (sctx.newshift) break; 1909 pc4 = b->a + bdiag[i + 3]; 1910 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1911 break; 1912 1913 default: 1914 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1915 } 1916 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1917 i += nodesz; /* Update the row */ 1918 } 1919 1920 /* MatPivotRefine() */ 1921 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1922 /* 1923 * if no shift in this attempt & shifting & started shifting & can refine, 1924 * then try lower shift 1925 */ 1926 sctx.shift_hi = sctx.shift_fraction; 1927 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1928 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1929 sctx.newshift = PETSC_TRUE; 1930 sctx.nshift++; 1931 } 1932 } while (sctx.newshift); 1933 1934 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1935 PetscCall(PetscFree(tmp_vec2)); 1936 PetscCall(ISRestoreIndices(isicol, &ic)); 1937 PetscCall(ISRestoreIndices(isrow, &r)); 1938 1939 if (b->inode.size) { 1940 C->ops->solve = MatSolve_SeqAIJ_Inode; 1941 } else { 1942 C->ops->solve = MatSolve_SeqAIJ; 1943 } 1944 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1945 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1946 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1947 C->ops->matsolve = MatMatSolve_SeqAIJ; 1948 C->assembled = PETSC_TRUE; 1949 C->preallocated = PETSC_TRUE; 1950 1951 PetscCall(PetscLogFlops(C->cmap->n)); 1952 1953 /* MatShiftView(A,info,&sctx) */ 1954 if (sctx.nshift) { 1955 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1956 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1957 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1958 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1959 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1960 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1961 } 1962 } 1963 PetscFunctionReturn(PETSC_SUCCESS); 1964 } 1965 1966 #if 0 1967 // unused 1968 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1969 { 1970 Mat C = B; 1971 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1972 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1973 const PetscInt *r, *ic, *c, *ics; 1974 PetscInt n = A->rmap->n, *bi = b->i; 1975 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1976 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1977 PetscInt *ai = a->i, *aj = a->j; 1978 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1979 PetscScalar mul1, mul2, mul3, tmp; 1980 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1981 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1982 PetscReal rs = 0.0; 1983 FactorShiftCtx sctx; 1984 1985 PetscFunctionBegin; 1986 sctx.shift_top = 0; 1987 sctx.nshift_max = 0; 1988 sctx.shift_lo = 0; 1989 sctx.shift_hi = 0; 1990 sctx.shift_fraction = 0; 1991 1992 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1993 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1994 sctx.shift_top = 0; 1995 for (i = 0; i < n; i++) { 1996 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1997 rs = 0.0; 1998 ajtmp = aj + ai[i]; 1999 rtmp1 = aa + ai[i]; 2000 nz = ai[i + 1] - ai[i]; 2001 for (j = 0; j < nz; j++) { 2002 if (*ajtmp != i) { 2003 rs += PetscAbsScalar(*rtmp1++); 2004 } else { 2005 rs -= PetscRealPart(*rtmp1++); 2006 } 2007 ajtmp++; 2008 } 2009 if (rs > sctx.shift_top) sctx.shift_top = rs; 2010 } 2011 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2012 sctx.shift_top *= 1.1; 2013 sctx.nshift_max = 5; 2014 sctx.shift_lo = 0.; 2015 sctx.shift_hi = 1.; 2016 } 2017 sctx.shift_amount = 0; 2018 sctx.nshift = 0; 2019 2020 PetscCall(ISGetIndices(isrow, &r)); 2021 PetscCall(ISGetIndices(iscol, &c)); 2022 PetscCall(ISGetIndices(isicol, &ic)); 2023 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2024 ics = ic; 2025 2026 node_max = a->inode.node_count; 2027 ns = a->inode.size; 2028 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2029 2030 /* If max inode size > 3, split it into two inodes.*/ 2031 /* also map the inode sizes according to the ordering */ 2032 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2033 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2034 if (ns[i] > 3) { 2035 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2036 ++j; 2037 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2038 } else { 2039 tmp_vec1[j] = ns[i]; 2040 } 2041 } 2042 /* Use the correct node_max */ 2043 node_max = j; 2044 2045 /* Now reorder the inode info based on mat re-ordering info */ 2046 /* First create a row -> inode_size_array_index map */ 2047 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2048 for (i = 0, row = 0; i < node_max; i++) { 2049 nodesz = tmp_vec1[i]; 2050 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2051 } 2052 /* Using nsmap, create a reordered ns structure */ 2053 for (i = 0, j = 0; i < node_max; i++) { 2054 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2055 tmp_vec2[i] = nodesz; 2056 j += nodesz; 2057 } 2058 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2059 /* Now use the correct ns */ 2060 ns = tmp_vec2; 2061 2062 do { 2063 sctx.newshift = PETSC_FALSE; 2064 /* Now loop over each block-row, and do the factorization */ 2065 for (i = 0, row = 0; i < node_max; i++) { 2066 nodesz = ns[i]; 2067 nz = bi[row + 1] - bi[row]; 2068 bjtmp = bj + bi[row]; 2069 2070 switch (nodesz) { 2071 case 1: 2072 for (j = 0; j < nz; j++) { 2073 idx = bjtmp[j]; 2074 rtmp11[idx] = 0.0; 2075 } 2076 2077 /* load in initial (unfactored row) */ 2078 idx = r[row]; 2079 nz_tmp = ai[idx + 1] - ai[idx]; 2080 ajtmp = aj + ai[idx]; 2081 v1 = aa + ai[idx]; 2082 2083 for (j = 0; j < nz_tmp; j++) { 2084 idx = ics[ajtmp[j]]; 2085 rtmp11[idx] = v1[j]; 2086 } 2087 rtmp11[ics[r[row]]] += sctx.shift_amount; 2088 2089 prow = *bjtmp++; 2090 while (prow < row) { 2091 pc1 = rtmp11 + prow; 2092 if (*pc1 != 0.0) { 2093 pv = ba + bd[prow]; 2094 pj = nbj + bd[prow]; 2095 mul1 = *pc1 * *pv++; 2096 *pc1 = mul1; 2097 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2098 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2099 for (j = 0; j < nz_tmp; j++) { 2100 tmp = pv[j]; 2101 idx = pj[j]; 2102 rtmp11[idx] -= mul1 * tmp; 2103 } 2104 } 2105 prow = *bjtmp++; 2106 } 2107 pj = bj + bi[row]; 2108 pc1 = ba + bi[row]; 2109 2110 sctx.pv = rtmp11[row]; 2111 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2112 rs = 0.0; 2113 for (j = 0; j < nz; j++) { 2114 idx = pj[j]; 2115 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2116 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2117 } 2118 sctx.rs = rs; 2119 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2120 if (sctx.newshift) goto endofwhile; 2121 break; 2122 2123 case 2: 2124 for (j = 0; j < nz; j++) { 2125 idx = bjtmp[j]; 2126 rtmp11[idx] = 0.0; 2127 rtmp22[idx] = 0.0; 2128 } 2129 2130 /* load in initial (unfactored row) */ 2131 idx = r[row]; 2132 nz_tmp = ai[idx + 1] - ai[idx]; 2133 ajtmp = aj + ai[idx]; 2134 v1 = aa + ai[idx]; 2135 v2 = aa + ai[idx + 1]; 2136 for (j = 0; j < nz_tmp; j++) { 2137 idx = ics[ajtmp[j]]; 2138 rtmp11[idx] = v1[j]; 2139 rtmp22[idx] = v2[j]; 2140 } 2141 rtmp11[ics[r[row]]] += sctx.shift_amount; 2142 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2143 2144 prow = *bjtmp++; 2145 while (prow < row) { 2146 pc1 = rtmp11 + prow; 2147 pc2 = rtmp22 + prow; 2148 if (*pc1 != 0.0 || *pc2 != 0.0) { 2149 pv = ba + bd[prow]; 2150 pj = nbj + bd[prow]; 2151 mul1 = *pc1 * *pv; 2152 mul2 = *pc2 * *pv; 2153 ++pv; 2154 *pc1 = mul1; 2155 *pc2 = mul2; 2156 2157 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2158 for (j = 0; j < nz_tmp; j++) { 2159 tmp = pv[j]; 2160 idx = pj[j]; 2161 rtmp11[idx] -= mul1 * tmp; 2162 rtmp22[idx] -= mul2 * tmp; 2163 } 2164 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2165 } 2166 prow = *bjtmp++; 2167 } 2168 2169 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2170 pc1 = rtmp11 + prow; 2171 pc2 = rtmp22 + prow; 2172 2173 sctx.pv = *pc1; 2174 pj = bj + bi[prow]; 2175 rs = 0.0; 2176 for (j = 0; j < nz; j++) { 2177 idx = pj[j]; 2178 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2179 } 2180 sctx.rs = rs; 2181 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2182 if (sctx.newshift) goto endofwhile; 2183 2184 if (*pc2 != 0.0) { 2185 pj = nbj + bd[prow]; 2186 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2187 *pc2 = mul2; 2188 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2189 for (j = 0; j < nz_tmp; j++) { 2190 idx = pj[j]; 2191 tmp = rtmp11[idx]; 2192 rtmp22[idx] -= mul2 * tmp; 2193 } 2194 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2195 } 2196 2197 pj = bj + bi[row]; 2198 pc1 = ba + bi[row]; 2199 pc2 = ba + bi[row + 1]; 2200 2201 sctx.pv = rtmp22[row + 1]; 2202 rs = 0.0; 2203 rtmp11[row] = 1.0 / rtmp11[row]; 2204 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2205 /* copy row entries from dense representation to sparse */ 2206 for (j = 0; j < nz; j++) { 2207 idx = pj[j]; 2208 pc1[j] = rtmp11[idx]; 2209 pc2[j] = rtmp22[idx]; 2210 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2211 } 2212 sctx.rs = rs; 2213 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2214 if (sctx.newshift) goto endofwhile; 2215 break; 2216 2217 case 3: 2218 for (j = 0; j < nz; j++) { 2219 idx = bjtmp[j]; 2220 rtmp11[idx] = 0.0; 2221 rtmp22[idx] = 0.0; 2222 rtmp33[idx] = 0.0; 2223 } 2224 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2225 idx = r[row]; 2226 nz_tmp = ai[idx + 1] - ai[idx]; 2227 ajtmp = aj + ai[idx]; 2228 v1 = aa + ai[idx]; 2229 v2 = aa + ai[idx + 1]; 2230 v3 = aa + ai[idx + 2]; 2231 for (j = 0; j < nz_tmp; j++) { 2232 idx = ics[ajtmp[j]]; 2233 rtmp11[idx] = v1[j]; 2234 rtmp22[idx] = v2[j]; 2235 rtmp33[idx] = v3[j]; 2236 } 2237 rtmp11[ics[r[row]]] += sctx.shift_amount; 2238 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2239 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2240 2241 /* loop over all pivot row blocks above this row block */ 2242 prow = *bjtmp++; 2243 while (prow < row) { 2244 pc1 = rtmp11 + prow; 2245 pc2 = rtmp22 + prow; 2246 pc3 = rtmp33 + prow; 2247 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2248 pv = ba + bd[prow]; 2249 pj = nbj + bd[prow]; 2250 mul1 = *pc1 * *pv; 2251 mul2 = *pc2 * *pv; 2252 mul3 = *pc3 * *pv; 2253 ++pv; 2254 *pc1 = mul1; 2255 *pc2 = mul2; 2256 *pc3 = mul3; 2257 2258 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2259 /* update this row based on pivot row */ 2260 for (j = 0; j < nz_tmp; j++) { 2261 tmp = pv[j]; 2262 idx = pj[j]; 2263 rtmp11[idx] -= mul1 * tmp; 2264 rtmp22[idx] -= mul2 * tmp; 2265 rtmp33[idx] -= mul3 * tmp; 2266 } 2267 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2268 } 2269 prow = *bjtmp++; 2270 } 2271 2272 /* Now take care of diagonal 3x3 block in this set of rows */ 2273 /* note: prow = row here */ 2274 pc1 = rtmp11 + prow; 2275 pc2 = rtmp22 + prow; 2276 pc3 = rtmp33 + prow; 2277 2278 sctx.pv = *pc1; 2279 pj = bj + bi[prow]; 2280 rs = 0.0; 2281 for (j = 0; j < nz; j++) { 2282 idx = pj[j]; 2283 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2284 } 2285 sctx.rs = rs; 2286 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2287 if (sctx.newshift) goto endofwhile; 2288 2289 if (*pc2 != 0.0 || *pc3 != 0.0) { 2290 mul2 = (*pc2) / (*pc1); 2291 mul3 = (*pc3) / (*pc1); 2292 *pc2 = mul2; 2293 *pc3 = mul3; 2294 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2295 pj = nbj + bd[prow]; 2296 for (j = 0; j < nz_tmp; j++) { 2297 idx = pj[j]; 2298 tmp = rtmp11[idx]; 2299 rtmp22[idx] -= mul2 * tmp; 2300 rtmp33[idx] -= mul3 * tmp; 2301 } 2302 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2303 } 2304 ++prow; 2305 2306 pc2 = rtmp22 + prow; 2307 pc3 = rtmp33 + prow; 2308 sctx.pv = *pc2; 2309 pj = bj + bi[prow]; 2310 rs = 0.0; 2311 for (j = 0; j < nz; j++) { 2312 idx = pj[j]; 2313 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2314 } 2315 sctx.rs = rs; 2316 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2317 if (sctx.newshift) goto endofwhile; 2318 2319 if (*pc3 != 0.0) { 2320 mul3 = (*pc3) / (*pc2); 2321 *pc3 = mul3; 2322 pj = nbj + bd[prow]; 2323 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2324 for (j = 0; j < nz_tmp; j++) { 2325 idx = pj[j]; 2326 tmp = rtmp22[idx]; 2327 rtmp33[idx] -= mul3 * tmp; 2328 } 2329 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2330 } 2331 2332 pj = bj + bi[row]; 2333 pc1 = ba + bi[row]; 2334 pc2 = ba + bi[row + 1]; 2335 pc3 = ba + bi[row + 2]; 2336 2337 sctx.pv = rtmp33[row + 2]; 2338 rs = 0.0; 2339 rtmp11[row] = 1.0 / rtmp11[row]; 2340 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2341 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2342 /* copy row entries from dense representation to sparse */ 2343 for (j = 0; j < nz; j++) { 2344 idx = pj[j]; 2345 pc1[j] = rtmp11[idx]; 2346 pc2[j] = rtmp22[idx]; 2347 pc3[j] = rtmp33[idx]; 2348 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2349 } 2350 2351 sctx.rs = rs; 2352 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2353 if (sctx.newshift) goto endofwhile; 2354 break; 2355 2356 default: 2357 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2358 } 2359 row += nodesz; /* Update the row */ 2360 } 2361 endofwhile:; 2362 } while (sctx.newshift); 2363 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2364 PetscCall(PetscFree(tmp_vec2)); 2365 PetscCall(ISRestoreIndices(isicol, &ic)); 2366 PetscCall(ISRestoreIndices(isrow, &r)); 2367 PetscCall(ISRestoreIndices(iscol, &c)); 2368 2369 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2370 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2371 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2372 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2373 C->assembled = PETSC_TRUE; 2374 C->preallocated = PETSC_TRUE; 2375 if (sctx.nshift) { 2376 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2377 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2378 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2379 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2380 } 2381 } 2382 PetscCall(PetscLogFlops(C->cmap->n)); 2383 PetscCall(MatSeqAIJCheckInode(C)); 2384 PetscFunctionReturn(PETSC_SUCCESS); 2385 } 2386 #endif 2387 2388 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2389 { 2390 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2391 IS iscol = a->col, isrow = a->row; 2392 const PetscInt *r, *c, *rout, *cout; 2393 PetscInt i, j, n = A->rmap->n; 2394 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2395 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2396 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2397 PetscScalar sum1, sum2, sum3, sum4, sum5; 2398 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2399 const PetscScalar *b; 2400 2401 PetscFunctionBegin; 2402 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2403 node_max = a->inode.node_count; 2404 ns = a->inode.size; /* Node Size array */ 2405 2406 PetscCall(VecGetArrayRead(bb, &b)); 2407 PetscCall(VecGetArrayWrite(xx, &x)); 2408 tmp = a->solve_work; 2409 2410 PetscCall(ISGetIndices(isrow, &rout)); 2411 r = rout; 2412 PetscCall(ISGetIndices(iscol, &cout)); 2413 c = cout; 2414 2415 /* forward solve the lower triangular */ 2416 tmps = tmp; 2417 aa = a_a; 2418 aj = a_j; 2419 ad = a->diag; 2420 2421 for (i = 0, row = 0; i < node_max; ++i) { 2422 nsz = ns[i]; 2423 aii = ai[row]; 2424 v1 = aa + aii; 2425 vi = aj + aii; 2426 nz = ai[row + 1] - ai[row]; 2427 2428 if (i < node_max - 1) { 2429 /* Prefetch the indices for the next block */ 2430 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2431 /* Prefetch the data for the next block */ 2432 PetscPrefetchBlock(aa + ai[row + nsz], ai[row + nsz + ns[i + 1]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2433 } 2434 2435 switch (nsz) { /* Each loop in 'case' is unrolled */ 2436 case 1: 2437 sum1 = b[r[row]]; 2438 for (j = 0; j < nz - 1; j += 2) { 2439 i0 = vi[j]; 2440 i1 = vi[j + 1]; 2441 tmp0 = tmps[i0]; 2442 tmp1 = tmps[i1]; 2443 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2444 } 2445 if (j == nz - 1) { 2446 tmp0 = tmps[vi[j]]; 2447 sum1 -= v1[j] * tmp0; 2448 } 2449 tmp[row++] = sum1; 2450 break; 2451 case 2: 2452 sum1 = b[r[row]]; 2453 sum2 = b[r[row + 1]]; 2454 v2 = aa + ai[row + 1]; 2455 2456 for (j = 0; j < nz - 1; j += 2) { 2457 i0 = vi[j]; 2458 i1 = vi[j + 1]; 2459 tmp0 = tmps[i0]; 2460 tmp1 = tmps[i1]; 2461 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2462 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2463 } 2464 if (j == nz - 1) { 2465 tmp0 = tmps[vi[j]]; 2466 sum1 -= v1[j] * tmp0; 2467 sum2 -= v2[j] * tmp0; 2468 } 2469 sum2 -= v2[nz] * sum1; 2470 tmp[row++] = sum1; 2471 tmp[row++] = sum2; 2472 break; 2473 case 3: 2474 sum1 = b[r[row]]; 2475 sum2 = b[r[row + 1]]; 2476 sum3 = b[r[row + 2]]; 2477 v2 = aa + ai[row + 1]; 2478 v3 = aa + ai[row + 2]; 2479 2480 for (j = 0; j < nz - 1; j += 2) { 2481 i0 = vi[j]; 2482 i1 = vi[j + 1]; 2483 tmp0 = tmps[i0]; 2484 tmp1 = tmps[i1]; 2485 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2486 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2487 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2488 } 2489 if (j == nz - 1) { 2490 tmp0 = tmps[vi[j]]; 2491 sum1 -= v1[j] * tmp0; 2492 sum2 -= v2[j] * tmp0; 2493 sum3 -= v3[j] * tmp0; 2494 } 2495 sum2 -= v2[nz] * sum1; 2496 sum3 -= v3[nz] * sum1; 2497 sum3 -= v3[nz + 1] * sum2; 2498 tmp[row++] = sum1; 2499 tmp[row++] = sum2; 2500 tmp[row++] = sum3; 2501 break; 2502 2503 case 4: 2504 sum1 = b[r[row]]; 2505 sum2 = b[r[row + 1]]; 2506 sum3 = b[r[row + 2]]; 2507 sum4 = b[r[row + 3]]; 2508 v2 = aa + ai[row + 1]; 2509 v3 = aa + ai[row + 2]; 2510 v4 = aa + ai[row + 3]; 2511 2512 for (j = 0; j < nz - 1; j += 2) { 2513 i0 = vi[j]; 2514 i1 = vi[j + 1]; 2515 tmp0 = tmps[i0]; 2516 tmp1 = tmps[i1]; 2517 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2518 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2519 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2520 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2521 } 2522 if (j == nz - 1) { 2523 tmp0 = tmps[vi[j]]; 2524 sum1 -= v1[j] * tmp0; 2525 sum2 -= v2[j] * tmp0; 2526 sum3 -= v3[j] * tmp0; 2527 sum4 -= v4[j] * tmp0; 2528 } 2529 sum2 -= v2[nz] * sum1; 2530 sum3 -= v3[nz] * sum1; 2531 sum4 -= v4[nz] * sum1; 2532 sum3 -= v3[nz + 1] * sum2; 2533 sum4 -= v4[nz + 1] * sum2; 2534 sum4 -= v4[nz + 2] * sum3; 2535 2536 tmp[row++] = sum1; 2537 tmp[row++] = sum2; 2538 tmp[row++] = sum3; 2539 tmp[row++] = sum4; 2540 break; 2541 case 5: 2542 sum1 = b[r[row]]; 2543 sum2 = b[r[row + 1]]; 2544 sum3 = b[r[row + 2]]; 2545 sum4 = b[r[row + 3]]; 2546 sum5 = b[r[row + 4]]; 2547 v2 = aa + ai[row + 1]; 2548 v3 = aa + ai[row + 2]; 2549 v4 = aa + ai[row + 3]; 2550 v5 = aa + ai[row + 4]; 2551 2552 for (j = 0; j < nz - 1; j += 2) { 2553 i0 = vi[j]; 2554 i1 = vi[j + 1]; 2555 tmp0 = tmps[i0]; 2556 tmp1 = tmps[i1]; 2557 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2558 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2559 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2560 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2561 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2562 } 2563 if (j == nz - 1) { 2564 tmp0 = tmps[vi[j]]; 2565 sum1 -= v1[j] * tmp0; 2566 sum2 -= v2[j] * tmp0; 2567 sum3 -= v3[j] * tmp0; 2568 sum4 -= v4[j] * tmp0; 2569 sum5 -= v5[j] * tmp0; 2570 } 2571 2572 sum2 -= v2[nz] * sum1; 2573 sum3 -= v3[nz] * sum1; 2574 sum4 -= v4[nz] * sum1; 2575 sum5 -= v5[nz] * sum1; 2576 sum3 -= v3[nz + 1] * sum2; 2577 sum4 -= v4[nz + 1] * sum2; 2578 sum5 -= v5[nz + 1] * sum2; 2579 sum4 -= v4[nz + 2] * sum3; 2580 sum5 -= v5[nz + 2] * sum3; 2581 sum5 -= v5[nz + 3] * sum4; 2582 2583 tmp[row++] = sum1; 2584 tmp[row++] = sum2; 2585 tmp[row++] = sum3; 2586 tmp[row++] = sum4; 2587 tmp[row++] = sum5; 2588 break; 2589 default: 2590 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2591 } 2592 } 2593 /* backward solve the upper triangular */ 2594 for (i = node_max - 1, row = n - 1; i >= 0; i--) { 2595 nsz = ns[i]; 2596 aii = ad[row + 1] + 1; 2597 v1 = aa + aii; 2598 vi = aj + aii; 2599 nz = ad[row] - ad[row + 1] - 1; 2600 2601 if (i > 0) { 2602 /* Prefetch the indices for the next block */ 2603 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2604 /* Prefetch the data for the next block */ 2605 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[row - nsz - ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2606 } 2607 2608 switch (nsz) { /* Each loop in 'case' is unrolled */ 2609 case 1: 2610 sum1 = tmp[row]; 2611 2612 for (j = 0; j < nz - 1; j += 2) { 2613 i0 = vi[j]; 2614 i1 = vi[j + 1]; 2615 tmp0 = tmps[i0]; 2616 tmp1 = tmps[i1]; 2617 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2618 } 2619 if (j == nz - 1) { 2620 tmp0 = tmps[vi[j]]; 2621 sum1 -= v1[j] * tmp0; 2622 } 2623 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2624 row--; 2625 break; 2626 case 2: 2627 sum1 = tmp[row]; 2628 sum2 = tmp[row - 1]; 2629 v2 = aa + ad[row] + 1; 2630 for (j = 0; j < nz - 1; j += 2) { 2631 i0 = vi[j]; 2632 i1 = vi[j + 1]; 2633 tmp0 = tmps[i0]; 2634 tmp1 = tmps[i1]; 2635 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2636 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2637 } 2638 if (j == nz - 1) { 2639 tmp0 = tmps[vi[j]]; 2640 sum1 -= v1[j] * tmp0; 2641 sum2 -= v2[j + 1] * tmp0; 2642 } 2643 2644 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2645 row--; 2646 sum2 -= v2[0] * tmp0; 2647 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2648 row--; 2649 break; 2650 case 3: 2651 sum1 = tmp[row]; 2652 sum2 = tmp[row - 1]; 2653 sum3 = tmp[row - 2]; 2654 v2 = aa + ad[row] + 1; 2655 v3 = aa + ad[row - 1] + 1; 2656 for (j = 0; j < nz - 1; j += 2) { 2657 i0 = vi[j]; 2658 i1 = vi[j + 1]; 2659 tmp0 = tmps[i0]; 2660 tmp1 = tmps[i1]; 2661 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2662 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2663 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2664 } 2665 if (j == nz - 1) { 2666 tmp0 = tmps[vi[j]]; 2667 sum1 -= v1[j] * tmp0; 2668 sum2 -= v2[j + 1] * tmp0; 2669 sum3 -= v3[j + 2] * tmp0; 2670 } 2671 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2672 row--; 2673 sum2 -= v2[0] * tmp0; 2674 sum3 -= v3[1] * tmp0; 2675 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2676 row--; 2677 sum3 -= v3[0] * tmp0; 2678 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2679 row--; 2680 2681 break; 2682 case 4: 2683 sum1 = tmp[row]; 2684 sum2 = tmp[row - 1]; 2685 sum3 = tmp[row - 2]; 2686 sum4 = tmp[row - 3]; 2687 v2 = aa + ad[row] + 1; 2688 v3 = aa + ad[row - 1] + 1; 2689 v4 = aa + ad[row - 2] + 1; 2690 2691 for (j = 0; j < nz - 1; j += 2) { 2692 i0 = vi[j]; 2693 i1 = vi[j + 1]; 2694 tmp0 = tmps[i0]; 2695 tmp1 = tmps[i1]; 2696 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2697 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2698 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2699 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2700 } 2701 if (j == nz - 1) { 2702 tmp0 = tmps[vi[j]]; 2703 sum1 -= v1[j] * tmp0; 2704 sum2 -= v2[j + 1] * tmp0; 2705 sum3 -= v3[j + 2] * tmp0; 2706 sum4 -= v4[j + 3] * tmp0; 2707 } 2708 2709 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2710 row--; 2711 sum2 -= v2[0] * tmp0; 2712 sum3 -= v3[1] * tmp0; 2713 sum4 -= v4[2] * tmp0; 2714 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2715 row--; 2716 sum3 -= v3[0] * tmp0; 2717 sum4 -= v4[1] * tmp0; 2718 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2719 row--; 2720 sum4 -= v4[0] * tmp0; 2721 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2722 row--; 2723 break; 2724 case 5: 2725 sum1 = tmp[row]; 2726 sum2 = tmp[row - 1]; 2727 sum3 = tmp[row - 2]; 2728 sum4 = tmp[row - 3]; 2729 sum5 = tmp[row - 4]; 2730 v2 = aa + ad[row] + 1; 2731 v3 = aa + ad[row - 1] + 1; 2732 v4 = aa + ad[row - 2] + 1; 2733 v5 = aa + ad[row - 3] + 1; 2734 for (j = 0; j < nz - 1; j += 2) { 2735 i0 = vi[j]; 2736 i1 = vi[j + 1]; 2737 tmp0 = tmps[i0]; 2738 tmp1 = tmps[i1]; 2739 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2740 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2741 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2742 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2743 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2744 } 2745 if (j == nz - 1) { 2746 tmp0 = tmps[vi[j]]; 2747 sum1 -= v1[j] * tmp0; 2748 sum2 -= v2[j + 1] * tmp0; 2749 sum3 -= v3[j + 2] * tmp0; 2750 sum4 -= v4[j + 3] * tmp0; 2751 sum5 -= v5[j + 4] * tmp0; 2752 } 2753 2754 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2755 row--; 2756 sum2 -= v2[0] * tmp0; 2757 sum3 -= v3[1] * tmp0; 2758 sum4 -= v4[2] * tmp0; 2759 sum5 -= v5[3] * tmp0; 2760 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2761 row--; 2762 sum3 -= v3[0] * tmp0; 2763 sum4 -= v4[1] * tmp0; 2764 sum5 -= v5[2] * tmp0; 2765 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2766 row--; 2767 sum4 -= v4[0] * tmp0; 2768 sum5 -= v5[1] * tmp0; 2769 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2770 row--; 2771 sum5 -= v5[0] * tmp0; 2772 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2773 row--; 2774 break; 2775 default: 2776 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2777 } 2778 } 2779 PetscCall(ISRestoreIndices(isrow, &rout)); 2780 PetscCall(ISRestoreIndices(iscol, &cout)); 2781 PetscCall(VecRestoreArrayRead(bb, &b)); 2782 PetscCall(VecRestoreArrayWrite(xx, &x)); 2783 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2784 PetscFunctionReturn(PETSC_SUCCESS); 2785 } 2786 2787 /* 2788 Makes a longer coloring[] array and calls the usual code with that 2789 */ 2790 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2791 { 2792 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2793 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size, row; 2794 PetscInt *colorused, i; 2795 ISColoringValue *newcolor; 2796 2797 PetscFunctionBegin; 2798 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2799 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2800 /* loop over inodes, marking a color for each column*/ 2801 row = 0; 2802 for (i = 0; i < m; i++) { 2803 for (j = 0; j < ns[i]; j++) newcolor[row++] = coloring[i] + j * ncolors; 2804 } 2805 2806 /* eliminate unneeded colors */ 2807 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2808 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2809 2810 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2811 ncolors = colorused[5 * ncolors - 1]; 2812 for (i = 0; i < n; i++) newcolor[i] = colorused[newcolor[i]] - 1; 2813 PetscCall(PetscFree(colorused)); 2814 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2815 PetscCall(PetscFree(coloring)); 2816 PetscFunctionReturn(PETSC_SUCCESS); 2817 } 2818 2819 #include <petsc/private/kernels/blockinvert.h> 2820 2821 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2822 { 2823 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2824 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2825 MatScalar *ibdiag, *bdiag, work[25], *t; 2826 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2827 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2828 const PetscScalar *xb, *b; 2829 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2830 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2; 2831 PetscInt sz, k, ipvt[5]; 2832 PetscBool allowzeropivot, zeropivotdetected; 2833 const PetscInt *sizes = a->inode.size, *idx, *diag = a->diag, *ii = a->i; 2834 2835 PetscFunctionBegin; 2836 allowzeropivot = PetscNot(A->erroriffailure); 2837 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2838 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2839 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2840 2841 if (!a->inode.ibdiagvalid) { 2842 if (!a->inode.ibdiag) { 2843 /* calculate space needed for diagonal blocks */ 2844 for (i = 0; i < m; i++) cnt += sizes[i] * sizes[i]; 2845 a->inode.bdiagsize = cnt; 2846 2847 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2848 } 2849 2850 /* copy over the diagonal blocks and invert them */ 2851 ibdiag = a->inode.ibdiag; 2852 bdiag = a->inode.bdiag; 2853 cnt = 0; 2854 for (i = 0, row = 0; i < m; i++) { 2855 for (j = 0; j < sizes[i]; j++) { 2856 for (k = 0; k < sizes[i]; k++) bdiag[cnt + k * sizes[i] + j] = v[diag[row + j] - j + k]; 2857 } 2858 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, sizes[i] * sizes[i])); 2859 2860 switch (sizes[i]) { 2861 case 1: 2862 /* Create matrix data structure */ 2863 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2864 if (allowzeropivot) { 2865 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2866 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2867 A->factorerror_zeropivot_row = row; 2868 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2869 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2870 } 2871 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2872 break; 2873 case 2: 2874 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2875 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2876 break; 2877 case 3: 2878 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2879 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2880 break; 2881 case 4: 2882 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2883 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2884 break; 2885 case 5: 2886 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2887 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2888 break; 2889 default: 2890 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 2891 } 2892 cnt += sizes[i] * sizes[i]; 2893 row += sizes[i]; 2894 } 2895 a->inode.ibdiagvalid = PETSC_TRUE; 2896 } 2897 ibdiag = a->inode.ibdiag; 2898 bdiag = a->inode.bdiag; 2899 t = a->inode.ssor_work; 2900 2901 PetscCall(VecGetArray(xx, &x)); 2902 PetscCall(VecGetArrayRead(bb, &b)); 2903 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2904 if (flag & SOR_ZERO_INITIAL_GUESS) { 2905 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2906 for (i = 0, row = 0; i < m; i++) { 2907 sz = diag[row] - ii[row]; 2908 v1 = a->a + ii[row]; 2909 idx = a->j + ii[row]; 2910 2911 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2912 switch (sizes[i]) { 2913 case 1: 2914 2915 sum1 = b[row]; 2916 for (n = 0; n < sz - 1; n += 2) { 2917 i1 = idx[0]; 2918 i2 = idx[1]; 2919 idx += 2; 2920 tmp0 = x[i1]; 2921 tmp1 = x[i2]; 2922 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2923 v1 += 2; 2924 } 2925 2926 if (n == sz - 1) { 2927 tmp0 = x[*idx]; 2928 sum1 -= *v1 * tmp0; 2929 } 2930 t[row] = sum1; 2931 x[row++] = sum1 * (*ibdiag++); 2932 break; 2933 case 2: 2934 v2 = a->a + ii[row + 1]; 2935 sum1 = b[row]; 2936 sum2 = b[row + 1]; 2937 for (n = 0; n < sz - 1; n += 2) { 2938 i1 = idx[0]; 2939 i2 = idx[1]; 2940 idx += 2; 2941 tmp0 = x[i1]; 2942 tmp1 = x[i2]; 2943 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2944 v1 += 2; 2945 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2946 v2 += 2; 2947 } 2948 2949 if (n == sz - 1) { 2950 tmp0 = x[*idx]; 2951 sum1 -= v1[0] * tmp0; 2952 sum2 -= v2[0] * tmp0; 2953 } 2954 t[row] = sum1; 2955 t[row + 1] = sum2; 2956 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2957 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2958 ibdiag += 4; 2959 break; 2960 case 3: 2961 v2 = a->a + ii[row + 1]; 2962 v3 = a->a + ii[row + 2]; 2963 sum1 = b[row]; 2964 sum2 = b[row + 1]; 2965 sum3 = b[row + 2]; 2966 for (n = 0; n < sz - 1; n += 2) { 2967 i1 = idx[0]; 2968 i2 = idx[1]; 2969 idx += 2; 2970 tmp0 = x[i1]; 2971 tmp1 = x[i2]; 2972 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2973 v1 += 2; 2974 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2975 v2 += 2; 2976 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2977 v3 += 2; 2978 } 2979 2980 if (n == sz - 1) { 2981 tmp0 = x[*idx]; 2982 sum1 -= v1[0] * tmp0; 2983 sum2 -= v2[0] * tmp0; 2984 sum3 -= v3[0] * tmp0; 2985 } 2986 t[row] = sum1; 2987 t[row + 1] = sum2; 2988 t[row + 2] = sum3; 2989 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 2990 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 2991 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 2992 ibdiag += 9; 2993 break; 2994 case 4: 2995 v2 = a->a + ii[row + 1]; 2996 v3 = a->a + ii[row + 2]; 2997 v4 = a->a + ii[row + 3]; 2998 sum1 = b[row]; 2999 sum2 = b[row + 1]; 3000 sum3 = b[row + 2]; 3001 sum4 = b[row + 3]; 3002 for (n = 0; n < sz - 1; n += 2) { 3003 i1 = idx[0]; 3004 i2 = idx[1]; 3005 idx += 2; 3006 tmp0 = x[i1]; 3007 tmp1 = x[i2]; 3008 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3009 v1 += 2; 3010 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3011 v2 += 2; 3012 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3013 v3 += 2; 3014 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3015 v4 += 2; 3016 } 3017 3018 if (n == sz - 1) { 3019 tmp0 = x[*idx]; 3020 sum1 -= v1[0] * tmp0; 3021 sum2 -= v2[0] * tmp0; 3022 sum3 -= v3[0] * tmp0; 3023 sum4 -= v4[0] * tmp0; 3024 } 3025 t[row] = sum1; 3026 t[row + 1] = sum2; 3027 t[row + 2] = sum3; 3028 t[row + 3] = sum4; 3029 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3030 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3031 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3032 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3033 ibdiag += 16; 3034 break; 3035 case 5: 3036 v2 = a->a + ii[row + 1]; 3037 v3 = a->a + ii[row + 2]; 3038 v4 = a->a + ii[row + 3]; 3039 v5 = a->a + ii[row + 4]; 3040 sum1 = b[row]; 3041 sum2 = b[row + 1]; 3042 sum3 = b[row + 2]; 3043 sum4 = b[row + 3]; 3044 sum5 = b[row + 4]; 3045 for (n = 0; n < sz - 1; n += 2) { 3046 i1 = idx[0]; 3047 i2 = idx[1]; 3048 idx += 2; 3049 tmp0 = x[i1]; 3050 tmp1 = x[i2]; 3051 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3052 v1 += 2; 3053 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3054 v2 += 2; 3055 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3056 v3 += 2; 3057 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3058 v4 += 2; 3059 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3060 v5 += 2; 3061 } 3062 3063 if (n == sz - 1) { 3064 tmp0 = x[*idx]; 3065 sum1 -= v1[0] * tmp0; 3066 sum2 -= v2[0] * tmp0; 3067 sum3 -= v3[0] * tmp0; 3068 sum4 -= v4[0] * tmp0; 3069 sum5 -= v5[0] * tmp0; 3070 } 3071 t[row] = sum1; 3072 t[row + 1] = sum2; 3073 t[row + 2] = sum3; 3074 t[row + 3] = sum4; 3075 t[row + 4] = sum5; 3076 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3077 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3078 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3079 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3080 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3081 ibdiag += 25; 3082 break; 3083 default: 3084 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3085 } 3086 } 3087 3088 xb = t; 3089 PetscCall(PetscLogFlops(a->nz)); 3090 } else xb = b; 3091 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3092 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3093 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3094 ibdiag -= sizes[i] * sizes[i]; 3095 sz = ii[row + 1] - diag[row] - 1; 3096 v1 = a->a + diag[row] + 1; 3097 idx = a->j + diag[row] + 1; 3098 3099 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3100 switch (sizes[i]) { 3101 case 1: 3102 3103 sum1 = xb[row]; 3104 for (n = 0; n < sz - 1; n += 2) { 3105 i1 = idx[0]; 3106 i2 = idx[1]; 3107 idx += 2; 3108 tmp0 = x[i1]; 3109 tmp1 = x[i2]; 3110 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3111 v1 += 2; 3112 } 3113 3114 if (n == sz - 1) { 3115 tmp0 = x[*idx]; 3116 sum1 -= *v1 * tmp0; 3117 } 3118 x[row--] = sum1 * (*ibdiag); 3119 break; 3120 3121 case 2: 3122 3123 sum1 = xb[row]; 3124 sum2 = xb[row - 1]; 3125 /* note that sum1 is associated with the second of the two rows */ 3126 v2 = a->a + diag[row - 1] + 2; 3127 for (n = 0; n < sz - 1; n += 2) { 3128 i1 = idx[0]; 3129 i2 = idx[1]; 3130 idx += 2; 3131 tmp0 = x[i1]; 3132 tmp1 = x[i2]; 3133 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3134 v1 += 2; 3135 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3136 v2 += 2; 3137 } 3138 3139 if (n == sz - 1) { 3140 tmp0 = x[*idx]; 3141 sum1 -= *v1 * tmp0; 3142 sum2 -= *v2 * tmp0; 3143 } 3144 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3145 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3146 break; 3147 case 3: 3148 3149 sum1 = xb[row]; 3150 sum2 = xb[row - 1]; 3151 sum3 = xb[row - 2]; 3152 v2 = a->a + diag[row - 1] + 2; 3153 v3 = a->a + diag[row - 2] + 3; 3154 for (n = 0; n < sz - 1; n += 2) { 3155 i1 = idx[0]; 3156 i2 = idx[1]; 3157 idx += 2; 3158 tmp0 = x[i1]; 3159 tmp1 = x[i2]; 3160 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3161 v1 += 2; 3162 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3163 v2 += 2; 3164 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3165 v3 += 2; 3166 } 3167 3168 if (n == sz - 1) { 3169 tmp0 = x[*idx]; 3170 sum1 -= *v1 * tmp0; 3171 sum2 -= *v2 * tmp0; 3172 sum3 -= *v3 * tmp0; 3173 } 3174 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3175 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3176 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3177 break; 3178 case 4: 3179 3180 sum1 = xb[row]; 3181 sum2 = xb[row - 1]; 3182 sum3 = xb[row - 2]; 3183 sum4 = xb[row - 3]; 3184 v2 = a->a + diag[row - 1] + 2; 3185 v3 = a->a + diag[row - 2] + 3; 3186 v4 = a->a + diag[row - 3] + 4; 3187 for (n = 0; n < sz - 1; n += 2) { 3188 i1 = idx[0]; 3189 i2 = idx[1]; 3190 idx += 2; 3191 tmp0 = x[i1]; 3192 tmp1 = x[i2]; 3193 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3194 v1 += 2; 3195 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3196 v2 += 2; 3197 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3198 v3 += 2; 3199 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3200 v4 += 2; 3201 } 3202 3203 if (n == sz - 1) { 3204 tmp0 = x[*idx]; 3205 sum1 -= *v1 * tmp0; 3206 sum2 -= *v2 * tmp0; 3207 sum3 -= *v3 * tmp0; 3208 sum4 -= *v4 * tmp0; 3209 } 3210 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3211 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3212 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3213 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3214 break; 3215 case 5: 3216 3217 sum1 = xb[row]; 3218 sum2 = xb[row - 1]; 3219 sum3 = xb[row - 2]; 3220 sum4 = xb[row - 3]; 3221 sum5 = xb[row - 4]; 3222 v2 = a->a + diag[row - 1] + 2; 3223 v3 = a->a + diag[row - 2] + 3; 3224 v4 = a->a + diag[row - 3] + 4; 3225 v5 = a->a + diag[row - 4] + 5; 3226 for (n = 0; n < sz - 1; n += 2) { 3227 i1 = idx[0]; 3228 i2 = idx[1]; 3229 idx += 2; 3230 tmp0 = x[i1]; 3231 tmp1 = x[i2]; 3232 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3233 v1 += 2; 3234 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3235 v2 += 2; 3236 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3237 v3 += 2; 3238 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3239 v4 += 2; 3240 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3241 v5 += 2; 3242 } 3243 3244 if (n == sz - 1) { 3245 tmp0 = x[*idx]; 3246 sum1 -= *v1 * tmp0; 3247 sum2 -= *v2 * tmp0; 3248 sum3 -= *v3 * tmp0; 3249 sum4 -= *v4 * tmp0; 3250 sum5 -= *v5 * tmp0; 3251 } 3252 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3253 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3254 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3255 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3256 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3257 break; 3258 default: 3259 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3260 } 3261 } 3262 3263 PetscCall(PetscLogFlops(a->nz)); 3264 } 3265 its--; 3266 } 3267 while (its--) { 3268 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3269 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += sizes[i], ibdiag += sizes[i] * sizes[i], i++) { 3270 sz = diag[row] - ii[row]; 3271 v1 = a->a + ii[row]; 3272 idx = a->j + ii[row]; 3273 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3274 switch (sizes[i]) { 3275 case 1: 3276 sum1 = b[row]; 3277 for (n = 0; n < sz - 1; n += 2) { 3278 i1 = idx[0]; 3279 i2 = idx[1]; 3280 idx += 2; 3281 tmp0 = x[i1]; 3282 tmp1 = x[i2]; 3283 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3284 v1 += 2; 3285 } 3286 if (n == sz - 1) { 3287 tmp0 = x[*idx++]; 3288 sum1 -= *v1 * tmp0; 3289 v1++; 3290 } 3291 t[row] = sum1; 3292 sz = ii[row + 1] - diag[row] - 1; 3293 idx = a->j + diag[row] + 1; 3294 v1 += 1; 3295 for (n = 0; n < sz - 1; n += 2) { 3296 i1 = idx[0]; 3297 i2 = idx[1]; 3298 idx += 2; 3299 tmp0 = x[i1]; 3300 tmp1 = x[i2]; 3301 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3302 v1 += 2; 3303 } 3304 if (n == sz - 1) { 3305 tmp0 = x[*idx++]; 3306 sum1 -= *v1 * tmp0; 3307 } 3308 /* in MatSOR_SeqAIJ this line would be 3309 * 3310 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3311 * 3312 * but omega == 1, so this becomes 3313 * 3314 * x[row] = sum1*(*ibdiag++); 3315 * 3316 */ 3317 x[row] = sum1 * (*ibdiag); 3318 break; 3319 case 2: 3320 v2 = a->a + ii[row + 1]; 3321 sum1 = b[row]; 3322 sum2 = b[row + 1]; 3323 for (n = 0; n < sz - 1; n += 2) { 3324 i1 = idx[0]; 3325 i2 = idx[1]; 3326 idx += 2; 3327 tmp0 = x[i1]; 3328 tmp1 = x[i2]; 3329 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3330 v1 += 2; 3331 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3332 v2 += 2; 3333 } 3334 if (n == sz - 1) { 3335 tmp0 = x[*idx++]; 3336 sum1 -= v1[0] * tmp0; 3337 sum2 -= v2[0] * tmp0; 3338 v1++; 3339 v2++; 3340 } 3341 t[row] = sum1; 3342 t[row + 1] = sum2; 3343 sz = ii[row + 1] - diag[row] - 2; 3344 idx = a->j + diag[row] + 2; 3345 v1 += 2; 3346 v2 += 2; 3347 for (n = 0; n < sz - 1; n += 2) { 3348 i1 = idx[0]; 3349 i2 = idx[1]; 3350 idx += 2; 3351 tmp0 = x[i1]; 3352 tmp1 = x[i2]; 3353 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3354 v1 += 2; 3355 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3356 v2 += 2; 3357 } 3358 if (n == sz - 1) { 3359 tmp0 = x[*idx]; 3360 sum1 -= v1[0] * tmp0; 3361 sum2 -= v2[0] * tmp0; 3362 } 3363 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3364 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3365 break; 3366 case 3: 3367 v2 = a->a + ii[row + 1]; 3368 v3 = a->a + ii[row + 2]; 3369 sum1 = b[row]; 3370 sum2 = b[row + 1]; 3371 sum3 = b[row + 2]; 3372 for (n = 0; n < sz - 1; n += 2) { 3373 i1 = idx[0]; 3374 i2 = idx[1]; 3375 idx += 2; 3376 tmp0 = x[i1]; 3377 tmp1 = x[i2]; 3378 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3379 v1 += 2; 3380 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3381 v2 += 2; 3382 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3383 v3 += 2; 3384 } 3385 if (n == sz - 1) { 3386 tmp0 = x[*idx++]; 3387 sum1 -= v1[0] * tmp0; 3388 sum2 -= v2[0] * tmp0; 3389 sum3 -= v3[0] * tmp0; 3390 v1++; 3391 v2++; 3392 v3++; 3393 } 3394 t[row] = sum1; 3395 t[row + 1] = sum2; 3396 t[row + 2] = sum3; 3397 sz = ii[row + 1] - diag[row] - 3; 3398 idx = a->j + diag[row] + 3; 3399 v1 += 3; 3400 v2 += 3; 3401 v3 += 3; 3402 for (n = 0; n < sz - 1; n += 2) { 3403 i1 = idx[0]; 3404 i2 = idx[1]; 3405 idx += 2; 3406 tmp0 = x[i1]; 3407 tmp1 = x[i2]; 3408 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3409 v1 += 2; 3410 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3411 v2 += 2; 3412 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3413 v3 += 2; 3414 } 3415 if (n == sz - 1) { 3416 tmp0 = x[*idx]; 3417 sum1 -= v1[0] * tmp0; 3418 sum2 -= v2[0] * tmp0; 3419 sum3 -= v3[0] * tmp0; 3420 } 3421 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3422 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3423 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3424 break; 3425 case 4: 3426 v2 = a->a + ii[row + 1]; 3427 v3 = a->a + ii[row + 2]; 3428 v4 = a->a + ii[row + 3]; 3429 sum1 = b[row]; 3430 sum2 = b[row + 1]; 3431 sum3 = b[row + 2]; 3432 sum4 = b[row + 3]; 3433 for (n = 0; n < sz - 1; n += 2) { 3434 i1 = idx[0]; 3435 i2 = idx[1]; 3436 idx += 2; 3437 tmp0 = x[i1]; 3438 tmp1 = x[i2]; 3439 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3440 v1 += 2; 3441 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3442 v2 += 2; 3443 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3444 v3 += 2; 3445 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3446 v4 += 2; 3447 } 3448 if (n == sz - 1) { 3449 tmp0 = x[*idx++]; 3450 sum1 -= v1[0] * tmp0; 3451 sum2 -= v2[0] * tmp0; 3452 sum3 -= v3[0] * tmp0; 3453 sum4 -= v4[0] * tmp0; 3454 v1++; 3455 v2++; 3456 v3++; 3457 v4++; 3458 } 3459 t[row] = sum1; 3460 t[row + 1] = sum2; 3461 t[row + 2] = sum3; 3462 t[row + 3] = sum4; 3463 sz = ii[row + 1] - diag[row] - 4; 3464 idx = a->j + diag[row] + 4; 3465 v1 += 4; 3466 v2 += 4; 3467 v3 += 4; 3468 v4 += 4; 3469 for (n = 0; n < sz - 1; n += 2) { 3470 i1 = idx[0]; 3471 i2 = idx[1]; 3472 idx += 2; 3473 tmp0 = x[i1]; 3474 tmp1 = x[i2]; 3475 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3476 v1 += 2; 3477 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3478 v2 += 2; 3479 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3480 v3 += 2; 3481 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3482 v4 += 2; 3483 } 3484 if (n == sz - 1) { 3485 tmp0 = x[*idx]; 3486 sum1 -= v1[0] * tmp0; 3487 sum2 -= v2[0] * tmp0; 3488 sum3 -= v3[0] * tmp0; 3489 sum4 -= v4[0] * tmp0; 3490 } 3491 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3492 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3493 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3494 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3495 break; 3496 case 5: 3497 v2 = a->a + ii[row + 1]; 3498 v3 = a->a + ii[row + 2]; 3499 v4 = a->a + ii[row + 3]; 3500 v5 = a->a + ii[row + 4]; 3501 sum1 = b[row]; 3502 sum2 = b[row + 1]; 3503 sum3 = b[row + 2]; 3504 sum4 = b[row + 3]; 3505 sum5 = b[row + 4]; 3506 for (n = 0; n < sz - 1; n += 2) { 3507 i1 = idx[0]; 3508 i2 = idx[1]; 3509 idx += 2; 3510 tmp0 = x[i1]; 3511 tmp1 = x[i2]; 3512 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3513 v1 += 2; 3514 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3515 v2 += 2; 3516 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3517 v3 += 2; 3518 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3519 v4 += 2; 3520 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3521 v5 += 2; 3522 } 3523 if (n == sz - 1) { 3524 tmp0 = x[*idx++]; 3525 sum1 -= v1[0] * tmp0; 3526 sum2 -= v2[0] * tmp0; 3527 sum3 -= v3[0] * tmp0; 3528 sum4 -= v4[0] * tmp0; 3529 sum5 -= v5[0] * tmp0; 3530 v1++; 3531 v2++; 3532 v3++; 3533 v4++; 3534 v5++; 3535 } 3536 t[row] = sum1; 3537 t[row + 1] = sum2; 3538 t[row + 2] = sum3; 3539 t[row + 3] = sum4; 3540 t[row + 4] = sum5; 3541 sz = ii[row + 1] - diag[row] - 5; 3542 idx = a->j + diag[row] + 5; 3543 v1 += 5; 3544 v2 += 5; 3545 v3 += 5; 3546 v4 += 5; 3547 v5 += 5; 3548 for (n = 0; n < sz - 1; n += 2) { 3549 i1 = idx[0]; 3550 i2 = idx[1]; 3551 idx += 2; 3552 tmp0 = x[i1]; 3553 tmp1 = x[i2]; 3554 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3555 v1 += 2; 3556 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3557 v2 += 2; 3558 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3559 v3 += 2; 3560 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3561 v4 += 2; 3562 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3563 v5 += 2; 3564 } 3565 if (n == sz - 1) { 3566 tmp0 = x[*idx]; 3567 sum1 -= v1[0] * tmp0; 3568 sum2 -= v2[0] * tmp0; 3569 sum3 -= v3[0] * tmp0; 3570 sum4 -= v4[0] * tmp0; 3571 sum5 -= v5[0] * tmp0; 3572 } 3573 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3574 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3575 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3576 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3577 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3578 break; 3579 default: 3580 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3581 } 3582 } 3583 xb = t; 3584 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3585 } else xb = b; 3586 3587 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3588 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3589 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3590 ibdiag -= sizes[i] * sizes[i]; 3591 3592 /* set RHS */ 3593 if (xb == b) { 3594 /* whole (old way) */ 3595 sz = ii[row + 1] - ii[row]; 3596 idx = a->j + ii[row]; 3597 switch (sizes[i]) { 3598 case 5: 3599 v5 = a->a + ii[row - 4]; /* fall through */ 3600 case 4: 3601 v4 = a->a + ii[row - 3]; /* fall through */ 3602 case 3: 3603 v3 = a->a + ii[row - 2]; /* fall through */ 3604 case 2: 3605 v2 = a->a + ii[row - 1]; /* fall through */ 3606 case 1: 3607 v1 = a->a + ii[row]; 3608 break; 3609 default: 3610 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3611 } 3612 } else { 3613 /* upper, no diag */ 3614 sz = ii[row + 1] - diag[row] - 1; 3615 idx = a->j + diag[row] + 1; 3616 switch (sizes[i]) { 3617 case 5: 3618 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3619 case 4: 3620 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3621 case 3: 3622 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3623 case 2: 3624 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3625 case 1: 3626 v1 = a->a + diag[row] + 1; 3627 } 3628 } 3629 /* set sum */ 3630 switch (sizes[i]) { 3631 case 5: 3632 sum5 = xb[row - 4]; /* fall through */ 3633 case 4: 3634 sum4 = xb[row - 3]; /* fall through */ 3635 case 3: 3636 sum3 = xb[row - 2]; /* fall through */ 3637 case 2: 3638 sum2 = xb[row - 1]; /* fall through */ 3639 case 1: 3640 /* note that sum1 is associated with the last row */ 3641 sum1 = xb[row]; 3642 } 3643 /* do sums */ 3644 for (n = 0; n < sz - 1; n += 2) { 3645 i1 = idx[0]; 3646 i2 = idx[1]; 3647 idx += 2; 3648 tmp0 = x[i1]; 3649 tmp1 = x[i2]; 3650 switch (sizes[i]) { 3651 case 5: 3652 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3653 v5 += 2; /* fall through */ 3654 case 4: 3655 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3656 v4 += 2; /* fall through */ 3657 case 3: 3658 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3659 v3 += 2; /* fall through */ 3660 case 2: 3661 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3662 v2 += 2; /* fall through */ 3663 case 1: 3664 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3665 v1 += 2; 3666 } 3667 } 3668 /* ragged edge */ 3669 if (n == sz - 1) { 3670 tmp0 = x[*idx]; 3671 switch (sizes[i]) { 3672 case 5: 3673 sum5 -= *v5 * tmp0; /* fall through */ 3674 case 4: 3675 sum4 -= *v4 * tmp0; /* fall through */ 3676 case 3: 3677 sum3 -= *v3 * tmp0; /* fall through */ 3678 case 2: 3679 sum2 -= *v2 * tmp0; /* fall through */ 3680 case 1: 3681 sum1 -= *v1 * tmp0; 3682 } 3683 } 3684 /* update */ 3685 if (xb == b) { 3686 /* whole (old way) w/ diag */ 3687 switch (sizes[i]) { 3688 case 5: 3689 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3690 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3691 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3692 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3693 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3694 break; 3695 case 4: 3696 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3697 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3698 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3699 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3700 break; 3701 case 3: 3702 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3703 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3704 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3705 break; 3706 case 2: 3707 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3708 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3709 break; 3710 case 1: 3711 x[row--] += sum1 * (*ibdiag); 3712 break; 3713 } 3714 } else { 3715 /* no diag so set = */ 3716 switch (sizes[i]) { 3717 case 5: 3718 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3719 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3720 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3721 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3722 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3723 break; 3724 case 4: 3725 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3726 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3727 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3728 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3729 break; 3730 case 3: 3731 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3732 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3733 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3734 break; 3735 case 2: 3736 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3737 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3738 break; 3739 case 1: 3740 x[row--] = sum1 * (*ibdiag); 3741 break; 3742 } 3743 } 3744 } 3745 if (xb == b) { 3746 PetscCall(PetscLogFlops(2.0 * a->nz)); 3747 } else { 3748 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3749 } 3750 } 3751 } 3752 if (flag & SOR_EISENSTAT) { 3753 /* 3754 Apply (U + D)^-1 where D is now the block diagonal 3755 */ 3756 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3757 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3758 ibdiag -= sizes[i] * sizes[i]; 3759 sz = ii[row + 1] - diag[row] - 1; 3760 v1 = a->a + diag[row] + 1; 3761 idx = a->j + diag[row] + 1; 3762 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3763 switch (sizes[i]) { 3764 case 1: 3765 3766 sum1 = b[row]; 3767 for (n = 0; n < sz - 1; n += 2) { 3768 i1 = idx[0]; 3769 i2 = idx[1]; 3770 idx += 2; 3771 tmp0 = x[i1]; 3772 tmp1 = x[i2]; 3773 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3774 v1 += 2; 3775 } 3776 3777 if (n == sz - 1) { 3778 tmp0 = x[*idx]; 3779 sum1 -= *v1 * tmp0; 3780 } 3781 x[row] = sum1 * (*ibdiag); 3782 row--; 3783 break; 3784 3785 case 2: 3786 3787 sum1 = b[row]; 3788 sum2 = b[row - 1]; 3789 /* note that sum1 is associated with the second of the two rows */ 3790 v2 = a->a + diag[row - 1] + 2; 3791 for (n = 0; n < sz - 1; n += 2) { 3792 i1 = idx[0]; 3793 i2 = idx[1]; 3794 idx += 2; 3795 tmp0 = x[i1]; 3796 tmp1 = x[i2]; 3797 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3798 v1 += 2; 3799 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3800 v2 += 2; 3801 } 3802 3803 if (n == sz - 1) { 3804 tmp0 = x[*idx]; 3805 sum1 -= *v1 * tmp0; 3806 sum2 -= *v2 * tmp0; 3807 } 3808 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3809 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3810 row -= 2; 3811 break; 3812 case 3: 3813 3814 sum1 = b[row]; 3815 sum2 = b[row - 1]; 3816 sum3 = b[row - 2]; 3817 v2 = a->a + diag[row - 1] + 2; 3818 v3 = a->a + diag[row - 2] + 3; 3819 for (n = 0; n < sz - 1; n += 2) { 3820 i1 = idx[0]; 3821 i2 = idx[1]; 3822 idx += 2; 3823 tmp0 = x[i1]; 3824 tmp1 = x[i2]; 3825 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3826 v1 += 2; 3827 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3828 v2 += 2; 3829 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3830 v3 += 2; 3831 } 3832 3833 if (n == sz - 1) { 3834 tmp0 = x[*idx]; 3835 sum1 -= *v1 * tmp0; 3836 sum2 -= *v2 * tmp0; 3837 sum3 -= *v3 * tmp0; 3838 } 3839 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3840 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3841 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3842 row -= 3; 3843 break; 3844 case 4: 3845 3846 sum1 = b[row]; 3847 sum2 = b[row - 1]; 3848 sum3 = b[row - 2]; 3849 sum4 = b[row - 3]; 3850 v2 = a->a + diag[row - 1] + 2; 3851 v3 = a->a + diag[row - 2] + 3; 3852 v4 = a->a + diag[row - 3] + 4; 3853 for (n = 0; n < sz - 1; n += 2) { 3854 i1 = idx[0]; 3855 i2 = idx[1]; 3856 idx += 2; 3857 tmp0 = x[i1]; 3858 tmp1 = x[i2]; 3859 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3860 v1 += 2; 3861 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3862 v2 += 2; 3863 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3864 v3 += 2; 3865 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3866 v4 += 2; 3867 } 3868 3869 if (n == sz - 1) { 3870 tmp0 = x[*idx]; 3871 sum1 -= *v1 * tmp0; 3872 sum2 -= *v2 * tmp0; 3873 sum3 -= *v3 * tmp0; 3874 sum4 -= *v4 * tmp0; 3875 } 3876 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3877 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3878 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3879 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3880 row -= 4; 3881 break; 3882 case 5: 3883 3884 sum1 = b[row]; 3885 sum2 = b[row - 1]; 3886 sum3 = b[row - 2]; 3887 sum4 = b[row - 3]; 3888 sum5 = b[row - 4]; 3889 v2 = a->a + diag[row - 1] + 2; 3890 v3 = a->a + diag[row - 2] + 3; 3891 v4 = a->a + diag[row - 3] + 4; 3892 v5 = a->a + diag[row - 4] + 5; 3893 for (n = 0; n < sz - 1; n += 2) { 3894 i1 = idx[0]; 3895 i2 = idx[1]; 3896 idx += 2; 3897 tmp0 = x[i1]; 3898 tmp1 = x[i2]; 3899 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3900 v1 += 2; 3901 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3902 v2 += 2; 3903 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3904 v3 += 2; 3905 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3906 v4 += 2; 3907 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3908 v5 += 2; 3909 } 3910 3911 if (n == sz - 1) { 3912 tmp0 = x[*idx]; 3913 sum1 -= *v1 * tmp0; 3914 sum2 -= *v2 * tmp0; 3915 sum3 -= *v3 * tmp0; 3916 sum4 -= *v4 * tmp0; 3917 sum5 -= *v5 * tmp0; 3918 } 3919 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3920 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3921 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3922 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3923 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3924 row -= 5; 3925 break; 3926 default: 3927 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 3928 } 3929 } 3930 PetscCall(PetscLogFlops(a->nz)); 3931 3932 /* 3933 t = b - D x where D is the block diagonal 3934 */ 3935 cnt = 0; 3936 for (i = 0, row = 0; i < m; i++) { 3937 switch (sizes[i]) { 3938 case 1: 3939 t[row] = b[row] - bdiag[cnt++] * x[row]; 3940 row++; 3941 break; 3942 case 2: 3943 x1 = x[row]; 3944 x2 = x[row + 1]; 3945 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3946 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3947 t[row] = b[row] - tmp1; 3948 t[row + 1] = b[row + 1] - tmp2; 3949 row += 2; 3950 cnt += 4; 3951 break; 3952 case 3: 3953 x1 = x[row]; 3954 x2 = x[row + 1]; 3955 x3 = x[row + 2]; 3956 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3957 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3958 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3959 t[row] = b[row] - tmp1; 3960 t[row + 1] = b[row + 1] - tmp2; 3961 t[row + 2] = b[row + 2] - tmp3; 3962 row += 3; 3963 cnt += 9; 3964 break; 3965 case 4: 3966 x1 = x[row]; 3967 x2 = x[row + 1]; 3968 x3 = x[row + 2]; 3969 x4 = x[row + 3]; 3970 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3971 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3972 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3973 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3974 t[row] = b[row] - tmp1; 3975 t[row + 1] = b[row + 1] - tmp2; 3976 t[row + 2] = b[row + 2] - tmp3; 3977 t[row + 3] = b[row + 3] - tmp4; 3978 row += 4; 3979 cnt += 16; 3980 break; 3981 case 5: 3982 x1 = x[row]; 3983 x2 = x[row + 1]; 3984 x3 = x[row + 2]; 3985 x4 = x[row + 3]; 3986 x5 = x[row + 4]; 3987 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 3988 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 3989 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 3990 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 3991 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 3992 t[row] = b[row] - tmp1; 3993 t[row + 1] = b[row + 1] - tmp2; 3994 t[row + 2] = b[row + 2] - tmp3; 3995 t[row + 3] = b[row + 3] - tmp4; 3996 t[row + 4] = b[row + 4] - tmp5; 3997 row += 5; 3998 cnt += 25; 3999 break; 4000 default: 4001 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4002 } 4003 } 4004 PetscCall(PetscLogFlops(m)); 4005 4006 /* 4007 Apply (L + D)^-1 where D is the block diagonal 4008 */ 4009 for (i = 0, row = 0; i < m; i++) { 4010 sz = diag[row] - ii[row]; 4011 v1 = a->a + ii[row]; 4012 idx = a->j + ii[row]; 4013 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4014 switch (sizes[i]) { 4015 case 1: 4016 4017 sum1 = t[row]; 4018 for (n = 0; n < sz - 1; n += 2) { 4019 i1 = idx[0]; 4020 i2 = idx[1]; 4021 idx += 2; 4022 tmp0 = t[i1]; 4023 tmp1 = t[i2]; 4024 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4025 v1 += 2; 4026 } 4027 4028 if (n == sz - 1) { 4029 tmp0 = t[*idx]; 4030 sum1 -= *v1 * tmp0; 4031 } 4032 x[row] += t[row] = sum1 * (*ibdiag++); 4033 row++; 4034 break; 4035 case 2: 4036 v2 = a->a + ii[row + 1]; 4037 sum1 = t[row]; 4038 sum2 = t[row + 1]; 4039 for (n = 0; n < sz - 1; n += 2) { 4040 i1 = idx[0]; 4041 i2 = idx[1]; 4042 idx += 2; 4043 tmp0 = t[i1]; 4044 tmp1 = t[i2]; 4045 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4046 v1 += 2; 4047 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4048 v2 += 2; 4049 } 4050 4051 if (n == sz - 1) { 4052 tmp0 = t[*idx]; 4053 sum1 -= v1[0] * tmp0; 4054 sum2 -= v2[0] * tmp0; 4055 } 4056 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4057 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4058 ibdiag += 4; 4059 row += 2; 4060 break; 4061 case 3: 4062 v2 = a->a + ii[row + 1]; 4063 v3 = a->a + ii[row + 2]; 4064 sum1 = t[row]; 4065 sum2 = t[row + 1]; 4066 sum3 = t[row + 2]; 4067 for (n = 0; n < sz - 1; n += 2) { 4068 i1 = idx[0]; 4069 i2 = idx[1]; 4070 idx += 2; 4071 tmp0 = t[i1]; 4072 tmp1 = t[i2]; 4073 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4074 v1 += 2; 4075 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4076 v2 += 2; 4077 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4078 v3 += 2; 4079 } 4080 4081 if (n == sz - 1) { 4082 tmp0 = t[*idx]; 4083 sum1 -= v1[0] * tmp0; 4084 sum2 -= v2[0] * tmp0; 4085 sum3 -= v3[0] * tmp0; 4086 } 4087 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4088 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4089 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4090 ibdiag += 9; 4091 row += 3; 4092 break; 4093 case 4: 4094 v2 = a->a + ii[row + 1]; 4095 v3 = a->a + ii[row + 2]; 4096 v4 = a->a + ii[row + 3]; 4097 sum1 = t[row]; 4098 sum2 = t[row + 1]; 4099 sum3 = t[row + 2]; 4100 sum4 = t[row + 3]; 4101 for (n = 0; n < sz - 1; n += 2) { 4102 i1 = idx[0]; 4103 i2 = idx[1]; 4104 idx += 2; 4105 tmp0 = t[i1]; 4106 tmp1 = t[i2]; 4107 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4108 v1 += 2; 4109 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4110 v2 += 2; 4111 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4112 v3 += 2; 4113 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4114 v4 += 2; 4115 } 4116 4117 if (n == sz - 1) { 4118 tmp0 = t[*idx]; 4119 sum1 -= v1[0] * tmp0; 4120 sum2 -= v2[0] * tmp0; 4121 sum3 -= v3[0] * tmp0; 4122 sum4 -= v4[0] * tmp0; 4123 } 4124 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4125 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4126 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4127 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4128 ibdiag += 16; 4129 row += 4; 4130 break; 4131 case 5: 4132 v2 = a->a + ii[row + 1]; 4133 v3 = a->a + ii[row + 2]; 4134 v4 = a->a + ii[row + 3]; 4135 v5 = a->a + ii[row + 4]; 4136 sum1 = t[row]; 4137 sum2 = t[row + 1]; 4138 sum3 = t[row + 2]; 4139 sum4 = t[row + 3]; 4140 sum5 = t[row + 4]; 4141 for (n = 0; n < sz - 1; n += 2) { 4142 i1 = idx[0]; 4143 i2 = idx[1]; 4144 idx += 2; 4145 tmp0 = t[i1]; 4146 tmp1 = t[i2]; 4147 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4148 v1 += 2; 4149 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4150 v2 += 2; 4151 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4152 v3 += 2; 4153 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4154 v4 += 2; 4155 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4156 v5 += 2; 4157 } 4158 4159 if (n == sz - 1) { 4160 tmp0 = t[*idx]; 4161 sum1 -= v1[0] * tmp0; 4162 sum2 -= v2[0] * tmp0; 4163 sum3 -= v3[0] * tmp0; 4164 sum4 -= v4[0] * tmp0; 4165 sum5 -= v5[0] * tmp0; 4166 } 4167 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4168 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4169 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4170 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4171 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4172 ibdiag += 25; 4173 row += 5; 4174 break; 4175 default: 4176 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4177 } 4178 } 4179 PetscCall(PetscLogFlops(a->nz)); 4180 } 4181 PetscCall(VecRestoreArray(xx, &x)); 4182 PetscCall(VecRestoreArrayRead(bb, &b)); 4183 PetscFunctionReturn(PETSC_SUCCESS); 4184 } 4185 4186 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4187 { 4188 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4189 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4190 const MatScalar *bdiag = a->inode.bdiag; 4191 const PetscScalar *b; 4192 PetscInt m = a->inode.node_count, cnt = 0, i, row; 4193 const PetscInt *sizes = a->inode.size; 4194 4195 PetscFunctionBegin; 4196 PetscCheck(a->inode.size, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4197 PetscCall(VecGetArray(xx, &x)); 4198 PetscCall(VecGetArrayRead(bb, &b)); 4199 cnt = 0; 4200 for (i = 0, row = 0; i < m; i++) { 4201 switch (sizes[i]) { 4202 case 1: 4203 x[row] = b[row] * bdiag[cnt++]; 4204 row++; 4205 break; 4206 case 2: 4207 x1 = b[row]; 4208 x2 = b[row + 1]; 4209 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4210 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4211 x[row++] = tmp1; 4212 x[row++] = tmp2; 4213 cnt += 4; 4214 break; 4215 case 3: 4216 x1 = b[row]; 4217 x2 = b[row + 1]; 4218 x3 = b[row + 2]; 4219 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4220 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4221 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4222 x[row++] = tmp1; 4223 x[row++] = tmp2; 4224 x[row++] = tmp3; 4225 cnt += 9; 4226 break; 4227 case 4: 4228 x1 = b[row]; 4229 x2 = b[row + 1]; 4230 x3 = b[row + 2]; 4231 x4 = b[row + 3]; 4232 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4233 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4234 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4235 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4236 x[row++] = tmp1; 4237 x[row++] = tmp2; 4238 x[row++] = tmp3; 4239 x[row++] = tmp4; 4240 cnt += 16; 4241 break; 4242 case 5: 4243 x1 = b[row]; 4244 x2 = b[row + 1]; 4245 x3 = b[row + 2]; 4246 x4 = b[row + 3]; 4247 x5 = b[row + 4]; 4248 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4249 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4250 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4251 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4252 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4253 x[row++] = tmp1; 4254 x[row++] = tmp2; 4255 x[row++] = tmp3; 4256 x[row++] = tmp4; 4257 x[row++] = tmp5; 4258 cnt += 25; 4259 break; 4260 default: 4261 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Inode size %" PetscInt_FMT " not supported", sizes[i]); 4262 } 4263 } 4264 PetscCall(PetscLogFlops(2.0 * cnt)); 4265 PetscCall(VecRestoreArray(xx, &x)); 4266 PetscCall(VecRestoreArrayRead(bb, &b)); 4267 PetscFunctionReturn(PETSC_SUCCESS); 4268 } 4269 4270 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4271 { 4272 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4273 4274 PetscFunctionBegin; 4275 a->inode.node_count = 0; 4276 a->inode.use = PETSC_FALSE; 4277 a->inode.checked = PETSC_FALSE; 4278 a->inode.mat_nonzerostate = -1; 4279 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4280 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4281 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4282 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4283 A->ops->coloringpatch = NULL; 4284 A->ops->multdiagonalblock = NULL; 4285 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4286 PetscFunctionReturn(PETSC_SUCCESS); 4287 } 4288 4289 /* 4290 samestructure indicates that the matrix has not changed its nonzero structure so we 4291 do not need to recompute the inodes 4292 */ 4293 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4294 { 4295 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4296 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4297 PetscBool flag; 4298 const PetscInt *idx, *idy, *ii; 4299 4300 PetscFunctionBegin; 4301 if (!a->inode.use) { 4302 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4303 PetscCall(PetscFree(a->inode.size)); 4304 PetscFunctionReturn(PETSC_SUCCESS); 4305 } 4306 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 4307 4308 m = A->rmap->n; 4309 if (!a->inode.size) PetscCall(PetscMalloc1(m + 1, &a->inode.size)); 4310 ns = a->inode.size; 4311 4312 i = 0; 4313 node_count = 0; 4314 idx = a->j; 4315 ii = a->i; 4316 if (idx) { 4317 while (i < m) { /* For each row */ 4318 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4319 /* Limits the number of elements in a node to 'a->inode.limit' */ 4320 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4321 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4322 if (nzy != nzx) break; 4323 idy += nzx; /* Same nonzero pattern */ 4324 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4325 if (!flag) break; 4326 } 4327 ns[node_count++] = blk_size; 4328 idx += blk_size * nzx; 4329 i = j; 4330 } 4331 } 4332 /* If not enough inodes found,, do not use inode version of the routines */ 4333 if (!m || !idx || node_count > .8 * m) { 4334 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4335 PetscCall(PetscFree(a->inode.size)); 4336 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4337 } else { 4338 if (!A->factortype) { 4339 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4340 if (A->rmap->n == A->cmap->n) { 4341 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4342 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4343 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4344 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4345 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4346 } 4347 } else { 4348 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4349 } 4350 a->inode.node_count = node_count; 4351 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4352 } 4353 a->inode.checked = PETSC_TRUE; 4354 a->inode.mat_nonzerostate = A->nonzerostate; 4355 PetscFunctionReturn(PETSC_SUCCESS); 4356 } 4357 4358 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4359 { 4360 Mat B = *C; 4361 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4362 PetscInt m = A->rmap->n; 4363 4364 PetscFunctionBegin; 4365 c->inode.use = a->inode.use; 4366 c->inode.limit = a->inode.limit; 4367 c->inode.max_limit = a->inode.max_limit; 4368 c->inode.checked = PETSC_FALSE; 4369 c->inode.size = NULL; 4370 c->inode.node_count = 0; 4371 c->inode.ibdiagvalid = PETSC_FALSE; 4372 c->inode.ibdiag = NULL; 4373 c->inode.bdiag = NULL; 4374 c->inode.mat_nonzerostate = -1; 4375 if (a->inode.use) { 4376 if (a->inode.checked && a->inode.size) { 4377 PetscCall(PetscMalloc1(m + 1, &c->inode.size)); 4378 PetscCall(PetscArraycpy(c->inode.size, a->inode.size, m + 1)); 4379 4380 c->inode.checked = PETSC_TRUE; 4381 c->inode.node_count = a->inode.node_count; 4382 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4383 } 4384 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4385 if (!B->factortype) { 4386 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4387 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4388 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4389 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4390 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4391 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4392 } else { 4393 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4394 } 4395 } 4396 PetscFunctionReturn(PETSC_SUCCESS); 4397 } 4398 4399 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4400 { 4401 PetscInt k; 4402 const PetscInt *vi; 4403 4404 PetscFunctionBegin; 4405 vi = aj + ai[row]; 4406 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4407 vi = aj + adiag[row]; 4408 cols[nzl] = vi[0]; 4409 vi = aj + adiag[row + 1] + 1; 4410 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4411 PetscFunctionReturn(PETSC_SUCCESS); 4412 } 4413 /* 4414 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4415 Modified from MatSeqAIJCheckInode(). 4416 4417 Input Parameters: 4418 . Mat A - ILU or LU matrix factor 4419 4420 */ 4421 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4422 { 4423 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4424 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4425 PetscInt *cols1, *cols2, *ns; 4426 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4427 PetscBool flag; 4428 4429 PetscFunctionBegin; 4430 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4431 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4432 4433 m = A->rmap->n; 4434 if (a->inode.size) ns = a->inode.size; 4435 else PetscCall(PetscMalloc1(m + 1, &ns)); 4436 4437 i = 0; 4438 node_count = 0; 4439 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4440 while (i < m) { /* For each row */ 4441 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4442 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4443 nzx = nzl1 + nzu1 + 1; 4444 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4445 4446 /* Limits the number of elements in a node to 'a->inode.limit' */ 4447 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4448 nzl2 = ai[j + 1] - ai[j]; 4449 nzu2 = adiag[j] - adiag[j + 1] - 1; 4450 nzy = nzl2 + nzu2 + 1; 4451 if (nzy != nzx) break; 4452 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4453 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4454 if (!flag) break; 4455 } 4456 ns[node_count++] = blk_size; 4457 i = j; 4458 } 4459 PetscCall(PetscFree2(cols1, cols2)); 4460 /* If not enough inodes found,, do not use inode version of the routines */ 4461 if (!m || node_count > .8 * m) { 4462 PetscCall(PetscFree(ns)); 4463 4464 a->inode.node_count = 0; 4465 a->inode.size = NULL; 4466 a->inode.use = PETSC_FALSE; 4467 4468 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4469 } else { 4470 A->ops->mult = NULL; 4471 A->ops->sor = NULL; 4472 A->ops->multadd = NULL; 4473 A->ops->getrowij = NULL; 4474 A->ops->restorerowij = NULL; 4475 A->ops->getcolumnij = NULL; 4476 A->ops->restorecolumnij = NULL; 4477 A->ops->coloringpatch = NULL; 4478 A->ops->multdiagonalblock = NULL; 4479 a->inode.node_count = node_count; 4480 a->inode.size = ns; 4481 4482 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4483 } 4484 a->inode.checked = PETSC_TRUE; 4485 PetscFunctionReturn(PETSC_SUCCESS); 4486 } 4487 4488 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4489 { 4490 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4491 4492 PetscFunctionBegin; 4493 a->inode.ibdiagvalid = PETSC_FALSE; 4494 PetscFunctionReturn(PETSC_SUCCESS); 4495 } 4496 4497 /* 4498 This is really ugly. if inodes are used this replaces the 4499 permutations with ones that correspond to rows/cols of the matrix 4500 rather then inode blocks 4501 */ 4502 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4503 { 4504 PetscFunctionBegin; 4505 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4506 PetscFunctionReturn(PETSC_SUCCESS); 4507 } 4508 4509 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4510 { 4511 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4512 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4513 const PetscInt *ridx, *cidx; 4514 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size, *tns, start_val, end_val, indx; 4515 PetscInt nslim_col, *ns_col; 4516 IS ris = *rperm, cis = *cperm; 4517 4518 PetscFunctionBegin; 4519 if (!a->inode.size) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4520 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4521 4522 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4523 PetscCall(PetscMalloc1(((nslim_row > nslim_col) ? nslim_row : nslim_col) + 1, &tns)); 4524 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4525 4526 PetscCall(ISGetIndices(ris, &ridx)); 4527 PetscCall(ISGetIndices(cis, &cidx)); 4528 4529 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4530 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + ns_row[i]; 4531 4532 /* Construct the permutations for rows*/ 4533 for (i = 0, row = 0; i < nslim_row; ++i) { 4534 indx = ridx[i]; 4535 start_val = tns[indx]; 4536 end_val = tns[indx + 1]; 4537 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4538 } 4539 4540 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4541 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + ns_col[i]; 4542 4543 /* Construct permutations for columns */ 4544 for (i = 0, col = 0; i < nslim_col; ++i) { 4545 indx = cidx[i]; 4546 start_val = tns[indx]; 4547 end_val = tns[indx + 1]; 4548 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4549 } 4550 4551 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4552 PetscCall(ISSetPermutation(*rperm)); 4553 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4554 PetscCall(ISSetPermutation(*cperm)); 4555 4556 PetscCall(ISRestoreIndices(ris, &ridx)); 4557 PetscCall(ISRestoreIndices(cis, &cidx)); 4558 4559 PetscCall(PetscFree(ns_col)); 4560 PetscCall(PetscFree2(permr, permc)); 4561 PetscCall(ISDestroy(&cis)); 4562 PetscCall(ISDestroy(&ris)); 4563 PetscCall(PetscFree(tns)); 4564 PetscFunctionReturn(PETSC_SUCCESS); 4565 } 4566 4567 /*@C 4568 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4569 4570 Not Collective 4571 4572 Input Parameter: 4573 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4574 4575 Output Parameters: 4576 + node_count - no of inodes present in the matrix. 4577 . sizes - an array of size `node_count`, with the sizes of each inode. 4578 - limit - the max size used to generate the inodes. 4579 4580 Level: advanced 4581 4582 Note: 4583 It should be called after the matrix is assembled. 4584 The contents of the sizes[] array should not be changed. 4585 `NULL` may be passed for information not needed 4586 4587 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 4588 @*/ 4589 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4590 { 4591 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4592 4593 PetscFunctionBegin; 4594 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4595 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4596 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4597 PetscFunctionReturn(PETSC_SUCCESS); 4598 } 4599 4600 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4601 { 4602 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4603 4604 PetscFunctionBegin; 4605 if (node_count) *node_count = a->inode.node_count; 4606 if (sizes) *sizes = a->inode.size; 4607 if (limit) *limit = a->inode.limit; 4608 PetscFunctionReturn(PETSC_SUCCESS); 4609 } 4610