1 /* 2 This file provides high performance routines for the Inode format (compressed sparse row) 3 by taking advantage of rows with identical nonzero structure (I-nodes). 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #if defined(PETSC_HAVE_XMMINTRIN_H) 7 #include <xmmintrin.h> 8 #endif 9 10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 13 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 14 15 PetscFunctionBegin; 16 n = A->cmap->n; 17 m = A->rmap->n; 18 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 19 ns_row = a->inode.size_csr; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++); 24 for (; count + 1 < n; count++, i++); 25 if (count < n) i++; 26 *size = i; 27 PetscFunctionReturn(PETSC_SUCCESS); 28 } 29 PetscCall(PetscMalloc1(n + 1, &ns_col)); 30 ns_col[0] = 0; 31 32 /* Use the same row structure wherever feasible. */ 33 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1]; 34 35 /* if m < n; pad up the remainder with inode_limit */ 36 for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1; 37 /* The last node is the odd ball. pad it up with the remaining rows; */ 38 if (count < n) { 39 ns_col[i + 1] = ns_col[i] + (n - count); 40 i++; 41 } else if (count > n) { 42 /* Adjust for the over estimation */ 43 ns_col[i] += n - count; 44 } 45 *size = i; 46 *ns = ns_col; 47 PetscFunctionReturn(PETSC_SUCCESS); 48 } 49 50 /* 51 This builds symmetric version of nonzero structure, 52 */ 53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 54 { 55 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 56 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 57 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2; 58 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 59 60 PetscFunctionBegin; 61 nslim_row = a->inode.node_count; 62 m = A->rmap->n; 63 n = A->cmap->n; 64 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 65 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 66 67 /* Use the row_inode as column_inode */ 68 nslim_col = nslim_row; 69 ns_col = ns_row; 70 71 /* allocate space for reformatted inode structure */ 72 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 73 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]); 74 75 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 76 nsz = ns_col[i1 + 1] - ns_col[i1]; 77 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 78 } 79 /* allocate space for row pointers */ 80 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 81 *iia = ia; 82 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 83 84 /* determine the number of columns in each row */ 85 ia[0] = oshift; 86 for (i1 = 0; i1 < nslim_row; i1++) { 87 row = ns_row[i1]; 88 j = aj + ai[row] + ishift; 89 jmax = aj + ai[row + 1] + ishift; 90 if (j == jmax) continue; /* empty row */ 91 col = *j++ + ishift; 92 i2 = tvc[col]; 93 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 94 ia[i1 + 1]++; 95 ia[i2 + 1]++; 96 i2++; /* Start col of next node */ 97 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 98 i2 = tvc[col]; 99 } 100 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 101 } 102 103 /* shift ia[i] to point to next row */ 104 for (i1 = 1; i1 < nslim_row + 1; i1++) { 105 row = ia[i1 - 1]; 106 ia[i1] += row; 107 work[i1 - 1] = row - oshift; 108 } 109 110 /* allocate space for column pointers */ 111 nz = ia[nslim_row] + (!ishift); 112 PetscCall(PetscMalloc1(nz, &ja)); 113 *jja = ja; 114 115 /* loop over lower triangular part putting into ja */ 116 for (i1 = 0; i1 < nslim_row; i1++) { 117 row = ns_row[i1]; 118 j = aj + ai[row] + ishift; 119 jmax = aj + ai[row + 1] + ishift; 120 if (j == jmax) continue; /* empty row */ 121 col = *j++ + ishift; 122 i2 = tvc[col]; 123 while (i2 < i1 && j < jmax) { 124 ja[work[i2]++] = i1 + oshift; 125 ja[work[i1]++] = i2 + oshift; 126 ++i2; 127 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 128 i2 = tvc[col]; 129 } 130 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 131 } 132 PetscCall(PetscFree(work)); 133 PetscCall(PetscFree2(tns, tvc)); 134 PetscFunctionReturn(PETSC_SUCCESS); 135 } 136 137 /* 138 This builds nonsymmetric version of nonzero structure, 139 */ 140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 141 { 142 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 143 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 144 PetscInt *tns, *tvc, nsz, i1, i2; 145 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr; 146 147 PetscFunctionBegin; 148 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 149 nslim_row = a->inode.node_count; 150 n = A->cmap->n; 151 152 /* Create The column_inode for this matrix */ 153 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 154 155 /* allocate space for reformatted column_inode structure */ 156 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 157 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 158 159 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 160 nsz = ns_col[i1 + 1] - ns_col[i1]; 161 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 162 } 163 /* allocate space for row pointers */ 164 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 165 *iia = ia; 166 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 167 168 /* determine the number of columns in each row */ 169 ia[0] = oshift; 170 for (i1 = 0; i1 < nslim_row; i1++) { 171 row = ns_row[i1]; 172 j = aj + ai[row] + ishift; 173 nz = ai[row + 1] - ai[row]; 174 if (!nz) continue; /* empty row */ 175 col = *j++ + ishift; 176 i2 = tvc[col]; 177 while (nz-- > 0) { /* off-diagonal elements */ 178 ia[i1 + 1]++; 179 i2++; /* Start col of next node */ 180 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 181 if (nz > 0) i2 = tvc[col]; 182 } 183 } 184 185 /* shift ia[i] to point to next row */ 186 for (i1 = 1; i1 < nslim_row + 1; i1++) { 187 row = ia[i1 - 1]; 188 ia[i1] += row; 189 work[i1 - 1] = row - oshift; 190 } 191 192 /* allocate space for column pointers */ 193 nz = ia[nslim_row] + (!ishift); 194 PetscCall(PetscMalloc1(nz, &ja)); 195 *jja = ja; 196 197 /* loop over matrix putting into ja */ 198 for (i1 = 0; i1 < nslim_row; i1++) { 199 row = ns_row[i1]; 200 j = aj + ai[row] + ishift; 201 nz = ai[row + 1] - ai[row]; 202 if (!nz) continue; /* empty row */ 203 col = *j++ + ishift; 204 i2 = tvc[col]; 205 while (nz-- > 0) { 206 ja[work[i1]++] = i2 + oshift; 207 ++i2; 208 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 209 if (nz > 0) i2 = tvc[col]; 210 } 211 } 212 PetscCall(PetscFree(ns_col)); 213 PetscCall(PetscFree(work)); 214 PetscCall(PetscFree2(tns, tvc)); 215 PetscFunctionReturn(PETSC_SUCCESS); 216 } 217 218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 219 { 220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 221 222 PetscFunctionBegin; 223 if (n) *n = a->inode.node_count; 224 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 225 if (!blockcompressed) { 226 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 227 } else if (symmetric) { 228 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 229 } else { 230 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 231 } 232 PetscFunctionReturn(PETSC_SUCCESS); 233 } 234 235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 236 { 237 PetscFunctionBegin; 238 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 239 240 if (!blockcompressed) { 241 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 242 } else { 243 PetscCall(PetscFree(*ia)); 244 PetscCall(PetscFree(*ja)); 245 } 246 PetscFunctionReturn(PETSC_SUCCESS); 247 } 248 249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 250 { 251 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 252 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 253 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j; 254 255 PetscFunctionBegin; 256 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 257 nslim_row = a->inode.node_count; 258 n = A->cmap->n; 259 260 /* Create The column_inode for this matrix */ 261 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 262 263 /* allocate space for reformatted column_inode structure */ 264 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 265 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 266 267 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 268 nsz = ns_col[i1 + 1] - ns_col[i1]; 269 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 270 } 271 /* allocate space for column pointers */ 272 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 273 *iia = ia; 274 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 275 276 /* determine the number of columns in each row */ 277 ia[0] = oshift; 278 for (i1 = 0; i1 < nslim_row; i1++) { 279 row = ns_row[i1]; 280 j = aj + ai[row] + ishift; 281 col = *j++ + ishift; 282 i2 = tvc[col]; 283 nz = ai[row + 1] - ai[row]; 284 while (nz-- > 0) { /* off-diagonal elements */ 285 /* ia[i1+1]++; */ 286 ia[i2 + 1]++; 287 i2++; 288 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 289 if (nz > 0) i2 = tvc[col]; 290 } 291 } 292 293 /* shift ia[i] to point to next col */ 294 for (i1 = 1; i1 < nslim_col + 1; i1++) { 295 col = ia[i1 - 1]; 296 ia[i1] += col; 297 work[i1 - 1] = col - oshift; 298 } 299 300 /* allocate space for column pointers */ 301 nz = ia[nslim_col] + (!ishift); 302 PetscCall(PetscMalloc1(nz, &ja)); 303 *jja = ja; 304 305 /* loop over matrix putting into ja */ 306 for (i1 = 0; i1 < nslim_row; i1++) { 307 row = ns_row[i1]; 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row + 1] - ai[row]; 312 while (nz-- > 0) { 313 /* ja[work[i1]++] = i2 + oshift; */ 314 ja[work[i2]++] = i1 + oshift; 315 i2++; 316 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 PetscCall(PetscFree(ns_col)); 321 PetscCall(PetscFree(work)); 322 PetscCall(PetscFree2(tns, tvc)); 323 PetscFunctionReturn(PETSC_SUCCESS); 324 } 325 326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 327 { 328 PetscFunctionBegin; 329 PetscCall(MatCreateColInode_Private(A, n, NULL)); 330 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 331 332 if (!blockcompressed) { 333 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 334 } else if (symmetric) { 335 /* Since the indices are symmetric it doesn't matter */ 336 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 337 } else { 338 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 339 } 340 PetscFunctionReturn(PETSC_SUCCESS); 341 } 342 343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 344 { 345 PetscFunctionBegin; 346 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 347 if (!blockcompressed) { 348 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 349 } else { 350 PetscCall(PetscFree(*ia)); 351 PetscCall(PetscFree(*ja)); 352 } 353 PetscFunctionReturn(PETSC_SUCCESS); 354 } 355 356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 357 { 358 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 359 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 360 PetscScalar *y; 361 const PetscScalar *x; 362 const MatScalar *v1, *v2, *v3, *v4, *v5; 363 PetscInt i1, i2, n, i, row, node_max, nsz, sz, nonzerorow = 0; 364 const PetscInt *idx, *ns, *ii; 365 366 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 367 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 368 #endif 369 370 PetscFunctionBegin; 371 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 372 node_max = a->inode.node_count; 373 ns = a->inode.size_csr; /* Node Size array */ 374 PetscCall(VecGetArrayRead(xx, &x)); 375 PetscCall(VecGetArray(yy, &y)); 376 idx = a->j; 377 v1 = a->a; 378 ii = a->i; 379 380 for (i = 0, row = 0; i < node_max; ++i) { 381 nsz = ns[i + 1] - ns[i]; 382 n = ii[1] - ii[0]; 383 nonzerorow += (n > 0) * nsz; 384 ii += nsz; 385 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 386 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 387 sz = n; /* No of non zeros in this row */ 388 /* Switch on the size of Node */ 389 switch (nsz) { /* Each loop in 'case' is unrolled */ 390 case 1: 391 sum1 = 0.; 392 393 for (n = 0; n < sz - 1; n += 2) { 394 i1 = idx[0]; /* The instructions are ordered to */ 395 i2 = idx[1]; /* make the compiler's job easy */ 396 idx += 2; 397 tmp0 = x[i1]; 398 tmp1 = x[i2]; 399 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 400 v1 += 2; 401 } 402 403 if (n == sz - 1) { /* Take care of the last nonzero */ 404 tmp0 = x[*idx++]; 405 sum1 += *v1++ * tmp0; 406 } 407 y[row++] = sum1; 408 break; 409 case 2: 410 sum1 = 0.; 411 sum2 = 0.; 412 v2 = v1 + n; 413 414 for (n = 0; n < sz - 1; n += 2) { 415 i1 = idx[0]; 416 i2 = idx[1]; 417 idx += 2; 418 tmp0 = x[i1]; 419 tmp1 = x[i2]; 420 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 421 v1 += 2; 422 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 423 v2 += 2; 424 } 425 if (n == sz - 1) { 426 tmp0 = x[*idx++]; 427 sum1 += *v1++ * tmp0; 428 sum2 += *v2++ * tmp0; 429 } 430 y[row++] = sum1; 431 y[row++] = sum2; 432 v1 = v2; /* Since the next block to be processed starts there*/ 433 idx += sz; 434 break; 435 case 3: 436 sum1 = 0.; 437 sum2 = 0.; 438 sum3 = 0.; 439 v2 = v1 + n; 440 v3 = v2 + n; 441 442 for (n = 0; n < sz - 1; n += 2) { 443 i1 = idx[0]; 444 i2 = idx[1]; 445 idx += 2; 446 tmp0 = x[i1]; 447 tmp1 = x[i2]; 448 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 449 v1 += 2; 450 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 451 v2 += 2; 452 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 453 v3 += 2; 454 } 455 if (n == sz - 1) { 456 tmp0 = x[*idx++]; 457 sum1 += *v1++ * tmp0; 458 sum2 += *v2++ * tmp0; 459 sum3 += *v3++ * tmp0; 460 } 461 y[row++] = sum1; 462 y[row++] = sum2; 463 y[row++] = sum3; 464 v1 = v3; /* Since the next block to be processed starts there*/ 465 idx += 2 * sz; 466 break; 467 case 4: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 sum4 = 0.; 472 v2 = v1 + n; 473 v3 = v2 + n; 474 v4 = v3 + n; 475 476 for (n = 0; n < sz - 1; n += 2) { 477 i1 = idx[0]; 478 i2 = idx[1]; 479 idx += 2; 480 tmp0 = x[i1]; 481 tmp1 = x[i2]; 482 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 483 v1 += 2; 484 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 485 v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 487 v3 += 2; 488 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 489 v4 += 2; 490 } 491 if (n == sz - 1) { 492 tmp0 = x[*idx++]; 493 sum1 += *v1++ * tmp0; 494 sum2 += *v2++ * tmp0; 495 sum3 += *v3++ * tmp0; 496 sum4 += *v4++ * tmp0; 497 } 498 y[row++] = sum1; 499 y[row++] = sum2; 500 y[row++] = sum3; 501 y[row++] = sum4; 502 v1 = v4; /* Since the next block to be processed starts there*/ 503 idx += 3 * sz; 504 break; 505 case 5: 506 sum1 = 0.; 507 sum2 = 0.; 508 sum3 = 0.; 509 sum4 = 0.; 510 sum5 = 0.; 511 v2 = v1 + n; 512 v3 = v2 + n; 513 v4 = v3 + n; 514 v5 = v4 + n; 515 516 for (n = 0; n < sz - 1; n += 2) { 517 i1 = idx[0]; 518 i2 = idx[1]; 519 idx += 2; 520 tmp0 = x[i1]; 521 tmp1 = x[i2]; 522 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 523 v1 += 2; 524 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 525 v2 += 2; 526 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 527 v3 += 2; 528 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 529 v4 += 2; 530 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 531 v5 += 2; 532 } 533 if (n == sz - 1) { 534 tmp0 = x[*idx++]; 535 sum1 += *v1++ * tmp0; 536 sum2 += *v2++ * tmp0; 537 sum3 += *v3++ * tmp0; 538 sum4 += *v4++ * tmp0; 539 sum5 += *v5++ * tmp0; 540 } 541 y[row++] = sum1; 542 y[row++] = sum2; 543 y[row++] = sum3; 544 y[row++] = sum4; 545 y[row++] = sum5; 546 v1 = v5; /* Since the next block to be processed starts there */ 547 idx += 4 * sz; 548 break; 549 default: 550 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz); 551 } 552 } 553 PetscCall(VecRestoreArrayRead(xx, &x)); 554 PetscCall(VecRestoreArray(yy, &y)); 555 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 556 PetscFunctionReturn(PETSC_SUCCESS); 557 } 558 559 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 560 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 561 { 562 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 563 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 564 const MatScalar *v1, *v2, *v3, *v4, *v5; 565 const PetscScalar *x; 566 PetscScalar *y, *z, *zt; 567 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 568 const PetscInt *idx, *ns, *ii; 569 570 PetscFunctionBegin; 571 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 572 node_max = a->inode.node_count; 573 ns = a->inode.size_csr; /* Node Size array */ 574 575 PetscCall(VecGetArrayRead(xx, &x)); 576 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 577 zt = z; 578 579 idx = a->j; 580 v1 = a->a; 581 ii = a->i; 582 583 for (i = 0; i < node_max; ++i) { 584 row = ns[i]; 585 nsz = ns[i + 1] - ns[i]; 586 n = ii[1] - ii[0]; 587 ii += nsz; 588 sz = n; /* No of non zeros in this row */ 589 /* Switch on the size of Node */ 590 switch (nsz) { /* Each loop in 'case' is unrolled */ 591 case 1: 592 sum1 = *zt++; 593 594 for (n = 0; n < sz - 1; n += 2) { 595 i1 = idx[0]; /* The instructions are ordered to */ 596 i2 = idx[1]; /* make the compiler's job easy */ 597 idx += 2; 598 tmp0 = x[i1]; 599 tmp1 = x[i2]; 600 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 601 v1 += 2; 602 } 603 604 if (n == sz - 1) { /* Take care of the last nonzero */ 605 tmp0 = x[*idx++]; 606 sum1 += *v1++ * tmp0; 607 } 608 y[row++] = sum1; 609 break; 610 case 2: 611 sum1 = *zt++; 612 sum2 = *zt++; 613 v2 = v1 + n; 614 615 for (n = 0; n < sz - 1; n += 2) { 616 i1 = idx[0]; 617 i2 = idx[1]; 618 idx += 2; 619 tmp0 = x[i1]; 620 tmp1 = x[i2]; 621 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 622 v1 += 2; 623 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 624 v2 += 2; 625 } 626 if (n == sz - 1) { 627 tmp0 = x[*idx++]; 628 sum1 += *v1++ * tmp0; 629 sum2 += *v2++ * tmp0; 630 } 631 y[row++] = sum1; 632 y[row++] = sum2; 633 v1 = v2; /* Since the next block to be processed starts there*/ 634 idx += sz; 635 break; 636 case 3: 637 sum1 = *zt++; 638 sum2 = *zt++; 639 sum3 = *zt++; 640 v2 = v1 + n; 641 v3 = v2 + n; 642 643 for (n = 0; n < sz - 1; n += 2) { 644 i1 = idx[0]; 645 i2 = idx[1]; 646 idx += 2; 647 tmp0 = x[i1]; 648 tmp1 = x[i2]; 649 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 650 v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 652 v2 += 2; 653 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 654 v3 += 2; 655 } 656 if (n == sz - 1) { 657 tmp0 = x[*idx++]; 658 sum1 += *v1++ * tmp0; 659 sum2 += *v2++ * tmp0; 660 sum3 += *v3++ * tmp0; 661 } 662 y[row++] = sum1; 663 y[row++] = sum2; 664 y[row++] = sum3; 665 v1 = v3; /* Since the next block to be processed starts there*/ 666 idx += 2 * sz; 667 break; 668 case 4: 669 sum1 = *zt++; 670 sum2 = *zt++; 671 sum3 = *zt++; 672 sum4 = *zt++; 673 v2 = v1 + n; 674 v3 = v2 + n; 675 v4 = v3 + n; 676 677 for (n = 0; n < sz - 1; n += 2) { 678 i1 = idx[0]; 679 i2 = idx[1]; 680 idx += 2; 681 tmp0 = x[i1]; 682 tmp1 = x[i2]; 683 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 684 v1 += 2; 685 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 686 v2 += 2; 687 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 688 v3 += 2; 689 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 690 v4 += 2; 691 } 692 if (n == sz - 1) { 693 tmp0 = x[*idx++]; 694 sum1 += *v1++ * tmp0; 695 sum2 += *v2++ * tmp0; 696 sum3 += *v3++ * tmp0; 697 sum4 += *v4++ * tmp0; 698 } 699 y[row++] = sum1; 700 y[row++] = sum2; 701 y[row++] = sum3; 702 y[row++] = sum4; 703 v1 = v4; /* Since the next block to be processed starts there*/ 704 idx += 3 * sz; 705 break; 706 case 5: 707 sum1 = *zt++; 708 sum2 = *zt++; 709 sum3 = *zt++; 710 sum4 = *zt++; 711 sum5 = *zt++; 712 v2 = v1 + n; 713 v3 = v2 + n; 714 v4 = v3 + n; 715 v5 = v4 + n; 716 717 for (n = 0; n < sz - 1; n += 2) { 718 i1 = idx[0]; 719 i2 = idx[1]; 720 idx += 2; 721 tmp0 = x[i1]; 722 tmp1 = x[i2]; 723 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 724 v1 += 2; 725 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 726 v2 += 2; 727 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 728 v3 += 2; 729 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 730 v4 += 2; 731 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 732 v5 += 2; 733 } 734 if (n == sz - 1) { 735 tmp0 = x[*idx++]; 736 sum1 += *v1++ * tmp0; 737 sum2 += *v2++ * tmp0; 738 sum3 += *v3++ * tmp0; 739 sum4 += *v4++ * tmp0; 740 sum5 += *v5++ * tmp0; 741 } 742 y[row++] = sum1; 743 y[row++] = sum2; 744 y[row++] = sum3; 745 y[row++] = sum4; 746 y[row++] = sum5; 747 v1 = v5; /* Since the next block to be processed starts there */ 748 idx += 4 * sz; 749 break; 750 default: 751 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 752 } 753 } 754 PetscCall(VecRestoreArrayRead(xx, &x)); 755 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 756 PetscCall(PetscLogFlops(2.0 * a->nz)); 757 PetscFunctionReturn(PETSC_SUCCESS); 758 } 759 760 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 761 { 762 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 763 IS iscol = a->col, isrow = a->row; 764 const PetscInt *r, *c, *rout, *cout; 765 PetscInt i, j, n = A->rmap->n, nz; 766 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 767 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 768 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 769 PetscScalar sum1, sum2, sum3, sum4, sum5; 770 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 771 const PetscScalar *b; 772 773 PetscFunctionBegin; 774 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 775 node_max = a->inode.node_count; 776 ns = a->inode.size_csr; /* Node Size array */ 777 778 PetscCall(VecGetArrayRead(bb, &b)); 779 PetscCall(VecGetArrayWrite(xx, &x)); 780 tmp = a->solve_work; 781 782 PetscCall(ISGetIndices(isrow, &rout)); 783 r = rout; 784 PetscCall(ISGetIndices(iscol, &cout)); 785 c = cout + (n - 1); 786 787 /* forward solve the lower triangular */ 788 tmps = tmp; 789 aa = a_a; 790 aj = a_j; 791 ad = a->diag; 792 793 for (i = 0, row = 0; i < node_max; ++i) { 794 row = ns[i]; 795 nsz = ns[i + 1] - ns[i]; 796 aii = ai[row]; 797 v1 = aa + aii; 798 vi = aj + aii; 799 nz = ad[row] - aii; 800 if (i < node_max - 1) { 801 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 802 * but our indexing to determine its size could. */ 803 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 804 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 805 PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 806 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 807 } 808 809 switch (nsz) { /* Each loop in 'case' is unrolled */ 810 case 1: 811 sum1 = b[*r++]; 812 for (j = 0; j < nz - 1; j += 2) { 813 i0 = vi[0]; 814 i1 = vi[1]; 815 vi += 2; 816 tmp0 = tmps[i0]; 817 tmp1 = tmps[i1]; 818 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 819 v1 += 2; 820 } 821 if (j == nz - 1) { 822 tmp0 = tmps[*vi++]; 823 sum1 -= *v1++ * tmp0; 824 } 825 tmp[row++] = sum1; 826 break; 827 case 2: 828 sum1 = b[*r++]; 829 sum2 = b[*r++]; 830 v2 = aa + ai[row + 1]; 831 832 for (j = 0; j < nz - 1; j += 2) { 833 i0 = vi[0]; 834 i1 = vi[1]; 835 vi += 2; 836 tmp0 = tmps[i0]; 837 tmp1 = tmps[i1]; 838 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 839 v1 += 2; 840 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 841 v2 += 2; 842 } 843 if (j == nz - 1) { 844 tmp0 = tmps[*vi++]; 845 sum1 -= *v1++ * tmp0; 846 sum2 -= *v2++ * tmp0; 847 } 848 sum2 -= *v2++ * sum1; 849 tmp[row++] = sum1; 850 tmp[row++] = sum2; 851 break; 852 case 3: 853 sum1 = b[*r++]; 854 sum2 = b[*r++]; 855 sum3 = b[*r++]; 856 v2 = aa + ai[row + 1]; 857 v3 = aa + ai[row + 2]; 858 859 for (j = 0; j < nz - 1; j += 2) { 860 i0 = vi[0]; 861 i1 = vi[1]; 862 vi += 2; 863 tmp0 = tmps[i0]; 864 tmp1 = tmps[i1]; 865 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 866 v1 += 2; 867 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 868 v2 += 2; 869 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 870 v3 += 2; 871 } 872 if (j == nz - 1) { 873 tmp0 = tmps[*vi++]; 874 sum1 -= *v1++ * tmp0; 875 sum2 -= *v2++ * tmp0; 876 sum3 -= *v3++ * tmp0; 877 } 878 sum2 -= *v2++ * sum1; 879 sum3 -= *v3++ * sum1; 880 sum3 -= *v3++ * sum2; 881 882 tmp[row++] = sum1; 883 tmp[row++] = sum2; 884 tmp[row++] = sum3; 885 break; 886 887 case 4: 888 sum1 = b[*r++]; 889 sum2 = b[*r++]; 890 sum3 = b[*r++]; 891 sum4 = b[*r++]; 892 v2 = aa + ai[row + 1]; 893 v3 = aa + ai[row + 2]; 894 v4 = aa + ai[row + 3]; 895 896 for (j = 0; j < nz - 1; j += 2) { 897 i0 = vi[0]; 898 i1 = vi[1]; 899 vi += 2; 900 tmp0 = tmps[i0]; 901 tmp1 = tmps[i1]; 902 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 903 v1 += 2; 904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 905 v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 907 v3 += 2; 908 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 909 v4 += 2; 910 } 911 if (j == nz - 1) { 912 tmp0 = tmps[*vi++]; 913 sum1 -= *v1++ * tmp0; 914 sum2 -= *v2++ * tmp0; 915 sum3 -= *v3++ * tmp0; 916 sum4 -= *v4++ * tmp0; 917 } 918 sum2 -= *v2++ * sum1; 919 sum3 -= *v3++ * sum1; 920 sum4 -= *v4++ * sum1; 921 sum3 -= *v3++ * sum2; 922 sum4 -= *v4++ * sum2; 923 sum4 -= *v4++ * sum3; 924 925 tmp[row++] = sum1; 926 tmp[row++] = sum2; 927 tmp[row++] = sum3; 928 tmp[row++] = sum4; 929 break; 930 case 5: 931 sum1 = b[*r++]; 932 sum2 = b[*r++]; 933 sum3 = b[*r++]; 934 sum4 = b[*r++]; 935 sum5 = b[*r++]; 936 v2 = aa + ai[row + 1]; 937 v3 = aa + ai[row + 2]; 938 v4 = aa + ai[row + 3]; 939 v5 = aa + ai[row + 4]; 940 941 for (j = 0; j < nz - 1; j += 2) { 942 i0 = vi[0]; 943 i1 = vi[1]; 944 vi += 2; 945 tmp0 = tmps[i0]; 946 tmp1 = tmps[i1]; 947 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 948 v1 += 2; 949 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 950 v2 += 2; 951 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 952 v3 += 2; 953 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 954 v4 += 2; 955 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 956 v5 += 2; 957 } 958 if (j == nz - 1) { 959 tmp0 = tmps[*vi++]; 960 sum1 -= *v1++ * tmp0; 961 sum2 -= *v2++ * tmp0; 962 sum3 -= *v3++ * tmp0; 963 sum4 -= *v4++ * tmp0; 964 sum5 -= *v5++ * tmp0; 965 } 966 967 sum2 -= *v2++ * sum1; 968 sum3 -= *v3++ * sum1; 969 sum4 -= *v4++ * sum1; 970 sum5 -= *v5++ * sum1; 971 sum3 -= *v3++ * sum2; 972 sum4 -= *v4++ * sum2; 973 sum5 -= *v5++ * sum2; 974 sum4 -= *v4++ * sum3; 975 sum5 -= *v5++ * sum3; 976 sum5 -= *v5++ * sum4; 977 978 tmp[row++] = sum1; 979 tmp[row++] = sum2; 980 tmp[row++] = sum3; 981 tmp[row++] = sum4; 982 tmp[row++] = sum5; 983 break; 984 default: 985 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 986 } 987 } 988 /* backward solve the upper triangular */ 989 for (i = node_max - 1; i >= 0; i--) { 990 row = ns[i + 1]; 991 nsz = ns[i + 1] - ns[i]; 992 aii = ai[row + 1] - 1; 993 v1 = aa + aii; 994 vi = aj + aii; 995 nz = aii - ad[row]; 996 switch (nsz) { /* Each loop in 'case' is unrolled */ 997 case 1: 998 sum1 = tmp[row]; 999 1000 for (j = nz; j > 1; j -= 2) { 1001 vi -= 2; 1002 i0 = vi[2]; 1003 i1 = vi[1]; 1004 tmp0 = tmps[i0]; 1005 tmp1 = tmps[i1]; 1006 v1 -= 2; 1007 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1008 } 1009 if (j == 1) { 1010 tmp0 = tmps[*vi--]; 1011 sum1 -= *v1-- * tmp0; 1012 } 1013 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1014 row--; 1015 break; 1016 case 2: 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row - 1]; 1019 v2 = aa + ai[row] - 1; 1020 for (j = nz; j > 1; j -= 2) { 1021 vi -= 2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j == 1) { 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1038 row--; 1039 sum2 -= *v2-- * tmp0; 1040 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1041 row--; 1042 break; 1043 case 3: 1044 sum1 = tmp[row]; 1045 sum2 = tmp[row - 1]; 1046 sum3 = tmp[row - 2]; 1047 v2 = aa + ai[row] - 1; 1048 v3 = aa + ai[row - 1] - 1; 1049 for (j = nz; j > 1; j -= 2) { 1050 vi -= 2; 1051 i0 = vi[2]; 1052 i1 = vi[1]; 1053 tmp0 = tmps[i0]; 1054 tmp1 = tmps[i1]; 1055 v1 -= 2; 1056 v2 -= 2; 1057 v3 -= 2; 1058 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1059 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1060 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1061 } 1062 if (j == 1) { 1063 tmp0 = tmps[*vi--]; 1064 sum1 -= *v1-- * tmp0; 1065 sum2 -= *v2-- * tmp0; 1066 sum3 -= *v3-- * tmp0; 1067 } 1068 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1069 row--; 1070 sum2 -= *v2-- * tmp0; 1071 sum3 -= *v3-- * tmp0; 1072 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1073 row--; 1074 sum3 -= *v3-- * tmp0; 1075 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1076 row--; 1077 1078 break; 1079 case 4: 1080 sum1 = tmp[row]; 1081 sum2 = tmp[row - 1]; 1082 sum3 = tmp[row - 2]; 1083 sum4 = tmp[row - 3]; 1084 v2 = aa + ai[row] - 1; 1085 v3 = aa + ai[row - 1] - 1; 1086 v4 = aa + ai[row - 2] - 1; 1087 1088 for (j = nz; j > 1; j -= 2) { 1089 vi -= 2; 1090 i0 = vi[2]; 1091 i1 = vi[1]; 1092 tmp0 = tmps[i0]; 1093 tmp1 = tmps[i1]; 1094 v1 -= 2; 1095 v2 -= 2; 1096 v3 -= 2; 1097 v4 -= 2; 1098 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1099 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1100 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1101 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1102 } 1103 if (j == 1) { 1104 tmp0 = tmps[*vi--]; 1105 sum1 -= *v1-- * tmp0; 1106 sum2 -= *v2-- * tmp0; 1107 sum3 -= *v3-- * tmp0; 1108 sum4 -= *v4-- * tmp0; 1109 } 1110 1111 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1112 row--; 1113 sum2 -= *v2-- * tmp0; 1114 sum3 -= *v3-- * tmp0; 1115 sum4 -= *v4-- * tmp0; 1116 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1117 row--; 1118 sum3 -= *v3-- * tmp0; 1119 sum4 -= *v4-- * tmp0; 1120 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1121 row--; 1122 sum4 -= *v4-- * tmp0; 1123 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1124 row--; 1125 break; 1126 case 5: 1127 sum1 = tmp[row]; 1128 sum2 = tmp[row - 1]; 1129 sum3 = tmp[row - 2]; 1130 sum4 = tmp[row - 3]; 1131 sum5 = tmp[row - 4]; 1132 v2 = aa + ai[row] - 1; 1133 v3 = aa + ai[row - 1] - 1; 1134 v4 = aa + ai[row - 2] - 1; 1135 v5 = aa + ai[row - 3] - 1; 1136 for (j = nz; j > 1; j -= 2) { 1137 vi -= 2; 1138 i0 = vi[2]; 1139 i1 = vi[1]; 1140 tmp0 = tmps[i0]; 1141 tmp1 = tmps[i1]; 1142 v1 -= 2; 1143 v2 -= 2; 1144 v3 -= 2; 1145 v4 -= 2; 1146 v5 -= 2; 1147 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1148 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1149 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1150 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1151 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1152 } 1153 if (j == 1) { 1154 tmp0 = tmps[*vi--]; 1155 sum1 -= *v1-- * tmp0; 1156 sum2 -= *v2-- * tmp0; 1157 sum3 -= *v3-- * tmp0; 1158 sum4 -= *v4-- * tmp0; 1159 sum5 -= *v5-- * tmp0; 1160 } 1161 1162 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1163 row--; 1164 sum2 -= *v2-- * tmp0; 1165 sum3 -= *v3-- * tmp0; 1166 sum4 -= *v4-- * tmp0; 1167 sum5 -= *v5-- * tmp0; 1168 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1169 row--; 1170 sum3 -= *v3-- * tmp0; 1171 sum4 -= *v4-- * tmp0; 1172 sum5 -= *v5-- * tmp0; 1173 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1174 row--; 1175 sum4 -= *v4-- * tmp0; 1176 sum5 -= *v5-- * tmp0; 1177 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1178 row--; 1179 sum5 -= *v5-- * tmp0; 1180 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1181 row--; 1182 break; 1183 default: 1184 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1185 } 1186 } 1187 PetscCall(ISRestoreIndices(isrow, &rout)); 1188 PetscCall(ISRestoreIndices(iscol, &cout)); 1189 PetscCall(VecRestoreArrayRead(bb, &b)); 1190 PetscCall(VecRestoreArrayWrite(xx, &x)); 1191 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1192 PetscFunctionReturn(PETSC_SUCCESS); 1193 } 1194 1195 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1196 { 1197 Mat C = B; 1198 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1199 IS isrow = b->row, isicol = b->icol; 1200 const PetscInt *r, *ic, *ics; 1201 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1202 PetscInt i, j, k, nz, nzL, row, *pj; 1203 const PetscInt *ajtmp, *bjtmp; 1204 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1205 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1206 FactorShiftCtx sctx; 1207 const PetscInt *ddiag; 1208 PetscReal rs; 1209 MatScalar d; 1210 PetscInt inod, nodesz, node_max, col; 1211 const PetscInt *ns; 1212 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1213 1214 PetscFunctionBegin; 1215 /* MatPivotSetUp(): initialize shift context sctx */ 1216 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1217 1218 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1219 ddiag = a->diag; 1220 sctx.shift_top = info->zeropivot; 1221 for (i = 0; i < n; i++) { 1222 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1223 d = (aa)[ddiag[i]]; 1224 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1225 v = aa + ai[i]; 1226 nz = ai[i + 1] - ai[i]; 1227 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1228 if (rs > sctx.shift_top) sctx.shift_top = rs; 1229 } 1230 sctx.shift_top *= 1.1; 1231 sctx.nshift_max = 5; 1232 sctx.shift_lo = 0.; 1233 sctx.shift_hi = 1.; 1234 } 1235 1236 PetscCall(ISGetIndices(isrow, &r)); 1237 PetscCall(ISGetIndices(isicol, &ic)); 1238 1239 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1240 ics = ic; 1241 1242 node_max = a->inode.node_count; 1243 ns = a->inode.size_csr; 1244 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1245 1246 /* If max inode size > 4, split it into two inodes.*/ 1247 /* also map the inode sizes according to the ordering */ 1248 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1249 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1250 nodesz = ns[i + 1] - ns[i]; 1251 if (nodesz > 4) { 1252 tmp_vec1[j] = 4; 1253 ++j; 1254 tmp_vec1[j] = nodesz - tmp_vec1[j - 1]; 1255 } else { 1256 tmp_vec1[j] = nodesz; 1257 } 1258 } 1259 /* Use the correct node_max */ 1260 node_max = j; 1261 1262 /* Now reorder the inode info based on mat re-ordering info */ 1263 /* First create a row -> inode_size_array_index map */ 1264 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1265 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1266 tmp_vec2[0] = 0; 1267 for (i = 0, row = 0; i < node_max; i++) { 1268 nodesz = tmp_vec1[i]; 1269 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1270 } 1271 /* Using nsmap, create a reordered ns structure */ 1272 for (i = 0, j = 0; i < node_max; i++) { 1273 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1274 tmp_vec2[i + 1] = tmp_vec2[i] + nodesz; 1275 j += nodesz; 1276 } 1277 PetscCall(PetscFree(nsmap)); 1278 PetscCall(PetscFree(tmp_vec1)); 1279 1280 /* Now use the correct ns */ 1281 ns = tmp_vec2; 1282 1283 do { 1284 sctx.newshift = PETSC_FALSE; 1285 /* Now loop over each block-row, and do the factorization */ 1286 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1287 nodesz = ns[inod + 1] - ns[inod]; 1288 1289 switch (nodesz) { 1290 case 1: 1291 /* zero rtmp1 */ 1292 /* L part */ 1293 nz = bi[i + 1] - bi[i]; 1294 bjtmp = bj + bi[i]; 1295 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1296 1297 /* U part */ 1298 nz = bdiag[i] - bdiag[i + 1]; 1299 bjtmp = bj + bdiag[i + 1] + 1; 1300 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1301 1302 /* load in initial (unfactored row) */ 1303 nz = ai[r[i] + 1] - ai[r[i]]; 1304 ajtmp = aj + ai[r[i]]; 1305 v = aa + ai[r[i]]; 1306 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1307 1308 /* ZeropivotApply() */ 1309 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1310 1311 /* elimination */ 1312 bjtmp = bj + bi[i]; 1313 row = *bjtmp++; 1314 nzL = bi[i + 1] - bi[i]; 1315 for (k = 0; k < nzL; k++) { 1316 pc = rtmp1 + row; 1317 if (*pc != 0.0) { 1318 pv = b->a + bdiag[row]; 1319 mul1 = *pc * (*pv); 1320 *pc = mul1; 1321 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1322 pv = b->a + bdiag[row + 1] + 1; 1323 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1324 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1325 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1326 } 1327 row = *bjtmp++; 1328 } 1329 1330 /* finished row so stick it into b->a */ 1331 rs = 0.0; 1332 /* L part */ 1333 pv = b->a + bi[i]; 1334 pj = b->j + bi[i]; 1335 nz = bi[i + 1] - bi[i]; 1336 for (j = 0; j < nz; j++) { 1337 pv[j] = rtmp1[pj[j]]; 1338 rs += PetscAbsScalar(pv[j]); 1339 } 1340 1341 /* U part */ 1342 pv = b->a + bdiag[i + 1] + 1; 1343 pj = b->j + bdiag[i + 1] + 1; 1344 nz = bdiag[i] - bdiag[i + 1] - 1; 1345 for (j = 0; j < nz; j++) { 1346 pv[j] = rtmp1[pj[j]]; 1347 rs += PetscAbsScalar(pv[j]); 1348 } 1349 1350 /* Check zero pivot */ 1351 sctx.rs = rs; 1352 sctx.pv = rtmp1[i]; 1353 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1354 if (sctx.newshift) break; 1355 1356 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1357 pv = b->a + bdiag[i]; 1358 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1359 break; 1360 1361 case 2: 1362 /* zero rtmp1 and rtmp2 */ 1363 /* L part */ 1364 nz = bi[i + 1] - bi[i]; 1365 bjtmp = bj + bi[i]; 1366 for (j = 0; j < nz; j++) { 1367 col = bjtmp[j]; 1368 rtmp1[col] = 0.0; 1369 rtmp2[col] = 0.0; 1370 } 1371 1372 /* U part */ 1373 nz = bdiag[i] - bdiag[i + 1]; 1374 bjtmp = bj + bdiag[i + 1] + 1; 1375 for (j = 0; j < nz; j++) { 1376 col = bjtmp[j]; 1377 rtmp1[col] = 0.0; 1378 rtmp2[col] = 0.0; 1379 } 1380 1381 /* load in initial (unfactored row) */ 1382 nz = ai[r[i] + 1] - ai[r[i]]; 1383 ajtmp = aj + ai[r[i]]; 1384 v1 = aa + ai[r[i]]; 1385 v2 = aa + ai[r[i] + 1]; 1386 for (j = 0; j < nz; j++) { 1387 col = ics[ajtmp[j]]; 1388 rtmp1[col] = v1[j]; 1389 rtmp2[col] = v2[j]; 1390 } 1391 /* ZeropivotApply(): shift the diagonal of the matrix */ 1392 rtmp1[i] += sctx.shift_amount; 1393 rtmp2[i + 1] += sctx.shift_amount; 1394 1395 /* elimination */ 1396 bjtmp = bj + bi[i]; 1397 row = *bjtmp++; /* pivot row */ 1398 nzL = bi[i + 1] - bi[i]; 1399 for (k = 0; k < nzL; k++) { 1400 pc1 = rtmp1 + row; 1401 pc2 = rtmp2 + row; 1402 if (*pc1 != 0.0 || *pc2 != 0.0) { 1403 pv = b->a + bdiag[row]; 1404 mul1 = *pc1 * (*pv); 1405 mul2 = *pc2 * (*pv); 1406 *pc1 = mul1; 1407 *pc2 = mul2; 1408 1409 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1410 pv = b->a + bdiag[row + 1] + 1; 1411 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1412 for (j = 0; j < nz; j++) { 1413 col = pj[j]; 1414 rtmp1[col] -= mul1 * pv[j]; 1415 rtmp2[col] -= mul2 * pv[j]; 1416 } 1417 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1418 } 1419 row = *bjtmp++; 1420 } 1421 1422 /* finished row i; check zero pivot, then stick row i into b->a */ 1423 rs = 0.0; 1424 /* L part */ 1425 pc1 = b->a + bi[i]; 1426 pj = b->j + bi[i]; 1427 nz = bi[i + 1] - bi[i]; 1428 for (j = 0; j < nz; j++) { 1429 col = pj[j]; 1430 pc1[j] = rtmp1[col]; 1431 rs += PetscAbsScalar(pc1[j]); 1432 } 1433 /* U part */ 1434 pc1 = b->a + bdiag[i + 1] + 1; 1435 pj = b->j + bdiag[i + 1] + 1; 1436 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1437 for (j = 0; j < nz; j++) { 1438 col = pj[j]; 1439 pc1[j] = rtmp1[col]; 1440 rs += PetscAbsScalar(pc1[j]); 1441 } 1442 1443 sctx.rs = rs; 1444 sctx.pv = rtmp1[i]; 1445 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1446 if (sctx.newshift) break; 1447 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1448 *pc1 = 1.0 / sctx.pv; 1449 1450 /* Now take care of diagonal 2x2 block. */ 1451 pc2 = rtmp2 + i; 1452 if (*pc2 != 0.0) { 1453 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1454 *pc2 = mul1; /* insert L entry */ 1455 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1456 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1457 for (j = 0; j < nz; j++) { 1458 col = pj[j]; 1459 rtmp2[col] -= mul1 * rtmp1[col]; 1460 } 1461 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1462 } 1463 1464 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1465 rs = 0.0; 1466 /* L part */ 1467 pc2 = b->a + bi[i + 1]; 1468 pj = b->j + bi[i + 1]; 1469 nz = bi[i + 2] - bi[i + 1]; 1470 for (j = 0; j < nz; j++) { 1471 col = pj[j]; 1472 pc2[j] = rtmp2[col]; 1473 rs += PetscAbsScalar(pc2[j]); 1474 } 1475 /* U part */ 1476 pc2 = b->a + bdiag[i + 2] + 1; 1477 pj = b->j + bdiag[i + 2] + 1; 1478 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1479 for (j = 0; j < nz; j++) { 1480 col = pj[j]; 1481 pc2[j] = rtmp2[col]; 1482 rs += PetscAbsScalar(pc2[j]); 1483 } 1484 1485 sctx.rs = rs; 1486 sctx.pv = rtmp2[i + 1]; 1487 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1488 if (sctx.newshift) break; 1489 pc2 = b->a + bdiag[i + 1]; 1490 *pc2 = 1.0 / sctx.pv; 1491 break; 1492 1493 case 3: 1494 /* zero rtmp */ 1495 /* L part */ 1496 nz = bi[i + 1] - bi[i]; 1497 bjtmp = bj + bi[i]; 1498 for (j = 0; j < nz; j++) { 1499 col = bjtmp[j]; 1500 rtmp1[col] = 0.0; 1501 rtmp2[col] = 0.0; 1502 rtmp3[col] = 0.0; 1503 } 1504 1505 /* U part */ 1506 nz = bdiag[i] - bdiag[i + 1]; 1507 bjtmp = bj + bdiag[i + 1] + 1; 1508 for (j = 0; j < nz; j++) { 1509 col = bjtmp[j]; 1510 rtmp1[col] = 0.0; 1511 rtmp2[col] = 0.0; 1512 rtmp3[col] = 0.0; 1513 } 1514 1515 /* load in initial (unfactored row) */ 1516 nz = ai[r[i] + 1] - ai[r[i]]; 1517 ajtmp = aj + ai[r[i]]; 1518 v1 = aa + ai[r[i]]; 1519 v2 = aa + ai[r[i] + 1]; 1520 v3 = aa + ai[r[i] + 2]; 1521 for (j = 0; j < nz; j++) { 1522 col = ics[ajtmp[j]]; 1523 rtmp1[col] = v1[j]; 1524 rtmp2[col] = v2[j]; 1525 rtmp3[col] = v3[j]; 1526 } 1527 /* ZeropivotApply(): shift the diagonal of the matrix */ 1528 rtmp1[i] += sctx.shift_amount; 1529 rtmp2[i + 1] += sctx.shift_amount; 1530 rtmp3[i + 2] += sctx.shift_amount; 1531 1532 /* elimination */ 1533 bjtmp = bj + bi[i]; 1534 row = *bjtmp++; /* pivot row */ 1535 nzL = bi[i + 1] - bi[i]; 1536 for (k = 0; k < nzL; k++) { 1537 pc1 = rtmp1 + row; 1538 pc2 = rtmp2 + row; 1539 pc3 = rtmp3 + row; 1540 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1541 pv = b->a + bdiag[row]; 1542 mul1 = *pc1 * (*pv); 1543 mul2 = *pc2 * (*pv); 1544 mul3 = *pc3 * (*pv); 1545 *pc1 = mul1; 1546 *pc2 = mul2; 1547 *pc3 = mul3; 1548 1549 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1550 pv = b->a + bdiag[row + 1] + 1; 1551 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1552 for (j = 0; j < nz; j++) { 1553 col = pj[j]; 1554 rtmp1[col] -= mul1 * pv[j]; 1555 rtmp2[col] -= mul2 * pv[j]; 1556 rtmp3[col] -= mul3 * pv[j]; 1557 } 1558 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1559 } 1560 row = *bjtmp++; 1561 } 1562 1563 /* finished row i; check zero pivot, then stick row i into b->a */ 1564 rs = 0.0; 1565 /* L part */ 1566 pc1 = b->a + bi[i]; 1567 pj = b->j + bi[i]; 1568 nz = bi[i + 1] - bi[i]; 1569 for (j = 0; j < nz; j++) { 1570 col = pj[j]; 1571 pc1[j] = rtmp1[col]; 1572 rs += PetscAbsScalar(pc1[j]); 1573 } 1574 /* U part */ 1575 pc1 = b->a + bdiag[i + 1] + 1; 1576 pj = b->j + bdiag[i + 1] + 1; 1577 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1578 for (j = 0; j < nz; j++) { 1579 col = pj[j]; 1580 pc1[j] = rtmp1[col]; 1581 rs += PetscAbsScalar(pc1[j]); 1582 } 1583 1584 sctx.rs = rs; 1585 sctx.pv = rtmp1[i]; 1586 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1587 if (sctx.newshift) break; 1588 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1589 *pc1 = 1.0 / sctx.pv; 1590 1591 /* Now take care of 1st column of diagonal 3x3 block. */ 1592 pc2 = rtmp2 + i; 1593 pc3 = rtmp3 + i; 1594 if (*pc2 != 0.0 || *pc3 != 0.0) { 1595 mul2 = (*pc2) * (*pc1); 1596 *pc2 = mul2; 1597 mul3 = (*pc3) * (*pc1); 1598 *pc3 = mul3; 1599 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1600 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1601 for (j = 0; j < nz; j++) { 1602 col = pj[j]; 1603 rtmp2[col] -= mul2 * rtmp1[col]; 1604 rtmp3[col] -= mul3 * rtmp1[col]; 1605 } 1606 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1607 } 1608 1609 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1610 rs = 0.0; 1611 /* L part */ 1612 pc2 = b->a + bi[i + 1]; 1613 pj = b->j + bi[i + 1]; 1614 nz = bi[i + 2] - bi[i + 1]; 1615 for (j = 0; j < nz; j++) { 1616 col = pj[j]; 1617 pc2[j] = rtmp2[col]; 1618 rs += PetscAbsScalar(pc2[j]); 1619 } 1620 /* U part */ 1621 pc2 = b->a + bdiag[i + 2] + 1; 1622 pj = b->j + bdiag[i + 2] + 1; 1623 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1624 for (j = 0; j < nz; j++) { 1625 col = pj[j]; 1626 pc2[j] = rtmp2[col]; 1627 rs += PetscAbsScalar(pc2[j]); 1628 } 1629 1630 sctx.rs = rs; 1631 sctx.pv = rtmp2[i + 1]; 1632 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1633 if (sctx.newshift) break; 1634 pc2 = b->a + bdiag[i + 1]; 1635 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1636 1637 /* Now take care of 2nd column of diagonal 3x3 block. */ 1638 pc3 = rtmp3 + i + 1; 1639 if (*pc3 != 0.0) { 1640 mul3 = (*pc3) * (*pc2); 1641 *pc3 = mul3; 1642 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1643 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1644 for (j = 0; j < nz; j++) { 1645 col = pj[j]; 1646 rtmp3[col] -= mul3 * rtmp2[col]; 1647 } 1648 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1649 } 1650 1651 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1652 rs = 0.0; 1653 /* L part */ 1654 pc3 = b->a + bi[i + 2]; 1655 pj = b->j + bi[i + 2]; 1656 nz = bi[i + 3] - bi[i + 2]; 1657 for (j = 0; j < nz; j++) { 1658 col = pj[j]; 1659 pc3[j] = rtmp3[col]; 1660 rs += PetscAbsScalar(pc3[j]); 1661 } 1662 /* U part */ 1663 pc3 = b->a + bdiag[i + 3] + 1; 1664 pj = b->j + bdiag[i + 3] + 1; 1665 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1666 for (j = 0; j < nz; j++) { 1667 col = pj[j]; 1668 pc3[j] = rtmp3[col]; 1669 rs += PetscAbsScalar(pc3[j]); 1670 } 1671 1672 sctx.rs = rs; 1673 sctx.pv = rtmp3[i + 2]; 1674 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1675 if (sctx.newshift) break; 1676 pc3 = b->a + bdiag[i + 2]; 1677 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1678 break; 1679 case 4: 1680 /* zero rtmp */ 1681 /* L part */ 1682 nz = bi[i + 1] - bi[i]; 1683 bjtmp = bj + bi[i]; 1684 for (j = 0; j < nz; j++) { 1685 col = bjtmp[j]; 1686 rtmp1[col] = 0.0; 1687 rtmp2[col] = 0.0; 1688 rtmp3[col] = 0.0; 1689 rtmp4[col] = 0.0; 1690 } 1691 1692 /* U part */ 1693 nz = bdiag[i] - bdiag[i + 1]; 1694 bjtmp = bj + bdiag[i + 1] + 1; 1695 for (j = 0; j < nz; j++) { 1696 col = bjtmp[j]; 1697 rtmp1[col] = 0.0; 1698 rtmp2[col] = 0.0; 1699 rtmp3[col] = 0.0; 1700 rtmp4[col] = 0.0; 1701 } 1702 1703 /* load in initial (unfactored row) */ 1704 nz = ai[r[i] + 1] - ai[r[i]]; 1705 ajtmp = aj + ai[r[i]]; 1706 v1 = aa + ai[r[i]]; 1707 v2 = aa + ai[r[i] + 1]; 1708 v3 = aa + ai[r[i] + 2]; 1709 v4 = aa + ai[r[i] + 3]; 1710 for (j = 0; j < nz; j++) { 1711 col = ics[ajtmp[j]]; 1712 rtmp1[col] = v1[j]; 1713 rtmp2[col] = v2[j]; 1714 rtmp3[col] = v3[j]; 1715 rtmp4[col] = v4[j]; 1716 } 1717 /* ZeropivotApply(): shift the diagonal of the matrix */ 1718 rtmp1[i] += sctx.shift_amount; 1719 rtmp2[i + 1] += sctx.shift_amount; 1720 rtmp3[i + 2] += sctx.shift_amount; 1721 rtmp4[i + 3] += sctx.shift_amount; 1722 1723 /* elimination */ 1724 bjtmp = bj + bi[i]; 1725 row = *bjtmp++; /* pivot row */ 1726 nzL = bi[i + 1] - bi[i]; 1727 for (k = 0; k < nzL; k++) { 1728 pc1 = rtmp1 + row; 1729 pc2 = rtmp2 + row; 1730 pc3 = rtmp3 + row; 1731 pc4 = rtmp4 + row; 1732 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1733 pv = b->a + bdiag[row]; 1734 mul1 = *pc1 * (*pv); 1735 mul2 = *pc2 * (*pv); 1736 mul3 = *pc3 * (*pv); 1737 mul4 = *pc4 * (*pv); 1738 *pc1 = mul1; 1739 *pc2 = mul2; 1740 *pc3 = mul3; 1741 *pc4 = mul4; 1742 1743 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1744 pv = b->a + bdiag[row + 1] + 1; 1745 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1746 for (j = 0; j < nz; j++) { 1747 col = pj[j]; 1748 rtmp1[col] -= mul1 * pv[j]; 1749 rtmp2[col] -= mul2 * pv[j]; 1750 rtmp3[col] -= mul3 * pv[j]; 1751 rtmp4[col] -= mul4 * pv[j]; 1752 } 1753 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1754 } 1755 row = *bjtmp++; 1756 } 1757 1758 /* finished row i; check zero pivot, then stick row i into b->a */ 1759 rs = 0.0; 1760 /* L part */ 1761 pc1 = b->a + bi[i]; 1762 pj = b->j + bi[i]; 1763 nz = bi[i + 1] - bi[i]; 1764 for (j = 0; j < nz; j++) { 1765 col = pj[j]; 1766 pc1[j] = rtmp1[col]; 1767 rs += PetscAbsScalar(pc1[j]); 1768 } 1769 /* U part */ 1770 pc1 = b->a + bdiag[i + 1] + 1; 1771 pj = b->j + bdiag[i + 1] + 1; 1772 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1773 for (j = 0; j < nz; j++) { 1774 col = pj[j]; 1775 pc1[j] = rtmp1[col]; 1776 rs += PetscAbsScalar(pc1[j]); 1777 } 1778 1779 sctx.rs = rs; 1780 sctx.pv = rtmp1[i]; 1781 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1782 if (sctx.newshift) break; 1783 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1784 *pc1 = 1.0 / sctx.pv; 1785 1786 /* Now take care of 1st column of diagonal 4x4 block. */ 1787 pc2 = rtmp2 + i; 1788 pc3 = rtmp3 + i; 1789 pc4 = rtmp4 + i; 1790 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1791 mul2 = (*pc2) * (*pc1); 1792 *pc2 = mul2; 1793 mul3 = (*pc3) * (*pc1); 1794 *pc3 = mul3; 1795 mul4 = (*pc4) * (*pc1); 1796 *pc4 = mul4; 1797 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1798 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1799 for (j = 0; j < nz; j++) { 1800 col = pj[j]; 1801 rtmp2[col] -= mul2 * rtmp1[col]; 1802 rtmp3[col] -= mul3 * rtmp1[col]; 1803 rtmp4[col] -= mul4 * rtmp1[col]; 1804 } 1805 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1806 } 1807 1808 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1809 rs = 0.0; 1810 /* L part */ 1811 pc2 = b->a + bi[i + 1]; 1812 pj = b->j + bi[i + 1]; 1813 nz = bi[i + 2] - bi[i + 1]; 1814 for (j = 0; j < nz; j++) { 1815 col = pj[j]; 1816 pc2[j] = rtmp2[col]; 1817 rs += PetscAbsScalar(pc2[j]); 1818 } 1819 /* U part */ 1820 pc2 = b->a + bdiag[i + 2] + 1; 1821 pj = b->j + bdiag[i + 2] + 1; 1822 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1823 for (j = 0; j < nz; j++) { 1824 col = pj[j]; 1825 pc2[j] = rtmp2[col]; 1826 rs += PetscAbsScalar(pc2[j]); 1827 } 1828 1829 sctx.rs = rs; 1830 sctx.pv = rtmp2[i + 1]; 1831 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1832 if (sctx.newshift) break; 1833 pc2 = b->a + bdiag[i + 1]; 1834 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1835 1836 /* Now take care of 2nd column of diagonal 4x4 block. */ 1837 pc3 = rtmp3 + i + 1; 1838 pc4 = rtmp4 + i + 1; 1839 if (*pc3 != 0.0 || *pc4 != 0.0) { 1840 mul3 = (*pc3) * (*pc2); 1841 *pc3 = mul3; 1842 mul4 = (*pc4) * (*pc2); 1843 *pc4 = mul4; 1844 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1845 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1846 for (j = 0; j < nz; j++) { 1847 col = pj[j]; 1848 rtmp3[col] -= mul3 * rtmp2[col]; 1849 rtmp4[col] -= mul4 * rtmp2[col]; 1850 } 1851 PetscCall(PetscLogFlops(4.0 * nz)); 1852 } 1853 1854 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1855 rs = 0.0; 1856 /* L part */ 1857 pc3 = b->a + bi[i + 2]; 1858 pj = b->j + bi[i + 2]; 1859 nz = bi[i + 3] - bi[i + 2]; 1860 for (j = 0; j < nz; j++) { 1861 col = pj[j]; 1862 pc3[j] = rtmp3[col]; 1863 rs += PetscAbsScalar(pc3[j]); 1864 } 1865 /* U part */ 1866 pc3 = b->a + bdiag[i + 3] + 1; 1867 pj = b->j + bdiag[i + 3] + 1; 1868 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1869 for (j = 0; j < nz; j++) { 1870 col = pj[j]; 1871 pc3[j] = rtmp3[col]; 1872 rs += PetscAbsScalar(pc3[j]); 1873 } 1874 1875 sctx.rs = rs; 1876 sctx.pv = rtmp3[i + 2]; 1877 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1878 if (sctx.newshift) break; 1879 pc3 = b->a + bdiag[i + 2]; 1880 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1881 1882 /* Now take care of 3rd column of diagonal 4x4 block. */ 1883 pc4 = rtmp4 + i + 2; 1884 if (*pc4 != 0.0) { 1885 mul4 = (*pc4) * (*pc3); 1886 *pc4 = mul4; 1887 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1888 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1889 for (j = 0; j < nz; j++) { 1890 col = pj[j]; 1891 rtmp4[col] -= mul4 * rtmp3[col]; 1892 } 1893 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1894 } 1895 1896 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1897 rs = 0.0; 1898 /* L part */ 1899 pc4 = b->a + bi[i + 3]; 1900 pj = b->j + bi[i + 3]; 1901 nz = bi[i + 4] - bi[i + 3]; 1902 for (j = 0; j < nz; j++) { 1903 col = pj[j]; 1904 pc4[j] = rtmp4[col]; 1905 rs += PetscAbsScalar(pc4[j]); 1906 } 1907 /* U part */ 1908 pc4 = b->a + bdiag[i + 4] + 1; 1909 pj = b->j + bdiag[i + 4] + 1; 1910 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1911 for (j = 0; j < nz; j++) { 1912 col = pj[j]; 1913 pc4[j] = rtmp4[col]; 1914 rs += PetscAbsScalar(pc4[j]); 1915 } 1916 1917 sctx.rs = rs; 1918 sctx.pv = rtmp4[i + 3]; 1919 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1920 if (sctx.newshift) break; 1921 pc4 = b->a + bdiag[i + 3]; 1922 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1923 break; 1924 1925 default: 1926 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1927 } 1928 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1929 i += nodesz; /* Update the row */ 1930 } 1931 1932 /* MatPivotRefine() */ 1933 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1934 /* 1935 * if no shift in this attempt & shifting & started shifting & can refine, 1936 * then try lower shift 1937 */ 1938 sctx.shift_hi = sctx.shift_fraction; 1939 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1940 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1941 sctx.newshift = PETSC_TRUE; 1942 sctx.nshift++; 1943 } 1944 } while (sctx.newshift); 1945 1946 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1947 PetscCall(PetscFree(tmp_vec2)); 1948 PetscCall(ISRestoreIndices(isicol, &ic)); 1949 PetscCall(ISRestoreIndices(isrow, &r)); 1950 1951 if (b->inode.size_csr) { 1952 C->ops->solve = MatSolve_SeqAIJ_Inode; 1953 } else { 1954 C->ops->solve = MatSolve_SeqAIJ; 1955 } 1956 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1957 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1958 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1959 C->ops->matsolve = MatMatSolve_SeqAIJ; 1960 C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ; 1961 C->assembled = PETSC_TRUE; 1962 C->preallocated = PETSC_TRUE; 1963 1964 PetscCall(PetscLogFlops(C->cmap->n)); 1965 1966 /* MatShiftView(A,info,&sctx) */ 1967 if (sctx.nshift) { 1968 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1969 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1970 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1971 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1972 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1973 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1974 } 1975 } 1976 PetscFunctionReturn(PETSC_SUCCESS); 1977 } 1978 1979 #if 0 1980 // unused 1981 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1982 { 1983 Mat C = B; 1984 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1985 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1986 const PetscInt *r, *ic, *c, *ics; 1987 PetscInt n = A->rmap->n, *bi = b->i; 1988 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1989 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1990 PetscInt *ai = a->i, *aj = a->j; 1991 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1992 PetscScalar mul1, mul2, mul3, tmp; 1993 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1994 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1995 PetscReal rs = 0.0; 1996 FactorShiftCtx sctx; 1997 1998 PetscFunctionBegin; 1999 sctx.shift_top = 0; 2000 sctx.nshift_max = 0; 2001 sctx.shift_lo = 0; 2002 sctx.shift_hi = 0; 2003 sctx.shift_fraction = 0; 2004 2005 /* if both shift schemes are chosen by user, only use info->shiftpd */ 2006 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 2007 sctx.shift_top = 0; 2008 for (i = 0; i < n; i++) { 2009 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 2010 rs = 0.0; 2011 ajtmp = aj + ai[i]; 2012 rtmp1 = aa + ai[i]; 2013 nz = ai[i + 1] - ai[i]; 2014 for (j = 0; j < nz; j++) { 2015 if (*ajtmp != i) { 2016 rs += PetscAbsScalar(*rtmp1++); 2017 } else { 2018 rs -= PetscRealPart(*rtmp1++); 2019 } 2020 ajtmp++; 2021 } 2022 if (rs > sctx.shift_top) sctx.shift_top = rs; 2023 } 2024 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2025 sctx.shift_top *= 1.1; 2026 sctx.nshift_max = 5; 2027 sctx.shift_lo = 0.; 2028 sctx.shift_hi = 1.; 2029 } 2030 sctx.shift_amount = 0; 2031 sctx.nshift = 0; 2032 2033 PetscCall(ISGetIndices(isrow, &r)); 2034 PetscCall(ISGetIndices(iscol, &c)); 2035 PetscCall(ISGetIndices(isicol, &ic)); 2036 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2037 ics = ic; 2038 2039 node_max = a->inode.node_count; 2040 ns = a->inode.size; 2041 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2042 2043 /* If max inode size > 3, split it into two inodes.*/ 2044 /* also map the inode sizes according to the ordering */ 2045 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2046 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2047 if (ns[i] > 3) { 2048 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2049 ++j; 2050 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2051 } else { 2052 tmp_vec1[j] = ns[i]; 2053 } 2054 } 2055 /* Use the correct node_max */ 2056 node_max = j; 2057 2058 /* Now reorder the inode info based on mat re-ordering info */ 2059 /* First create a row -> inode_size_array_index map */ 2060 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2061 for (i = 0, row = 0; i < node_max; i++) { 2062 nodesz = tmp_vec1[i]; 2063 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2064 } 2065 /* Using nsmap, create a reordered ns structure */ 2066 for (i = 0, j = 0; i < node_max; i++) { 2067 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2068 tmp_vec2[i] = nodesz; 2069 j += nodesz; 2070 } 2071 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2072 /* Now use the correct ns */ 2073 ns = tmp_vec2; 2074 2075 do { 2076 sctx.newshift = PETSC_FALSE; 2077 /* Now loop over each block-row, and do the factorization */ 2078 for (i = 0, row = 0; i < node_max; i++) { 2079 nodesz = ns[i]; 2080 nz = bi[row + 1] - bi[row]; 2081 bjtmp = bj + bi[row]; 2082 2083 switch (nodesz) { 2084 case 1: 2085 for (j = 0; j < nz; j++) { 2086 idx = bjtmp[j]; 2087 rtmp11[idx] = 0.0; 2088 } 2089 2090 /* load in initial (unfactored row) */ 2091 idx = r[row]; 2092 nz_tmp = ai[idx + 1] - ai[idx]; 2093 ajtmp = aj + ai[idx]; 2094 v1 = aa + ai[idx]; 2095 2096 for (j = 0; j < nz_tmp; j++) { 2097 idx = ics[ajtmp[j]]; 2098 rtmp11[idx] = v1[j]; 2099 } 2100 rtmp11[ics[r[row]]] += sctx.shift_amount; 2101 2102 prow = *bjtmp++; 2103 while (prow < row) { 2104 pc1 = rtmp11 + prow; 2105 if (*pc1 != 0.0) { 2106 pv = ba + bd[prow]; 2107 pj = nbj + bd[prow]; 2108 mul1 = *pc1 * *pv++; 2109 *pc1 = mul1; 2110 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2111 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2112 for (j = 0; j < nz_tmp; j++) { 2113 tmp = pv[j]; 2114 idx = pj[j]; 2115 rtmp11[idx] -= mul1 * tmp; 2116 } 2117 } 2118 prow = *bjtmp++; 2119 } 2120 pj = bj + bi[row]; 2121 pc1 = ba + bi[row]; 2122 2123 sctx.pv = rtmp11[row]; 2124 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2125 rs = 0.0; 2126 for (j = 0; j < nz; j++) { 2127 idx = pj[j]; 2128 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2129 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2130 } 2131 sctx.rs = rs; 2132 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2133 if (sctx.newshift) goto endofwhile; 2134 break; 2135 2136 case 2: 2137 for (j = 0; j < nz; j++) { 2138 idx = bjtmp[j]; 2139 rtmp11[idx] = 0.0; 2140 rtmp22[idx] = 0.0; 2141 } 2142 2143 /* load in initial (unfactored row) */ 2144 idx = r[row]; 2145 nz_tmp = ai[idx + 1] - ai[idx]; 2146 ajtmp = aj + ai[idx]; 2147 v1 = aa + ai[idx]; 2148 v2 = aa + ai[idx + 1]; 2149 for (j = 0; j < nz_tmp; j++) { 2150 idx = ics[ajtmp[j]]; 2151 rtmp11[idx] = v1[j]; 2152 rtmp22[idx] = v2[j]; 2153 } 2154 rtmp11[ics[r[row]]] += sctx.shift_amount; 2155 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2156 2157 prow = *bjtmp++; 2158 while (prow < row) { 2159 pc1 = rtmp11 + prow; 2160 pc2 = rtmp22 + prow; 2161 if (*pc1 != 0.0 || *pc2 != 0.0) { 2162 pv = ba + bd[prow]; 2163 pj = nbj + bd[prow]; 2164 mul1 = *pc1 * *pv; 2165 mul2 = *pc2 * *pv; 2166 ++pv; 2167 *pc1 = mul1; 2168 *pc2 = mul2; 2169 2170 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2171 for (j = 0; j < nz_tmp; j++) { 2172 tmp = pv[j]; 2173 idx = pj[j]; 2174 rtmp11[idx] -= mul1 * tmp; 2175 rtmp22[idx] -= mul2 * tmp; 2176 } 2177 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2178 } 2179 prow = *bjtmp++; 2180 } 2181 2182 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2183 pc1 = rtmp11 + prow; 2184 pc2 = rtmp22 + prow; 2185 2186 sctx.pv = *pc1; 2187 pj = bj + bi[prow]; 2188 rs = 0.0; 2189 for (j = 0; j < nz; j++) { 2190 idx = pj[j]; 2191 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2192 } 2193 sctx.rs = rs; 2194 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2195 if (sctx.newshift) goto endofwhile; 2196 2197 if (*pc2 != 0.0) { 2198 pj = nbj + bd[prow]; 2199 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2200 *pc2 = mul2; 2201 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2202 for (j = 0; j < nz_tmp; j++) { 2203 idx = pj[j]; 2204 tmp = rtmp11[idx]; 2205 rtmp22[idx] -= mul2 * tmp; 2206 } 2207 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2208 } 2209 2210 pj = bj + bi[row]; 2211 pc1 = ba + bi[row]; 2212 pc2 = ba + bi[row + 1]; 2213 2214 sctx.pv = rtmp22[row + 1]; 2215 rs = 0.0; 2216 rtmp11[row] = 1.0 / rtmp11[row]; 2217 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2218 /* copy row entries from dense representation to sparse */ 2219 for (j = 0; j < nz; j++) { 2220 idx = pj[j]; 2221 pc1[j] = rtmp11[idx]; 2222 pc2[j] = rtmp22[idx]; 2223 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2224 } 2225 sctx.rs = rs; 2226 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2227 if (sctx.newshift) goto endofwhile; 2228 break; 2229 2230 case 3: 2231 for (j = 0; j < nz; j++) { 2232 idx = bjtmp[j]; 2233 rtmp11[idx] = 0.0; 2234 rtmp22[idx] = 0.0; 2235 rtmp33[idx] = 0.0; 2236 } 2237 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2238 idx = r[row]; 2239 nz_tmp = ai[idx + 1] - ai[idx]; 2240 ajtmp = aj + ai[idx]; 2241 v1 = aa + ai[idx]; 2242 v2 = aa + ai[idx + 1]; 2243 v3 = aa + ai[idx + 2]; 2244 for (j = 0; j < nz_tmp; j++) { 2245 idx = ics[ajtmp[j]]; 2246 rtmp11[idx] = v1[j]; 2247 rtmp22[idx] = v2[j]; 2248 rtmp33[idx] = v3[j]; 2249 } 2250 rtmp11[ics[r[row]]] += sctx.shift_amount; 2251 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2252 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2253 2254 /* loop over all pivot row blocks above this row block */ 2255 prow = *bjtmp++; 2256 while (prow < row) { 2257 pc1 = rtmp11 + prow; 2258 pc2 = rtmp22 + prow; 2259 pc3 = rtmp33 + prow; 2260 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2261 pv = ba + bd[prow]; 2262 pj = nbj + bd[prow]; 2263 mul1 = *pc1 * *pv; 2264 mul2 = *pc2 * *pv; 2265 mul3 = *pc3 * *pv; 2266 ++pv; 2267 *pc1 = mul1; 2268 *pc2 = mul2; 2269 *pc3 = mul3; 2270 2271 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2272 /* update this row based on pivot row */ 2273 for (j = 0; j < nz_tmp; j++) { 2274 tmp = pv[j]; 2275 idx = pj[j]; 2276 rtmp11[idx] -= mul1 * tmp; 2277 rtmp22[idx] -= mul2 * tmp; 2278 rtmp33[idx] -= mul3 * tmp; 2279 } 2280 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2281 } 2282 prow = *bjtmp++; 2283 } 2284 2285 /* Now take care of diagonal 3x3 block in this set of rows */ 2286 /* note: prow = row here */ 2287 pc1 = rtmp11 + prow; 2288 pc2 = rtmp22 + prow; 2289 pc3 = rtmp33 + prow; 2290 2291 sctx.pv = *pc1; 2292 pj = bj + bi[prow]; 2293 rs = 0.0; 2294 for (j = 0; j < nz; j++) { 2295 idx = pj[j]; 2296 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2297 } 2298 sctx.rs = rs; 2299 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2300 if (sctx.newshift) goto endofwhile; 2301 2302 if (*pc2 != 0.0 || *pc3 != 0.0) { 2303 mul2 = (*pc2) / (*pc1); 2304 mul3 = (*pc3) / (*pc1); 2305 *pc2 = mul2; 2306 *pc3 = mul3; 2307 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2308 pj = nbj + bd[prow]; 2309 for (j = 0; j < nz_tmp; j++) { 2310 idx = pj[j]; 2311 tmp = rtmp11[idx]; 2312 rtmp22[idx] -= mul2 * tmp; 2313 rtmp33[idx] -= mul3 * tmp; 2314 } 2315 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2316 } 2317 ++prow; 2318 2319 pc2 = rtmp22 + prow; 2320 pc3 = rtmp33 + prow; 2321 sctx.pv = *pc2; 2322 pj = bj + bi[prow]; 2323 rs = 0.0; 2324 for (j = 0; j < nz; j++) { 2325 idx = pj[j]; 2326 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2327 } 2328 sctx.rs = rs; 2329 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2330 if (sctx.newshift) goto endofwhile; 2331 2332 if (*pc3 != 0.0) { 2333 mul3 = (*pc3) / (*pc2); 2334 *pc3 = mul3; 2335 pj = nbj + bd[prow]; 2336 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2337 for (j = 0; j < nz_tmp; j++) { 2338 idx = pj[j]; 2339 tmp = rtmp22[idx]; 2340 rtmp33[idx] -= mul3 * tmp; 2341 } 2342 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2343 } 2344 2345 pj = bj + bi[row]; 2346 pc1 = ba + bi[row]; 2347 pc2 = ba + bi[row + 1]; 2348 pc3 = ba + bi[row + 2]; 2349 2350 sctx.pv = rtmp33[row + 2]; 2351 rs = 0.0; 2352 rtmp11[row] = 1.0 / rtmp11[row]; 2353 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2354 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2355 /* copy row entries from dense representation to sparse */ 2356 for (j = 0; j < nz; j++) { 2357 idx = pj[j]; 2358 pc1[j] = rtmp11[idx]; 2359 pc2[j] = rtmp22[idx]; 2360 pc3[j] = rtmp33[idx]; 2361 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2362 } 2363 2364 sctx.rs = rs; 2365 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2366 if (sctx.newshift) goto endofwhile; 2367 break; 2368 2369 default: 2370 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2371 } 2372 row += nodesz; /* Update the row */ 2373 } 2374 endofwhile:; 2375 } while (sctx.newshift); 2376 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2377 PetscCall(PetscFree(tmp_vec2)); 2378 PetscCall(ISRestoreIndices(isicol, &ic)); 2379 PetscCall(ISRestoreIndices(isrow, &r)); 2380 PetscCall(ISRestoreIndices(iscol, &c)); 2381 2382 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2383 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2384 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2385 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2386 C->assembled = PETSC_TRUE; 2387 C->preallocated = PETSC_TRUE; 2388 if (sctx.nshift) { 2389 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2390 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2391 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2392 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2393 } 2394 } 2395 PetscCall(PetscLogFlops(C->cmap->n)); 2396 PetscCall(MatSeqAIJCheckInode(C)); 2397 PetscFunctionReturn(PETSC_SUCCESS); 2398 } 2399 #endif 2400 2401 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2402 { 2403 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2404 IS iscol = a->col, isrow = a->row; 2405 const PetscInt *r, *c, *rout, *cout; 2406 PetscInt i, j; 2407 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2408 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2409 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2410 PetscScalar sum1, sum2, sum3, sum4, sum5; 2411 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2412 const PetscScalar *b; 2413 2414 PetscFunctionBegin; 2415 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2416 node_max = a->inode.node_count; 2417 ns = a->inode.size_csr; /* Node Size array */ 2418 2419 PetscCall(VecGetArrayRead(bb, &b)); 2420 PetscCall(VecGetArrayWrite(xx, &x)); 2421 tmp = a->solve_work; 2422 2423 PetscCall(ISGetIndices(isrow, &rout)); 2424 r = rout; 2425 PetscCall(ISGetIndices(iscol, &cout)); 2426 c = cout; 2427 2428 /* forward solve the lower triangular */ 2429 tmps = tmp; 2430 aa = a_a; 2431 aj = a_j; 2432 ad = a->diag; 2433 2434 for (i = 0; i < node_max; ++i) { 2435 row = ns[i]; 2436 nsz = ns[i + 1] - ns[i]; 2437 aii = ai[row]; 2438 v1 = aa + aii; 2439 vi = aj + aii; 2440 nz = ai[row + 1] - ai[row]; 2441 2442 if (i < node_max - 1) { 2443 /* Prefetch the indices for the next block */ 2444 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2445 /* Prefetch the data for the next block */ 2446 PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2447 } 2448 2449 switch (nsz) { /* Each loop in 'case' is unrolled */ 2450 case 1: 2451 sum1 = b[r[row]]; 2452 for (j = 0; j < nz - 1; j += 2) { 2453 i0 = vi[j]; 2454 i1 = vi[j + 1]; 2455 tmp0 = tmps[i0]; 2456 tmp1 = tmps[i1]; 2457 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2458 } 2459 if (j == nz - 1) { 2460 tmp0 = tmps[vi[j]]; 2461 sum1 -= v1[j] * tmp0; 2462 } 2463 tmp[row++] = sum1; 2464 break; 2465 case 2: 2466 sum1 = b[r[row]]; 2467 sum2 = b[r[row + 1]]; 2468 v2 = aa + ai[row + 1]; 2469 2470 for (j = 0; j < nz - 1; j += 2) { 2471 i0 = vi[j]; 2472 i1 = vi[j + 1]; 2473 tmp0 = tmps[i0]; 2474 tmp1 = tmps[i1]; 2475 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2476 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2477 } 2478 if (j == nz - 1) { 2479 tmp0 = tmps[vi[j]]; 2480 sum1 -= v1[j] * tmp0; 2481 sum2 -= v2[j] * tmp0; 2482 } 2483 sum2 -= v2[nz] * sum1; 2484 tmp[row++] = sum1; 2485 tmp[row++] = sum2; 2486 break; 2487 case 3: 2488 sum1 = b[r[row]]; 2489 sum2 = b[r[row + 1]]; 2490 sum3 = b[r[row + 2]]; 2491 v2 = aa + ai[row + 1]; 2492 v3 = aa + ai[row + 2]; 2493 2494 for (j = 0; j < nz - 1; j += 2) { 2495 i0 = vi[j]; 2496 i1 = vi[j + 1]; 2497 tmp0 = tmps[i0]; 2498 tmp1 = tmps[i1]; 2499 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2500 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2501 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2502 } 2503 if (j == nz - 1) { 2504 tmp0 = tmps[vi[j]]; 2505 sum1 -= v1[j] * tmp0; 2506 sum2 -= v2[j] * tmp0; 2507 sum3 -= v3[j] * tmp0; 2508 } 2509 sum2 -= v2[nz] * sum1; 2510 sum3 -= v3[nz] * sum1; 2511 sum3 -= v3[nz + 1] * sum2; 2512 tmp[row++] = sum1; 2513 tmp[row++] = sum2; 2514 tmp[row++] = sum3; 2515 break; 2516 2517 case 4: 2518 sum1 = b[r[row]]; 2519 sum2 = b[r[row + 1]]; 2520 sum3 = b[r[row + 2]]; 2521 sum4 = b[r[row + 3]]; 2522 v2 = aa + ai[row + 1]; 2523 v3 = aa + ai[row + 2]; 2524 v4 = aa + ai[row + 3]; 2525 2526 for (j = 0; j < nz - 1; j += 2) { 2527 i0 = vi[j]; 2528 i1 = vi[j + 1]; 2529 tmp0 = tmps[i0]; 2530 tmp1 = tmps[i1]; 2531 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2532 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2533 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2534 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2535 } 2536 if (j == nz - 1) { 2537 tmp0 = tmps[vi[j]]; 2538 sum1 -= v1[j] * tmp0; 2539 sum2 -= v2[j] * tmp0; 2540 sum3 -= v3[j] * tmp0; 2541 sum4 -= v4[j] * tmp0; 2542 } 2543 sum2 -= v2[nz] * sum1; 2544 sum3 -= v3[nz] * sum1; 2545 sum4 -= v4[nz] * sum1; 2546 sum3 -= v3[nz + 1] * sum2; 2547 sum4 -= v4[nz + 1] * sum2; 2548 sum4 -= v4[nz + 2] * sum3; 2549 2550 tmp[row++] = sum1; 2551 tmp[row++] = sum2; 2552 tmp[row++] = sum3; 2553 tmp[row++] = sum4; 2554 break; 2555 case 5: 2556 sum1 = b[r[row]]; 2557 sum2 = b[r[row + 1]]; 2558 sum3 = b[r[row + 2]]; 2559 sum4 = b[r[row + 3]]; 2560 sum5 = b[r[row + 4]]; 2561 v2 = aa + ai[row + 1]; 2562 v3 = aa + ai[row + 2]; 2563 v4 = aa + ai[row + 3]; 2564 v5 = aa + ai[row + 4]; 2565 2566 for (j = 0; j < nz - 1; j += 2) { 2567 i0 = vi[j]; 2568 i1 = vi[j + 1]; 2569 tmp0 = tmps[i0]; 2570 tmp1 = tmps[i1]; 2571 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2572 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2573 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2574 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2575 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2576 } 2577 if (j == nz - 1) { 2578 tmp0 = tmps[vi[j]]; 2579 sum1 -= v1[j] * tmp0; 2580 sum2 -= v2[j] * tmp0; 2581 sum3 -= v3[j] * tmp0; 2582 sum4 -= v4[j] * tmp0; 2583 sum5 -= v5[j] * tmp0; 2584 } 2585 2586 sum2 -= v2[nz] * sum1; 2587 sum3 -= v3[nz] * sum1; 2588 sum4 -= v4[nz] * sum1; 2589 sum5 -= v5[nz] * sum1; 2590 sum3 -= v3[nz + 1] * sum2; 2591 sum4 -= v4[nz + 1] * sum2; 2592 sum5 -= v5[nz + 1] * sum2; 2593 sum4 -= v4[nz + 2] * sum3; 2594 sum5 -= v5[nz + 2] * sum3; 2595 sum5 -= v5[nz + 3] * sum4; 2596 2597 tmp[row++] = sum1; 2598 tmp[row++] = sum2; 2599 tmp[row++] = sum3; 2600 tmp[row++] = sum4; 2601 tmp[row++] = sum5; 2602 break; 2603 default: 2604 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2605 } 2606 } 2607 /* backward solve the upper triangular */ 2608 for (i = node_max - 1; i >= 0; i--) { 2609 row = ns[i + 1] - 1; 2610 nsz = ns[i + 1] - ns[i]; 2611 aii = ad[row + 1] + 1; 2612 v1 = aa + aii; 2613 vi = aj + aii; 2614 nz = ad[row] - ad[row + 1] - 1; 2615 2616 if (i > 0) { 2617 /* Prefetch the indices for the next block */ 2618 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2619 /* Prefetch the data for the next block */ 2620 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2621 } 2622 2623 switch (nsz) { /* Each loop in 'case' is unrolled */ 2624 case 1: 2625 sum1 = tmp[row]; 2626 2627 for (j = 0; j < nz - 1; j += 2) { 2628 i0 = vi[j]; 2629 i1 = vi[j + 1]; 2630 tmp0 = tmps[i0]; 2631 tmp1 = tmps[i1]; 2632 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2633 } 2634 if (j == nz - 1) { 2635 tmp0 = tmps[vi[j]]; 2636 sum1 -= v1[j] * tmp0; 2637 } 2638 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2639 row--; 2640 break; 2641 case 2: 2642 sum1 = tmp[row]; 2643 sum2 = tmp[row - 1]; 2644 v2 = aa + ad[row] + 1; 2645 for (j = 0; j < nz - 1; j += 2) { 2646 i0 = vi[j]; 2647 i1 = vi[j + 1]; 2648 tmp0 = tmps[i0]; 2649 tmp1 = tmps[i1]; 2650 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2651 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2652 } 2653 if (j == nz - 1) { 2654 tmp0 = tmps[vi[j]]; 2655 sum1 -= v1[j] * tmp0; 2656 sum2 -= v2[j + 1] * tmp0; 2657 } 2658 2659 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2660 row--; 2661 sum2 -= v2[0] * tmp0; 2662 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2663 row--; 2664 break; 2665 case 3: 2666 sum1 = tmp[row]; 2667 sum2 = tmp[row - 1]; 2668 sum3 = tmp[row - 2]; 2669 v2 = aa + ad[row] + 1; 2670 v3 = aa + ad[row - 1] + 1; 2671 for (j = 0; j < nz - 1; j += 2) { 2672 i0 = vi[j]; 2673 i1 = vi[j + 1]; 2674 tmp0 = tmps[i0]; 2675 tmp1 = tmps[i1]; 2676 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2677 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2678 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2679 } 2680 if (j == nz - 1) { 2681 tmp0 = tmps[vi[j]]; 2682 sum1 -= v1[j] * tmp0; 2683 sum2 -= v2[j + 1] * tmp0; 2684 sum3 -= v3[j + 2] * tmp0; 2685 } 2686 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2687 row--; 2688 sum2 -= v2[0] * tmp0; 2689 sum3 -= v3[1] * tmp0; 2690 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2691 row--; 2692 sum3 -= v3[0] * tmp0; 2693 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2694 row--; 2695 2696 break; 2697 case 4: 2698 sum1 = tmp[row]; 2699 sum2 = tmp[row - 1]; 2700 sum3 = tmp[row - 2]; 2701 sum4 = tmp[row - 3]; 2702 v2 = aa + ad[row] + 1; 2703 v3 = aa + ad[row - 1] + 1; 2704 v4 = aa + ad[row - 2] + 1; 2705 2706 for (j = 0; j < nz - 1; j += 2) { 2707 i0 = vi[j]; 2708 i1 = vi[j + 1]; 2709 tmp0 = tmps[i0]; 2710 tmp1 = tmps[i1]; 2711 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2712 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2713 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2714 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2715 } 2716 if (j == nz - 1) { 2717 tmp0 = tmps[vi[j]]; 2718 sum1 -= v1[j] * tmp0; 2719 sum2 -= v2[j + 1] * tmp0; 2720 sum3 -= v3[j + 2] * tmp0; 2721 sum4 -= v4[j + 3] * tmp0; 2722 } 2723 2724 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2725 row--; 2726 sum2 -= v2[0] * tmp0; 2727 sum3 -= v3[1] * tmp0; 2728 sum4 -= v4[2] * tmp0; 2729 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2730 row--; 2731 sum3 -= v3[0] * tmp0; 2732 sum4 -= v4[1] * tmp0; 2733 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2734 row--; 2735 sum4 -= v4[0] * tmp0; 2736 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2737 row--; 2738 break; 2739 case 5: 2740 sum1 = tmp[row]; 2741 sum2 = tmp[row - 1]; 2742 sum3 = tmp[row - 2]; 2743 sum4 = tmp[row - 3]; 2744 sum5 = tmp[row - 4]; 2745 v2 = aa + ad[row] + 1; 2746 v3 = aa + ad[row - 1] + 1; 2747 v4 = aa + ad[row - 2] + 1; 2748 v5 = aa + ad[row - 3] + 1; 2749 for (j = 0; j < nz - 1; j += 2) { 2750 i0 = vi[j]; 2751 i1 = vi[j + 1]; 2752 tmp0 = tmps[i0]; 2753 tmp1 = tmps[i1]; 2754 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2755 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2756 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2757 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2758 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2759 } 2760 if (j == nz - 1) { 2761 tmp0 = tmps[vi[j]]; 2762 sum1 -= v1[j] * tmp0; 2763 sum2 -= v2[j + 1] * tmp0; 2764 sum3 -= v3[j + 2] * tmp0; 2765 sum4 -= v4[j + 3] * tmp0; 2766 sum5 -= v5[j + 4] * tmp0; 2767 } 2768 2769 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2770 row--; 2771 sum2 -= v2[0] * tmp0; 2772 sum3 -= v3[1] * tmp0; 2773 sum4 -= v4[2] * tmp0; 2774 sum5 -= v5[3] * tmp0; 2775 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2776 row--; 2777 sum3 -= v3[0] * tmp0; 2778 sum4 -= v4[1] * tmp0; 2779 sum5 -= v5[2] * tmp0; 2780 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2781 row--; 2782 sum4 -= v4[0] * tmp0; 2783 sum5 -= v5[1] * tmp0; 2784 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2785 row--; 2786 sum5 -= v5[0] * tmp0; 2787 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2788 row--; 2789 break; 2790 default: 2791 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2792 } 2793 } 2794 PetscCall(ISRestoreIndices(isrow, &rout)); 2795 PetscCall(ISRestoreIndices(iscol, &cout)); 2796 PetscCall(VecRestoreArrayRead(bb, &b)); 2797 PetscCall(VecRestoreArrayWrite(xx, &x)); 2798 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2799 PetscFunctionReturn(PETSC_SUCCESS); 2800 } 2801 2802 /* 2803 Makes a longer coloring[] array and calls the usual code with that 2804 */ 2805 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2806 { 2807 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2808 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row; 2809 PetscInt *colorused, i; 2810 ISColoringValue *newcolor; 2811 2812 PetscFunctionBegin; 2813 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2814 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2815 /* loop over inodes, marking a color for each column*/ 2816 row = 0; 2817 for (i = 0; i < m; i++) { 2818 for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++)); 2819 } 2820 2821 /* eliminate unneeded colors */ 2822 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2823 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2824 2825 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2826 ncolors = colorused[5 * ncolors - 1]; 2827 for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i)); 2828 PetscCall(PetscFree(colorused)); 2829 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2830 PetscCall(PetscFree(coloring)); 2831 PetscFunctionReturn(PETSC_SUCCESS); 2832 } 2833 2834 #include <petsc/private/kernels/blockinvert.h> 2835 2836 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2837 { 2838 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2839 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2840 MatScalar *ibdiag, *bdiag, work[25], *t; 2841 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2842 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2843 const PetscScalar *xb, *b; 2844 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2845 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz; 2846 PetscInt sz, k, ipvt[5]; 2847 PetscBool allowzeropivot, zeropivotdetected; 2848 const PetscInt *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i; 2849 2850 PetscFunctionBegin; 2851 allowzeropivot = PetscNot(A->erroriffailure); 2852 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2853 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2854 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2855 2856 if (!a->inode.ibdiagvalid) { 2857 if (!a->inode.ibdiag) { 2858 /* calculate space needed for diagonal blocks */ 2859 for (i = 0; i < m; i++) { 2860 nodesz = sizes[i + 1] - sizes[i]; 2861 cnt += nodesz * nodesz; 2862 } 2863 a->inode.bdiagsize = cnt; 2864 2865 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2866 } 2867 2868 /* copy over the diagonal blocks and invert them */ 2869 ibdiag = a->inode.ibdiag; 2870 bdiag = a->inode.bdiag; 2871 cnt = 0; 2872 for (i = 0, row = 0; i < m; i++) { 2873 nodesz = sizes[i + 1] - sizes[i]; 2874 for (j = 0; j < nodesz; j++) { 2875 for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k]; 2876 } 2877 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz)); 2878 2879 switch (nodesz) { 2880 case 1: 2881 /* Create matrix data structure */ 2882 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2883 if (allowzeropivot) { 2884 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2885 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2886 A->factorerror_zeropivot_row = row; 2887 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2888 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2889 } 2890 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2891 break; 2892 case 2: 2893 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2894 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2895 break; 2896 case 3: 2897 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2898 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2899 break; 2900 case 4: 2901 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2902 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2903 break; 2904 case 5: 2905 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2906 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2907 break; 2908 default: 2909 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 2910 } 2911 cnt += nodesz * nodesz; 2912 row += nodesz; 2913 } 2914 a->inode.ibdiagvalid = PETSC_TRUE; 2915 } 2916 ibdiag = a->inode.ibdiag; 2917 bdiag = a->inode.bdiag; 2918 t = a->inode.ssor_work; 2919 2920 PetscCall(VecGetArray(xx, &x)); 2921 PetscCall(VecGetArrayRead(bb, &b)); 2922 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2923 if (flag & SOR_ZERO_INITIAL_GUESS) { 2924 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2925 for (i = 0, row = 0; i < m; i++) { 2926 sz = diag[row] - ii[row]; 2927 v1 = a->a + ii[row]; 2928 idx = a->j + ii[row]; 2929 2930 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2931 nodesz = sizes[i + 1] - sizes[i]; 2932 switch (nodesz) { 2933 case 1: 2934 2935 sum1 = b[row]; 2936 for (n = 0; n < sz - 1; n += 2) { 2937 i1 = idx[0]; 2938 i2 = idx[1]; 2939 idx += 2; 2940 tmp0 = x[i1]; 2941 tmp1 = x[i2]; 2942 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2943 v1 += 2; 2944 } 2945 2946 if (n == sz - 1) { 2947 tmp0 = x[*idx]; 2948 sum1 -= *v1 * tmp0; 2949 } 2950 t[row] = sum1; 2951 x[row++] = sum1 * (*ibdiag++); 2952 break; 2953 case 2: 2954 v2 = a->a + ii[row + 1]; 2955 sum1 = b[row]; 2956 sum2 = b[row + 1]; 2957 for (n = 0; n < sz - 1; n += 2) { 2958 i1 = idx[0]; 2959 i2 = idx[1]; 2960 idx += 2; 2961 tmp0 = x[i1]; 2962 tmp1 = x[i2]; 2963 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2964 v1 += 2; 2965 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2966 v2 += 2; 2967 } 2968 2969 if (n == sz - 1) { 2970 tmp0 = x[*idx]; 2971 sum1 -= v1[0] * tmp0; 2972 sum2 -= v2[0] * tmp0; 2973 } 2974 t[row] = sum1; 2975 t[row + 1] = sum2; 2976 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2977 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2978 ibdiag += 4; 2979 break; 2980 case 3: 2981 v2 = a->a + ii[row + 1]; 2982 v3 = a->a + ii[row + 2]; 2983 sum1 = b[row]; 2984 sum2 = b[row + 1]; 2985 sum3 = b[row + 2]; 2986 for (n = 0; n < sz - 1; n += 2) { 2987 i1 = idx[0]; 2988 i2 = idx[1]; 2989 idx += 2; 2990 tmp0 = x[i1]; 2991 tmp1 = x[i2]; 2992 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2993 v1 += 2; 2994 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2995 v2 += 2; 2996 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2997 v3 += 2; 2998 } 2999 3000 if (n == sz - 1) { 3001 tmp0 = x[*idx]; 3002 sum1 -= v1[0] * tmp0; 3003 sum2 -= v2[0] * tmp0; 3004 sum3 -= v3[0] * tmp0; 3005 } 3006 t[row] = sum1; 3007 t[row + 1] = sum2; 3008 t[row + 2] = sum3; 3009 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3010 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3011 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3012 ibdiag += 9; 3013 break; 3014 case 4: 3015 v2 = a->a + ii[row + 1]; 3016 v3 = a->a + ii[row + 2]; 3017 v4 = a->a + ii[row + 3]; 3018 sum1 = b[row]; 3019 sum2 = b[row + 1]; 3020 sum3 = b[row + 2]; 3021 sum4 = b[row + 3]; 3022 for (n = 0; n < sz - 1; n += 2) { 3023 i1 = idx[0]; 3024 i2 = idx[1]; 3025 idx += 2; 3026 tmp0 = x[i1]; 3027 tmp1 = x[i2]; 3028 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3029 v1 += 2; 3030 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3031 v2 += 2; 3032 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3033 v3 += 2; 3034 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3035 v4 += 2; 3036 } 3037 3038 if (n == sz - 1) { 3039 tmp0 = x[*idx]; 3040 sum1 -= v1[0] * tmp0; 3041 sum2 -= v2[0] * tmp0; 3042 sum3 -= v3[0] * tmp0; 3043 sum4 -= v4[0] * tmp0; 3044 } 3045 t[row] = sum1; 3046 t[row + 1] = sum2; 3047 t[row + 2] = sum3; 3048 t[row + 3] = sum4; 3049 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3050 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3051 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3052 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3053 ibdiag += 16; 3054 break; 3055 case 5: 3056 v2 = a->a + ii[row + 1]; 3057 v3 = a->a + ii[row + 2]; 3058 v4 = a->a + ii[row + 3]; 3059 v5 = a->a + ii[row + 4]; 3060 sum1 = b[row]; 3061 sum2 = b[row + 1]; 3062 sum3 = b[row + 2]; 3063 sum4 = b[row + 3]; 3064 sum5 = b[row + 4]; 3065 for (n = 0; n < sz - 1; n += 2) { 3066 i1 = idx[0]; 3067 i2 = idx[1]; 3068 idx += 2; 3069 tmp0 = x[i1]; 3070 tmp1 = x[i2]; 3071 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3072 v1 += 2; 3073 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3074 v2 += 2; 3075 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3076 v3 += 2; 3077 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3078 v4 += 2; 3079 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3080 v5 += 2; 3081 } 3082 3083 if (n == sz - 1) { 3084 tmp0 = x[*idx]; 3085 sum1 -= v1[0] * tmp0; 3086 sum2 -= v2[0] * tmp0; 3087 sum3 -= v3[0] * tmp0; 3088 sum4 -= v4[0] * tmp0; 3089 sum5 -= v5[0] * tmp0; 3090 } 3091 t[row] = sum1; 3092 t[row + 1] = sum2; 3093 t[row + 2] = sum3; 3094 t[row + 3] = sum4; 3095 t[row + 4] = sum5; 3096 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3097 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3098 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3099 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3100 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3101 ibdiag += 25; 3102 break; 3103 default: 3104 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3105 } 3106 } 3107 3108 xb = t; 3109 PetscCall(PetscLogFlops(a->nz)); 3110 } else xb = b; 3111 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3112 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3113 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3114 nodesz = sizes[i + 1] - sizes[i]; 3115 ibdiag -= nodesz * nodesz; 3116 sz = ii[row + 1] - diag[row] - 1; 3117 v1 = a->a + diag[row] + 1; 3118 idx = a->j + diag[row] + 1; 3119 3120 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3121 switch (nodesz) { 3122 case 1: 3123 3124 sum1 = xb[row]; 3125 for (n = 0; n < sz - 1; n += 2) { 3126 i1 = idx[0]; 3127 i2 = idx[1]; 3128 idx += 2; 3129 tmp0 = x[i1]; 3130 tmp1 = x[i2]; 3131 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3132 v1 += 2; 3133 } 3134 3135 if (n == sz - 1) { 3136 tmp0 = x[*idx]; 3137 sum1 -= *v1 * tmp0; 3138 } 3139 x[row--] = sum1 * (*ibdiag); 3140 break; 3141 3142 case 2: 3143 3144 sum1 = xb[row]; 3145 sum2 = xb[row - 1]; 3146 /* note that sum1 is associated with the second of the two rows */ 3147 v2 = a->a + diag[row - 1] + 2; 3148 for (n = 0; n < sz - 1; n += 2) { 3149 i1 = idx[0]; 3150 i2 = idx[1]; 3151 idx += 2; 3152 tmp0 = x[i1]; 3153 tmp1 = x[i2]; 3154 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3155 v1 += 2; 3156 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3157 v2 += 2; 3158 } 3159 3160 if (n == sz - 1) { 3161 tmp0 = x[*idx]; 3162 sum1 -= *v1 * tmp0; 3163 sum2 -= *v2 * tmp0; 3164 } 3165 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3166 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3167 break; 3168 case 3: 3169 3170 sum1 = xb[row]; 3171 sum2 = xb[row - 1]; 3172 sum3 = xb[row - 2]; 3173 v2 = a->a + diag[row - 1] + 2; 3174 v3 = a->a + diag[row - 2] + 3; 3175 for (n = 0; n < sz - 1; n += 2) { 3176 i1 = idx[0]; 3177 i2 = idx[1]; 3178 idx += 2; 3179 tmp0 = x[i1]; 3180 tmp1 = x[i2]; 3181 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3182 v1 += 2; 3183 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3184 v2 += 2; 3185 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3186 v3 += 2; 3187 } 3188 3189 if (n == sz - 1) { 3190 tmp0 = x[*idx]; 3191 sum1 -= *v1 * tmp0; 3192 sum2 -= *v2 * tmp0; 3193 sum3 -= *v3 * tmp0; 3194 } 3195 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3196 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3197 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3198 break; 3199 case 4: 3200 3201 sum1 = xb[row]; 3202 sum2 = xb[row - 1]; 3203 sum3 = xb[row - 2]; 3204 sum4 = xb[row - 3]; 3205 v2 = a->a + diag[row - 1] + 2; 3206 v3 = a->a + diag[row - 2] + 3; 3207 v4 = a->a + diag[row - 3] + 4; 3208 for (n = 0; n < sz - 1; n += 2) { 3209 i1 = idx[0]; 3210 i2 = idx[1]; 3211 idx += 2; 3212 tmp0 = x[i1]; 3213 tmp1 = x[i2]; 3214 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3215 v1 += 2; 3216 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3217 v2 += 2; 3218 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3219 v3 += 2; 3220 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3221 v4 += 2; 3222 } 3223 3224 if (n == sz - 1) { 3225 tmp0 = x[*idx]; 3226 sum1 -= *v1 * tmp0; 3227 sum2 -= *v2 * tmp0; 3228 sum3 -= *v3 * tmp0; 3229 sum4 -= *v4 * tmp0; 3230 } 3231 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3232 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3233 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3234 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3235 break; 3236 case 5: 3237 3238 sum1 = xb[row]; 3239 sum2 = xb[row - 1]; 3240 sum3 = xb[row - 2]; 3241 sum4 = xb[row - 3]; 3242 sum5 = xb[row - 4]; 3243 v2 = a->a + diag[row - 1] + 2; 3244 v3 = a->a + diag[row - 2] + 3; 3245 v4 = a->a + diag[row - 3] + 4; 3246 v5 = a->a + diag[row - 4] + 5; 3247 for (n = 0; n < sz - 1; n += 2) { 3248 i1 = idx[0]; 3249 i2 = idx[1]; 3250 idx += 2; 3251 tmp0 = x[i1]; 3252 tmp1 = x[i2]; 3253 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3254 v1 += 2; 3255 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3256 v2 += 2; 3257 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3258 v3 += 2; 3259 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3260 v4 += 2; 3261 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3262 v5 += 2; 3263 } 3264 3265 if (n == sz - 1) { 3266 tmp0 = x[*idx]; 3267 sum1 -= *v1 * tmp0; 3268 sum2 -= *v2 * tmp0; 3269 sum3 -= *v3 * tmp0; 3270 sum4 -= *v4 * tmp0; 3271 sum5 -= *v5 * tmp0; 3272 } 3273 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3274 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3275 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3276 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3277 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3278 break; 3279 default: 3280 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3281 } 3282 } 3283 3284 PetscCall(PetscLogFlops(a->nz)); 3285 } 3286 its--; 3287 } 3288 while (its--) { 3289 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3290 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) { 3291 nodesz = sizes[i + 1] - sizes[i]; 3292 sz = diag[row] - ii[row]; 3293 v1 = a->a + ii[row]; 3294 idx = a->j + ii[row]; 3295 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3296 switch (nodesz) { 3297 case 1: 3298 sum1 = b[row]; 3299 for (n = 0; n < sz - 1; n += 2) { 3300 i1 = idx[0]; 3301 i2 = idx[1]; 3302 idx += 2; 3303 tmp0 = x[i1]; 3304 tmp1 = x[i2]; 3305 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3306 v1 += 2; 3307 } 3308 if (n == sz - 1) { 3309 tmp0 = x[*idx++]; 3310 sum1 -= *v1 * tmp0; 3311 v1++; 3312 } 3313 t[row] = sum1; 3314 sz = ii[row + 1] - diag[row] - 1; 3315 idx = a->j + diag[row] + 1; 3316 v1 += 1; 3317 for (n = 0; n < sz - 1; n += 2) { 3318 i1 = idx[0]; 3319 i2 = idx[1]; 3320 idx += 2; 3321 tmp0 = x[i1]; 3322 tmp1 = x[i2]; 3323 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3324 v1 += 2; 3325 } 3326 if (n == sz - 1) { 3327 tmp0 = x[*idx++]; 3328 sum1 -= *v1 * tmp0; 3329 } 3330 /* in MatSOR_SeqAIJ this line would be 3331 * 3332 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3333 * 3334 * but omega == 1, so this becomes 3335 * 3336 * x[row] = sum1*(*ibdiag++); 3337 * 3338 */ 3339 x[row] = sum1 * (*ibdiag); 3340 break; 3341 case 2: 3342 v2 = a->a + ii[row + 1]; 3343 sum1 = b[row]; 3344 sum2 = b[row + 1]; 3345 for (n = 0; n < sz - 1; n += 2) { 3346 i1 = idx[0]; 3347 i2 = idx[1]; 3348 idx += 2; 3349 tmp0 = x[i1]; 3350 tmp1 = x[i2]; 3351 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3352 v1 += 2; 3353 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3354 v2 += 2; 3355 } 3356 if (n == sz - 1) { 3357 tmp0 = x[*idx++]; 3358 sum1 -= v1[0] * tmp0; 3359 sum2 -= v2[0] * tmp0; 3360 v1++; 3361 v2++; 3362 } 3363 t[row] = sum1; 3364 t[row + 1] = sum2; 3365 sz = ii[row + 1] - diag[row] - 2; 3366 idx = a->j + diag[row] + 2; 3367 v1 += 2; 3368 v2 += 2; 3369 for (n = 0; n < sz - 1; n += 2) { 3370 i1 = idx[0]; 3371 i2 = idx[1]; 3372 idx += 2; 3373 tmp0 = x[i1]; 3374 tmp1 = x[i2]; 3375 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3376 v1 += 2; 3377 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3378 v2 += 2; 3379 } 3380 if (n == sz - 1) { 3381 tmp0 = x[*idx]; 3382 sum1 -= v1[0] * tmp0; 3383 sum2 -= v2[0] * tmp0; 3384 } 3385 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3386 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3387 break; 3388 case 3: 3389 v2 = a->a + ii[row + 1]; 3390 v3 = a->a + ii[row + 2]; 3391 sum1 = b[row]; 3392 sum2 = b[row + 1]; 3393 sum3 = b[row + 2]; 3394 for (n = 0; n < sz - 1; n += 2) { 3395 i1 = idx[0]; 3396 i2 = idx[1]; 3397 idx += 2; 3398 tmp0 = x[i1]; 3399 tmp1 = x[i2]; 3400 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3401 v1 += 2; 3402 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3403 v2 += 2; 3404 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3405 v3 += 2; 3406 } 3407 if (n == sz - 1) { 3408 tmp0 = x[*idx++]; 3409 sum1 -= v1[0] * tmp0; 3410 sum2 -= v2[0] * tmp0; 3411 sum3 -= v3[0] * tmp0; 3412 v1++; 3413 v2++; 3414 v3++; 3415 } 3416 t[row] = sum1; 3417 t[row + 1] = sum2; 3418 t[row + 2] = sum3; 3419 sz = ii[row + 1] - diag[row] - 3; 3420 idx = a->j + diag[row] + 3; 3421 v1 += 3; 3422 v2 += 3; 3423 v3 += 3; 3424 for (n = 0; n < sz - 1; n += 2) { 3425 i1 = idx[0]; 3426 i2 = idx[1]; 3427 idx += 2; 3428 tmp0 = x[i1]; 3429 tmp1 = x[i2]; 3430 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3431 v1 += 2; 3432 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3433 v2 += 2; 3434 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3435 v3 += 2; 3436 } 3437 if (n == sz - 1) { 3438 tmp0 = x[*idx]; 3439 sum1 -= v1[0] * tmp0; 3440 sum2 -= v2[0] * tmp0; 3441 sum3 -= v3[0] * tmp0; 3442 } 3443 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3444 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3445 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3446 break; 3447 case 4: 3448 v2 = a->a + ii[row + 1]; 3449 v3 = a->a + ii[row + 2]; 3450 v4 = a->a + ii[row + 3]; 3451 sum1 = b[row]; 3452 sum2 = b[row + 1]; 3453 sum3 = b[row + 2]; 3454 sum4 = b[row + 3]; 3455 for (n = 0; n < sz - 1; n += 2) { 3456 i1 = idx[0]; 3457 i2 = idx[1]; 3458 idx += 2; 3459 tmp0 = x[i1]; 3460 tmp1 = x[i2]; 3461 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3462 v1 += 2; 3463 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3464 v2 += 2; 3465 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3466 v3 += 2; 3467 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3468 v4 += 2; 3469 } 3470 if (n == sz - 1) { 3471 tmp0 = x[*idx++]; 3472 sum1 -= v1[0] * tmp0; 3473 sum2 -= v2[0] * tmp0; 3474 sum3 -= v3[0] * tmp0; 3475 sum4 -= v4[0] * tmp0; 3476 v1++; 3477 v2++; 3478 v3++; 3479 v4++; 3480 } 3481 t[row] = sum1; 3482 t[row + 1] = sum2; 3483 t[row + 2] = sum3; 3484 t[row + 3] = sum4; 3485 sz = ii[row + 1] - diag[row] - 4; 3486 idx = a->j + diag[row] + 4; 3487 v1 += 4; 3488 v2 += 4; 3489 v3 += 4; 3490 v4 += 4; 3491 for (n = 0; n < sz - 1; n += 2) { 3492 i1 = idx[0]; 3493 i2 = idx[1]; 3494 idx += 2; 3495 tmp0 = x[i1]; 3496 tmp1 = x[i2]; 3497 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3498 v1 += 2; 3499 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3500 v2 += 2; 3501 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3502 v3 += 2; 3503 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3504 v4 += 2; 3505 } 3506 if (n == sz - 1) { 3507 tmp0 = x[*idx]; 3508 sum1 -= v1[0] * tmp0; 3509 sum2 -= v2[0] * tmp0; 3510 sum3 -= v3[0] * tmp0; 3511 sum4 -= v4[0] * tmp0; 3512 } 3513 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3514 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3515 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3516 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3517 break; 3518 case 5: 3519 v2 = a->a + ii[row + 1]; 3520 v3 = a->a + ii[row + 2]; 3521 v4 = a->a + ii[row + 3]; 3522 v5 = a->a + ii[row + 4]; 3523 sum1 = b[row]; 3524 sum2 = b[row + 1]; 3525 sum3 = b[row + 2]; 3526 sum4 = b[row + 3]; 3527 sum5 = b[row + 4]; 3528 for (n = 0; n < sz - 1; n += 2) { 3529 i1 = idx[0]; 3530 i2 = idx[1]; 3531 idx += 2; 3532 tmp0 = x[i1]; 3533 tmp1 = x[i2]; 3534 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3535 v1 += 2; 3536 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3537 v2 += 2; 3538 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3539 v3 += 2; 3540 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3541 v4 += 2; 3542 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3543 v5 += 2; 3544 } 3545 if (n == sz - 1) { 3546 tmp0 = x[*idx++]; 3547 sum1 -= v1[0] * tmp0; 3548 sum2 -= v2[0] * tmp0; 3549 sum3 -= v3[0] * tmp0; 3550 sum4 -= v4[0] * tmp0; 3551 sum5 -= v5[0] * tmp0; 3552 v1++; 3553 v2++; 3554 v3++; 3555 v4++; 3556 v5++; 3557 } 3558 t[row] = sum1; 3559 t[row + 1] = sum2; 3560 t[row + 2] = sum3; 3561 t[row + 3] = sum4; 3562 t[row + 4] = sum5; 3563 sz = ii[row + 1] - diag[row] - 5; 3564 idx = a->j + diag[row] + 5; 3565 v1 += 5; 3566 v2 += 5; 3567 v3 += 5; 3568 v4 += 5; 3569 v5 += 5; 3570 for (n = 0; n < sz - 1; n += 2) { 3571 i1 = idx[0]; 3572 i2 = idx[1]; 3573 idx += 2; 3574 tmp0 = x[i1]; 3575 tmp1 = x[i2]; 3576 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3577 v1 += 2; 3578 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3579 v2 += 2; 3580 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3581 v3 += 2; 3582 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3583 v4 += 2; 3584 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3585 v5 += 2; 3586 } 3587 if (n == sz - 1) { 3588 tmp0 = x[*idx]; 3589 sum1 -= v1[0] * tmp0; 3590 sum2 -= v2[0] * tmp0; 3591 sum3 -= v3[0] * tmp0; 3592 sum4 -= v4[0] * tmp0; 3593 sum5 -= v5[0] * tmp0; 3594 } 3595 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3596 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3597 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3598 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3599 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3600 break; 3601 default: 3602 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3603 } 3604 } 3605 xb = t; 3606 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3607 } else xb = b; 3608 3609 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3610 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3611 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3612 nodesz = sizes[i + 1] - sizes[i]; 3613 ibdiag -= nodesz * nodesz; 3614 3615 /* set RHS */ 3616 if (xb == b) { 3617 /* whole (old way) */ 3618 sz = ii[row + 1] - ii[row]; 3619 idx = a->j + ii[row]; 3620 switch (nodesz) { 3621 case 5: 3622 v5 = a->a + ii[row - 4]; /* fall through */ 3623 case 4: 3624 v4 = a->a + ii[row - 3]; /* fall through */ 3625 case 3: 3626 v3 = a->a + ii[row - 2]; /* fall through */ 3627 case 2: 3628 v2 = a->a + ii[row - 1]; /* fall through */ 3629 case 1: 3630 v1 = a->a + ii[row]; 3631 break; 3632 default: 3633 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3634 } 3635 } else { 3636 /* upper, no diag */ 3637 sz = ii[row + 1] - diag[row] - 1; 3638 idx = a->j + diag[row] + 1; 3639 switch (nodesz) { 3640 case 5: 3641 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3642 case 4: 3643 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3644 case 3: 3645 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3646 case 2: 3647 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3648 case 1: 3649 v1 = a->a + diag[row] + 1; 3650 } 3651 } 3652 /* set sum */ 3653 switch (nodesz) { 3654 case 5: 3655 sum5 = xb[row - 4]; /* fall through */ 3656 case 4: 3657 sum4 = xb[row - 3]; /* fall through */ 3658 case 3: 3659 sum3 = xb[row - 2]; /* fall through */ 3660 case 2: 3661 sum2 = xb[row - 1]; /* fall through */ 3662 case 1: 3663 /* note that sum1 is associated with the last row */ 3664 sum1 = xb[row]; 3665 } 3666 /* do sums */ 3667 for (n = 0; n < sz - 1; n += 2) { 3668 i1 = idx[0]; 3669 i2 = idx[1]; 3670 idx += 2; 3671 tmp0 = x[i1]; 3672 tmp1 = x[i2]; 3673 switch (nodesz) { 3674 case 5: 3675 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3676 v5 += 2; /* fall through */ 3677 case 4: 3678 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3679 v4 += 2; /* fall through */ 3680 case 3: 3681 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3682 v3 += 2; /* fall through */ 3683 case 2: 3684 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3685 v2 += 2; /* fall through */ 3686 case 1: 3687 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3688 v1 += 2; 3689 } 3690 } 3691 /* ragged edge */ 3692 if (n == sz - 1) { 3693 tmp0 = x[*idx]; 3694 switch (nodesz) { 3695 case 5: 3696 sum5 -= *v5 * tmp0; /* fall through */ 3697 case 4: 3698 sum4 -= *v4 * tmp0; /* fall through */ 3699 case 3: 3700 sum3 -= *v3 * tmp0; /* fall through */ 3701 case 2: 3702 sum2 -= *v2 * tmp0; /* fall through */ 3703 case 1: 3704 sum1 -= *v1 * tmp0; 3705 } 3706 } 3707 /* update */ 3708 if (xb == b) { 3709 /* whole (old way) w/ diag */ 3710 switch (nodesz) { 3711 case 5: 3712 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3713 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3714 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3715 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3716 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3717 break; 3718 case 4: 3719 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3720 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3721 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3722 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3723 break; 3724 case 3: 3725 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3726 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3727 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3728 break; 3729 case 2: 3730 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3731 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3732 break; 3733 case 1: 3734 x[row--] += sum1 * (*ibdiag); 3735 break; 3736 } 3737 } else { 3738 /* no diag so set = */ 3739 switch (nodesz) { 3740 case 5: 3741 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3742 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3743 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3744 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3745 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3746 break; 3747 case 4: 3748 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3749 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3750 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3751 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3752 break; 3753 case 3: 3754 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3755 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3756 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3757 break; 3758 case 2: 3759 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3760 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3761 break; 3762 case 1: 3763 x[row--] = sum1 * (*ibdiag); 3764 break; 3765 } 3766 } 3767 } 3768 if (xb == b) { 3769 PetscCall(PetscLogFlops(2.0 * a->nz)); 3770 } else { 3771 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3772 } 3773 } 3774 } 3775 if (flag & SOR_EISENSTAT) { 3776 /* 3777 Apply (U + D)^-1 where D is now the block diagonal 3778 */ 3779 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3780 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3781 nodesz = sizes[i + 1] - sizes[i]; 3782 ibdiag -= nodesz * nodesz; 3783 sz = ii[row + 1] - diag[row] - 1; 3784 v1 = a->a + diag[row] + 1; 3785 idx = a->j + diag[row] + 1; 3786 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3787 switch (nodesz) { 3788 case 1: 3789 3790 sum1 = b[row]; 3791 for (n = 0; n < sz - 1; n += 2) { 3792 i1 = idx[0]; 3793 i2 = idx[1]; 3794 idx += 2; 3795 tmp0 = x[i1]; 3796 tmp1 = x[i2]; 3797 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3798 v1 += 2; 3799 } 3800 3801 if (n == sz - 1) { 3802 tmp0 = x[*idx]; 3803 sum1 -= *v1 * tmp0; 3804 } 3805 x[row] = sum1 * (*ibdiag); 3806 row--; 3807 break; 3808 3809 case 2: 3810 3811 sum1 = b[row]; 3812 sum2 = b[row - 1]; 3813 /* note that sum1 is associated with the second of the two rows */ 3814 v2 = a->a + diag[row - 1] + 2; 3815 for (n = 0; n < sz - 1; n += 2) { 3816 i1 = idx[0]; 3817 i2 = idx[1]; 3818 idx += 2; 3819 tmp0 = x[i1]; 3820 tmp1 = x[i2]; 3821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3822 v1 += 2; 3823 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3824 v2 += 2; 3825 } 3826 3827 if (n == sz - 1) { 3828 tmp0 = x[*idx]; 3829 sum1 -= *v1 * tmp0; 3830 sum2 -= *v2 * tmp0; 3831 } 3832 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3833 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3834 row -= 2; 3835 break; 3836 case 3: 3837 3838 sum1 = b[row]; 3839 sum2 = b[row - 1]; 3840 sum3 = b[row - 2]; 3841 v2 = a->a + diag[row - 1] + 2; 3842 v3 = a->a + diag[row - 2] + 3; 3843 for (n = 0; n < sz - 1; n += 2) { 3844 i1 = idx[0]; 3845 i2 = idx[1]; 3846 idx += 2; 3847 tmp0 = x[i1]; 3848 tmp1 = x[i2]; 3849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3850 v1 += 2; 3851 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3852 v2 += 2; 3853 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3854 v3 += 2; 3855 } 3856 3857 if (n == sz - 1) { 3858 tmp0 = x[*idx]; 3859 sum1 -= *v1 * tmp0; 3860 sum2 -= *v2 * tmp0; 3861 sum3 -= *v3 * tmp0; 3862 } 3863 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3864 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3865 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3866 row -= 3; 3867 break; 3868 case 4: 3869 3870 sum1 = b[row]; 3871 sum2 = b[row - 1]; 3872 sum3 = b[row - 2]; 3873 sum4 = b[row - 3]; 3874 v2 = a->a + diag[row - 1] + 2; 3875 v3 = a->a + diag[row - 2] + 3; 3876 v4 = a->a + diag[row - 3] + 4; 3877 for (n = 0; n < sz - 1; n += 2) { 3878 i1 = idx[0]; 3879 i2 = idx[1]; 3880 idx += 2; 3881 tmp0 = x[i1]; 3882 tmp1 = x[i2]; 3883 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3884 v1 += 2; 3885 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3886 v2 += 2; 3887 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3888 v3 += 2; 3889 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3890 v4 += 2; 3891 } 3892 3893 if (n == sz - 1) { 3894 tmp0 = x[*idx]; 3895 sum1 -= *v1 * tmp0; 3896 sum2 -= *v2 * tmp0; 3897 sum3 -= *v3 * tmp0; 3898 sum4 -= *v4 * tmp0; 3899 } 3900 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3901 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3902 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3903 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3904 row -= 4; 3905 break; 3906 case 5: 3907 3908 sum1 = b[row]; 3909 sum2 = b[row - 1]; 3910 sum3 = b[row - 2]; 3911 sum4 = b[row - 3]; 3912 sum5 = b[row - 4]; 3913 v2 = a->a + diag[row - 1] + 2; 3914 v3 = a->a + diag[row - 2] + 3; 3915 v4 = a->a + diag[row - 3] + 4; 3916 v5 = a->a + diag[row - 4] + 5; 3917 for (n = 0; n < sz - 1; n += 2) { 3918 i1 = idx[0]; 3919 i2 = idx[1]; 3920 idx += 2; 3921 tmp0 = x[i1]; 3922 tmp1 = x[i2]; 3923 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3924 v1 += 2; 3925 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3926 v2 += 2; 3927 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3928 v3 += 2; 3929 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3930 v4 += 2; 3931 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3932 v5 += 2; 3933 } 3934 3935 if (n == sz - 1) { 3936 tmp0 = x[*idx]; 3937 sum1 -= *v1 * tmp0; 3938 sum2 -= *v2 * tmp0; 3939 sum3 -= *v3 * tmp0; 3940 sum4 -= *v4 * tmp0; 3941 sum5 -= *v5 * tmp0; 3942 } 3943 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3944 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3945 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3946 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3947 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3948 row -= 5; 3949 break; 3950 default: 3951 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3952 } 3953 } 3954 PetscCall(PetscLogFlops(a->nz)); 3955 3956 /* 3957 t = b - D x where D is the block diagonal 3958 */ 3959 cnt = 0; 3960 for (i = 0, row = 0; i < m; i++) { 3961 nodesz = sizes[i + 1] - sizes[i]; 3962 switch (nodesz) { 3963 case 1: 3964 t[row] = b[row] - bdiag[cnt++] * x[row]; 3965 row++; 3966 break; 3967 case 2: 3968 x1 = x[row]; 3969 x2 = x[row + 1]; 3970 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3971 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3972 t[row] = b[row] - tmp1; 3973 t[row + 1] = b[row + 1] - tmp2; 3974 row += 2; 3975 cnt += 4; 3976 break; 3977 case 3: 3978 x1 = x[row]; 3979 x2 = x[row + 1]; 3980 x3 = x[row + 2]; 3981 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3982 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3983 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3984 t[row] = b[row] - tmp1; 3985 t[row + 1] = b[row + 1] - tmp2; 3986 t[row + 2] = b[row + 2] - tmp3; 3987 row += 3; 3988 cnt += 9; 3989 break; 3990 case 4: 3991 x1 = x[row]; 3992 x2 = x[row + 1]; 3993 x3 = x[row + 2]; 3994 x4 = x[row + 3]; 3995 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3996 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3997 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3998 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3999 t[row] = b[row] - tmp1; 4000 t[row + 1] = b[row + 1] - tmp2; 4001 t[row + 2] = b[row + 2] - tmp3; 4002 t[row + 3] = b[row + 3] - tmp4; 4003 row += 4; 4004 cnt += 16; 4005 break; 4006 case 5: 4007 x1 = x[row]; 4008 x2 = x[row + 1]; 4009 x3 = x[row + 2]; 4010 x4 = x[row + 3]; 4011 x5 = x[row + 4]; 4012 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4013 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4014 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4015 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4016 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4017 t[row] = b[row] - tmp1; 4018 t[row + 1] = b[row + 1] - tmp2; 4019 t[row + 2] = b[row + 2] - tmp3; 4020 t[row + 3] = b[row + 3] - tmp4; 4021 t[row + 4] = b[row + 4] - tmp5; 4022 row += 5; 4023 cnt += 25; 4024 break; 4025 default: 4026 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4027 } 4028 } 4029 PetscCall(PetscLogFlops(m)); 4030 4031 /* 4032 Apply (L + D)^-1 where D is the block diagonal 4033 */ 4034 for (i = 0, row = 0; i < m; i++) { 4035 nodesz = sizes[i + 1] - sizes[i]; 4036 sz = diag[row] - ii[row]; 4037 v1 = a->a + ii[row]; 4038 idx = a->j + ii[row]; 4039 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4040 switch (nodesz) { 4041 case 1: 4042 4043 sum1 = t[row]; 4044 for (n = 0; n < sz - 1; n += 2) { 4045 i1 = idx[0]; 4046 i2 = idx[1]; 4047 idx += 2; 4048 tmp0 = t[i1]; 4049 tmp1 = t[i2]; 4050 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4051 v1 += 2; 4052 } 4053 4054 if (n == sz - 1) { 4055 tmp0 = t[*idx]; 4056 sum1 -= *v1 * tmp0; 4057 } 4058 x[row] += t[row] = sum1 * (*ibdiag++); 4059 row++; 4060 break; 4061 case 2: 4062 v2 = a->a + ii[row + 1]; 4063 sum1 = t[row]; 4064 sum2 = t[row + 1]; 4065 for (n = 0; n < sz - 1; n += 2) { 4066 i1 = idx[0]; 4067 i2 = idx[1]; 4068 idx += 2; 4069 tmp0 = t[i1]; 4070 tmp1 = t[i2]; 4071 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4072 v1 += 2; 4073 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4074 v2 += 2; 4075 } 4076 4077 if (n == sz - 1) { 4078 tmp0 = t[*idx]; 4079 sum1 -= v1[0] * tmp0; 4080 sum2 -= v2[0] * tmp0; 4081 } 4082 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4083 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4084 ibdiag += 4; 4085 row += 2; 4086 break; 4087 case 3: 4088 v2 = a->a + ii[row + 1]; 4089 v3 = a->a + ii[row + 2]; 4090 sum1 = t[row]; 4091 sum2 = t[row + 1]; 4092 sum3 = t[row + 2]; 4093 for (n = 0; n < sz - 1; n += 2) { 4094 i1 = idx[0]; 4095 i2 = idx[1]; 4096 idx += 2; 4097 tmp0 = t[i1]; 4098 tmp1 = t[i2]; 4099 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4100 v1 += 2; 4101 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4102 v2 += 2; 4103 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4104 v3 += 2; 4105 } 4106 4107 if (n == sz - 1) { 4108 tmp0 = t[*idx]; 4109 sum1 -= v1[0] * tmp0; 4110 sum2 -= v2[0] * tmp0; 4111 sum3 -= v3[0] * tmp0; 4112 } 4113 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4114 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4115 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4116 ibdiag += 9; 4117 row += 3; 4118 break; 4119 case 4: 4120 v2 = a->a + ii[row + 1]; 4121 v3 = a->a + ii[row + 2]; 4122 v4 = a->a + ii[row + 3]; 4123 sum1 = t[row]; 4124 sum2 = t[row + 1]; 4125 sum3 = t[row + 2]; 4126 sum4 = t[row + 3]; 4127 for (n = 0; n < sz - 1; n += 2) { 4128 i1 = idx[0]; 4129 i2 = idx[1]; 4130 idx += 2; 4131 tmp0 = t[i1]; 4132 tmp1 = t[i2]; 4133 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4134 v1 += 2; 4135 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4136 v2 += 2; 4137 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4138 v3 += 2; 4139 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4140 v4 += 2; 4141 } 4142 4143 if (n == sz - 1) { 4144 tmp0 = t[*idx]; 4145 sum1 -= v1[0] * tmp0; 4146 sum2 -= v2[0] * tmp0; 4147 sum3 -= v3[0] * tmp0; 4148 sum4 -= v4[0] * tmp0; 4149 } 4150 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4151 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4152 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4153 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4154 ibdiag += 16; 4155 row += 4; 4156 break; 4157 case 5: 4158 v2 = a->a + ii[row + 1]; 4159 v3 = a->a + ii[row + 2]; 4160 v4 = a->a + ii[row + 3]; 4161 v5 = a->a + ii[row + 4]; 4162 sum1 = t[row]; 4163 sum2 = t[row + 1]; 4164 sum3 = t[row + 2]; 4165 sum4 = t[row + 3]; 4166 sum5 = t[row + 4]; 4167 for (n = 0; n < sz - 1; n += 2) { 4168 i1 = idx[0]; 4169 i2 = idx[1]; 4170 idx += 2; 4171 tmp0 = t[i1]; 4172 tmp1 = t[i2]; 4173 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4174 v1 += 2; 4175 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4176 v2 += 2; 4177 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4178 v3 += 2; 4179 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4180 v4 += 2; 4181 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4182 v5 += 2; 4183 } 4184 4185 if (n == sz - 1) { 4186 tmp0 = t[*idx]; 4187 sum1 -= v1[0] * tmp0; 4188 sum2 -= v2[0] * tmp0; 4189 sum3 -= v3[0] * tmp0; 4190 sum4 -= v4[0] * tmp0; 4191 sum5 -= v5[0] * tmp0; 4192 } 4193 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4194 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4195 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4196 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4197 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4198 ibdiag += 25; 4199 row += 5; 4200 break; 4201 default: 4202 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4203 } 4204 } 4205 PetscCall(PetscLogFlops(a->nz)); 4206 } 4207 PetscCall(VecRestoreArray(xx, &x)); 4208 PetscCall(VecRestoreArrayRead(bb, &b)); 4209 PetscFunctionReturn(PETSC_SUCCESS); 4210 } 4211 4212 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4213 { 4214 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4215 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4216 const MatScalar *bdiag = a->inode.bdiag; 4217 const PetscScalar *b; 4218 PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz; 4219 const PetscInt *sizes = a->inode.size_csr; 4220 4221 PetscFunctionBegin; 4222 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4223 PetscCall(VecGetArray(xx, &x)); 4224 PetscCall(VecGetArrayRead(bb, &b)); 4225 cnt = 0; 4226 for (i = 0, row = 0; i < m; i++) { 4227 nodesz = sizes[i + 1] - sizes[i]; 4228 switch (nodesz) { 4229 case 1: 4230 x[row] = b[row] * bdiag[cnt++]; 4231 row++; 4232 break; 4233 case 2: 4234 x1 = b[row]; 4235 x2 = b[row + 1]; 4236 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4237 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4238 x[row++] = tmp1; 4239 x[row++] = tmp2; 4240 cnt += 4; 4241 break; 4242 case 3: 4243 x1 = b[row]; 4244 x2 = b[row + 1]; 4245 x3 = b[row + 2]; 4246 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4247 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4248 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4249 x[row++] = tmp1; 4250 x[row++] = tmp2; 4251 x[row++] = tmp3; 4252 cnt += 9; 4253 break; 4254 case 4: 4255 x1 = b[row]; 4256 x2 = b[row + 1]; 4257 x3 = b[row + 2]; 4258 x4 = b[row + 3]; 4259 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4260 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4261 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4262 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4263 x[row++] = tmp1; 4264 x[row++] = tmp2; 4265 x[row++] = tmp3; 4266 x[row++] = tmp4; 4267 cnt += 16; 4268 break; 4269 case 5: 4270 x1 = b[row]; 4271 x2 = b[row + 1]; 4272 x3 = b[row + 2]; 4273 x4 = b[row + 3]; 4274 x5 = b[row + 4]; 4275 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4276 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4277 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4278 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4279 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4280 x[row++] = tmp1; 4281 x[row++] = tmp2; 4282 x[row++] = tmp3; 4283 x[row++] = tmp4; 4284 x[row++] = tmp5; 4285 cnt += 25; 4286 break; 4287 default: 4288 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4289 } 4290 } 4291 PetscCall(PetscLogFlops(2.0 * cnt)); 4292 PetscCall(VecRestoreArray(xx, &x)); 4293 PetscCall(VecRestoreArrayRead(bb, &b)); 4294 PetscFunctionReturn(PETSC_SUCCESS); 4295 } 4296 4297 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4298 { 4299 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4300 4301 PetscFunctionBegin; 4302 a->inode.node_count = 0; 4303 a->inode.use = PETSC_FALSE; 4304 a->inode.checked = PETSC_FALSE; 4305 a->inode.mat_nonzerostate = -1; 4306 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4307 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4308 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4309 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4310 A->ops->coloringpatch = NULL; 4311 A->ops->multdiagonalblock = NULL; 4312 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4313 PetscFunctionReturn(PETSC_SUCCESS); 4314 } 4315 4316 /* 4317 samestructure indicates that the matrix has not changed its nonzero structure so we 4318 do not need to recompute the inodes 4319 */ 4320 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4321 { 4322 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4323 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4324 PetscBool flag; 4325 const PetscInt *idx, *idy, *ii; 4326 4327 PetscFunctionBegin; 4328 if (!a->inode.use) { 4329 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4330 PetscCall(PetscFree(a->inode.size_csr)); 4331 PetscFunctionReturn(PETSC_SUCCESS); 4332 } 4333 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 4334 4335 m = A->rmap->n; 4336 if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr)); 4337 ns = a->inode.size_csr; 4338 ns[0] = 0; 4339 4340 i = 0; 4341 node_count = 0; 4342 idx = a->j; 4343 ii = a->i; 4344 if (idx) { 4345 while (i < m) { /* For each row */ 4346 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4347 /* Limits the number of elements in a node to 'a->inode.limit' */ 4348 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4349 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4350 if (nzy != nzx) break; 4351 idy += nzx; /* Same nonzero pattern */ 4352 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4353 if (!flag) break; 4354 } 4355 ns[node_count + 1] = ns[node_count] + blk_size; 4356 node_count++; 4357 idx += blk_size * nzx; 4358 i = j; 4359 } 4360 } 4361 /* If not enough inodes found,, do not use inode version of the routines */ 4362 if (!m || !idx || node_count > .8 * m) { 4363 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4364 PetscCall(PetscFree(a->inode.size_csr)); 4365 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4366 } else { 4367 if (!A->factortype) { 4368 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4369 if (A->rmap->n == A->cmap->n) { 4370 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4371 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4372 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4373 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4374 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4375 } 4376 } else { 4377 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4378 } 4379 a->inode.node_count = node_count; 4380 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4381 } 4382 a->inode.checked = PETSC_TRUE; 4383 a->inode.mat_nonzerostate = A->nonzerostate; 4384 PetscFunctionReturn(PETSC_SUCCESS); 4385 } 4386 4387 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4388 { 4389 Mat B = *C; 4390 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4391 PetscInt m = A->rmap->n; 4392 4393 PetscFunctionBegin; 4394 c->inode.use = a->inode.use; 4395 c->inode.limit = a->inode.limit; 4396 c->inode.max_limit = a->inode.max_limit; 4397 c->inode.checked = PETSC_FALSE; 4398 c->inode.size_csr = NULL; 4399 c->inode.node_count = 0; 4400 c->inode.ibdiagvalid = PETSC_FALSE; 4401 c->inode.ibdiag = NULL; 4402 c->inode.bdiag = NULL; 4403 c->inode.mat_nonzerostate = -1; 4404 if (a->inode.use) { 4405 if (a->inode.checked && a->inode.size_csr) { 4406 PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr)); 4407 PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1)); 4408 4409 c->inode.checked = PETSC_TRUE; 4410 c->inode.node_count = a->inode.node_count; 4411 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4412 } 4413 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4414 if (!B->factortype) { 4415 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4416 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4417 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4418 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4419 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4420 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4421 } else { 4422 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4423 } 4424 } 4425 PetscFunctionReturn(PETSC_SUCCESS); 4426 } 4427 4428 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4429 { 4430 PetscInt k; 4431 const PetscInt *vi; 4432 4433 PetscFunctionBegin; 4434 vi = aj + ai[row]; 4435 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4436 vi = aj + adiag[row]; 4437 cols[nzl] = vi[0]; 4438 vi = aj + adiag[row + 1] + 1; 4439 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4440 PetscFunctionReturn(PETSC_SUCCESS); 4441 } 4442 /* 4443 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4444 Modified from MatSeqAIJCheckInode(). 4445 4446 Input Parameters: 4447 . Mat A - ILU or LU matrix factor 4448 4449 */ 4450 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4451 { 4452 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4453 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4454 PetscInt *cols1, *cols2, *ns; 4455 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4456 PetscBool flag; 4457 4458 PetscFunctionBegin; 4459 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4460 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4461 4462 m = A->rmap->n; 4463 if (a->inode.size_csr) ns = a->inode.size_csr; 4464 else PetscCall(PetscMalloc1(m + 1, &ns)); 4465 ns[0] = 0; 4466 4467 i = 0; 4468 node_count = 0; 4469 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4470 while (i < m) { /* For each row */ 4471 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4472 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4473 nzx = nzl1 + nzu1 + 1; 4474 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4475 4476 /* Limits the number of elements in a node to 'a->inode.limit' */ 4477 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4478 nzl2 = ai[j + 1] - ai[j]; 4479 nzu2 = adiag[j] - adiag[j + 1] - 1; 4480 nzy = nzl2 + nzu2 + 1; 4481 if (nzy != nzx) break; 4482 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4483 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4484 if (!flag) break; 4485 } 4486 ns[node_count + 1] = ns[node_count] + blk_size; 4487 node_count++; 4488 i = j; 4489 } 4490 PetscCall(PetscFree2(cols1, cols2)); 4491 /* If not enough inodes found,, do not use inode version of the routines */ 4492 if (!m || node_count > .8 * m) { 4493 PetscCall(PetscFree(ns)); 4494 4495 a->inode.node_count = 0; 4496 a->inode.size_csr = NULL; 4497 a->inode.use = PETSC_FALSE; 4498 4499 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4500 } else { 4501 A->ops->mult = NULL; 4502 A->ops->sor = NULL; 4503 A->ops->multadd = NULL; 4504 A->ops->getrowij = NULL; 4505 A->ops->restorerowij = NULL; 4506 A->ops->getcolumnij = NULL; 4507 A->ops->restorecolumnij = NULL; 4508 A->ops->coloringpatch = NULL; 4509 A->ops->multdiagonalblock = NULL; 4510 a->inode.node_count = node_count; 4511 a->inode.size_csr = ns; 4512 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4513 } 4514 a->inode.checked = PETSC_TRUE; 4515 PetscFunctionReturn(PETSC_SUCCESS); 4516 } 4517 4518 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4519 { 4520 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4521 4522 PetscFunctionBegin; 4523 a->inode.ibdiagvalid = PETSC_FALSE; 4524 PetscFunctionReturn(PETSC_SUCCESS); 4525 } 4526 4527 /* 4528 This is really ugly. if inodes are used this replaces the 4529 permutations with ones that correspond to rows/cols of the matrix 4530 rather than inode blocks 4531 */ 4532 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4533 { 4534 PetscFunctionBegin; 4535 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4536 PetscFunctionReturn(PETSC_SUCCESS); 4537 } 4538 4539 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4540 { 4541 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4542 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4543 const PetscInt *ridx, *cidx; 4544 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx; 4545 PetscInt nslim_col, *ns_col; 4546 IS ris = *rperm, cis = *cperm; 4547 4548 PetscFunctionBegin; 4549 if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4550 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4551 4552 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4553 PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns)); 4554 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4555 4556 PetscCall(ISGetIndices(ris, &ridx)); 4557 PetscCall(ISGetIndices(cis, &cidx)); 4558 4559 /* Form the inode structure for the rows of permuted matrix using inv perm*/ 4560 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]); 4561 4562 /* Construct the permutations for rows*/ 4563 for (i = 0, row = 0; i < nslim_row; ++i) { 4564 indx = ridx[i]; 4565 start_val = tns[indx]; 4566 end_val = tns[indx + 1]; 4567 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4568 } 4569 4570 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4571 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]); 4572 4573 /* Construct permutations for columns */ 4574 for (i = 0, col = 0; i < nslim_col; ++i) { 4575 indx = cidx[i]; 4576 start_val = tns[indx]; 4577 end_val = tns[indx + 1]; 4578 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4579 } 4580 4581 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4582 PetscCall(ISSetPermutation(*rperm)); 4583 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4584 PetscCall(ISSetPermutation(*cperm)); 4585 4586 PetscCall(ISRestoreIndices(ris, &ridx)); 4587 PetscCall(ISRestoreIndices(cis, &cidx)); 4588 4589 PetscCall(PetscFree(ns_col)); 4590 PetscCall(PetscFree2(permr, permc)); 4591 PetscCall(ISDestroy(&cis)); 4592 PetscCall(ISDestroy(&ris)); 4593 PetscCall(PetscFree(tns)); 4594 PetscFunctionReturn(PETSC_SUCCESS); 4595 } 4596 4597 /*@C 4598 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4599 4600 Not Collective 4601 4602 Input Parameter: 4603 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4604 4605 Output Parameters: 4606 + node_count - no of inodes present in the matrix. 4607 . sizes - an array of size `node_count`, with the sizes of each inode. 4608 - limit - the max size used to generate the inodes. 4609 4610 Level: advanced 4611 4612 Note: 4613 It should be called after the matrix is assembled. 4614 The contents of the sizes[] array should not be changed. 4615 `NULL` may be passed for information not needed 4616 4617 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 4618 @*/ 4619 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4620 { 4621 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4622 4623 PetscFunctionBegin; 4624 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4625 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4626 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4627 PetscFunctionReturn(PETSC_SUCCESS); 4628 } 4629 4630 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4631 { 4632 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4633 4634 PetscFunctionBegin; 4635 if (node_count) *node_count = a->inode.node_count; 4636 if (sizes) *sizes = a->inode.size_csr; 4637 if (limit) *limit = a->inode.limit; 4638 PetscFunctionReturn(PETSC_SUCCESS); 4639 } 4640