1 /* 2 This file provides high performance routines for the Inode format (compressed sparse row) 3 by taking advantage of rows with identical nonzero structure (I-nodes). 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #if defined(PETSC_HAVE_XMMINTRIN_H) 7 #include <xmmintrin.h> 8 #endif 9 10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 13 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 14 15 PetscFunctionBegin; 16 n = A->cmap->n; 17 m = A->rmap->n; 18 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 19 ns_row = a->inode.size_csr; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++); 24 for (; count + 1 < n; count++, i++); 25 if (count < n) i++; 26 *size = i; 27 PetscFunctionReturn(PETSC_SUCCESS); 28 } 29 PetscCall(PetscMalloc1(n + 1, &ns_col)); 30 ns_col[0] = 0; 31 32 /* Use the same row structure wherever feasible. */ 33 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1]; 34 35 /* if m < n; pad up the remainder with inode_limit */ 36 for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1; 37 /* The last node is the odd ball. pad it up with the remaining rows; */ 38 if (count < n) { 39 ns_col[i + 1] = ns_col[i] + (n - count); 40 i++; 41 } else if (count > n) { 42 /* Adjust for the over estimation */ 43 ns_col[i] += n - count; 44 } 45 *size = i; 46 *ns = ns_col; 47 PetscFunctionReturn(PETSC_SUCCESS); 48 } 49 50 /* 51 This builds symmetric version of nonzero structure, 52 */ 53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 54 { 55 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 56 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 57 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2; 58 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 59 60 PetscFunctionBegin; 61 nslim_row = a->inode.node_count; 62 m = A->rmap->n; 63 n = A->cmap->n; 64 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 65 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 66 67 /* Use the row_inode as column_inode */ 68 nslim_col = nslim_row; 69 ns_col = ns_row; 70 71 /* allocate space for reformatted inode structure */ 72 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 73 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]); 74 75 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 76 nsz = ns_col[i1 + 1] - ns_col[i1]; 77 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 78 } 79 /* allocate space for row pointers */ 80 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 81 *iia = ia; 82 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 83 84 /* determine the number of columns in each row */ 85 ia[0] = oshift; 86 for (i1 = 0; i1 < nslim_row; i1++) { 87 row = ns_row[i1]; 88 j = aj + ai[row] + ishift; 89 jmax = aj + ai[row + 1] + ishift; 90 if (j == jmax) continue; /* empty row */ 91 col = *j++ + ishift; 92 i2 = tvc[col]; 93 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 94 ia[i1 + 1]++; 95 ia[i2 + 1]++; 96 i2++; /* Start col of next node */ 97 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 98 i2 = tvc[col]; 99 } 100 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 101 } 102 103 /* shift ia[i] to point to next row */ 104 for (i1 = 1; i1 < nslim_row + 1; i1++) { 105 row = ia[i1 - 1]; 106 ia[i1] += row; 107 work[i1 - 1] = row - oshift; 108 } 109 110 /* allocate space for column pointers */ 111 nz = ia[nslim_row] + (!ishift); 112 PetscCall(PetscMalloc1(nz, &ja)); 113 *jja = ja; 114 115 /* loop over lower triangular part putting into ja */ 116 for (i1 = 0; i1 < nslim_row; i1++) { 117 row = ns_row[i1]; 118 j = aj + ai[row] + ishift; 119 jmax = aj + ai[row + 1] + ishift; 120 if (j == jmax) continue; /* empty row */ 121 col = *j++ + ishift; 122 i2 = tvc[col]; 123 while (i2 < i1 && j < jmax) { 124 ja[work[i2]++] = i1 + oshift; 125 ja[work[i1]++] = i2 + oshift; 126 ++i2; 127 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 128 i2 = tvc[col]; 129 } 130 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 131 } 132 PetscCall(PetscFree(work)); 133 PetscCall(PetscFree2(tns, tvc)); 134 PetscFunctionReturn(PETSC_SUCCESS); 135 } 136 137 /* 138 This builds nonsymmetric version of nonzero structure, 139 */ 140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 141 { 142 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 143 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 144 PetscInt *tns, *tvc, nsz, i1, i2; 145 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr; 146 147 PetscFunctionBegin; 148 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 149 nslim_row = a->inode.node_count; 150 n = A->cmap->n; 151 152 /* Create The column_inode for this matrix */ 153 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 154 155 /* allocate space for reformatted column_inode structure */ 156 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 157 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 158 159 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 160 nsz = ns_col[i1 + 1] - ns_col[i1]; 161 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 162 } 163 /* allocate space for row pointers */ 164 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 165 *iia = ia; 166 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 167 168 /* determine the number of columns in each row */ 169 ia[0] = oshift; 170 for (i1 = 0; i1 < nslim_row; i1++) { 171 row = ns_row[i1]; 172 j = aj + ai[row] + ishift; 173 nz = ai[row + 1] - ai[row]; 174 if (!nz) continue; /* empty row */ 175 col = *j++ + ishift; 176 i2 = tvc[col]; 177 while (nz-- > 0) { /* off-diagonal elements */ 178 ia[i1 + 1]++; 179 i2++; /* Start col of next node */ 180 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 181 if (nz > 0) i2 = tvc[col]; 182 } 183 } 184 185 /* shift ia[i] to point to next row */ 186 for (i1 = 1; i1 < nslim_row + 1; i1++) { 187 row = ia[i1 - 1]; 188 ia[i1] += row; 189 work[i1 - 1] = row - oshift; 190 } 191 192 /* allocate space for column pointers */ 193 nz = ia[nslim_row] + (!ishift); 194 PetscCall(PetscMalloc1(nz, &ja)); 195 *jja = ja; 196 197 /* loop over matrix putting into ja */ 198 for (i1 = 0; i1 < nslim_row; i1++) { 199 row = ns_row[i1]; 200 j = aj + ai[row] + ishift; 201 nz = ai[row + 1] - ai[row]; 202 if (!nz) continue; /* empty row */ 203 col = *j++ + ishift; 204 i2 = tvc[col]; 205 while (nz-- > 0) { 206 ja[work[i1]++] = i2 + oshift; 207 ++i2; 208 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 209 if (nz > 0) i2 = tvc[col]; 210 } 211 } 212 PetscCall(PetscFree(ns_col)); 213 PetscCall(PetscFree(work)); 214 PetscCall(PetscFree2(tns, tvc)); 215 PetscFunctionReturn(PETSC_SUCCESS); 216 } 217 218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 219 { 220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 221 222 PetscFunctionBegin; 223 if (n) *n = a->inode.node_count; 224 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 225 if (!blockcompressed) { 226 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 227 } else if (symmetric) { 228 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 229 } else { 230 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 231 } 232 PetscFunctionReturn(PETSC_SUCCESS); 233 } 234 235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 236 { 237 PetscFunctionBegin; 238 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 239 240 if (!blockcompressed) { 241 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 242 } else { 243 PetscCall(PetscFree(*ia)); 244 PetscCall(PetscFree(*ja)); 245 } 246 PetscFunctionReturn(PETSC_SUCCESS); 247 } 248 249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 250 { 251 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 252 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 253 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j; 254 255 PetscFunctionBegin; 256 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 257 nslim_row = a->inode.node_count; 258 n = A->cmap->n; 259 260 /* Create The column_inode for this matrix */ 261 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 262 263 /* allocate space for reformatted column_inode structure */ 264 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 265 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 266 267 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 268 nsz = ns_col[i1 + 1] - ns_col[i1]; 269 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 270 } 271 /* allocate space for column pointers */ 272 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 273 *iia = ia; 274 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 275 276 /* determine the number of columns in each row */ 277 ia[0] = oshift; 278 for (i1 = 0; i1 < nslim_row; i1++) { 279 row = ns_row[i1]; 280 j = aj + ai[row] + ishift; 281 col = *j++ + ishift; 282 i2 = tvc[col]; 283 nz = ai[row + 1] - ai[row]; 284 while (nz-- > 0) { /* off-diagonal elements */ 285 /* ia[i1+1]++; */ 286 ia[i2 + 1]++; 287 i2++; 288 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 289 if (nz > 0) i2 = tvc[col]; 290 } 291 } 292 293 /* shift ia[i] to point to next col */ 294 for (i1 = 1; i1 < nslim_col + 1; i1++) { 295 col = ia[i1 - 1]; 296 ia[i1] += col; 297 work[i1 - 1] = col - oshift; 298 } 299 300 /* allocate space for column pointers */ 301 nz = ia[nslim_col] + (!ishift); 302 PetscCall(PetscMalloc1(nz, &ja)); 303 *jja = ja; 304 305 /* loop over matrix putting into ja */ 306 for (i1 = 0; i1 < nslim_row; i1++) { 307 row = ns_row[i1]; 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row + 1] - ai[row]; 312 while (nz-- > 0) { 313 /* ja[work[i1]++] = i2 + oshift; */ 314 ja[work[i2]++] = i1 + oshift; 315 i2++; 316 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 PetscCall(PetscFree(ns_col)); 321 PetscCall(PetscFree(work)); 322 PetscCall(PetscFree2(tns, tvc)); 323 PetscFunctionReturn(PETSC_SUCCESS); 324 } 325 326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 327 { 328 PetscFunctionBegin; 329 PetscCall(MatCreateColInode_Private(A, n, NULL)); 330 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 331 332 if (!blockcompressed) { 333 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 334 } else if (symmetric) { 335 /* Since the indices are symmetric it doesn't matter */ 336 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 337 } else { 338 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 339 } 340 PetscFunctionReturn(PETSC_SUCCESS); 341 } 342 343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 344 { 345 PetscFunctionBegin; 346 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 347 if (!blockcompressed) { 348 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 349 } else { 350 PetscCall(PetscFree(*ia)); 351 PetscCall(PetscFree(*ja)); 352 } 353 PetscFunctionReturn(PETSC_SUCCESS); 354 } 355 356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 357 { 358 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 359 PetscScalar *y; 360 const PetscScalar *x; 361 PetscInt row, node_max, nonzerorow = 0; 362 PetscInt *ns; 363 364 PetscFunctionBegin; 365 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 366 node_max = a->inode.node_count; 367 ns = a->inode.size_csr; /* Node Size array */ 368 PetscCall(VecGetArrayRead(xx, &x)); 369 PetscCall(VecGetArray(yy, &y)); 370 371 PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow)) 372 for (PetscInt i = 0; i < node_max; ++i) { 373 PetscInt i1, i2, nsz, n, sz; 374 const MatScalar *v1, *v2, *v3, *v4, *v5; 375 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 376 const PetscInt *idx; 377 378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 379 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 380 #endif 381 row = ns[i]; 382 nsz = ns[i + 1] - ns[i]; 383 n = a->i[row + 1] - a->i[row]; 384 nonzerorow += (n > 0) * nsz; 385 386 idx = &a->j[a->i[row]]; 387 v1 = &a->a[a->i[row]]; 388 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 389 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 390 sz = n; /* No of non zeros in this row */ 391 /* Switch on the size of Node */ 392 switch (nsz) { /* Each loop in 'case' is unrolled */ 393 case 1: 394 sum1 = 0.; 395 396 for (n = 0; n < sz - 1; n += 2) { 397 i1 = idx[0]; /* The instructions are ordered to */ 398 i2 = idx[1]; /* make the compiler's job easy */ 399 idx += 2; 400 tmp0 = x[i1]; 401 tmp1 = x[i2]; 402 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 403 v1 += 2; 404 } 405 406 if (n == sz - 1) { /* Take care of the last nonzero */ 407 tmp0 = x[*idx++]; 408 sum1 += *v1++ * tmp0; 409 } 410 y[row++] = sum1; 411 break; 412 case 2: 413 sum1 = 0.; 414 sum2 = 0.; 415 v2 = v1 + n; 416 417 for (n = 0; n < sz - 1; n += 2) { 418 i1 = idx[0]; 419 i2 = idx[1]; 420 idx += 2; 421 tmp0 = x[i1]; 422 tmp1 = x[i2]; 423 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 424 v1 += 2; 425 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 426 v2 += 2; 427 } 428 if (n == sz - 1) { 429 tmp0 = x[*idx++]; 430 sum1 += *v1++ * tmp0; 431 sum2 += *v2++ * tmp0; 432 } 433 y[row++] = sum1; 434 y[row++] = sum2; 435 v1 = v2; /* Since the next block to be processed starts there*/ 436 idx += sz; 437 break; 438 case 3: 439 sum1 = 0.; 440 sum2 = 0.; 441 sum3 = 0.; 442 v2 = v1 + n; 443 v3 = v2 + n; 444 445 for (n = 0; n < sz - 1; n += 2) { 446 i1 = idx[0]; 447 i2 = idx[1]; 448 idx += 2; 449 tmp0 = x[i1]; 450 tmp1 = x[i2]; 451 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 452 v1 += 2; 453 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 454 v2 += 2; 455 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 456 v3 += 2; 457 } 458 if (n == sz - 1) { 459 tmp0 = x[*idx++]; 460 sum1 += *v1++ * tmp0; 461 sum2 += *v2++ * tmp0; 462 sum3 += *v3++ * tmp0; 463 } 464 y[row++] = sum1; 465 y[row++] = sum2; 466 y[row++] = sum3; 467 v1 = v3; /* Since the next block to be processed starts there*/ 468 idx += 2 * sz; 469 break; 470 case 4: 471 sum1 = 0.; 472 sum2 = 0.; 473 sum3 = 0.; 474 sum4 = 0.; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 v4 = v3 + n; 478 479 for (n = 0; n < sz - 1; n += 2) { 480 i1 = idx[0]; 481 i2 = idx[1]; 482 idx += 2; 483 tmp0 = x[i1]; 484 tmp1 = x[i2]; 485 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 486 v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 488 v2 += 2; 489 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 490 v3 += 2; 491 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 492 v4 += 2; 493 } 494 if (n == sz - 1) { 495 tmp0 = x[*idx++]; 496 sum1 += *v1++ * tmp0; 497 sum2 += *v2++ * tmp0; 498 sum3 += *v3++ * tmp0; 499 sum4 += *v4++ * tmp0; 500 } 501 y[row++] = sum1; 502 y[row++] = sum2; 503 y[row++] = sum3; 504 y[row++] = sum4; 505 v1 = v4; /* Since the next block to be processed starts there*/ 506 idx += 3 * sz; 507 break; 508 case 5: 509 sum1 = 0.; 510 sum2 = 0.; 511 sum3 = 0.; 512 sum4 = 0.; 513 sum5 = 0.; 514 v2 = v1 + n; 515 v3 = v2 + n; 516 v4 = v3 + n; 517 v5 = v4 + n; 518 519 for (n = 0; n < sz - 1; n += 2) { 520 i1 = idx[0]; 521 i2 = idx[1]; 522 idx += 2; 523 tmp0 = x[i1]; 524 tmp1 = x[i2]; 525 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 526 v1 += 2; 527 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 528 v2 += 2; 529 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 530 v3 += 2; 531 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 532 v4 += 2; 533 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 534 v5 += 2; 535 } 536 if (n == sz - 1) { 537 tmp0 = x[*idx++]; 538 sum1 += *v1++ * tmp0; 539 sum2 += *v2++ * tmp0; 540 sum3 += *v3++ * tmp0; 541 sum4 += *v4++ * tmp0; 542 sum5 += *v5++ * tmp0; 543 } 544 y[row++] = sum1; 545 y[row++] = sum2; 546 y[row++] = sum3; 547 y[row++] = sum4; 548 y[row++] = sum5; 549 v1 = v5; /* Since the next block to be processed starts there */ 550 idx += 4 * sz; 551 break; 552 default: 553 SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz); 554 } 555 } 556 PetscCall(VecRestoreArrayRead(xx, &x)); 557 PetscCall(VecRestoreArray(yy, &y)); 558 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 559 PetscFunctionReturn(PETSC_SUCCESS); 560 } 561 562 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 563 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 564 { 565 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 566 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 567 const MatScalar *v1, *v2, *v3, *v4, *v5; 568 const PetscScalar *x; 569 PetscScalar *y, *z, *zt; 570 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 571 const PetscInt *idx, *ns, *ii; 572 573 PetscFunctionBegin; 574 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 575 node_max = a->inode.node_count; 576 ns = a->inode.size_csr; /* Node Size array */ 577 578 PetscCall(VecGetArrayRead(xx, &x)); 579 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 580 zt = z; 581 582 idx = a->j; 583 v1 = a->a; 584 ii = a->i; 585 586 for (i = 0; i < node_max; ++i) { 587 row = ns[i]; 588 nsz = ns[i + 1] - ns[i]; 589 n = ii[1] - ii[0]; 590 ii += nsz; 591 sz = n; /* No of non zeros in this row */ 592 /* Switch on the size of Node */ 593 switch (nsz) { /* Each loop in 'case' is unrolled */ 594 case 1: 595 sum1 = *zt++; 596 597 for (n = 0; n < sz - 1; n += 2) { 598 i1 = idx[0]; /* The instructions are ordered to */ 599 i2 = idx[1]; /* make the compiler's job easy */ 600 idx += 2; 601 tmp0 = x[i1]; 602 tmp1 = x[i2]; 603 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 604 v1 += 2; 605 } 606 607 if (n == sz - 1) { /* Take care of the last nonzero */ 608 tmp0 = x[*idx++]; 609 sum1 += *v1++ * tmp0; 610 } 611 y[row++] = sum1; 612 break; 613 case 2: 614 sum1 = *zt++; 615 sum2 = *zt++; 616 v2 = v1 + n; 617 618 for (n = 0; n < sz - 1; n += 2) { 619 i1 = idx[0]; 620 i2 = idx[1]; 621 idx += 2; 622 tmp0 = x[i1]; 623 tmp1 = x[i2]; 624 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 625 v1 += 2; 626 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 627 v2 += 2; 628 } 629 if (n == sz - 1) { 630 tmp0 = x[*idx++]; 631 sum1 += *v1++ * tmp0; 632 sum2 += *v2++ * tmp0; 633 } 634 y[row++] = sum1; 635 y[row++] = sum2; 636 v1 = v2; /* Since the next block to be processed starts there*/ 637 idx += sz; 638 break; 639 case 3: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 sum3 = *zt++; 643 v2 = v1 + n; 644 v3 = v2 + n; 645 646 for (n = 0; n < sz - 1; n += 2) { 647 i1 = idx[0]; 648 i2 = idx[1]; 649 idx += 2; 650 tmp0 = x[i1]; 651 tmp1 = x[i2]; 652 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 653 v1 += 2; 654 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 655 v2 += 2; 656 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 657 v3 += 2; 658 } 659 if (n == sz - 1) { 660 tmp0 = x[*idx++]; 661 sum1 += *v1++ * tmp0; 662 sum2 += *v2++ * tmp0; 663 sum3 += *v3++ * tmp0; 664 } 665 y[row++] = sum1; 666 y[row++] = sum2; 667 y[row++] = sum3; 668 v1 = v3; /* Since the next block to be processed starts there*/ 669 idx += 2 * sz; 670 break; 671 case 4: 672 sum1 = *zt++; 673 sum2 = *zt++; 674 sum3 = *zt++; 675 sum4 = *zt++; 676 v2 = v1 + n; 677 v3 = v2 + n; 678 v4 = v3 + n; 679 680 for (n = 0; n < sz - 1; n += 2) { 681 i1 = idx[0]; 682 i2 = idx[1]; 683 idx += 2; 684 tmp0 = x[i1]; 685 tmp1 = x[i2]; 686 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 687 v1 += 2; 688 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 689 v2 += 2; 690 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 691 v3 += 2; 692 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 693 v4 += 2; 694 } 695 if (n == sz - 1) { 696 tmp0 = x[*idx++]; 697 sum1 += *v1++ * tmp0; 698 sum2 += *v2++ * tmp0; 699 sum3 += *v3++ * tmp0; 700 sum4 += *v4++ * tmp0; 701 } 702 y[row++] = sum1; 703 y[row++] = sum2; 704 y[row++] = sum3; 705 y[row++] = sum4; 706 v1 = v4; /* Since the next block to be processed starts there*/ 707 idx += 3 * sz; 708 break; 709 case 5: 710 sum1 = *zt++; 711 sum2 = *zt++; 712 sum3 = *zt++; 713 sum4 = *zt++; 714 sum5 = *zt++; 715 v2 = v1 + n; 716 v3 = v2 + n; 717 v4 = v3 + n; 718 v5 = v4 + n; 719 720 for (n = 0; n < sz - 1; n += 2) { 721 i1 = idx[0]; 722 i2 = idx[1]; 723 idx += 2; 724 tmp0 = x[i1]; 725 tmp1 = x[i2]; 726 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 727 v1 += 2; 728 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 729 v2 += 2; 730 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 731 v3 += 2; 732 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 733 v4 += 2; 734 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 735 v5 += 2; 736 } 737 if (n == sz - 1) { 738 tmp0 = x[*idx++]; 739 sum1 += *v1++ * tmp0; 740 sum2 += *v2++ * tmp0; 741 sum3 += *v3++ * tmp0; 742 sum4 += *v4++ * tmp0; 743 sum5 += *v5++ * tmp0; 744 } 745 y[row++] = sum1; 746 y[row++] = sum2; 747 y[row++] = sum3; 748 y[row++] = sum4; 749 y[row++] = sum5; 750 v1 = v5; /* Since the next block to be processed starts there */ 751 idx += 4 * sz; 752 break; 753 default: 754 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 755 } 756 } 757 PetscCall(VecRestoreArrayRead(xx, &x)); 758 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 759 PetscCall(PetscLogFlops(2.0 * a->nz)); 760 PetscFunctionReturn(PETSC_SUCCESS); 761 } 762 763 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 764 { 765 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 766 IS iscol = a->col, isrow = a->row; 767 const PetscInt *r, *c, *rout, *cout; 768 PetscInt i, j, n = A->rmap->n, nz; 769 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 770 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 771 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 772 PetscScalar sum1, sum2, sum3, sum4, sum5; 773 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 774 const PetscScalar *b; 775 776 PetscFunctionBegin; 777 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 778 node_max = a->inode.node_count; 779 ns = a->inode.size_csr; /* Node Size array */ 780 781 PetscCall(VecGetArrayRead(bb, &b)); 782 PetscCall(VecGetArrayWrite(xx, &x)); 783 tmp = a->solve_work; 784 785 PetscCall(ISGetIndices(isrow, &rout)); 786 r = rout; 787 PetscCall(ISGetIndices(iscol, &cout)); 788 c = cout + (n - 1); 789 790 /* forward solve the lower triangular */ 791 tmps = tmp; 792 aa = a_a; 793 aj = a_j; 794 ad = a->diag; 795 796 for (i = 0, row = 0; i < node_max; ++i) { 797 row = ns[i]; 798 nsz = ns[i + 1] - ns[i]; 799 aii = ai[row]; 800 v1 = aa + aii; 801 vi = aj + aii; 802 nz = ad[row] - aii; 803 if (i < node_max - 1) { 804 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 805 * but our indexing to determine its size could. */ 806 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 807 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 808 PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 809 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 810 } 811 812 switch (nsz) { /* Each loop in 'case' is unrolled */ 813 case 1: 814 sum1 = b[*r++]; 815 for (j = 0; j < nz - 1; j += 2) { 816 i0 = vi[0]; 817 i1 = vi[1]; 818 vi += 2; 819 tmp0 = tmps[i0]; 820 tmp1 = tmps[i1]; 821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 822 v1 += 2; 823 } 824 if (j == nz - 1) { 825 tmp0 = tmps[*vi++]; 826 sum1 -= *v1++ * tmp0; 827 } 828 tmp[row++] = sum1; 829 break; 830 case 2: 831 sum1 = b[*r++]; 832 sum2 = b[*r++]; 833 v2 = aa + ai[row + 1]; 834 835 for (j = 0; j < nz - 1; j += 2) { 836 i0 = vi[0]; 837 i1 = vi[1]; 838 vi += 2; 839 tmp0 = tmps[i0]; 840 tmp1 = tmps[i1]; 841 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 842 v1 += 2; 843 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 844 v2 += 2; 845 } 846 if (j == nz - 1) { 847 tmp0 = tmps[*vi++]; 848 sum1 -= *v1++ * tmp0; 849 sum2 -= *v2++ * tmp0; 850 } 851 sum2 -= *v2++ * sum1; 852 tmp[row++] = sum1; 853 tmp[row++] = sum2; 854 break; 855 case 3: 856 sum1 = b[*r++]; 857 sum2 = b[*r++]; 858 sum3 = b[*r++]; 859 v2 = aa + ai[row + 1]; 860 v3 = aa + ai[row + 2]; 861 862 for (j = 0; j < nz - 1; j += 2) { 863 i0 = vi[0]; 864 i1 = vi[1]; 865 vi += 2; 866 tmp0 = tmps[i0]; 867 tmp1 = tmps[i1]; 868 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 869 v1 += 2; 870 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 871 v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 873 v3 += 2; 874 } 875 if (j == nz - 1) { 876 tmp0 = tmps[*vi++]; 877 sum1 -= *v1++ * tmp0; 878 sum2 -= *v2++ * tmp0; 879 sum3 -= *v3++ * tmp0; 880 } 881 sum2 -= *v2++ * sum1; 882 sum3 -= *v3++ * sum1; 883 sum3 -= *v3++ * sum2; 884 885 tmp[row++] = sum1; 886 tmp[row++] = sum2; 887 tmp[row++] = sum3; 888 break; 889 890 case 4: 891 sum1 = b[*r++]; 892 sum2 = b[*r++]; 893 sum3 = b[*r++]; 894 sum4 = b[*r++]; 895 v2 = aa + ai[row + 1]; 896 v3 = aa + ai[row + 2]; 897 v4 = aa + ai[row + 3]; 898 899 for (j = 0; j < nz - 1; j += 2) { 900 i0 = vi[0]; 901 i1 = vi[1]; 902 vi += 2; 903 tmp0 = tmps[i0]; 904 tmp1 = tmps[i1]; 905 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 906 v1 += 2; 907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 908 v2 += 2; 909 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 910 v3 += 2; 911 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 912 v4 += 2; 913 } 914 if (j == nz - 1) { 915 tmp0 = tmps[*vi++]; 916 sum1 -= *v1++ * tmp0; 917 sum2 -= *v2++ * tmp0; 918 sum3 -= *v3++ * tmp0; 919 sum4 -= *v4++ * tmp0; 920 } 921 sum2 -= *v2++ * sum1; 922 sum3 -= *v3++ * sum1; 923 sum4 -= *v4++ * sum1; 924 sum3 -= *v3++ * sum2; 925 sum4 -= *v4++ * sum2; 926 sum4 -= *v4++ * sum3; 927 928 tmp[row++] = sum1; 929 tmp[row++] = sum2; 930 tmp[row++] = sum3; 931 tmp[row++] = sum4; 932 break; 933 case 5: 934 sum1 = b[*r++]; 935 sum2 = b[*r++]; 936 sum3 = b[*r++]; 937 sum4 = b[*r++]; 938 sum5 = b[*r++]; 939 v2 = aa + ai[row + 1]; 940 v3 = aa + ai[row + 2]; 941 v4 = aa + ai[row + 3]; 942 v5 = aa + ai[row + 4]; 943 944 for (j = 0; j < nz - 1; j += 2) { 945 i0 = vi[0]; 946 i1 = vi[1]; 947 vi += 2; 948 tmp0 = tmps[i0]; 949 tmp1 = tmps[i1]; 950 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 951 v1 += 2; 952 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 953 v2 += 2; 954 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 955 v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 957 v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 959 v5 += 2; 960 } 961 if (j == nz - 1) { 962 tmp0 = tmps[*vi++]; 963 sum1 -= *v1++ * tmp0; 964 sum2 -= *v2++ * tmp0; 965 sum3 -= *v3++ * tmp0; 966 sum4 -= *v4++ * tmp0; 967 sum5 -= *v5++ * tmp0; 968 } 969 970 sum2 -= *v2++ * sum1; 971 sum3 -= *v3++ * sum1; 972 sum4 -= *v4++ * sum1; 973 sum5 -= *v5++ * sum1; 974 sum3 -= *v3++ * sum2; 975 sum4 -= *v4++ * sum2; 976 sum5 -= *v5++ * sum2; 977 sum4 -= *v4++ * sum3; 978 sum5 -= *v5++ * sum3; 979 sum5 -= *v5++ * sum4; 980 981 tmp[row++] = sum1; 982 tmp[row++] = sum2; 983 tmp[row++] = sum3; 984 tmp[row++] = sum4; 985 tmp[row++] = sum5; 986 break; 987 default: 988 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 989 } 990 } 991 /* backward solve the upper triangular */ 992 for (i = node_max - 1; i >= 0; i--) { 993 row = ns[i + 1]; 994 nsz = ns[i + 1] - ns[i]; 995 aii = ai[row + 1] - 1; 996 v1 = aa + aii; 997 vi = aj + aii; 998 nz = aii - ad[row]; 999 switch (nsz) { /* Each loop in 'case' is unrolled */ 1000 case 1: 1001 sum1 = tmp[row]; 1002 1003 for (j = nz; j > 1; j -= 2) { 1004 vi -= 2; 1005 i0 = vi[2]; 1006 i1 = vi[1]; 1007 tmp0 = tmps[i0]; 1008 tmp1 = tmps[i1]; 1009 v1 -= 2; 1010 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1011 } 1012 if (j == 1) { 1013 tmp0 = tmps[*vi--]; 1014 sum1 -= *v1-- * tmp0; 1015 } 1016 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1017 row--; 1018 break; 1019 case 2: 1020 sum1 = tmp[row]; 1021 sum2 = tmp[row - 1]; 1022 v2 = aa + ai[row] - 1; 1023 for (j = nz; j > 1; j -= 2) { 1024 vi -= 2; 1025 i0 = vi[2]; 1026 i1 = vi[1]; 1027 tmp0 = tmps[i0]; 1028 tmp1 = tmps[i1]; 1029 v1 -= 2; 1030 v2 -= 2; 1031 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1032 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1033 } 1034 if (j == 1) { 1035 tmp0 = tmps[*vi--]; 1036 sum1 -= *v1-- * tmp0; 1037 sum2 -= *v2-- * tmp0; 1038 } 1039 1040 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1041 row--; 1042 sum2 -= *v2-- * tmp0; 1043 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1044 row--; 1045 break; 1046 case 3: 1047 sum1 = tmp[row]; 1048 sum2 = tmp[row - 1]; 1049 sum3 = tmp[row - 2]; 1050 v2 = aa + ai[row] - 1; 1051 v3 = aa + ai[row - 1] - 1; 1052 for (j = nz; j > 1; j -= 2) { 1053 vi -= 2; 1054 i0 = vi[2]; 1055 i1 = vi[1]; 1056 tmp0 = tmps[i0]; 1057 tmp1 = tmps[i1]; 1058 v1 -= 2; 1059 v2 -= 2; 1060 v3 -= 2; 1061 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1062 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1063 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1064 } 1065 if (j == 1) { 1066 tmp0 = tmps[*vi--]; 1067 sum1 -= *v1-- * tmp0; 1068 sum2 -= *v2-- * tmp0; 1069 sum3 -= *v3-- * tmp0; 1070 } 1071 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1072 row--; 1073 sum2 -= *v2-- * tmp0; 1074 sum3 -= *v3-- * tmp0; 1075 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1076 row--; 1077 sum3 -= *v3-- * tmp0; 1078 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1079 row--; 1080 1081 break; 1082 case 4: 1083 sum1 = tmp[row]; 1084 sum2 = tmp[row - 1]; 1085 sum3 = tmp[row - 2]; 1086 sum4 = tmp[row - 3]; 1087 v2 = aa + ai[row] - 1; 1088 v3 = aa + ai[row - 1] - 1; 1089 v4 = aa + ai[row - 2] - 1; 1090 1091 for (j = nz; j > 1; j -= 2) { 1092 vi -= 2; 1093 i0 = vi[2]; 1094 i1 = vi[1]; 1095 tmp0 = tmps[i0]; 1096 tmp1 = tmps[i1]; 1097 v1 -= 2; 1098 v2 -= 2; 1099 v3 -= 2; 1100 v4 -= 2; 1101 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1102 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1103 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1104 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1105 } 1106 if (j == 1) { 1107 tmp0 = tmps[*vi--]; 1108 sum1 -= *v1-- * tmp0; 1109 sum2 -= *v2-- * tmp0; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 } 1113 1114 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1115 row--; 1116 sum2 -= *v2-- * tmp0; 1117 sum3 -= *v3-- * tmp0; 1118 sum4 -= *v4-- * tmp0; 1119 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1120 row--; 1121 sum3 -= *v3-- * tmp0; 1122 sum4 -= *v4-- * tmp0; 1123 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1124 row--; 1125 sum4 -= *v4-- * tmp0; 1126 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1127 row--; 1128 break; 1129 case 5: 1130 sum1 = tmp[row]; 1131 sum2 = tmp[row - 1]; 1132 sum3 = tmp[row - 2]; 1133 sum4 = tmp[row - 3]; 1134 sum5 = tmp[row - 4]; 1135 v2 = aa + ai[row] - 1; 1136 v3 = aa + ai[row - 1] - 1; 1137 v4 = aa + ai[row - 2] - 1; 1138 v5 = aa + ai[row - 3] - 1; 1139 for (j = nz; j > 1; j -= 2) { 1140 vi -= 2; 1141 i0 = vi[2]; 1142 i1 = vi[1]; 1143 tmp0 = tmps[i0]; 1144 tmp1 = tmps[i1]; 1145 v1 -= 2; 1146 v2 -= 2; 1147 v3 -= 2; 1148 v4 -= 2; 1149 v5 -= 2; 1150 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1151 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1152 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1153 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1154 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1155 } 1156 if (j == 1) { 1157 tmp0 = tmps[*vi--]; 1158 sum1 -= *v1-- * tmp0; 1159 sum2 -= *v2-- * tmp0; 1160 sum3 -= *v3-- * tmp0; 1161 sum4 -= *v4-- * tmp0; 1162 sum5 -= *v5-- * tmp0; 1163 } 1164 1165 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1166 row--; 1167 sum2 -= *v2-- * tmp0; 1168 sum3 -= *v3-- * tmp0; 1169 sum4 -= *v4-- * tmp0; 1170 sum5 -= *v5-- * tmp0; 1171 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1172 row--; 1173 sum3 -= *v3-- * tmp0; 1174 sum4 -= *v4-- * tmp0; 1175 sum5 -= *v5-- * tmp0; 1176 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1177 row--; 1178 sum4 -= *v4-- * tmp0; 1179 sum5 -= *v5-- * tmp0; 1180 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1181 row--; 1182 sum5 -= *v5-- * tmp0; 1183 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1184 row--; 1185 break; 1186 default: 1187 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1188 } 1189 } 1190 PetscCall(ISRestoreIndices(isrow, &rout)); 1191 PetscCall(ISRestoreIndices(iscol, &cout)); 1192 PetscCall(VecRestoreArrayRead(bb, &b)); 1193 PetscCall(VecRestoreArrayWrite(xx, &x)); 1194 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1195 PetscFunctionReturn(PETSC_SUCCESS); 1196 } 1197 1198 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1199 { 1200 Mat C = B; 1201 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1202 IS isrow = b->row, isicol = b->icol; 1203 const PetscInt *r, *ic, *ics; 1204 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1205 PetscInt i, j, k, nz, nzL, row, *pj; 1206 const PetscInt *ajtmp, *bjtmp; 1207 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1208 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1209 FactorShiftCtx sctx; 1210 const PetscInt *ddiag; 1211 PetscReal rs; 1212 MatScalar d; 1213 PetscInt inod, nodesz, node_max, col; 1214 const PetscInt *ns; 1215 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1216 1217 PetscFunctionBegin; 1218 /* MatPivotSetUp(): initialize shift context sctx */ 1219 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1220 1221 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1222 ddiag = a->diag; 1223 sctx.shift_top = info->zeropivot; 1224 for (i = 0; i < n; i++) { 1225 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1226 d = (aa)[ddiag[i]]; 1227 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1228 v = aa + ai[i]; 1229 nz = ai[i + 1] - ai[i]; 1230 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1231 if (rs > sctx.shift_top) sctx.shift_top = rs; 1232 } 1233 sctx.shift_top *= 1.1; 1234 sctx.nshift_max = 5; 1235 sctx.shift_lo = 0.; 1236 sctx.shift_hi = 1.; 1237 } 1238 1239 PetscCall(ISGetIndices(isrow, &r)); 1240 PetscCall(ISGetIndices(isicol, &ic)); 1241 1242 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1243 ics = ic; 1244 1245 node_max = a->inode.node_count; 1246 ns = a->inode.size_csr; 1247 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1248 1249 /* If max inode size > 4, split it into two inodes.*/ 1250 /* also map the inode sizes according to the ordering */ 1251 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1252 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1253 nodesz = ns[i + 1] - ns[i]; 1254 if (nodesz > 4) { 1255 tmp_vec1[j] = 4; 1256 ++j; 1257 tmp_vec1[j] = nodesz - tmp_vec1[j - 1]; 1258 } else { 1259 tmp_vec1[j] = nodesz; 1260 } 1261 } 1262 /* Use the correct node_max */ 1263 node_max = j; 1264 1265 /* Now reorder the inode info based on mat re-ordering info */ 1266 /* First create a row -> inode_size_array_index map */ 1267 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1268 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1269 tmp_vec2[0] = 0; 1270 for (i = 0, row = 0; i < node_max; i++) { 1271 nodesz = tmp_vec1[i]; 1272 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1273 } 1274 /* Using nsmap, create a reordered ns structure */ 1275 for (i = 0, j = 0; i < node_max; i++) { 1276 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1277 tmp_vec2[i + 1] = tmp_vec2[i] + nodesz; 1278 j += nodesz; 1279 } 1280 PetscCall(PetscFree(nsmap)); 1281 PetscCall(PetscFree(tmp_vec1)); 1282 1283 /* Now use the correct ns */ 1284 ns = tmp_vec2; 1285 1286 do { 1287 sctx.newshift = PETSC_FALSE; 1288 /* Now loop over each block-row, and do the factorization */ 1289 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1290 nodesz = ns[inod + 1] - ns[inod]; 1291 1292 switch (nodesz) { 1293 case 1: 1294 /* zero rtmp1 */ 1295 /* L part */ 1296 nz = bi[i + 1] - bi[i]; 1297 bjtmp = bj + bi[i]; 1298 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1299 1300 /* U part */ 1301 nz = bdiag[i] - bdiag[i + 1]; 1302 bjtmp = bj + bdiag[i + 1] + 1; 1303 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1304 1305 /* load in initial (unfactored row) */ 1306 nz = ai[r[i] + 1] - ai[r[i]]; 1307 ajtmp = aj + ai[r[i]]; 1308 v = aa + ai[r[i]]; 1309 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1310 1311 /* ZeropivotApply() */ 1312 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1313 1314 /* elimination */ 1315 bjtmp = bj + bi[i]; 1316 row = *bjtmp++; 1317 nzL = bi[i + 1] - bi[i]; 1318 for (k = 0; k < nzL; k++) { 1319 pc = rtmp1 + row; 1320 if (*pc != 0.0) { 1321 pv = b->a + bdiag[row]; 1322 mul1 = *pc * (*pv); 1323 *pc = mul1; 1324 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1325 pv = b->a + bdiag[row + 1] + 1; 1326 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1327 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1328 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1329 } 1330 row = *bjtmp++; 1331 } 1332 1333 /* finished row so stick it into b->a */ 1334 rs = 0.0; 1335 /* L part */ 1336 pv = b->a + bi[i]; 1337 pj = b->j + bi[i]; 1338 nz = bi[i + 1] - bi[i]; 1339 for (j = 0; j < nz; j++) { 1340 pv[j] = rtmp1[pj[j]]; 1341 rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* U part */ 1345 pv = b->a + bdiag[i + 1] + 1; 1346 pj = b->j + bdiag[i + 1] + 1; 1347 nz = bdiag[i] - bdiag[i + 1] - 1; 1348 for (j = 0; j < nz; j++) { 1349 pv[j] = rtmp1[pj[j]]; 1350 rs += PetscAbsScalar(pv[j]); 1351 } 1352 1353 /* Check zero pivot */ 1354 sctx.rs = rs; 1355 sctx.pv = rtmp1[i]; 1356 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1357 if (sctx.newshift) break; 1358 1359 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1360 pv = b->a + bdiag[i]; 1361 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1362 break; 1363 1364 case 2: 1365 /* zero rtmp1 and rtmp2 */ 1366 /* L part */ 1367 nz = bi[i + 1] - bi[i]; 1368 bjtmp = bj + bi[i]; 1369 for (j = 0; j < nz; j++) { 1370 col = bjtmp[j]; 1371 rtmp1[col] = 0.0; 1372 rtmp2[col] = 0.0; 1373 } 1374 1375 /* U part */ 1376 nz = bdiag[i] - bdiag[i + 1]; 1377 bjtmp = bj + bdiag[i + 1] + 1; 1378 for (j = 0; j < nz; j++) { 1379 col = bjtmp[j]; 1380 rtmp1[col] = 0.0; 1381 rtmp2[col] = 0.0; 1382 } 1383 1384 /* load in initial (unfactored row) */ 1385 nz = ai[r[i] + 1] - ai[r[i]]; 1386 ajtmp = aj + ai[r[i]]; 1387 v1 = aa + ai[r[i]]; 1388 v2 = aa + ai[r[i + 1]]; 1389 for (j = 0; j < nz; j++) { 1390 col = ics[ajtmp[j]]; 1391 rtmp1[col] = v1[j]; 1392 rtmp2[col] = v2[j]; 1393 } 1394 /* ZeropivotApply(): shift the diagonal of the matrix */ 1395 rtmp1[i] += sctx.shift_amount; 1396 rtmp2[i + 1] += sctx.shift_amount; 1397 1398 /* elimination */ 1399 bjtmp = bj + bi[i]; 1400 row = *bjtmp++; /* pivot row */ 1401 nzL = bi[i + 1] - bi[i]; 1402 for (k = 0; k < nzL; k++) { 1403 pc1 = rtmp1 + row; 1404 pc2 = rtmp2 + row; 1405 if (*pc1 != 0.0 || *pc2 != 0.0) { 1406 pv = b->a + bdiag[row]; 1407 mul1 = *pc1 * (*pv); 1408 mul2 = *pc2 * (*pv); 1409 *pc1 = mul1; 1410 *pc2 = mul2; 1411 1412 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1413 pv = b->a + bdiag[row + 1] + 1; 1414 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1415 for (j = 0; j < nz; j++) { 1416 col = pj[j]; 1417 rtmp1[col] -= mul1 * pv[j]; 1418 rtmp2[col] -= mul2 * pv[j]; 1419 } 1420 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1421 } 1422 row = *bjtmp++; 1423 } 1424 1425 /* finished row i; check zero pivot, then stick row i into b->a */ 1426 rs = 0.0; 1427 /* L part */ 1428 pc1 = b->a + bi[i]; 1429 pj = b->j + bi[i]; 1430 nz = bi[i + 1] - bi[i]; 1431 for (j = 0; j < nz; j++) { 1432 col = pj[j]; 1433 pc1[j] = rtmp1[col]; 1434 rs += PetscAbsScalar(pc1[j]); 1435 } 1436 /* U part */ 1437 pc1 = b->a + bdiag[i + 1] + 1; 1438 pj = b->j + bdiag[i + 1] + 1; 1439 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1440 for (j = 0; j < nz; j++) { 1441 col = pj[j]; 1442 pc1[j] = rtmp1[col]; 1443 rs += PetscAbsScalar(pc1[j]); 1444 } 1445 1446 sctx.rs = rs; 1447 sctx.pv = rtmp1[i]; 1448 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1449 if (sctx.newshift) break; 1450 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1451 *pc1 = 1.0 / sctx.pv; 1452 1453 /* Now take care of diagonal 2x2 block. */ 1454 pc2 = rtmp2 + i; 1455 if (*pc2 != 0.0) { 1456 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1457 *pc2 = mul1; /* insert L entry */ 1458 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1459 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1460 for (j = 0; j < nz; j++) { 1461 col = pj[j]; 1462 rtmp2[col] -= mul1 * rtmp1[col]; 1463 } 1464 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1465 } 1466 1467 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1468 rs = 0.0; 1469 /* L part */ 1470 pc2 = b->a + bi[i + 1]; 1471 pj = b->j + bi[i + 1]; 1472 nz = bi[i + 2] - bi[i + 1]; 1473 for (j = 0; j < nz; j++) { 1474 col = pj[j]; 1475 pc2[j] = rtmp2[col]; 1476 rs += PetscAbsScalar(pc2[j]); 1477 } 1478 /* U part */ 1479 pc2 = b->a + bdiag[i + 2] + 1; 1480 pj = b->j + bdiag[i + 2] + 1; 1481 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1482 for (j = 0; j < nz; j++) { 1483 col = pj[j]; 1484 pc2[j] = rtmp2[col]; 1485 rs += PetscAbsScalar(pc2[j]); 1486 } 1487 1488 sctx.rs = rs; 1489 sctx.pv = rtmp2[i + 1]; 1490 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1491 if (sctx.newshift) break; 1492 pc2 = b->a + bdiag[i + 1]; 1493 *pc2 = 1.0 / sctx.pv; 1494 break; 1495 1496 case 3: 1497 /* zero rtmp */ 1498 /* L part */ 1499 nz = bi[i + 1] - bi[i]; 1500 bjtmp = bj + bi[i]; 1501 for (j = 0; j < nz; j++) { 1502 col = bjtmp[j]; 1503 rtmp1[col] = 0.0; 1504 rtmp2[col] = 0.0; 1505 rtmp3[col] = 0.0; 1506 } 1507 1508 /* U part */ 1509 nz = bdiag[i] - bdiag[i + 1]; 1510 bjtmp = bj + bdiag[i + 1] + 1; 1511 for (j = 0; j < nz; j++) { 1512 col = bjtmp[j]; 1513 rtmp1[col] = 0.0; 1514 rtmp2[col] = 0.0; 1515 rtmp3[col] = 0.0; 1516 } 1517 1518 /* load in initial (unfactored row) */ 1519 nz = ai[r[i] + 1] - ai[r[i]]; 1520 ajtmp = aj + ai[r[i]]; 1521 v1 = aa + ai[r[i]]; 1522 v2 = aa + ai[r[i + 1]]; 1523 v3 = aa + ai[r[i + 2]]; 1524 for (j = 0; j < nz; j++) { 1525 col = ics[ajtmp[j]]; 1526 rtmp1[col] = v1[j]; 1527 rtmp2[col] = v2[j]; 1528 rtmp3[col] = v3[j]; 1529 } 1530 /* ZeropivotApply(): shift the diagonal of the matrix */ 1531 rtmp1[i] += sctx.shift_amount; 1532 rtmp2[i + 1] += sctx.shift_amount; 1533 rtmp3[i + 2] += sctx.shift_amount; 1534 1535 /* elimination */ 1536 bjtmp = bj + bi[i]; 1537 row = *bjtmp++; /* pivot row */ 1538 nzL = bi[i + 1] - bi[i]; 1539 for (k = 0; k < nzL; k++) { 1540 pc1 = rtmp1 + row; 1541 pc2 = rtmp2 + row; 1542 pc3 = rtmp3 + row; 1543 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1544 pv = b->a + bdiag[row]; 1545 mul1 = *pc1 * (*pv); 1546 mul2 = *pc2 * (*pv); 1547 mul3 = *pc3 * (*pv); 1548 *pc1 = mul1; 1549 *pc2 = mul2; 1550 *pc3 = mul3; 1551 1552 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1553 pv = b->a + bdiag[row + 1] + 1; 1554 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1555 for (j = 0; j < nz; j++) { 1556 col = pj[j]; 1557 rtmp1[col] -= mul1 * pv[j]; 1558 rtmp2[col] -= mul2 * pv[j]; 1559 rtmp3[col] -= mul3 * pv[j]; 1560 } 1561 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1562 } 1563 row = *bjtmp++; 1564 } 1565 1566 /* finished row i; check zero pivot, then stick row i into b->a */ 1567 rs = 0.0; 1568 /* L part */ 1569 pc1 = b->a + bi[i]; 1570 pj = b->j + bi[i]; 1571 nz = bi[i + 1] - bi[i]; 1572 for (j = 0; j < nz; j++) { 1573 col = pj[j]; 1574 pc1[j] = rtmp1[col]; 1575 rs += PetscAbsScalar(pc1[j]); 1576 } 1577 /* U part */ 1578 pc1 = b->a + bdiag[i + 1] + 1; 1579 pj = b->j + bdiag[i + 1] + 1; 1580 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1581 for (j = 0; j < nz; j++) { 1582 col = pj[j]; 1583 pc1[j] = rtmp1[col]; 1584 rs += PetscAbsScalar(pc1[j]); 1585 } 1586 1587 sctx.rs = rs; 1588 sctx.pv = rtmp1[i]; 1589 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1590 if (sctx.newshift) break; 1591 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1592 *pc1 = 1.0 / sctx.pv; 1593 1594 /* Now take care of 1st column of diagonal 3x3 block. */ 1595 pc2 = rtmp2 + i; 1596 pc3 = rtmp3 + i; 1597 if (*pc2 != 0.0 || *pc3 != 0.0) { 1598 mul2 = (*pc2) * (*pc1); 1599 *pc2 = mul2; 1600 mul3 = (*pc3) * (*pc1); 1601 *pc3 = mul3; 1602 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1603 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1604 for (j = 0; j < nz; j++) { 1605 col = pj[j]; 1606 rtmp2[col] -= mul2 * rtmp1[col]; 1607 rtmp3[col] -= mul3 * rtmp1[col]; 1608 } 1609 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1610 } 1611 1612 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1613 rs = 0.0; 1614 /* L part */ 1615 pc2 = b->a + bi[i + 1]; 1616 pj = b->j + bi[i + 1]; 1617 nz = bi[i + 2] - bi[i + 1]; 1618 for (j = 0; j < nz; j++) { 1619 col = pj[j]; 1620 pc2[j] = rtmp2[col]; 1621 rs += PetscAbsScalar(pc2[j]); 1622 } 1623 /* U part */ 1624 pc2 = b->a + bdiag[i + 2] + 1; 1625 pj = b->j + bdiag[i + 2] + 1; 1626 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1627 for (j = 0; j < nz; j++) { 1628 col = pj[j]; 1629 pc2[j] = rtmp2[col]; 1630 rs += PetscAbsScalar(pc2[j]); 1631 } 1632 1633 sctx.rs = rs; 1634 sctx.pv = rtmp2[i + 1]; 1635 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1636 if (sctx.newshift) break; 1637 pc2 = b->a + bdiag[i + 1]; 1638 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1639 1640 /* Now take care of 2nd column of diagonal 3x3 block. */ 1641 pc3 = rtmp3 + i + 1; 1642 if (*pc3 != 0.0) { 1643 mul3 = (*pc3) * (*pc2); 1644 *pc3 = mul3; 1645 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1646 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1647 for (j = 0; j < nz; j++) { 1648 col = pj[j]; 1649 rtmp3[col] -= mul3 * rtmp2[col]; 1650 } 1651 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1652 } 1653 1654 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1655 rs = 0.0; 1656 /* L part */ 1657 pc3 = b->a + bi[i + 2]; 1658 pj = b->j + bi[i + 2]; 1659 nz = bi[i + 3] - bi[i + 2]; 1660 for (j = 0; j < nz; j++) { 1661 col = pj[j]; 1662 pc3[j] = rtmp3[col]; 1663 rs += PetscAbsScalar(pc3[j]); 1664 } 1665 /* U part */ 1666 pc3 = b->a + bdiag[i + 3] + 1; 1667 pj = b->j + bdiag[i + 3] + 1; 1668 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1669 for (j = 0; j < nz; j++) { 1670 col = pj[j]; 1671 pc3[j] = rtmp3[col]; 1672 rs += PetscAbsScalar(pc3[j]); 1673 } 1674 1675 sctx.rs = rs; 1676 sctx.pv = rtmp3[i + 2]; 1677 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1678 if (sctx.newshift) break; 1679 pc3 = b->a + bdiag[i + 2]; 1680 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1681 break; 1682 case 4: 1683 /* zero rtmp */ 1684 /* L part */ 1685 nz = bi[i + 1] - bi[i]; 1686 bjtmp = bj + bi[i]; 1687 for (j = 0; j < nz; j++) { 1688 col = bjtmp[j]; 1689 rtmp1[col] = 0.0; 1690 rtmp2[col] = 0.0; 1691 rtmp3[col] = 0.0; 1692 rtmp4[col] = 0.0; 1693 } 1694 1695 /* U part */ 1696 nz = bdiag[i] - bdiag[i + 1]; 1697 bjtmp = bj + bdiag[i + 1] + 1; 1698 for (j = 0; j < nz; j++) { 1699 col = bjtmp[j]; 1700 rtmp1[col] = 0.0; 1701 rtmp2[col] = 0.0; 1702 rtmp3[col] = 0.0; 1703 rtmp4[col] = 0.0; 1704 } 1705 1706 /* load in initial (unfactored row) */ 1707 nz = ai[r[i] + 1] - ai[r[i]]; 1708 ajtmp = aj + ai[r[i]]; 1709 v1 = aa + ai[r[i]]; 1710 v2 = aa + ai[r[i + 1]]; 1711 v3 = aa + ai[r[i + 2]]; 1712 v4 = aa + ai[r[i + 3]]; 1713 for (j = 0; j < nz; j++) { 1714 col = ics[ajtmp[j]]; 1715 rtmp1[col] = v1[j]; 1716 rtmp2[col] = v2[j]; 1717 rtmp3[col] = v3[j]; 1718 rtmp4[col] = v4[j]; 1719 } 1720 /* ZeropivotApply(): shift the diagonal of the matrix */ 1721 rtmp1[i] += sctx.shift_amount; 1722 rtmp2[i + 1] += sctx.shift_amount; 1723 rtmp3[i + 2] += sctx.shift_amount; 1724 rtmp4[i + 3] += sctx.shift_amount; 1725 1726 /* elimination */ 1727 bjtmp = bj + bi[i]; 1728 row = *bjtmp++; /* pivot row */ 1729 nzL = bi[i + 1] - bi[i]; 1730 for (k = 0; k < nzL; k++) { 1731 pc1 = rtmp1 + row; 1732 pc2 = rtmp2 + row; 1733 pc3 = rtmp3 + row; 1734 pc4 = rtmp4 + row; 1735 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1736 pv = b->a + bdiag[row]; 1737 mul1 = *pc1 * (*pv); 1738 mul2 = *pc2 * (*pv); 1739 mul3 = *pc3 * (*pv); 1740 mul4 = *pc4 * (*pv); 1741 *pc1 = mul1; 1742 *pc2 = mul2; 1743 *pc3 = mul3; 1744 *pc4 = mul4; 1745 1746 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1747 pv = b->a + bdiag[row + 1] + 1; 1748 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1749 for (j = 0; j < nz; j++) { 1750 col = pj[j]; 1751 rtmp1[col] -= mul1 * pv[j]; 1752 rtmp2[col] -= mul2 * pv[j]; 1753 rtmp3[col] -= mul3 * pv[j]; 1754 rtmp4[col] -= mul4 * pv[j]; 1755 } 1756 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1757 } 1758 row = *bjtmp++; 1759 } 1760 1761 /* finished row i; check zero pivot, then stick row i into b->a */ 1762 rs = 0.0; 1763 /* L part */ 1764 pc1 = b->a + bi[i]; 1765 pj = b->j + bi[i]; 1766 nz = bi[i + 1] - bi[i]; 1767 for (j = 0; j < nz; j++) { 1768 col = pj[j]; 1769 pc1[j] = rtmp1[col]; 1770 rs += PetscAbsScalar(pc1[j]); 1771 } 1772 /* U part */ 1773 pc1 = b->a + bdiag[i + 1] + 1; 1774 pj = b->j + bdiag[i + 1] + 1; 1775 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1776 for (j = 0; j < nz; j++) { 1777 col = pj[j]; 1778 pc1[j] = rtmp1[col]; 1779 rs += PetscAbsScalar(pc1[j]); 1780 } 1781 1782 sctx.rs = rs; 1783 sctx.pv = rtmp1[i]; 1784 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1785 if (sctx.newshift) break; 1786 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1787 *pc1 = 1.0 / sctx.pv; 1788 1789 /* Now take care of 1st column of diagonal 4x4 block. */ 1790 pc2 = rtmp2 + i; 1791 pc3 = rtmp3 + i; 1792 pc4 = rtmp4 + i; 1793 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1794 mul2 = (*pc2) * (*pc1); 1795 *pc2 = mul2; 1796 mul3 = (*pc3) * (*pc1); 1797 *pc3 = mul3; 1798 mul4 = (*pc4) * (*pc1); 1799 *pc4 = mul4; 1800 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1801 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1802 for (j = 0; j < nz; j++) { 1803 col = pj[j]; 1804 rtmp2[col] -= mul2 * rtmp1[col]; 1805 rtmp3[col] -= mul3 * rtmp1[col]; 1806 rtmp4[col] -= mul4 * rtmp1[col]; 1807 } 1808 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1809 } 1810 1811 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1812 rs = 0.0; 1813 /* L part */ 1814 pc2 = b->a + bi[i + 1]; 1815 pj = b->j + bi[i + 1]; 1816 nz = bi[i + 2] - bi[i + 1]; 1817 for (j = 0; j < nz; j++) { 1818 col = pj[j]; 1819 pc2[j] = rtmp2[col]; 1820 rs += PetscAbsScalar(pc2[j]); 1821 } 1822 /* U part */ 1823 pc2 = b->a + bdiag[i + 2] + 1; 1824 pj = b->j + bdiag[i + 2] + 1; 1825 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1826 for (j = 0; j < nz; j++) { 1827 col = pj[j]; 1828 pc2[j] = rtmp2[col]; 1829 rs += PetscAbsScalar(pc2[j]); 1830 } 1831 1832 sctx.rs = rs; 1833 sctx.pv = rtmp2[i + 1]; 1834 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1835 if (sctx.newshift) break; 1836 pc2 = b->a + bdiag[i + 1]; 1837 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1838 1839 /* Now take care of 2nd column of diagonal 4x4 block. */ 1840 pc3 = rtmp3 + i + 1; 1841 pc4 = rtmp4 + i + 1; 1842 if (*pc3 != 0.0 || *pc4 != 0.0) { 1843 mul3 = (*pc3) * (*pc2); 1844 *pc3 = mul3; 1845 mul4 = (*pc4) * (*pc2); 1846 *pc4 = mul4; 1847 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1848 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1849 for (j = 0; j < nz; j++) { 1850 col = pj[j]; 1851 rtmp3[col] -= mul3 * rtmp2[col]; 1852 rtmp4[col] -= mul4 * rtmp2[col]; 1853 } 1854 PetscCall(PetscLogFlops(4.0 * nz)); 1855 } 1856 1857 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1858 rs = 0.0; 1859 /* L part */ 1860 pc3 = b->a + bi[i + 2]; 1861 pj = b->j + bi[i + 2]; 1862 nz = bi[i + 3] - bi[i + 2]; 1863 for (j = 0; j < nz; j++) { 1864 col = pj[j]; 1865 pc3[j] = rtmp3[col]; 1866 rs += PetscAbsScalar(pc3[j]); 1867 } 1868 /* U part */ 1869 pc3 = b->a + bdiag[i + 3] + 1; 1870 pj = b->j + bdiag[i + 3] + 1; 1871 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1872 for (j = 0; j < nz; j++) { 1873 col = pj[j]; 1874 pc3[j] = rtmp3[col]; 1875 rs += PetscAbsScalar(pc3[j]); 1876 } 1877 1878 sctx.rs = rs; 1879 sctx.pv = rtmp3[i + 2]; 1880 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1881 if (sctx.newshift) break; 1882 pc3 = b->a + bdiag[i + 2]; 1883 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1884 1885 /* Now take care of 3rd column of diagonal 4x4 block. */ 1886 pc4 = rtmp4 + i + 2; 1887 if (*pc4 != 0.0) { 1888 mul4 = (*pc4) * (*pc3); 1889 *pc4 = mul4; 1890 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1891 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1892 for (j = 0; j < nz; j++) { 1893 col = pj[j]; 1894 rtmp4[col] -= mul4 * rtmp3[col]; 1895 } 1896 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1897 } 1898 1899 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1900 rs = 0.0; 1901 /* L part */ 1902 pc4 = b->a + bi[i + 3]; 1903 pj = b->j + bi[i + 3]; 1904 nz = bi[i + 4] - bi[i + 3]; 1905 for (j = 0; j < nz; j++) { 1906 col = pj[j]; 1907 pc4[j] = rtmp4[col]; 1908 rs += PetscAbsScalar(pc4[j]); 1909 } 1910 /* U part */ 1911 pc4 = b->a + bdiag[i + 4] + 1; 1912 pj = b->j + bdiag[i + 4] + 1; 1913 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1914 for (j = 0; j < nz; j++) { 1915 col = pj[j]; 1916 pc4[j] = rtmp4[col]; 1917 rs += PetscAbsScalar(pc4[j]); 1918 } 1919 1920 sctx.rs = rs; 1921 sctx.pv = rtmp4[i + 3]; 1922 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1923 if (sctx.newshift) break; 1924 pc4 = b->a + bdiag[i + 3]; 1925 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1926 break; 1927 1928 default: 1929 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1930 } 1931 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1932 i += nodesz; /* Update the row */ 1933 } 1934 1935 /* MatPivotRefine() */ 1936 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1937 /* 1938 * if no shift in this attempt & shifting & started shifting & can refine, 1939 * then try lower shift 1940 */ 1941 sctx.shift_hi = sctx.shift_fraction; 1942 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1943 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1944 sctx.newshift = PETSC_TRUE; 1945 sctx.nshift++; 1946 } 1947 } while (sctx.newshift); 1948 1949 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1950 PetscCall(PetscFree(tmp_vec2)); 1951 PetscCall(ISRestoreIndices(isicol, &ic)); 1952 PetscCall(ISRestoreIndices(isrow, &r)); 1953 1954 if (b->inode.size_csr) { 1955 C->ops->solve = MatSolve_SeqAIJ_Inode; 1956 } else { 1957 C->ops->solve = MatSolve_SeqAIJ; 1958 } 1959 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1960 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1961 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1962 C->ops->matsolve = MatMatSolve_SeqAIJ; 1963 C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ; 1964 C->assembled = PETSC_TRUE; 1965 C->preallocated = PETSC_TRUE; 1966 1967 PetscCall(PetscLogFlops(C->cmap->n)); 1968 1969 /* MatShiftView(A,info,&sctx) */ 1970 if (sctx.nshift) { 1971 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1972 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1973 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1974 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1975 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1976 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1977 } 1978 } 1979 PetscFunctionReturn(PETSC_SUCCESS); 1980 } 1981 1982 #if 0 1983 // unused 1984 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1985 { 1986 Mat C = B; 1987 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1988 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1989 const PetscInt *r, *ic, *c, *ics; 1990 PetscInt n = A->rmap->n, *bi = b->i; 1991 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1992 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1993 PetscInt *ai = a->i, *aj = a->j; 1994 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1995 PetscScalar mul1, mul2, mul3, tmp; 1996 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1997 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1998 PetscReal rs = 0.0; 1999 FactorShiftCtx sctx; 2000 2001 PetscFunctionBegin; 2002 sctx.shift_top = 0; 2003 sctx.nshift_max = 0; 2004 sctx.shift_lo = 0; 2005 sctx.shift_hi = 0; 2006 sctx.shift_fraction = 0; 2007 2008 /* if both shift schemes are chosen by user, only use info->shiftpd */ 2009 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 2010 sctx.shift_top = 0; 2011 for (i = 0; i < n; i++) { 2012 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 2013 rs = 0.0; 2014 ajtmp = aj + ai[i]; 2015 rtmp1 = aa + ai[i]; 2016 nz = ai[i + 1] - ai[i]; 2017 for (j = 0; j < nz; j++) { 2018 if (*ajtmp != i) { 2019 rs += PetscAbsScalar(*rtmp1++); 2020 } else { 2021 rs -= PetscRealPart(*rtmp1++); 2022 } 2023 ajtmp++; 2024 } 2025 if (rs > sctx.shift_top) sctx.shift_top = rs; 2026 } 2027 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2028 sctx.shift_top *= 1.1; 2029 sctx.nshift_max = 5; 2030 sctx.shift_lo = 0.; 2031 sctx.shift_hi = 1.; 2032 } 2033 sctx.shift_amount = 0; 2034 sctx.nshift = 0; 2035 2036 PetscCall(ISGetIndices(isrow, &r)); 2037 PetscCall(ISGetIndices(iscol, &c)); 2038 PetscCall(ISGetIndices(isicol, &ic)); 2039 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2040 ics = ic; 2041 2042 node_max = a->inode.node_count; 2043 ns = a->inode.size; 2044 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2045 2046 /* If max inode size > 3, split it into two inodes.*/ 2047 /* also map the inode sizes according to the ordering */ 2048 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2049 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2050 if (ns[i] > 3) { 2051 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2052 ++j; 2053 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2054 } else { 2055 tmp_vec1[j] = ns[i]; 2056 } 2057 } 2058 /* Use the correct node_max */ 2059 node_max = j; 2060 2061 /* Now reorder the inode info based on mat re-ordering info */ 2062 /* First create a row -> inode_size_array_index map */ 2063 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2064 for (i = 0, row = 0; i < node_max; i++) { 2065 nodesz = tmp_vec1[i]; 2066 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2067 } 2068 /* Using nsmap, create a reordered ns structure */ 2069 for (i = 0, j = 0; i < node_max; i++) { 2070 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2071 tmp_vec2[i] = nodesz; 2072 j += nodesz; 2073 } 2074 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2075 /* Now use the correct ns */ 2076 ns = tmp_vec2; 2077 2078 do { 2079 sctx.newshift = PETSC_FALSE; 2080 /* Now loop over each block-row, and do the factorization */ 2081 for (i = 0, row = 0; i < node_max; i++) { 2082 nodesz = ns[i]; 2083 nz = bi[row + 1] - bi[row]; 2084 bjtmp = bj + bi[row]; 2085 2086 switch (nodesz) { 2087 case 1: 2088 for (j = 0; j < nz; j++) { 2089 idx = bjtmp[j]; 2090 rtmp11[idx] = 0.0; 2091 } 2092 2093 /* load in initial (unfactored row) */ 2094 idx = r[row]; 2095 nz_tmp = ai[idx + 1] - ai[idx]; 2096 ajtmp = aj + ai[idx]; 2097 v1 = aa + ai[idx]; 2098 2099 for (j = 0; j < nz_tmp; j++) { 2100 idx = ics[ajtmp[j]]; 2101 rtmp11[idx] = v1[j]; 2102 } 2103 rtmp11[ics[r[row]]] += sctx.shift_amount; 2104 2105 prow = *bjtmp++; 2106 while (prow < row) { 2107 pc1 = rtmp11 + prow; 2108 if (*pc1 != 0.0) { 2109 pv = ba + bd[prow]; 2110 pj = nbj + bd[prow]; 2111 mul1 = *pc1 * *pv++; 2112 *pc1 = mul1; 2113 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2114 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2115 for (j = 0; j < nz_tmp; j++) { 2116 tmp = pv[j]; 2117 idx = pj[j]; 2118 rtmp11[idx] -= mul1 * tmp; 2119 } 2120 } 2121 prow = *bjtmp++; 2122 } 2123 pj = bj + bi[row]; 2124 pc1 = ba + bi[row]; 2125 2126 sctx.pv = rtmp11[row]; 2127 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2128 rs = 0.0; 2129 for (j = 0; j < nz; j++) { 2130 idx = pj[j]; 2131 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2132 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2133 } 2134 sctx.rs = rs; 2135 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2136 if (sctx.newshift) goto endofwhile; 2137 break; 2138 2139 case 2: 2140 for (j = 0; j < nz; j++) { 2141 idx = bjtmp[j]; 2142 rtmp11[idx] = 0.0; 2143 rtmp22[idx] = 0.0; 2144 } 2145 2146 /* load in initial (unfactored row) */ 2147 idx = r[row]; 2148 nz_tmp = ai[idx + 1] - ai[idx]; 2149 ajtmp = aj + ai[idx]; 2150 v1 = aa + ai[idx]; 2151 v2 = aa + ai[idx + 1]; 2152 for (j = 0; j < nz_tmp; j++) { 2153 idx = ics[ajtmp[j]]; 2154 rtmp11[idx] = v1[j]; 2155 rtmp22[idx] = v2[j]; 2156 } 2157 rtmp11[ics[r[row]]] += sctx.shift_amount; 2158 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2159 2160 prow = *bjtmp++; 2161 while (prow < row) { 2162 pc1 = rtmp11 + prow; 2163 pc2 = rtmp22 + prow; 2164 if (*pc1 != 0.0 || *pc2 != 0.0) { 2165 pv = ba + bd[prow]; 2166 pj = nbj + bd[prow]; 2167 mul1 = *pc1 * *pv; 2168 mul2 = *pc2 * *pv; 2169 ++pv; 2170 *pc1 = mul1; 2171 *pc2 = mul2; 2172 2173 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2174 for (j = 0; j < nz_tmp; j++) { 2175 tmp = pv[j]; 2176 idx = pj[j]; 2177 rtmp11[idx] -= mul1 * tmp; 2178 rtmp22[idx] -= mul2 * tmp; 2179 } 2180 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2181 } 2182 prow = *bjtmp++; 2183 } 2184 2185 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2186 pc1 = rtmp11 + prow; 2187 pc2 = rtmp22 + prow; 2188 2189 sctx.pv = *pc1; 2190 pj = bj + bi[prow]; 2191 rs = 0.0; 2192 for (j = 0; j < nz; j++) { 2193 idx = pj[j]; 2194 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2195 } 2196 sctx.rs = rs; 2197 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2198 if (sctx.newshift) goto endofwhile; 2199 2200 if (*pc2 != 0.0) { 2201 pj = nbj + bd[prow]; 2202 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2203 *pc2 = mul2; 2204 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2205 for (j = 0; j < nz_tmp; j++) { 2206 idx = pj[j]; 2207 tmp = rtmp11[idx]; 2208 rtmp22[idx] -= mul2 * tmp; 2209 } 2210 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2211 } 2212 2213 pj = bj + bi[row]; 2214 pc1 = ba + bi[row]; 2215 pc2 = ba + bi[row + 1]; 2216 2217 sctx.pv = rtmp22[row + 1]; 2218 rs = 0.0; 2219 rtmp11[row] = 1.0 / rtmp11[row]; 2220 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2221 /* copy row entries from dense representation to sparse */ 2222 for (j = 0; j < nz; j++) { 2223 idx = pj[j]; 2224 pc1[j] = rtmp11[idx]; 2225 pc2[j] = rtmp22[idx]; 2226 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2227 } 2228 sctx.rs = rs; 2229 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2230 if (sctx.newshift) goto endofwhile; 2231 break; 2232 2233 case 3: 2234 for (j = 0; j < nz; j++) { 2235 idx = bjtmp[j]; 2236 rtmp11[idx] = 0.0; 2237 rtmp22[idx] = 0.0; 2238 rtmp33[idx] = 0.0; 2239 } 2240 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2241 idx = r[row]; 2242 nz_tmp = ai[idx + 1] - ai[idx]; 2243 ajtmp = aj + ai[idx]; 2244 v1 = aa + ai[idx]; 2245 v2 = aa + ai[idx + 1]; 2246 v3 = aa + ai[idx + 2]; 2247 for (j = 0; j < nz_tmp; j++) { 2248 idx = ics[ajtmp[j]]; 2249 rtmp11[idx] = v1[j]; 2250 rtmp22[idx] = v2[j]; 2251 rtmp33[idx] = v3[j]; 2252 } 2253 rtmp11[ics[r[row]]] += sctx.shift_amount; 2254 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2255 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2256 2257 /* loop over all pivot row blocks above this row block */ 2258 prow = *bjtmp++; 2259 while (prow < row) { 2260 pc1 = rtmp11 + prow; 2261 pc2 = rtmp22 + prow; 2262 pc3 = rtmp33 + prow; 2263 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2264 pv = ba + bd[prow]; 2265 pj = nbj + bd[prow]; 2266 mul1 = *pc1 * *pv; 2267 mul2 = *pc2 * *pv; 2268 mul3 = *pc3 * *pv; 2269 ++pv; 2270 *pc1 = mul1; 2271 *pc2 = mul2; 2272 *pc3 = mul3; 2273 2274 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2275 /* update this row based on pivot row */ 2276 for (j = 0; j < nz_tmp; j++) { 2277 tmp = pv[j]; 2278 idx = pj[j]; 2279 rtmp11[idx] -= mul1 * tmp; 2280 rtmp22[idx] -= mul2 * tmp; 2281 rtmp33[idx] -= mul3 * tmp; 2282 } 2283 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2284 } 2285 prow = *bjtmp++; 2286 } 2287 2288 /* Now take care of diagonal 3x3 block in this set of rows */ 2289 /* note: prow = row here */ 2290 pc1 = rtmp11 + prow; 2291 pc2 = rtmp22 + prow; 2292 pc3 = rtmp33 + prow; 2293 2294 sctx.pv = *pc1; 2295 pj = bj + bi[prow]; 2296 rs = 0.0; 2297 for (j = 0; j < nz; j++) { 2298 idx = pj[j]; 2299 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2300 } 2301 sctx.rs = rs; 2302 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2303 if (sctx.newshift) goto endofwhile; 2304 2305 if (*pc2 != 0.0 || *pc3 != 0.0) { 2306 mul2 = (*pc2) / (*pc1); 2307 mul3 = (*pc3) / (*pc1); 2308 *pc2 = mul2; 2309 *pc3 = mul3; 2310 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2311 pj = nbj + bd[prow]; 2312 for (j = 0; j < nz_tmp; j++) { 2313 idx = pj[j]; 2314 tmp = rtmp11[idx]; 2315 rtmp22[idx] -= mul2 * tmp; 2316 rtmp33[idx] -= mul3 * tmp; 2317 } 2318 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2319 } 2320 ++prow; 2321 2322 pc2 = rtmp22 + prow; 2323 pc3 = rtmp33 + prow; 2324 sctx.pv = *pc2; 2325 pj = bj + bi[prow]; 2326 rs = 0.0; 2327 for (j = 0; j < nz; j++) { 2328 idx = pj[j]; 2329 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2330 } 2331 sctx.rs = rs; 2332 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2333 if (sctx.newshift) goto endofwhile; 2334 2335 if (*pc3 != 0.0) { 2336 mul3 = (*pc3) / (*pc2); 2337 *pc3 = mul3; 2338 pj = nbj + bd[prow]; 2339 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2340 for (j = 0; j < nz_tmp; j++) { 2341 idx = pj[j]; 2342 tmp = rtmp22[idx]; 2343 rtmp33[idx] -= mul3 * tmp; 2344 } 2345 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2346 } 2347 2348 pj = bj + bi[row]; 2349 pc1 = ba + bi[row]; 2350 pc2 = ba + bi[row + 1]; 2351 pc3 = ba + bi[row + 2]; 2352 2353 sctx.pv = rtmp33[row + 2]; 2354 rs = 0.0; 2355 rtmp11[row] = 1.0 / rtmp11[row]; 2356 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2357 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2358 /* copy row entries from dense representation to sparse */ 2359 for (j = 0; j < nz; j++) { 2360 idx = pj[j]; 2361 pc1[j] = rtmp11[idx]; 2362 pc2[j] = rtmp22[idx]; 2363 pc3[j] = rtmp33[idx]; 2364 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2365 } 2366 2367 sctx.rs = rs; 2368 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2369 if (sctx.newshift) goto endofwhile; 2370 break; 2371 2372 default: 2373 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2374 } 2375 row += nodesz; /* Update the row */ 2376 } 2377 endofwhile:; 2378 } while (sctx.newshift); 2379 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2380 PetscCall(PetscFree(tmp_vec2)); 2381 PetscCall(ISRestoreIndices(isicol, &ic)); 2382 PetscCall(ISRestoreIndices(isrow, &r)); 2383 PetscCall(ISRestoreIndices(iscol, &c)); 2384 2385 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2386 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2387 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2388 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2389 C->assembled = PETSC_TRUE; 2390 C->preallocated = PETSC_TRUE; 2391 if (sctx.nshift) { 2392 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2393 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2394 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2395 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2396 } 2397 } 2398 PetscCall(PetscLogFlops(C->cmap->n)); 2399 PetscCall(MatSeqAIJCheckInode(C)); 2400 PetscFunctionReturn(PETSC_SUCCESS); 2401 } 2402 #endif 2403 2404 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2405 { 2406 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2407 IS iscol = a->col, isrow = a->row; 2408 const PetscInt *r, *c, *rout, *cout; 2409 PetscInt i, j; 2410 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2411 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2412 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2413 PetscScalar sum1, sum2, sum3, sum4, sum5; 2414 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2415 const PetscScalar *b; 2416 2417 PetscFunctionBegin; 2418 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2419 node_max = a->inode.node_count; 2420 ns = a->inode.size_csr; /* Node Size array */ 2421 2422 PetscCall(VecGetArrayRead(bb, &b)); 2423 PetscCall(VecGetArrayWrite(xx, &x)); 2424 tmp = a->solve_work; 2425 2426 PetscCall(ISGetIndices(isrow, &rout)); 2427 r = rout; 2428 PetscCall(ISGetIndices(iscol, &cout)); 2429 c = cout; 2430 2431 /* forward solve the lower triangular */ 2432 tmps = tmp; 2433 aa = a_a; 2434 aj = a_j; 2435 ad = a->diag; 2436 2437 for (i = 0; i < node_max; ++i) { 2438 row = ns[i]; 2439 nsz = ns[i + 1] - ns[i]; 2440 aii = ai[row]; 2441 v1 = aa + aii; 2442 vi = aj + aii; 2443 nz = ai[row + 1] - ai[row]; 2444 2445 if (i < node_max - 1) { 2446 /* Prefetch the indices for the next block */ 2447 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2448 /* Prefetch the data for the next block */ 2449 PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2450 } 2451 2452 switch (nsz) { /* Each loop in 'case' is unrolled */ 2453 case 1: 2454 sum1 = b[r[row]]; 2455 for (j = 0; j < nz - 1; j += 2) { 2456 i0 = vi[j]; 2457 i1 = vi[j + 1]; 2458 tmp0 = tmps[i0]; 2459 tmp1 = tmps[i1]; 2460 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2461 } 2462 if (j == nz - 1) { 2463 tmp0 = tmps[vi[j]]; 2464 sum1 -= v1[j] * tmp0; 2465 } 2466 tmp[row++] = sum1; 2467 break; 2468 case 2: 2469 sum1 = b[r[row]]; 2470 sum2 = b[r[row + 1]]; 2471 v2 = aa + ai[row + 1]; 2472 2473 for (j = 0; j < nz - 1; j += 2) { 2474 i0 = vi[j]; 2475 i1 = vi[j + 1]; 2476 tmp0 = tmps[i0]; 2477 tmp1 = tmps[i1]; 2478 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2479 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2480 } 2481 if (j == nz - 1) { 2482 tmp0 = tmps[vi[j]]; 2483 sum1 -= v1[j] * tmp0; 2484 sum2 -= v2[j] * tmp0; 2485 } 2486 sum2 -= v2[nz] * sum1; 2487 tmp[row++] = sum1; 2488 tmp[row++] = sum2; 2489 break; 2490 case 3: 2491 sum1 = b[r[row]]; 2492 sum2 = b[r[row + 1]]; 2493 sum3 = b[r[row + 2]]; 2494 v2 = aa + ai[row + 1]; 2495 v3 = aa + ai[row + 2]; 2496 2497 for (j = 0; j < nz - 1; j += 2) { 2498 i0 = vi[j]; 2499 i1 = vi[j + 1]; 2500 tmp0 = tmps[i0]; 2501 tmp1 = tmps[i1]; 2502 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2503 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2504 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2505 } 2506 if (j == nz - 1) { 2507 tmp0 = tmps[vi[j]]; 2508 sum1 -= v1[j] * tmp0; 2509 sum2 -= v2[j] * tmp0; 2510 sum3 -= v3[j] * tmp0; 2511 } 2512 sum2 -= v2[nz] * sum1; 2513 sum3 -= v3[nz] * sum1; 2514 sum3 -= v3[nz + 1] * sum2; 2515 tmp[row++] = sum1; 2516 tmp[row++] = sum2; 2517 tmp[row++] = sum3; 2518 break; 2519 2520 case 4: 2521 sum1 = b[r[row]]; 2522 sum2 = b[r[row + 1]]; 2523 sum3 = b[r[row + 2]]; 2524 sum4 = b[r[row + 3]]; 2525 v2 = aa + ai[row + 1]; 2526 v3 = aa + ai[row + 2]; 2527 v4 = aa + ai[row + 3]; 2528 2529 for (j = 0; j < nz - 1; j += 2) { 2530 i0 = vi[j]; 2531 i1 = vi[j + 1]; 2532 tmp0 = tmps[i0]; 2533 tmp1 = tmps[i1]; 2534 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2535 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2536 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2537 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2538 } 2539 if (j == nz - 1) { 2540 tmp0 = tmps[vi[j]]; 2541 sum1 -= v1[j] * tmp0; 2542 sum2 -= v2[j] * tmp0; 2543 sum3 -= v3[j] * tmp0; 2544 sum4 -= v4[j] * tmp0; 2545 } 2546 sum2 -= v2[nz] * sum1; 2547 sum3 -= v3[nz] * sum1; 2548 sum4 -= v4[nz] * sum1; 2549 sum3 -= v3[nz + 1] * sum2; 2550 sum4 -= v4[nz + 1] * sum2; 2551 sum4 -= v4[nz + 2] * sum3; 2552 2553 tmp[row++] = sum1; 2554 tmp[row++] = sum2; 2555 tmp[row++] = sum3; 2556 tmp[row++] = sum4; 2557 break; 2558 case 5: 2559 sum1 = b[r[row]]; 2560 sum2 = b[r[row + 1]]; 2561 sum3 = b[r[row + 2]]; 2562 sum4 = b[r[row + 3]]; 2563 sum5 = b[r[row + 4]]; 2564 v2 = aa + ai[row + 1]; 2565 v3 = aa + ai[row + 2]; 2566 v4 = aa + ai[row + 3]; 2567 v5 = aa + ai[row + 4]; 2568 2569 for (j = 0; j < nz - 1; j += 2) { 2570 i0 = vi[j]; 2571 i1 = vi[j + 1]; 2572 tmp0 = tmps[i0]; 2573 tmp1 = tmps[i1]; 2574 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2575 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2576 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2577 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2578 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2579 } 2580 if (j == nz - 1) { 2581 tmp0 = tmps[vi[j]]; 2582 sum1 -= v1[j] * tmp0; 2583 sum2 -= v2[j] * tmp0; 2584 sum3 -= v3[j] * tmp0; 2585 sum4 -= v4[j] * tmp0; 2586 sum5 -= v5[j] * tmp0; 2587 } 2588 2589 sum2 -= v2[nz] * sum1; 2590 sum3 -= v3[nz] * sum1; 2591 sum4 -= v4[nz] * sum1; 2592 sum5 -= v5[nz] * sum1; 2593 sum3 -= v3[nz + 1] * sum2; 2594 sum4 -= v4[nz + 1] * sum2; 2595 sum5 -= v5[nz + 1] * sum2; 2596 sum4 -= v4[nz + 2] * sum3; 2597 sum5 -= v5[nz + 2] * sum3; 2598 sum5 -= v5[nz + 3] * sum4; 2599 2600 tmp[row++] = sum1; 2601 tmp[row++] = sum2; 2602 tmp[row++] = sum3; 2603 tmp[row++] = sum4; 2604 tmp[row++] = sum5; 2605 break; 2606 default: 2607 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2608 } 2609 } 2610 /* backward solve the upper triangular */ 2611 for (i = node_max - 1; i >= 0; i--) { 2612 row = ns[i + 1] - 1; 2613 nsz = ns[i + 1] - ns[i]; 2614 aii = ad[row + 1] + 1; 2615 v1 = aa + aii; 2616 vi = aj + aii; 2617 nz = ad[row] - ad[row + 1] - 1; 2618 2619 if (i > 0) { 2620 /* Prefetch the indices for the next block */ 2621 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2622 /* Prefetch the data for the next block */ 2623 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2624 } 2625 2626 switch (nsz) { /* Each loop in 'case' is unrolled */ 2627 case 1: 2628 sum1 = tmp[row]; 2629 2630 for (j = 0; j < nz - 1; j += 2) { 2631 i0 = vi[j]; 2632 i1 = vi[j + 1]; 2633 tmp0 = tmps[i0]; 2634 tmp1 = tmps[i1]; 2635 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2636 } 2637 if (j == nz - 1) { 2638 tmp0 = tmps[vi[j]]; 2639 sum1 -= v1[j] * tmp0; 2640 } 2641 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2642 row--; 2643 break; 2644 case 2: 2645 sum1 = tmp[row]; 2646 sum2 = tmp[row - 1]; 2647 v2 = aa + ad[row] + 1; 2648 for (j = 0; j < nz - 1; j += 2) { 2649 i0 = vi[j]; 2650 i1 = vi[j + 1]; 2651 tmp0 = tmps[i0]; 2652 tmp1 = tmps[i1]; 2653 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2654 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2655 } 2656 if (j == nz - 1) { 2657 tmp0 = tmps[vi[j]]; 2658 sum1 -= v1[j] * tmp0; 2659 sum2 -= v2[j + 1] * tmp0; 2660 } 2661 2662 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2663 row--; 2664 sum2 -= v2[0] * tmp0; 2665 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2666 row--; 2667 break; 2668 case 3: 2669 sum1 = tmp[row]; 2670 sum2 = tmp[row - 1]; 2671 sum3 = tmp[row - 2]; 2672 v2 = aa + ad[row] + 1; 2673 v3 = aa + ad[row - 1] + 1; 2674 for (j = 0; j < nz - 1; j += 2) { 2675 i0 = vi[j]; 2676 i1 = vi[j + 1]; 2677 tmp0 = tmps[i0]; 2678 tmp1 = tmps[i1]; 2679 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2680 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2681 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2682 } 2683 if (j == nz - 1) { 2684 tmp0 = tmps[vi[j]]; 2685 sum1 -= v1[j] * tmp0; 2686 sum2 -= v2[j + 1] * tmp0; 2687 sum3 -= v3[j + 2] * tmp0; 2688 } 2689 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2690 row--; 2691 sum2 -= v2[0] * tmp0; 2692 sum3 -= v3[1] * tmp0; 2693 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2694 row--; 2695 sum3 -= v3[0] * tmp0; 2696 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2697 row--; 2698 2699 break; 2700 case 4: 2701 sum1 = tmp[row]; 2702 sum2 = tmp[row - 1]; 2703 sum3 = tmp[row - 2]; 2704 sum4 = tmp[row - 3]; 2705 v2 = aa + ad[row] + 1; 2706 v3 = aa + ad[row - 1] + 1; 2707 v4 = aa + ad[row - 2] + 1; 2708 2709 for (j = 0; j < nz - 1; j += 2) { 2710 i0 = vi[j]; 2711 i1 = vi[j + 1]; 2712 tmp0 = tmps[i0]; 2713 tmp1 = tmps[i1]; 2714 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2715 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2716 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2717 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2718 } 2719 if (j == nz - 1) { 2720 tmp0 = tmps[vi[j]]; 2721 sum1 -= v1[j] * tmp0; 2722 sum2 -= v2[j + 1] * tmp0; 2723 sum3 -= v3[j + 2] * tmp0; 2724 sum4 -= v4[j + 3] * tmp0; 2725 } 2726 2727 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2728 row--; 2729 sum2 -= v2[0] * tmp0; 2730 sum3 -= v3[1] * tmp0; 2731 sum4 -= v4[2] * tmp0; 2732 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2733 row--; 2734 sum3 -= v3[0] * tmp0; 2735 sum4 -= v4[1] * tmp0; 2736 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2737 row--; 2738 sum4 -= v4[0] * tmp0; 2739 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2740 row--; 2741 break; 2742 case 5: 2743 sum1 = tmp[row]; 2744 sum2 = tmp[row - 1]; 2745 sum3 = tmp[row - 2]; 2746 sum4 = tmp[row - 3]; 2747 sum5 = tmp[row - 4]; 2748 v2 = aa + ad[row] + 1; 2749 v3 = aa + ad[row - 1] + 1; 2750 v4 = aa + ad[row - 2] + 1; 2751 v5 = aa + ad[row - 3] + 1; 2752 for (j = 0; j < nz - 1; j += 2) { 2753 i0 = vi[j]; 2754 i1 = vi[j + 1]; 2755 tmp0 = tmps[i0]; 2756 tmp1 = tmps[i1]; 2757 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2758 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2759 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2760 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2761 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2762 } 2763 if (j == nz - 1) { 2764 tmp0 = tmps[vi[j]]; 2765 sum1 -= v1[j] * tmp0; 2766 sum2 -= v2[j + 1] * tmp0; 2767 sum3 -= v3[j + 2] * tmp0; 2768 sum4 -= v4[j + 3] * tmp0; 2769 sum5 -= v5[j + 4] * tmp0; 2770 } 2771 2772 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2773 row--; 2774 sum2 -= v2[0] * tmp0; 2775 sum3 -= v3[1] * tmp0; 2776 sum4 -= v4[2] * tmp0; 2777 sum5 -= v5[3] * tmp0; 2778 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2779 row--; 2780 sum3 -= v3[0] * tmp0; 2781 sum4 -= v4[1] * tmp0; 2782 sum5 -= v5[2] * tmp0; 2783 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2784 row--; 2785 sum4 -= v4[0] * tmp0; 2786 sum5 -= v5[1] * tmp0; 2787 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2788 row--; 2789 sum5 -= v5[0] * tmp0; 2790 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2791 row--; 2792 break; 2793 default: 2794 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2795 } 2796 } 2797 PetscCall(ISRestoreIndices(isrow, &rout)); 2798 PetscCall(ISRestoreIndices(iscol, &cout)); 2799 PetscCall(VecRestoreArrayRead(bb, &b)); 2800 PetscCall(VecRestoreArrayWrite(xx, &x)); 2801 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2802 PetscFunctionReturn(PETSC_SUCCESS); 2803 } 2804 2805 /* 2806 Makes a longer coloring[] array and calls the usual code with that 2807 */ 2808 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2809 { 2810 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2811 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row; 2812 PetscInt *colorused, i; 2813 ISColoringValue *newcolor; 2814 2815 PetscFunctionBegin; 2816 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2817 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2818 /* loop over inodes, marking a color for each column*/ 2819 row = 0; 2820 for (i = 0; i < m; i++) { 2821 for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++)); 2822 } 2823 2824 /* eliminate unneeded colors */ 2825 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2826 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2827 2828 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2829 ncolors = colorused[5 * ncolors - 1]; 2830 for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i)); 2831 PetscCall(PetscFree(colorused)); 2832 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2833 PetscCall(PetscFree(coloring)); 2834 PetscFunctionReturn(PETSC_SUCCESS); 2835 } 2836 2837 #include <petsc/private/kernels/blockinvert.h> 2838 2839 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2840 { 2841 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2842 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2843 MatScalar *ibdiag, *bdiag, work[25], *t; 2844 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2845 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2846 const PetscScalar *xb, *b; 2847 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2848 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz; 2849 PetscInt sz, k, ipvt[5]; 2850 PetscBool allowzeropivot, zeropivotdetected; 2851 const PetscInt *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i; 2852 2853 PetscFunctionBegin; 2854 allowzeropivot = PetscNot(A->erroriffailure); 2855 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2856 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2857 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2858 2859 if (!a->inode.ibdiagvalid) { 2860 if (!a->inode.ibdiag) { 2861 /* calculate space needed for diagonal blocks */ 2862 for (i = 0; i < m; i++) { 2863 nodesz = sizes[i + 1] - sizes[i]; 2864 cnt += nodesz * nodesz; 2865 } 2866 a->inode.bdiagsize = cnt; 2867 2868 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2869 } 2870 2871 /* copy over the diagonal blocks and invert them */ 2872 ibdiag = a->inode.ibdiag; 2873 bdiag = a->inode.bdiag; 2874 cnt = 0; 2875 for (i = 0, row = 0; i < m; i++) { 2876 nodesz = sizes[i + 1] - sizes[i]; 2877 for (j = 0; j < nodesz; j++) { 2878 for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k]; 2879 } 2880 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz)); 2881 2882 switch (nodesz) { 2883 case 1: 2884 /* Create matrix data structure */ 2885 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2886 PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2887 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2888 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2889 A->factorerror_zeropivot_row = row; 2890 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2891 } 2892 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2893 break; 2894 case 2: 2895 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2896 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2897 break; 2898 case 3: 2899 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2900 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2901 break; 2902 case 4: 2903 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2904 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2905 break; 2906 case 5: 2907 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2908 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2909 break; 2910 default: 2911 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 2912 } 2913 cnt += nodesz * nodesz; 2914 row += nodesz; 2915 } 2916 a->inode.ibdiagvalid = PETSC_TRUE; 2917 } 2918 ibdiag = a->inode.ibdiag; 2919 bdiag = a->inode.bdiag; 2920 t = a->inode.ssor_work; 2921 2922 PetscCall(VecGetArray(xx, &x)); 2923 PetscCall(VecGetArrayRead(bb, &b)); 2924 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2925 if (flag & SOR_ZERO_INITIAL_GUESS) { 2926 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2927 for (i = 0, row = 0; i < m; i++) { 2928 sz = diag[row] - ii[row]; 2929 v1 = a->a + ii[row]; 2930 idx = a->j + ii[row]; 2931 2932 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2933 nodesz = sizes[i + 1] - sizes[i]; 2934 switch (nodesz) { 2935 case 1: 2936 2937 sum1 = b[row]; 2938 for (n = 0; n < sz - 1; n += 2) { 2939 i1 = idx[0]; 2940 i2 = idx[1]; 2941 idx += 2; 2942 tmp0 = x[i1]; 2943 tmp1 = x[i2]; 2944 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2945 v1 += 2; 2946 } 2947 2948 if (n == sz - 1) { 2949 tmp0 = x[*idx]; 2950 sum1 -= *v1 * tmp0; 2951 } 2952 t[row] = sum1; 2953 x[row++] = sum1 * (*ibdiag++); 2954 break; 2955 case 2: 2956 v2 = a->a + ii[row + 1]; 2957 sum1 = b[row]; 2958 sum2 = b[row + 1]; 2959 for (n = 0; n < sz - 1; n += 2) { 2960 i1 = idx[0]; 2961 i2 = idx[1]; 2962 idx += 2; 2963 tmp0 = x[i1]; 2964 tmp1 = x[i2]; 2965 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2966 v1 += 2; 2967 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2968 v2 += 2; 2969 } 2970 2971 if (n == sz - 1) { 2972 tmp0 = x[*idx]; 2973 sum1 -= v1[0] * tmp0; 2974 sum2 -= v2[0] * tmp0; 2975 } 2976 t[row] = sum1; 2977 t[row + 1] = sum2; 2978 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2979 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2980 ibdiag += 4; 2981 break; 2982 case 3: 2983 v2 = a->a + ii[row + 1]; 2984 v3 = a->a + ii[row + 2]; 2985 sum1 = b[row]; 2986 sum2 = b[row + 1]; 2987 sum3 = b[row + 2]; 2988 for (n = 0; n < sz - 1; n += 2) { 2989 i1 = idx[0]; 2990 i2 = idx[1]; 2991 idx += 2; 2992 tmp0 = x[i1]; 2993 tmp1 = x[i2]; 2994 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2995 v1 += 2; 2996 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2997 v2 += 2; 2998 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2999 v3 += 2; 3000 } 3001 3002 if (n == sz - 1) { 3003 tmp0 = x[*idx]; 3004 sum1 -= v1[0] * tmp0; 3005 sum2 -= v2[0] * tmp0; 3006 sum3 -= v3[0] * tmp0; 3007 } 3008 t[row] = sum1; 3009 t[row + 1] = sum2; 3010 t[row + 2] = sum3; 3011 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3012 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3013 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3014 ibdiag += 9; 3015 break; 3016 case 4: 3017 v2 = a->a + ii[row + 1]; 3018 v3 = a->a + ii[row + 2]; 3019 v4 = a->a + ii[row + 3]; 3020 sum1 = b[row]; 3021 sum2 = b[row + 1]; 3022 sum3 = b[row + 2]; 3023 sum4 = b[row + 3]; 3024 for (n = 0; n < sz - 1; n += 2) { 3025 i1 = idx[0]; 3026 i2 = idx[1]; 3027 idx += 2; 3028 tmp0 = x[i1]; 3029 tmp1 = x[i2]; 3030 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3031 v1 += 2; 3032 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3033 v2 += 2; 3034 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3035 v3 += 2; 3036 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3037 v4 += 2; 3038 } 3039 3040 if (n == sz - 1) { 3041 tmp0 = x[*idx]; 3042 sum1 -= v1[0] * tmp0; 3043 sum2 -= v2[0] * tmp0; 3044 sum3 -= v3[0] * tmp0; 3045 sum4 -= v4[0] * tmp0; 3046 } 3047 t[row] = sum1; 3048 t[row + 1] = sum2; 3049 t[row + 2] = sum3; 3050 t[row + 3] = sum4; 3051 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3052 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3053 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3054 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3055 ibdiag += 16; 3056 break; 3057 case 5: 3058 v2 = a->a + ii[row + 1]; 3059 v3 = a->a + ii[row + 2]; 3060 v4 = a->a + ii[row + 3]; 3061 v5 = a->a + ii[row + 4]; 3062 sum1 = b[row]; 3063 sum2 = b[row + 1]; 3064 sum3 = b[row + 2]; 3065 sum4 = b[row + 3]; 3066 sum5 = b[row + 4]; 3067 for (n = 0; n < sz - 1; n += 2) { 3068 i1 = idx[0]; 3069 i2 = idx[1]; 3070 idx += 2; 3071 tmp0 = x[i1]; 3072 tmp1 = x[i2]; 3073 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3074 v1 += 2; 3075 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3076 v2 += 2; 3077 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3078 v3 += 2; 3079 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3080 v4 += 2; 3081 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3082 v5 += 2; 3083 } 3084 3085 if (n == sz - 1) { 3086 tmp0 = x[*idx]; 3087 sum1 -= v1[0] * tmp0; 3088 sum2 -= v2[0] * tmp0; 3089 sum3 -= v3[0] * tmp0; 3090 sum4 -= v4[0] * tmp0; 3091 sum5 -= v5[0] * tmp0; 3092 } 3093 t[row] = sum1; 3094 t[row + 1] = sum2; 3095 t[row + 2] = sum3; 3096 t[row + 3] = sum4; 3097 t[row + 4] = sum5; 3098 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3099 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3100 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3101 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3102 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3103 ibdiag += 25; 3104 break; 3105 default: 3106 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3107 } 3108 } 3109 3110 xb = t; 3111 PetscCall(PetscLogFlops(a->nz)); 3112 } else xb = b; 3113 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3114 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3115 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3116 nodesz = sizes[i + 1] - sizes[i]; 3117 ibdiag -= nodesz * nodesz; 3118 sz = ii[row + 1] - diag[row] - 1; 3119 v1 = a->a + diag[row] + 1; 3120 idx = a->j + diag[row] + 1; 3121 3122 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3123 switch (nodesz) { 3124 case 1: 3125 3126 sum1 = xb[row]; 3127 for (n = 0; n < sz - 1; n += 2) { 3128 i1 = idx[0]; 3129 i2 = idx[1]; 3130 idx += 2; 3131 tmp0 = x[i1]; 3132 tmp1 = x[i2]; 3133 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3134 v1 += 2; 3135 } 3136 3137 if (n == sz - 1) { 3138 tmp0 = x[*idx]; 3139 sum1 -= *v1 * tmp0; 3140 } 3141 x[row--] = sum1 * (*ibdiag); 3142 break; 3143 3144 case 2: 3145 3146 sum1 = xb[row]; 3147 sum2 = xb[row - 1]; 3148 /* note that sum1 is associated with the second of the two rows */ 3149 v2 = a->a + diag[row - 1] + 2; 3150 for (n = 0; n < sz - 1; n += 2) { 3151 i1 = idx[0]; 3152 i2 = idx[1]; 3153 idx += 2; 3154 tmp0 = x[i1]; 3155 tmp1 = x[i2]; 3156 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3157 v1 += 2; 3158 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3159 v2 += 2; 3160 } 3161 3162 if (n == sz - 1) { 3163 tmp0 = x[*idx]; 3164 sum1 -= *v1 * tmp0; 3165 sum2 -= *v2 * tmp0; 3166 } 3167 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3168 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3169 break; 3170 case 3: 3171 3172 sum1 = xb[row]; 3173 sum2 = xb[row - 1]; 3174 sum3 = xb[row - 2]; 3175 v2 = a->a + diag[row - 1] + 2; 3176 v3 = a->a + diag[row - 2] + 3; 3177 for (n = 0; n < sz - 1; n += 2) { 3178 i1 = idx[0]; 3179 i2 = idx[1]; 3180 idx += 2; 3181 tmp0 = x[i1]; 3182 tmp1 = x[i2]; 3183 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3184 v1 += 2; 3185 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3186 v2 += 2; 3187 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3188 v3 += 2; 3189 } 3190 3191 if (n == sz - 1) { 3192 tmp0 = x[*idx]; 3193 sum1 -= *v1 * tmp0; 3194 sum2 -= *v2 * tmp0; 3195 sum3 -= *v3 * tmp0; 3196 } 3197 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3198 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3199 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3200 break; 3201 case 4: 3202 3203 sum1 = xb[row]; 3204 sum2 = xb[row - 1]; 3205 sum3 = xb[row - 2]; 3206 sum4 = xb[row - 3]; 3207 v2 = a->a + diag[row - 1] + 2; 3208 v3 = a->a + diag[row - 2] + 3; 3209 v4 = a->a + diag[row - 3] + 4; 3210 for (n = 0; n < sz - 1; n += 2) { 3211 i1 = idx[0]; 3212 i2 = idx[1]; 3213 idx += 2; 3214 tmp0 = x[i1]; 3215 tmp1 = x[i2]; 3216 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3217 v1 += 2; 3218 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3219 v2 += 2; 3220 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3221 v3 += 2; 3222 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3223 v4 += 2; 3224 } 3225 3226 if (n == sz - 1) { 3227 tmp0 = x[*idx]; 3228 sum1 -= *v1 * tmp0; 3229 sum2 -= *v2 * tmp0; 3230 sum3 -= *v3 * tmp0; 3231 sum4 -= *v4 * tmp0; 3232 } 3233 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3234 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3235 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3236 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3237 break; 3238 case 5: 3239 3240 sum1 = xb[row]; 3241 sum2 = xb[row - 1]; 3242 sum3 = xb[row - 2]; 3243 sum4 = xb[row - 3]; 3244 sum5 = xb[row - 4]; 3245 v2 = a->a + diag[row - 1] + 2; 3246 v3 = a->a + diag[row - 2] + 3; 3247 v4 = a->a + diag[row - 3] + 4; 3248 v5 = a->a + diag[row - 4] + 5; 3249 for (n = 0; n < sz - 1; n += 2) { 3250 i1 = idx[0]; 3251 i2 = idx[1]; 3252 idx += 2; 3253 tmp0 = x[i1]; 3254 tmp1 = x[i2]; 3255 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3256 v1 += 2; 3257 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3258 v2 += 2; 3259 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3260 v3 += 2; 3261 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3262 v4 += 2; 3263 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3264 v5 += 2; 3265 } 3266 3267 if (n == sz - 1) { 3268 tmp0 = x[*idx]; 3269 sum1 -= *v1 * tmp0; 3270 sum2 -= *v2 * tmp0; 3271 sum3 -= *v3 * tmp0; 3272 sum4 -= *v4 * tmp0; 3273 sum5 -= *v5 * tmp0; 3274 } 3275 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3276 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3277 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3278 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3279 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3280 break; 3281 default: 3282 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3283 } 3284 } 3285 3286 PetscCall(PetscLogFlops(a->nz)); 3287 } 3288 its--; 3289 } 3290 while (its--) { 3291 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3292 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) { 3293 nodesz = sizes[i + 1] - sizes[i]; 3294 sz = diag[row] - ii[row]; 3295 v1 = a->a + ii[row]; 3296 idx = a->j + ii[row]; 3297 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3298 switch (nodesz) { 3299 case 1: 3300 sum1 = b[row]; 3301 for (n = 0; n < sz - 1; n += 2) { 3302 i1 = idx[0]; 3303 i2 = idx[1]; 3304 idx += 2; 3305 tmp0 = x[i1]; 3306 tmp1 = x[i2]; 3307 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3308 v1 += 2; 3309 } 3310 if (n == sz - 1) { 3311 tmp0 = x[*idx++]; 3312 sum1 -= *v1 * tmp0; 3313 v1++; 3314 } 3315 t[row] = sum1; 3316 sz = ii[row + 1] - diag[row] - 1; 3317 idx = a->j + diag[row] + 1; 3318 v1 += 1; 3319 for (n = 0; n < sz - 1; n += 2) { 3320 i1 = idx[0]; 3321 i2 = idx[1]; 3322 idx += 2; 3323 tmp0 = x[i1]; 3324 tmp1 = x[i2]; 3325 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3326 v1 += 2; 3327 } 3328 if (n == sz - 1) { 3329 tmp0 = x[*idx++]; 3330 sum1 -= *v1 * tmp0; 3331 } 3332 /* in MatSOR_SeqAIJ this line would be 3333 * 3334 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3335 * 3336 * but omega == 1, so this becomes 3337 * 3338 * x[row] = sum1*(*ibdiag++); 3339 * 3340 */ 3341 x[row] = sum1 * (*ibdiag); 3342 break; 3343 case 2: 3344 v2 = a->a + ii[row + 1]; 3345 sum1 = b[row]; 3346 sum2 = b[row + 1]; 3347 for (n = 0; n < sz - 1; n += 2) { 3348 i1 = idx[0]; 3349 i2 = idx[1]; 3350 idx += 2; 3351 tmp0 = x[i1]; 3352 tmp1 = x[i2]; 3353 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3354 v1 += 2; 3355 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3356 v2 += 2; 3357 } 3358 if (n == sz - 1) { 3359 tmp0 = x[*idx++]; 3360 sum1 -= v1[0] * tmp0; 3361 sum2 -= v2[0] * tmp0; 3362 v1++; 3363 v2++; 3364 } 3365 t[row] = sum1; 3366 t[row + 1] = sum2; 3367 sz = ii[row + 1] - diag[row] - 2; 3368 idx = a->j + diag[row] + 2; 3369 v1 += 2; 3370 v2 += 2; 3371 for (n = 0; n < sz - 1; n += 2) { 3372 i1 = idx[0]; 3373 i2 = idx[1]; 3374 idx += 2; 3375 tmp0 = x[i1]; 3376 tmp1 = x[i2]; 3377 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3378 v1 += 2; 3379 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3380 v2 += 2; 3381 } 3382 if (n == sz - 1) { 3383 tmp0 = x[*idx]; 3384 sum1 -= v1[0] * tmp0; 3385 sum2 -= v2[0] * tmp0; 3386 } 3387 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3388 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3389 break; 3390 case 3: 3391 v2 = a->a + ii[row + 1]; 3392 v3 = a->a + ii[row + 2]; 3393 sum1 = b[row]; 3394 sum2 = b[row + 1]; 3395 sum3 = b[row + 2]; 3396 for (n = 0; n < sz - 1; n += 2) { 3397 i1 = idx[0]; 3398 i2 = idx[1]; 3399 idx += 2; 3400 tmp0 = x[i1]; 3401 tmp1 = x[i2]; 3402 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3403 v1 += 2; 3404 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3405 v2 += 2; 3406 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3407 v3 += 2; 3408 } 3409 if (n == sz - 1) { 3410 tmp0 = x[*idx++]; 3411 sum1 -= v1[0] * tmp0; 3412 sum2 -= v2[0] * tmp0; 3413 sum3 -= v3[0] * tmp0; 3414 v1++; 3415 v2++; 3416 v3++; 3417 } 3418 t[row] = sum1; 3419 t[row + 1] = sum2; 3420 t[row + 2] = sum3; 3421 sz = ii[row + 1] - diag[row] - 3; 3422 idx = a->j + diag[row] + 3; 3423 v1 += 3; 3424 v2 += 3; 3425 v3 += 3; 3426 for (n = 0; n < sz - 1; n += 2) { 3427 i1 = idx[0]; 3428 i2 = idx[1]; 3429 idx += 2; 3430 tmp0 = x[i1]; 3431 tmp1 = x[i2]; 3432 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3433 v1 += 2; 3434 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3435 v2 += 2; 3436 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3437 v3 += 2; 3438 } 3439 if (n == sz - 1) { 3440 tmp0 = x[*idx]; 3441 sum1 -= v1[0] * tmp0; 3442 sum2 -= v2[0] * tmp0; 3443 sum3 -= v3[0] * tmp0; 3444 } 3445 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3446 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3447 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3448 break; 3449 case 4: 3450 v2 = a->a + ii[row + 1]; 3451 v3 = a->a + ii[row + 2]; 3452 v4 = a->a + ii[row + 3]; 3453 sum1 = b[row]; 3454 sum2 = b[row + 1]; 3455 sum3 = b[row + 2]; 3456 sum4 = b[row + 3]; 3457 for (n = 0; n < sz - 1; n += 2) { 3458 i1 = idx[0]; 3459 i2 = idx[1]; 3460 idx += 2; 3461 tmp0 = x[i1]; 3462 tmp1 = x[i2]; 3463 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3464 v1 += 2; 3465 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3466 v2 += 2; 3467 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3468 v3 += 2; 3469 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3470 v4 += 2; 3471 } 3472 if (n == sz - 1) { 3473 tmp0 = x[*idx++]; 3474 sum1 -= v1[0] * tmp0; 3475 sum2 -= v2[0] * tmp0; 3476 sum3 -= v3[0] * tmp0; 3477 sum4 -= v4[0] * tmp0; 3478 v1++; 3479 v2++; 3480 v3++; 3481 v4++; 3482 } 3483 t[row] = sum1; 3484 t[row + 1] = sum2; 3485 t[row + 2] = sum3; 3486 t[row + 3] = sum4; 3487 sz = ii[row + 1] - diag[row] - 4; 3488 idx = a->j + diag[row] + 4; 3489 v1 += 4; 3490 v2 += 4; 3491 v3 += 4; 3492 v4 += 4; 3493 for (n = 0; n < sz - 1; n += 2) { 3494 i1 = idx[0]; 3495 i2 = idx[1]; 3496 idx += 2; 3497 tmp0 = x[i1]; 3498 tmp1 = x[i2]; 3499 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3500 v1 += 2; 3501 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3502 v2 += 2; 3503 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3504 v3 += 2; 3505 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3506 v4 += 2; 3507 } 3508 if (n == sz - 1) { 3509 tmp0 = x[*idx]; 3510 sum1 -= v1[0] * tmp0; 3511 sum2 -= v2[0] * tmp0; 3512 sum3 -= v3[0] * tmp0; 3513 sum4 -= v4[0] * tmp0; 3514 } 3515 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3516 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3517 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3518 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3519 break; 3520 case 5: 3521 v2 = a->a + ii[row + 1]; 3522 v3 = a->a + ii[row + 2]; 3523 v4 = a->a + ii[row + 3]; 3524 v5 = a->a + ii[row + 4]; 3525 sum1 = b[row]; 3526 sum2 = b[row + 1]; 3527 sum3 = b[row + 2]; 3528 sum4 = b[row + 3]; 3529 sum5 = b[row + 4]; 3530 for (n = 0; n < sz - 1; n += 2) { 3531 i1 = idx[0]; 3532 i2 = idx[1]; 3533 idx += 2; 3534 tmp0 = x[i1]; 3535 tmp1 = x[i2]; 3536 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3537 v1 += 2; 3538 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3539 v2 += 2; 3540 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3541 v3 += 2; 3542 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3543 v4 += 2; 3544 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3545 v5 += 2; 3546 } 3547 if (n == sz - 1) { 3548 tmp0 = x[*idx++]; 3549 sum1 -= v1[0] * tmp0; 3550 sum2 -= v2[0] * tmp0; 3551 sum3 -= v3[0] * tmp0; 3552 sum4 -= v4[0] * tmp0; 3553 sum5 -= v5[0] * tmp0; 3554 v1++; 3555 v2++; 3556 v3++; 3557 v4++; 3558 v5++; 3559 } 3560 t[row] = sum1; 3561 t[row + 1] = sum2; 3562 t[row + 2] = sum3; 3563 t[row + 3] = sum4; 3564 t[row + 4] = sum5; 3565 sz = ii[row + 1] - diag[row] - 5; 3566 idx = a->j + diag[row] + 5; 3567 v1 += 5; 3568 v2 += 5; 3569 v3 += 5; 3570 v4 += 5; 3571 v5 += 5; 3572 for (n = 0; n < sz - 1; n += 2) { 3573 i1 = idx[0]; 3574 i2 = idx[1]; 3575 idx += 2; 3576 tmp0 = x[i1]; 3577 tmp1 = x[i2]; 3578 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3579 v1 += 2; 3580 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3581 v2 += 2; 3582 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3583 v3 += 2; 3584 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3585 v4 += 2; 3586 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3587 v5 += 2; 3588 } 3589 if (n == sz - 1) { 3590 tmp0 = x[*idx]; 3591 sum1 -= v1[0] * tmp0; 3592 sum2 -= v2[0] * tmp0; 3593 sum3 -= v3[0] * tmp0; 3594 sum4 -= v4[0] * tmp0; 3595 sum5 -= v5[0] * tmp0; 3596 } 3597 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3598 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3599 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3600 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3601 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3602 break; 3603 default: 3604 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3605 } 3606 } 3607 xb = t; 3608 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3609 } else xb = b; 3610 3611 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3612 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3613 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3614 nodesz = sizes[i + 1] - sizes[i]; 3615 ibdiag -= nodesz * nodesz; 3616 3617 /* set RHS */ 3618 if (xb == b) { 3619 /* whole (old way) */ 3620 sz = ii[row + 1] - ii[row]; 3621 idx = a->j + ii[row]; 3622 switch (nodesz) { 3623 case 5: 3624 v5 = a->a + ii[row - 4]; /* fall through */ 3625 case 4: 3626 v4 = a->a + ii[row - 3]; /* fall through */ 3627 case 3: 3628 v3 = a->a + ii[row - 2]; /* fall through */ 3629 case 2: 3630 v2 = a->a + ii[row - 1]; /* fall through */ 3631 case 1: 3632 v1 = a->a + ii[row]; 3633 break; 3634 default: 3635 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3636 } 3637 } else { 3638 /* upper, no diag */ 3639 sz = ii[row + 1] - diag[row] - 1; 3640 idx = a->j + diag[row] + 1; 3641 switch (nodesz) { 3642 case 5: 3643 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3644 case 4: 3645 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3646 case 3: 3647 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3648 case 2: 3649 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3650 case 1: 3651 v1 = a->a + diag[row] + 1; 3652 } 3653 } 3654 /* set sum */ 3655 switch (nodesz) { 3656 case 5: 3657 sum5 = xb[row - 4]; /* fall through */ 3658 case 4: 3659 sum4 = xb[row - 3]; /* fall through */ 3660 case 3: 3661 sum3 = xb[row - 2]; /* fall through */ 3662 case 2: 3663 sum2 = xb[row - 1]; /* fall through */ 3664 case 1: 3665 /* note that sum1 is associated with the last row */ 3666 sum1 = xb[row]; 3667 } 3668 /* do sums */ 3669 for (n = 0; n < sz - 1; n += 2) { 3670 i1 = idx[0]; 3671 i2 = idx[1]; 3672 idx += 2; 3673 tmp0 = x[i1]; 3674 tmp1 = x[i2]; 3675 switch (nodesz) { 3676 case 5: 3677 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3678 v5 += 2; /* fall through */ 3679 case 4: 3680 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3681 v4 += 2; /* fall through */ 3682 case 3: 3683 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3684 v3 += 2; /* fall through */ 3685 case 2: 3686 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3687 v2 += 2; /* fall through */ 3688 case 1: 3689 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3690 v1 += 2; 3691 } 3692 } 3693 /* ragged edge */ 3694 if (n == sz - 1) { 3695 tmp0 = x[*idx]; 3696 switch (nodesz) { 3697 case 5: 3698 sum5 -= *v5 * tmp0; /* fall through */ 3699 case 4: 3700 sum4 -= *v4 * tmp0; /* fall through */ 3701 case 3: 3702 sum3 -= *v3 * tmp0; /* fall through */ 3703 case 2: 3704 sum2 -= *v2 * tmp0; /* fall through */ 3705 case 1: 3706 sum1 -= *v1 * tmp0; 3707 } 3708 } 3709 /* update */ 3710 if (xb == b) { 3711 /* whole (old way) w/ diag */ 3712 switch (nodesz) { 3713 case 5: 3714 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3715 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3716 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3717 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3718 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3719 break; 3720 case 4: 3721 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3722 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3723 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3724 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3725 break; 3726 case 3: 3727 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3728 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3729 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3730 break; 3731 case 2: 3732 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3733 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3734 break; 3735 case 1: 3736 x[row--] += sum1 * (*ibdiag); 3737 break; 3738 } 3739 } else { 3740 /* no diag so set = */ 3741 switch (nodesz) { 3742 case 5: 3743 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3744 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3745 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3746 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3747 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3748 break; 3749 case 4: 3750 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3751 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3752 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3753 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3754 break; 3755 case 3: 3756 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3757 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3758 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3759 break; 3760 case 2: 3761 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3762 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3763 break; 3764 case 1: 3765 x[row--] = sum1 * (*ibdiag); 3766 break; 3767 } 3768 } 3769 } 3770 if (xb == b) { 3771 PetscCall(PetscLogFlops(2.0 * a->nz)); 3772 } else { 3773 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3774 } 3775 } 3776 } 3777 if (flag & SOR_EISENSTAT) { 3778 /* 3779 Apply (U + D)^-1 where D is now the block diagonal 3780 */ 3781 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3782 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3783 nodesz = sizes[i + 1] - sizes[i]; 3784 ibdiag -= nodesz * nodesz; 3785 sz = ii[row + 1] - diag[row] - 1; 3786 v1 = a->a + diag[row] + 1; 3787 idx = a->j + diag[row] + 1; 3788 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3789 switch (nodesz) { 3790 case 1: 3791 3792 sum1 = b[row]; 3793 for (n = 0; n < sz - 1; n += 2) { 3794 i1 = idx[0]; 3795 i2 = idx[1]; 3796 idx += 2; 3797 tmp0 = x[i1]; 3798 tmp1 = x[i2]; 3799 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3800 v1 += 2; 3801 } 3802 3803 if (n == sz - 1) { 3804 tmp0 = x[*idx]; 3805 sum1 -= *v1 * tmp0; 3806 } 3807 x[row] = sum1 * (*ibdiag); 3808 row--; 3809 break; 3810 3811 case 2: 3812 3813 sum1 = b[row]; 3814 sum2 = b[row - 1]; 3815 /* note that sum1 is associated with the second of the two rows */ 3816 v2 = a->a + diag[row - 1] + 2; 3817 for (n = 0; n < sz - 1; n += 2) { 3818 i1 = idx[0]; 3819 i2 = idx[1]; 3820 idx += 2; 3821 tmp0 = x[i1]; 3822 tmp1 = x[i2]; 3823 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3824 v1 += 2; 3825 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3826 v2 += 2; 3827 } 3828 3829 if (n == sz - 1) { 3830 tmp0 = x[*idx]; 3831 sum1 -= *v1 * tmp0; 3832 sum2 -= *v2 * tmp0; 3833 } 3834 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3835 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3836 row -= 2; 3837 break; 3838 case 3: 3839 3840 sum1 = b[row]; 3841 sum2 = b[row - 1]; 3842 sum3 = b[row - 2]; 3843 v2 = a->a + diag[row - 1] + 2; 3844 v3 = a->a + diag[row - 2] + 3; 3845 for (n = 0; n < sz - 1; n += 2) { 3846 i1 = idx[0]; 3847 i2 = idx[1]; 3848 idx += 2; 3849 tmp0 = x[i1]; 3850 tmp1 = x[i2]; 3851 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3852 v1 += 2; 3853 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3854 v2 += 2; 3855 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3856 v3 += 2; 3857 } 3858 3859 if (n == sz - 1) { 3860 tmp0 = x[*idx]; 3861 sum1 -= *v1 * tmp0; 3862 sum2 -= *v2 * tmp0; 3863 sum3 -= *v3 * tmp0; 3864 } 3865 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3866 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3867 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3868 row -= 3; 3869 break; 3870 case 4: 3871 3872 sum1 = b[row]; 3873 sum2 = b[row - 1]; 3874 sum3 = b[row - 2]; 3875 sum4 = b[row - 3]; 3876 v2 = a->a + diag[row - 1] + 2; 3877 v3 = a->a + diag[row - 2] + 3; 3878 v4 = a->a + diag[row - 3] + 4; 3879 for (n = 0; n < sz - 1; n += 2) { 3880 i1 = idx[0]; 3881 i2 = idx[1]; 3882 idx += 2; 3883 tmp0 = x[i1]; 3884 tmp1 = x[i2]; 3885 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3886 v1 += 2; 3887 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3888 v2 += 2; 3889 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3890 v3 += 2; 3891 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3892 v4 += 2; 3893 } 3894 3895 if (n == sz - 1) { 3896 tmp0 = x[*idx]; 3897 sum1 -= *v1 * tmp0; 3898 sum2 -= *v2 * tmp0; 3899 sum3 -= *v3 * tmp0; 3900 sum4 -= *v4 * tmp0; 3901 } 3902 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3903 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3904 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3905 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3906 row -= 4; 3907 break; 3908 case 5: 3909 3910 sum1 = b[row]; 3911 sum2 = b[row - 1]; 3912 sum3 = b[row - 2]; 3913 sum4 = b[row - 3]; 3914 sum5 = b[row - 4]; 3915 v2 = a->a + diag[row - 1] + 2; 3916 v3 = a->a + diag[row - 2] + 3; 3917 v4 = a->a + diag[row - 3] + 4; 3918 v5 = a->a + diag[row - 4] + 5; 3919 for (n = 0; n < sz - 1; n += 2) { 3920 i1 = idx[0]; 3921 i2 = idx[1]; 3922 idx += 2; 3923 tmp0 = x[i1]; 3924 tmp1 = x[i2]; 3925 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3926 v1 += 2; 3927 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3928 v2 += 2; 3929 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3930 v3 += 2; 3931 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3932 v4 += 2; 3933 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3934 v5 += 2; 3935 } 3936 3937 if (n == sz - 1) { 3938 tmp0 = x[*idx]; 3939 sum1 -= *v1 * tmp0; 3940 sum2 -= *v2 * tmp0; 3941 sum3 -= *v3 * tmp0; 3942 sum4 -= *v4 * tmp0; 3943 sum5 -= *v5 * tmp0; 3944 } 3945 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3946 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3947 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3948 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3949 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3950 row -= 5; 3951 break; 3952 default: 3953 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3954 } 3955 } 3956 PetscCall(PetscLogFlops(a->nz)); 3957 3958 /* 3959 t = b - D x where D is the block diagonal 3960 */ 3961 cnt = 0; 3962 for (i = 0, row = 0; i < m; i++) { 3963 nodesz = sizes[i + 1] - sizes[i]; 3964 switch (nodesz) { 3965 case 1: 3966 t[row] = b[row] - bdiag[cnt++] * x[row]; 3967 row++; 3968 break; 3969 case 2: 3970 x1 = x[row]; 3971 x2 = x[row + 1]; 3972 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3973 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3974 t[row] = b[row] - tmp1; 3975 t[row + 1] = b[row + 1] - tmp2; 3976 row += 2; 3977 cnt += 4; 3978 break; 3979 case 3: 3980 x1 = x[row]; 3981 x2 = x[row + 1]; 3982 x3 = x[row + 2]; 3983 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3984 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3985 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3986 t[row] = b[row] - tmp1; 3987 t[row + 1] = b[row + 1] - tmp2; 3988 t[row + 2] = b[row + 2] - tmp3; 3989 row += 3; 3990 cnt += 9; 3991 break; 3992 case 4: 3993 x1 = x[row]; 3994 x2 = x[row + 1]; 3995 x3 = x[row + 2]; 3996 x4 = x[row + 3]; 3997 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3998 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3999 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4000 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4001 t[row] = b[row] - tmp1; 4002 t[row + 1] = b[row + 1] - tmp2; 4003 t[row + 2] = b[row + 2] - tmp3; 4004 t[row + 3] = b[row + 3] - tmp4; 4005 row += 4; 4006 cnt += 16; 4007 break; 4008 case 5: 4009 x1 = x[row]; 4010 x2 = x[row + 1]; 4011 x3 = x[row + 2]; 4012 x4 = x[row + 3]; 4013 x5 = x[row + 4]; 4014 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4015 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4016 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4017 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4018 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4019 t[row] = b[row] - tmp1; 4020 t[row + 1] = b[row + 1] - tmp2; 4021 t[row + 2] = b[row + 2] - tmp3; 4022 t[row + 3] = b[row + 3] - tmp4; 4023 t[row + 4] = b[row + 4] - tmp5; 4024 row += 5; 4025 cnt += 25; 4026 break; 4027 default: 4028 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4029 } 4030 } 4031 PetscCall(PetscLogFlops(m)); 4032 4033 /* 4034 Apply (L + D)^-1 where D is the block diagonal 4035 */ 4036 for (i = 0, row = 0; i < m; i++) { 4037 nodesz = sizes[i + 1] - sizes[i]; 4038 sz = diag[row] - ii[row]; 4039 v1 = a->a + ii[row]; 4040 idx = a->j + ii[row]; 4041 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4042 switch (nodesz) { 4043 case 1: 4044 4045 sum1 = t[row]; 4046 for (n = 0; n < sz - 1; n += 2) { 4047 i1 = idx[0]; 4048 i2 = idx[1]; 4049 idx += 2; 4050 tmp0 = t[i1]; 4051 tmp1 = t[i2]; 4052 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4053 v1 += 2; 4054 } 4055 4056 if (n == sz - 1) { 4057 tmp0 = t[*idx]; 4058 sum1 -= *v1 * tmp0; 4059 } 4060 x[row] += t[row] = sum1 * (*ibdiag++); 4061 row++; 4062 break; 4063 case 2: 4064 v2 = a->a + ii[row + 1]; 4065 sum1 = t[row]; 4066 sum2 = t[row + 1]; 4067 for (n = 0; n < sz - 1; n += 2) { 4068 i1 = idx[0]; 4069 i2 = idx[1]; 4070 idx += 2; 4071 tmp0 = t[i1]; 4072 tmp1 = t[i2]; 4073 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4074 v1 += 2; 4075 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4076 v2 += 2; 4077 } 4078 4079 if (n == sz - 1) { 4080 tmp0 = t[*idx]; 4081 sum1 -= v1[0] * tmp0; 4082 sum2 -= v2[0] * tmp0; 4083 } 4084 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4085 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4086 ibdiag += 4; 4087 row += 2; 4088 break; 4089 case 3: 4090 v2 = a->a + ii[row + 1]; 4091 v3 = a->a + ii[row + 2]; 4092 sum1 = t[row]; 4093 sum2 = t[row + 1]; 4094 sum3 = t[row + 2]; 4095 for (n = 0; n < sz - 1; n += 2) { 4096 i1 = idx[0]; 4097 i2 = idx[1]; 4098 idx += 2; 4099 tmp0 = t[i1]; 4100 tmp1 = t[i2]; 4101 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4102 v1 += 2; 4103 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4104 v2 += 2; 4105 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4106 v3 += 2; 4107 } 4108 4109 if (n == sz - 1) { 4110 tmp0 = t[*idx]; 4111 sum1 -= v1[0] * tmp0; 4112 sum2 -= v2[0] * tmp0; 4113 sum3 -= v3[0] * tmp0; 4114 } 4115 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4116 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4117 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4118 ibdiag += 9; 4119 row += 3; 4120 break; 4121 case 4: 4122 v2 = a->a + ii[row + 1]; 4123 v3 = a->a + ii[row + 2]; 4124 v4 = a->a + ii[row + 3]; 4125 sum1 = t[row]; 4126 sum2 = t[row + 1]; 4127 sum3 = t[row + 2]; 4128 sum4 = t[row + 3]; 4129 for (n = 0; n < sz - 1; n += 2) { 4130 i1 = idx[0]; 4131 i2 = idx[1]; 4132 idx += 2; 4133 tmp0 = t[i1]; 4134 tmp1 = t[i2]; 4135 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4136 v1 += 2; 4137 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4138 v2 += 2; 4139 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4140 v3 += 2; 4141 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4142 v4 += 2; 4143 } 4144 4145 if (n == sz - 1) { 4146 tmp0 = t[*idx]; 4147 sum1 -= v1[0] * tmp0; 4148 sum2 -= v2[0] * tmp0; 4149 sum3 -= v3[0] * tmp0; 4150 sum4 -= v4[0] * tmp0; 4151 } 4152 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4153 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4154 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4155 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4156 ibdiag += 16; 4157 row += 4; 4158 break; 4159 case 5: 4160 v2 = a->a + ii[row + 1]; 4161 v3 = a->a + ii[row + 2]; 4162 v4 = a->a + ii[row + 3]; 4163 v5 = a->a + ii[row + 4]; 4164 sum1 = t[row]; 4165 sum2 = t[row + 1]; 4166 sum3 = t[row + 2]; 4167 sum4 = t[row + 3]; 4168 sum5 = t[row + 4]; 4169 for (n = 0; n < sz - 1; n += 2) { 4170 i1 = idx[0]; 4171 i2 = idx[1]; 4172 idx += 2; 4173 tmp0 = t[i1]; 4174 tmp1 = t[i2]; 4175 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4176 v1 += 2; 4177 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4178 v2 += 2; 4179 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4180 v3 += 2; 4181 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4182 v4 += 2; 4183 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4184 v5 += 2; 4185 } 4186 4187 if (n == sz - 1) { 4188 tmp0 = t[*idx]; 4189 sum1 -= v1[0] * tmp0; 4190 sum2 -= v2[0] * tmp0; 4191 sum3 -= v3[0] * tmp0; 4192 sum4 -= v4[0] * tmp0; 4193 sum5 -= v5[0] * tmp0; 4194 } 4195 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4196 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4197 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4198 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4199 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4200 ibdiag += 25; 4201 row += 5; 4202 break; 4203 default: 4204 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4205 } 4206 } 4207 PetscCall(PetscLogFlops(a->nz)); 4208 } 4209 PetscCall(VecRestoreArray(xx, &x)); 4210 PetscCall(VecRestoreArrayRead(bb, &b)); 4211 PetscFunctionReturn(PETSC_SUCCESS); 4212 } 4213 4214 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4215 { 4216 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4217 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4218 const MatScalar *bdiag = a->inode.bdiag; 4219 const PetscScalar *b; 4220 PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz; 4221 const PetscInt *sizes = a->inode.size_csr; 4222 4223 PetscFunctionBegin; 4224 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4225 PetscCall(VecGetArray(xx, &x)); 4226 PetscCall(VecGetArrayRead(bb, &b)); 4227 cnt = 0; 4228 for (i = 0, row = 0; i < m; i++) { 4229 nodesz = sizes[i + 1] - sizes[i]; 4230 switch (nodesz) { 4231 case 1: 4232 x[row] = b[row] * bdiag[cnt++]; 4233 row++; 4234 break; 4235 case 2: 4236 x1 = b[row]; 4237 x2 = b[row + 1]; 4238 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4239 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4240 x[row++] = tmp1; 4241 x[row++] = tmp2; 4242 cnt += 4; 4243 break; 4244 case 3: 4245 x1 = b[row]; 4246 x2 = b[row + 1]; 4247 x3 = b[row + 2]; 4248 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4249 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4250 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4251 x[row++] = tmp1; 4252 x[row++] = tmp2; 4253 x[row++] = tmp3; 4254 cnt += 9; 4255 break; 4256 case 4: 4257 x1 = b[row]; 4258 x2 = b[row + 1]; 4259 x3 = b[row + 2]; 4260 x4 = b[row + 3]; 4261 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4262 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4263 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4264 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4265 x[row++] = tmp1; 4266 x[row++] = tmp2; 4267 x[row++] = tmp3; 4268 x[row++] = tmp4; 4269 cnt += 16; 4270 break; 4271 case 5: 4272 x1 = b[row]; 4273 x2 = b[row + 1]; 4274 x3 = b[row + 2]; 4275 x4 = b[row + 3]; 4276 x5 = b[row + 4]; 4277 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4278 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4279 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4280 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4281 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4282 x[row++] = tmp1; 4283 x[row++] = tmp2; 4284 x[row++] = tmp3; 4285 x[row++] = tmp4; 4286 x[row++] = tmp5; 4287 cnt += 25; 4288 break; 4289 default: 4290 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4291 } 4292 } 4293 PetscCall(PetscLogFlops(2.0 * cnt)); 4294 PetscCall(VecRestoreArray(xx, &x)); 4295 PetscCall(VecRestoreArrayRead(bb, &b)); 4296 PetscFunctionReturn(PETSC_SUCCESS); 4297 } 4298 4299 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4300 { 4301 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4302 4303 PetscFunctionBegin; 4304 a->inode.node_count = 0; 4305 a->inode.use = PETSC_FALSE; 4306 a->inode.checked = PETSC_FALSE; 4307 a->inode.mat_nonzerostate = -1; 4308 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4309 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4310 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4311 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4312 A->ops->coloringpatch = NULL; 4313 A->ops->multdiagonalblock = NULL; 4314 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4315 PetscFunctionReturn(PETSC_SUCCESS); 4316 } 4317 4318 /* 4319 samestructure indicates that the matrix has not changed its nonzero structure so we 4320 do not need to recompute the inodes 4321 */ 4322 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4323 { 4324 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4325 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4326 PetscBool flag; 4327 const PetscInt *idx, *idy, *ii; 4328 4329 PetscFunctionBegin; 4330 if (!a->inode.use) { 4331 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4332 PetscCall(PetscFree(a->inode.size_csr)); 4333 PetscFunctionReturn(PETSC_SUCCESS); 4334 } 4335 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 4336 4337 m = A->rmap->n; 4338 if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr)); 4339 ns = a->inode.size_csr; 4340 ns[0] = 0; 4341 4342 i = 0; 4343 node_count = 0; 4344 idx = a->j; 4345 ii = a->i; 4346 if (idx) { 4347 while (i < m) { /* For each row */ 4348 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4349 /* Limits the number of elements in a node to 'a->inode.limit' */ 4350 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4351 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4352 if (nzy != nzx) break; 4353 idy += nzx; /* Same nonzero pattern */ 4354 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4355 if (!flag) break; 4356 } 4357 ns[node_count + 1] = ns[node_count] + blk_size; 4358 node_count++; 4359 idx += blk_size * nzx; 4360 i = j; 4361 } 4362 } 4363 /* If not enough inodes found,, do not use inode version of the routines */ 4364 if (!m || !idx || node_count > .8 * m) { 4365 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4366 PetscCall(PetscFree(a->inode.size_csr)); 4367 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4368 } else { 4369 if (!A->factortype) { 4370 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4371 if (A->rmap->n == A->cmap->n) { 4372 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4373 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4374 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4375 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4376 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4377 } 4378 } else { 4379 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4380 } 4381 a->inode.node_count = node_count; 4382 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4383 } 4384 a->inode.checked = PETSC_TRUE; 4385 a->inode.mat_nonzerostate = A->nonzerostate; 4386 PetscFunctionReturn(PETSC_SUCCESS); 4387 } 4388 4389 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4390 { 4391 Mat B = *C; 4392 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4393 PetscInt m = A->rmap->n; 4394 4395 PetscFunctionBegin; 4396 c->inode.use = a->inode.use; 4397 c->inode.limit = a->inode.limit; 4398 c->inode.max_limit = a->inode.max_limit; 4399 c->inode.checked = PETSC_FALSE; 4400 c->inode.size_csr = NULL; 4401 c->inode.node_count = 0; 4402 c->inode.ibdiagvalid = PETSC_FALSE; 4403 c->inode.ibdiag = NULL; 4404 c->inode.bdiag = NULL; 4405 c->inode.mat_nonzerostate = -1; 4406 if (a->inode.use) { 4407 if (a->inode.checked && a->inode.size_csr) { 4408 PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr)); 4409 PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1)); 4410 4411 c->inode.checked = PETSC_TRUE; 4412 c->inode.node_count = a->inode.node_count; 4413 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4414 } 4415 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4416 if (!B->factortype) { 4417 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4418 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4419 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4420 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4421 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4422 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4423 } else { 4424 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4425 } 4426 } 4427 PetscFunctionReturn(PETSC_SUCCESS); 4428 } 4429 4430 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4431 { 4432 PetscInt k; 4433 const PetscInt *vi; 4434 4435 PetscFunctionBegin; 4436 vi = aj + ai[row]; 4437 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4438 vi = aj + adiag[row]; 4439 cols[nzl] = vi[0]; 4440 vi = aj + adiag[row + 1] + 1; 4441 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4442 PetscFunctionReturn(PETSC_SUCCESS); 4443 } 4444 /* 4445 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4446 Modified from MatSeqAIJCheckInode(). 4447 4448 Input Parameters: 4449 . Mat A - ILU or LU matrix factor 4450 4451 */ 4452 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4453 { 4454 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4455 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4456 PetscInt *cols1, *cols2, *ns; 4457 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4458 PetscBool flag; 4459 4460 PetscFunctionBegin; 4461 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4462 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4463 4464 m = A->rmap->n; 4465 if (a->inode.size_csr) ns = a->inode.size_csr; 4466 else PetscCall(PetscMalloc1(m + 1, &ns)); 4467 ns[0] = 0; 4468 4469 i = 0; 4470 node_count = 0; 4471 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4472 while (i < m) { /* For each row */ 4473 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4474 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4475 nzx = nzl1 + nzu1 + 1; 4476 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4477 4478 /* Limits the number of elements in a node to 'a->inode.limit' */ 4479 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4480 nzl2 = ai[j + 1] - ai[j]; 4481 nzu2 = adiag[j] - adiag[j + 1] - 1; 4482 nzy = nzl2 + nzu2 + 1; 4483 if (nzy != nzx) break; 4484 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4485 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4486 if (!flag) break; 4487 } 4488 ns[node_count + 1] = ns[node_count] + blk_size; 4489 node_count++; 4490 i = j; 4491 } 4492 PetscCall(PetscFree2(cols1, cols2)); 4493 /* If not enough inodes found,, do not use inode version of the routines */ 4494 if (!m || node_count > .8 * m) { 4495 PetscCall(PetscFree(ns)); 4496 4497 a->inode.node_count = 0; 4498 a->inode.size_csr = NULL; 4499 a->inode.use = PETSC_FALSE; 4500 4501 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4502 } else { 4503 A->ops->mult = NULL; 4504 A->ops->sor = NULL; 4505 A->ops->multadd = NULL; 4506 A->ops->getrowij = NULL; 4507 A->ops->restorerowij = NULL; 4508 A->ops->getcolumnij = NULL; 4509 A->ops->restorecolumnij = NULL; 4510 A->ops->coloringpatch = NULL; 4511 A->ops->multdiagonalblock = NULL; 4512 a->inode.node_count = node_count; 4513 a->inode.size_csr = ns; 4514 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4515 } 4516 a->inode.checked = PETSC_TRUE; 4517 PetscFunctionReturn(PETSC_SUCCESS); 4518 } 4519 4520 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4521 { 4522 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4523 4524 PetscFunctionBegin; 4525 a->inode.ibdiagvalid = PETSC_FALSE; 4526 PetscFunctionReturn(PETSC_SUCCESS); 4527 } 4528 4529 /* 4530 This is really ugly. if inodes are used this replaces the 4531 permutations with ones that correspond to rows/cols of the matrix 4532 rather than inode blocks 4533 */ 4534 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4535 { 4536 PetscFunctionBegin; 4537 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4538 PetscFunctionReturn(PETSC_SUCCESS); 4539 } 4540 4541 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4542 { 4543 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4544 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4545 const PetscInt *ridx, *cidx; 4546 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx; 4547 PetscInt nslim_col, *ns_col; 4548 IS ris = *rperm, cis = *cperm; 4549 4550 PetscFunctionBegin; 4551 if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4552 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4553 4554 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4555 PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns)); 4556 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4557 4558 PetscCall(ISGetIndices(ris, &ridx)); 4559 PetscCall(ISGetIndices(cis, &cidx)); 4560 4561 /* Form the inode structure for the rows of permuted matrix using inv perm*/ 4562 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]); 4563 4564 /* Construct the permutations for rows*/ 4565 for (i = 0, row = 0; i < nslim_row; ++i) { 4566 indx = ridx[i]; 4567 start_val = tns[indx]; 4568 end_val = tns[indx + 1]; 4569 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4570 } 4571 4572 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4573 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]); 4574 4575 /* Construct permutations for columns */ 4576 for (i = 0, col = 0; i < nslim_col; ++i) { 4577 indx = cidx[i]; 4578 start_val = tns[indx]; 4579 end_val = tns[indx + 1]; 4580 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4581 } 4582 4583 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4584 PetscCall(ISSetPermutation(*rperm)); 4585 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4586 PetscCall(ISSetPermutation(*cperm)); 4587 4588 PetscCall(ISRestoreIndices(ris, &ridx)); 4589 PetscCall(ISRestoreIndices(cis, &cidx)); 4590 4591 PetscCall(PetscFree(ns_col)); 4592 PetscCall(PetscFree2(permr, permc)); 4593 PetscCall(ISDestroy(&cis)); 4594 PetscCall(ISDestroy(&ris)); 4595 PetscCall(PetscFree(tns)); 4596 PetscFunctionReturn(PETSC_SUCCESS); 4597 } 4598 4599 /*@C 4600 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4601 4602 Not Collective 4603 4604 Input Parameter: 4605 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4606 4607 Output Parameters: 4608 + node_count - no of inodes present in the matrix. 4609 . sizes - an array of size `node_count`, with the sizes of each inode. 4610 - limit - the max size used to generate the inodes. 4611 4612 Level: advanced 4613 4614 Note: 4615 It should be called after the matrix is assembled. 4616 The contents of the sizes[] array should not be changed. 4617 `NULL` may be passed for information not needed 4618 4619 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 4620 @*/ 4621 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4622 { 4623 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4624 4625 PetscFunctionBegin; 4626 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4627 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4628 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4629 PetscFunctionReturn(PETSC_SUCCESS); 4630 } 4631 4632 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4633 { 4634 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4635 4636 PetscFunctionBegin; 4637 if (node_count) *node_count = a->inode.node_count; 4638 if (sizes) *sizes = a->inode.size_csr; 4639 if (limit) *limit = a->inode.limit; 4640 PetscFunctionReturn(PETSC_SUCCESS); 4641 } 4642