1 /* 2 This file provides high performance routines for the Inode format (compressed sparse row) 3 by taking advantage of rows with identical nonzero structure (I-nodes). 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #if defined(PETSC_HAVE_XMMINTRIN_H) 7 #include <xmmintrin.h> 8 #endif 9 10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 13 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 14 15 PetscFunctionBegin; 16 n = A->cmap->n; 17 m = A->rmap->n; 18 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 19 ns_row = a->inode.size_csr; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++); 24 for (; count + 1 < n; count++, i++); 25 if (count < n) i++; 26 *size = i; 27 PetscFunctionReturn(PETSC_SUCCESS); 28 } 29 PetscCall(PetscMalloc1(n + 1, &ns_col)); 30 ns_col[0] = 0; 31 32 /* Use the same row structure wherever feasible. */ 33 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1]; 34 35 /* if m < n; pad up the remainder with inode_limit */ 36 for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1; 37 /* The last node is the odd ball. pad it up with the remaining rows; */ 38 if (count < n) { 39 ns_col[i + 1] = ns_col[i] + (n - count); 40 i++; 41 } else if (count > n) { 42 /* Adjust for the over estimation */ 43 ns_col[i] += n - count; 44 } 45 *size = i; 46 *ns = ns_col; 47 PetscFunctionReturn(PETSC_SUCCESS); 48 } 49 50 /* 51 This builds symmetric version of nonzero structure, 52 */ 53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 54 { 55 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 56 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 57 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2; 58 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 59 60 PetscFunctionBegin; 61 nslim_row = a->inode.node_count; 62 m = A->rmap->n; 63 n = A->cmap->n; 64 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 65 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 66 67 /* Use the row_inode as column_inode */ 68 nslim_col = nslim_row; 69 ns_col = ns_row; 70 71 /* allocate space for reformatted inode structure */ 72 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 73 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]); 74 75 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 76 nsz = ns_col[i1 + 1] - ns_col[i1]; 77 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 78 } 79 /* allocate space for row pointers */ 80 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 81 *iia = ia; 82 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 83 84 /* determine the number of columns in each row */ 85 ia[0] = oshift; 86 for (i1 = 0; i1 < nslim_row; i1++) { 87 row = ns_row[i1]; 88 j = aj + ai[row] + ishift; 89 jmax = aj + ai[row + 1] + ishift; 90 if (j == jmax) continue; /* empty row */ 91 col = *j++ + ishift; 92 i2 = tvc[col]; 93 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 94 ia[i1 + 1]++; 95 ia[i2 + 1]++; 96 i2++; /* Start col of next node */ 97 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 98 i2 = tvc[col]; 99 } 100 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 101 } 102 103 /* shift ia[i] to point to next row */ 104 for (i1 = 1; i1 < nslim_row + 1; i1++) { 105 row = ia[i1 - 1]; 106 ia[i1] += row; 107 work[i1 - 1] = row - oshift; 108 } 109 110 /* allocate space for column pointers */ 111 nz = ia[nslim_row] + (!ishift); 112 PetscCall(PetscMalloc1(nz, &ja)); 113 *jja = ja; 114 115 /* loop over lower triangular part putting into ja */ 116 for (i1 = 0; i1 < nslim_row; i1++) { 117 row = ns_row[i1]; 118 j = aj + ai[row] + ishift; 119 jmax = aj + ai[row + 1] + ishift; 120 if (j == jmax) continue; /* empty row */ 121 col = *j++ + ishift; 122 i2 = tvc[col]; 123 while (i2 < i1 && j < jmax) { 124 ja[work[i2]++] = i1 + oshift; 125 ja[work[i1]++] = i2 + oshift; 126 ++i2; 127 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 128 i2 = tvc[col]; 129 } 130 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 131 } 132 PetscCall(PetscFree(work)); 133 PetscCall(PetscFree2(tns, tvc)); 134 PetscFunctionReturn(PETSC_SUCCESS); 135 } 136 137 /* 138 This builds nonsymmetric version of nonzero structure, 139 */ 140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 141 { 142 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 143 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 144 PetscInt *tns, *tvc, nsz, i1, i2; 145 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr; 146 147 PetscFunctionBegin; 148 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 149 nslim_row = a->inode.node_count; 150 n = A->cmap->n; 151 152 /* Create The column_inode for this matrix */ 153 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 154 155 /* allocate space for reformatted column_inode structure */ 156 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 157 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 158 159 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 160 nsz = ns_col[i1 + 1] - ns_col[i1]; 161 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 162 } 163 /* allocate space for row pointers */ 164 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 165 *iia = ia; 166 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 167 168 /* determine the number of columns in each row */ 169 ia[0] = oshift; 170 for (i1 = 0; i1 < nslim_row; i1++) { 171 row = ns_row[i1]; 172 j = aj + ai[row] + ishift; 173 nz = ai[row + 1] - ai[row]; 174 if (!nz) continue; /* empty row */ 175 col = *j++ + ishift; 176 i2 = tvc[col]; 177 while (nz-- > 0) { /* off-diagonal elements */ 178 ia[i1 + 1]++; 179 i2++; /* Start col of next node */ 180 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 181 if (nz > 0) i2 = tvc[col]; 182 } 183 } 184 185 /* shift ia[i] to point to next row */ 186 for (i1 = 1; i1 < nslim_row + 1; i1++) { 187 row = ia[i1 - 1]; 188 ia[i1] += row; 189 work[i1 - 1] = row - oshift; 190 } 191 192 /* allocate space for column pointers */ 193 nz = ia[nslim_row] + (!ishift); 194 PetscCall(PetscMalloc1(nz, &ja)); 195 *jja = ja; 196 197 /* loop over matrix putting into ja */ 198 for (i1 = 0; i1 < nslim_row; i1++) { 199 row = ns_row[i1]; 200 j = aj + ai[row] + ishift; 201 nz = ai[row + 1] - ai[row]; 202 if (!nz) continue; /* empty row */ 203 col = *j++ + ishift; 204 i2 = tvc[col]; 205 while (nz-- > 0) { 206 ja[work[i1]++] = i2 + oshift; 207 ++i2; 208 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 209 if (nz > 0) i2 = tvc[col]; 210 } 211 } 212 PetscCall(PetscFree(ns_col)); 213 PetscCall(PetscFree(work)); 214 PetscCall(PetscFree2(tns, tvc)); 215 PetscFunctionReturn(PETSC_SUCCESS); 216 } 217 218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 219 { 220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 221 222 PetscFunctionBegin; 223 if (n) *n = a->inode.node_count; 224 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 225 if (!blockcompressed) { 226 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 227 } else if (symmetric) { 228 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 229 } else { 230 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 231 } 232 PetscFunctionReturn(PETSC_SUCCESS); 233 } 234 235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 236 { 237 PetscFunctionBegin; 238 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 239 240 if (!blockcompressed) { 241 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 242 } else { 243 PetscCall(PetscFree(*ia)); 244 PetscCall(PetscFree(*ja)); 245 } 246 PetscFunctionReturn(PETSC_SUCCESS); 247 } 248 249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 250 { 251 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 252 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 253 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j; 254 255 PetscFunctionBegin; 256 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 257 nslim_row = a->inode.node_count; 258 n = A->cmap->n; 259 260 /* Create The column_inode for this matrix */ 261 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 262 263 /* allocate space for reformatted column_inode structure */ 264 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 265 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 266 267 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 268 nsz = ns_col[i1 + 1] - ns_col[i1]; 269 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 270 } 271 /* allocate space for column pointers */ 272 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 273 *iia = ia; 274 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 275 276 /* determine the number of columns in each row */ 277 ia[0] = oshift; 278 for (i1 = 0; i1 < nslim_row; i1++) { 279 row = ns_row[i1]; 280 j = aj + ai[row] + ishift; 281 col = *j++ + ishift; 282 i2 = tvc[col]; 283 nz = ai[row + 1] - ai[row]; 284 while (nz-- > 0) { /* off-diagonal elements */ 285 /* ia[i1+1]++; */ 286 ia[i2 + 1]++; 287 i2++; 288 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 289 if (nz > 0) i2 = tvc[col]; 290 } 291 } 292 293 /* shift ia[i] to point to next col */ 294 for (i1 = 1; i1 < nslim_col + 1; i1++) { 295 col = ia[i1 - 1]; 296 ia[i1] += col; 297 work[i1 - 1] = col - oshift; 298 } 299 300 /* allocate space for column pointers */ 301 nz = ia[nslim_col] + (!ishift); 302 PetscCall(PetscMalloc1(nz, &ja)); 303 *jja = ja; 304 305 /* loop over matrix putting into ja */ 306 for (i1 = 0; i1 < nslim_row; i1++) { 307 row = ns_row[i1]; 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row + 1] - ai[row]; 312 while (nz-- > 0) { 313 /* ja[work[i1]++] = i2 + oshift; */ 314 ja[work[i2]++] = i1 + oshift; 315 i2++; 316 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 PetscCall(PetscFree(ns_col)); 321 PetscCall(PetscFree(work)); 322 PetscCall(PetscFree2(tns, tvc)); 323 PetscFunctionReturn(PETSC_SUCCESS); 324 } 325 326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 327 { 328 PetscFunctionBegin; 329 PetscCall(MatCreateColInode_Private(A, n, NULL)); 330 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 331 332 if (!blockcompressed) { 333 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 334 } else if (symmetric) { 335 /* Since the indices are symmetric it doesn't matter */ 336 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 337 } else { 338 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 339 } 340 PetscFunctionReturn(PETSC_SUCCESS); 341 } 342 343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 344 { 345 PetscFunctionBegin; 346 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 347 if (!blockcompressed) { 348 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 349 } else { 350 PetscCall(PetscFree(*ia)); 351 PetscCall(PetscFree(*ja)); 352 } 353 PetscFunctionReturn(PETSC_SUCCESS); 354 } 355 356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 357 { 358 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 359 PetscScalar *y; 360 const PetscScalar *x; 361 PetscInt row, node_max, nonzerorow = 0; 362 PetscInt *ns; 363 364 PetscFunctionBegin; 365 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 366 node_max = a->inode.node_count; 367 ns = a->inode.size_csr; /* Node Size array */ 368 PetscCall(VecGetArrayRead(xx, &x)); 369 PetscCall(VecGetArray(yy, &y)); 370 371 PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow)) 372 for (PetscInt i = 0; i < node_max; ++i) { 373 PetscInt i1, i2, nsz, n, sz; 374 const MatScalar *v1, *v2, *v3, *v4, *v5; 375 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 376 const PetscInt *idx; 377 378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 379 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 380 #endif 381 row = ns[i]; 382 nsz = ns[i + 1] - ns[i]; 383 n = a->i[row + 1] - a->i[row]; 384 nonzerorow += (n > 0) * nsz; 385 386 idx = &a->j[a->i[row]]; 387 v1 = &a->a[a->i[row]]; 388 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 389 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 390 sz = n; /* No of non zeros in this row */ 391 /* Switch on the size of Node */ 392 switch (nsz) { /* Each loop in 'case' is unrolled */ 393 case 1: 394 sum1 = 0.; 395 396 for (n = 0; n < sz - 1; n += 2) { 397 i1 = idx[0]; /* The instructions are ordered to */ 398 i2 = idx[1]; /* make the compiler's job easy */ 399 idx += 2; 400 tmp0 = x[i1]; 401 tmp1 = x[i2]; 402 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 403 v1 += 2; 404 } 405 406 if (n == sz - 1) { /* Take care of the last nonzero */ 407 tmp0 = x[*idx++]; 408 sum1 += *v1++ * tmp0; 409 } 410 y[row++] = sum1; 411 break; 412 case 2: 413 sum1 = 0.; 414 sum2 = 0.; 415 v2 = v1 + n; 416 417 for (n = 0; n < sz - 1; n += 2) { 418 i1 = idx[0]; 419 i2 = idx[1]; 420 idx += 2; 421 tmp0 = x[i1]; 422 tmp1 = x[i2]; 423 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 424 v1 += 2; 425 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 426 v2 += 2; 427 } 428 if (n == sz - 1) { 429 tmp0 = x[*idx++]; 430 sum1 += *v1++ * tmp0; 431 sum2 += *v2++ * tmp0; 432 } 433 y[row++] = sum1; 434 y[row++] = sum2; 435 v1 = v2; /* Since the next block to be processed starts there*/ 436 idx += sz; 437 break; 438 case 3: 439 sum1 = 0.; 440 sum2 = 0.; 441 sum3 = 0.; 442 v2 = v1 + n; 443 v3 = v2 + n; 444 445 for (n = 0; n < sz - 1; n += 2) { 446 i1 = idx[0]; 447 i2 = idx[1]; 448 idx += 2; 449 tmp0 = x[i1]; 450 tmp1 = x[i2]; 451 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 452 v1 += 2; 453 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 454 v2 += 2; 455 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 456 v3 += 2; 457 } 458 if (n == sz - 1) { 459 tmp0 = x[*idx++]; 460 sum1 += *v1++ * tmp0; 461 sum2 += *v2++ * tmp0; 462 sum3 += *v3++ * tmp0; 463 } 464 y[row++] = sum1; 465 y[row++] = sum2; 466 y[row++] = sum3; 467 v1 = v3; /* Since the next block to be processed starts there*/ 468 idx += 2 * sz; 469 break; 470 case 4: 471 sum1 = 0.; 472 sum2 = 0.; 473 sum3 = 0.; 474 sum4 = 0.; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 v4 = v3 + n; 478 479 for (n = 0; n < sz - 1; n += 2) { 480 i1 = idx[0]; 481 i2 = idx[1]; 482 idx += 2; 483 tmp0 = x[i1]; 484 tmp1 = x[i2]; 485 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 486 v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 488 v2 += 2; 489 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 490 v3 += 2; 491 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 492 v4 += 2; 493 } 494 if (n == sz - 1) { 495 tmp0 = x[*idx++]; 496 sum1 += *v1++ * tmp0; 497 sum2 += *v2++ * tmp0; 498 sum3 += *v3++ * tmp0; 499 sum4 += *v4++ * tmp0; 500 } 501 y[row++] = sum1; 502 y[row++] = sum2; 503 y[row++] = sum3; 504 y[row++] = sum4; 505 v1 = v4; /* Since the next block to be processed starts there*/ 506 idx += 3 * sz; 507 break; 508 case 5: 509 sum1 = 0.; 510 sum2 = 0.; 511 sum3 = 0.; 512 sum4 = 0.; 513 sum5 = 0.; 514 v2 = v1 + n; 515 v3 = v2 + n; 516 v4 = v3 + n; 517 v5 = v4 + n; 518 519 for (n = 0; n < sz - 1; n += 2) { 520 i1 = idx[0]; 521 i2 = idx[1]; 522 idx += 2; 523 tmp0 = x[i1]; 524 tmp1 = x[i2]; 525 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 526 v1 += 2; 527 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 528 v2 += 2; 529 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 530 v3 += 2; 531 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 532 v4 += 2; 533 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 534 v5 += 2; 535 } 536 if (n == sz - 1) { 537 tmp0 = x[*idx++]; 538 sum1 += *v1++ * tmp0; 539 sum2 += *v2++ * tmp0; 540 sum3 += *v3++ * tmp0; 541 sum4 += *v4++ * tmp0; 542 sum5 += *v5++ * tmp0; 543 } 544 y[row++] = sum1; 545 y[row++] = sum2; 546 y[row++] = sum3; 547 y[row++] = sum4; 548 y[row++] = sum5; 549 v1 = v5; /* Since the next block to be processed starts there */ 550 idx += 4 * sz; 551 break; 552 default: 553 SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz); 554 } 555 } 556 PetscCall(VecRestoreArrayRead(xx, &x)); 557 PetscCall(VecRestoreArray(yy, &y)); 558 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 559 PetscFunctionReturn(PETSC_SUCCESS); 560 } 561 562 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 563 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 564 { 565 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 566 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 567 const MatScalar *v1, *v2, *v3, *v4, *v5; 568 const PetscScalar *x; 569 PetscScalar *y, *z, *zt; 570 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 571 const PetscInt *idx, *ns, *ii; 572 573 PetscFunctionBegin; 574 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 575 node_max = a->inode.node_count; 576 ns = a->inode.size_csr; /* Node Size array */ 577 578 PetscCall(VecGetArrayRead(xx, &x)); 579 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 580 zt = z; 581 582 idx = a->j; 583 v1 = a->a; 584 ii = a->i; 585 586 for (i = 0; i < node_max; ++i) { 587 row = ns[i]; 588 nsz = ns[i + 1] - ns[i]; 589 n = ii[1] - ii[0]; 590 ii += nsz; 591 sz = n; /* No of non zeros in this row */ 592 /* Switch on the size of Node */ 593 switch (nsz) { /* Each loop in 'case' is unrolled */ 594 case 1: 595 sum1 = *zt++; 596 597 for (n = 0; n < sz - 1; n += 2) { 598 i1 = idx[0]; /* The instructions are ordered to */ 599 i2 = idx[1]; /* make the compiler's job easy */ 600 idx += 2; 601 tmp0 = x[i1]; 602 tmp1 = x[i2]; 603 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 604 v1 += 2; 605 } 606 607 if (n == sz - 1) { /* Take care of the last nonzero */ 608 tmp0 = x[*idx++]; 609 sum1 += *v1++ * tmp0; 610 } 611 y[row++] = sum1; 612 break; 613 case 2: 614 sum1 = *zt++; 615 sum2 = *zt++; 616 v2 = v1 + n; 617 618 for (n = 0; n < sz - 1; n += 2) { 619 i1 = idx[0]; 620 i2 = idx[1]; 621 idx += 2; 622 tmp0 = x[i1]; 623 tmp1 = x[i2]; 624 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 625 v1 += 2; 626 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 627 v2 += 2; 628 } 629 if (n == sz - 1) { 630 tmp0 = x[*idx++]; 631 sum1 += *v1++ * tmp0; 632 sum2 += *v2++ * tmp0; 633 } 634 y[row++] = sum1; 635 y[row++] = sum2; 636 v1 = v2; /* Since the next block to be processed starts there*/ 637 idx += sz; 638 break; 639 case 3: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 sum3 = *zt++; 643 v2 = v1 + n; 644 v3 = v2 + n; 645 646 for (n = 0; n < sz - 1; n += 2) { 647 i1 = idx[0]; 648 i2 = idx[1]; 649 idx += 2; 650 tmp0 = x[i1]; 651 tmp1 = x[i2]; 652 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 653 v1 += 2; 654 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 655 v2 += 2; 656 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 657 v3 += 2; 658 } 659 if (n == sz - 1) { 660 tmp0 = x[*idx++]; 661 sum1 += *v1++ * tmp0; 662 sum2 += *v2++ * tmp0; 663 sum3 += *v3++ * tmp0; 664 } 665 y[row++] = sum1; 666 y[row++] = sum2; 667 y[row++] = sum3; 668 v1 = v3; /* Since the next block to be processed starts there*/ 669 idx += 2 * sz; 670 break; 671 case 4: 672 sum1 = *zt++; 673 sum2 = *zt++; 674 sum3 = *zt++; 675 sum4 = *zt++; 676 v2 = v1 + n; 677 v3 = v2 + n; 678 v4 = v3 + n; 679 680 for (n = 0; n < sz - 1; n += 2) { 681 i1 = idx[0]; 682 i2 = idx[1]; 683 idx += 2; 684 tmp0 = x[i1]; 685 tmp1 = x[i2]; 686 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 687 v1 += 2; 688 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 689 v2 += 2; 690 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 691 v3 += 2; 692 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 693 v4 += 2; 694 } 695 if (n == sz - 1) { 696 tmp0 = x[*idx++]; 697 sum1 += *v1++ * tmp0; 698 sum2 += *v2++ * tmp0; 699 sum3 += *v3++ * tmp0; 700 sum4 += *v4++ * tmp0; 701 } 702 y[row++] = sum1; 703 y[row++] = sum2; 704 y[row++] = sum3; 705 y[row++] = sum4; 706 v1 = v4; /* Since the next block to be processed starts there*/ 707 idx += 3 * sz; 708 break; 709 case 5: 710 sum1 = *zt++; 711 sum2 = *zt++; 712 sum3 = *zt++; 713 sum4 = *zt++; 714 sum5 = *zt++; 715 v2 = v1 + n; 716 v3 = v2 + n; 717 v4 = v3 + n; 718 v5 = v4 + n; 719 720 for (n = 0; n < sz - 1; n += 2) { 721 i1 = idx[0]; 722 i2 = idx[1]; 723 idx += 2; 724 tmp0 = x[i1]; 725 tmp1 = x[i2]; 726 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 727 v1 += 2; 728 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 729 v2 += 2; 730 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 731 v3 += 2; 732 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 733 v4 += 2; 734 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 735 v5 += 2; 736 } 737 if (n == sz - 1) { 738 tmp0 = x[*idx++]; 739 sum1 += *v1++ * tmp0; 740 sum2 += *v2++ * tmp0; 741 sum3 += *v3++ * tmp0; 742 sum4 += *v4++ * tmp0; 743 sum5 += *v5++ * tmp0; 744 } 745 y[row++] = sum1; 746 y[row++] = sum2; 747 y[row++] = sum3; 748 y[row++] = sum4; 749 y[row++] = sum5; 750 v1 = v5; /* Since the next block to be processed starts there */ 751 idx += 4 * sz; 752 break; 753 default: 754 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 755 } 756 } 757 PetscCall(VecRestoreArrayRead(xx, &x)); 758 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 759 PetscCall(PetscLogFlops(2.0 * a->nz)); 760 PetscFunctionReturn(PETSC_SUCCESS); 761 } 762 763 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 764 { 765 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 766 IS iscol = a->col, isrow = a->row; 767 const PetscInt *r, *c, *rout, *cout; 768 PetscInt i, j, n = A->rmap->n, nz; 769 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 770 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 771 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 772 PetscScalar sum1, sum2, sum3, sum4, sum5; 773 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 774 const PetscScalar *b; 775 776 PetscFunctionBegin; 777 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 778 node_max = a->inode.node_count; 779 ns = a->inode.size_csr; /* Node Size array */ 780 781 PetscCall(VecGetArrayRead(bb, &b)); 782 PetscCall(VecGetArrayWrite(xx, &x)); 783 tmp = a->solve_work; 784 785 PetscCall(ISGetIndices(isrow, &rout)); 786 r = rout; 787 PetscCall(ISGetIndices(iscol, &cout)); 788 c = cout + (n - 1); 789 790 /* forward solve the lower triangular */ 791 tmps = tmp; 792 aa = a_a; 793 aj = a_j; 794 ad = a->diag; 795 796 for (i = 0, row = 0; i < node_max; ++i) { 797 row = ns[i]; 798 nsz = ns[i + 1] - ns[i]; 799 aii = ai[row]; 800 v1 = aa + aii; 801 vi = aj + aii; 802 nz = ad[row] - aii; 803 if (i < node_max - 1) { 804 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 805 * but our indexing to determine its size could. */ 806 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 807 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 808 PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 809 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 810 } 811 812 switch (nsz) { /* Each loop in 'case' is unrolled */ 813 case 1: 814 sum1 = b[*r++]; 815 for (j = 0; j < nz - 1; j += 2) { 816 i0 = vi[0]; 817 i1 = vi[1]; 818 vi += 2; 819 tmp0 = tmps[i0]; 820 tmp1 = tmps[i1]; 821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 822 v1 += 2; 823 } 824 if (j == nz - 1) { 825 tmp0 = tmps[*vi++]; 826 sum1 -= *v1++ * tmp0; 827 } 828 tmp[row++] = sum1; 829 break; 830 case 2: 831 sum1 = b[*r++]; 832 sum2 = b[*r++]; 833 v2 = aa + ai[row + 1]; 834 835 for (j = 0; j < nz - 1; j += 2) { 836 i0 = vi[0]; 837 i1 = vi[1]; 838 vi += 2; 839 tmp0 = tmps[i0]; 840 tmp1 = tmps[i1]; 841 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 842 v1 += 2; 843 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 844 v2 += 2; 845 } 846 if (j == nz - 1) { 847 tmp0 = tmps[*vi++]; 848 sum1 -= *v1++ * tmp0; 849 sum2 -= *v2++ * tmp0; 850 } 851 sum2 -= *v2++ * sum1; 852 tmp[row++] = sum1; 853 tmp[row++] = sum2; 854 break; 855 case 3: 856 sum1 = b[*r++]; 857 sum2 = b[*r++]; 858 sum3 = b[*r++]; 859 v2 = aa + ai[row + 1]; 860 v3 = aa + ai[row + 2]; 861 862 for (j = 0; j < nz - 1; j += 2) { 863 i0 = vi[0]; 864 i1 = vi[1]; 865 vi += 2; 866 tmp0 = tmps[i0]; 867 tmp1 = tmps[i1]; 868 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 869 v1 += 2; 870 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 871 v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 873 v3 += 2; 874 } 875 if (j == nz - 1) { 876 tmp0 = tmps[*vi++]; 877 sum1 -= *v1++ * tmp0; 878 sum2 -= *v2++ * tmp0; 879 sum3 -= *v3++ * tmp0; 880 } 881 sum2 -= *v2++ * sum1; 882 sum3 -= *v3++ * sum1; 883 sum3 -= *v3++ * sum2; 884 885 tmp[row++] = sum1; 886 tmp[row++] = sum2; 887 tmp[row++] = sum3; 888 break; 889 890 case 4: 891 sum1 = b[*r++]; 892 sum2 = b[*r++]; 893 sum3 = b[*r++]; 894 sum4 = b[*r++]; 895 v2 = aa + ai[row + 1]; 896 v3 = aa + ai[row + 2]; 897 v4 = aa + ai[row + 3]; 898 899 for (j = 0; j < nz - 1; j += 2) { 900 i0 = vi[0]; 901 i1 = vi[1]; 902 vi += 2; 903 tmp0 = tmps[i0]; 904 tmp1 = tmps[i1]; 905 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 906 v1 += 2; 907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 908 v2 += 2; 909 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 910 v3 += 2; 911 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 912 v4 += 2; 913 } 914 if (j == nz - 1) { 915 tmp0 = tmps[*vi++]; 916 sum1 -= *v1++ * tmp0; 917 sum2 -= *v2++ * tmp0; 918 sum3 -= *v3++ * tmp0; 919 sum4 -= *v4++ * tmp0; 920 } 921 sum2 -= *v2++ * sum1; 922 sum3 -= *v3++ * sum1; 923 sum4 -= *v4++ * sum1; 924 sum3 -= *v3++ * sum2; 925 sum4 -= *v4++ * sum2; 926 sum4 -= *v4++ * sum3; 927 928 tmp[row++] = sum1; 929 tmp[row++] = sum2; 930 tmp[row++] = sum3; 931 tmp[row++] = sum4; 932 break; 933 case 5: 934 sum1 = b[*r++]; 935 sum2 = b[*r++]; 936 sum3 = b[*r++]; 937 sum4 = b[*r++]; 938 sum5 = b[*r++]; 939 v2 = aa + ai[row + 1]; 940 v3 = aa + ai[row + 2]; 941 v4 = aa + ai[row + 3]; 942 v5 = aa + ai[row + 4]; 943 944 for (j = 0; j < nz - 1; j += 2) { 945 i0 = vi[0]; 946 i1 = vi[1]; 947 vi += 2; 948 tmp0 = tmps[i0]; 949 tmp1 = tmps[i1]; 950 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 951 v1 += 2; 952 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 953 v2 += 2; 954 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 955 v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 957 v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 959 v5 += 2; 960 } 961 if (j == nz - 1) { 962 tmp0 = tmps[*vi++]; 963 sum1 -= *v1++ * tmp0; 964 sum2 -= *v2++ * tmp0; 965 sum3 -= *v3++ * tmp0; 966 sum4 -= *v4++ * tmp0; 967 sum5 -= *v5++ * tmp0; 968 } 969 970 sum2 -= *v2++ * sum1; 971 sum3 -= *v3++ * sum1; 972 sum4 -= *v4++ * sum1; 973 sum5 -= *v5++ * sum1; 974 sum3 -= *v3++ * sum2; 975 sum4 -= *v4++ * sum2; 976 sum5 -= *v5++ * sum2; 977 sum4 -= *v4++ * sum3; 978 sum5 -= *v5++ * sum3; 979 sum5 -= *v5++ * sum4; 980 981 tmp[row++] = sum1; 982 tmp[row++] = sum2; 983 tmp[row++] = sum3; 984 tmp[row++] = sum4; 985 tmp[row++] = sum5; 986 break; 987 default: 988 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 989 } 990 } 991 /* backward solve the upper triangular */ 992 for (i = node_max - 1; i >= 0; i--) { 993 row = ns[i + 1]; 994 nsz = ns[i + 1] - ns[i]; 995 aii = ai[row + 1] - 1; 996 v1 = aa + aii; 997 vi = aj + aii; 998 nz = aii - ad[row]; 999 switch (nsz) { /* Each loop in 'case' is unrolled */ 1000 case 1: 1001 sum1 = tmp[row]; 1002 1003 for (j = nz; j > 1; j -= 2) { 1004 vi -= 2; 1005 i0 = vi[2]; 1006 i1 = vi[1]; 1007 tmp0 = tmps[i0]; 1008 tmp1 = tmps[i1]; 1009 v1 -= 2; 1010 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1011 } 1012 if (j == 1) { 1013 tmp0 = tmps[*vi--]; 1014 sum1 -= *v1-- * tmp0; 1015 } 1016 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1017 row--; 1018 break; 1019 case 2: 1020 sum1 = tmp[row]; 1021 sum2 = tmp[row - 1]; 1022 v2 = aa + ai[row] - 1; 1023 for (j = nz; j > 1; j -= 2) { 1024 vi -= 2; 1025 i0 = vi[2]; 1026 i1 = vi[1]; 1027 tmp0 = tmps[i0]; 1028 tmp1 = tmps[i1]; 1029 v1 -= 2; 1030 v2 -= 2; 1031 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1032 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1033 } 1034 if (j == 1) { 1035 tmp0 = tmps[*vi--]; 1036 sum1 -= *v1-- * tmp0; 1037 sum2 -= *v2-- * tmp0; 1038 } 1039 1040 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1041 row--; 1042 sum2 -= *v2-- * tmp0; 1043 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1044 row--; 1045 break; 1046 case 3: 1047 sum1 = tmp[row]; 1048 sum2 = tmp[row - 1]; 1049 sum3 = tmp[row - 2]; 1050 v2 = aa + ai[row] - 1; 1051 v3 = aa + ai[row - 1] - 1; 1052 for (j = nz; j > 1; j -= 2) { 1053 vi -= 2; 1054 i0 = vi[2]; 1055 i1 = vi[1]; 1056 tmp0 = tmps[i0]; 1057 tmp1 = tmps[i1]; 1058 v1 -= 2; 1059 v2 -= 2; 1060 v3 -= 2; 1061 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1062 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1063 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1064 } 1065 if (j == 1) { 1066 tmp0 = tmps[*vi--]; 1067 sum1 -= *v1-- * tmp0; 1068 sum2 -= *v2-- * tmp0; 1069 sum3 -= *v3-- * tmp0; 1070 } 1071 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1072 row--; 1073 sum2 -= *v2-- * tmp0; 1074 sum3 -= *v3-- * tmp0; 1075 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1076 row--; 1077 sum3 -= *v3-- * tmp0; 1078 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1079 row--; 1080 1081 break; 1082 case 4: 1083 sum1 = tmp[row]; 1084 sum2 = tmp[row - 1]; 1085 sum3 = tmp[row - 2]; 1086 sum4 = tmp[row - 3]; 1087 v2 = aa + ai[row] - 1; 1088 v3 = aa + ai[row - 1] - 1; 1089 v4 = aa + ai[row - 2] - 1; 1090 1091 for (j = nz; j > 1; j -= 2) { 1092 vi -= 2; 1093 i0 = vi[2]; 1094 i1 = vi[1]; 1095 tmp0 = tmps[i0]; 1096 tmp1 = tmps[i1]; 1097 v1 -= 2; 1098 v2 -= 2; 1099 v3 -= 2; 1100 v4 -= 2; 1101 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1102 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1103 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1104 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1105 } 1106 if (j == 1) { 1107 tmp0 = tmps[*vi--]; 1108 sum1 -= *v1-- * tmp0; 1109 sum2 -= *v2-- * tmp0; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 } 1113 1114 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1115 row--; 1116 sum2 -= *v2-- * tmp0; 1117 sum3 -= *v3-- * tmp0; 1118 sum4 -= *v4-- * tmp0; 1119 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1120 row--; 1121 sum3 -= *v3-- * tmp0; 1122 sum4 -= *v4-- * tmp0; 1123 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1124 row--; 1125 sum4 -= *v4-- * tmp0; 1126 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1127 row--; 1128 break; 1129 case 5: 1130 sum1 = tmp[row]; 1131 sum2 = tmp[row - 1]; 1132 sum3 = tmp[row - 2]; 1133 sum4 = tmp[row - 3]; 1134 sum5 = tmp[row - 4]; 1135 v2 = aa + ai[row] - 1; 1136 v3 = aa + ai[row - 1] - 1; 1137 v4 = aa + ai[row - 2] - 1; 1138 v5 = aa + ai[row - 3] - 1; 1139 for (j = nz; j > 1; j -= 2) { 1140 vi -= 2; 1141 i0 = vi[2]; 1142 i1 = vi[1]; 1143 tmp0 = tmps[i0]; 1144 tmp1 = tmps[i1]; 1145 v1 -= 2; 1146 v2 -= 2; 1147 v3 -= 2; 1148 v4 -= 2; 1149 v5 -= 2; 1150 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1151 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1152 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1153 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1154 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1155 } 1156 if (j == 1) { 1157 tmp0 = tmps[*vi--]; 1158 sum1 -= *v1-- * tmp0; 1159 sum2 -= *v2-- * tmp0; 1160 sum3 -= *v3-- * tmp0; 1161 sum4 -= *v4-- * tmp0; 1162 sum5 -= *v5-- * tmp0; 1163 } 1164 1165 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1166 row--; 1167 sum2 -= *v2-- * tmp0; 1168 sum3 -= *v3-- * tmp0; 1169 sum4 -= *v4-- * tmp0; 1170 sum5 -= *v5-- * tmp0; 1171 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1172 row--; 1173 sum3 -= *v3-- * tmp0; 1174 sum4 -= *v4-- * tmp0; 1175 sum5 -= *v5-- * tmp0; 1176 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1177 row--; 1178 sum4 -= *v4-- * tmp0; 1179 sum5 -= *v5-- * tmp0; 1180 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1181 row--; 1182 sum5 -= *v5-- * tmp0; 1183 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1184 row--; 1185 break; 1186 default: 1187 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1188 } 1189 } 1190 PetscCall(ISRestoreIndices(isrow, &rout)); 1191 PetscCall(ISRestoreIndices(iscol, &cout)); 1192 PetscCall(VecRestoreArrayRead(bb, &b)); 1193 PetscCall(VecRestoreArrayWrite(xx, &x)); 1194 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1195 PetscFunctionReturn(PETSC_SUCCESS); 1196 } 1197 1198 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1199 { 1200 Mat C = B; 1201 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1202 IS isrow = b->row, isicol = b->icol; 1203 const PetscInt *r, *ic, *ics; 1204 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1205 PetscInt i, j, k, nz, nzL, row, *pj; 1206 const PetscInt *ajtmp, *bjtmp; 1207 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1208 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1209 FactorShiftCtx sctx; 1210 const PetscInt *ddiag; 1211 PetscReal rs; 1212 MatScalar d; 1213 PetscInt inod, nodesz, node_max, col; 1214 const PetscInt *ns; 1215 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1216 1217 PetscFunctionBegin; 1218 /* MatPivotSetUp(): initialize shift context sctx */ 1219 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1220 1221 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1222 ddiag = a->diag; 1223 sctx.shift_top = info->zeropivot; 1224 for (i = 0; i < n; i++) { 1225 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1226 d = (aa)[ddiag[i]]; 1227 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1228 v = aa + ai[i]; 1229 nz = ai[i + 1] - ai[i]; 1230 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1231 if (rs > sctx.shift_top) sctx.shift_top = rs; 1232 } 1233 sctx.shift_top *= 1.1; 1234 sctx.nshift_max = 5; 1235 sctx.shift_lo = 0.; 1236 sctx.shift_hi = 1.; 1237 } 1238 1239 PetscCall(ISGetIndices(isrow, &r)); 1240 PetscCall(ISGetIndices(isicol, &ic)); 1241 1242 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1243 ics = ic; 1244 1245 node_max = a->inode.node_count; 1246 ns = a->inode.size_csr; 1247 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1248 1249 /* If max inode size > 4, split it into two inodes.*/ 1250 /* also map the inode sizes according to the ordering */ 1251 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1252 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1253 nodesz = ns[i + 1] - ns[i]; 1254 if (nodesz > 4) { 1255 tmp_vec1[j] = 4; 1256 ++j; 1257 tmp_vec1[j] = nodesz - tmp_vec1[j - 1]; 1258 } else { 1259 tmp_vec1[j] = nodesz; 1260 } 1261 } 1262 /* Use the correct node_max */ 1263 node_max = j; 1264 1265 /* Now reorder the inode info based on mat re-ordering info */ 1266 /* First create a row -> inode_size_array_index map */ 1267 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1268 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1269 tmp_vec2[0] = 0; 1270 for (i = 0, row = 0; i < node_max; i++) { 1271 nodesz = tmp_vec1[i]; 1272 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1273 } 1274 /* Using nsmap, create a reordered ns structure */ 1275 for (i = 0, j = 0; i < node_max; i++) { 1276 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1277 tmp_vec2[i + 1] = tmp_vec2[i] + nodesz; 1278 j += nodesz; 1279 } 1280 PetscCall(PetscFree(nsmap)); 1281 PetscCall(PetscFree(tmp_vec1)); 1282 1283 /* Now use the correct ns */ 1284 ns = tmp_vec2; 1285 1286 do { 1287 sctx.newshift = PETSC_FALSE; 1288 /* Now loop over each block-row, and do the factorization */ 1289 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1290 nodesz = ns[inod + 1] - ns[inod]; 1291 1292 switch (nodesz) { 1293 case 1: 1294 /* zero rtmp1 */ 1295 /* L part */ 1296 nz = bi[i + 1] - bi[i]; 1297 bjtmp = bj + bi[i]; 1298 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1299 1300 /* U part */ 1301 nz = bdiag[i] - bdiag[i + 1]; 1302 bjtmp = bj + bdiag[i + 1] + 1; 1303 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1304 1305 /* load in initial (unfactored row) */ 1306 nz = ai[r[i] + 1] - ai[r[i]]; 1307 ajtmp = aj + ai[r[i]]; 1308 v = aa + ai[r[i]]; 1309 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1310 1311 /* ZeropivotApply() */ 1312 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1313 1314 /* elimination */ 1315 bjtmp = bj + bi[i]; 1316 row = *bjtmp++; 1317 nzL = bi[i + 1] - bi[i]; 1318 for (k = 0; k < nzL; k++) { 1319 pc = rtmp1 + row; 1320 if (*pc != 0.0) { 1321 pv = b->a + bdiag[row]; 1322 mul1 = *pc * (*pv); 1323 *pc = mul1; 1324 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1325 pv = b->a + bdiag[row + 1] + 1; 1326 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1327 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1328 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1329 } 1330 row = *bjtmp++; 1331 } 1332 1333 /* finished row so stick it into b->a */ 1334 rs = 0.0; 1335 /* L part */ 1336 pv = b->a + bi[i]; 1337 pj = b->j + bi[i]; 1338 nz = bi[i + 1] - bi[i]; 1339 for (j = 0; j < nz; j++) { 1340 pv[j] = rtmp1[pj[j]]; 1341 rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* U part */ 1345 pv = b->a + bdiag[i + 1] + 1; 1346 pj = b->j + bdiag[i + 1] + 1; 1347 nz = bdiag[i] - bdiag[i + 1] - 1; 1348 for (j = 0; j < nz; j++) { 1349 pv[j] = rtmp1[pj[j]]; 1350 rs += PetscAbsScalar(pv[j]); 1351 } 1352 1353 /* Check zero pivot */ 1354 sctx.rs = rs; 1355 sctx.pv = rtmp1[i]; 1356 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1357 if (sctx.newshift) break; 1358 1359 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1360 pv = b->a + bdiag[i]; 1361 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1362 break; 1363 1364 case 2: 1365 /* zero rtmp1 and rtmp2 */ 1366 /* L part */ 1367 nz = bi[i + 1] - bi[i]; 1368 bjtmp = bj + bi[i]; 1369 for (j = 0; j < nz; j++) { 1370 col = bjtmp[j]; 1371 rtmp1[col] = 0.0; 1372 rtmp2[col] = 0.0; 1373 } 1374 1375 /* U part */ 1376 nz = bdiag[i] - bdiag[i + 1]; 1377 bjtmp = bj + bdiag[i + 1] + 1; 1378 for (j = 0; j < nz; j++) { 1379 col = bjtmp[j]; 1380 rtmp1[col] = 0.0; 1381 rtmp2[col] = 0.0; 1382 } 1383 1384 /* load in initial (unfactored row) */ 1385 nz = ai[r[i] + 1] - ai[r[i]]; 1386 ajtmp = aj + ai[r[i]]; 1387 v1 = aa + ai[r[i]]; 1388 v2 = aa + ai[r[i + 1]]; 1389 for (j = 0; j < nz; j++) { 1390 col = ics[ajtmp[j]]; 1391 rtmp1[col] = v1[j]; 1392 rtmp2[col] = v2[j]; 1393 } 1394 /* ZeropivotApply(): shift the diagonal of the matrix */ 1395 rtmp1[i] += sctx.shift_amount; 1396 rtmp2[i + 1] += sctx.shift_amount; 1397 1398 /* elimination */ 1399 bjtmp = bj + bi[i]; 1400 row = *bjtmp++; /* pivot row */ 1401 nzL = bi[i + 1] - bi[i]; 1402 for (k = 0; k < nzL; k++) { 1403 pc1 = rtmp1 + row; 1404 pc2 = rtmp2 + row; 1405 if (*pc1 != 0.0 || *pc2 != 0.0) { 1406 pv = b->a + bdiag[row]; 1407 mul1 = *pc1 * (*pv); 1408 mul2 = *pc2 * (*pv); 1409 *pc1 = mul1; 1410 *pc2 = mul2; 1411 1412 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1413 pv = b->a + bdiag[row + 1] + 1; 1414 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1415 for (j = 0; j < nz; j++) { 1416 col = pj[j]; 1417 rtmp1[col] -= mul1 * pv[j]; 1418 rtmp2[col] -= mul2 * pv[j]; 1419 } 1420 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1421 } 1422 row = *bjtmp++; 1423 } 1424 1425 /* finished row i; check zero pivot, then stick row i into b->a */ 1426 rs = 0.0; 1427 /* L part */ 1428 pc1 = b->a + bi[i]; 1429 pj = b->j + bi[i]; 1430 nz = bi[i + 1] - bi[i]; 1431 for (j = 0; j < nz; j++) { 1432 col = pj[j]; 1433 pc1[j] = rtmp1[col]; 1434 rs += PetscAbsScalar(pc1[j]); 1435 } 1436 /* U part */ 1437 pc1 = b->a + bdiag[i + 1] + 1; 1438 pj = b->j + bdiag[i + 1] + 1; 1439 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1440 for (j = 0; j < nz; j++) { 1441 col = pj[j]; 1442 pc1[j] = rtmp1[col]; 1443 rs += PetscAbsScalar(pc1[j]); 1444 } 1445 1446 sctx.rs = rs; 1447 sctx.pv = rtmp1[i]; 1448 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1449 if (sctx.newshift) break; 1450 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1451 *pc1 = 1.0 / sctx.pv; 1452 1453 /* Now take care of diagonal 2x2 block. */ 1454 pc2 = rtmp2 + i; 1455 if (*pc2 != 0.0) { 1456 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1457 *pc2 = mul1; /* insert L entry */ 1458 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1459 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1460 for (j = 0; j < nz; j++) { 1461 col = pj[j]; 1462 rtmp2[col] -= mul1 * rtmp1[col]; 1463 } 1464 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1465 } 1466 1467 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1468 rs = 0.0; 1469 /* L part */ 1470 pc2 = b->a + bi[i + 1]; 1471 pj = b->j + bi[i + 1]; 1472 nz = bi[i + 2] - bi[i + 1]; 1473 for (j = 0; j < nz; j++) { 1474 col = pj[j]; 1475 pc2[j] = rtmp2[col]; 1476 rs += PetscAbsScalar(pc2[j]); 1477 } 1478 /* U part */ 1479 pc2 = b->a + bdiag[i + 2] + 1; 1480 pj = b->j + bdiag[i + 2] + 1; 1481 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1482 for (j = 0; j < nz; j++) { 1483 col = pj[j]; 1484 pc2[j] = rtmp2[col]; 1485 rs += PetscAbsScalar(pc2[j]); 1486 } 1487 1488 sctx.rs = rs; 1489 sctx.pv = rtmp2[i + 1]; 1490 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1491 if (sctx.newshift) break; 1492 pc2 = b->a + bdiag[i + 1]; 1493 *pc2 = 1.0 / sctx.pv; 1494 break; 1495 1496 case 3: 1497 /* zero rtmp */ 1498 /* L part */ 1499 nz = bi[i + 1] - bi[i]; 1500 bjtmp = bj + bi[i]; 1501 for (j = 0; j < nz; j++) { 1502 col = bjtmp[j]; 1503 rtmp1[col] = 0.0; 1504 rtmp2[col] = 0.0; 1505 rtmp3[col] = 0.0; 1506 } 1507 1508 /* U part */ 1509 nz = bdiag[i] - bdiag[i + 1]; 1510 bjtmp = bj + bdiag[i + 1] + 1; 1511 for (j = 0; j < nz; j++) { 1512 col = bjtmp[j]; 1513 rtmp1[col] = 0.0; 1514 rtmp2[col] = 0.0; 1515 rtmp3[col] = 0.0; 1516 } 1517 1518 /* load in initial (unfactored row) */ 1519 nz = ai[r[i] + 1] - ai[r[i]]; 1520 ajtmp = aj + ai[r[i]]; 1521 v1 = aa + ai[r[i]]; 1522 v2 = aa + ai[r[i + 1]]; 1523 v3 = aa + ai[r[i + 2]]; 1524 for (j = 0; j < nz; j++) { 1525 col = ics[ajtmp[j]]; 1526 rtmp1[col] = v1[j]; 1527 rtmp2[col] = v2[j]; 1528 rtmp3[col] = v3[j]; 1529 } 1530 /* ZeropivotApply(): shift the diagonal of the matrix */ 1531 rtmp1[i] += sctx.shift_amount; 1532 rtmp2[i + 1] += sctx.shift_amount; 1533 rtmp3[i + 2] += sctx.shift_amount; 1534 1535 /* elimination */ 1536 bjtmp = bj + bi[i]; 1537 row = *bjtmp++; /* pivot row */ 1538 nzL = bi[i + 1] - bi[i]; 1539 for (k = 0; k < nzL; k++) { 1540 pc1 = rtmp1 + row; 1541 pc2 = rtmp2 + row; 1542 pc3 = rtmp3 + row; 1543 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1544 pv = b->a + bdiag[row]; 1545 mul1 = *pc1 * (*pv); 1546 mul2 = *pc2 * (*pv); 1547 mul3 = *pc3 * (*pv); 1548 *pc1 = mul1; 1549 *pc2 = mul2; 1550 *pc3 = mul3; 1551 1552 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1553 pv = b->a + bdiag[row + 1] + 1; 1554 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1555 for (j = 0; j < nz; j++) { 1556 col = pj[j]; 1557 rtmp1[col] -= mul1 * pv[j]; 1558 rtmp2[col] -= mul2 * pv[j]; 1559 rtmp3[col] -= mul3 * pv[j]; 1560 } 1561 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1562 } 1563 row = *bjtmp++; 1564 } 1565 1566 /* finished row i; check zero pivot, then stick row i into b->a */ 1567 rs = 0.0; 1568 /* L part */ 1569 pc1 = b->a + bi[i]; 1570 pj = b->j + bi[i]; 1571 nz = bi[i + 1] - bi[i]; 1572 for (j = 0; j < nz; j++) { 1573 col = pj[j]; 1574 pc1[j] = rtmp1[col]; 1575 rs += PetscAbsScalar(pc1[j]); 1576 } 1577 /* U part */ 1578 pc1 = b->a + bdiag[i + 1] + 1; 1579 pj = b->j + bdiag[i + 1] + 1; 1580 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1581 for (j = 0; j < nz; j++) { 1582 col = pj[j]; 1583 pc1[j] = rtmp1[col]; 1584 rs += PetscAbsScalar(pc1[j]); 1585 } 1586 1587 sctx.rs = rs; 1588 sctx.pv = rtmp1[i]; 1589 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1590 if (sctx.newshift) break; 1591 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1592 *pc1 = 1.0 / sctx.pv; 1593 1594 /* Now take care of 1st column of diagonal 3x3 block. */ 1595 pc2 = rtmp2 + i; 1596 pc3 = rtmp3 + i; 1597 if (*pc2 != 0.0 || *pc3 != 0.0) { 1598 mul2 = (*pc2) * (*pc1); 1599 *pc2 = mul2; 1600 mul3 = (*pc3) * (*pc1); 1601 *pc3 = mul3; 1602 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1603 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1604 for (j = 0; j < nz; j++) { 1605 col = pj[j]; 1606 rtmp2[col] -= mul2 * rtmp1[col]; 1607 rtmp3[col] -= mul3 * rtmp1[col]; 1608 } 1609 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1610 } 1611 1612 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1613 rs = 0.0; 1614 /* L part */ 1615 pc2 = b->a + bi[i + 1]; 1616 pj = b->j + bi[i + 1]; 1617 nz = bi[i + 2] - bi[i + 1]; 1618 for (j = 0; j < nz; j++) { 1619 col = pj[j]; 1620 pc2[j] = rtmp2[col]; 1621 rs += PetscAbsScalar(pc2[j]); 1622 } 1623 /* U part */ 1624 pc2 = b->a + bdiag[i + 2] + 1; 1625 pj = b->j + bdiag[i + 2] + 1; 1626 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1627 for (j = 0; j < nz; j++) { 1628 col = pj[j]; 1629 pc2[j] = rtmp2[col]; 1630 rs += PetscAbsScalar(pc2[j]); 1631 } 1632 1633 sctx.rs = rs; 1634 sctx.pv = rtmp2[i + 1]; 1635 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1636 if (sctx.newshift) break; 1637 pc2 = b->a + bdiag[i + 1]; 1638 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1639 1640 /* Now take care of 2nd column of diagonal 3x3 block. */ 1641 pc3 = rtmp3 + i + 1; 1642 if (*pc3 != 0.0) { 1643 mul3 = (*pc3) * (*pc2); 1644 *pc3 = mul3; 1645 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1646 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1647 for (j = 0; j < nz; j++) { 1648 col = pj[j]; 1649 rtmp3[col] -= mul3 * rtmp2[col]; 1650 } 1651 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1652 } 1653 1654 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1655 rs = 0.0; 1656 /* L part */ 1657 pc3 = b->a + bi[i + 2]; 1658 pj = b->j + bi[i + 2]; 1659 nz = bi[i + 3] - bi[i + 2]; 1660 for (j = 0; j < nz; j++) { 1661 col = pj[j]; 1662 pc3[j] = rtmp3[col]; 1663 rs += PetscAbsScalar(pc3[j]); 1664 } 1665 /* U part */ 1666 pc3 = b->a + bdiag[i + 3] + 1; 1667 pj = b->j + bdiag[i + 3] + 1; 1668 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1669 for (j = 0; j < nz; j++) { 1670 col = pj[j]; 1671 pc3[j] = rtmp3[col]; 1672 rs += PetscAbsScalar(pc3[j]); 1673 } 1674 1675 sctx.rs = rs; 1676 sctx.pv = rtmp3[i + 2]; 1677 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1678 if (sctx.newshift) break; 1679 pc3 = b->a + bdiag[i + 2]; 1680 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1681 break; 1682 case 4: 1683 /* zero rtmp */ 1684 /* L part */ 1685 nz = bi[i + 1] - bi[i]; 1686 bjtmp = bj + bi[i]; 1687 for (j = 0; j < nz; j++) { 1688 col = bjtmp[j]; 1689 rtmp1[col] = 0.0; 1690 rtmp2[col] = 0.0; 1691 rtmp3[col] = 0.0; 1692 rtmp4[col] = 0.0; 1693 } 1694 1695 /* U part */ 1696 nz = bdiag[i] - bdiag[i + 1]; 1697 bjtmp = bj + bdiag[i + 1] + 1; 1698 for (j = 0; j < nz; j++) { 1699 col = bjtmp[j]; 1700 rtmp1[col] = 0.0; 1701 rtmp2[col] = 0.0; 1702 rtmp3[col] = 0.0; 1703 rtmp4[col] = 0.0; 1704 } 1705 1706 /* load in initial (unfactored row) */ 1707 nz = ai[r[i] + 1] - ai[r[i]]; 1708 ajtmp = aj + ai[r[i]]; 1709 v1 = aa + ai[r[i]]; 1710 v2 = aa + ai[r[i + 1]]; 1711 v3 = aa + ai[r[i + 2]]; 1712 v4 = aa + ai[r[i + 3]]; 1713 for (j = 0; j < nz; j++) { 1714 col = ics[ajtmp[j]]; 1715 rtmp1[col] = v1[j]; 1716 rtmp2[col] = v2[j]; 1717 rtmp3[col] = v3[j]; 1718 rtmp4[col] = v4[j]; 1719 } 1720 /* ZeropivotApply(): shift the diagonal of the matrix */ 1721 rtmp1[i] += sctx.shift_amount; 1722 rtmp2[i + 1] += sctx.shift_amount; 1723 rtmp3[i + 2] += sctx.shift_amount; 1724 rtmp4[i + 3] += sctx.shift_amount; 1725 1726 /* elimination */ 1727 bjtmp = bj + bi[i]; 1728 row = *bjtmp++; /* pivot row */ 1729 nzL = bi[i + 1] - bi[i]; 1730 for (k = 0; k < nzL; k++) { 1731 pc1 = rtmp1 + row; 1732 pc2 = rtmp2 + row; 1733 pc3 = rtmp3 + row; 1734 pc4 = rtmp4 + row; 1735 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1736 pv = b->a + bdiag[row]; 1737 mul1 = *pc1 * (*pv); 1738 mul2 = *pc2 * (*pv); 1739 mul3 = *pc3 * (*pv); 1740 mul4 = *pc4 * (*pv); 1741 *pc1 = mul1; 1742 *pc2 = mul2; 1743 *pc3 = mul3; 1744 *pc4 = mul4; 1745 1746 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1747 pv = b->a + bdiag[row + 1] + 1; 1748 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1749 for (j = 0; j < nz; j++) { 1750 col = pj[j]; 1751 rtmp1[col] -= mul1 * pv[j]; 1752 rtmp2[col] -= mul2 * pv[j]; 1753 rtmp3[col] -= mul3 * pv[j]; 1754 rtmp4[col] -= mul4 * pv[j]; 1755 } 1756 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1757 } 1758 row = *bjtmp++; 1759 } 1760 1761 /* finished row i; check zero pivot, then stick row i into b->a */ 1762 rs = 0.0; 1763 /* L part */ 1764 pc1 = b->a + bi[i]; 1765 pj = b->j + bi[i]; 1766 nz = bi[i + 1] - bi[i]; 1767 for (j = 0; j < nz; j++) { 1768 col = pj[j]; 1769 pc1[j] = rtmp1[col]; 1770 rs += PetscAbsScalar(pc1[j]); 1771 } 1772 /* U part */ 1773 pc1 = b->a + bdiag[i + 1] + 1; 1774 pj = b->j + bdiag[i + 1] + 1; 1775 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1776 for (j = 0; j < nz; j++) { 1777 col = pj[j]; 1778 pc1[j] = rtmp1[col]; 1779 rs += PetscAbsScalar(pc1[j]); 1780 } 1781 1782 sctx.rs = rs; 1783 sctx.pv = rtmp1[i]; 1784 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1785 if (sctx.newshift) break; 1786 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1787 *pc1 = 1.0 / sctx.pv; 1788 1789 /* Now take care of 1st column of diagonal 4x4 block. */ 1790 pc2 = rtmp2 + i; 1791 pc3 = rtmp3 + i; 1792 pc4 = rtmp4 + i; 1793 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1794 mul2 = (*pc2) * (*pc1); 1795 *pc2 = mul2; 1796 mul3 = (*pc3) * (*pc1); 1797 *pc3 = mul3; 1798 mul4 = (*pc4) * (*pc1); 1799 *pc4 = mul4; 1800 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1801 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1802 for (j = 0; j < nz; j++) { 1803 col = pj[j]; 1804 rtmp2[col] -= mul2 * rtmp1[col]; 1805 rtmp3[col] -= mul3 * rtmp1[col]; 1806 rtmp4[col] -= mul4 * rtmp1[col]; 1807 } 1808 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1809 } 1810 1811 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1812 rs = 0.0; 1813 /* L part */ 1814 pc2 = b->a + bi[i + 1]; 1815 pj = b->j + bi[i + 1]; 1816 nz = bi[i + 2] - bi[i + 1]; 1817 for (j = 0; j < nz; j++) { 1818 col = pj[j]; 1819 pc2[j] = rtmp2[col]; 1820 rs += PetscAbsScalar(pc2[j]); 1821 } 1822 /* U part */ 1823 pc2 = b->a + bdiag[i + 2] + 1; 1824 pj = b->j + bdiag[i + 2] + 1; 1825 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1826 for (j = 0; j < nz; j++) { 1827 col = pj[j]; 1828 pc2[j] = rtmp2[col]; 1829 rs += PetscAbsScalar(pc2[j]); 1830 } 1831 1832 sctx.rs = rs; 1833 sctx.pv = rtmp2[i + 1]; 1834 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1835 if (sctx.newshift) break; 1836 pc2 = b->a + bdiag[i + 1]; 1837 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1838 1839 /* Now take care of 2nd column of diagonal 4x4 block. */ 1840 pc3 = rtmp3 + i + 1; 1841 pc4 = rtmp4 + i + 1; 1842 if (*pc3 != 0.0 || *pc4 != 0.0) { 1843 mul3 = (*pc3) * (*pc2); 1844 *pc3 = mul3; 1845 mul4 = (*pc4) * (*pc2); 1846 *pc4 = mul4; 1847 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1848 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1849 for (j = 0; j < nz; j++) { 1850 col = pj[j]; 1851 rtmp3[col] -= mul3 * rtmp2[col]; 1852 rtmp4[col] -= mul4 * rtmp2[col]; 1853 } 1854 PetscCall(PetscLogFlops(4.0 * nz)); 1855 } 1856 1857 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1858 rs = 0.0; 1859 /* L part */ 1860 pc3 = b->a + bi[i + 2]; 1861 pj = b->j + bi[i + 2]; 1862 nz = bi[i + 3] - bi[i + 2]; 1863 for (j = 0; j < nz; j++) { 1864 col = pj[j]; 1865 pc3[j] = rtmp3[col]; 1866 rs += PetscAbsScalar(pc3[j]); 1867 } 1868 /* U part */ 1869 pc3 = b->a + bdiag[i + 3] + 1; 1870 pj = b->j + bdiag[i + 3] + 1; 1871 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1872 for (j = 0; j < nz; j++) { 1873 col = pj[j]; 1874 pc3[j] = rtmp3[col]; 1875 rs += PetscAbsScalar(pc3[j]); 1876 } 1877 1878 sctx.rs = rs; 1879 sctx.pv = rtmp3[i + 2]; 1880 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1881 if (sctx.newshift) break; 1882 pc3 = b->a + bdiag[i + 2]; 1883 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1884 1885 /* Now take care of 3rd column of diagonal 4x4 block. */ 1886 pc4 = rtmp4 + i + 2; 1887 if (*pc4 != 0.0) { 1888 mul4 = (*pc4) * (*pc3); 1889 *pc4 = mul4; 1890 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1891 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1892 for (j = 0; j < nz; j++) { 1893 col = pj[j]; 1894 rtmp4[col] -= mul4 * rtmp3[col]; 1895 } 1896 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1897 } 1898 1899 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1900 rs = 0.0; 1901 /* L part */ 1902 pc4 = b->a + bi[i + 3]; 1903 pj = b->j + bi[i + 3]; 1904 nz = bi[i + 4] - bi[i + 3]; 1905 for (j = 0; j < nz; j++) { 1906 col = pj[j]; 1907 pc4[j] = rtmp4[col]; 1908 rs += PetscAbsScalar(pc4[j]); 1909 } 1910 /* U part */ 1911 pc4 = b->a + bdiag[i + 4] + 1; 1912 pj = b->j + bdiag[i + 4] + 1; 1913 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1914 for (j = 0; j < nz; j++) { 1915 col = pj[j]; 1916 pc4[j] = rtmp4[col]; 1917 rs += PetscAbsScalar(pc4[j]); 1918 } 1919 1920 sctx.rs = rs; 1921 sctx.pv = rtmp4[i + 3]; 1922 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1923 if (sctx.newshift) break; 1924 pc4 = b->a + bdiag[i + 3]; 1925 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1926 break; 1927 1928 default: 1929 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1930 } 1931 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1932 i += nodesz; /* Update the row */ 1933 } 1934 1935 /* MatPivotRefine() */ 1936 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1937 /* 1938 * if no shift in this attempt & shifting & started shifting & can refine, 1939 * then try lower shift 1940 */ 1941 sctx.shift_hi = sctx.shift_fraction; 1942 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1943 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1944 sctx.newshift = PETSC_TRUE; 1945 sctx.nshift++; 1946 } 1947 } while (sctx.newshift); 1948 1949 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1950 PetscCall(PetscFree(tmp_vec2)); 1951 PetscCall(ISRestoreIndices(isicol, &ic)); 1952 PetscCall(ISRestoreIndices(isrow, &r)); 1953 1954 if (b->inode.size_csr) { 1955 C->ops->solve = MatSolve_SeqAIJ_Inode; 1956 } else { 1957 C->ops->solve = MatSolve_SeqAIJ; 1958 } 1959 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1960 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1961 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1962 C->ops->matsolve = MatMatSolve_SeqAIJ; 1963 C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ; 1964 C->assembled = PETSC_TRUE; 1965 C->preallocated = PETSC_TRUE; 1966 1967 PetscCall(PetscLogFlops(C->cmap->n)); 1968 1969 /* MatShiftView(A,info,&sctx) */ 1970 if (sctx.nshift) { 1971 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1972 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1973 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1974 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1975 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1976 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1977 } 1978 } 1979 PetscFunctionReturn(PETSC_SUCCESS); 1980 } 1981 1982 #if 0 1983 // unused 1984 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1985 { 1986 Mat C = B; 1987 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1988 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1989 const PetscInt *r, *ic, *c, *ics; 1990 PetscInt n = A->rmap->n, *bi = b->i; 1991 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1992 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1993 PetscInt *ai = a->i, *aj = a->j; 1994 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1995 PetscScalar mul1, mul2, mul3, tmp; 1996 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1997 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1998 PetscReal rs = 0.0; 1999 FactorShiftCtx sctx; 2000 2001 PetscFunctionBegin; 2002 sctx.shift_top = 0; 2003 sctx.nshift_max = 0; 2004 sctx.shift_lo = 0; 2005 sctx.shift_hi = 0; 2006 sctx.shift_fraction = 0; 2007 2008 /* if both shift schemes are chosen by user, only use info->shiftpd */ 2009 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 2010 sctx.shift_top = 0; 2011 for (i = 0; i < n; i++) { 2012 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 2013 rs = 0.0; 2014 ajtmp = aj + ai[i]; 2015 rtmp1 = aa + ai[i]; 2016 nz = ai[i + 1] - ai[i]; 2017 for (j = 0; j < nz; j++) { 2018 if (*ajtmp != i) { 2019 rs += PetscAbsScalar(*rtmp1++); 2020 } else { 2021 rs -= PetscRealPart(*rtmp1++); 2022 } 2023 ajtmp++; 2024 } 2025 if (rs > sctx.shift_top) sctx.shift_top = rs; 2026 } 2027 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2028 sctx.shift_top *= 1.1; 2029 sctx.nshift_max = 5; 2030 sctx.shift_lo = 0.; 2031 sctx.shift_hi = 1.; 2032 } 2033 sctx.shift_amount = 0; 2034 sctx.nshift = 0; 2035 2036 PetscCall(ISGetIndices(isrow, &r)); 2037 PetscCall(ISGetIndices(iscol, &c)); 2038 PetscCall(ISGetIndices(isicol, &ic)); 2039 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2040 ics = ic; 2041 2042 node_max = a->inode.node_count; 2043 ns = a->inode.size; 2044 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2045 2046 /* If max inode size > 3, split it into two inodes.*/ 2047 /* also map the inode sizes according to the ordering */ 2048 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2049 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2050 if (ns[i] > 3) { 2051 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2052 ++j; 2053 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2054 } else { 2055 tmp_vec1[j] = ns[i]; 2056 } 2057 } 2058 /* Use the correct node_max */ 2059 node_max = j; 2060 2061 /* Now reorder the inode info based on mat re-ordering info */ 2062 /* First create a row -> inode_size_array_index map */ 2063 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2064 for (i = 0, row = 0; i < node_max; i++) { 2065 nodesz = tmp_vec1[i]; 2066 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2067 } 2068 /* Using nsmap, create a reordered ns structure */ 2069 for (i = 0, j = 0; i < node_max; i++) { 2070 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2071 tmp_vec2[i] = nodesz; 2072 j += nodesz; 2073 } 2074 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2075 /* Now use the correct ns */ 2076 ns = tmp_vec2; 2077 2078 do { 2079 sctx.newshift = PETSC_FALSE; 2080 /* Now loop over each block-row, and do the factorization */ 2081 for (i = 0, row = 0; i < node_max; i++) { 2082 nodesz = ns[i]; 2083 nz = bi[row + 1] - bi[row]; 2084 bjtmp = bj + bi[row]; 2085 2086 switch (nodesz) { 2087 case 1: 2088 for (j = 0; j < nz; j++) { 2089 idx = bjtmp[j]; 2090 rtmp11[idx] = 0.0; 2091 } 2092 2093 /* load in initial (unfactored row) */ 2094 idx = r[row]; 2095 nz_tmp = ai[idx + 1] - ai[idx]; 2096 ajtmp = aj + ai[idx]; 2097 v1 = aa + ai[idx]; 2098 2099 for (j = 0; j < nz_tmp; j++) { 2100 idx = ics[ajtmp[j]]; 2101 rtmp11[idx] = v1[j]; 2102 } 2103 rtmp11[ics[r[row]]] += sctx.shift_amount; 2104 2105 prow = *bjtmp++; 2106 while (prow < row) { 2107 pc1 = rtmp11 + prow; 2108 if (*pc1 != 0.0) { 2109 pv = ba + bd[prow]; 2110 pj = nbj + bd[prow]; 2111 mul1 = *pc1 * *pv++; 2112 *pc1 = mul1; 2113 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2114 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2115 for (j = 0; j < nz_tmp; j++) { 2116 tmp = pv[j]; 2117 idx = pj[j]; 2118 rtmp11[idx] -= mul1 * tmp; 2119 } 2120 } 2121 prow = *bjtmp++; 2122 } 2123 pj = bj + bi[row]; 2124 pc1 = ba + bi[row]; 2125 2126 sctx.pv = rtmp11[row]; 2127 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2128 rs = 0.0; 2129 for (j = 0; j < nz; j++) { 2130 idx = pj[j]; 2131 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2132 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2133 } 2134 sctx.rs = rs; 2135 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2136 if (sctx.newshift) goto endofwhile; 2137 break; 2138 2139 case 2: 2140 for (j = 0; j < nz; j++) { 2141 idx = bjtmp[j]; 2142 rtmp11[idx] = 0.0; 2143 rtmp22[idx] = 0.0; 2144 } 2145 2146 /* load in initial (unfactored row) */ 2147 idx = r[row]; 2148 nz_tmp = ai[idx + 1] - ai[idx]; 2149 ajtmp = aj + ai[idx]; 2150 v1 = aa + ai[idx]; 2151 v2 = aa + ai[idx + 1]; 2152 for (j = 0; j < nz_tmp; j++) { 2153 idx = ics[ajtmp[j]]; 2154 rtmp11[idx] = v1[j]; 2155 rtmp22[idx] = v2[j]; 2156 } 2157 rtmp11[ics[r[row]]] += sctx.shift_amount; 2158 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2159 2160 prow = *bjtmp++; 2161 while (prow < row) { 2162 pc1 = rtmp11 + prow; 2163 pc2 = rtmp22 + prow; 2164 if (*pc1 != 0.0 || *pc2 != 0.0) { 2165 pv = ba + bd[prow]; 2166 pj = nbj + bd[prow]; 2167 mul1 = *pc1 * *pv; 2168 mul2 = *pc2 * *pv; 2169 ++pv; 2170 *pc1 = mul1; 2171 *pc2 = mul2; 2172 2173 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2174 for (j = 0; j < nz_tmp; j++) { 2175 tmp = pv[j]; 2176 idx = pj[j]; 2177 rtmp11[idx] -= mul1 * tmp; 2178 rtmp22[idx] -= mul2 * tmp; 2179 } 2180 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2181 } 2182 prow = *bjtmp++; 2183 } 2184 2185 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2186 pc1 = rtmp11 + prow; 2187 pc2 = rtmp22 + prow; 2188 2189 sctx.pv = *pc1; 2190 pj = bj + bi[prow]; 2191 rs = 0.0; 2192 for (j = 0; j < nz; j++) { 2193 idx = pj[j]; 2194 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2195 } 2196 sctx.rs = rs; 2197 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2198 if (sctx.newshift) goto endofwhile; 2199 2200 if (*pc2 != 0.0) { 2201 pj = nbj + bd[prow]; 2202 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2203 *pc2 = mul2; 2204 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2205 for (j = 0; j < nz_tmp; j++) { 2206 idx = pj[j]; 2207 tmp = rtmp11[idx]; 2208 rtmp22[idx] -= mul2 * tmp; 2209 } 2210 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2211 } 2212 2213 pj = bj + bi[row]; 2214 pc1 = ba + bi[row]; 2215 pc2 = ba + bi[row + 1]; 2216 2217 sctx.pv = rtmp22[row + 1]; 2218 rs = 0.0; 2219 rtmp11[row] = 1.0 / rtmp11[row]; 2220 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2221 /* copy row entries from dense representation to sparse */ 2222 for (j = 0; j < nz; j++) { 2223 idx = pj[j]; 2224 pc1[j] = rtmp11[idx]; 2225 pc2[j] = rtmp22[idx]; 2226 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2227 } 2228 sctx.rs = rs; 2229 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2230 if (sctx.newshift) goto endofwhile; 2231 break; 2232 2233 case 3: 2234 for (j = 0; j < nz; j++) { 2235 idx = bjtmp[j]; 2236 rtmp11[idx] = 0.0; 2237 rtmp22[idx] = 0.0; 2238 rtmp33[idx] = 0.0; 2239 } 2240 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2241 idx = r[row]; 2242 nz_tmp = ai[idx + 1] - ai[idx]; 2243 ajtmp = aj + ai[idx]; 2244 v1 = aa + ai[idx]; 2245 v2 = aa + ai[idx + 1]; 2246 v3 = aa + ai[idx + 2]; 2247 for (j = 0; j < nz_tmp; j++) { 2248 idx = ics[ajtmp[j]]; 2249 rtmp11[idx] = v1[j]; 2250 rtmp22[idx] = v2[j]; 2251 rtmp33[idx] = v3[j]; 2252 } 2253 rtmp11[ics[r[row]]] += sctx.shift_amount; 2254 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2255 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2256 2257 /* loop over all pivot row blocks above this row block */ 2258 prow = *bjtmp++; 2259 while (prow < row) { 2260 pc1 = rtmp11 + prow; 2261 pc2 = rtmp22 + prow; 2262 pc3 = rtmp33 + prow; 2263 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2264 pv = ba + bd[prow]; 2265 pj = nbj + bd[prow]; 2266 mul1 = *pc1 * *pv; 2267 mul2 = *pc2 * *pv; 2268 mul3 = *pc3 * *pv; 2269 ++pv; 2270 *pc1 = mul1; 2271 *pc2 = mul2; 2272 *pc3 = mul3; 2273 2274 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2275 /* update this row based on pivot row */ 2276 for (j = 0; j < nz_tmp; j++) { 2277 tmp = pv[j]; 2278 idx = pj[j]; 2279 rtmp11[idx] -= mul1 * tmp; 2280 rtmp22[idx] -= mul2 * tmp; 2281 rtmp33[idx] -= mul3 * tmp; 2282 } 2283 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2284 } 2285 prow = *bjtmp++; 2286 } 2287 2288 /* Now take care of diagonal 3x3 block in this set of rows */ 2289 /* note: prow = row here */ 2290 pc1 = rtmp11 + prow; 2291 pc2 = rtmp22 + prow; 2292 pc3 = rtmp33 + prow; 2293 2294 sctx.pv = *pc1; 2295 pj = bj + bi[prow]; 2296 rs = 0.0; 2297 for (j = 0; j < nz; j++) { 2298 idx = pj[j]; 2299 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2300 } 2301 sctx.rs = rs; 2302 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2303 if (sctx.newshift) goto endofwhile; 2304 2305 if (*pc2 != 0.0 || *pc3 != 0.0) { 2306 mul2 = (*pc2) / (*pc1); 2307 mul3 = (*pc3) / (*pc1); 2308 *pc2 = mul2; 2309 *pc3 = mul3; 2310 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2311 pj = nbj + bd[prow]; 2312 for (j = 0; j < nz_tmp; j++) { 2313 idx = pj[j]; 2314 tmp = rtmp11[idx]; 2315 rtmp22[idx] -= mul2 * tmp; 2316 rtmp33[idx] -= mul3 * tmp; 2317 } 2318 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2319 } 2320 ++prow; 2321 2322 pc2 = rtmp22 + prow; 2323 pc3 = rtmp33 + prow; 2324 sctx.pv = *pc2; 2325 pj = bj + bi[prow]; 2326 rs = 0.0; 2327 for (j = 0; j < nz; j++) { 2328 idx = pj[j]; 2329 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2330 } 2331 sctx.rs = rs; 2332 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2333 if (sctx.newshift) goto endofwhile; 2334 2335 if (*pc3 != 0.0) { 2336 mul3 = (*pc3) / (*pc2); 2337 *pc3 = mul3; 2338 pj = nbj + bd[prow]; 2339 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2340 for (j = 0; j < nz_tmp; j++) { 2341 idx = pj[j]; 2342 tmp = rtmp22[idx]; 2343 rtmp33[idx] -= mul3 * tmp; 2344 } 2345 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2346 } 2347 2348 pj = bj + bi[row]; 2349 pc1 = ba + bi[row]; 2350 pc2 = ba + bi[row + 1]; 2351 pc3 = ba + bi[row + 2]; 2352 2353 sctx.pv = rtmp33[row + 2]; 2354 rs = 0.0; 2355 rtmp11[row] = 1.0 / rtmp11[row]; 2356 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2357 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2358 /* copy row entries from dense representation to sparse */ 2359 for (j = 0; j < nz; j++) { 2360 idx = pj[j]; 2361 pc1[j] = rtmp11[idx]; 2362 pc2[j] = rtmp22[idx]; 2363 pc3[j] = rtmp33[idx]; 2364 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2365 } 2366 2367 sctx.rs = rs; 2368 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2369 if (sctx.newshift) goto endofwhile; 2370 break; 2371 2372 default: 2373 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2374 } 2375 row += nodesz; /* Update the row */ 2376 } 2377 endofwhile:; 2378 } while (sctx.newshift); 2379 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2380 PetscCall(PetscFree(tmp_vec2)); 2381 PetscCall(ISRestoreIndices(isicol, &ic)); 2382 PetscCall(ISRestoreIndices(isrow, &r)); 2383 PetscCall(ISRestoreIndices(iscol, &c)); 2384 2385 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2386 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2387 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2388 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2389 C->assembled = PETSC_TRUE; 2390 C->preallocated = PETSC_TRUE; 2391 if (sctx.nshift) { 2392 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2393 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2394 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2395 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2396 } 2397 } 2398 PetscCall(PetscLogFlops(C->cmap->n)); 2399 PetscCall(MatSeqAIJCheckInode(C)); 2400 PetscFunctionReturn(PETSC_SUCCESS); 2401 } 2402 #endif 2403 2404 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2405 { 2406 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2407 IS iscol = a->col, isrow = a->row; 2408 const PetscInt *r, *c, *rout, *cout; 2409 PetscInt i, j; 2410 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2411 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2412 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2413 PetscScalar sum1, sum2, sum3, sum4, sum5; 2414 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2415 const PetscScalar *b; 2416 2417 PetscFunctionBegin; 2418 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2419 node_max = a->inode.node_count; 2420 ns = a->inode.size_csr; /* Node Size array */ 2421 2422 PetscCall(VecGetArrayRead(bb, &b)); 2423 PetscCall(VecGetArrayWrite(xx, &x)); 2424 tmp = a->solve_work; 2425 2426 PetscCall(ISGetIndices(isrow, &rout)); 2427 r = rout; 2428 PetscCall(ISGetIndices(iscol, &cout)); 2429 c = cout; 2430 2431 /* forward solve the lower triangular */ 2432 tmps = tmp; 2433 aa = a_a; 2434 aj = a_j; 2435 ad = a->diag; 2436 2437 for (i = 0; i < node_max; ++i) { 2438 row = ns[i]; 2439 nsz = ns[i + 1] - ns[i]; 2440 aii = ai[row]; 2441 v1 = aa + aii; 2442 vi = aj + aii; 2443 nz = ai[row + 1] - ai[row]; 2444 2445 if (i < node_max - 1) { 2446 /* Prefetch the indices for the next block */ 2447 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2448 /* Prefetch the data for the next block */ 2449 PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2450 } 2451 2452 switch (nsz) { /* Each loop in 'case' is unrolled */ 2453 case 1: 2454 sum1 = b[r[row]]; 2455 for (j = 0; j < nz - 1; j += 2) { 2456 i0 = vi[j]; 2457 i1 = vi[j + 1]; 2458 tmp0 = tmps[i0]; 2459 tmp1 = tmps[i1]; 2460 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2461 } 2462 if (j == nz - 1) { 2463 tmp0 = tmps[vi[j]]; 2464 sum1 -= v1[j] * tmp0; 2465 } 2466 tmp[row++] = sum1; 2467 break; 2468 case 2: 2469 sum1 = b[r[row]]; 2470 sum2 = b[r[row + 1]]; 2471 v2 = aa + ai[row + 1]; 2472 2473 for (j = 0; j < nz - 1; j += 2) { 2474 i0 = vi[j]; 2475 i1 = vi[j + 1]; 2476 tmp0 = tmps[i0]; 2477 tmp1 = tmps[i1]; 2478 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2479 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2480 } 2481 if (j == nz - 1) { 2482 tmp0 = tmps[vi[j]]; 2483 sum1 -= v1[j] * tmp0; 2484 sum2 -= v2[j] * tmp0; 2485 } 2486 sum2 -= v2[nz] * sum1; 2487 tmp[row++] = sum1; 2488 tmp[row++] = sum2; 2489 break; 2490 case 3: 2491 sum1 = b[r[row]]; 2492 sum2 = b[r[row + 1]]; 2493 sum3 = b[r[row + 2]]; 2494 v2 = aa + ai[row + 1]; 2495 v3 = aa + ai[row + 2]; 2496 2497 for (j = 0; j < nz - 1; j += 2) { 2498 i0 = vi[j]; 2499 i1 = vi[j + 1]; 2500 tmp0 = tmps[i0]; 2501 tmp1 = tmps[i1]; 2502 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2503 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2504 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2505 } 2506 if (j == nz - 1) { 2507 tmp0 = tmps[vi[j]]; 2508 sum1 -= v1[j] * tmp0; 2509 sum2 -= v2[j] * tmp0; 2510 sum3 -= v3[j] * tmp0; 2511 } 2512 sum2 -= v2[nz] * sum1; 2513 sum3 -= v3[nz] * sum1; 2514 sum3 -= v3[nz + 1] * sum2; 2515 tmp[row++] = sum1; 2516 tmp[row++] = sum2; 2517 tmp[row++] = sum3; 2518 break; 2519 2520 case 4: 2521 sum1 = b[r[row]]; 2522 sum2 = b[r[row + 1]]; 2523 sum3 = b[r[row + 2]]; 2524 sum4 = b[r[row + 3]]; 2525 v2 = aa + ai[row + 1]; 2526 v3 = aa + ai[row + 2]; 2527 v4 = aa + ai[row + 3]; 2528 2529 for (j = 0; j < nz - 1; j += 2) { 2530 i0 = vi[j]; 2531 i1 = vi[j + 1]; 2532 tmp0 = tmps[i0]; 2533 tmp1 = tmps[i1]; 2534 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2535 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2536 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2537 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2538 } 2539 if (j == nz - 1) { 2540 tmp0 = tmps[vi[j]]; 2541 sum1 -= v1[j] * tmp0; 2542 sum2 -= v2[j] * tmp0; 2543 sum3 -= v3[j] * tmp0; 2544 sum4 -= v4[j] * tmp0; 2545 } 2546 sum2 -= v2[nz] * sum1; 2547 sum3 -= v3[nz] * sum1; 2548 sum4 -= v4[nz] * sum1; 2549 sum3 -= v3[nz + 1] * sum2; 2550 sum4 -= v4[nz + 1] * sum2; 2551 sum4 -= v4[nz + 2] * sum3; 2552 2553 tmp[row++] = sum1; 2554 tmp[row++] = sum2; 2555 tmp[row++] = sum3; 2556 tmp[row++] = sum4; 2557 break; 2558 case 5: 2559 sum1 = b[r[row]]; 2560 sum2 = b[r[row + 1]]; 2561 sum3 = b[r[row + 2]]; 2562 sum4 = b[r[row + 3]]; 2563 sum5 = b[r[row + 4]]; 2564 v2 = aa + ai[row + 1]; 2565 v3 = aa + ai[row + 2]; 2566 v4 = aa + ai[row + 3]; 2567 v5 = aa + ai[row + 4]; 2568 2569 for (j = 0; j < nz - 1; j += 2) { 2570 i0 = vi[j]; 2571 i1 = vi[j + 1]; 2572 tmp0 = tmps[i0]; 2573 tmp1 = tmps[i1]; 2574 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2575 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2576 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2577 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2578 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2579 } 2580 if (j == nz - 1) { 2581 tmp0 = tmps[vi[j]]; 2582 sum1 -= v1[j] * tmp0; 2583 sum2 -= v2[j] * tmp0; 2584 sum3 -= v3[j] * tmp0; 2585 sum4 -= v4[j] * tmp0; 2586 sum5 -= v5[j] * tmp0; 2587 } 2588 2589 sum2 -= v2[nz] * sum1; 2590 sum3 -= v3[nz] * sum1; 2591 sum4 -= v4[nz] * sum1; 2592 sum5 -= v5[nz] * sum1; 2593 sum3 -= v3[nz + 1] * sum2; 2594 sum4 -= v4[nz + 1] * sum2; 2595 sum5 -= v5[nz + 1] * sum2; 2596 sum4 -= v4[nz + 2] * sum3; 2597 sum5 -= v5[nz + 2] * sum3; 2598 sum5 -= v5[nz + 3] * sum4; 2599 2600 tmp[row++] = sum1; 2601 tmp[row++] = sum2; 2602 tmp[row++] = sum3; 2603 tmp[row++] = sum4; 2604 tmp[row++] = sum5; 2605 break; 2606 default: 2607 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2608 } 2609 } 2610 /* backward solve the upper triangular */ 2611 for (i = node_max - 1; i >= 0; i--) { 2612 row = ns[i + 1] - 1; 2613 nsz = ns[i + 1] - ns[i]; 2614 aii = ad[row + 1] + 1; 2615 v1 = aa + aii; 2616 vi = aj + aii; 2617 nz = ad[row] - ad[row + 1] - 1; 2618 2619 if (i > 0) { 2620 /* Prefetch the indices for the next block */ 2621 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2622 /* Prefetch the data for the next block */ 2623 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2624 } 2625 2626 switch (nsz) { /* Each loop in 'case' is unrolled */ 2627 case 1: 2628 sum1 = tmp[row]; 2629 2630 for (j = 0; j < nz - 1; j += 2) { 2631 i0 = vi[j]; 2632 i1 = vi[j + 1]; 2633 tmp0 = tmps[i0]; 2634 tmp1 = tmps[i1]; 2635 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2636 } 2637 if (j == nz - 1) { 2638 tmp0 = tmps[vi[j]]; 2639 sum1 -= v1[j] * tmp0; 2640 } 2641 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2642 row--; 2643 break; 2644 case 2: 2645 sum1 = tmp[row]; 2646 sum2 = tmp[row - 1]; 2647 v2 = aa + ad[row] + 1; 2648 for (j = 0; j < nz - 1; j += 2) { 2649 i0 = vi[j]; 2650 i1 = vi[j + 1]; 2651 tmp0 = tmps[i0]; 2652 tmp1 = tmps[i1]; 2653 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2654 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2655 } 2656 if (j == nz - 1) { 2657 tmp0 = tmps[vi[j]]; 2658 sum1 -= v1[j] * tmp0; 2659 sum2 -= v2[j + 1] * tmp0; 2660 } 2661 2662 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2663 row--; 2664 sum2 -= v2[0] * tmp0; 2665 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2666 row--; 2667 break; 2668 case 3: 2669 sum1 = tmp[row]; 2670 sum2 = tmp[row - 1]; 2671 sum3 = tmp[row - 2]; 2672 v2 = aa + ad[row] + 1; 2673 v3 = aa + ad[row - 1] + 1; 2674 for (j = 0; j < nz - 1; j += 2) { 2675 i0 = vi[j]; 2676 i1 = vi[j + 1]; 2677 tmp0 = tmps[i0]; 2678 tmp1 = tmps[i1]; 2679 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2680 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2681 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2682 } 2683 if (j == nz - 1) { 2684 tmp0 = tmps[vi[j]]; 2685 sum1 -= v1[j] * tmp0; 2686 sum2 -= v2[j + 1] * tmp0; 2687 sum3 -= v3[j + 2] * tmp0; 2688 } 2689 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2690 row--; 2691 sum2 -= v2[0] * tmp0; 2692 sum3 -= v3[1] * tmp0; 2693 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2694 row--; 2695 sum3 -= v3[0] * tmp0; 2696 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2697 row--; 2698 2699 break; 2700 case 4: 2701 sum1 = tmp[row]; 2702 sum2 = tmp[row - 1]; 2703 sum3 = tmp[row - 2]; 2704 sum4 = tmp[row - 3]; 2705 v2 = aa + ad[row] + 1; 2706 v3 = aa + ad[row - 1] + 1; 2707 v4 = aa + ad[row - 2] + 1; 2708 2709 for (j = 0; j < nz - 1; j += 2) { 2710 i0 = vi[j]; 2711 i1 = vi[j + 1]; 2712 tmp0 = tmps[i0]; 2713 tmp1 = tmps[i1]; 2714 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2715 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2716 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2717 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2718 } 2719 if (j == nz - 1) { 2720 tmp0 = tmps[vi[j]]; 2721 sum1 -= v1[j] * tmp0; 2722 sum2 -= v2[j + 1] * tmp0; 2723 sum3 -= v3[j + 2] * tmp0; 2724 sum4 -= v4[j + 3] * tmp0; 2725 } 2726 2727 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2728 row--; 2729 sum2 -= v2[0] * tmp0; 2730 sum3 -= v3[1] * tmp0; 2731 sum4 -= v4[2] * tmp0; 2732 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2733 row--; 2734 sum3 -= v3[0] * tmp0; 2735 sum4 -= v4[1] * tmp0; 2736 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2737 row--; 2738 sum4 -= v4[0] * tmp0; 2739 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2740 row--; 2741 break; 2742 case 5: 2743 sum1 = tmp[row]; 2744 sum2 = tmp[row - 1]; 2745 sum3 = tmp[row - 2]; 2746 sum4 = tmp[row - 3]; 2747 sum5 = tmp[row - 4]; 2748 v2 = aa + ad[row] + 1; 2749 v3 = aa + ad[row - 1] + 1; 2750 v4 = aa + ad[row - 2] + 1; 2751 v5 = aa + ad[row - 3] + 1; 2752 for (j = 0; j < nz - 1; j += 2) { 2753 i0 = vi[j]; 2754 i1 = vi[j + 1]; 2755 tmp0 = tmps[i0]; 2756 tmp1 = tmps[i1]; 2757 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2758 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2759 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2760 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2761 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2762 } 2763 if (j == nz - 1) { 2764 tmp0 = tmps[vi[j]]; 2765 sum1 -= v1[j] * tmp0; 2766 sum2 -= v2[j + 1] * tmp0; 2767 sum3 -= v3[j + 2] * tmp0; 2768 sum4 -= v4[j + 3] * tmp0; 2769 sum5 -= v5[j + 4] * tmp0; 2770 } 2771 2772 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2773 row--; 2774 sum2 -= v2[0] * tmp0; 2775 sum3 -= v3[1] * tmp0; 2776 sum4 -= v4[2] * tmp0; 2777 sum5 -= v5[3] * tmp0; 2778 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2779 row--; 2780 sum3 -= v3[0] * tmp0; 2781 sum4 -= v4[1] * tmp0; 2782 sum5 -= v5[2] * tmp0; 2783 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2784 row--; 2785 sum4 -= v4[0] * tmp0; 2786 sum5 -= v5[1] * tmp0; 2787 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2788 row--; 2789 sum5 -= v5[0] * tmp0; 2790 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2791 row--; 2792 break; 2793 default: 2794 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2795 } 2796 } 2797 PetscCall(ISRestoreIndices(isrow, &rout)); 2798 PetscCall(ISRestoreIndices(iscol, &cout)); 2799 PetscCall(VecRestoreArrayRead(bb, &b)); 2800 PetscCall(VecRestoreArrayWrite(xx, &x)); 2801 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2802 PetscFunctionReturn(PETSC_SUCCESS); 2803 } 2804 2805 /* 2806 Makes a longer coloring[] array and calls the usual code with that 2807 */ 2808 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2809 { 2810 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2811 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row; 2812 PetscInt *colorused, i; 2813 ISColoringValue *newcolor; 2814 2815 PetscFunctionBegin; 2816 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2817 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2818 /* loop over inodes, marking a color for each column*/ 2819 row = 0; 2820 for (i = 0; i < m; i++) { 2821 for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++)); 2822 } 2823 2824 /* eliminate unneeded colors */ 2825 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2826 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2827 2828 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2829 ncolors = colorused[5 * ncolors - 1]; 2830 for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i)); 2831 PetscCall(PetscFree(colorused)); 2832 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2833 PetscCall(PetscFree(coloring)); 2834 PetscFunctionReturn(PETSC_SUCCESS); 2835 } 2836 2837 #include <petsc/private/kernels/blockinvert.h> 2838 2839 /* 2840 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 2841 */ 2842 static PetscErrorCode MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A, PetscScalar omega, PetscScalar fshift) 2843 { 2844 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2845 MatScalar *ibdiag, *bdiag, work[25]; 2846 const MatScalar *v = a->a; 2847 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2848 PetscInt m = a->inode.node_count, cnt = 0, i, j, row, nodesz; 2849 PetscInt k, ipvt[5]; 2850 PetscBool allowzeropivot = PetscNot(A->erroriffailure), zeropivotdetected; 2851 const PetscInt *sizes = a->inode.size_csr, *diag = a->diag; 2852 2853 PetscFunctionBegin; 2854 if (a->idiagState == ((PetscObject)A)->state) PetscFunctionReturn(PETSC_SUCCESS); 2855 PetscCall(MatMarkDiagonal_SeqAIJ(A)); 2856 diag = a->diag; 2857 if (!a->inode.ibdiag) { 2858 /* calculate space needed for diagonal blocks */ 2859 for (i = 0; i < m; i++) { 2860 nodesz = sizes[i + 1] - sizes[i]; 2861 cnt += nodesz * nodesz; 2862 } 2863 a->inode.bdiagsize = cnt; 2864 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2865 } 2866 2867 /* copy over the diagonal blocks and invert them */ 2868 ibdiag = a->inode.ibdiag; 2869 bdiag = a->inode.bdiag; 2870 cnt = 0; 2871 for (i = 0, row = 0; i < m; i++) { 2872 nodesz = sizes[i + 1] - sizes[i]; 2873 for (j = 0; j < nodesz; j++) { 2874 for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k]; 2875 } 2876 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz)); 2877 2878 switch (nodesz) { 2879 case 1: 2880 /* Create matrix data structure */ 2881 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2882 PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2883 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2884 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2885 A->factorerror_zeropivot_row = row; 2886 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2887 } 2888 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2889 break; 2890 case 2: 2891 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2892 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2893 break; 2894 case 3: 2895 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2896 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2897 break; 2898 case 4: 2899 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2900 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2901 break; 2902 case 5: 2903 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2904 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2905 break; 2906 default: 2907 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 2908 } 2909 cnt += nodesz * nodesz; 2910 row += nodesz; 2911 } 2912 a->inode.ibdiagState = ((PetscObject)A)->state; 2913 PetscFunctionReturn(PETSC_SUCCESS); 2914 } 2915 2916 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2917 { 2918 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2919 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2920 MatScalar *ibdiag, *bdiag, *t; 2921 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2922 const MatScalar *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2923 const PetscScalar *xb, *b; 2924 PetscInt n, m = a->inode.node_count, cnt = 0, i, row, i1, i2, nodesz; 2925 PetscInt sz; 2926 const PetscInt *sizes = a->inode.size_csr, *idx, *diag, *ii = a->i; 2927 2928 PetscFunctionBegin; 2929 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2930 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2931 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2932 PetscCall(MatInvertDiagonalForSOR_SeqAIJ_Inode(A, omega, fshift)); 2933 diag = a->diag; 2934 2935 ibdiag = a->inode.ibdiag; 2936 bdiag = a->inode.bdiag; 2937 t = a->inode.ssor_work; 2938 2939 PetscCall(VecGetArray(xx, &x)); 2940 PetscCall(VecGetArrayRead(bb, &b)); 2941 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2942 if (flag & SOR_ZERO_INITIAL_GUESS) { 2943 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2944 for (i = 0, row = 0; i < m; i++) { 2945 sz = diag[row] - ii[row]; 2946 v1 = a->a + ii[row]; 2947 idx = a->j + ii[row]; 2948 2949 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2950 nodesz = sizes[i + 1] - sizes[i]; 2951 switch (nodesz) { 2952 case 1: 2953 2954 sum1 = b[row]; 2955 for (n = 0; n < sz - 1; n += 2) { 2956 i1 = idx[0]; 2957 i2 = idx[1]; 2958 idx += 2; 2959 tmp0 = x[i1]; 2960 tmp1 = x[i2]; 2961 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2962 v1 += 2; 2963 } 2964 2965 if (n == sz - 1) { 2966 tmp0 = x[*idx]; 2967 sum1 -= *v1 * tmp0; 2968 } 2969 t[row] = sum1; 2970 x[row++] = sum1 * (*ibdiag++); 2971 break; 2972 case 2: 2973 v2 = a->a + ii[row + 1]; 2974 sum1 = b[row]; 2975 sum2 = b[row + 1]; 2976 for (n = 0; n < sz - 1; n += 2) { 2977 i1 = idx[0]; 2978 i2 = idx[1]; 2979 idx += 2; 2980 tmp0 = x[i1]; 2981 tmp1 = x[i2]; 2982 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2983 v1 += 2; 2984 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2985 v2 += 2; 2986 } 2987 2988 if (n == sz - 1) { 2989 tmp0 = x[*idx]; 2990 sum1 -= v1[0] * tmp0; 2991 sum2 -= v2[0] * tmp0; 2992 } 2993 t[row] = sum1; 2994 t[row + 1] = sum2; 2995 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2996 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2997 ibdiag += 4; 2998 break; 2999 case 3: 3000 v2 = a->a + ii[row + 1]; 3001 v3 = a->a + ii[row + 2]; 3002 sum1 = b[row]; 3003 sum2 = b[row + 1]; 3004 sum3 = b[row + 2]; 3005 for (n = 0; n < sz - 1; n += 2) { 3006 i1 = idx[0]; 3007 i2 = idx[1]; 3008 idx += 2; 3009 tmp0 = x[i1]; 3010 tmp1 = x[i2]; 3011 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3012 v1 += 2; 3013 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3014 v2 += 2; 3015 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3016 v3 += 2; 3017 } 3018 3019 if (n == sz - 1) { 3020 tmp0 = x[*idx]; 3021 sum1 -= v1[0] * tmp0; 3022 sum2 -= v2[0] * tmp0; 3023 sum3 -= v3[0] * tmp0; 3024 } 3025 t[row] = sum1; 3026 t[row + 1] = sum2; 3027 t[row + 2] = sum3; 3028 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3029 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3030 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3031 ibdiag += 9; 3032 break; 3033 case 4: 3034 v2 = a->a + ii[row + 1]; 3035 v3 = a->a + ii[row + 2]; 3036 v4 = a->a + ii[row + 3]; 3037 sum1 = b[row]; 3038 sum2 = b[row + 1]; 3039 sum3 = b[row + 2]; 3040 sum4 = b[row + 3]; 3041 for (n = 0; n < sz - 1; n += 2) { 3042 i1 = idx[0]; 3043 i2 = idx[1]; 3044 idx += 2; 3045 tmp0 = x[i1]; 3046 tmp1 = x[i2]; 3047 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3048 v1 += 2; 3049 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3050 v2 += 2; 3051 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3052 v3 += 2; 3053 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3054 v4 += 2; 3055 } 3056 3057 if (n == sz - 1) { 3058 tmp0 = x[*idx]; 3059 sum1 -= v1[0] * tmp0; 3060 sum2 -= v2[0] * tmp0; 3061 sum3 -= v3[0] * tmp0; 3062 sum4 -= v4[0] * tmp0; 3063 } 3064 t[row] = sum1; 3065 t[row + 1] = sum2; 3066 t[row + 2] = sum3; 3067 t[row + 3] = sum4; 3068 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3069 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3070 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3071 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3072 ibdiag += 16; 3073 break; 3074 case 5: 3075 v2 = a->a + ii[row + 1]; 3076 v3 = a->a + ii[row + 2]; 3077 v4 = a->a + ii[row + 3]; 3078 v5 = a->a + ii[row + 4]; 3079 sum1 = b[row]; 3080 sum2 = b[row + 1]; 3081 sum3 = b[row + 2]; 3082 sum4 = b[row + 3]; 3083 sum5 = b[row + 4]; 3084 for (n = 0; n < sz - 1; n += 2) { 3085 i1 = idx[0]; 3086 i2 = idx[1]; 3087 idx += 2; 3088 tmp0 = x[i1]; 3089 tmp1 = x[i2]; 3090 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3091 v1 += 2; 3092 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3093 v2 += 2; 3094 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3095 v3 += 2; 3096 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3097 v4 += 2; 3098 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3099 v5 += 2; 3100 } 3101 3102 if (n == sz - 1) { 3103 tmp0 = x[*idx]; 3104 sum1 -= v1[0] * tmp0; 3105 sum2 -= v2[0] * tmp0; 3106 sum3 -= v3[0] * tmp0; 3107 sum4 -= v4[0] * tmp0; 3108 sum5 -= v5[0] * tmp0; 3109 } 3110 t[row] = sum1; 3111 t[row + 1] = sum2; 3112 t[row + 2] = sum3; 3113 t[row + 3] = sum4; 3114 t[row + 4] = sum5; 3115 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3116 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3117 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3118 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3119 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3120 ibdiag += 25; 3121 break; 3122 default: 3123 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3124 } 3125 } 3126 3127 xb = t; 3128 PetscCall(PetscLogFlops(a->nz)); 3129 } else xb = b; 3130 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3131 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3132 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3133 nodesz = sizes[i + 1] - sizes[i]; 3134 ibdiag -= nodesz * nodesz; 3135 sz = ii[row + 1] - diag[row] - 1; 3136 v1 = a->a + diag[row] + 1; 3137 idx = a->j + diag[row] + 1; 3138 3139 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3140 switch (nodesz) { 3141 case 1: 3142 3143 sum1 = xb[row]; 3144 for (n = 0; n < sz - 1; n += 2) { 3145 i1 = idx[0]; 3146 i2 = idx[1]; 3147 idx += 2; 3148 tmp0 = x[i1]; 3149 tmp1 = x[i2]; 3150 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3151 v1 += 2; 3152 } 3153 3154 if (n == sz - 1) { 3155 tmp0 = x[*idx]; 3156 sum1 -= *v1 * tmp0; 3157 } 3158 x[row--] = sum1 * (*ibdiag); 3159 break; 3160 3161 case 2: 3162 3163 sum1 = xb[row]; 3164 sum2 = xb[row - 1]; 3165 /* note that sum1 is associated with the second of the two rows */ 3166 v2 = a->a + diag[row - 1] + 2; 3167 for (n = 0; n < sz - 1; n += 2) { 3168 i1 = idx[0]; 3169 i2 = idx[1]; 3170 idx += 2; 3171 tmp0 = x[i1]; 3172 tmp1 = x[i2]; 3173 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3174 v1 += 2; 3175 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3176 v2 += 2; 3177 } 3178 3179 if (n == sz - 1) { 3180 tmp0 = x[*idx]; 3181 sum1 -= *v1 * tmp0; 3182 sum2 -= *v2 * tmp0; 3183 } 3184 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3185 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3186 break; 3187 case 3: 3188 3189 sum1 = xb[row]; 3190 sum2 = xb[row - 1]; 3191 sum3 = xb[row - 2]; 3192 v2 = a->a + diag[row - 1] + 2; 3193 v3 = a->a + diag[row - 2] + 3; 3194 for (n = 0; n < sz - 1; n += 2) { 3195 i1 = idx[0]; 3196 i2 = idx[1]; 3197 idx += 2; 3198 tmp0 = x[i1]; 3199 tmp1 = x[i2]; 3200 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3201 v1 += 2; 3202 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3203 v2 += 2; 3204 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3205 v3 += 2; 3206 } 3207 3208 if (n == sz - 1) { 3209 tmp0 = x[*idx]; 3210 sum1 -= *v1 * tmp0; 3211 sum2 -= *v2 * tmp0; 3212 sum3 -= *v3 * tmp0; 3213 } 3214 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3215 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3216 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3217 break; 3218 case 4: 3219 3220 sum1 = xb[row]; 3221 sum2 = xb[row - 1]; 3222 sum3 = xb[row - 2]; 3223 sum4 = xb[row - 3]; 3224 v2 = a->a + diag[row - 1] + 2; 3225 v3 = a->a + diag[row - 2] + 3; 3226 v4 = a->a + diag[row - 3] + 4; 3227 for (n = 0; n < sz - 1; n += 2) { 3228 i1 = idx[0]; 3229 i2 = idx[1]; 3230 idx += 2; 3231 tmp0 = x[i1]; 3232 tmp1 = x[i2]; 3233 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3234 v1 += 2; 3235 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3236 v2 += 2; 3237 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3238 v3 += 2; 3239 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3240 v4 += 2; 3241 } 3242 3243 if (n == sz - 1) { 3244 tmp0 = x[*idx]; 3245 sum1 -= *v1 * tmp0; 3246 sum2 -= *v2 * tmp0; 3247 sum3 -= *v3 * tmp0; 3248 sum4 -= *v4 * tmp0; 3249 } 3250 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3251 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3252 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3253 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3254 break; 3255 case 5: 3256 3257 sum1 = xb[row]; 3258 sum2 = xb[row - 1]; 3259 sum3 = xb[row - 2]; 3260 sum4 = xb[row - 3]; 3261 sum5 = xb[row - 4]; 3262 v2 = a->a + diag[row - 1] + 2; 3263 v3 = a->a + diag[row - 2] + 3; 3264 v4 = a->a + diag[row - 3] + 4; 3265 v5 = a->a + diag[row - 4] + 5; 3266 for (n = 0; n < sz - 1; n += 2) { 3267 i1 = idx[0]; 3268 i2 = idx[1]; 3269 idx += 2; 3270 tmp0 = x[i1]; 3271 tmp1 = x[i2]; 3272 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3273 v1 += 2; 3274 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3275 v2 += 2; 3276 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3277 v3 += 2; 3278 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3279 v4 += 2; 3280 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3281 v5 += 2; 3282 } 3283 3284 if (n == sz - 1) { 3285 tmp0 = x[*idx]; 3286 sum1 -= *v1 * tmp0; 3287 sum2 -= *v2 * tmp0; 3288 sum3 -= *v3 * tmp0; 3289 sum4 -= *v4 * tmp0; 3290 sum5 -= *v5 * tmp0; 3291 } 3292 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3293 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3294 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3295 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3296 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3297 break; 3298 default: 3299 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3300 } 3301 } 3302 3303 PetscCall(PetscLogFlops(a->nz)); 3304 } 3305 its--; 3306 } 3307 while (its--) { 3308 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3309 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) { 3310 nodesz = sizes[i + 1] - sizes[i]; 3311 sz = diag[row] - ii[row]; 3312 v1 = a->a + ii[row]; 3313 idx = a->j + ii[row]; 3314 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3315 switch (nodesz) { 3316 case 1: 3317 sum1 = b[row]; 3318 for (n = 0; n < sz - 1; n += 2) { 3319 i1 = idx[0]; 3320 i2 = idx[1]; 3321 idx += 2; 3322 tmp0 = x[i1]; 3323 tmp1 = x[i2]; 3324 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3325 v1 += 2; 3326 } 3327 if (n == sz - 1) { 3328 tmp0 = x[*idx++]; 3329 sum1 -= *v1 * tmp0; 3330 v1++; 3331 } 3332 t[row] = sum1; 3333 sz = ii[row + 1] - diag[row] - 1; 3334 idx = a->j + diag[row] + 1; 3335 v1 += 1; 3336 for (n = 0; n < sz - 1; n += 2) { 3337 i1 = idx[0]; 3338 i2 = idx[1]; 3339 idx += 2; 3340 tmp0 = x[i1]; 3341 tmp1 = x[i2]; 3342 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3343 v1 += 2; 3344 } 3345 if (n == sz - 1) { 3346 tmp0 = x[*idx++]; 3347 sum1 -= *v1 * tmp0; 3348 } 3349 /* in MatSOR_SeqAIJ this line would be 3350 * 3351 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3352 * 3353 * but omega == 1, so this becomes 3354 * 3355 * x[row] = sum1*(*ibdiag++); 3356 * 3357 */ 3358 x[row] = sum1 * (*ibdiag); 3359 break; 3360 case 2: 3361 v2 = a->a + ii[row + 1]; 3362 sum1 = b[row]; 3363 sum2 = b[row + 1]; 3364 for (n = 0; n < sz - 1; n += 2) { 3365 i1 = idx[0]; 3366 i2 = idx[1]; 3367 idx += 2; 3368 tmp0 = x[i1]; 3369 tmp1 = x[i2]; 3370 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3371 v1 += 2; 3372 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3373 v2 += 2; 3374 } 3375 if (n == sz - 1) { 3376 tmp0 = x[*idx++]; 3377 sum1 -= v1[0] * tmp0; 3378 sum2 -= v2[0] * tmp0; 3379 v1++; 3380 v2++; 3381 } 3382 t[row] = sum1; 3383 t[row + 1] = sum2; 3384 sz = ii[row + 1] - diag[row] - 2; 3385 idx = a->j + diag[row] + 2; 3386 v1 += 2; 3387 v2 += 2; 3388 for (n = 0; n < sz - 1; n += 2) { 3389 i1 = idx[0]; 3390 i2 = idx[1]; 3391 idx += 2; 3392 tmp0 = x[i1]; 3393 tmp1 = x[i2]; 3394 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3395 v1 += 2; 3396 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3397 v2 += 2; 3398 } 3399 if (n == sz - 1) { 3400 tmp0 = x[*idx]; 3401 sum1 -= v1[0] * tmp0; 3402 sum2 -= v2[0] * tmp0; 3403 } 3404 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3405 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3406 break; 3407 case 3: 3408 v2 = a->a + ii[row + 1]; 3409 v3 = a->a + ii[row + 2]; 3410 sum1 = b[row]; 3411 sum2 = b[row + 1]; 3412 sum3 = b[row + 2]; 3413 for (n = 0; n < sz - 1; n += 2) { 3414 i1 = idx[0]; 3415 i2 = idx[1]; 3416 idx += 2; 3417 tmp0 = x[i1]; 3418 tmp1 = x[i2]; 3419 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3420 v1 += 2; 3421 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3422 v2 += 2; 3423 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3424 v3 += 2; 3425 } 3426 if (n == sz - 1) { 3427 tmp0 = x[*idx++]; 3428 sum1 -= v1[0] * tmp0; 3429 sum2 -= v2[0] * tmp0; 3430 sum3 -= v3[0] * tmp0; 3431 v1++; 3432 v2++; 3433 v3++; 3434 } 3435 t[row] = sum1; 3436 t[row + 1] = sum2; 3437 t[row + 2] = sum3; 3438 sz = ii[row + 1] - diag[row] - 3; 3439 idx = a->j + diag[row] + 3; 3440 v1 += 3; 3441 v2 += 3; 3442 v3 += 3; 3443 for (n = 0; n < sz - 1; n += 2) { 3444 i1 = idx[0]; 3445 i2 = idx[1]; 3446 idx += 2; 3447 tmp0 = x[i1]; 3448 tmp1 = x[i2]; 3449 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3450 v1 += 2; 3451 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3452 v2 += 2; 3453 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3454 v3 += 2; 3455 } 3456 if (n == sz - 1) { 3457 tmp0 = x[*idx]; 3458 sum1 -= v1[0] * tmp0; 3459 sum2 -= v2[0] * tmp0; 3460 sum3 -= v3[0] * tmp0; 3461 } 3462 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3463 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3464 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3465 break; 3466 case 4: 3467 v2 = a->a + ii[row + 1]; 3468 v3 = a->a + ii[row + 2]; 3469 v4 = a->a + ii[row + 3]; 3470 sum1 = b[row]; 3471 sum2 = b[row + 1]; 3472 sum3 = b[row + 2]; 3473 sum4 = b[row + 3]; 3474 for (n = 0; n < sz - 1; n += 2) { 3475 i1 = idx[0]; 3476 i2 = idx[1]; 3477 idx += 2; 3478 tmp0 = x[i1]; 3479 tmp1 = x[i2]; 3480 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3481 v1 += 2; 3482 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3483 v2 += 2; 3484 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3485 v3 += 2; 3486 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3487 v4 += 2; 3488 } 3489 if (n == sz - 1) { 3490 tmp0 = x[*idx++]; 3491 sum1 -= v1[0] * tmp0; 3492 sum2 -= v2[0] * tmp0; 3493 sum3 -= v3[0] * tmp0; 3494 sum4 -= v4[0] * tmp0; 3495 v1++; 3496 v2++; 3497 v3++; 3498 v4++; 3499 } 3500 t[row] = sum1; 3501 t[row + 1] = sum2; 3502 t[row + 2] = sum3; 3503 t[row + 3] = sum4; 3504 sz = ii[row + 1] - diag[row] - 4; 3505 idx = a->j + diag[row] + 4; 3506 v1 += 4; 3507 v2 += 4; 3508 v3 += 4; 3509 v4 += 4; 3510 for (n = 0; n < sz - 1; n += 2) { 3511 i1 = idx[0]; 3512 i2 = idx[1]; 3513 idx += 2; 3514 tmp0 = x[i1]; 3515 tmp1 = x[i2]; 3516 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3517 v1 += 2; 3518 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3519 v2 += 2; 3520 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3521 v3 += 2; 3522 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3523 v4 += 2; 3524 } 3525 if (n == sz - 1) { 3526 tmp0 = x[*idx]; 3527 sum1 -= v1[0] * tmp0; 3528 sum2 -= v2[0] * tmp0; 3529 sum3 -= v3[0] * tmp0; 3530 sum4 -= v4[0] * tmp0; 3531 } 3532 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3533 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3534 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3535 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3536 break; 3537 case 5: 3538 v2 = a->a + ii[row + 1]; 3539 v3 = a->a + ii[row + 2]; 3540 v4 = a->a + ii[row + 3]; 3541 v5 = a->a + ii[row + 4]; 3542 sum1 = b[row]; 3543 sum2 = b[row + 1]; 3544 sum3 = b[row + 2]; 3545 sum4 = b[row + 3]; 3546 sum5 = b[row + 4]; 3547 for (n = 0; n < sz - 1; n += 2) { 3548 i1 = idx[0]; 3549 i2 = idx[1]; 3550 idx += 2; 3551 tmp0 = x[i1]; 3552 tmp1 = x[i2]; 3553 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3554 v1 += 2; 3555 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3556 v2 += 2; 3557 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3558 v3 += 2; 3559 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3560 v4 += 2; 3561 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3562 v5 += 2; 3563 } 3564 if (n == sz - 1) { 3565 tmp0 = x[*idx++]; 3566 sum1 -= v1[0] * tmp0; 3567 sum2 -= v2[0] * tmp0; 3568 sum3 -= v3[0] * tmp0; 3569 sum4 -= v4[0] * tmp0; 3570 sum5 -= v5[0] * tmp0; 3571 v1++; 3572 v2++; 3573 v3++; 3574 v4++; 3575 v5++; 3576 } 3577 t[row] = sum1; 3578 t[row + 1] = sum2; 3579 t[row + 2] = sum3; 3580 t[row + 3] = sum4; 3581 t[row + 4] = sum5; 3582 sz = ii[row + 1] - diag[row] - 5; 3583 idx = a->j + diag[row] + 5; 3584 v1 += 5; 3585 v2 += 5; 3586 v3 += 5; 3587 v4 += 5; 3588 v5 += 5; 3589 for (n = 0; n < sz - 1; n += 2) { 3590 i1 = idx[0]; 3591 i2 = idx[1]; 3592 idx += 2; 3593 tmp0 = x[i1]; 3594 tmp1 = x[i2]; 3595 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3596 v1 += 2; 3597 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3598 v2 += 2; 3599 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3600 v3 += 2; 3601 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3602 v4 += 2; 3603 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3604 v5 += 2; 3605 } 3606 if (n == sz - 1) { 3607 tmp0 = x[*idx]; 3608 sum1 -= v1[0] * tmp0; 3609 sum2 -= v2[0] * tmp0; 3610 sum3 -= v3[0] * tmp0; 3611 sum4 -= v4[0] * tmp0; 3612 sum5 -= v5[0] * tmp0; 3613 } 3614 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3615 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3616 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3617 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3618 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3619 break; 3620 default: 3621 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3622 } 3623 } 3624 xb = t; 3625 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3626 } else xb = b; 3627 3628 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3629 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3630 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3631 nodesz = sizes[i + 1] - sizes[i]; 3632 ibdiag -= nodesz * nodesz; 3633 3634 /* set RHS */ 3635 if (xb == b) { 3636 /* whole (old way) */ 3637 sz = ii[row + 1] - ii[row]; 3638 idx = a->j + ii[row]; 3639 switch (nodesz) { 3640 case 5: 3641 v5 = a->a + ii[row - 4]; /* fall through */ 3642 case 4: 3643 v4 = a->a + ii[row - 3]; /* fall through */ 3644 case 3: 3645 v3 = a->a + ii[row - 2]; /* fall through */ 3646 case 2: 3647 v2 = a->a + ii[row - 1]; /* fall through */ 3648 case 1: 3649 v1 = a->a + ii[row]; 3650 break; 3651 default: 3652 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3653 } 3654 } else { 3655 /* upper, no diag */ 3656 sz = ii[row + 1] - diag[row] - 1; 3657 idx = a->j + diag[row] + 1; 3658 switch (nodesz) { 3659 case 5: 3660 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3661 case 4: 3662 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3663 case 3: 3664 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3665 case 2: 3666 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3667 case 1: 3668 v1 = a->a + diag[row] + 1; 3669 } 3670 } 3671 /* set sum */ 3672 switch (nodesz) { 3673 case 5: 3674 sum5 = xb[row - 4]; /* fall through */ 3675 case 4: 3676 sum4 = xb[row - 3]; /* fall through */ 3677 case 3: 3678 sum3 = xb[row - 2]; /* fall through */ 3679 case 2: 3680 sum2 = xb[row - 1]; /* fall through */ 3681 case 1: 3682 /* note that sum1 is associated with the last row */ 3683 sum1 = xb[row]; 3684 } 3685 /* do sums */ 3686 for (n = 0; n < sz - 1; n += 2) { 3687 i1 = idx[0]; 3688 i2 = idx[1]; 3689 idx += 2; 3690 tmp0 = x[i1]; 3691 tmp1 = x[i2]; 3692 switch (nodesz) { 3693 case 5: 3694 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3695 v5 += 2; /* fall through */ 3696 case 4: 3697 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3698 v4 += 2; /* fall through */ 3699 case 3: 3700 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3701 v3 += 2; /* fall through */ 3702 case 2: 3703 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3704 v2 += 2; /* fall through */ 3705 case 1: 3706 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3707 v1 += 2; 3708 } 3709 } 3710 /* ragged edge */ 3711 if (n == sz - 1) { 3712 tmp0 = x[*idx]; 3713 switch (nodesz) { 3714 case 5: 3715 sum5 -= *v5 * tmp0; /* fall through */ 3716 case 4: 3717 sum4 -= *v4 * tmp0; /* fall through */ 3718 case 3: 3719 sum3 -= *v3 * tmp0; /* fall through */ 3720 case 2: 3721 sum2 -= *v2 * tmp0; /* fall through */ 3722 case 1: 3723 sum1 -= *v1 * tmp0; 3724 } 3725 } 3726 /* update */ 3727 if (xb == b) { 3728 /* whole (old way) w/ diag */ 3729 switch (nodesz) { 3730 case 5: 3731 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3732 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3733 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3734 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3735 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3736 break; 3737 case 4: 3738 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3739 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3740 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3741 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3742 break; 3743 case 3: 3744 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3745 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3746 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3747 break; 3748 case 2: 3749 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3750 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3751 break; 3752 case 1: 3753 x[row--] += sum1 * (*ibdiag); 3754 break; 3755 } 3756 } else { 3757 /* no diag so set = */ 3758 switch (nodesz) { 3759 case 5: 3760 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3761 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3762 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3763 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3764 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3765 break; 3766 case 4: 3767 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3768 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3769 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3770 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3771 break; 3772 case 3: 3773 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3774 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3775 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3776 break; 3777 case 2: 3778 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3779 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3780 break; 3781 case 1: 3782 x[row--] = sum1 * (*ibdiag); 3783 break; 3784 } 3785 } 3786 } 3787 if (xb == b) { 3788 PetscCall(PetscLogFlops(2.0 * a->nz)); 3789 } else { 3790 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3791 } 3792 } 3793 } 3794 if (flag & SOR_EISENSTAT) { 3795 /* 3796 Apply (U + D)^-1 where D is now the block diagonal 3797 */ 3798 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3799 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3800 nodesz = sizes[i + 1] - sizes[i]; 3801 ibdiag -= nodesz * nodesz; 3802 sz = ii[row + 1] - diag[row] - 1; 3803 v1 = a->a + diag[row] + 1; 3804 idx = a->j + diag[row] + 1; 3805 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3806 switch (nodesz) { 3807 case 1: 3808 3809 sum1 = b[row]; 3810 for (n = 0; n < sz - 1; n += 2) { 3811 i1 = idx[0]; 3812 i2 = idx[1]; 3813 idx += 2; 3814 tmp0 = x[i1]; 3815 tmp1 = x[i2]; 3816 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3817 v1 += 2; 3818 } 3819 3820 if (n == sz - 1) { 3821 tmp0 = x[*idx]; 3822 sum1 -= *v1 * tmp0; 3823 } 3824 x[row] = sum1 * (*ibdiag); 3825 row--; 3826 break; 3827 3828 case 2: 3829 3830 sum1 = b[row]; 3831 sum2 = b[row - 1]; 3832 /* note that sum1 is associated with the second of the two rows */ 3833 v2 = a->a + diag[row - 1] + 2; 3834 for (n = 0; n < sz - 1; n += 2) { 3835 i1 = idx[0]; 3836 i2 = idx[1]; 3837 idx += 2; 3838 tmp0 = x[i1]; 3839 tmp1 = x[i2]; 3840 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3841 v1 += 2; 3842 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3843 v2 += 2; 3844 } 3845 3846 if (n == sz - 1) { 3847 tmp0 = x[*idx]; 3848 sum1 -= *v1 * tmp0; 3849 sum2 -= *v2 * tmp0; 3850 } 3851 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3852 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3853 row -= 2; 3854 break; 3855 case 3: 3856 3857 sum1 = b[row]; 3858 sum2 = b[row - 1]; 3859 sum3 = b[row - 2]; 3860 v2 = a->a + diag[row - 1] + 2; 3861 v3 = a->a + diag[row - 2] + 3; 3862 for (n = 0; n < sz - 1; n += 2) { 3863 i1 = idx[0]; 3864 i2 = idx[1]; 3865 idx += 2; 3866 tmp0 = x[i1]; 3867 tmp1 = x[i2]; 3868 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3869 v1 += 2; 3870 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3871 v2 += 2; 3872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3873 v3 += 2; 3874 } 3875 3876 if (n == sz - 1) { 3877 tmp0 = x[*idx]; 3878 sum1 -= *v1 * tmp0; 3879 sum2 -= *v2 * tmp0; 3880 sum3 -= *v3 * tmp0; 3881 } 3882 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3883 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3884 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3885 row -= 3; 3886 break; 3887 case 4: 3888 3889 sum1 = b[row]; 3890 sum2 = b[row - 1]; 3891 sum3 = b[row - 2]; 3892 sum4 = b[row - 3]; 3893 v2 = a->a + diag[row - 1] + 2; 3894 v3 = a->a + diag[row - 2] + 3; 3895 v4 = a->a + diag[row - 3] + 4; 3896 for (n = 0; n < sz - 1; n += 2) { 3897 i1 = idx[0]; 3898 i2 = idx[1]; 3899 idx += 2; 3900 tmp0 = x[i1]; 3901 tmp1 = x[i2]; 3902 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3903 v1 += 2; 3904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3905 v2 += 2; 3906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3907 v3 += 2; 3908 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3909 v4 += 2; 3910 } 3911 3912 if (n == sz - 1) { 3913 tmp0 = x[*idx]; 3914 sum1 -= *v1 * tmp0; 3915 sum2 -= *v2 * tmp0; 3916 sum3 -= *v3 * tmp0; 3917 sum4 -= *v4 * tmp0; 3918 } 3919 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3920 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3921 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3922 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3923 row -= 4; 3924 break; 3925 case 5: 3926 3927 sum1 = b[row]; 3928 sum2 = b[row - 1]; 3929 sum3 = b[row - 2]; 3930 sum4 = b[row - 3]; 3931 sum5 = b[row - 4]; 3932 v2 = a->a + diag[row - 1] + 2; 3933 v3 = a->a + diag[row - 2] + 3; 3934 v4 = a->a + diag[row - 3] + 4; 3935 v5 = a->a + diag[row - 4] + 5; 3936 for (n = 0; n < sz - 1; n += 2) { 3937 i1 = idx[0]; 3938 i2 = idx[1]; 3939 idx += 2; 3940 tmp0 = x[i1]; 3941 tmp1 = x[i2]; 3942 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3943 v1 += 2; 3944 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3945 v2 += 2; 3946 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3947 v3 += 2; 3948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3949 v4 += 2; 3950 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3951 v5 += 2; 3952 } 3953 3954 if (n == sz - 1) { 3955 tmp0 = x[*idx]; 3956 sum1 -= *v1 * tmp0; 3957 sum2 -= *v2 * tmp0; 3958 sum3 -= *v3 * tmp0; 3959 sum4 -= *v4 * tmp0; 3960 sum5 -= *v5 * tmp0; 3961 } 3962 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3963 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3964 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3965 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3966 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3967 row -= 5; 3968 break; 3969 default: 3970 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3971 } 3972 } 3973 PetscCall(PetscLogFlops(a->nz)); 3974 3975 /* 3976 t = b - D x where D is the block diagonal 3977 */ 3978 cnt = 0; 3979 for (i = 0, row = 0; i < m; i++) { 3980 nodesz = sizes[i + 1] - sizes[i]; 3981 switch (nodesz) { 3982 case 1: 3983 t[row] = b[row] - bdiag[cnt++] * x[row]; 3984 row++; 3985 break; 3986 case 2: 3987 x1 = x[row]; 3988 x2 = x[row + 1]; 3989 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3990 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3991 t[row] = b[row] - tmp1; 3992 t[row + 1] = b[row + 1] - tmp2; 3993 row += 2; 3994 cnt += 4; 3995 break; 3996 case 3: 3997 x1 = x[row]; 3998 x2 = x[row + 1]; 3999 x3 = x[row + 2]; 4000 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4001 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4002 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4003 t[row] = b[row] - tmp1; 4004 t[row + 1] = b[row + 1] - tmp2; 4005 t[row + 2] = b[row + 2] - tmp3; 4006 row += 3; 4007 cnt += 9; 4008 break; 4009 case 4: 4010 x1 = x[row]; 4011 x2 = x[row + 1]; 4012 x3 = x[row + 2]; 4013 x4 = x[row + 3]; 4014 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4015 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4016 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4017 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4018 t[row] = b[row] - tmp1; 4019 t[row + 1] = b[row + 1] - tmp2; 4020 t[row + 2] = b[row + 2] - tmp3; 4021 t[row + 3] = b[row + 3] - tmp4; 4022 row += 4; 4023 cnt += 16; 4024 break; 4025 case 5: 4026 x1 = x[row]; 4027 x2 = x[row + 1]; 4028 x3 = x[row + 2]; 4029 x4 = x[row + 3]; 4030 x5 = x[row + 4]; 4031 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4032 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4033 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4034 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4035 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4036 t[row] = b[row] - tmp1; 4037 t[row + 1] = b[row + 1] - tmp2; 4038 t[row + 2] = b[row + 2] - tmp3; 4039 t[row + 3] = b[row + 3] - tmp4; 4040 t[row + 4] = b[row + 4] - tmp5; 4041 row += 5; 4042 cnt += 25; 4043 break; 4044 default: 4045 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4046 } 4047 } 4048 PetscCall(PetscLogFlops(m)); 4049 4050 /* 4051 Apply (L + D)^-1 where D is the block diagonal 4052 */ 4053 for (i = 0, row = 0; i < m; i++) { 4054 nodesz = sizes[i + 1] - sizes[i]; 4055 sz = diag[row] - ii[row]; 4056 v1 = a->a + ii[row]; 4057 idx = a->j + ii[row]; 4058 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4059 switch (nodesz) { 4060 case 1: 4061 4062 sum1 = t[row]; 4063 for (n = 0; n < sz - 1; n += 2) { 4064 i1 = idx[0]; 4065 i2 = idx[1]; 4066 idx += 2; 4067 tmp0 = t[i1]; 4068 tmp1 = t[i2]; 4069 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4070 v1 += 2; 4071 } 4072 4073 if (n == sz - 1) { 4074 tmp0 = t[*idx]; 4075 sum1 -= *v1 * tmp0; 4076 } 4077 x[row] += t[row] = sum1 * (*ibdiag++); 4078 row++; 4079 break; 4080 case 2: 4081 v2 = a->a + ii[row + 1]; 4082 sum1 = t[row]; 4083 sum2 = t[row + 1]; 4084 for (n = 0; n < sz - 1; n += 2) { 4085 i1 = idx[0]; 4086 i2 = idx[1]; 4087 idx += 2; 4088 tmp0 = t[i1]; 4089 tmp1 = t[i2]; 4090 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4091 v1 += 2; 4092 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4093 v2 += 2; 4094 } 4095 4096 if (n == sz - 1) { 4097 tmp0 = t[*idx]; 4098 sum1 -= v1[0] * tmp0; 4099 sum2 -= v2[0] * tmp0; 4100 } 4101 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4102 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4103 ibdiag += 4; 4104 row += 2; 4105 break; 4106 case 3: 4107 v2 = a->a + ii[row + 1]; 4108 v3 = a->a + ii[row + 2]; 4109 sum1 = t[row]; 4110 sum2 = t[row + 1]; 4111 sum3 = t[row + 2]; 4112 for (n = 0; n < sz - 1; n += 2) { 4113 i1 = idx[0]; 4114 i2 = idx[1]; 4115 idx += 2; 4116 tmp0 = t[i1]; 4117 tmp1 = t[i2]; 4118 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4119 v1 += 2; 4120 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4121 v2 += 2; 4122 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4123 v3 += 2; 4124 } 4125 4126 if (n == sz - 1) { 4127 tmp0 = t[*idx]; 4128 sum1 -= v1[0] * tmp0; 4129 sum2 -= v2[0] * tmp0; 4130 sum3 -= v3[0] * tmp0; 4131 } 4132 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4133 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4134 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4135 ibdiag += 9; 4136 row += 3; 4137 break; 4138 case 4: 4139 v2 = a->a + ii[row + 1]; 4140 v3 = a->a + ii[row + 2]; 4141 v4 = a->a + ii[row + 3]; 4142 sum1 = t[row]; 4143 sum2 = t[row + 1]; 4144 sum3 = t[row + 2]; 4145 sum4 = t[row + 3]; 4146 for (n = 0; n < sz - 1; n += 2) { 4147 i1 = idx[0]; 4148 i2 = idx[1]; 4149 idx += 2; 4150 tmp0 = t[i1]; 4151 tmp1 = t[i2]; 4152 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4153 v1 += 2; 4154 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4155 v2 += 2; 4156 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4157 v3 += 2; 4158 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4159 v4 += 2; 4160 } 4161 4162 if (n == sz - 1) { 4163 tmp0 = t[*idx]; 4164 sum1 -= v1[0] * tmp0; 4165 sum2 -= v2[0] * tmp0; 4166 sum3 -= v3[0] * tmp0; 4167 sum4 -= v4[0] * tmp0; 4168 } 4169 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4170 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4171 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4172 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4173 ibdiag += 16; 4174 row += 4; 4175 break; 4176 case 5: 4177 v2 = a->a + ii[row + 1]; 4178 v3 = a->a + ii[row + 2]; 4179 v4 = a->a + ii[row + 3]; 4180 v5 = a->a + ii[row + 4]; 4181 sum1 = t[row]; 4182 sum2 = t[row + 1]; 4183 sum3 = t[row + 2]; 4184 sum4 = t[row + 3]; 4185 sum5 = t[row + 4]; 4186 for (n = 0; n < sz - 1; n += 2) { 4187 i1 = idx[0]; 4188 i2 = idx[1]; 4189 idx += 2; 4190 tmp0 = t[i1]; 4191 tmp1 = t[i2]; 4192 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4193 v1 += 2; 4194 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4195 v2 += 2; 4196 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4197 v3 += 2; 4198 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4199 v4 += 2; 4200 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4201 v5 += 2; 4202 } 4203 4204 if (n == sz - 1) { 4205 tmp0 = t[*idx]; 4206 sum1 -= v1[0] * tmp0; 4207 sum2 -= v2[0] * tmp0; 4208 sum3 -= v3[0] * tmp0; 4209 sum4 -= v4[0] * tmp0; 4210 sum5 -= v5[0] * tmp0; 4211 } 4212 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4213 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4214 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4215 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4216 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4217 ibdiag += 25; 4218 row += 5; 4219 break; 4220 default: 4221 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4222 } 4223 } 4224 PetscCall(PetscLogFlops(a->nz)); 4225 } 4226 PetscCall(VecRestoreArray(xx, &x)); 4227 PetscCall(VecRestoreArrayRead(bb, &b)); 4228 PetscFunctionReturn(PETSC_SUCCESS); 4229 } 4230 4231 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4232 { 4233 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4234 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4235 const MatScalar *bdiag = a->inode.bdiag; 4236 const PetscScalar *b; 4237 PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz; 4238 const PetscInt *sizes = a->inode.size_csr; 4239 4240 PetscFunctionBegin; 4241 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4242 PetscCall(VecGetArray(xx, &x)); 4243 PetscCall(VecGetArrayRead(bb, &b)); 4244 cnt = 0; 4245 for (i = 0, row = 0; i < m; i++) { 4246 nodesz = sizes[i + 1] - sizes[i]; 4247 switch (nodesz) { 4248 case 1: 4249 x[row] = b[row] * bdiag[cnt++]; 4250 row++; 4251 break; 4252 case 2: 4253 x1 = b[row]; 4254 x2 = b[row + 1]; 4255 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4256 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4257 x[row++] = tmp1; 4258 x[row++] = tmp2; 4259 cnt += 4; 4260 break; 4261 case 3: 4262 x1 = b[row]; 4263 x2 = b[row + 1]; 4264 x3 = b[row + 2]; 4265 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4266 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4267 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4268 x[row++] = tmp1; 4269 x[row++] = tmp2; 4270 x[row++] = tmp3; 4271 cnt += 9; 4272 break; 4273 case 4: 4274 x1 = b[row]; 4275 x2 = b[row + 1]; 4276 x3 = b[row + 2]; 4277 x4 = b[row + 3]; 4278 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4279 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4280 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4281 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4282 x[row++] = tmp1; 4283 x[row++] = tmp2; 4284 x[row++] = tmp3; 4285 x[row++] = tmp4; 4286 cnt += 16; 4287 break; 4288 case 5: 4289 x1 = b[row]; 4290 x2 = b[row + 1]; 4291 x3 = b[row + 2]; 4292 x4 = b[row + 3]; 4293 x5 = b[row + 4]; 4294 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4295 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4296 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4297 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4298 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4299 x[row++] = tmp1; 4300 x[row++] = tmp2; 4301 x[row++] = tmp3; 4302 x[row++] = tmp4; 4303 x[row++] = tmp5; 4304 cnt += 25; 4305 break; 4306 default: 4307 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4308 } 4309 } 4310 PetscCall(PetscLogFlops(2.0 * cnt)); 4311 PetscCall(VecRestoreArray(xx, &x)); 4312 PetscCall(VecRestoreArrayRead(bb, &b)); 4313 PetscFunctionReturn(PETSC_SUCCESS); 4314 } 4315 4316 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4317 { 4318 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4319 4320 PetscFunctionBegin; 4321 a->inode.node_count = 0; 4322 a->inode.use = PETSC_FALSE; 4323 a->inode.checked = PETSC_FALSE; 4324 a->inode.mat_nonzerostate = -1; 4325 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4326 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4327 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4328 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4329 A->ops->coloringpatch = NULL; 4330 A->ops->multdiagonalblock = NULL; 4331 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4332 PetscFunctionReturn(PETSC_SUCCESS); 4333 } 4334 4335 /* 4336 samestructure indicates that the matrix has not changed its nonzero structure so we 4337 do not need to recompute the inodes 4338 */ 4339 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4340 { 4341 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4342 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4343 PetscBool flag; 4344 const PetscInt *idx, *idy, *ii; 4345 4346 PetscFunctionBegin; 4347 if (!a->inode.use) { 4348 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4349 PetscCall(PetscFree(a->inode.size_csr)); 4350 PetscFunctionReturn(PETSC_SUCCESS); 4351 } 4352 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 4353 4354 m = A->rmap->n; 4355 if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr)); 4356 ns = a->inode.size_csr; 4357 ns[0] = 0; 4358 4359 i = 0; 4360 node_count = 0; 4361 idx = a->j; 4362 ii = a->i; 4363 if (idx) { 4364 while (i < m) { /* For each row */ 4365 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4366 /* Limits the number of elements in a node to 'a->inode.limit' */ 4367 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4368 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4369 if (nzy != nzx) break; 4370 idy += nzx; /* Same nonzero pattern */ 4371 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4372 if (!flag) break; 4373 } 4374 ns[node_count + 1] = ns[node_count] + blk_size; 4375 node_count++; 4376 idx += blk_size * nzx; 4377 i = j; 4378 } 4379 } 4380 /* If not enough inodes found,, do not use inode version of the routines */ 4381 if (!m || !idx || node_count > .8 * m) { 4382 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4383 PetscCall(PetscFree(a->inode.size_csr)); 4384 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4385 } else { 4386 if (!A->factortype) { 4387 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4388 if (A->rmap->n == A->cmap->n) { 4389 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4390 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4391 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4392 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4393 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4394 } 4395 } else { 4396 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4397 } 4398 a->inode.node_count = node_count; 4399 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4400 } 4401 a->inode.checked = PETSC_TRUE; 4402 a->inode.mat_nonzerostate = A->nonzerostate; 4403 PetscFunctionReturn(PETSC_SUCCESS); 4404 } 4405 4406 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4407 { 4408 Mat B = *C; 4409 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4410 PetscInt m = A->rmap->n; 4411 4412 PetscFunctionBegin; 4413 c->inode.use = a->inode.use; 4414 c->inode.limit = a->inode.limit; 4415 c->inode.max_limit = a->inode.max_limit; 4416 c->inode.checked = PETSC_FALSE; 4417 c->inode.size_csr = NULL; 4418 c->inode.node_count = 0; 4419 c->inode.ibdiag = NULL; 4420 c->inode.bdiag = NULL; 4421 c->inode.mat_nonzerostate = -1; 4422 if (a->inode.use) { 4423 if (a->inode.checked && a->inode.size_csr) { 4424 PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr)); 4425 PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1)); 4426 4427 c->inode.checked = PETSC_TRUE; 4428 c->inode.node_count = a->inode.node_count; 4429 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4430 } 4431 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4432 if (!B->factortype) { 4433 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4434 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4435 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4436 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4437 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4438 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4439 } else { 4440 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4441 } 4442 } 4443 PetscFunctionReturn(PETSC_SUCCESS); 4444 } 4445 4446 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4447 { 4448 PetscInt k; 4449 const PetscInt *vi; 4450 4451 PetscFunctionBegin; 4452 vi = aj + ai[row]; 4453 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4454 vi = aj + adiag[row]; 4455 cols[nzl] = vi[0]; 4456 vi = aj + adiag[row + 1] + 1; 4457 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4458 PetscFunctionReturn(PETSC_SUCCESS); 4459 } 4460 /* 4461 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4462 Modified from MatSeqAIJCheckInode(). 4463 4464 Input Parameters: 4465 . Mat A - ILU or LU matrix factor 4466 4467 */ 4468 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4469 { 4470 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4471 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4472 PetscInt *cols1, *cols2, *ns; 4473 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4474 PetscBool flag; 4475 4476 PetscFunctionBegin; 4477 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4478 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4479 4480 m = A->rmap->n; 4481 if (a->inode.size_csr) ns = a->inode.size_csr; 4482 else PetscCall(PetscMalloc1(m + 1, &ns)); 4483 ns[0] = 0; 4484 4485 i = 0; 4486 node_count = 0; 4487 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4488 while (i < m) { /* For each row */ 4489 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4490 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4491 nzx = nzl1 + nzu1 + 1; 4492 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4493 4494 /* Limits the number of elements in a node to 'a->inode.limit' */ 4495 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4496 nzl2 = ai[j + 1] - ai[j]; 4497 nzu2 = adiag[j] - adiag[j + 1] - 1; 4498 nzy = nzl2 + nzu2 + 1; 4499 if (nzy != nzx) break; 4500 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4501 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4502 if (!flag) break; 4503 } 4504 ns[node_count + 1] = ns[node_count] + blk_size; 4505 node_count++; 4506 i = j; 4507 } 4508 PetscCall(PetscFree2(cols1, cols2)); 4509 /* If not enough inodes found,, do not use inode version of the routines */ 4510 if (!m || node_count > .8 * m) { 4511 PetscCall(PetscFree(ns)); 4512 4513 a->inode.node_count = 0; 4514 a->inode.size_csr = NULL; 4515 a->inode.use = PETSC_FALSE; 4516 4517 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4518 } else { 4519 A->ops->mult = NULL; 4520 A->ops->sor = NULL; 4521 A->ops->multadd = NULL; 4522 A->ops->getrowij = NULL; 4523 A->ops->restorerowij = NULL; 4524 A->ops->getcolumnij = NULL; 4525 A->ops->restorecolumnij = NULL; 4526 A->ops->coloringpatch = NULL; 4527 A->ops->multdiagonalblock = NULL; 4528 a->inode.node_count = node_count; 4529 a->inode.size_csr = ns; 4530 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4531 } 4532 a->inode.checked = PETSC_TRUE; 4533 PetscFunctionReturn(PETSC_SUCCESS); 4534 } 4535 4536 /* 4537 This is really ugly. if inodes are used this replaces the 4538 permutations with ones that correspond to rows/cols of the matrix 4539 rather than inode blocks 4540 */ 4541 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4542 { 4543 PetscFunctionBegin; 4544 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4545 PetscFunctionReturn(PETSC_SUCCESS); 4546 } 4547 4548 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4549 { 4550 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4551 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4552 const PetscInt *ridx, *cidx; 4553 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx; 4554 PetscInt nslim_col, *ns_col; 4555 IS ris = *rperm, cis = *cperm; 4556 4557 PetscFunctionBegin; 4558 if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4559 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4560 4561 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4562 PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns)); 4563 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4564 4565 PetscCall(ISGetIndices(ris, &ridx)); 4566 PetscCall(ISGetIndices(cis, &cidx)); 4567 4568 /* Form the inode structure for the rows of permuted matrix using inv perm*/ 4569 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]); 4570 4571 /* Construct the permutations for rows*/ 4572 for (i = 0, row = 0; i < nslim_row; ++i) { 4573 indx = ridx[i]; 4574 start_val = tns[indx]; 4575 end_val = tns[indx + 1]; 4576 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4577 } 4578 4579 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4580 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]); 4581 4582 /* Construct permutations for columns */ 4583 for (i = 0, col = 0; i < nslim_col; ++i) { 4584 indx = cidx[i]; 4585 start_val = tns[indx]; 4586 end_val = tns[indx + 1]; 4587 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4588 } 4589 4590 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4591 PetscCall(ISSetPermutation(*rperm)); 4592 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4593 PetscCall(ISSetPermutation(*cperm)); 4594 4595 PetscCall(ISRestoreIndices(ris, &ridx)); 4596 PetscCall(ISRestoreIndices(cis, &cidx)); 4597 4598 PetscCall(PetscFree(ns_col)); 4599 PetscCall(PetscFree2(permr, permc)); 4600 PetscCall(ISDestroy(&cis)); 4601 PetscCall(ISDestroy(&ris)); 4602 PetscCall(PetscFree(tns)); 4603 PetscFunctionReturn(PETSC_SUCCESS); 4604 } 4605 4606 /*@C 4607 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4608 4609 Not Collective 4610 4611 Input Parameter: 4612 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4613 4614 Output Parameters: 4615 + node_count - no of inodes present in the matrix. 4616 . sizes - an array of size `node_count`, with the sizes of each inode. 4617 - limit - the max size used to generate the inodes. 4618 4619 Level: advanced 4620 4621 Note: 4622 It should be called after the matrix is assembled. 4623 The contents of the sizes[] array should not be changed. 4624 `NULL` may be passed for information not needed 4625 4626 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 4627 @*/ 4628 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4629 { 4630 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4631 4632 PetscFunctionBegin; 4633 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4634 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4635 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4636 PetscFunctionReturn(PETSC_SUCCESS); 4637 } 4638 4639 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4640 { 4641 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4642 4643 PetscFunctionBegin; 4644 if (node_count) *node_count = a->inode.node_count; 4645 if (sizes) *sizes = a->inode.size_csr; 4646 if (limit) *limit = a->inode.limit; 4647 PetscFunctionReturn(PETSC_SUCCESS); 4648 } 4649