1 /* 2 This file provides high performance routines for the Inode format (compressed sparse row) 3 by taking advantage of rows with identical nonzero structure (I-nodes). 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #if defined(PETSC_HAVE_XMMINTRIN_H) 7 #include <xmmintrin.h> 8 #endif 9 10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 13 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 14 15 PetscFunctionBegin; 16 n = A->cmap->n; 17 m = A->rmap->n; 18 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 19 ns_row = a->inode.size_csr; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++); 24 for (; count + 1 < n; count++, i++); 25 if (count < n) i++; 26 *size = i; 27 PetscFunctionReturn(PETSC_SUCCESS); 28 } 29 PetscCall(PetscMalloc1(n + 1, &ns_col)); 30 ns_col[0] = 0; 31 32 /* Use the same row structure wherever feasible. */ 33 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1]; 34 35 /* if m < n; pad up the remainder with inode_limit */ 36 for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1; 37 /* The last node is the odd ball. pad it up with the remaining rows; */ 38 if (count < n) { 39 ns_col[i + 1] = ns_col[i] + (n - count); 40 i++; 41 } else if (count > n) { 42 /* Adjust for the over estimation */ 43 ns_col[i] += n - count; 44 } 45 *size = i; 46 *ns = ns_col; 47 PetscFunctionReturn(PETSC_SUCCESS); 48 } 49 50 /* 51 This builds symmetric version of nonzero structure, 52 */ 53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 54 { 55 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 56 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 57 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2; 58 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 59 60 PetscFunctionBegin; 61 nslim_row = a->inode.node_count; 62 m = A->rmap->n; 63 n = A->cmap->n; 64 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 65 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 66 67 /* Use the row_inode as column_inode */ 68 nslim_col = nslim_row; 69 ns_col = ns_row; 70 71 /* allocate space for reformatted inode structure */ 72 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 73 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]); 74 75 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 76 nsz = ns_col[i1 + 1] - ns_col[i1]; 77 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 78 } 79 /* allocate space for row pointers */ 80 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 81 *iia = ia; 82 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 83 84 /* determine the number of columns in each row */ 85 ia[0] = oshift; 86 for (i1 = 0; i1 < nslim_row; i1++) { 87 row = ns_row[i1]; 88 j = aj + ai[row] + ishift; 89 jmax = aj + ai[row + 1] + ishift; 90 if (j == jmax) continue; /* empty row */ 91 col = *j++ + ishift; 92 i2 = tvc[col]; 93 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 94 ia[i1 + 1]++; 95 ia[i2 + 1]++; 96 i2++; /* Start col of next node */ 97 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 98 i2 = tvc[col]; 99 } 100 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 101 } 102 103 /* shift ia[i] to point to next row */ 104 for (i1 = 1; i1 < nslim_row + 1; i1++) { 105 row = ia[i1 - 1]; 106 ia[i1] += row; 107 work[i1 - 1] = row - oshift; 108 } 109 110 /* allocate space for column pointers */ 111 nz = ia[nslim_row] + (!ishift); 112 PetscCall(PetscMalloc1(nz, &ja)); 113 *jja = ja; 114 115 /* loop over lower triangular part putting into ja */ 116 for (i1 = 0; i1 < nslim_row; i1++) { 117 row = ns_row[i1]; 118 j = aj + ai[row] + ishift; 119 jmax = aj + ai[row + 1] + ishift; 120 if (j == jmax) continue; /* empty row */ 121 col = *j++ + ishift; 122 i2 = tvc[col]; 123 while (i2 < i1 && j < jmax) { 124 ja[work[i2]++] = i1 + oshift; 125 ja[work[i1]++] = i2 + oshift; 126 ++i2; 127 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 128 i2 = tvc[col]; 129 } 130 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 131 } 132 PetscCall(PetscFree(work)); 133 PetscCall(PetscFree2(tns, tvc)); 134 PetscFunctionReturn(PETSC_SUCCESS); 135 } 136 137 /* 138 This builds nonsymmetric version of nonzero structure, 139 */ 140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 141 { 142 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 143 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 144 PetscInt *tns, *tvc, nsz, i1, i2; 145 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr; 146 147 PetscFunctionBegin; 148 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 149 nslim_row = a->inode.node_count; 150 n = A->cmap->n; 151 152 /* Create The column_inode for this matrix */ 153 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 154 155 /* allocate space for reformatted column_inode structure */ 156 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 157 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 158 159 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 160 nsz = ns_col[i1 + 1] - ns_col[i1]; 161 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 162 } 163 /* allocate space for row pointers */ 164 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 165 *iia = ia; 166 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 167 168 /* determine the number of columns in each row */ 169 ia[0] = oshift; 170 for (i1 = 0; i1 < nslim_row; i1++) { 171 row = ns_row[i1]; 172 j = aj + ai[row] + ishift; 173 nz = ai[row + 1] - ai[row]; 174 if (!nz) continue; /* empty row */ 175 col = *j++ + ishift; 176 i2 = tvc[col]; 177 while (nz-- > 0) { /* off-diagonal elements */ 178 ia[i1 + 1]++; 179 i2++; /* Start col of next node */ 180 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 181 if (nz > 0) i2 = tvc[col]; 182 } 183 } 184 185 /* shift ia[i] to point to next row */ 186 for (i1 = 1; i1 < nslim_row + 1; i1++) { 187 row = ia[i1 - 1]; 188 ia[i1] += row; 189 work[i1 - 1] = row - oshift; 190 } 191 192 /* allocate space for column pointers */ 193 nz = ia[nslim_row] + (!ishift); 194 PetscCall(PetscMalloc1(nz, &ja)); 195 *jja = ja; 196 197 /* loop over matrix putting into ja */ 198 for (i1 = 0; i1 < nslim_row; i1++) { 199 row = ns_row[i1]; 200 j = aj + ai[row] + ishift; 201 nz = ai[row + 1] - ai[row]; 202 if (!nz) continue; /* empty row */ 203 col = *j++ + ishift; 204 i2 = tvc[col]; 205 while (nz-- > 0) { 206 ja[work[i1]++] = i2 + oshift; 207 ++i2; 208 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 209 if (nz > 0) i2 = tvc[col]; 210 } 211 } 212 PetscCall(PetscFree(ns_col)); 213 PetscCall(PetscFree(work)); 214 PetscCall(PetscFree2(tns, tvc)); 215 PetscFunctionReturn(PETSC_SUCCESS); 216 } 217 218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 219 { 220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 221 222 PetscFunctionBegin; 223 if (n) *n = a->inode.node_count; 224 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 225 if (!blockcompressed) { 226 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 227 } else if (symmetric) { 228 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 229 } else { 230 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 231 } 232 PetscFunctionReturn(PETSC_SUCCESS); 233 } 234 235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 236 { 237 PetscFunctionBegin; 238 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 239 240 if (!blockcompressed) { 241 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 242 } else { 243 PetscCall(PetscFree(*ia)); 244 PetscCall(PetscFree(*ja)); 245 } 246 PetscFunctionReturn(PETSC_SUCCESS); 247 } 248 249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 250 { 251 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 252 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 253 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j; 254 255 PetscFunctionBegin; 256 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 257 nslim_row = a->inode.node_count; 258 n = A->cmap->n; 259 260 /* Create The column_inode for this matrix */ 261 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 262 263 /* allocate space for reformatted column_inode structure */ 264 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 265 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 266 267 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 268 nsz = ns_col[i1 + 1] - ns_col[i1]; 269 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 270 } 271 /* allocate space for column pointers */ 272 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 273 *iia = ia; 274 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 275 276 /* determine the number of columns in each row */ 277 ia[0] = oshift; 278 for (i1 = 0; i1 < nslim_row; i1++) { 279 row = ns_row[i1]; 280 j = aj + ai[row] + ishift; 281 col = *j++ + ishift; 282 i2 = tvc[col]; 283 nz = ai[row + 1] - ai[row]; 284 while (nz-- > 0) { /* off-diagonal elements */ 285 /* ia[i1+1]++; */ 286 ia[i2 + 1]++; 287 i2++; 288 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 289 if (nz > 0) i2 = tvc[col]; 290 } 291 } 292 293 /* shift ia[i] to point to next col */ 294 for (i1 = 1; i1 < nslim_col + 1; i1++) { 295 col = ia[i1 - 1]; 296 ia[i1] += col; 297 work[i1 - 1] = col - oshift; 298 } 299 300 /* allocate space for column pointers */ 301 nz = ia[nslim_col] + (!ishift); 302 PetscCall(PetscMalloc1(nz, &ja)); 303 *jja = ja; 304 305 /* loop over matrix putting into ja */ 306 for (i1 = 0; i1 < nslim_row; i1++) { 307 row = ns_row[i1]; 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row + 1] - ai[row]; 312 while (nz-- > 0) { 313 /* ja[work[i1]++] = i2 + oshift; */ 314 ja[work[i2]++] = i1 + oshift; 315 i2++; 316 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 PetscCall(PetscFree(ns_col)); 321 PetscCall(PetscFree(work)); 322 PetscCall(PetscFree2(tns, tvc)); 323 PetscFunctionReturn(PETSC_SUCCESS); 324 } 325 326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 327 { 328 PetscFunctionBegin; 329 PetscCall(MatCreateColInode_Private(A, n, NULL)); 330 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 331 332 if (!blockcompressed) { 333 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 334 } else if (symmetric) { 335 /* Since the indices are symmetric it doesn't matter */ 336 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 337 } else { 338 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 339 } 340 PetscFunctionReturn(PETSC_SUCCESS); 341 } 342 343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 344 { 345 PetscFunctionBegin; 346 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 347 if (!blockcompressed) { 348 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 349 } else { 350 PetscCall(PetscFree(*ia)); 351 PetscCall(PetscFree(*ja)); 352 } 353 PetscFunctionReturn(PETSC_SUCCESS); 354 } 355 356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 357 { 358 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 359 PetscScalar *y; 360 const PetscScalar *x; 361 PetscInt row, node_max, nonzerorow = 0; 362 PetscInt *ns; 363 364 PetscFunctionBegin; 365 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 366 node_max = a->inode.node_count; 367 ns = a->inode.size_csr; /* Node Size array */ 368 PetscCall(VecGetArrayRead(xx, &x)); 369 PetscCall(VecGetArray(yy, &y)); 370 371 PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow)) 372 for (PetscInt i = 0; i < node_max; ++i) { 373 PetscInt i1, i2, nsz, n, sz; 374 const MatScalar *v1, *v2, *v3, *v4, *v5; 375 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 376 const PetscInt *idx; 377 378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 379 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 380 #endif 381 row = ns[i]; 382 nsz = ns[i + 1] - ns[i]; 383 n = a->i[row + 1] - a->i[row]; 384 nonzerorow += (n > 0) * nsz; 385 386 idx = &a->j[a->i[row]]; 387 v1 = &a->a[a->i[row]]; 388 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 389 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 390 sz = n; /* No of non zeros in this row */ 391 /* Switch on the size of Node */ 392 switch (nsz) { /* Each loop in 'case' is unrolled */ 393 case 1: 394 sum1 = 0.; 395 396 for (n = 0; n < sz - 1; n += 2) { 397 i1 = idx[0]; /* The instructions are ordered to */ 398 i2 = idx[1]; /* make the compiler's job easy */ 399 idx += 2; 400 tmp0 = x[i1]; 401 tmp1 = x[i2]; 402 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 403 v1 += 2; 404 } 405 406 if (n == sz - 1) { /* Take care of the last nonzero */ 407 tmp0 = x[*idx++]; 408 sum1 += *v1++ * tmp0; 409 } 410 y[row++] = sum1; 411 break; 412 case 2: 413 sum1 = 0.; 414 sum2 = 0.; 415 v2 = v1 + n; 416 417 for (n = 0; n < sz - 1; n += 2) { 418 i1 = idx[0]; 419 i2 = idx[1]; 420 idx += 2; 421 tmp0 = x[i1]; 422 tmp1 = x[i2]; 423 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 424 v1 += 2; 425 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 426 v2 += 2; 427 } 428 if (n == sz - 1) { 429 tmp0 = x[*idx++]; 430 sum1 += *v1++ * tmp0; 431 sum2 += *v2++ * tmp0; 432 } 433 y[row++] = sum1; 434 y[row++] = sum2; 435 v1 = v2; /* Since the next block to be processed starts there*/ 436 idx += sz; 437 break; 438 case 3: 439 sum1 = 0.; 440 sum2 = 0.; 441 sum3 = 0.; 442 v2 = v1 + n; 443 v3 = v2 + n; 444 445 for (n = 0; n < sz - 1; n += 2) { 446 i1 = idx[0]; 447 i2 = idx[1]; 448 idx += 2; 449 tmp0 = x[i1]; 450 tmp1 = x[i2]; 451 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 452 v1 += 2; 453 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 454 v2 += 2; 455 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 456 v3 += 2; 457 } 458 if (n == sz - 1) { 459 tmp0 = x[*idx++]; 460 sum1 += *v1++ * tmp0; 461 sum2 += *v2++ * tmp0; 462 sum3 += *v3++ * tmp0; 463 } 464 y[row++] = sum1; 465 y[row++] = sum2; 466 y[row++] = sum3; 467 v1 = v3; /* Since the next block to be processed starts there*/ 468 idx += 2 * sz; 469 break; 470 case 4: 471 sum1 = 0.; 472 sum2 = 0.; 473 sum3 = 0.; 474 sum4 = 0.; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 v4 = v3 + n; 478 479 for (n = 0; n < sz - 1; n += 2) { 480 i1 = idx[0]; 481 i2 = idx[1]; 482 idx += 2; 483 tmp0 = x[i1]; 484 tmp1 = x[i2]; 485 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 486 v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 488 v2 += 2; 489 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 490 v3 += 2; 491 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 492 v4 += 2; 493 } 494 if (n == sz - 1) { 495 tmp0 = x[*idx++]; 496 sum1 += *v1++ * tmp0; 497 sum2 += *v2++ * tmp0; 498 sum3 += *v3++ * tmp0; 499 sum4 += *v4++ * tmp0; 500 } 501 y[row++] = sum1; 502 y[row++] = sum2; 503 y[row++] = sum3; 504 y[row++] = sum4; 505 v1 = v4; /* Since the next block to be processed starts there*/ 506 idx += 3 * sz; 507 break; 508 case 5: 509 sum1 = 0.; 510 sum2 = 0.; 511 sum3 = 0.; 512 sum4 = 0.; 513 sum5 = 0.; 514 v2 = v1 + n; 515 v3 = v2 + n; 516 v4 = v3 + n; 517 v5 = v4 + n; 518 519 for (n = 0; n < sz - 1; n += 2) { 520 i1 = idx[0]; 521 i2 = idx[1]; 522 idx += 2; 523 tmp0 = x[i1]; 524 tmp1 = x[i2]; 525 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 526 v1 += 2; 527 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 528 v2 += 2; 529 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 530 v3 += 2; 531 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 532 v4 += 2; 533 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 534 v5 += 2; 535 } 536 if (n == sz - 1) { 537 tmp0 = x[*idx++]; 538 sum1 += *v1++ * tmp0; 539 sum2 += *v2++ * tmp0; 540 sum3 += *v3++ * tmp0; 541 sum4 += *v4++ * tmp0; 542 sum5 += *v5++ * tmp0; 543 } 544 y[row++] = sum1; 545 y[row++] = sum2; 546 y[row++] = sum3; 547 y[row++] = sum4; 548 y[row++] = sum5; 549 v1 = v5; /* Since the next block to be processed starts there */ 550 idx += 4 * sz; 551 break; 552 default: 553 SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz); 554 } 555 } 556 PetscCall(VecRestoreArrayRead(xx, &x)); 557 PetscCall(VecRestoreArray(yy, &y)); 558 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 559 PetscFunctionReturn(PETSC_SUCCESS); 560 } 561 562 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 563 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 564 { 565 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 566 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 567 const MatScalar *v1, *v2, *v3, *v4, *v5; 568 const PetscScalar *x; 569 PetscScalar *y, *z, *zt; 570 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 571 const PetscInt *idx, *ns, *ii; 572 573 PetscFunctionBegin; 574 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 575 node_max = a->inode.node_count; 576 ns = a->inode.size_csr; /* Node Size array */ 577 578 PetscCall(VecGetArrayRead(xx, &x)); 579 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 580 zt = z; 581 582 idx = a->j; 583 v1 = a->a; 584 ii = a->i; 585 586 for (i = 0; i < node_max; ++i) { 587 row = ns[i]; 588 nsz = ns[i + 1] - ns[i]; 589 n = ii[1] - ii[0]; 590 ii += nsz; 591 sz = n; /* No of non zeros in this row */ 592 /* Switch on the size of Node */ 593 switch (nsz) { /* Each loop in 'case' is unrolled */ 594 case 1: 595 sum1 = *zt++; 596 597 for (n = 0; n < sz - 1; n += 2) { 598 i1 = idx[0]; /* The instructions are ordered to */ 599 i2 = idx[1]; /* make the compiler's job easy */ 600 idx += 2; 601 tmp0 = x[i1]; 602 tmp1 = x[i2]; 603 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 604 v1 += 2; 605 } 606 607 if (n == sz - 1) { /* Take care of the last nonzero */ 608 tmp0 = x[*idx++]; 609 sum1 += *v1++ * tmp0; 610 } 611 y[row++] = sum1; 612 break; 613 case 2: 614 sum1 = *zt++; 615 sum2 = *zt++; 616 v2 = v1 + n; 617 618 for (n = 0; n < sz - 1; n += 2) { 619 i1 = idx[0]; 620 i2 = idx[1]; 621 idx += 2; 622 tmp0 = x[i1]; 623 tmp1 = x[i2]; 624 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 625 v1 += 2; 626 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 627 v2 += 2; 628 } 629 if (n == sz - 1) { 630 tmp0 = x[*idx++]; 631 sum1 += *v1++ * tmp0; 632 sum2 += *v2++ * tmp0; 633 } 634 y[row++] = sum1; 635 y[row++] = sum2; 636 v1 = v2; /* Since the next block to be processed starts there*/ 637 idx += sz; 638 break; 639 case 3: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 sum3 = *zt++; 643 v2 = v1 + n; 644 v3 = v2 + n; 645 646 for (n = 0; n < sz - 1; n += 2) { 647 i1 = idx[0]; 648 i2 = idx[1]; 649 idx += 2; 650 tmp0 = x[i1]; 651 tmp1 = x[i2]; 652 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 653 v1 += 2; 654 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 655 v2 += 2; 656 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 657 v3 += 2; 658 } 659 if (n == sz - 1) { 660 tmp0 = x[*idx++]; 661 sum1 += *v1++ * tmp0; 662 sum2 += *v2++ * tmp0; 663 sum3 += *v3++ * tmp0; 664 } 665 y[row++] = sum1; 666 y[row++] = sum2; 667 y[row++] = sum3; 668 v1 = v3; /* Since the next block to be processed starts there*/ 669 idx += 2 * sz; 670 break; 671 case 4: 672 sum1 = *zt++; 673 sum2 = *zt++; 674 sum3 = *zt++; 675 sum4 = *zt++; 676 v2 = v1 + n; 677 v3 = v2 + n; 678 v4 = v3 + n; 679 680 for (n = 0; n < sz - 1; n += 2) { 681 i1 = idx[0]; 682 i2 = idx[1]; 683 idx += 2; 684 tmp0 = x[i1]; 685 tmp1 = x[i2]; 686 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 687 v1 += 2; 688 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 689 v2 += 2; 690 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 691 v3 += 2; 692 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 693 v4 += 2; 694 } 695 if (n == sz - 1) { 696 tmp0 = x[*idx++]; 697 sum1 += *v1++ * tmp0; 698 sum2 += *v2++ * tmp0; 699 sum3 += *v3++ * tmp0; 700 sum4 += *v4++ * tmp0; 701 } 702 y[row++] = sum1; 703 y[row++] = sum2; 704 y[row++] = sum3; 705 y[row++] = sum4; 706 v1 = v4; /* Since the next block to be processed starts there*/ 707 idx += 3 * sz; 708 break; 709 case 5: 710 sum1 = *zt++; 711 sum2 = *zt++; 712 sum3 = *zt++; 713 sum4 = *zt++; 714 sum5 = *zt++; 715 v2 = v1 + n; 716 v3 = v2 + n; 717 v4 = v3 + n; 718 v5 = v4 + n; 719 720 for (n = 0; n < sz - 1; n += 2) { 721 i1 = idx[0]; 722 i2 = idx[1]; 723 idx += 2; 724 tmp0 = x[i1]; 725 tmp1 = x[i2]; 726 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 727 v1 += 2; 728 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 729 v2 += 2; 730 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 731 v3 += 2; 732 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 733 v4 += 2; 734 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 735 v5 += 2; 736 } 737 if (n == sz - 1) { 738 tmp0 = x[*idx++]; 739 sum1 += *v1++ * tmp0; 740 sum2 += *v2++ * tmp0; 741 sum3 += *v3++ * tmp0; 742 sum4 += *v4++ * tmp0; 743 sum5 += *v5++ * tmp0; 744 } 745 y[row++] = sum1; 746 y[row++] = sum2; 747 y[row++] = sum3; 748 y[row++] = sum4; 749 y[row++] = sum5; 750 v1 = v5; /* Since the next block to be processed starts there */ 751 idx += 4 * sz; 752 break; 753 default: 754 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 755 } 756 } 757 PetscCall(VecRestoreArrayRead(xx, &x)); 758 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 759 PetscCall(PetscLogFlops(2.0 * a->nz)); 760 PetscFunctionReturn(PETSC_SUCCESS); 761 } 762 763 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 764 { 765 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 766 IS iscol = a->col, isrow = a->row; 767 const PetscInt *r, *c, *rout, *cout; 768 PetscInt i, j, n = A->rmap->n, nz; 769 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 770 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 771 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 772 PetscScalar sum1, sum2, sum3, sum4, sum5; 773 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 774 const PetscScalar *b; 775 776 PetscFunctionBegin; 777 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 778 node_max = a->inode.node_count; 779 ns = a->inode.size_csr; /* Node Size array */ 780 781 PetscCall(VecGetArrayRead(bb, &b)); 782 PetscCall(VecGetArrayWrite(xx, &x)); 783 tmp = a->solve_work; 784 785 PetscCall(ISGetIndices(isrow, &rout)); 786 r = rout; 787 PetscCall(ISGetIndices(iscol, &cout)); 788 c = cout + (n - 1); 789 790 /* forward solve the lower triangular */ 791 tmps = tmp; 792 aa = a_a; 793 aj = a_j; 794 ad = a->diag; 795 796 for (i = 0, row = 0; i < node_max; ++i) { 797 row = ns[i]; 798 nsz = ns[i + 1] - ns[i]; 799 aii = ai[row]; 800 v1 = aa + aii; 801 vi = aj + aii; 802 nz = ad[row] - aii; 803 if (i < node_max - 1) { 804 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 805 * but our indexing to determine its size could. */ 806 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 807 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 808 PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 809 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 810 } 811 812 switch (nsz) { /* Each loop in 'case' is unrolled */ 813 case 1: 814 sum1 = b[*r++]; 815 for (j = 0; j < nz - 1; j += 2) { 816 i0 = vi[0]; 817 i1 = vi[1]; 818 vi += 2; 819 tmp0 = tmps[i0]; 820 tmp1 = tmps[i1]; 821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 822 v1 += 2; 823 } 824 if (j == nz - 1) { 825 tmp0 = tmps[*vi++]; 826 sum1 -= *v1++ * tmp0; 827 } 828 tmp[row++] = sum1; 829 break; 830 case 2: 831 sum1 = b[*r++]; 832 sum2 = b[*r++]; 833 v2 = aa + ai[row + 1]; 834 835 for (j = 0; j < nz - 1; j += 2) { 836 i0 = vi[0]; 837 i1 = vi[1]; 838 vi += 2; 839 tmp0 = tmps[i0]; 840 tmp1 = tmps[i1]; 841 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 842 v1 += 2; 843 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 844 v2 += 2; 845 } 846 if (j == nz - 1) { 847 tmp0 = tmps[*vi++]; 848 sum1 -= *v1++ * tmp0; 849 sum2 -= *v2++ * tmp0; 850 } 851 sum2 -= *v2++ * sum1; 852 tmp[row++] = sum1; 853 tmp[row++] = sum2; 854 break; 855 case 3: 856 sum1 = b[*r++]; 857 sum2 = b[*r++]; 858 sum3 = b[*r++]; 859 v2 = aa + ai[row + 1]; 860 v3 = aa + ai[row + 2]; 861 862 for (j = 0; j < nz - 1; j += 2) { 863 i0 = vi[0]; 864 i1 = vi[1]; 865 vi += 2; 866 tmp0 = tmps[i0]; 867 tmp1 = tmps[i1]; 868 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 869 v1 += 2; 870 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 871 v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 873 v3 += 2; 874 } 875 if (j == nz - 1) { 876 tmp0 = tmps[*vi++]; 877 sum1 -= *v1++ * tmp0; 878 sum2 -= *v2++ * tmp0; 879 sum3 -= *v3++ * tmp0; 880 } 881 sum2 -= *v2++ * sum1; 882 sum3 -= *v3++ * sum1; 883 sum3 -= *v3++ * sum2; 884 885 tmp[row++] = sum1; 886 tmp[row++] = sum2; 887 tmp[row++] = sum3; 888 break; 889 890 case 4: 891 sum1 = b[*r++]; 892 sum2 = b[*r++]; 893 sum3 = b[*r++]; 894 sum4 = b[*r++]; 895 v2 = aa + ai[row + 1]; 896 v3 = aa + ai[row + 2]; 897 v4 = aa + ai[row + 3]; 898 899 for (j = 0; j < nz - 1; j += 2) { 900 i0 = vi[0]; 901 i1 = vi[1]; 902 vi += 2; 903 tmp0 = tmps[i0]; 904 tmp1 = tmps[i1]; 905 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 906 v1 += 2; 907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 908 v2 += 2; 909 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 910 v3 += 2; 911 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 912 v4 += 2; 913 } 914 if (j == nz - 1) { 915 tmp0 = tmps[*vi++]; 916 sum1 -= *v1++ * tmp0; 917 sum2 -= *v2++ * tmp0; 918 sum3 -= *v3++ * tmp0; 919 sum4 -= *v4++ * tmp0; 920 } 921 sum2 -= *v2++ * sum1; 922 sum3 -= *v3++ * sum1; 923 sum4 -= *v4++ * sum1; 924 sum3 -= *v3++ * sum2; 925 sum4 -= *v4++ * sum2; 926 sum4 -= *v4++ * sum3; 927 928 tmp[row++] = sum1; 929 tmp[row++] = sum2; 930 tmp[row++] = sum3; 931 tmp[row++] = sum4; 932 break; 933 case 5: 934 sum1 = b[*r++]; 935 sum2 = b[*r++]; 936 sum3 = b[*r++]; 937 sum4 = b[*r++]; 938 sum5 = b[*r++]; 939 v2 = aa + ai[row + 1]; 940 v3 = aa + ai[row + 2]; 941 v4 = aa + ai[row + 3]; 942 v5 = aa + ai[row + 4]; 943 944 for (j = 0; j < nz - 1; j += 2) { 945 i0 = vi[0]; 946 i1 = vi[1]; 947 vi += 2; 948 tmp0 = tmps[i0]; 949 tmp1 = tmps[i1]; 950 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 951 v1 += 2; 952 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 953 v2 += 2; 954 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 955 v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 957 v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 959 v5 += 2; 960 } 961 if (j == nz - 1) { 962 tmp0 = tmps[*vi++]; 963 sum1 -= *v1++ * tmp0; 964 sum2 -= *v2++ * tmp0; 965 sum3 -= *v3++ * tmp0; 966 sum4 -= *v4++ * tmp0; 967 sum5 -= *v5++ * tmp0; 968 } 969 970 sum2 -= *v2++ * sum1; 971 sum3 -= *v3++ * sum1; 972 sum4 -= *v4++ * sum1; 973 sum5 -= *v5++ * sum1; 974 sum3 -= *v3++ * sum2; 975 sum4 -= *v4++ * sum2; 976 sum5 -= *v5++ * sum2; 977 sum4 -= *v4++ * sum3; 978 sum5 -= *v5++ * sum3; 979 sum5 -= *v5++ * sum4; 980 981 tmp[row++] = sum1; 982 tmp[row++] = sum2; 983 tmp[row++] = sum3; 984 tmp[row++] = sum4; 985 tmp[row++] = sum5; 986 break; 987 default: 988 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 989 } 990 } 991 /* backward solve the upper triangular */ 992 for (i = node_max - 1; i >= 0; i--) { 993 row = ns[i + 1]; 994 nsz = ns[i + 1] - ns[i]; 995 aii = ai[row + 1] - 1; 996 v1 = aa + aii; 997 vi = aj + aii; 998 nz = aii - ad[row]; 999 switch (nsz) { /* Each loop in 'case' is unrolled */ 1000 case 1: 1001 sum1 = tmp[row]; 1002 1003 for (j = nz; j > 1; j -= 2) { 1004 vi -= 2; 1005 i0 = vi[2]; 1006 i1 = vi[1]; 1007 tmp0 = tmps[i0]; 1008 tmp1 = tmps[i1]; 1009 v1 -= 2; 1010 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1011 } 1012 if (j == 1) { 1013 tmp0 = tmps[*vi--]; 1014 sum1 -= *v1-- * tmp0; 1015 } 1016 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1017 row--; 1018 break; 1019 case 2: 1020 sum1 = tmp[row]; 1021 sum2 = tmp[row - 1]; 1022 v2 = aa + ai[row] - 1; 1023 for (j = nz; j > 1; j -= 2) { 1024 vi -= 2; 1025 i0 = vi[2]; 1026 i1 = vi[1]; 1027 tmp0 = tmps[i0]; 1028 tmp1 = tmps[i1]; 1029 v1 -= 2; 1030 v2 -= 2; 1031 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1032 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1033 } 1034 if (j == 1) { 1035 tmp0 = tmps[*vi--]; 1036 sum1 -= *v1-- * tmp0; 1037 sum2 -= *v2-- * tmp0; 1038 } 1039 1040 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1041 row--; 1042 sum2 -= *v2-- * tmp0; 1043 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1044 row--; 1045 break; 1046 case 3: 1047 sum1 = tmp[row]; 1048 sum2 = tmp[row - 1]; 1049 sum3 = tmp[row - 2]; 1050 v2 = aa + ai[row] - 1; 1051 v3 = aa + ai[row - 1] - 1; 1052 for (j = nz; j > 1; j -= 2) { 1053 vi -= 2; 1054 i0 = vi[2]; 1055 i1 = vi[1]; 1056 tmp0 = tmps[i0]; 1057 tmp1 = tmps[i1]; 1058 v1 -= 2; 1059 v2 -= 2; 1060 v3 -= 2; 1061 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1062 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1063 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1064 } 1065 if (j == 1) { 1066 tmp0 = tmps[*vi--]; 1067 sum1 -= *v1-- * tmp0; 1068 sum2 -= *v2-- * tmp0; 1069 sum3 -= *v3-- * tmp0; 1070 } 1071 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1072 row--; 1073 sum2 -= *v2-- * tmp0; 1074 sum3 -= *v3-- * tmp0; 1075 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1076 row--; 1077 sum3 -= *v3-- * tmp0; 1078 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1079 row--; 1080 1081 break; 1082 case 4: 1083 sum1 = tmp[row]; 1084 sum2 = tmp[row - 1]; 1085 sum3 = tmp[row - 2]; 1086 sum4 = tmp[row - 3]; 1087 v2 = aa + ai[row] - 1; 1088 v3 = aa + ai[row - 1] - 1; 1089 v4 = aa + ai[row - 2] - 1; 1090 1091 for (j = nz; j > 1; j -= 2) { 1092 vi -= 2; 1093 i0 = vi[2]; 1094 i1 = vi[1]; 1095 tmp0 = tmps[i0]; 1096 tmp1 = tmps[i1]; 1097 v1 -= 2; 1098 v2 -= 2; 1099 v3 -= 2; 1100 v4 -= 2; 1101 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1102 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1103 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1104 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1105 } 1106 if (j == 1) { 1107 tmp0 = tmps[*vi--]; 1108 sum1 -= *v1-- * tmp0; 1109 sum2 -= *v2-- * tmp0; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 } 1113 1114 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1115 row--; 1116 sum2 -= *v2-- * tmp0; 1117 sum3 -= *v3-- * tmp0; 1118 sum4 -= *v4-- * tmp0; 1119 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1120 row--; 1121 sum3 -= *v3-- * tmp0; 1122 sum4 -= *v4-- * tmp0; 1123 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1124 row--; 1125 sum4 -= *v4-- * tmp0; 1126 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1127 row--; 1128 break; 1129 case 5: 1130 sum1 = tmp[row]; 1131 sum2 = tmp[row - 1]; 1132 sum3 = tmp[row - 2]; 1133 sum4 = tmp[row - 3]; 1134 sum5 = tmp[row - 4]; 1135 v2 = aa + ai[row] - 1; 1136 v3 = aa + ai[row - 1] - 1; 1137 v4 = aa + ai[row - 2] - 1; 1138 v5 = aa + ai[row - 3] - 1; 1139 for (j = nz; j > 1; j -= 2) { 1140 vi -= 2; 1141 i0 = vi[2]; 1142 i1 = vi[1]; 1143 tmp0 = tmps[i0]; 1144 tmp1 = tmps[i1]; 1145 v1 -= 2; 1146 v2 -= 2; 1147 v3 -= 2; 1148 v4 -= 2; 1149 v5 -= 2; 1150 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1151 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1152 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1153 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1154 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1155 } 1156 if (j == 1) { 1157 tmp0 = tmps[*vi--]; 1158 sum1 -= *v1-- * tmp0; 1159 sum2 -= *v2-- * tmp0; 1160 sum3 -= *v3-- * tmp0; 1161 sum4 -= *v4-- * tmp0; 1162 sum5 -= *v5-- * tmp0; 1163 } 1164 1165 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1166 row--; 1167 sum2 -= *v2-- * tmp0; 1168 sum3 -= *v3-- * tmp0; 1169 sum4 -= *v4-- * tmp0; 1170 sum5 -= *v5-- * tmp0; 1171 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1172 row--; 1173 sum3 -= *v3-- * tmp0; 1174 sum4 -= *v4-- * tmp0; 1175 sum5 -= *v5-- * tmp0; 1176 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1177 row--; 1178 sum4 -= *v4-- * tmp0; 1179 sum5 -= *v5-- * tmp0; 1180 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1181 row--; 1182 sum5 -= *v5-- * tmp0; 1183 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1184 row--; 1185 break; 1186 default: 1187 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1188 } 1189 } 1190 PetscCall(ISRestoreIndices(isrow, &rout)); 1191 PetscCall(ISRestoreIndices(iscol, &cout)); 1192 PetscCall(VecRestoreArrayRead(bb, &b)); 1193 PetscCall(VecRestoreArrayWrite(xx, &x)); 1194 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1195 PetscFunctionReturn(PETSC_SUCCESS); 1196 } 1197 1198 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1199 { 1200 Mat C = B; 1201 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1202 IS isrow = b->row, isicol = b->icol; 1203 const PetscInt *r, *ic, *ics; 1204 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1205 PetscInt i, j, k, nz, nzL, row, *pj; 1206 const PetscInt *ajtmp, *bjtmp; 1207 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1208 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1209 FactorShiftCtx sctx; 1210 const PetscInt *ddiag; 1211 PetscReal rs; 1212 MatScalar d; 1213 PetscInt inod, nodesz, node_max, col; 1214 const PetscInt *ns; 1215 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1216 1217 PetscFunctionBegin; 1218 /* MatPivotSetUp(): initialize shift context sctx */ 1219 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1220 1221 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1222 ddiag = a->diag; 1223 sctx.shift_top = info->zeropivot; 1224 for (i = 0; i < n; i++) { 1225 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1226 d = (aa)[ddiag[i]]; 1227 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1228 v = aa + ai[i]; 1229 nz = ai[i + 1] - ai[i]; 1230 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1231 if (rs > sctx.shift_top) sctx.shift_top = rs; 1232 } 1233 sctx.shift_top *= 1.1; 1234 sctx.nshift_max = 5; 1235 sctx.shift_lo = 0.; 1236 sctx.shift_hi = 1.; 1237 } 1238 1239 PetscCall(ISGetIndices(isrow, &r)); 1240 PetscCall(ISGetIndices(isicol, &ic)); 1241 1242 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1243 ics = ic; 1244 1245 node_max = a->inode.node_count; 1246 ns = a->inode.size_csr; 1247 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1248 1249 /* If max inode size > 4, split it into two inodes.*/ 1250 /* also map the inode sizes according to the ordering */ 1251 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1252 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1253 nodesz = ns[i + 1] - ns[i]; 1254 if (nodesz > 4) { 1255 tmp_vec1[j] = 4; 1256 ++j; 1257 tmp_vec1[j] = nodesz - tmp_vec1[j - 1]; 1258 } else { 1259 tmp_vec1[j] = nodesz; 1260 } 1261 } 1262 /* Use the correct node_max */ 1263 node_max = j; 1264 1265 /* Now reorder the inode info based on mat re-ordering info */ 1266 /* First create a row -> inode_size_array_index map */ 1267 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1268 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1269 tmp_vec2[0] = 0; 1270 for (i = 0, row = 0; i < node_max; i++) { 1271 nodesz = tmp_vec1[i]; 1272 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1273 } 1274 /* Using nsmap, create a reordered ns structure */ 1275 for (i = 0, j = 0; i < node_max; i++) { 1276 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1277 tmp_vec2[i + 1] = tmp_vec2[i] + nodesz; 1278 j += nodesz; 1279 } 1280 PetscCall(PetscFree(nsmap)); 1281 PetscCall(PetscFree(tmp_vec1)); 1282 1283 /* Now use the correct ns */ 1284 ns = tmp_vec2; 1285 1286 do { 1287 sctx.newshift = PETSC_FALSE; 1288 /* Now loop over each block-row, and do the factorization */ 1289 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1290 nodesz = ns[inod + 1] - ns[inod]; 1291 1292 switch (nodesz) { 1293 case 1: 1294 /* zero rtmp1 */ 1295 /* L part */ 1296 nz = bi[i + 1] - bi[i]; 1297 bjtmp = bj + bi[i]; 1298 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1299 1300 /* U part */ 1301 nz = bdiag[i] - bdiag[i + 1]; 1302 bjtmp = bj + bdiag[i + 1] + 1; 1303 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1304 1305 /* load in initial (unfactored row) */ 1306 nz = ai[r[i] + 1] - ai[r[i]]; 1307 ajtmp = aj + ai[r[i]]; 1308 v = aa + ai[r[i]]; 1309 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1310 1311 /* ZeropivotApply() */ 1312 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1313 1314 /* elimination */ 1315 bjtmp = bj + bi[i]; 1316 row = *bjtmp++; 1317 nzL = bi[i + 1] - bi[i]; 1318 for (k = 0; k < nzL; k++) { 1319 pc = rtmp1 + row; 1320 if (*pc != 0.0) { 1321 pv = b->a + bdiag[row]; 1322 mul1 = *pc * (*pv); 1323 *pc = mul1; 1324 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1325 pv = b->a + bdiag[row + 1] + 1; 1326 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1327 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1328 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1329 } 1330 row = *bjtmp++; 1331 } 1332 1333 /* finished row so stick it into b->a */ 1334 rs = 0.0; 1335 /* L part */ 1336 pv = b->a + bi[i]; 1337 pj = b->j + bi[i]; 1338 nz = bi[i + 1] - bi[i]; 1339 for (j = 0; j < nz; j++) { 1340 pv[j] = rtmp1[pj[j]]; 1341 rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* U part */ 1345 pv = b->a + bdiag[i + 1] + 1; 1346 pj = b->j + bdiag[i + 1] + 1; 1347 nz = bdiag[i] - bdiag[i + 1] - 1; 1348 for (j = 0; j < nz; j++) { 1349 pv[j] = rtmp1[pj[j]]; 1350 rs += PetscAbsScalar(pv[j]); 1351 } 1352 1353 /* Check zero pivot */ 1354 sctx.rs = rs; 1355 sctx.pv = rtmp1[i]; 1356 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1357 if (sctx.newshift) break; 1358 1359 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1360 pv = b->a + bdiag[i]; 1361 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1362 break; 1363 1364 case 2: 1365 /* zero rtmp1 and rtmp2 */ 1366 /* L part */ 1367 nz = bi[i + 1] - bi[i]; 1368 bjtmp = bj + bi[i]; 1369 for (j = 0; j < nz; j++) { 1370 col = bjtmp[j]; 1371 rtmp1[col] = 0.0; 1372 rtmp2[col] = 0.0; 1373 } 1374 1375 /* U part */ 1376 nz = bdiag[i] - bdiag[i + 1]; 1377 bjtmp = bj + bdiag[i + 1] + 1; 1378 for (j = 0; j < nz; j++) { 1379 col = bjtmp[j]; 1380 rtmp1[col] = 0.0; 1381 rtmp2[col] = 0.0; 1382 } 1383 1384 /* load in initial (unfactored row) */ 1385 nz = ai[r[i] + 1] - ai[r[i]]; 1386 ajtmp = aj + ai[r[i]]; 1387 v1 = aa + ai[r[i]]; 1388 v2 = aa + ai[r[i] + 1]; 1389 for (j = 0; j < nz; j++) { 1390 col = ics[ajtmp[j]]; 1391 rtmp1[col] = v1[j]; 1392 rtmp2[col] = v2[j]; 1393 } 1394 /* ZeropivotApply(): shift the diagonal of the matrix */ 1395 rtmp1[i] += sctx.shift_amount; 1396 rtmp2[i + 1] += sctx.shift_amount; 1397 1398 /* elimination */ 1399 bjtmp = bj + bi[i]; 1400 row = *bjtmp++; /* pivot row */ 1401 nzL = bi[i + 1] - bi[i]; 1402 for (k = 0; k < nzL; k++) { 1403 pc1 = rtmp1 + row; 1404 pc2 = rtmp2 + row; 1405 if (*pc1 != 0.0 || *pc2 != 0.0) { 1406 pv = b->a + bdiag[row]; 1407 mul1 = *pc1 * (*pv); 1408 mul2 = *pc2 * (*pv); 1409 *pc1 = mul1; 1410 *pc2 = mul2; 1411 1412 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1413 pv = b->a + bdiag[row + 1] + 1; 1414 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1415 for (j = 0; j < nz; j++) { 1416 col = pj[j]; 1417 rtmp1[col] -= mul1 * pv[j]; 1418 rtmp2[col] -= mul2 * pv[j]; 1419 } 1420 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1421 } 1422 row = *bjtmp++; 1423 } 1424 1425 /* finished row i; check zero pivot, then stick row i into b->a */ 1426 rs = 0.0; 1427 /* L part */ 1428 pc1 = b->a + bi[i]; 1429 pj = b->j + bi[i]; 1430 nz = bi[i + 1] - bi[i]; 1431 for (j = 0; j < nz; j++) { 1432 col = pj[j]; 1433 pc1[j] = rtmp1[col]; 1434 rs += PetscAbsScalar(pc1[j]); 1435 } 1436 /* U part */ 1437 pc1 = b->a + bdiag[i + 1] + 1; 1438 pj = b->j + bdiag[i + 1] + 1; 1439 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1440 for (j = 0; j < nz; j++) { 1441 col = pj[j]; 1442 pc1[j] = rtmp1[col]; 1443 rs += PetscAbsScalar(pc1[j]); 1444 } 1445 1446 sctx.rs = rs; 1447 sctx.pv = rtmp1[i]; 1448 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1449 if (sctx.newshift) break; 1450 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1451 *pc1 = 1.0 / sctx.pv; 1452 1453 /* Now take care of diagonal 2x2 block. */ 1454 pc2 = rtmp2 + i; 1455 if (*pc2 != 0.0) { 1456 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1457 *pc2 = mul1; /* insert L entry */ 1458 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1459 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1460 for (j = 0; j < nz; j++) { 1461 col = pj[j]; 1462 rtmp2[col] -= mul1 * rtmp1[col]; 1463 } 1464 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1465 } 1466 1467 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1468 rs = 0.0; 1469 /* L part */ 1470 pc2 = b->a + bi[i + 1]; 1471 pj = b->j + bi[i + 1]; 1472 nz = bi[i + 2] - bi[i + 1]; 1473 for (j = 0; j < nz; j++) { 1474 col = pj[j]; 1475 pc2[j] = rtmp2[col]; 1476 rs += PetscAbsScalar(pc2[j]); 1477 } 1478 /* U part */ 1479 pc2 = b->a + bdiag[i + 2] + 1; 1480 pj = b->j + bdiag[i + 2] + 1; 1481 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1482 for (j = 0; j < nz; j++) { 1483 col = pj[j]; 1484 pc2[j] = rtmp2[col]; 1485 rs += PetscAbsScalar(pc2[j]); 1486 } 1487 1488 sctx.rs = rs; 1489 sctx.pv = rtmp2[i + 1]; 1490 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1491 if (sctx.newshift) break; 1492 pc2 = b->a + bdiag[i + 1]; 1493 *pc2 = 1.0 / sctx.pv; 1494 break; 1495 1496 case 3: 1497 /* zero rtmp */ 1498 /* L part */ 1499 nz = bi[i + 1] - bi[i]; 1500 bjtmp = bj + bi[i]; 1501 for (j = 0; j < nz; j++) { 1502 col = bjtmp[j]; 1503 rtmp1[col] = 0.0; 1504 rtmp2[col] = 0.0; 1505 rtmp3[col] = 0.0; 1506 } 1507 1508 /* U part */ 1509 nz = bdiag[i] - bdiag[i + 1]; 1510 bjtmp = bj + bdiag[i + 1] + 1; 1511 for (j = 0; j < nz; j++) { 1512 col = bjtmp[j]; 1513 rtmp1[col] = 0.0; 1514 rtmp2[col] = 0.0; 1515 rtmp3[col] = 0.0; 1516 } 1517 1518 /* load in initial (unfactored row) */ 1519 nz = ai[r[i] + 1] - ai[r[i]]; 1520 ajtmp = aj + ai[r[i]]; 1521 v1 = aa + ai[r[i]]; 1522 v2 = aa + ai[r[i] + 1]; 1523 v3 = aa + ai[r[i] + 2]; 1524 for (j = 0; j < nz; j++) { 1525 col = ics[ajtmp[j]]; 1526 rtmp1[col] = v1[j]; 1527 rtmp2[col] = v2[j]; 1528 rtmp3[col] = v3[j]; 1529 } 1530 /* ZeropivotApply(): shift the diagonal of the matrix */ 1531 rtmp1[i] += sctx.shift_amount; 1532 rtmp2[i + 1] += sctx.shift_amount; 1533 rtmp3[i + 2] += sctx.shift_amount; 1534 1535 /* elimination */ 1536 bjtmp = bj + bi[i]; 1537 row = *bjtmp++; /* pivot row */ 1538 nzL = bi[i + 1] - bi[i]; 1539 for (k = 0; k < nzL; k++) { 1540 pc1 = rtmp1 + row; 1541 pc2 = rtmp2 + row; 1542 pc3 = rtmp3 + row; 1543 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1544 pv = b->a + bdiag[row]; 1545 mul1 = *pc1 * (*pv); 1546 mul2 = *pc2 * (*pv); 1547 mul3 = *pc3 * (*pv); 1548 *pc1 = mul1; 1549 *pc2 = mul2; 1550 *pc3 = mul3; 1551 1552 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1553 pv = b->a + bdiag[row + 1] + 1; 1554 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1555 for (j = 0; j < nz; j++) { 1556 col = pj[j]; 1557 rtmp1[col] -= mul1 * pv[j]; 1558 rtmp2[col] -= mul2 * pv[j]; 1559 rtmp3[col] -= mul3 * pv[j]; 1560 } 1561 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1562 } 1563 row = *bjtmp++; 1564 } 1565 1566 /* finished row i; check zero pivot, then stick row i into b->a */ 1567 rs = 0.0; 1568 /* L part */ 1569 pc1 = b->a + bi[i]; 1570 pj = b->j + bi[i]; 1571 nz = bi[i + 1] - bi[i]; 1572 for (j = 0; j < nz; j++) { 1573 col = pj[j]; 1574 pc1[j] = rtmp1[col]; 1575 rs += PetscAbsScalar(pc1[j]); 1576 } 1577 /* U part */ 1578 pc1 = b->a + bdiag[i + 1] + 1; 1579 pj = b->j + bdiag[i + 1] + 1; 1580 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1581 for (j = 0; j < nz; j++) { 1582 col = pj[j]; 1583 pc1[j] = rtmp1[col]; 1584 rs += PetscAbsScalar(pc1[j]); 1585 } 1586 1587 sctx.rs = rs; 1588 sctx.pv = rtmp1[i]; 1589 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1590 if (sctx.newshift) break; 1591 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1592 *pc1 = 1.0 / sctx.pv; 1593 1594 /* Now take care of 1st column of diagonal 3x3 block. */ 1595 pc2 = rtmp2 + i; 1596 pc3 = rtmp3 + i; 1597 if (*pc2 != 0.0 || *pc3 != 0.0) { 1598 mul2 = (*pc2) * (*pc1); 1599 *pc2 = mul2; 1600 mul3 = (*pc3) * (*pc1); 1601 *pc3 = mul3; 1602 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1603 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1604 for (j = 0; j < nz; j++) { 1605 col = pj[j]; 1606 rtmp2[col] -= mul2 * rtmp1[col]; 1607 rtmp3[col] -= mul3 * rtmp1[col]; 1608 } 1609 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1610 } 1611 1612 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1613 rs = 0.0; 1614 /* L part */ 1615 pc2 = b->a + bi[i + 1]; 1616 pj = b->j + bi[i + 1]; 1617 nz = bi[i + 2] - bi[i + 1]; 1618 for (j = 0; j < nz; j++) { 1619 col = pj[j]; 1620 pc2[j] = rtmp2[col]; 1621 rs += PetscAbsScalar(pc2[j]); 1622 } 1623 /* U part */ 1624 pc2 = b->a + bdiag[i + 2] + 1; 1625 pj = b->j + bdiag[i + 2] + 1; 1626 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1627 for (j = 0; j < nz; j++) { 1628 col = pj[j]; 1629 pc2[j] = rtmp2[col]; 1630 rs += PetscAbsScalar(pc2[j]); 1631 } 1632 1633 sctx.rs = rs; 1634 sctx.pv = rtmp2[i + 1]; 1635 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1636 if (sctx.newshift) break; 1637 pc2 = b->a + bdiag[i + 1]; 1638 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1639 1640 /* Now take care of 2nd column of diagonal 3x3 block. */ 1641 pc3 = rtmp3 + i + 1; 1642 if (*pc3 != 0.0) { 1643 mul3 = (*pc3) * (*pc2); 1644 *pc3 = mul3; 1645 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1646 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1647 for (j = 0; j < nz; j++) { 1648 col = pj[j]; 1649 rtmp3[col] -= mul3 * rtmp2[col]; 1650 } 1651 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1652 } 1653 1654 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1655 rs = 0.0; 1656 /* L part */ 1657 pc3 = b->a + bi[i + 2]; 1658 pj = b->j + bi[i + 2]; 1659 nz = bi[i + 3] - bi[i + 2]; 1660 for (j = 0; j < nz; j++) { 1661 col = pj[j]; 1662 pc3[j] = rtmp3[col]; 1663 rs += PetscAbsScalar(pc3[j]); 1664 } 1665 /* U part */ 1666 pc3 = b->a + bdiag[i + 3] + 1; 1667 pj = b->j + bdiag[i + 3] + 1; 1668 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1669 for (j = 0; j < nz; j++) { 1670 col = pj[j]; 1671 pc3[j] = rtmp3[col]; 1672 rs += PetscAbsScalar(pc3[j]); 1673 } 1674 1675 sctx.rs = rs; 1676 sctx.pv = rtmp3[i + 2]; 1677 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1678 if (sctx.newshift) break; 1679 pc3 = b->a + bdiag[i + 2]; 1680 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1681 break; 1682 case 4: 1683 /* zero rtmp */ 1684 /* L part */ 1685 nz = bi[i + 1] - bi[i]; 1686 bjtmp = bj + bi[i]; 1687 for (j = 0; j < nz; j++) { 1688 col = bjtmp[j]; 1689 rtmp1[col] = 0.0; 1690 rtmp2[col] = 0.0; 1691 rtmp3[col] = 0.0; 1692 rtmp4[col] = 0.0; 1693 } 1694 1695 /* U part */ 1696 nz = bdiag[i] - bdiag[i + 1]; 1697 bjtmp = bj + bdiag[i + 1] + 1; 1698 for (j = 0; j < nz; j++) { 1699 col = bjtmp[j]; 1700 rtmp1[col] = 0.0; 1701 rtmp2[col] = 0.0; 1702 rtmp3[col] = 0.0; 1703 rtmp4[col] = 0.0; 1704 } 1705 1706 /* load in initial (unfactored row) */ 1707 nz = ai[r[i] + 1] - ai[r[i]]; 1708 ajtmp = aj + ai[r[i]]; 1709 v1 = aa + ai[r[i]]; 1710 v2 = aa + ai[r[i] + 1]; 1711 v3 = aa + ai[r[i] + 2]; 1712 v4 = aa + ai[r[i] + 3]; 1713 for (j = 0; j < nz; j++) { 1714 col = ics[ajtmp[j]]; 1715 rtmp1[col] = v1[j]; 1716 rtmp2[col] = v2[j]; 1717 rtmp3[col] = v3[j]; 1718 rtmp4[col] = v4[j]; 1719 } 1720 /* ZeropivotApply(): shift the diagonal of the matrix */ 1721 rtmp1[i] += sctx.shift_amount; 1722 rtmp2[i + 1] += sctx.shift_amount; 1723 rtmp3[i + 2] += sctx.shift_amount; 1724 rtmp4[i + 3] += sctx.shift_amount; 1725 1726 /* elimination */ 1727 bjtmp = bj + bi[i]; 1728 row = *bjtmp++; /* pivot row */ 1729 nzL = bi[i + 1] - bi[i]; 1730 for (k = 0; k < nzL; k++) { 1731 pc1 = rtmp1 + row; 1732 pc2 = rtmp2 + row; 1733 pc3 = rtmp3 + row; 1734 pc4 = rtmp4 + row; 1735 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1736 pv = b->a + bdiag[row]; 1737 mul1 = *pc1 * (*pv); 1738 mul2 = *pc2 * (*pv); 1739 mul3 = *pc3 * (*pv); 1740 mul4 = *pc4 * (*pv); 1741 *pc1 = mul1; 1742 *pc2 = mul2; 1743 *pc3 = mul3; 1744 *pc4 = mul4; 1745 1746 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1747 pv = b->a + bdiag[row + 1] + 1; 1748 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1749 for (j = 0; j < nz; j++) { 1750 col = pj[j]; 1751 rtmp1[col] -= mul1 * pv[j]; 1752 rtmp2[col] -= mul2 * pv[j]; 1753 rtmp3[col] -= mul3 * pv[j]; 1754 rtmp4[col] -= mul4 * pv[j]; 1755 } 1756 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1757 } 1758 row = *bjtmp++; 1759 } 1760 1761 /* finished row i; check zero pivot, then stick row i into b->a */ 1762 rs = 0.0; 1763 /* L part */ 1764 pc1 = b->a + bi[i]; 1765 pj = b->j + bi[i]; 1766 nz = bi[i + 1] - bi[i]; 1767 for (j = 0; j < nz; j++) { 1768 col = pj[j]; 1769 pc1[j] = rtmp1[col]; 1770 rs += PetscAbsScalar(pc1[j]); 1771 } 1772 /* U part */ 1773 pc1 = b->a + bdiag[i + 1] + 1; 1774 pj = b->j + bdiag[i + 1] + 1; 1775 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1776 for (j = 0; j < nz; j++) { 1777 col = pj[j]; 1778 pc1[j] = rtmp1[col]; 1779 rs += PetscAbsScalar(pc1[j]); 1780 } 1781 1782 sctx.rs = rs; 1783 sctx.pv = rtmp1[i]; 1784 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1785 if (sctx.newshift) break; 1786 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1787 *pc1 = 1.0 / sctx.pv; 1788 1789 /* Now take care of 1st column of diagonal 4x4 block. */ 1790 pc2 = rtmp2 + i; 1791 pc3 = rtmp3 + i; 1792 pc4 = rtmp4 + i; 1793 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1794 mul2 = (*pc2) * (*pc1); 1795 *pc2 = mul2; 1796 mul3 = (*pc3) * (*pc1); 1797 *pc3 = mul3; 1798 mul4 = (*pc4) * (*pc1); 1799 *pc4 = mul4; 1800 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1801 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1802 for (j = 0; j < nz; j++) { 1803 col = pj[j]; 1804 rtmp2[col] -= mul2 * rtmp1[col]; 1805 rtmp3[col] -= mul3 * rtmp1[col]; 1806 rtmp4[col] -= mul4 * rtmp1[col]; 1807 } 1808 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1809 } 1810 1811 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1812 rs = 0.0; 1813 /* L part */ 1814 pc2 = b->a + bi[i + 1]; 1815 pj = b->j + bi[i + 1]; 1816 nz = bi[i + 2] - bi[i + 1]; 1817 for (j = 0; j < nz; j++) { 1818 col = pj[j]; 1819 pc2[j] = rtmp2[col]; 1820 rs += PetscAbsScalar(pc2[j]); 1821 } 1822 /* U part */ 1823 pc2 = b->a + bdiag[i + 2] + 1; 1824 pj = b->j + bdiag[i + 2] + 1; 1825 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1826 for (j = 0; j < nz; j++) { 1827 col = pj[j]; 1828 pc2[j] = rtmp2[col]; 1829 rs += PetscAbsScalar(pc2[j]); 1830 } 1831 1832 sctx.rs = rs; 1833 sctx.pv = rtmp2[i + 1]; 1834 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1835 if (sctx.newshift) break; 1836 pc2 = b->a + bdiag[i + 1]; 1837 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1838 1839 /* Now take care of 2nd column of diagonal 4x4 block. */ 1840 pc3 = rtmp3 + i + 1; 1841 pc4 = rtmp4 + i + 1; 1842 if (*pc3 != 0.0 || *pc4 != 0.0) { 1843 mul3 = (*pc3) * (*pc2); 1844 *pc3 = mul3; 1845 mul4 = (*pc4) * (*pc2); 1846 *pc4 = mul4; 1847 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1848 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1849 for (j = 0; j < nz; j++) { 1850 col = pj[j]; 1851 rtmp3[col] -= mul3 * rtmp2[col]; 1852 rtmp4[col] -= mul4 * rtmp2[col]; 1853 } 1854 PetscCall(PetscLogFlops(4.0 * nz)); 1855 } 1856 1857 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1858 rs = 0.0; 1859 /* L part */ 1860 pc3 = b->a + bi[i + 2]; 1861 pj = b->j + bi[i + 2]; 1862 nz = bi[i + 3] - bi[i + 2]; 1863 for (j = 0; j < nz; j++) { 1864 col = pj[j]; 1865 pc3[j] = rtmp3[col]; 1866 rs += PetscAbsScalar(pc3[j]); 1867 } 1868 /* U part */ 1869 pc3 = b->a + bdiag[i + 3] + 1; 1870 pj = b->j + bdiag[i + 3] + 1; 1871 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1872 for (j = 0; j < nz; j++) { 1873 col = pj[j]; 1874 pc3[j] = rtmp3[col]; 1875 rs += PetscAbsScalar(pc3[j]); 1876 } 1877 1878 sctx.rs = rs; 1879 sctx.pv = rtmp3[i + 2]; 1880 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1881 if (sctx.newshift) break; 1882 pc3 = b->a + bdiag[i + 2]; 1883 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1884 1885 /* Now take care of 3rd column of diagonal 4x4 block. */ 1886 pc4 = rtmp4 + i + 2; 1887 if (*pc4 != 0.0) { 1888 mul4 = (*pc4) * (*pc3); 1889 *pc4 = mul4; 1890 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1891 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1892 for (j = 0; j < nz; j++) { 1893 col = pj[j]; 1894 rtmp4[col] -= mul4 * rtmp3[col]; 1895 } 1896 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1897 } 1898 1899 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1900 rs = 0.0; 1901 /* L part */ 1902 pc4 = b->a + bi[i + 3]; 1903 pj = b->j + bi[i + 3]; 1904 nz = bi[i + 4] - bi[i + 3]; 1905 for (j = 0; j < nz; j++) { 1906 col = pj[j]; 1907 pc4[j] = rtmp4[col]; 1908 rs += PetscAbsScalar(pc4[j]); 1909 } 1910 /* U part */ 1911 pc4 = b->a + bdiag[i + 4] + 1; 1912 pj = b->j + bdiag[i + 4] + 1; 1913 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1914 for (j = 0; j < nz; j++) { 1915 col = pj[j]; 1916 pc4[j] = rtmp4[col]; 1917 rs += PetscAbsScalar(pc4[j]); 1918 } 1919 1920 sctx.rs = rs; 1921 sctx.pv = rtmp4[i + 3]; 1922 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1923 if (sctx.newshift) break; 1924 pc4 = b->a + bdiag[i + 3]; 1925 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1926 break; 1927 1928 default: 1929 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1930 } 1931 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1932 i += nodesz; /* Update the row */ 1933 } 1934 1935 /* MatPivotRefine() */ 1936 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1937 /* 1938 * if no shift in this attempt & shifting & started shifting & can refine, 1939 * then try lower shift 1940 */ 1941 sctx.shift_hi = sctx.shift_fraction; 1942 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1943 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1944 sctx.newshift = PETSC_TRUE; 1945 sctx.nshift++; 1946 } 1947 } while (sctx.newshift); 1948 1949 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1950 PetscCall(PetscFree(tmp_vec2)); 1951 PetscCall(ISRestoreIndices(isicol, &ic)); 1952 PetscCall(ISRestoreIndices(isrow, &r)); 1953 1954 if (b->inode.size_csr) { 1955 C->ops->solve = MatSolve_SeqAIJ_Inode; 1956 } else { 1957 C->ops->solve = MatSolve_SeqAIJ; 1958 } 1959 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1960 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1961 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1962 C->ops->matsolve = MatMatSolve_SeqAIJ; 1963 C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ; 1964 C->assembled = PETSC_TRUE; 1965 C->preallocated = PETSC_TRUE; 1966 1967 PetscCall(PetscLogFlops(C->cmap->n)); 1968 1969 /* MatShiftView(A,info,&sctx) */ 1970 if (sctx.nshift) { 1971 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1972 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1973 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1974 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1975 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1976 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1977 } 1978 } 1979 PetscFunctionReturn(PETSC_SUCCESS); 1980 } 1981 1982 #if 0 1983 // unused 1984 static PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B, Mat A, const MatFactorInfo *info) 1985 { 1986 Mat C = B; 1987 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1988 IS iscol = b->col, isrow = b->row, isicol = b->icol; 1989 const PetscInt *r, *ic, *c, *ics; 1990 PetscInt n = A->rmap->n, *bi = b->i; 1991 PetscInt *bj = b->j, *nbj = b->j + 1, *ajtmp, *bjtmp, nz, nz_tmp, row, prow; 1992 PetscInt i, j, idx, *bd = b->diag, node_max, nodesz; 1993 PetscInt *ai = a->i, *aj = a->j; 1994 PetscInt *ns, *tmp_vec1, *tmp_vec2, *nsmap, *pj; 1995 PetscScalar mul1, mul2, mul3, tmp; 1996 MatScalar *pc1, *pc2, *pc3, *ba = b->a, *pv, *rtmp11, *rtmp22, *rtmp33; 1997 const MatScalar *v1, *v2, *v3, *aa = a->a, *rtmp1; 1998 PetscReal rs = 0.0; 1999 FactorShiftCtx sctx; 2000 2001 PetscFunctionBegin; 2002 sctx.shift_top = 0; 2003 sctx.nshift_max = 0; 2004 sctx.shift_lo = 0; 2005 sctx.shift_hi = 0; 2006 sctx.shift_fraction = 0; 2007 2008 /* if both shift schemes are chosen by user, only use info->shiftpd */ 2009 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 2010 sctx.shift_top = 0; 2011 for (i = 0; i < n; i++) { 2012 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 2013 rs = 0.0; 2014 ajtmp = aj + ai[i]; 2015 rtmp1 = aa + ai[i]; 2016 nz = ai[i + 1] - ai[i]; 2017 for (j = 0; j < nz; j++) { 2018 if (*ajtmp != i) { 2019 rs += PetscAbsScalar(*rtmp1++); 2020 } else { 2021 rs -= PetscRealPart(*rtmp1++); 2022 } 2023 ajtmp++; 2024 } 2025 if (rs > sctx.shift_top) sctx.shift_top = rs; 2026 } 2027 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 2028 sctx.shift_top *= 1.1; 2029 sctx.nshift_max = 5; 2030 sctx.shift_lo = 0.; 2031 sctx.shift_hi = 1.; 2032 } 2033 sctx.shift_amount = 0; 2034 sctx.nshift = 0; 2035 2036 PetscCall(ISGetIndices(isrow, &r)); 2037 PetscCall(ISGetIndices(iscol, &c)); 2038 PetscCall(ISGetIndices(isicol, &ic)); 2039 PetscCall(PetscCalloc3(n, &rtmp11, n, &rtmp22, n, &rtmp33)); 2040 ics = ic; 2041 2042 node_max = a->inode.node_count; 2043 ns = a->inode.size; 2044 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 2045 2046 /* If max inode size > 3, split it into two inodes.*/ 2047 /* also map the inode sizes according to the ordering */ 2048 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 2049 for (i = 0, j = 0; i < node_max; ++i, ++j) { 2050 if (ns[i] > 3) { 2051 tmp_vec1[j] = ns[i] / 2; /* Assuming ns[i] < =5 */ 2052 ++j; 2053 tmp_vec1[j] = ns[i] - tmp_vec1[j - 1]; 2054 } else { 2055 tmp_vec1[j] = ns[i]; 2056 } 2057 } 2058 /* Use the correct node_max */ 2059 node_max = j; 2060 2061 /* Now reorder the inode info based on mat re-ordering info */ 2062 /* First create a row -> inode_size_array_index map */ 2063 PetscCall(PetscMalloc2(n + 1, &nsmap, node_max + 1, &tmp_vec2)); 2064 for (i = 0, row = 0; i < node_max; i++) { 2065 nodesz = tmp_vec1[i]; 2066 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 2067 } 2068 /* Using nsmap, create a reordered ns structure */ 2069 for (i = 0, j = 0; i < node_max; i++) { 2070 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 2071 tmp_vec2[i] = nodesz; 2072 j += nodesz; 2073 } 2074 PetscCall(PetscFree2(nsmap, tmp_vec1)); 2075 /* Now use the correct ns */ 2076 ns = tmp_vec2; 2077 2078 do { 2079 sctx.newshift = PETSC_FALSE; 2080 /* Now loop over each block-row, and do the factorization */ 2081 for (i = 0, row = 0; i < node_max; i++) { 2082 nodesz = ns[i]; 2083 nz = bi[row + 1] - bi[row]; 2084 bjtmp = bj + bi[row]; 2085 2086 switch (nodesz) { 2087 case 1: 2088 for (j = 0; j < nz; j++) { 2089 idx = bjtmp[j]; 2090 rtmp11[idx] = 0.0; 2091 } 2092 2093 /* load in initial (unfactored row) */ 2094 idx = r[row]; 2095 nz_tmp = ai[idx + 1] - ai[idx]; 2096 ajtmp = aj + ai[idx]; 2097 v1 = aa + ai[idx]; 2098 2099 for (j = 0; j < nz_tmp; j++) { 2100 idx = ics[ajtmp[j]]; 2101 rtmp11[idx] = v1[j]; 2102 } 2103 rtmp11[ics[r[row]]] += sctx.shift_amount; 2104 2105 prow = *bjtmp++; 2106 while (prow < row) { 2107 pc1 = rtmp11 + prow; 2108 if (*pc1 != 0.0) { 2109 pv = ba + bd[prow]; 2110 pj = nbj + bd[prow]; 2111 mul1 = *pc1 * *pv++; 2112 *pc1 = mul1; 2113 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2114 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2115 for (j = 0; j < nz_tmp; j++) { 2116 tmp = pv[j]; 2117 idx = pj[j]; 2118 rtmp11[idx] -= mul1 * tmp; 2119 } 2120 } 2121 prow = *bjtmp++; 2122 } 2123 pj = bj + bi[row]; 2124 pc1 = ba + bi[row]; 2125 2126 sctx.pv = rtmp11[row]; 2127 rtmp11[row] = 1.0 / rtmp11[row]; /* invert diag */ 2128 rs = 0.0; 2129 for (j = 0; j < nz; j++) { 2130 idx = pj[j]; 2131 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2132 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2133 } 2134 sctx.rs = rs; 2135 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2136 if (sctx.newshift) goto endofwhile; 2137 break; 2138 2139 case 2: 2140 for (j = 0; j < nz; j++) { 2141 idx = bjtmp[j]; 2142 rtmp11[idx] = 0.0; 2143 rtmp22[idx] = 0.0; 2144 } 2145 2146 /* load in initial (unfactored row) */ 2147 idx = r[row]; 2148 nz_tmp = ai[idx + 1] - ai[idx]; 2149 ajtmp = aj + ai[idx]; 2150 v1 = aa + ai[idx]; 2151 v2 = aa + ai[idx + 1]; 2152 for (j = 0; j < nz_tmp; j++) { 2153 idx = ics[ajtmp[j]]; 2154 rtmp11[idx] = v1[j]; 2155 rtmp22[idx] = v2[j]; 2156 } 2157 rtmp11[ics[r[row]]] += sctx.shift_amount; 2158 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2159 2160 prow = *bjtmp++; 2161 while (prow < row) { 2162 pc1 = rtmp11 + prow; 2163 pc2 = rtmp22 + prow; 2164 if (*pc1 != 0.0 || *pc2 != 0.0) { 2165 pv = ba + bd[prow]; 2166 pj = nbj + bd[prow]; 2167 mul1 = *pc1 * *pv; 2168 mul2 = *pc2 * *pv; 2169 ++pv; 2170 *pc1 = mul1; 2171 *pc2 = mul2; 2172 2173 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2174 for (j = 0; j < nz_tmp; j++) { 2175 tmp = pv[j]; 2176 idx = pj[j]; 2177 rtmp11[idx] -= mul1 * tmp; 2178 rtmp22[idx] -= mul2 * tmp; 2179 } 2180 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2181 } 2182 prow = *bjtmp++; 2183 } 2184 2185 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2186 pc1 = rtmp11 + prow; 2187 pc2 = rtmp22 + prow; 2188 2189 sctx.pv = *pc1; 2190 pj = bj + bi[prow]; 2191 rs = 0.0; 2192 for (j = 0; j < nz; j++) { 2193 idx = pj[j]; 2194 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2195 } 2196 sctx.rs = rs; 2197 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2198 if (sctx.newshift) goto endofwhile; 2199 2200 if (*pc2 != 0.0) { 2201 pj = nbj + bd[prow]; 2202 mul2 = (*pc2) / (*pc1); /* since diag is not yet inverted.*/ 2203 *pc2 = mul2; 2204 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2205 for (j = 0; j < nz_tmp; j++) { 2206 idx = pj[j]; 2207 tmp = rtmp11[idx]; 2208 rtmp22[idx] -= mul2 * tmp; 2209 } 2210 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2211 } 2212 2213 pj = bj + bi[row]; 2214 pc1 = ba + bi[row]; 2215 pc2 = ba + bi[row + 1]; 2216 2217 sctx.pv = rtmp22[row + 1]; 2218 rs = 0.0; 2219 rtmp11[row] = 1.0 / rtmp11[row]; 2220 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2221 /* copy row entries from dense representation to sparse */ 2222 for (j = 0; j < nz; j++) { 2223 idx = pj[j]; 2224 pc1[j] = rtmp11[idx]; 2225 pc2[j] = rtmp22[idx]; 2226 if (idx != row + 1) rs += PetscAbsScalar(pc2[j]); 2227 } 2228 sctx.rs = rs; 2229 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2230 if (sctx.newshift) goto endofwhile; 2231 break; 2232 2233 case 3: 2234 for (j = 0; j < nz; j++) { 2235 idx = bjtmp[j]; 2236 rtmp11[idx] = 0.0; 2237 rtmp22[idx] = 0.0; 2238 rtmp33[idx] = 0.0; 2239 } 2240 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2241 idx = r[row]; 2242 nz_tmp = ai[idx + 1] - ai[idx]; 2243 ajtmp = aj + ai[idx]; 2244 v1 = aa + ai[idx]; 2245 v2 = aa + ai[idx + 1]; 2246 v3 = aa + ai[idx + 2]; 2247 for (j = 0; j < nz_tmp; j++) { 2248 idx = ics[ajtmp[j]]; 2249 rtmp11[idx] = v1[j]; 2250 rtmp22[idx] = v2[j]; 2251 rtmp33[idx] = v3[j]; 2252 } 2253 rtmp11[ics[r[row]]] += sctx.shift_amount; 2254 rtmp22[ics[r[row + 1]]] += sctx.shift_amount; 2255 rtmp33[ics[r[row + 2]]] += sctx.shift_amount; 2256 2257 /* loop over all pivot row blocks above this row block */ 2258 prow = *bjtmp++; 2259 while (prow < row) { 2260 pc1 = rtmp11 + prow; 2261 pc2 = rtmp22 + prow; 2262 pc3 = rtmp33 + prow; 2263 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 2264 pv = ba + bd[prow]; 2265 pj = nbj + bd[prow]; 2266 mul1 = *pc1 * *pv; 2267 mul2 = *pc2 * *pv; 2268 mul3 = *pc3 * *pv; 2269 ++pv; 2270 *pc1 = mul1; 2271 *pc2 = mul2; 2272 *pc3 = mul3; 2273 2274 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2275 /* update this row based on pivot row */ 2276 for (j = 0; j < nz_tmp; j++) { 2277 tmp = pv[j]; 2278 idx = pj[j]; 2279 rtmp11[idx] -= mul1 * tmp; 2280 rtmp22[idx] -= mul2 * tmp; 2281 rtmp33[idx] -= mul3 * tmp; 2282 } 2283 PetscCall(PetscLogFlops(3 + 6.0 * nz_tmp)); 2284 } 2285 prow = *bjtmp++; 2286 } 2287 2288 /* Now take care of diagonal 3x3 block in this set of rows */ 2289 /* note: prow = row here */ 2290 pc1 = rtmp11 + prow; 2291 pc2 = rtmp22 + prow; 2292 pc3 = rtmp33 + prow; 2293 2294 sctx.pv = *pc1; 2295 pj = bj + bi[prow]; 2296 rs = 0.0; 2297 for (j = 0; j < nz; j++) { 2298 idx = pj[j]; 2299 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2300 } 2301 sctx.rs = rs; 2302 PetscCall(MatPivotCheck(B, A, info, &sctx, row)); 2303 if (sctx.newshift) goto endofwhile; 2304 2305 if (*pc2 != 0.0 || *pc3 != 0.0) { 2306 mul2 = (*pc2) / (*pc1); 2307 mul3 = (*pc3) / (*pc1); 2308 *pc2 = mul2; 2309 *pc3 = mul3; 2310 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2311 pj = nbj + bd[prow]; 2312 for (j = 0; j < nz_tmp; j++) { 2313 idx = pj[j]; 2314 tmp = rtmp11[idx]; 2315 rtmp22[idx] -= mul2 * tmp; 2316 rtmp33[idx] -= mul3 * tmp; 2317 } 2318 PetscCall(PetscLogFlops(2 + 4.0 * nz_tmp)); 2319 } 2320 ++prow; 2321 2322 pc2 = rtmp22 + prow; 2323 pc3 = rtmp33 + prow; 2324 sctx.pv = *pc2; 2325 pj = bj + bi[prow]; 2326 rs = 0.0; 2327 for (j = 0; j < nz; j++) { 2328 idx = pj[j]; 2329 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2330 } 2331 sctx.rs = rs; 2332 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 1)); 2333 if (sctx.newshift) goto endofwhile; 2334 2335 if (*pc3 != 0.0) { 2336 mul3 = (*pc3) / (*pc2); 2337 *pc3 = mul3; 2338 pj = nbj + bd[prow]; 2339 nz_tmp = bi[prow + 1] - bd[prow] - 1; 2340 for (j = 0; j < nz_tmp; j++) { 2341 idx = pj[j]; 2342 tmp = rtmp22[idx]; 2343 rtmp33[idx] -= mul3 * tmp; 2344 } 2345 PetscCall(PetscLogFlops(1 + 2.0 * nz_tmp)); 2346 } 2347 2348 pj = bj + bi[row]; 2349 pc1 = ba + bi[row]; 2350 pc2 = ba + bi[row + 1]; 2351 pc3 = ba + bi[row + 2]; 2352 2353 sctx.pv = rtmp33[row + 2]; 2354 rs = 0.0; 2355 rtmp11[row] = 1.0 / rtmp11[row]; 2356 rtmp22[row + 1] = 1.0 / rtmp22[row + 1]; 2357 rtmp33[row + 2] = 1.0 / rtmp33[row + 2]; 2358 /* copy row entries from dense representation to sparse */ 2359 for (j = 0; j < nz; j++) { 2360 idx = pj[j]; 2361 pc1[j] = rtmp11[idx]; 2362 pc2[j] = rtmp22[idx]; 2363 pc3[j] = rtmp33[idx]; 2364 if (idx != row + 2) rs += PetscAbsScalar(pc3[j]); 2365 } 2366 2367 sctx.rs = rs; 2368 PetscCall(MatPivotCheck(B, A, info, &sctx, row + 2)); 2369 if (sctx.newshift) goto endofwhile; 2370 break; 2371 2372 default: 2373 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 2374 } 2375 row += nodesz; /* Update the row */ 2376 } 2377 endofwhile:; 2378 } while (sctx.newshift); 2379 PetscCall(PetscFree3(rtmp11, rtmp22, rtmp33)); 2380 PetscCall(PetscFree(tmp_vec2)); 2381 PetscCall(ISRestoreIndices(isicol, &ic)); 2382 PetscCall(ISRestoreIndices(isrow, &r)); 2383 PetscCall(ISRestoreIndices(iscol, &c)); 2384 2385 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2386 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2387 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2388 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2389 C->assembled = PETSC_TRUE; 2390 C->preallocated = PETSC_TRUE; 2391 if (sctx.nshift) { 2392 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2393 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 2394 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2395 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 2396 } 2397 } 2398 PetscCall(PetscLogFlops(C->cmap->n)); 2399 PetscCall(MatSeqAIJCheckInode(C)); 2400 PetscFunctionReturn(PETSC_SUCCESS); 2401 } 2402 #endif 2403 2404 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 2405 { 2406 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2407 IS iscol = a->col, isrow = a->row; 2408 const PetscInt *r, *c, *rout, *cout; 2409 PetscInt i, j; 2410 PetscInt node_max, row, nsz, aii, i0, i1, nz; 2411 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 2412 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 2413 PetscScalar sum1, sum2, sum3, sum4, sum5; 2414 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 2415 const PetscScalar *b; 2416 2417 PetscFunctionBegin; 2418 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2419 node_max = a->inode.node_count; 2420 ns = a->inode.size_csr; /* Node Size array */ 2421 2422 PetscCall(VecGetArrayRead(bb, &b)); 2423 PetscCall(VecGetArrayWrite(xx, &x)); 2424 tmp = a->solve_work; 2425 2426 PetscCall(ISGetIndices(isrow, &rout)); 2427 r = rout; 2428 PetscCall(ISGetIndices(iscol, &cout)); 2429 c = cout; 2430 2431 /* forward solve the lower triangular */ 2432 tmps = tmp; 2433 aa = a_a; 2434 aj = a_j; 2435 ad = a->diag; 2436 2437 for (i = 0; i < node_max; ++i) { 2438 row = ns[i]; 2439 nsz = ns[i + 1] - ns[i]; 2440 aii = ai[row]; 2441 v1 = aa + aii; 2442 vi = aj + aii; 2443 nz = ai[row + 1] - ai[row]; 2444 2445 if (i < node_max - 1) { 2446 /* Prefetch the indices for the next block */ 2447 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2448 /* Prefetch the data for the next block */ 2449 PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2450 } 2451 2452 switch (nsz) { /* Each loop in 'case' is unrolled */ 2453 case 1: 2454 sum1 = b[r[row]]; 2455 for (j = 0; j < nz - 1; j += 2) { 2456 i0 = vi[j]; 2457 i1 = vi[j + 1]; 2458 tmp0 = tmps[i0]; 2459 tmp1 = tmps[i1]; 2460 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2461 } 2462 if (j == nz - 1) { 2463 tmp0 = tmps[vi[j]]; 2464 sum1 -= v1[j] * tmp0; 2465 } 2466 tmp[row++] = sum1; 2467 break; 2468 case 2: 2469 sum1 = b[r[row]]; 2470 sum2 = b[r[row + 1]]; 2471 v2 = aa + ai[row + 1]; 2472 2473 for (j = 0; j < nz - 1; j += 2) { 2474 i0 = vi[j]; 2475 i1 = vi[j + 1]; 2476 tmp0 = tmps[i0]; 2477 tmp1 = tmps[i1]; 2478 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2479 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2480 } 2481 if (j == nz - 1) { 2482 tmp0 = tmps[vi[j]]; 2483 sum1 -= v1[j] * tmp0; 2484 sum2 -= v2[j] * tmp0; 2485 } 2486 sum2 -= v2[nz] * sum1; 2487 tmp[row++] = sum1; 2488 tmp[row++] = sum2; 2489 break; 2490 case 3: 2491 sum1 = b[r[row]]; 2492 sum2 = b[r[row + 1]]; 2493 sum3 = b[r[row + 2]]; 2494 v2 = aa + ai[row + 1]; 2495 v3 = aa + ai[row + 2]; 2496 2497 for (j = 0; j < nz - 1; j += 2) { 2498 i0 = vi[j]; 2499 i1 = vi[j + 1]; 2500 tmp0 = tmps[i0]; 2501 tmp1 = tmps[i1]; 2502 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2503 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2504 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2505 } 2506 if (j == nz - 1) { 2507 tmp0 = tmps[vi[j]]; 2508 sum1 -= v1[j] * tmp0; 2509 sum2 -= v2[j] * tmp0; 2510 sum3 -= v3[j] * tmp0; 2511 } 2512 sum2 -= v2[nz] * sum1; 2513 sum3 -= v3[nz] * sum1; 2514 sum3 -= v3[nz + 1] * sum2; 2515 tmp[row++] = sum1; 2516 tmp[row++] = sum2; 2517 tmp[row++] = sum3; 2518 break; 2519 2520 case 4: 2521 sum1 = b[r[row]]; 2522 sum2 = b[r[row + 1]]; 2523 sum3 = b[r[row + 2]]; 2524 sum4 = b[r[row + 3]]; 2525 v2 = aa + ai[row + 1]; 2526 v3 = aa + ai[row + 2]; 2527 v4 = aa + ai[row + 3]; 2528 2529 for (j = 0; j < nz - 1; j += 2) { 2530 i0 = vi[j]; 2531 i1 = vi[j + 1]; 2532 tmp0 = tmps[i0]; 2533 tmp1 = tmps[i1]; 2534 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2535 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2536 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2537 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2538 } 2539 if (j == nz - 1) { 2540 tmp0 = tmps[vi[j]]; 2541 sum1 -= v1[j] * tmp0; 2542 sum2 -= v2[j] * tmp0; 2543 sum3 -= v3[j] * tmp0; 2544 sum4 -= v4[j] * tmp0; 2545 } 2546 sum2 -= v2[nz] * sum1; 2547 sum3 -= v3[nz] * sum1; 2548 sum4 -= v4[nz] * sum1; 2549 sum3 -= v3[nz + 1] * sum2; 2550 sum4 -= v4[nz + 1] * sum2; 2551 sum4 -= v4[nz + 2] * sum3; 2552 2553 tmp[row++] = sum1; 2554 tmp[row++] = sum2; 2555 tmp[row++] = sum3; 2556 tmp[row++] = sum4; 2557 break; 2558 case 5: 2559 sum1 = b[r[row]]; 2560 sum2 = b[r[row + 1]]; 2561 sum3 = b[r[row + 2]]; 2562 sum4 = b[r[row + 3]]; 2563 sum5 = b[r[row + 4]]; 2564 v2 = aa + ai[row + 1]; 2565 v3 = aa + ai[row + 2]; 2566 v4 = aa + ai[row + 3]; 2567 v5 = aa + ai[row + 4]; 2568 2569 for (j = 0; j < nz - 1; j += 2) { 2570 i0 = vi[j]; 2571 i1 = vi[j + 1]; 2572 tmp0 = tmps[i0]; 2573 tmp1 = tmps[i1]; 2574 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2575 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2576 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2577 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2578 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2579 } 2580 if (j == nz - 1) { 2581 tmp0 = tmps[vi[j]]; 2582 sum1 -= v1[j] * tmp0; 2583 sum2 -= v2[j] * tmp0; 2584 sum3 -= v3[j] * tmp0; 2585 sum4 -= v4[j] * tmp0; 2586 sum5 -= v5[j] * tmp0; 2587 } 2588 2589 sum2 -= v2[nz] * sum1; 2590 sum3 -= v3[nz] * sum1; 2591 sum4 -= v4[nz] * sum1; 2592 sum5 -= v5[nz] * sum1; 2593 sum3 -= v3[nz + 1] * sum2; 2594 sum4 -= v4[nz + 1] * sum2; 2595 sum5 -= v5[nz + 1] * sum2; 2596 sum4 -= v4[nz + 2] * sum3; 2597 sum5 -= v5[nz + 2] * sum3; 2598 sum5 -= v5[nz + 3] * sum4; 2599 2600 tmp[row++] = sum1; 2601 tmp[row++] = sum2; 2602 tmp[row++] = sum3; 2603 tmp[row++] = sum4; 2604 tmp[row++] = sum5; 2605 break; 2606 default: 2607 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2608 } 2609 } 2610 /* backward solve the upper triangular */ 2611 for (i = node_max - 1; i >= 0; i--) { 2612 row = ns[i + 1] - 1; 2613 nsz = ns[i + 1] - ns[i]; 2614 aii = ad[row + 1] + 1; 2615 v1 = aa + aii; 2616 vi = aj + aii; 2617 nz = ad[row] - ad[row + 1] - 1; 2618 2619 if (i > 0) { 2620 /* Prefetch the indices for the next block */ 2621 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2622 /* Prefetch the data for the next block */ 2623 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2624 } 2625 2626 switch (nsz) { /* Each loop in 'case' is unrolled */ 2627 case 1: 2628 sum1 = tmp[row]; 2629 2630 for (j = 0; j < nz - 1; j += 2) { 2631 i0 = vi[j]; 2632 i1 = vi[j + 1]; 2633 tmp0 = tmps[i0]; 2634 tmp1 = tmps[i1]; 2635 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2636 } 2637 if (j == nz - 1) { 2638 tmp0 = tmps[vi[j]]; 2639 sum1 -= v1[j] * tmp0; 2640 } 2641 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2642 row--; 2643 break; 2644 case 2: 2645 sum1 = tmp[row]; 2646 sum2 = tmp[row - 1]; 2647 v2 = aa + ad[row] + 1; 2648 for (j = 0; j < nz - 1; j += 2) { 2649 i0 = vi[j]; 2650 i1 = vi[j + 1]; 2651 tmp0 = tmps[i0]; 2652 tmp1 = tmps[i1]; 2653 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2654 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2655 } 2656 if (j == nz - 1) { 2657 tmp0 = tmps[vi[j]]; 2658 sum1 -= v1[j] * tmp0; 2659 sum2 -= v2[j + 1] * tmp0; 2660 } 2661 2662 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2663 row--; 2664 sum2 -= v2[0] * tmp0; 2665 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2666 row--; 2667 break; 2668 case 3: 2669 sum1 = tmp[row]; 2670 sum2 = tmp[row - 1]; 2671 sum3 = tmp[row - 2]; 2672 v2 = aa + ad[row] + 1; 2673 v3 = aa + ad[row - 1] + 1; 2674 for (j = 0; j < nz - 1; j += 2) { 2675 i0 = vi[j]; 2676 i1 = vi[j + 1]; 2677 tmp0 = tmps[i0]; 2678 tmp1 = tmps[i1]; 2679 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2680 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2681 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2682 } 2683 if (j == nz - 1) { 2684 tmp0 = tmps[vi[j]]; 2685 sum1 -= v1[j] * tmp0; 2686 sum2 -= v2[j + 1] * tmp0; 2687 sum3 -= v3[j + 2] * tmp0; 2688 } 2689 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2690 row--; 2691 sum2 -= v2[0] * tmp0; 2692 sum3 -= v3[1] * tmp0; 2693 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2694 row--; 2695 sum3 -= v3[0] * tmp0; 2696 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2697 row--; 2698 2699 break; 2700 case 4: 2701 sum1 = tmp[row]; 2702 sum2 = tmp[row - 1]; 2703 sum3 = tmp[row - 2]; 2704 sum4 = tmp[row - 3]; 2705 v2 = aa + ad[row] + 1; 2706 v3 = aa + ad[row - 1] + 1; 2707 v4 = aa + ad[row - 2] + 1; 2708 2709 for (j = 0; j < nz - 1; j += 2) { 2710 i0 = vi[j]; 2711 i1 = vi[j + 1]; 2712 tmp0 = tmps[i0]; 2713 tmp1 = tmps[i1]; 2714 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2715 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2716 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2717 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2718 } 2719 if (j == nz - 1) { 2720 tmp0 = tmps[vi[j]]; 2721 sum1 -= v1[j] * tmp0; 2722 sum2 -= v2[j + 1] * tmp0; 2723 sum3 -= v3[j + 2] * tmp0; 2724 sum4 -= v4[j + 3] * tmp0; 2725 } 2726 2727 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2728 row--; 2729 sum2 -= v2[0] * tmp0; 2730 sum3 -= v3[1] * tmp0; 2731 sum4 -= v4[2] * tmp0; 2732 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2733 row--; 2734 sum3 -= v3[0] * tmp0; 2735 sum4 -= v4[1] * tmp0; 2736 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2737 row--; 2738 sum4 -= v4[0] * tmp0; 2739 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2740 row--; 2741 break; 2742 case 5: 2743 sum1 = tmp[row]; 2744 sum2 = tmp[row - 1]; 2745 sum3 = tmp[row - 2]; 2746 sum4 = tmp[row - 3]; 2747 sum5 = tmp[row - 4]; 2748 v2 = aa + ad[row] + 1; 2749 v3 = aa + ad[row - 1] + 1; 2750 v4 = aa + ad[row - 2] + 1; 2751 v5 = aa + ad[row - 3] + 1; 2752 for (j = 0; j < nz - 1; j += 2) { 2753 i0 = vi[j]; 2754 i1 = vi[j + 1]; 2755 tmp0 = tmps[i0]; 2756 tmp1 = tmps[i1]; 2757 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2758 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2759 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2760 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2761 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2762 } 2763 if (j == nz - 1) { 2764 tmp0 = tmps[vi[j]]; 2765 sum1 -= v1[j] * tmp0; 2766 sum2 -= v2[j + 1] * tmp0; 2767 sum3 -= v3[j + 2] * tmp0; 2768 sum4 -= v4[j + 3] * tmp0; 2769 sum5 -= v5[j + 4] * tmp0; 2770 } 2771 2772 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2773 row--; 2774 sum2 -= v2[0] * tmp0; 2775 sum3 -= v3[1] * tmp0; 2776 sum4 -= v4[2] * tmp0; 2777 sum5 -= v5[3] * tmp0; 2778 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2779 row--; 2780 sum3 -= v3[0] * tmp0; 2781 sum4 -= v4[1] * tmp0; 2782 sum5 -= v5[2] * tmp0; 2783 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2784 row--; 2785 sum4 -= v4[0] * tmp0; 2786 sum5 -= v5[1] * tmp0; 2787 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2788 row--; 2789 sum5 -= v5[0] * tmp0; 2790 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2791 row--; 2792 break; 2793 default: 2794 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2795 } 2796 } 2797 PetscCall(ISRestoreIndices(isrow, &rout)); 2798 PetscCall(ISRestoreIndices(iscol, &cout)); 2799 PetscCall(VecRestoreArrayRead(bb, &b)); 2800 PetscCall(VecRestoreArrayWrite(xx, &x)); 2801 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2802 PetscFunctionReturn(PETSC_SUCCESS); 2803 } 2804 2805 /* 2806 Makes a longer coloring[] array and calls the usual code with that 2807 */ 2808 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2809 { 2810 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2811 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row; 2812 PetscInt *colorused, i; 2813 ISColoringValue *newcolor; 2814 2815 PetscFunctionBegin; 2816 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2817 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2818 /* loop over inodes, marking a color for each column*/ 2819 row = 0; 2820 for (i = 0; i < m; i++) { 2821 for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++)); 2822 } 2823 2824 /* eliminate unneeded colors */ 2825 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2826 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2827 2828 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2829 ncolors = colorused[5 * ncolors - 1]; 2830 for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i)); 2831 PetscCall(PetscFree(colorused)); 2832 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2833 PetscCall(PetscFree(coloring)); 2834 PetscFunctionReturn(PETSC_SUCCESS); 2835 } 2836 2837 #include <petsc/private/kernels/blockinvert.h> 2838 2839 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2840 { 2841 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2842 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2843 MatScalar *ibdiag, *bdiag, work[25], *t; 2844 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2845 const MatScalar *v = a->a, *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2846 const PetscScalar *xb, *b; 2847 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2848 PetscInt n, m = a->inode.node_count, cnt = 0, i, j, row, i1, i2, nodesz; 2849 PetscInt sz, k, ipvt[5]; 2850 PetscBool allowzeropivot, zeropivotdetected; 2851 const PetscInt *sizes = a->inode.size_csr, *idx, *diag = a->diag, *ii = a->i; 2852 2853 PetscFunctionBegin; 2854 allowzeropivot = PetscNot(A->erroriffailure); 2855 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2856 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2857 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2858 2859 if (!a->inode.ibdiagvalid) { 2860 if (!a->inode.ibdiag) { 2861 /* calculate space needed for diagonal blocks */ 2862 for (i = 0; i < m; i++) { 2863 nodesz = sizes[i + 1] - sizes[i]; 2864 cnt += nodesz * nodesz; 2865 } 2866 a->inode.bdiagsize = cnt; 2867 2868 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2869 } 2870 2871 /* copy over the diagonal blocks and invert them */ 2872 ibdiag = a->inode.ibdiag; 2873 bdiag = a->inode.bdiag; 2874 cnt = 0; 2875 for (i = 0, row = 0; i < m; i++) { 2876 nodesz = sizes[i + 1] - sizes[i]; 2877 for (j = 0; j < nodesz; j++) { 2878 for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k]; 2879 } 2880 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz)); 2881 2882 switch (nodesz) { 2883 case 1: 2884 /* Create matrix data structure */ 2885 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2886 if (allowzeropivot) { 2887 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2888 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2889 A->factorerror_zeropivot_row = row; 2890 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2891 } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2892 } 2893 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2894 break; 2895 case 2: 2896 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2897 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2898 break; 2899 case 3: 2900 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2901 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2902 break; 2903 case 4: 2904 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2905 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2906 break; 2907 case 5: 2908 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2909 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2910 break; 2911 default: 2912 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 2913 } 2914 cnt += nodesz * nodesz; 2915 row += nodesz; 2916 } 2917 a->inode.ibdiagvalid = PETSC_TRUE; 2918 } 2919 ibdiag = a->inode.ibdiag; 2920 bdiag = a->inode.bdiag; 2921 t = a->inode.ssor_work; 2922 2923 PetscCall(VecGetArray(xx, &x)); 2924 PetscCall(VecGetArrayRead(bb, &b)); 2925 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2926 if (flag & SOR_ZERO_INITIAL_GUESS) { 2927 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2928 for (i = 0, row = 0; i < m; i++) { 2929 sz = diag[row] - ii[row]; 2930 v1 = a->a + ii[row]; 2931 idx = a->j + ii[row]; 2932 2933 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2934 nodesz = sizes[i + 1] - sizes[i]; 2935 switch (nodesz) { 2936 case 1: 2937 2938 sum1 = b[row]; 2939 for (n = 0; n < sz - 1; n += 2) { 2940 i1 = idx[0]; 2941 i2 = idx[1]; 2942 idx += 2; 2943 tmp0 = x[i1]; 2944 tmp1 = x[i2]; 2945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2946 v1 += 2; 2947 } 2948 2949 if (n == sz - 1) { 2950 tmp0 = x[*idx]; 2951 sum1 -= *v1 * tmp0; 2952 } 2953 t[row] = sum1; 2954 x[row++] = sum1 * (*ibdiag++); 2955 break; 2956 case 2: 2957 v2 = a->a + ii[row + 1]; 2958 sum1 = b[row]; 2959 sum2 = b[row + 1]; 2960 for (n = 0; n < sz - 1; n += 2) { 2961 i1 = idx[0]; 2962 i2 = idx[1]; 2963 idx += 2; 2964 tmp0 = x[i1]; 2965 tmp1 = x[i2]; 2966 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2967 v1 += 2; 2968 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2969 v2 += 2; 2970 } 2971 2972 if (n == sz - 1) { 2973 tmp0 = x[*idx]; 2974 sum1 -= v1[0] * tmp0; 2975 sum2 -= v2[0] * tmp0; 2976 } 2977 t[row] = sum1; 2978 t[row + 1] = sum2; 2979 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2980 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2981 ibdiag += 4; 2982 break; 2983 case 3: 2984 v2 = a->a + ii[row + 1]; 2985 v3 = a->a + ii[row + 2]; 2986 sum1 = b[row]; 2987 sum2 = b[row + 1]; 2988 sum3 = b[row + 2]; 2989 for (n = 0; n < sz - 1; n += 2) { 2990 i1 = idx[0]; 2991 i2 = idx[1]; 2992 idx += 2; 2993 tmp0 = x[i1]; 2994 tmp1 = x[i2]; 2995 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2996 v1 += 2; 2997 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2998 v2 += 2; 2999 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3000 v3 += 2; 3001 } 3002 3003 if (n == sz - 1) { 3004 tmp0 = x[*idx]; 3005 sum1 -= v1[0] * tmp0; 3006 sum2 -= v2[0] * tmp0; 3007 sum3 -= v3[0] * tmp0; 3008 } 3009 t[row] = sum1; 3010 t[row + 1] = sum2; 3011 t[row + 2] = sum3; 3012 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3013 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3014 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3015 ibdiag += 9; 3016 break; 3017 case 4: 3018 v2 = a->a + ii[row + 1]; 3019 v3 = a->a + ii[row + 2]; 3020 v4 = a->a + ii[row + 3]; 3021 sum1 = b[row]; 3022 sum2 = b[row + 1]; 3023 sum3 = b[row + 2]; 3024 sum4 = b[row + 3]; 3025 for (n = 0; n < sz - 1; n += 2) { 3026 i1 = idx[0]; 3027 i2 = idx[1]; 3028 idx += 2; 3029 tmp0 = x[i1]; 3030 tmp1 = x[i2]; 3031 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3032 v1 += 2; 3033 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3034 v2 += 2; 3035 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3036 v3 += 2; 3037 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3038 v4 += 2; 3039 } 3040 3041 if (n == sz - 1) { 3042 tmp0 = x[*idx]; 3043 sum1 -= v1[0] * tmp0; 3044 sum2 -= v2[0] * tmp0; 3045 sum3 -= v3[0] * tmp0; 3046 sum4 -= v4[0] * tmp0; 3047 } 3048 t[row] = sum1; 3049 t[row + 1] = sum2; 3050 t[row + 2] = sum3; 3051 t[row + 3] = sum4; 3052 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3053 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3054 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3055 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3056 ibdiag += 16; 3057 break; 3058 case 5: 3059 v2 = a->a + ii[row + 1]; 3060 v3 = a->a + ii[row + 2]; 3061 v4 = a->a + ii[row + 3]; 3062 v5 = a->a + ii[row + 4]; 3063 sum1 = b[row]; 3064 sum2 = b[row + 1]; 3065 sum3 = b[row + 2]; 3066 sum4 = b[row + 3]; 3067 sum5 = b[row + 4]; 3068 for (n = 0; n < sz - 1; n += 2) { 3069 i1 = idx[0]; 3070 i2 = idx[1]; 3071 idx += 2; 3072 tmp0 = x[i1]; 3073 tmp1 = x[i2]; 3074 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3075 v1 += 2; 3076 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3077 v2 += 2; 3078 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3079 v3 += 2; 3080 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3081 v4 += 2; 3082 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3083 v5 += 2; 3084 } 3085 3086 if (n == sz - 1) { 3087 tmp0 = x[*idx]; 3088 sum1 -= v1[0] * tmp0; 3089 sum2 -= v2[0] * tmp0; 3090 sum3 -= v3[0] * tmp0; 3091 sum4 -= v4[0] * tmp0; 3092 sum5 -= v5[0] * tmp0; 3093 } 3094 t[row] = sum1; 3095 t[row + 1] = sum2; 3096 t[row + 2] = sum3; 3097 t[row + 3] = sum4; 3098 t[row + 4] = sum5; 3099 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3100 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3101 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3102 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3103 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3104 ibdiag += 25; 3105 break; 3106 default: 3107 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3108 } 3109 } 3110 3111 xb = t; 3112 PetscCall(PetscLogFlops(a->nz)); 3113 } else xb = b; 3114 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3115 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3116 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3117 nodesz = sizes[i + 1] - sizes[i]; 3118 ibdiag -= nodesz * nodesz; 3119 sz = ii[row + 1] - diag[row] - 1; 3120 v1 = a->a + diag[row] + 1; 3121 idx = a->j + diag[row] + 1; 3122 3123 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3124 switch (nodesz) { 3125 case 1: 3126 3127 sum1 = xb[row]; 3128 for (n = 0; n < sz - 1; n += 2) { 3129 i1 = idx[0]; 3130 i2 = idx[1]; 3131 idx += 2; 3132 tmp0 = x[i1]; 3133 tmp1 = x[i2]; 3134 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3135 v1 += 2; 3136 } 3137 3138 if (n == sz - 1) { 3139 tmp0 = x[*idx]; 3140 sum1 -= *v1 * tmp0; 3141 } 3142 x[row--] = sum1 * (*ibdiag); 3143 break; 3144 3145 case 2: 3146 3147 sum1 = xb[row]; 3148 sum2 = xb[row - 1]; 3149 /* note that sum1 is associated with the second of the two rows */ 3150 v2 = a->a + diag[row - 1] + 2; 3151 for (n = 0; n < sz - 1; n += 2) { 3152 i1 = idx[0]; 3153 i2 = idx[1]; 3154 idx += 2; 3155 tmp0 = x[i1]; 3156 tmp1 = x[i2]; 3157 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3158 v1 += 2; 3159 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3160 v2 += 2; 3161 } 3162 3163 if (n == sz - 1) { 3164 tmp0 = x[*idx]; 3165 sum1 -= *v1 * tmp0; 3166 sum2 -= *v2 * tmp0; 3167 } 3168 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3169 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3170 break; 3171 case 3: 3172 3173 sum1 = xb[row]; 3174 sum2 = xb[row - 1]; 3175 sum3 = xb[row - 2]; 3176 v2 = a->a + diag[row - 1] + 2; 3177 v3 = a->a + diag[row - 2] + 3; 3178 for (n = 0; n < sz - 1; n += 2) { 3179 i1 = idx[0]; 3180 i2 = idx[1]; 3181 idx += 2; 3182 tmp0 = x[i1]; 3183 tmp1 = x[i2]; 3184 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3185 v1 += 2; 3186 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3187 v2 += 2; 3188 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3189 v3 += 2; 3190 } 3191 3192 if (n == sz - 1) { 3193 tmp0 = x[*idx]; 3194 sum1 -= *v1 * tmp0; 3195 sum2 -= *v2 * tmp0; 3196 sum3 -= *v3 * tmp0; 3197 } 3198 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3199 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3200 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3201 break; 3202 case 4: 3203 3204 sum1 = xb[row]; 3205 sum2 = xb[row - 1]; 3206 sum3 = xb[row - 2]; 3207 sum4 = xb[row - 3]; 3208 v2 = a->a + diag[row - 1] + 2; 3209 v3 = a->a + diag[row - 2] + 3; 3210 v4 = a->a + diag[row - 3] + 4; 3211 for (n = 0; n < sz - 1; n += 2) { 3212 i1 = idx[0]; 3213 i2 = idx[1]; 3214 idx += 2; 3215 tmp0 = x[i1]; 3216 tmp1 = x[i2]; 3217 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3218 v1 += 2; 3219 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3220 v2 += 2; 3221 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3222 v3 += 2; 3223 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3224 v4 += 2; 3225 } 3226 3227 if (n == sz - 1) { 3228 tmp0 = x[*idx]; 3229 sum1 -= *v1 * tmp0; 3230 sum2 -= *v2 * tmp0; 3231 sum3 -= *v3 * tmp0; 3232 sum4 -= *v4 * tmp0; 3233 } 3234 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3235 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3236 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3237 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3238 break; 3239 case 5: 3240 3241 sum1 = xb[row]; 3242 sum2 = xb[row - 1]; 3243 sum3 = xb[row - 2]; 3244 sum4 = xb[row - 3]; 3245 sum5 = xb[row - 4]; 3246 v2 = a->a + diag[row - 1] + 2; 3247 v3 = a->a + diag[row - 2] + 3; 3248 v4 = a->a + diag[row - 3] + 4; 3249 v5 = a->a + diag[row - 4] + 5; 3250 for (n = 0; n < sz - 1; n += 2) { 3251 i1 = idx[0]; 3252 i2 = idx[1]; 3253 idx += 2; 3254 tmp0 = x[i1]; 3255 tmp1 = x[i2]; 3256 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3257 v1 += 2; 3258 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3259 v2 += 2; 3260 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3261 v3 += 2; 3262 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3263 v4 += 2; 3264 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3265 v5 += 2; 3266 } 3267 3268 if (n == sz - 1) { 3269 tmp0 = x[*idx]; 3270 sum1 -= *v1 * tmp0; 3271 sum2 -= *v2 * tmp0; 3272 sum3 -= *v3 * tmp0; 3273 sum4 -= *v4 * tmp0; 3274 sum5 -= *v5 * tmp0; 3275 } 3276 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3277 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3278 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3279 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3280 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3281 break; 3282 default: 3283 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3284 } 3285 } 3286 3287 PetscCall(PetscLogFlops(a->nz)); 3288 } 3289 its--; 3290 } 3291 while (its--) { 3292 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3293 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) { 3294 nodesz = sizes[i + 1] - sizes[i]; 3295 sz = diag[row] - ii[row]; 3296 v1 = a->a + ii[row]; 3297 idx = a->j + ii[row]; 3298 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3299 switch (nodesz) { 3300 case 1: 3301 sum1 = b[row]; 3302 for (n = 0; n < sz - 1; n += 2) { 3303 i1 = idx[0]; 3304 i2 = idx[1]; 3305 idx += 2; 3306 tmp0 = x[i1]; 3307 tmp1 = x[i2]; 3308 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3309 v1 += 2; 3310 } 3311 if (n == sz - 1) { 3312 tmp0 = x[*idx++]; 3313 sum1 -= *v1 * tmp0; 3314 v1++; 3315 } 3316 t[row] = sum1; 3317 sz = ii[row + 1] - diag[row] - 1; 3318 idx = a->j + diag[row] + 1; 3319 v1 += 1; 3320 for (n = 0; n < sz - 1; n += 2) { 3321 i1 = idx[0]; 3322 i2 = idx[1]; 3323 idx += 2; 3324 tmp0 = x[i1]; 3325 tmp1 = x[i2]; 3326 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3327 v1 += 2; 3328 } 3329 if (n == sz - 1) { 3330 tmp0 = x[*idx++]; 3331 sum1 -= *v1 * tmp0; 3332 } 3333 /* in MatSOR_SeqAIJ this line would be 3334 * 3335 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3336 * 3337 * but omega == 1, so this becomes 3338 * 3339 * x[row] = sum1*(*ibdiag++); 3340 * 3341 */ 3342 x[row] = sum1 * (*ibdiag); 3343 break; 3344 case 2: 3345 v2 = a->a + ii[row + 1]; 3346 sum1 = b[row]; 3347 sum2 = b[row + 1]; 3348 for (n = 0; n < sz - 1; n += 2) { 3349 i1 = idx[0]; 3350 i2 = idx[1]; 3351 idx += 2; 3352 tmp0 = x[i1]; 3353 tmp1 = x[i2]; 3354 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3355 v1 += 2; 3356 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3357 v2 += 2; 3358 } 3359 if (n == sz - 1) { 3360 tmp0 = x[*idx++]; 3361 sum1 -= v1[0] * tmp0; 3362 sum2 -= v2[0] * tmp0; 3363 v1++; 3364 v2++; 3365 } 3366 t[row] = sum1; 3367 t[row + 1] = sum2; 3368 sz = ii[row + 1] - diag[row] - 2; 3369 idx = a->j + diag[row] + 2; 3370 v1 += 2; 3371 v2 += 2; 3372 for (n = 0; n < sz - 1; n += 2) { 3373 i1 = idx[0]; 3374 i2 = idx[1]; 3375 idx += 2; 3376 tmp0 = x[i1]; 3377 tmp1 = x[i2]; 3378 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3379 v1 += 2; 3380 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3381 v2 += 2; 3382 } 3383 if (n == sz - 1) { 3384 tmp0 = x[*idx]; 3385 sum1 -= v1[0] * tmp0; 3386 sum2 -= v2[0] * tmp0; 3387 } 3388 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3389 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3390 break; 3391 case 3: 3392 v2 = a->a + ii[row + 1]; 3393 v3 = a->a + ii[row + 2]; 3394 sum1 = b[row]; 3395 sum2 = b[row + 1]; 3396 sum3 = b[row + 2]; 3397 for (n = 0; n < sz - 1; n += 2) { 3398 i1 = idx[0]; 3399 i2 = idx[1]; 3400 idx += 2; 3401 tmp0 = x[i1]; 3402 tmp1 = x[i2]; 3403 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3404 v1 += 2; 3405 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3406 v2 += 2; 3407 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3408 v3 += 2; 3409 } 3410 if (n == sz - 1) { 3411 tmp0 = x[*idx++]; 3412 sum1 -= v1[0] * tmp0; 3413 sum2 -= v2[0] * tmp0; 3414 sum3 -= v3[0] * tmp0; 3415 v1++; 3416 v2++; 3417 v3++; 3418 } 3419 t[row] = sum1; 3420 t[row + 1] = sum2; 3421 t[row + 2] = sum3; 3422 sz = ii[row + 1] - diag[row] - 3; 3423 idx = a->j + diag[row] + 3; 3424 v1 += 3; 3425 v2 += 3; 3426 v3 += 3; 3427 for (n = 0; n < sz - 1; n += 2) { 3428 i1 = idx[0]; 3429 i2 = idx[1]; 3430 idx += 2; 3431 tmp0 = x[i1]; 3432 tmp1 = x[i2]; 3433 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3434 v1 += 2; 3435 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3436 v2 += 2; 3437 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3438 v3 += 2; 3439 } 3440 if (n == sz - 1) { 3441 tmp0 = x[*idx]; 3442 sum1 -= v1[0] * tmp0; 3443 sum2 -= v2[0] * tmp0; 3444 sum3 -= v3[0] * tmp0; 3445 } 3446 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3447 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3448 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3449 break; 3450 case 4: 3451 v2 = a->a + ii[row + 1]; 3452 v3 = a->a + ii[row + 2]; 3453 v4 = a->a + ii[row + 3]; 3454 sum1 = b[row]; 3455 sum2 = b[row + 1]; 3456 sum3 = b[row + 2]; 3457 sum4 = b[row + 3]; 3458 for (n = 0; n < sz - 1; n += 2) { 3459 i1 = idx[0]; 3460 i2 = idx[1]; 3461 idx += 2; 3462 tmp0 = x[i1]; 3463 tmp1 = x[i2]; 3464 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3465 v1 += 2; 3466 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3467 v2 += 2; 3468 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3469 v3 += 2; 3470 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3471 v4 += 2; 3472 } 3473 if (n == sz - 1) { 3474 tmp0 = x[*idx++]; 3475 sum1 -= v1[0] * tmp0; 3476 sum2 -= v2[0] * tmp0; 3477 sum3 -= v3[0] * tmp0; 3478 sum4 -= v4[0] * tmp0; 3479 v1++; 3480 v2++; 3481 v3++; 3482 v4++; 3483 } 3484 t[row] = sum1; 3485 t[row + 1] = sum2; 3486 t[row + 2] = sum3; 3487 t[row + 3] = sum4; 3488 sz = ii[row + 1] - diag[row] - 4; 3489 idx = a->j + diag[row] + 4; 3490 v1 += 4; 3491 v2 += 4; 3492 v3 += 4; 3493 v4 += 4; 3494 for (n = 0; n < sz - 1; n += 2) { 3495 i1 = idx[0]; 3496 i2 = idx[1]; 3497 idx += 2; 3498 tmp0 = x[i1]; 3499 tmp1 = x[i2]; 3500 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3501 v1 += 2; 3502 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3503 v2 += 2; 3504 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3505 v3 += 2; 3506 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3507 v4 += 2; 3508 } 3509 if (n == sz - 1) { 3510 tmp0 = x[*idx]; 3511 sum1 -= v1[0] * tmp0; 3512 sum2 -= v2[0] * tmp0; 3513 sum3 -= v3[0] * tmp0; 3514 sum4 -= v4[0] * tmp0; 3515 } 3516 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3517 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3518 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3519 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3520 break; 3521 case 5: 3522 v2 = a->a + ii[row + 1]; 3523 v3 = a->a + ii[row + 2]; 3524 v4 = a->a + ii[row + 3]; 3525 v5 = a->a + ii[row + 4]; 3526 sum1 = b[row]; 3527 sum2 = b[row + 1]; 3528 sum3 = b[row + 2]; 3529 sum4 = b[row + 3]; 3530 sum5 = b[row + 4]; 3531 for (n = 0; n < sz - 1; n += 2) { 3532 i1 = idx[0]; 3533 i2 = idx[1]; 3534 idx += 2; 3535 tmp0 = x[i1]; 3536 tmp1 = x[i2]; 3537 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3538 v1 += 2; 3539 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3540 v2 += 2; 3541 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3542 v3 += 2; 3543 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3544 v4 += 2; 3545 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3546 v5 += 2; 3547 } 3548 if (n == sz - 1) { 3549 tmp0 = x[*idx++]; 3550 sum1 -= v1[0] * tmp0; 3551 sum2 -= v2[0] * tmp0; 3552 sum3 -= v3[0] * tmp0; 3553 sum4 -= v4[0] * tmp0; 3554 sum5 -= v5[0] * tmp0; 3555 v1++; 3556 v2++; 3557 v3++; 3558 v4++; 3559 v5++; 3560 } 3561 t[row] = sum1; 3562 t[row + 1] = sum2; 3563 t[row + 2] = sum3; 3564 t[row + 3] = sum4; 3565 t[row + 4] = sum5; 3566 sz = ii[row + 1] - diag[row] - 5; 3567 idx = a->j + diag[row] + 5; 3568 v1 += 5; 3569 v2 += 5; 3570 v3 += 5; 3571 v4 += 5; 3572 v5 += 5; 3573 for (n = 0; n < sz - 1; n += 2) { 3574 i1 = idx[0]; 3575 i2 = idx[1]; 3576 idx += 2; 3577 tmp0 = x[i1]; 3578 tmp1 = x[i2]; 3579 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3580 v1 += 2; 3581 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3582 v2 += 2; 3583 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3584 v3 += 2; 3585 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3586 v4 += 2; 3587 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3588 v5 += 2; 3589 } 3590 if (n == sz - 1) { 3591 tmp0 = x[*idx]; 3592 sum1 -= v1[0] * tmp0; 3593 sum2 -= v2[0] * tmp0; 3594 sum3 -= v3[0] * tmp0; 3595 sum4 -= v4[0] * tmp0; 3596 sum5 -= v5[0] * tmp0; 3597 } 3598 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3599 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3600 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3601 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3602 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3603 break; 3604 default: 3605 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3606 } 3607 } 3608 xb = t; 3609 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3610 } else xb = b; 3611 3612 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3613 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3614 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3615 nodesz = sizes[i + 1] - sizes[i]; 3616 ibdiag -= nodesz * nodesz; 3617 3618 /* set RHS */ 3619 if (xb == b) { 3620 /* whole (old way) */ 3621 sz = ii[row + 1] - ii[row]; 3622 idx = a->j + ii[row]; 3623 switch (nodesz) { 3624 case 5: 3625 v5 = a->a + ii[row - 4]; /* fall through */ 3626 case 4: 3627 v4 = a->a + ii[row - 3]; /* fall through */ 3628 case 3: 3629 v3 = a->a + ii[row - 2]; /* fall through */ 3630 case 2: 3631 v2 = a->a + ii[row - 1]; /* fall through */ 3632 case 1: 3633 v1 = a->a + ii[row]; 3634 break; 3635 default: 3636 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3637 } 3638 } else { 3639 /* upper, no diag */ 3640 sz = ii[row + 1] - diag[row] - 1; 3641 idx = a->j + diag[row] + 1; 3642 switch (nodesz) { 3643 case 5: 3644 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3645 case 4: 3646 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3647 case 3: 3648 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3649 case 2: 3650 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3651 case 1: 3652 v1 = a->a + diag[row] + 1; 3653 } 3654 } 3655 /* set sum */ 3656 switch (nodesz) { 3657 case 5: 3658 sum5 = xb[row - 4]; /* fall through */ 3659 case 4: 3660 sum4 = xb[row - 3]; /* fall through */ 3661 case 3: 3662 sum3 = xb[row - 2]; /* fall through */ 3663 case 2: 3664 sum2 = xb[row - 1]; /* fall through */ 3665 case 1: 3666 /* note that sum1 is associated with the last row */ 3667 sum1 = xb[row]; 3668 } 3669 /* do sums */ 3670 for (n = 0; n < sz - 1; n += 2) { 3671 i1 = idx[0]; 3672 i2 = idx[1]; 3673 idx += 2; 3674 tmp0 = x[i1]; 3675 tmp1 = x[i2]; 3676 switch (nodesz) { 3677 case 5: 3678 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3679 v5 += 2; /* fall through */ 3680 case 4: 3681 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3682 v4 += 2; /* fall through */ 3683 case 3: 3684 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3685 v3 += 2; /* fall through */ 3686 case 2: 3687 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3688 v2 += 2; /* fall through */ 3689 case 1: 3690 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3691 v1 += 2; 3692 } 3693 } 3694 /* ragged edge */ 3695 if (n == sz - 1) { 3696 tmp0 = x[*idx]; 3697 switch (nodesz) { 3698 case 5: 3699 sum5 -= *v5 * tmp0; /* fall through */ 3700 case 4: 3701 sum4 -= *v4 * tmp0; /* fall through */ 3702 case 3: 3703 sum3 -= *v3 * tmp0; /* fall through */ 3704 case 2: 3705 sum2 -= *v2 * tmp0; /* fall through */ 3706 case 1: 3707 sum1 -= *v1 * tmp0; 3708 } 3709 } 3710 /* update */ 3711 if (xb == b) { 3712 /* whole (old way) w/ diag */ 3713 switch (nodesz) { 3714 case 5: 3715 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3716 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3717 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3718 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3719 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3720 break; 3721 case 4: 3722 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3723 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3724 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3725 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3726 break; 3727 case 3: 3728 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3729 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3730 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3731 break; 3732 case 2: 3733 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3734 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3735 break; 3736 case 1: 3737 x[row--] += sum1 * (*ibdiag); 3738 break; 3739 } 3740 } else { 3741 /* no diag so set = */ 3742 switch (nodesz) { 3743 case 5: 3744 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3745 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3746 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3747 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3748 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3749 break; 3750 case 4: 3751 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3752 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3753 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3754 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3755 break; 3756 case 3: 3757 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3758 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3759 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3760 break; 3761 case 2: 3762 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3763 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3764 break; 3765 case 1: 3766 x[row--] = sum1 * (*ibdiag); 3767 break; 3768 } 3769 } 3770 } 3771 if (xb == b) { 3772 PetscCall(PetscLogFlops(2.0 * a->nz)); 3773 } else { 3774 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3775 } 3776 } 3777 } 3778 if (flag & SOR_EISENSTAT) { 3779 /* 3780 Apply (U + D)^-1 where D is now the block diagonal 3781 */ 3782 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3783 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3784 nodesz = sizes[i + 1] - sizes[i]; 3785 ibdiag -= nodesz * nodesz; 3786 sz = ii[row + 1] - diag[row] - 1; 3787 v1 = a->a + diag[row] + 1; 3788 idx = a->j + diag[row] + 1; 3789 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3790 switch (nodesz) { 3791 case 1: 3792 3793 sum1 = b[row]; 3794 for (n = 0; n < sz - 1; n += 2) { 3795 i1 = idx[0]; 3796 i2 = idx[1]; 3797 idx += 2; 3798 tmp0 = x[i1]; 3799 tmp1 = x[i2]; 3800 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3801 v1 += 2; 3802 } 3803 3804 if (n == sz - 1) { 3805 tmp0 = x[*idx]; 3806 sum1 -= *v1 * tmp0; 3807 } 3808 x[row] = sum1 * (*ibdiag); 3809 row--; 3810 break; 3811 3812 case 2: 3813 3814 sum1 = b[row]; 3815 sum2 = b[row - 1]; 3816 /* note that sum1 is associated with the second of the two rows */ 3817 v2 = a->a + diag[row - 1] + 2; 3818 for (n = 0; n < sz - 1; n += 2) { 3819 i1 = idx[0]; 3820 i2 = idx[1]; 3821 idx += 2; 3822 tmp0 = x[i1]; 3823 tmp1 = x[i2]; 3824 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3825 v1 += 2; 3826 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3827 v2 += 2; 3828 } 3829 3830 if (n == sz - 1) { 3831 tmp0 = x[*idx]; 3832 sum1 -= *v1 * tmp0; 3833 sum2 -= *v2 * tmp0; 3834 } 3835 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3836 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3837 row -= 2; 3838 break; 3839 case 3: 3840 3841 sum1 = b[row]; 3842 sum2 = b[row - 1]; 3843 sum3 = b[row - 2]; 3844 v2 = a->a + diag[row - 1] + 2; 3845 v3 = a->a + diag[row - 2] + 3; 3846 for (n = 0; n < sz - 1; n += 2) { 3847 i1 = idx[0]; 3848 i2 = idx[1]; 3849 idx += 2; 3850 tmp0 = x[i1]; 3851 tmp1 = x[i2]; 3852 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3853 v1 += 2; 3854 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3855 v2 += 2; 3856 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3857 v3 += 2; 3858 } 3859 3860 if (n == sz - 1) { 3861 tmp0 = x[*idx]; 3862 sum1 -= *v1 * tmp0; 3863 sum2 -= *v2 * tmp0; 3864 sum3 -= *v3 * tmp0; 3865 } 3866 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3867 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3868 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3869 row -= 3; 3870 break; 3871 case 4: 3872 3873 sum1 = b[row]; 3874 sum2 = b[row - 1]; 3875 sum3 = b[row - 2]; 3876 sum4 = b[row - 3]; 3877 v2 = a->a + diag[row - 1] + 2; 3878 v3 = a->a + diag[row - 2] + 3; 3879 v4 = a->a + diag[row - 3] + 4; 3880 for (n = 0; n < sz - 1; n += 2) { 3881 i1 = idx[0]; 3882 i2 = idx[1]; 3883 idx += 2; 3884 tmp0 = x[i1]; 3885 tmp1 = x[i2]; 3886 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3887 v1 += 2; 3888 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3889 v2 += 2; 3890 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3891 v3 += 2; 3892 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3893 v4 += 2; 3894 } 3895 3896 if (n == sz - 1) { 3897 tmp0 = x[*idx]; 3898 sum1 -= *v1 * tmp0; 3899 sum2 -= *v2 * tmp0; 3900 sum3 -= *v3 * tmp0; 3901 sum4 -= *v4 * tmp0; 3902 } 3903 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3904 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3905 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3906 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3907 row -= 4; 3908 break; 3909 case 5: 3910 3911 sum1 = b[row]; 3912 sum2 = b[row - 1]; 3913 sum3 = b[row - 2]; 3914 sum4 = b[row - 3]; 3915 sum5 = b[row - 4]; 3916 v2 = a->a + diag[row - 1] + 2; 3917 v3 = a->a + diag[row - 2] + 3; 3918 v4 = a->a + diag[row - 3] + 4; 3919 v5 = a->a + diag[row - 4] + 5; 3920 for (n = 0; n < sz - 1; n += 2) { 3921 i1 = idx[0]; 3922 i2 = idx[1]; 3923 idx += 2; 3924 tmp0 = x[i1]; 3925 tmp1 = x[i2]; 3926 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3927 v1 += 2; 3928 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3929 v2 += 2; 3930 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3931 v3 += 2; 3932 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3933 v4 += 2; 3934 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3935 v5 += 2; 3936 } 3937 3938 if (n == sz - 1) { 3939 tmp0 = x[*idx]; 3940 sum1 -= *v1 * tmp0; 3941 sum2 -= *v2 * tmp0; 3942 sum3 -= *v3 * tmp0; 3943 sum4 -= *v4 * tmp0; 3944 sum5 -= *v5 * tmp0; 3945 } 3946 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3947 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3948 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3949 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3950 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3951 row -= 5; 3952 break; 3953 default: 3954 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3955 } 3956 } 3957 PetscCall(PetscLogFlops(a->nz)); 3958 3959 /* 3960 t = b - D x where D is the block diagonal 3961 */ 3962 cnt = 0; 3963 for (i = 0, row = 0; i < m; i++) { 3964 nodesz = sizes[i + 1] - sizes[i]; 3965 switch (nodesz) { 3966 case 1: 3967 t[row] = b[row] - bdiag[cnt++] * x[row]; 3968 row++; 3969 break; 3970 case 2: 3971 x1 = x[row]; 3972 x2 = x[row + 1]; 3973 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3974 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3975 t[row] = b[row] - tmp1; 3976 t[row + 1] = b[row + 1] - tmp2; 3977 row += 2; 3978 cnt += 4; 3979 break; 3980 case 3: 3981 x1 = x[row]; 3982 x2 = x[row + 1]; 3983 x3 = x[row + 2]; 3984 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3985 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3986 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3987 t[row] = b[row] - tmp1; 3988 t[row + 1] = b[row + 1] - tmp2; 3989 t[row + 2] = b[row + 2] - tmp3; 3990 row += 3; 3991 cnt += 9; 3992 break; 3993 case 4: 3994 x1 = x[row]; 3995 x2 = x[row + 1]; 3996 x3 = x[row + 2]; 3997 x4 = x[row + 3]; 3998 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3999 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4000 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4001 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4002 t[row] = b[row] - tmp1; 4003 t[row + 1] = b[row + 1] - tmp2; 4004 t[row + 2] = b[row + 2] - tmp3; 4005 t[row + 3] = b[row + 3] - tmp4; 4006 row += 4; 4007 cnt += 16; 4008 break; 4009 case 5: 4010 x1 = x[row]; 4011 x2 = x[row + 1]; 4012 x3 = x[row + 2]; 4013 x4 = x[row + 3]; 4014 x5 = x[row + 4]; 4015 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4016 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4017 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4018 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4019 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4020 t[row] = b[row] - tmp1; 4021 t[row + 1] = b[row + 1] - tmp2; 4022 t[row + 2] = b[row + 2] - tmp3; 4023 t[row + 3] = b[row + 3] - tmp4; 4024 t[row + 4] = b[row + 4] - tmp5; 4025 row += 5; 4026 cnt += 25; 4027 break; 4028 default: 4029 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4030 } 4031 } 4032 PetscCall(PetscLogFlops(m)); 4033 4034 /* 4035 Apply (L + D)^-1 where D is the block diagonal 4036 */ 4037 for (i = 0, row = 0; i < m; i++) { 4038 nodesz = sizes[i + 1] - sizes[i]; 4039 sz = diag[row] - ii[row]; 4040 v1 = a->a + ii[row]; 4041 idx = a->j + ii[row]; 4042 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 4043 switch (nodesz) { 4044 case 1: 4045 4046 sum1 = t[row]; 4047 for (n = 0; n < sz - 1; n += 2) { 4048 i1 = idx[0]; 4049 i2 = idx[1]; 4050 idx += 2; 4051 tmp0 = t[i1]; 4052 tmp1 = t[i2]; 4053 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4054 v1 += 2; 4055 } 4056 4057 if (n == sz - 1) { 4058 tmp0 = t[*idx]; 4059 sum1 -= *v1 * tmp0; 4060 } 4061 x[row] += t[row] = sum1 * (*ibdiag++); 4062 row++; 4063 break; 4064 case 2: 4065 v2 = a->a + ii[row + 1]; 4066 sum1 = t[row]; 4067 sum2 = t[row + 1]; 4068 for (n = 0; n < sz - 1; n += 2) { 4069 i1 = idx[0]; 4070 i2 = idx[1]; 4071 idx += 2; 4072 tmp0 = t[i1]; 4073 tmp1 = t[i2]; 4074 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4075 v1 += 2; 4076 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4077 v2 += 2; 4078 } 4079 4080 if (n == sz - 1) { 4081 tmp0 = t[*idx]; 4082 sum1 -= v1[0] * tmp0; 4083 sum2 -= v2[0] * tmp0; 4084 } 4085 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 4086 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 4087 ibdiag += 4; 4088 row += 2; 4089 break; 4090 case 3: 4091 v2 = a->a + ii[row + 1]; 4092 v3 = a->a + ii[row + 2]; 4093 sum1 = t[row]; 4094 sum2 = t[row + 1]; 4095 sum3 = t[row + 2]; 4096 for (n = 0; n < sz - 1; n += 2) { 4097 i1 = idx[0]; 4098 i2 = idx[1]; 4099 idx += 2; 4100 tmp0 = t[i1]; 4101 tmp1 = t[i2]; 4102 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4103 v1 += 2; 4104 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4105 v2 += 2; 4106 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4107 v3 += 2; 4108 } 4109 4110 if (n == sz - 1) { 4111 tmp0 = t[*idx]; 4112 sum1 -= v1[0] * tmp0; 4113 sum2 -= v2[0] * tmp0; 4114 sum3 -= v3[0] * tmp0; 4115 } 4116 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 4117 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 4118 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 4119 ibdiag += 9; 4120 row += 3; 4121 break; 4122 case 4: 4123 v2 = a->a + ii[row + 1]; 4124 v3 = a->a + ii[row + 2]; 4125 v4 = a->a + ii[row + 3]; 4126 sum1 = t[row]; 4127 sum2 = t[row + 1]; 4128 sum3 = t[row + 2]; 4129 sum4 = t[row + 3]; 4130 for (n = 0; n < sz - 1; n += 2) { 4131 i1 = idx[0]; 4132 i2 = idx[1]; 4133 idx += 2; 4134 tmp0 = t[i1]; 4135 tmp1 = t[i2]; 4136 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4137 v1 += 2; 4138 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4139 v2 += 2; 4140 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4141 v3 += 2; 4142 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4143 v4 += 2; 4144 } 4145 4146 if (n == sz - 1) { 4147 tmp0 = t[*idx]; 4148 sum1 -= v1[0] * tmp0; 4149 sum2 -= v2[0] * tmp0; 4150 sum3 -= v3[0] * tmp0; 4151 sum4 -= v4[0] * tmp0; 4152 } 4153 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 4154 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 4155 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 4156 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 4157 ibdiag += 16; 4158 row += 4; 4159 break; 4160 case 5: 4161 v2 = a->a + ii[row + 1]; 4162 v3 = a->a + ii[row + 2]; 4163 v4 = a->a + ii[row + 3]; 4164 v5 = a->a + ii[row + 4]; 4165 sum1 = t[row]; 4166 sum2 = t[row + 1]; 4167 sum3 = t[row + 2]; 4168 sum4 = t[row + 3]; 4169 sum5 = t[row + 4]; 4170 for (n = 0; n < sz - 1; n += 2) { 4171 i1 = idx[0]; 4172 i2 = idx[1]; 4173 idx += 2; 4174 tmp0 = t[i1]; 4175 tmp1 = t[i2]; 4176 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 4177 v1 += 2; 4178 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 4179 v2 += 2; 4180 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 4181 v3 += 2; 4182 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 4183 v4 += 2; 4184 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 4185 v5 += 2; 4186 } 4187 4188 if (n == sz - 1) { 4189 tmp0 = t[*idx]; 4190 sum1 -= v1[0] * tmp0; 4191 sum2 -= v2[0] * tmp0; 4192 sum3 -= v3[0] * tmp0; 4193 sum4 -= v4[0] * tmp0; 4194 sum5 -= v5[0] * tmp0; 4195 } 4196 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 4197 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 4198 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 4199 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 4200 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 4201 ibdiag += 25; 4202 row += 5; 4203 break; 4204 default: 4205 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4206 } 4207 } 4208 PetscCall(PetscLogFlops(a->nz)); 4209 } 4210 PetscCall(VecRestoreArray(xx, &x)); 4211 PetscCall(VecRestoreArrayRead(bb, &b)); 4212 PetscFunctionReturn(PETSC_SUCCESS); 4213 } 4214 4215 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 4216 { 4217 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4218 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 4219 const MatScalar *bdiag = a->inode.bdiag; 4220 const PetscScalar *b; 4221 PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz; 4222 const PetscInt *sizes = a->inode.size_csr; 4223 4224 PetscFunctionBegin; 4225 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 4226 PetscCall(VecGetArray(xx, &x)); 4227 PetscCall(VecGetArrayRead(bb, &b)); 4228 cnt = 0; 4229 for (i = 0, row = 0; i < m; i++) { 4230 nodesz = sizes[i + 1] - sizes[i]; 4231 switch (nodesz) { 4232 case 1: 4233 x[row] = b[row] * bdiag[cnt++]; 4234 row++; 4235 break; 4236 case 2: 4237 x1 = b[row]; 4238 x2 = b[row + 1]; 4239 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 4240 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 4241 x[row++] = tmp1; 4242 x[row++] = tmp2; 4243 cnt += 4; 4244 break; 4245 case 3: 4246 x1 = b[row]; 4247 x2 = b[row + 1]; 4248 x3 = b[row + 2]; 4249 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 4250 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 4251 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 4252 x[row++] = tmp1; 4253 x[row++] = tmp2; 4254 x[row++] = tmp3; 4255 cnt += 9; 4256 break; 4257 case 4: 4258 x1 = b[row]; 4259 x2 = b[row + 1]; 4260 x3 = b[row + 2]; 4261 x4 = b[row + 3]; 4262 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 4263 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 4264 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 4265 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 4266 x[row++] = tmp1; 4267 x[row++] = tmp2; 4268 x[row++] = tmp3; 4269 x[row++] = tmp4; 4270 cnt += 16; 4271 break; 4272 case 5: 4273 x1 = b[row]; 4274 x2 = b[row + 1]; 4275 x3 = b[row + 2]; 4276 x4 = b[row + 3]; 4277 x5 = b[row + 4]; 4278 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 4279 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 4280 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 4281 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 4282 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 4283 x[row++] = tmp1; 4284 x[row++] = tmp2; 4285 x[row++] = tmp3; 4286 x[row++] = tmp4; 4287 x[row++] = tmp5; 4288 cnt += 25; 4289 break; 4290 default: 4291 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 4292 } 4293 } 4294 PetscCall(PetscLogFlops(2.0 * cnt)); 4295 PetscCall(VecRestoreArray(xx, &x)); 4296 PetscCall(VecRestoreArrayRead(bb, &b)); 4297 PetscFunctionReturn(PETSC_SUCCESS); 4298 } 4299 4300 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4301 { 4302 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4303 4304 PetscFunctionBegin; 4305 a->inode.node_count = 0; 4306 a->inode.use = PETSC_FALSE; 4307 a->inode.checked = PETSC_FALSE; 4308 a->inode.mat_nonzerostate = -1; 4309 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4310 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4311 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4312 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4313 A->ops->coloringpatch = NULL; 4314 A->ops->multdiagonalblock = NULL; 4315 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 4316 PetscFunctionReturn(PETSC_SUCCESS); 4317 } 4318 4319 /* 4320 samestructure indicates that the matrix has not changed its nonzero structure so we 4321 do not need to recompute the inodes 4322 */ 4323 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4324 { 4325 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4326 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 4327 PetscBool flag; 4328 const PetscInt *idx, *idy, *ii; 4329 4330 PetscFunctionBegin; 4331 if (!a->inode.use) { 4332 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4333 PetscCall(PetscFree(a->inode.size_csr)); 4334 PetscFunctionReturn(PETSC_SUCCESS); 4335 } 4336 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 4337 4338 m = A->rmap->n; 4339 if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr)); 4340 ns = a->inode.size_csr; 4341 ns[0] = 0; 4342 4343 i = 0; 4344 node_count = 0; 4345 idx = a->j; 4346 ii = a->i; 4347 if (idx) { 4348 while (i < m) { /* For each row */ 4349 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 4350 /* Limits the number of elements in a node to 'a->inode.limit' */ 4351 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4352 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 4353 if (nzy != nzx) break; 4354 idy += nzx; /* Same nonzero pattern */ 4355 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 4356 if (!flag) break; 4357 } 4358 ns[node_count + 1] = ns[node_count] + blk_size; 4359 node_count++; 4360 idx += blk_size * nzx; 4361 i = j; 4362 } 4363 } 4364 /* If not enough inodes found,, do not use inode version of the routines */ 4365 if (!m || !idx || node_count > .8 * m) { 4366 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4367 PetscCall(PetscFree(a->inode.size_csr)); 4368 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4369 } else { 4370 if (!A->factortype) { 4371 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4372 if (A->rmap->n == A->cmap->n) { 4373 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4374 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4375 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4376 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4377 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4378 } 4379 } else { 4380 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4381 } 4382 a->inode.node_count = node_count; 4383 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4384 } 4385 a->inode.checked = PETSC_TRUE; 4386 a->inode.mat_nonzerostate = A->nonzerostate; 4387 PetscFunctionReturn(PETSC_SUCCESS); 4388 } 4389 4390 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 4391 { 4392 Mat B = *C; 4393 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 4394 PetscInt m = A->rmap->n; 4395 4396 PetscFunctionBegin; 4397 c->inode.use = a->inode.use; 4398 c->inode.limit = a->inode.limit; 4399 c->inode.max_limit = a->inode.max_limit; 4400 c->inode.checked = PETSC_FALSE; 4401 c->inode.size_csr = NULL; 4402 c->inode.node_count = 0; 4403 c->inode.ibdiagvalid = PETSC_FALSE; 4404 c->inode.ibdiag = NULL; 4405 c->inode.bdiag = NULL; 4406 c->inode.mat_nonzerostate = -1; 4407 if (a->inode.use) { 4408 if (a->inode.checked && a->inode.size_csr) { 4409 PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr)); 4410 PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1)); 4411 4412 c->inode.checked = PETSC_TRUE; 4413 c->inode.node_count = a->inode.node_count; 4414 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4415 } 4416 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4417 if (!B->factortype) { 4418 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4419 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4420 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4421 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4422 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4423 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4424 } else { 4425 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4426 } 4427 } 4428 PetscFunctionReturn(PETSC_SUCCESS); 4429 } 4430 4431 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4432 { 4433 PetscInt k; 4434 const PetscInt *vi; 4435 4436 PetscFunctionBegin; 4437 vi = aj + ai[row]; 4438 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4439 vi = aj + adiag[row]; 4440 cols[nzl] = vi[0]; 4441 vi = aj + adiag[row + 1] + 1; 4442 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4443 PetscFunctionReturn(PETSC_SUCCESS); 4444 } 4445 /* 4446 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4447 Modified from MatSeqAIJCheckInode(). 4448 4449 Input Parameters: 4450 . Mat A - ILU or LU matrix factor 4451 4452 */ 4453 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4454 { 4455 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4456 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4457 PetscInt *cols1, *cols2, *ns; 4458 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4459 PetscBool flag; 4460 4461 PetscFunctionBegin; 4462 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4463 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4464 4465 m = A->rmap->n; 4466 if (a->inode.size_csr) ns = a->inode.size_csr; 4467 else PetscCall(PetscMalloc1(m + 1, &ns)); 4468 ns[0] = 0; 4469 4470 i = 0; 4471 node_count = 0; 4472 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4473 while (i < m) { /* For each row */ 4474 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4475 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4476 nzx = nzl1 + nzu1 + 1; 4477 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4478 4479 /* Limits the number of elements in a node to 'a->inode.limit' */ 4480 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4481 nzl2 = ai[j + 1] - ai[j]; 4482 nzu2 = adiag[j] - adiag[j + 1] - 1; 4483 nzy = nzl2 + nzu2 + 1; 4484 if (nzy != nzx) break; 4485 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4486 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4487 if (!flag) break; 4488 } 4489 ns[node_count + 1] = ns[node_count] + blk_size; 4490 node_count++; 4491 i = j; 4492 } 4493 PetscCall(PetscFree2(cols1, cols2)); 4494 /* If not enough inodes found,, do not use inode version of the routines */ 4495 if (!m || node_count > .8 * m) { 4496 PetscCall(PetscFree(ns)); 4497 4498 a->inode.node_count = 0; 4499 a->inode.size_csr = NULL; 4500 a->inode.use = PETSC_FALSE; 4501 4502 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4503 } else { 4504 A->ops->mult = NULL; 4505 A->ops->sor = NULL; 4506 A->ops->multadd = NULL; 4507 A->ops->getrowij = NULL; 4508 A->ops->restorerowij = NULL; 4509 A->ops->getcolumnij = NULL; 4510 A->ops->restorecolumnij = NULL; 4511 A->ops->coloringpatch = NULL; 4512 A->ops->multdiagonalblock = NULL; 4513 a->inode.node_count = node_count; 4514 a->inode.size_csr = ns; 4515 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4516 } 4517 a->inode.checked = PETSC_TRUE; 4518 PetscFunctionReturn(PETSC_SUCCESS); 4519 } 4520 4521 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4522 { 4523 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4524 4525 PetscFunctionBegin; 4526 a->inode.ibdiagvalid = PETSC_FALSE; 4527 PetscFunctionReturn(PETSC_SUCCESS); 4528 } 4529 4530 /* 4531 This is really ugly. if inodes are used this replaces the 4532 permutations with ones that correspond to rows/cols of the matrix 4533 rather than inode blocks 4534 */ 4535 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4536 { 4537 PetscFunctionBegin; 4538 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4539 PetscFunctionReturn(PETSC_SUCCESS); 4540 } 4541 4542 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4543 { 4544 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4545 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4546 const PetscInt *ridx, *cidx; 4547 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx; 4548 PetscInt nslim_col, *ns_col; 4549 IS ris = *rperm, cis = *cperm; 4550 4551 PetscFunctionBegin; 4552 if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4553 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4554 4555 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4556 PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns)); 4557 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4558 4559 PetscCall(ISGetIndices(ris, &ridx)); 4560 PetscCall(ISGetIndices(cis, &cidx)); 4561 4562 /* Form the inode structure for the rows of permuted matrix using inv perm*/ 4563 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]); 4564 4565 /* Construct the permutations for rows*/ 4566 for (i = 0, row = 0; i < nslim_row; ++i) { 4567 indx = ridx[i]; 4568 start_val = tns[indx]; 4569 end_val = tns[indx + 1]; 4570 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4571 } 4572 4573 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4574 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]); 4575 4576 /* Construct permutations for columns */ 4577 for (i = 0, col = 0; i < nslim_col; ++i) { 4578 indx = cidx[i]; 4579 start_val = tns[indx]; 4580 end_val = tns[indx + 1]; 4581 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4582 } 4583 4584 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4585 PetscCall(ISSetPermutation(*rperm)); 4586 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4587 PetscCall(ISSetPermutation(*cperm)); 4588 4589 PetscCall(ISRestoreIndices(ris, &ridx)); 4590 PetscCall(ISRestoreIndices(cis, &cidx)); 4591 4592 PetscCall(PetscFree(ns_col)); 4593 PetscCall(PetscFree2(permr, permc)); 4594 PetscCall(ISDestroy(&cis)); 4595 PetscCall(ISDestroy(&ris)); 4596 PetscCall(PetscFree(tns)); 4597 PetscFunctionReturn(PETSC_SUCCESS); 4598 } 4599 4600 /*@C 4601 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4602 4603 Not Collective 4604 4605 Input Parameter: 4606 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4607 4608 Output Parameters: 4609 + node_count - no of inodes present in the matrix. 4610 . sizes - an array of size `node_count`, with the sizes of each inode. 4611 - limit - the max size used to generate the inodes. 4612 4613 Level: advanced 4614 4615 Note: 4616 It should be called after the matrix is assembled. 4617 The contents of the sizes[] array should not be changed. 4618 `NULL` may be passed for information not needed 4619 4620 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 4621 @*/ 4622 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4623 { 4624 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4625 4626 PetscFunctionBegin; 4627 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4628 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4629 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4630 PetscFunctionReturn(PETSC_SUCCESS); 4631 } 4632 4633 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4634 { 4635 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4636 4637 PetscFunctionBegin; 4638 if (node_count) *node_count = a->inode.node_count; 4639 if (sizes) *sizes = a->inode.size_csr; 4640 if (limit) *limit = a->inode.limit; 4641 PetscFunctionReturn(PETSC_SUCCESS); 4642 } 4643