1 /* 2 This file provides high performance routines for the Inode format (compressed sparse row) 3 by taking advantage of rows with identical nonzero structure (I-nodes). 4 */ 5 #include <../src/mat/impls/aij/seq/aij.h> 6 #if defined(PETSC_HAVE_XMMINTRIN_H) 7 #include <xmmintrin.h> 8 #endif 9 10 static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 13 PetscInt i, count, m, n, min_mn, *ns_row, *ns_col; 14 15 PetscFunctionBegin; 16 n = A->cmap->n; 17 m = A->rmap->n; 18 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 19 ns_row = a->inode.size_csr; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++); 24 for (; count + 1 < n; count++, i++); 25 if (count < n) i++; 26 *size = i; 27 PetscFunctionReturn(PETSC_SUCCESS); 28 } 29 PetscCall(PetscMalloc1(n + 1, &ns_col)); 30 ns_col[0] = 0; 31 32 /* Use the same row structure wherever feasible. */ 33 for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1]; 34 35 /* if m < n; pad up the remainder with inode_limit */ 36 for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1; 37 /* The last node is the odd ball. pad it up with the remaining rows; */ 38 if (count < n) { 39 ns_col[i + 1] = ns_col[i] + (n - count); 40 i++; 41 } else if (count > n) { 42 /* Adjust for the over estimation */ 43 ns_col[i] += n - count; 44 } 45 *size = i; 46 *ns = ns_col; 47 PetscFunctionReturn(PETSC_SUCCESS); 48 } 49 50 /* 51 This builds symmetric version of nonzero structure, 52 */ 53 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 54 { 55 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 56 PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n; 57 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2; 58 const PetscInt *j, *jmax, *ai = a->i, *aj = a->j; 59 60 PetscFunctionBegin; 61 nslim_row = a->inode.node_count; 62 m = A->rmap->n; 63 n = A->cmap->n; 64 PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 65 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 66 67 /* Use the row_inode as column_inode */ 68 nslim_col = nslim_row; 69 ns_col = ns_row; 70 71 /* allocate space for reformatted inode structure */ 72 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 73 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]); 74 75 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 76 nsz = ns_col[i1 + 1] - ns_col[i1]; 77 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 78 } 79 /* allocate space for row pointers */ 80 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 81 *iia = ia; 82 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 83 84 /* determine the number of columns in each row */ 85 ia[0] = oshift; 86 for (i1 = 0; i1 < nslim_row; i1++) { 87 row = ns_row[i1]; 88 j = aj + ai[row] + ishift; 89 jmax = aj + ai[row + 1] + ishift; 90 if (j == jmax) continue; /* empty row */ 91 col = *j++ + ishift; 92 i2 = tvc[col]; 93 while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 94 ia[i1 + 1]++; 95 ia[i2 + 1]++; 96 i2++; /* Start col of next node */ 97 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; 98 i2 = tvc[col]; 99 } 100 if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */ 101 } 102 103 /* shift ia[i] to point to next row */ 104 for (i1 = 1; i1 < nslim_row + 1; i1++) { 105 row = ia[i1 - 1]; 106 ia[i1] += row; 107 work[i1 - 1] = row - oshift; 108 } 109 110 /* allocate space for column pointers */ 111 nz = ia[nslim_row] + (!ishift); 112 PetscCall(PetscMalloc1(nz, &ja)); 113 *jja = ja; 114 115 /* loop over lower triangular part putting into ja */ 116 for (i1 = 0; i1 < nslim_row; i1++) { 117 row = ns_row[i1]; 118 j = aj + ai[row] + ishift; 119 jmax = aj + ai[row + 1] + ishift; 120 if (j == jmax) continue; /* empty row */ 121 col = *j++ + ishift; 122 i2 = tvc[col]; 123 while (i2 < i1 && j < jmax) { 124 ja[work[i2]++] = i1 + oshift; 125 ja[work[i1]++] = i2 + oshift; 126 ++i2; 127 while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */ 128 i2 = tvc[col]; 129 } 130 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 131 } 132 PetscCall(PetscFree(work)); 133 PetscCall(PetscFree2(tns, tvc)); 134 PetscFunctionReturn(PETSC_SUCCESS); 135 } 136 137 /* 138 This builds nonsymmetric version of nonzero structure, 139 */ 140 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 141 { 142 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 143 PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col; 144 PetscInt *tns, *tvc, nsz, i1, i2; 145 const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr; 146 147 PetscFunctionBegin; 148 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 149 nslim_row = a->inode.node_count; 150 n = A->cmap->n; 151 152 /* Create The column_inode for this matrix */ 153 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 154 155 /* allocate space for reformatted column_inode structure */ 156 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 157 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 158 159 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 160 nsz = ns_col[i1 + 1] - ns_col[i1]; 161 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 162 } 163 /* allocate space for row pointers */ 164 PetscCall(PetscCalloc1(nslim_row + 1, &ia)); 165 *iia = ia; 166 PetscCall(PetscMalloc1(nslim_row + 1, &work)); 167 168 /* determine the number of columns in each row */ 169 ia[0] = oshift; 170 for (i1 = 0; i1 < nslim_row; i1++) { 171 row = ns_row[i1]; 172 j = aj + ai[row] + ishift; 173 nz = ai[row + 1] - ai[row]; 174 if (!nz) continue; /* empty row */ 175 col = *j++ + ishift; 176 i2 = tvc[col]; 177 while (nz-- > 0) { /* off-diagonal elements */ 178 ia[i1 + 1]++; 179 i2++; /* Start col of next node */ 180 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 181 if (nz > 0) i2 = tvc[col]; 182 } 183 } 184 185 /* shift ia[i] to point to next row */ 186 for (i1 = 1; i1 < nslim_row + 1; i1++) { 187 row = ia[i1 - 1]; 188 ia[i1] += row; 189 work[i1 - 1] = row - oshift; 190 } 191 192 /* allocate space for column pointers */ 193 nz = ia[nslim_row] + (!ishift); 194 PetscCall(PetscMalloc1(nz, &ja)); 195 *jja = ja; 196 197 /* loop over matrix putting into ja */ 198 for (i1 = 0; i1 < nslim_row; i1++) { 199 row = ns_row[i1]; 200 j = aj + ai[row] + ishift; 201 nz = ai[row + 1] - ai[row]; 202 if (!nz) continue; /* empty row */ 203 col = *j++ + ishift; 204 i2 = tvc[col]; 205 while (nz-- > 0) { 206 ja[work[i1]++] = i2 + oshift; 207 ++i2; 208 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 209 if (nz > 0) i2 = tvc[col]; 210 } 211 } 212 PetscCall(PetscFree(ns_col)); 213 PetscCall(PetscFree(work)); 214 PetscCall(PetscFree2(tns, tvc)); 215 PetscFunctionReturn(PETSC_SUCCESS); 216 } 217 218 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 219 { 220 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 221 222 PetscFunctionBegin; 223 if (n) *n = a->inode.node_count; 224 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 225 if (!blockcompressed) { 226 PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 227 } else if (symmetric) { 228 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 229 } else { 230 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 231 } 232 PetscFunctionReturn(PETSC_SUCCESS); 233 } 234 235 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 236 { 237 PetscFunctionBegin; 238 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 239 240 if (!blockcompressed) { 241 PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 242 } else { 243 PetscCall(PetscFree(*ia)); 244 PetscCall(PetscFree(*ja)); 245 } 246 PetscFunctionReturn(PETSC_SUCCESS); 247 } 248 249 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift) 250 { 251 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 252 PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col; 253 PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j; 254 255 PetscFunctionBegin; 256 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 257 nslim_row = a->inode.node_count; 258 n = A->cmap->n; 259 260 /* Create The column_inode for this matrix */ 261 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 262 263 /* allocate space for reformatted column_inode structure */ 264 PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc)); 265 for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]); 266 267 for (i1 = 0, col = 0; i1 < nslim_col; ++i1) { 268 nsz = ns_col[i1 + 1] - ns_col[i1]; 269 for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1; 270 } 271 /* allocate space for column pointers */ 272 PetscCall(PetscCalloc1(nslim_col + 1, &ia)); 273 *iia = ia; 274 PetscCall(PetscMalloc1(nslim_col + 1, &work)); 275 276 /* determine the number of columns in each row */ 277 ia[0] = oshift; 278 for (i1 = 0; i1 < nslim_row; i1++) { 279 row = ns_row[i1]; 280 j = aj + ai[row] + ishift; 281 col = *j++ + ishift; 282 i2 = tvc[col]; 283 nz = ai[row + 1] - ai[row]; 284 while (nz-- > 0) { /* off-diagonal elements */ 285 /* ia[i1+1]++; */ 286 ia[i2 + 1]++; 287 i2++; 288 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 289 if (nz > 0) i2 = tvc[col]; 290 } 291 } 292 293 /* shift ia[i] to point to next col */ 294 for (i1 = 1; i1 < nslim_col + 1; i1++) { 295 col = ia[i1 - 1]; 296 ia[i1] += col; 297 work[i1 - 1] = col - oshift; 298 } 299 300 /* allocate space for column pointers */ 301 nz = ia[nslim_col] + (!ishift); 302 PetscCall(PetscMalloc1(nz, &ja)); 303 *jja = ja; 304 305 /* loop over matrix putting into ja */ 306 for (i1 = 0; i1 < nslim_row; i1++) { 307 row = ns_row[i1]; 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row + 1] - ai[row]; 312 while (nz-- > 0) { 313 /* ja[work[i1]++] = i2 + oshift; */ 314 ja[work[i2]++] = i1 + oshift; 315 i2++; 316 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 PetscCall(PetscFree(ns_col)); 321 PetscCall(PetscFree(work)); 322 PetscCall(PetscFree2(tns, tvc)); 323 PetscFunctionReturn(PETSC_SUCCESS); 324 } 325 326 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 327 { 328 PetscFunctionBegin; 329 PetscCall(MatCreateColInode_Private(A, n, NULL)); 330 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 331 332 if (!blockcompressed) { 333 PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 334 } else if (symmetric) { 335 /* Since the indices are symmetric it doesn't matter */ 336 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift)); 337 } else { 338 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift)); 339 } 340 PetscFunctionReturn(PETSC_SUCCESS); 341 } 342 343 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done) 344 { 345 PetscFunctionBegin; 346 if (!ia) PetscFunctionReturn(PETSC_SUCCESS); 347 if (!blockcompressed) { 348 PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done)); 349 } else { 350 PetscCall(PetscFree(*ia)); 351 PetscCall(PetscFree(*ja)); 352 } 353 PetscFunctionReturn(PETSC_SUCCESS); 354 } 355 356 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy) 357 { 358 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 359 PetscScalar *y; 360 const PetscScalar *x; 361 PetscInt row, node_max, nonzerorow = 0; 362 PetscInt *ns; 363 364 PetscFunctionBegin; 365 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 366 node_max = a->inode.node_count; 367 ns = a->inode.size_csr; /* Node Size array */ 368 PetscCall(VecGetArrayRead(xx, &x)); 369 PetscCall(VecGetArray(yy, &y)); 370 371 PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow)) 372 for (PetscInt i = 0; i < node_max; ++i) { 373 PetscInt i1, i2, nsz, n, sz; 374 const MatScalar *v1, *v2, *v3, *v4, *v5; 375 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 376 const PetscInt *idx; 377 378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 379 #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5) 380 #endif 381 row = ns[i]; 382 nsz = ns[i + 1] - ns[i]; 383 n = a->i[row + 1] - a->i[row]; 384 nonzerorow += (n > 0) * nsz; 385 386 idx = &a->j[a->i[row]]; 387 v1 = &a->a[a->i[row]]; 388 PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 389 PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 390 sz = n; /* No of non zeros in this row */ 391 /* Switch on the size of Node */ 392 switch (nsz) { /* Each loop in 'case' is unrolled */ 393 case 1: 394 sum1 = 0.; 395 396 for (n = 0; n < sz - 1; n += 2) { 397 i1 = idx[0]; /* The instructions are ordered to */ 398 i2 = idx[1]; /* make the compiler's job easy */ 399 idx += 2; 400 tmp0 = x[i1]; 401 tmp1 = x[i2]; 402 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 403 v1 += 2; 404 } 405 406 if (n == sz - 1) { /* Take care of the last nonzero */ 407 tmp0 = x[*idx++]; 408 sum1 += *v1++ * tmp0; 409 } 410 y[row++] = sum1; 411 break; 412 case 2: 413 sum1 = 0.; 414 sum2 = 0.; 415 v2 = v1 + n; 416 417 for (n = 0; n < sz - 1; n += 2) { 418 i1 = idx[0]; 419 i2 = idx[1]; 420 idx += 2; 421 tmp0 = x[i1]; 422 tmp1 = x[i2]; 423 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 424 v1 += 2; 425 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 426 v2 += 2; 427 } 428 if (n == sz - 1) { 429 tmp0 = x[*idx++]; 430 sum1 += *v1++ * tmp0; 431 sum2 += *v2++ * tmp0; 432 } 433 y[row++] = sum1; 434 y[row++] = sum2; 435 v1 = v2; /* Since the next block to be processed starts there*/ 436 idx += sz; 437 break; 438 case 3: 439 sum1 = 0.; 440 sum2 = 0.; 441 sum3 = 0.; 442 v2 = v1 + n; 443 v3 = v2 + n; 444 445 for (n = 0; n < sz - 1; n += 2) { 446 i1 = idx[0]; 447 i2 = idx[1]; 448 idx += 2; 449 tmp0 = x[i1]; 450 tmp1 = x[i2]; 451 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 452 v1 += 2; 453 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 454 v2 += 2; 455 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 456 v3 += 2; 457 } 458 if (n == sz - 1) { 459 tmp0 = x[*idx++]; 460 sum1 += *v1++ * tmp0; 461 sum2 += *v2++ * tmp0; 462 sum3 += *v3++ * tmp0; 463 } 464 y[row++] = sum1; 465 y[row++] = sum2; 466 y[row++] = sum3; 467 v1 = v3; /* Since the next block to be processed starts there*/ 468 idx += 2 * sz; 469 break; 470 case 4: 471 sum1 = 0.; 472 sum2 = 0.; 473 sum3 = 0.; 474 sum4 = 0.; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 v4 = v3 + n; 478 479 for (n = 0; n < sz - 1; n += 2) { 480 i1 = idx[0]; 481 i2 = idx[1]; 482 idx += 2; 483 tmp0 = x[i1]; 484 tmp1 = x[i2]; 485 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 486 v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 488 v2 += 2; 489 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 490 v3 += 2; 491 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 492 v4 += 2; 493 } 494 if (n == sz - 1) { 495 tmp0 = x[*idx++]; 496 sum1 += *v1++ * tmp0; 497 sum2 += *v2++ * tmp0; 498 sum3 += *v3++ * tmp0; 499 sum4 += *v4++ * tmp0; 500 } 501 y[row++] = sum1; 502 y[row++] = sum2; 503 y[row++] = sum3; 504 y[row++] = sum4; 505 v1 = v4; /* Since the next block to be processed starts there*/ 506 idx += 3 * sz; 507 break; 508 case 5: 509 sum1 = 0.; 510 sum2 = 0.; 511 sum3 = 0.; 512 sum4 = 0.; 513 sum5 = 0.; 514 v2 = v1 + n; 515 v3 = v2 + n; 516 v4 = v3 + n; 517 v5 = v4 + n; 518 519 for (n = 0; n < sz - 1; n += 2) { 520 i1 = idx[0]; 521 i2 = idx[1]; 522 idx += 2; 523 tmp0 = x[i1]; 524 tmp1 = x[i2]; 525 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 526 v1 += 2; 527 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 528 v2 += 2; 529 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 530 v3 += 2; 531 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 532 v4 += 2; 533 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 534 v5 += 2; 535 } 536 if (n == sz - 1) { 537 tmp0 = x[*idx++]; 538 sum1 += *v1++ * tmp0; 539 sum2 += *v2++ * tmp0; 540 sum3 += *v3++ * tmp0; 541 sum4 += *v4++ * tmp0; 542 sum5 += *v5++ * tmp0; 543 } 544 y[row++] = sum1; 545 y[row++] = sum2; 546 y[row++] = sum3; 547 y[row++] = sum4; 548 y[row++] = sum5; 549 v1 = v5; /* Since the next block to be processed starts there */ 550 idx += 4 * sz; 551 break; 552 default: 553 SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz); 554 } 555 } 556 PetscCall(VecRestoreArrayRead(xx, &x)); 557 PetscCall(VecRestoreArray(yy, &y)); 558 PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow)); 559 PetscFunctionReturn(PETSC_SUCCESS); 560 } 561 562 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 563 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy) 564 { 565 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 566 PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1; 567 const MatScalar *v1, *v2, *v3, *v4, *v5; 568 const PetscScalar *x; 569 PetscScalar *y, *z, *zt; 570 PetscInt i1, i2, n, i, row, node_max, nsz, sz; 571 const PetscInt *idx, *ns, *ii; 572 573 PetscFunctionBegin; 574 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 575 node_max = a->inode.node_count; 576 ns = a->inode.size_csr; /* Node Size array */ 577 578 PetscCall(VecGetArrayRead(xx, &x)); 579 PetscCall(VecGetArrayPair(zz, yy, &z, &y)); 580 zt = z; 581 582 idx = a->j; 583 v1 = a->a; 584 ii = a->i; 585 586 for (i = 0; i < node_max; ++i) { 587 row = ns[i]; 588 nsz = ns[i + 1] - ns[i]; 589 n = ii[1] - ii[0]; 590 ii += nsz; 591 sz = n; /* No of non zeros in this row */ 592 /* Switch on the size of Node */ 593 switch (nsz) { /* Each loop in 'case' is unrolled */ 594 case 1: 595 sum1 = *zt++; 596 597 for (n = 0; n < sz - 1; n += 2) { 598 i1 = idx[0]; /* The instructions are ordered to */ 599 i2 = idx[1]; /* make the compiler's job easy */ 600 idx += 2; 601 tmp0 = x[i1]; 602 tmp1 = x[i2]; 603 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 604 v1 += 2; 605 } 606 607 if (n == sz - 1) { /* Take care of the last nonzero */ 608 tmp0 = x[*idx++]; 609 sum1 += *v1++ * tmp0; 610 } 611 y[row++] = sum1; 612 break; 613 case 2: 614 sum1 = *zt++; 615 sum2 = *zt++; 616 v2 = v1 + n; 617 618 for (n = 0; n < sz - 1; n += 2) { 619 i1 = idx[0]; 620 i2 = idx[1]; 621 idx += 2; 622 tmp0 = x[i1]; 623 tmp1 = x[i2]; 624 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 625 v1 += 2; 626 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 627 v2 += 2; 628 } 629 if (n == sz - 1) { 630 tmp0 = x[*idx++]; 631 sum1 += *v1++ * tmp0; 632 sum2 += *v2++ * tmp0; 633 } 634 y[row++] = sum1; 635 y[row++] = sum2; 636 v1 = v2; /* Since the next block to be processed starts there*/ 637 idx += sz; 638 break; 639 case 3: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 sum3 = *zt++; 643 v2 = v1 + n; 644 v3 = v2 + n; 645 646 for (n = 0; n < sz - 1; n += 2) { 647 i1 = idx[0]; 648 i2 = idx[1]; 649 idx += 2; 650 tmp0 = x[i1]; 651 tmp1 = x[i2]; 652 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 653 v1 += 2; 654 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 655 v2 += 2; 656 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 657 v3 += 2; 658 } 659 if (n == sz - 1) { 660 tmp0 = x[*idx++]; 661 sum1 += *v1++ * tmp0; 662 sum2 += *v2++ * tmp0; 663 sum3 += *v3++ * tmp0; 664 } 665 y[row++] = sum1; 666 y[row++] = sum2; 667 y[row++] = sum3; 668 v1 = v3; /* Since the next block to be processed starts there*/ 669 idx += 2 * sz; 670 break; 671 case 4: 672 sum1 = *zt++; 673 sum2 = *zt++; 674 sum3 = *zt++; 675 sum4 = *zt++; 676 v2 = v1 + n; 677 v3 = v2 + n; 678 v4 = v3 + n; 679 680 for (n = 0; n < sz - 1; n += 2) { 681 i1 = idx[0]; 682 i2 = idx[1]; 683 idx += 2; 684 tmp0 = x[i1]; 685 tmp1 = x[i2]; 686 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 687 v1 += 2; 688 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 689 v2 += 2; 690 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 691 v3 += 2; 692 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 693 v4 += 2; 694 } 695 if (n == sz - 1) { 696 tmp0 = x[*idx++]; 697 sum1 += *v1++ * tmp0; 698 sum2 += *v2++ * tmp0; 699 sum3 += *v3++ * tmp0; 700 sum4 += *v4++ * tmp0; 701 } 702 y[row++] = sum1; 703 y[row++] = sum2; 704 y[row++] = sum3; 705 y[row++] = sum4; 706 v1 = v4; /* Since the next block to be processed starts there*/ 707 idx += 3 * sz; 708 break; 709 case 5: 710 sum1 = *zt++; 711 sum2 = *zt++; 712 sum3 = *zt++; 713 sum4 = *zt++; 714 sum5 = *zt++; 715 v2 = v1 + n; 716 v3 = v2 + n; 717 v4 = v3 + n; 718 v5 = v4 + n; 719 720 for (n = 0; n < sz - 1; n += 2) { 721 i1 = idx[0]; 722 i2 = idx[1]; 723 idx += 2; 724 tmp0 = x[i1]; 725 tmp1 = x[i2]; 726 sum1 += v1[0] * tmp0 + v1[1] * tmp1; 727 v1 += 2; 728 sum2 += v2[0] * tmp0 + v2[1] * tmp1; 729 v2 += 2; 730 sum3 += v3[0] * tmp0 + v3[1] * tmp1; 731 v3 += 2; 732 sum4 += v4[0] * tmp0 + v4[1] * tmp1; 733 v4 += 2; 734 sum5 += v5[0] * tmp0 + v5[1] * tmp1; 735 v5 += 2; 736 } 737 if (n == sz - 1) { 738 tmp0 = x[*idx++]; 739 sum1 += *v1++ * tmp0; 740 sum2 += *v2++ * tmp0; 741 sum3 += *v3++ * tmp0; 742 sum4 += *v4++ * tmp0; 743 sum5 += *v5++ * tmp0; 744 } 745 y[row++] = sum1; 746 y[row++] = sum2; 747 y[row++] = sum3; 748 y[row++] = sum4; 749 y[row++] = sum5; 750 v1 = v5; /* Since the next block to be processed starts there */ 751 idx += 4 * sz; 752 break; 753 default: 754 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported"); 755 } 756 } 757 PetscCall(VecRestoreArrayRead(xx, &x)); 758 PetscCall(VecRestoreArrayPair(zz, yy, &z, &y)); 759 PetscCall(PetscLogFlops(2.0 * a->nz)); 760 PetscFunctionReturn(PETSC_SUCCESS); 761 } 762 763 static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx) 764 { 765 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 766 IS iscol = a->col, isrow = a->row; 767 const PetscInt *r, *c, *rout, *cout; 768 PetscInt i, j, n = A->rmap->n, nz; 769 PetscInt node_max, *ns, row, nsz, aii, i0, i1; 770 const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj; 771 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 772 PetscScalar sum1, sum2, sum3, sum4, sum5; 773 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 774 const PetscScalar *b; 775 776 PetscFunctionBegin; 777 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 778 node_max = a->inode.node_count; 779 ns = a->inode.size_csr; /* Node Size array */ 780 781 PetscCall(VecGetArrayRead(bb, &b)); 782 PetscCall(VecGetArrayWrite(xx, &x)); 783 tmp = a->solve_work; 784 785 PetscCall(ISGetIndices(isrow, &rout)); 786 r = rout; 787 PetscCall(ISGetIndices(iscol, &cout)); 788 c = cout + (n - 1); 789 790 /* forward solve the lower triangular */ 791 tmps = tmp; 792 aa = a_a; 793 aj = a_j; 794 ad = a->diag; 795 796 for (i = 0, row = 0; i < node_max; ++i) { 797 row = ns[i]; 798 nsz = ns[i + 1] - ns[i]; 799 aii = ai[row]; 800 v1 = aa + aii; 801 vi = aj + aii; 802 nz = ad[row] - aii; 803 if (i < node_max - 1) { 804 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 805 * but our indexing to determine its size could. */ 806 PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 807 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 808 PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 809 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 810 } 811 812 switch (nsz) { /* Each loop in 'case' is unrolled */ 813 case 1: 814 sum1 = b[*r++]; 815 for (j = 0; j < nz - 1; j += 2) { 816 i0 = vi[0]; 817 i1 = vi[1]; 818 vi += 2; 819 tmp0 = tmps[i0]; 820 tmp1 = tmps[i1]; 821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 822 v1 += 2; 823 } 824 if (j == nz - 1) { 825 tmp0 = tmps[*vi++]; 826 sum1 -= *v1++ * tmp0; 827 } 828 tmp[row++] = sum1; 829 break; 830 case 2: 831 sum1 = b[*r++]; 832 sum2 = b[*r++]; 833 v2 = aa + ai[row + 1]; 834 835 for (j = 0; j < nz - 1; j += 2) { 836 i0 = vi[0]; 837 i1 = vi[1]; 838 vi += 2; 839 tmp0 = tmps[i0]; 840 tmp1 = tmps[i1]; 841 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 842 v1 += 2; 843 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 844 v2 += 2; 845 } 846 if (j == nz - 1) { 847 tmp0 = tmps[*vi++]; 848 sum1 -= *v1++ * tmp0; 849 sum2 -= *v2++ * tmp0; 850 } 851 sum2 -= *v2++ * sum1; 852 tmp[row++] = sum1; 853 tmp[row++] = sum2; 854 break; 855 case 3: 856 sum1 = b[*r++]; 857 sum2 = b[*r++]; 858 sum3 = b[*r++]; 859 v2 = aa + ai[row + 1]; 860 v3 = aa + ai[row + 2]; 861 862 for (j = 0; j < nz - 1; j += 2) { 863 i0 = vi[0]; 864 i1 = vi[1]; 865 vi += 2; 866 tmp0 = tmps[i0]; 867 tmp1 = tmps[i1]; 868 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 869 v1 += 2; 870 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 871 v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 873 v3 += 2; 874 } 875 if (j == nz - 1) { 876 tmp0 = tmps[*vi++]; 877 sum1 -= *v1++ * tmp0; 878 sum2 -= *v2++ * tmp0; 879 sum3 -= *v3++ * tmp0; 880 } 881 sum2 -= *v2++ * sum1; 882 sum3 -= *v3++ * sum1; 883 sum3 -= *v3++ * sum2; 884 885 tmp[row++] = sum1; 886 tmp[row++] = sum2; 887 tmp[row++] = sum3; 888 break; 889 890 case 4: 891 sum1 = b[*r++]; 892 sum2 = b[*r++]; 893 sum3 = b[*r++]; 894 sum4 = b[*r++]; 895 v2 = aa + ai[row + 1]; 896 v3 = aa + ai[row + 2]; 897 v4 = aa + ai[row + 3]; 898 899 for (j = 0; j < nz - 1; j += 2) { 900 i0 = vi[0]; 901 i1 = vi[1]; 902 vi += 2; 903 tmp0 = tmps[i0]; 904 tmp1 = tmps[i1]; 905 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 906 v1 += 2; 907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 908 v2 += 2; 909 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 910 v3 += 2; 911 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 912 v4 += 2; 913 } 914 if (j == nz - 1) { 915 tmp0 = tmps[*vi++]; 916 sum1 -= *v1++ * tmp0; 917 sum2 -= *v2++ * tmp0; 918 sum3 -= *v3++ * tmp0; 919 sum4 -= *v4++ * tmp0; 920 } 921 sum2 -= *v2++ * sum1; 922 sum3 -= *v3++ * sum1; 923 sum4 -= *v4++ * sum1; 924 sum3 -= *v3++ * sum2; 925 sum4 -= *v4++ * sum2; 926 sum4 -= *v4++ * sum3; 927 928 tmp[row++] = sum1; 929 tmp[row++] = sum2; 930 tmp[row++] = sum3; 931 tmp[row++] = sum4; 932 break; 933 case 5: 934 sum1 = b[*r++]; 935 sum2 = b[*r++]; 936 sum3 = b[*r++]; 937 sum4 = b[*r++]; 938 sum5 = b[*r++]; 939 v2 = aa + ai[row + 1]; 940 v3 = aa + ai[row + 2]; 941 v4 = aa + ai[row + 3]; 942 v5 = aa + ai[row + 4]; 943 944 for (j = 0; j < nz - 1; j += 2) { 945 i0 = vi[0]; 946 i1 = vi[1]; 947 vi += 2; 948 tmp0 = tmps[i0]; 949 tmp1 = tmps[i1]; 950 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 951 v1 += 2; 952 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 953 v2 += 2; 954 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 955 v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 957 v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 959 v5 += 2; 960 } 961 if (j == nz - 1) { 962 tmp0 = tmps[*vi++]; 963 sum1 -= *v1++ * tmp0; 964 sum2 -= *v2++ * tmp0; 965 sum3 -= *v3++ * tmp0; 966 sum4 -= *v4++ * tmp0; 967 sum5 -= *v5++ * tmp0; 968 } 969 970 sum2 -= *v2++ * sum1; 971 sum3 -= *v3++ * sum1; 972 sum4 -= *v4++ * sum1; 973 sum5 -= *v5++ * sum1; 974 sum3 -= *v3++ * sum2; 975 sum4 -= *v4++ * sum2; 976 sum5 -= *v5++ * sum2; 977 sum4 -= *v4++ * sum3; 978 sum5 -= *v5++ * sum3; 979 sum5 -= *v5++ * sum4; 980 981 tmp[row++] = sum1; 982 tmp[row++] = sum2; 983 tmp[row++] = sum3; 984 tmp[row++] = sum4; 985 tmp[row++] = sum5; 986 break; 987 default: 988 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 989 } 990 } 991 /* backward solve the upper triangular */ 992 for (i = node_max - 1; i >= 0; i--) { 993 row = ns[i + 1]; 994 nsz = ns[i + 1] - ns[i]; 995 aii = ai[row + 1] - 1; 996 v1 = aa + aii; 997 vi = aj + aii; 998 nz = aii - ad[row]; 999 switch (nsz) { /* Each loop in 'case' is unrolled */ 1000 case 1: 1001 sum1 = tmp[row]; 1002 1003 for (j = nz; j > 1; j -= 2) { 1004 vi -= 2; 1005 i0 = vi[2]; 1006 i1 = vi[1]; 1007 tmp0 = tmps[i0]; 1008 tmp1 = tmps[i1]; 1009 v1 -= 2; 1010 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1011 } 1012 if (j == 1) { 1013 tmp0 = tmps[*vi--]; 1014 sum1 -= *v1-- * tmp0; 1015 } 1016 x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1017 row--; 1018 break; 1019 case 2: 1020 sum1 = tmp[row]; 1021 sum2 = tmp[row - 1]; 1022 v2 = aa + ai[row] - 1; 1023 for (j = nz; j > 1; j -= 2) { 1024 vi -= 2; 1025 i0 = vi[2]; 1026 i1 = vi[1]; 1027 tmp0 = tmps[i0]; 1028 tmp1 = tmps[i1]; 1029 v1 -= 2; 1030 v2 -= 2; 1031 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1032 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1033 } 1034 if (j == 1) { 1035 tmp0 = tmps[*vi--]; 1036 sum1 -= *v1-- * tmp0; 1037 sum2 -= *v2-- * tmp0; 1038 } 1039 1040 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1041 row--; 1042 sum2 -= *v2-- * tmp0; 1043 x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1044 row--; 1045 break; 1046 case 3: 1047 sum1 = tmp[row]; 1048 sum2 = tmp[row - 1]; 1049 sum3 = tmp[row - 2]; 1050 v2 = aa + ai[row] - 1; 1051 v3 = aa + ai[row - 1] - 1; 1052 for (j = nz; j > 1; j -= 2) { 1053 vi -= 2; 1054 i0 = vi[2]; 1055 i1 = vi[1]; 1056 tmp0 = tmps[i0]; 1057 tmp1 = tmps[i1]; 1058 v1 -= 2; 1059 v2 -= 2; 1060 v3 -= 2; 1061 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1062 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1063 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1064 } 1065 if (j == 1) { 1066 tmp0 = tmps[*vi--]; 1067 sum1 -= *v1-- * tmp0; 1068 sum2 -= *v2-- * tmp0; 1069 sum3 -= *v3-- * tmp0; 1070 } 1071 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1072 row--; 1073 sum2 -= *v2-- * tmp0; 1074 sum3 -= *v3-- * tmp0; 1075 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1076 row--; 1077 sum3 -= *v3-- * tmp0; 1078 x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1079 row--; 1080 1081 break; 1082 case 4: 1083 sum1 = tmp[row]; 1084 sum2 = tmp[row - 1]; 1085 sum3 = tmp[row - 2]; 1086 sum4 = tmp[row - 3]; 1087 v2 = aa + ai[row] - 1; 1088 v3 = aa + ai[row - 1] - 1; 1089 v4 = aa + ai[row - 2] - 1; 1090 1091 for (j = nz; j > 1; j -= 2) { 1092 vi -= 2; 1093 i0 = vi[2]; 1094 i1 = vi[1]; 1095 tmp0 = tmps[i0]; 1096 tmp1 = tmps[i1]; 1097 v1 -= 2; 1098 v2 -= 2; 1099 v3 -= 2; 1100 v4 -= 2; 1101 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1102 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1103 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1104 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1105 } 1106 if (j == 1) { 1107 tmp0 = tmps[*vi--]; 1108 sum1 -= *v1-- * tmp0; 1109 sum2 -= *v2-- * tmp0; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 } 1113 1114 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1115 row--; 1116 sum2 -= *v2-- * tmp0; 1117 sum3 -= *v3-- * tmp0; 1118 sum4 -= *v4-- * tmp0; 1119 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1120 row--; 1121 sum3 -= *v3-- * tmp0; 1122 sum4 -= *v4-- * tmp0; 1123 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1124 row--; 1125 sum4 -= *v4-- * tmp0; 1126 x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1127 row--; 1128 break; 1129 case 5: 1130 sum1 = tmp[row]; 1131 sum2 = tmp[row - 1]; 1132 sum3 = tmp[row - 2]; 1133 sum4 = tmp[row - 3]; 1134 sum5 = tmp[row - 4]; 1135 v2 = aa + ai[row] - 1; 1136 v3 = aa + ai[row - 1] - 1; 1137 v4 = aa + ai[row - 2] - 1; 1138 v5 = aa + ai[row - 3] - 1; 1139 for (j = nz; j > 1; j -= 2) { 1140 vi -= 2; 1141 i0 = vi[2]; 1142 i1 = vi[1]; 1143 tmp0 = tmps[i0]; 1144 tmp1 = tmps[i1]; 1145 v1 -= 2; 1146 v2 -= 2; 1147 v3 -= 2; 1148 v4 -= 2; 1149 v5 -= 2; 1150 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1151 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1152 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1153 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1154 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1155 } 1156 if (j == 1) { 1157 tmp0 = tmps[*vi--]; 1158 sum1 -= *v1-- * tmp0; 1159 sum2 -= *v2-- * tmp0; 1160 sum3 -= *v3-- * tmp0; 1161 sum4 -= *v4-- * tmp0; 1162 sum5 -= *v5-- * tmp0; 1163 } 1164 1165 tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]]; 1166 row--; 1167 sum2 -= *v2-- * tmp0; 1168 sum3 -= *v3-- * tmp0; 1169 sum4 -= *v4-- * tmp0; 1170 sum5 -= *v5-- * tmp0; 1171 tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]]; 1172 row--; 1173 sum3 -= *v3-- * tmp0; 1174 sum4 -= *v4-- * tmp0; 1175 sum5 -= *v5-- * tmp0; 1176 tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]]; 1177 row--; 1178 sum4 -= *v4-- * tmp0; 1179 sum5 -= *v5-- * tmp0; 1180 tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]]; 1181 row--; 1182 sum5 -= *v5-- * tmp0; 1183 x[*c--] = tmp[row] = sum5 * a_a[ad[row]]; 1184 row--; 1185 break; 1186 default: 1187 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 1188 } 1189 } 1190 PetscCall(ISRestoreIndices(isrow, &rout)); 1191 PetscCall(ISRestoreIndices(iscol, &cout)); 1192 PetscCall(VecRestoreArrayRead(bb, &b)); 1193 PetscCall(VecRestoreArrayWrite(xx, &x)); 1194 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1195 PetscFunctionReturn(PETSC_SUCCESS); 1196 } 1197 1198 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info) 1199 { 1200 Mat C = B; 1201 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data; 1202 IS isrow = b->row, isicol = b->icol; 1203 const PetscInt *r, *ic, *ics; 1204 const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 1205 PetscInt i, j, k, nz, nzL, row, *pj; 1206 const PetscInt *ajtmp, *bjtmp; 1207 MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4; 1208 const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4; 1209 FactorShiftCtx sctx; 1210 const PetscInt *ddiag; 1211 PetscReal rs; 1212 MatScalar d; 1213 PetscInt inod, nodesz, node_max, col; 1214 const PetscInt *ns; 1215 PetscInt *tmp_vec1, *tmp_vec2, *nsmap; 1216 1217 PetscFunctionBegin; 1218 /* MatPivotSetUp(): initialize shift context sctx */ 1219 PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 1220 1221 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1222 ddiag = a->diag; 1223 sctx.shift_top = info->zeropivot; 1224 for (i = 0; i < n; i++) { 1225 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1226 d = (aa)[ddiag[i]]; 1227 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1228 v = aa + ai[i]; 1229 nz = ai[i + 1] - ai[i]; 1230 for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 1231 if (rs > sctx.shift_top) sctx.shift_top = rs; 1232 } 1233 sctx.shift_top *= 1.1; 1234 sctx.nshift_max = 5; 1235 sctx.shift_lo = 0.; 1236 sctx.shift_hi = 1.; 1237 } 1238 1239 PetscCall(ISGetIndices(isrow, &r)); 1240 PetscCall(ISGetIndices(isicol, &ic)); 1241 1242 PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4)); 1243 ics = ic; 1244 1245 node_max = a->inode.node_count; 1246 ns = a->inode.size_csr; 1247 PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information"); 1248 1249 /* If max inode size > 4, split it into two inodes.*/ 1250 /* also map the inode sizes according to the ordering */ 1251 PetscCall(PetscMalloc1(n + 1, &tmp_vec1)); 1252 for (i = 0, j = 0; i < node_max; ++i, ++j) { 1253 nodesz = ns[i + 1] - ns[i]; 1254 if (nodesz > 4) { 1255 tmp_vec1[j] = 4; 1256 ++j; 1257 tmp_vec1[j] = nodesz - tmp_vec1[j - 1]; 1258 } else { 1259 tmp_vec1[j] = nodesz; 1260 } 1261 } 1262 /* Use the correct node_max */ 1263 node_max = j; 1264 1265 /* Now reorder the inode info based on mat re-ordering info */ 1266 /* First create a row -> inode_size_array_index map */ 1267 PetscCall(PetscMalloc1(n + 1, &nsmap)); 1268 PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2)); 1269 tmp_vec2[0] = 0; 1270 for (i = 0, row = 0; i < node_max; i++) { 1271 nodesz = tmp_vec1[i]; 1272 for (j = 0; j < nodesz; j++, row++) nsmap[row] = i; 1273 } 1274 /* Using nsmap, create a reordered ns structure */ 1275 for (i = 0, j = 0; i < node_max; i++) { 1276 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1277 tmp_vec2[i + 1] = tmp_vec2[i] + nodesz; 1278 j += nodesz; 1279 } 1280 PetscCall(PetscFree(nsmap)); 1281 PetscCall(PetscFree(tmp_vec1)); 1282 1283 /* Now use the correct ns */ 1284 ns = tmp_vec2; 1285 1286 do { 1287 sctx.newshift = PETSC_FALSE; 1288 /* Now loop over each block-row, and do the factorization */ 1289 for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */ 1290 nodesz = ns[inod + 1] - ns[inod]; 1291 1292 switch (nodesz) { 1293 case 1: 1294 /* zero rtmp1 */ 1295 /* L part */ 1296 nz = bi[i + 1] - bi[i]; 1297 bjtmp = bj + bi[i]; 1298 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1299 1300 /* U part */ 1301 nz = bdiag[i] - bdiag[i + 1]; 1302 bjtmp = bj + bdiag[i + 1] + 1; 1303 for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0; 1304 1305 /* load in initial (unfactored row) */ 1306 nz = ai[r[i] + 1] - ai[r[i]]; 1307 ajtmp = aj + ai[r[i]]; 1308 v = aa + ai[r[i]]; 1309 for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1310 1311 /* ZeropivotApply() */ 1312 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1313 1314 /* elimination */ 1315 bjtmp = bj + bi[i]; 1316 row = *bjtmp++; 1317 nzL = bi[i + 1] - bi[i]; 1318 for (k = 0; k < nzL; k++) { 1319 pc = rtmp1 + row; 1320 if (*pc != 0.0) { 1321 pv = b->a + bdiag[row]; 1322 mul1 = *pc * (*pv); 1323 *pc = mul1; 1324 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1325 pv = b->a + bdiag[row + 1] + 1; 1326 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1327 for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1328 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1329 } 1330 row = *bjtmp++; 1331 } 1332 1333 /* finished row so stick it into b->a */ 1334 rs = 0.0; 1335 /* L part */ 1336 pv = b->a + bi[i]; 1337 pj = b->j + bi[i]; 1338 nz = bi[i + 1] - bi[i]; 1339 for (j = 0; j < nz; j++) { 1340 pv[j] = rtmp1[pj[j]]; 1341 rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* U part */ 1345 pv = b->a + bdiag[i + 1] + 1; 1346 pj = b->j + bdiag[i + 1] + 1; 1347 nz = bdiag[i] - bdiag[i + 1] - 1; 1348 for (j = 0; j < nz; j++) { 1349 pv[j] = rtmp1[pj[j]]; 1350 rs += PetscAbsScalar(pv[j]); 1351 } 1352 1353 /* Check zero pivot */ 1354 sctx.rs = rs; 1355 sctx.pv = rtmp1[i]; 1356 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1357 if (sctx.newshift) break; 1358 1359 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1360 pv = b->a + bdiag[i]; 1361 *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1362 break; 1363 1364 case 2: 1365 /* zero rtmp1 and rtmp2 */ 1366 /* L part */ 1367 nz = bi[i + 1] - bi[i]; 1368 bjtmp = bj + bi[i]; 1369 for (j = 0; j < nz; j++) { 1370 col = bjtmp[j]; 1371 rtmp1[col] = 0.0; 1372 rtmp2[col] = 0.0; 1373 } 1374 1375 /* U part */ 1376 nz = bdiag[i] - bdiag[i + 1]; 1377 bjtmp = bj + bdiag[i + 1] + 1; 1378 for (j = 0; j < nz; j++) { 1379 col = bjtmp[j]; 1380 rtmp1[col] = 0.0; 1381 rtmp2[col] = 0.0; 1382 } 1383 1384 /* load in initial (unfactored row) */ 1385 nz = ai[r[i] + 1] - ai[r[i]]; 1386 ajtmp = aj + ai[r[i]]; 1387 v1 = aa + ai[r[i]]; 1388 v2 = aa + ai[r[i + 1]]; 1389 for (j = 0; j < nz; j++) { 1390 col = ics[ajtmp[j]]; 1391 rtmp1[col] = v1[j]; 1392 rtmp2[col] = v2[j]; 1393 } 1394 /* ZeropivotApply(): shift the diagonal of the matrix */ 1395 rtmp1[i] += sctx.shift_amount; 1396 rtmp2[i + 1] += sctx.shift_amount; 1397 1398 /* elimination */ 1399 bjtmp = bj + bi[i]; 1400 row = *bjtmp++; /* pivot row */ 1401 nzL = bi[i + 1] - bi[i]; 1402 for (k = 0; k < nzL; k++) { 1403 pc1 = rtmp1 + row; 1404 pc2 = rtmp2 + row; 1405 if (*pc1 != 0.0 || *pc2 != 0.0) { 1406 pv = b->a + bdiag[row]; 1407 mul1 = *pc1 * (*pv); 1408 mul2 = *pc2 * (*pv); 1409 *pc1 = mul1; 1410 *pc2 = mul2; 1411 1412 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1413 pv = b->a + bdiag[row + 1] + 1; 1414 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1415 for (j = 0; j < nz; j++) { 1416 col = pj[j]; 1417 rtmp1[col] -= mul1 * pv[j]; 1418 rtmp2[col] -= mul2 * pv[j]; 1419 } 1420 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1421 } 1422 row = *bjtmp++; 1423 } 1424 1425 /* finished row i; check zero pivot, then stick row i into b->a */ 1426 rs = 0.0; 1427 /* L part */ 1428 pc1 = b->a + bi[i]; 1429 pj = b->j + bi[i]; 1430 nz = bi[i + 1] - bi[i]; 1431 for (j = 0; j < nz; j++) { 1432 col = pj[j]; 1433 pc1[j] = rtmp1[col]; 1434 rs += PetscAbsScalar(pc1[j]); 1435 } 1436 /* U part */ 1437 pc1 = b->a + bdiag[i + 1] + 1; 1438 pj = b->j + bdiag[i + 1] + 1; 1439 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1440 for (j = 0; j < nz; j++) { 1441 col = pj[j]; 1442 pc1[j] = rtmp1[col]; 1443 rs += PetscAbsScalar(pc1[j]); 1444 } 1445 1446 sctx.rs = rs; 1447 sctx.pv = rtmp1[i]; 1448 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1449 if (sctx.newshift) break; 1450 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1451 *pc1 = 1.0 / sctx.pv; 1452 1453 /* Now take care of diagonal 2x2 block. */ 1454 pc2 = rtmp2 + i; 1455 if (*pc2 != 0.0) { 1456 mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */ 1457 *pc2 = mul1; /* insert L entry */ 1458 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1459 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1460 for (j = 0; j < nz; j++) { 1461 col = pj[j]; 1462 rtmp2[col] -= mul1 * rtmp1[col]; 1463 } 1464 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1465 } 1466 1467 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1468 rs = 0.0; 1469 /* L part */ 1470 pc2 = b->a + bi[i + 1]; 1471 pj = b->j + bi[i + 1]; 1472 nz = bi[i + 2] - bi[i + 1]; 1473 for (j = 0; j < nz; j++) { 1474 col = pj[j]; 1475 pc2[j] = rtmp2[col]; 1476 rs += PetscAbsScalar(pc2[j]); 1477 } 1478 /* U part */ 1479 pc2 = b->a + bdiag[i + 2] + 1; 1480 pj = b->j + bdiag[i + 2] + 1; 1481 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1482 for (j = 0; j < nz; j++) { 1483 col = pj[j]; 1484 pc2[j] = rtmp2[col]; 1485 rs += PetscAbsScalar(pc2[j]); 1486 } 1487 1488 sctx.rs = rs; 1489 sctx.pv = rtmp2[i + 1]; 1490 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1491 if (sctx.newshift) break; 1492 pc2 = b->a + bdiag[i + 1]; 1493 *pc2 = 1.0 / sctx.pv; 1494 break; 1495 1496 case 3: 1497 /* zero rtmp */ 1498 /* L part */ 1499 nz = bi[i + 1] - bi[i]; 1500 bjtmp = bj + bi[i]; 1501 for (j = 0; j < nz; j++) { 1502 col = bjtmp[j]; 1503 rtmp1[col] = 0.0; 1504 rtmp2[col] = 0.0; 1505 rtmp3[col] = 0.0; 1506 } 1507 1508 /* U part */ 1509 nz = bdiag[i] - bdiag[i + 1]; 1510 bjtmp = bj + bdiag[i + 1] + 1; 1511 for (j = 0; j < nz; j++) { 1512 col = bjtmp[j]; 1513 rtmp1[col] = 0.0; 1514 rtmp2[col] = 0.0; 1515 rtmp3[col] = 0.0; 1516 } 1517 1518 /* load in initial (unfactored row) */ 1519 nz = ai[r[i] + 1] - ai[r[i]]; 1520 ajtmp = aj + ai[r[i]]; 1521 v1 = aa + ai[r[i]]; 1522 v2 = aa + ai[r[i + 1]]; 1523 v3 = aa + ai[r[i + 2]]; 1524 for (j = 0; j < nz; j++) { 1525 col = ics[ajtmp[j]]; 1526 rtmp1[col] = v1[j]; 1527 rtmp2[col] = v2[j]; 1528 rtmp3[col] = v3[j]; 1529 } 1530 /* ZeropivotApply(): shift the diagonal of the matrix */ 1531 rtmp1[i] += sctx.shift_amount; 1532 rtmp2[i + 1] += sctx.shift_amount; 1533 rtmp3[i + 2] += sctx.shift_amount; 1534 1535 /* elimination */ 1536 bjtmp = bj + bi[i]; 1537 row = *bjtmp++; /* pivot row */ 1538 nzL = bi[i + 1] - bi[i]; 1539 for (k = 0; k < nzL; k++) { 1540 pc1 = rtmp1 + row; 1541 pc2 = rtmp2 + row; 1542 pc3 = rtmp3 + row; 1543 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1544 pv = b->a + bdiag[row]; 1545 mul1 = *pc1 * (*pv); 1546 mul2 = *pc2 * (*pv); 1547 mul3 = *pc3 * (*pv); 1548 *pc1 = mul1; 1549 *pc2 = mul2; 1550 *pc3 = mul3; 1551 1552 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1553 pv = b->a + bdiag[row + 1] + 1; 1554 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1555 for (j = 0; j < nz; j++) { 1556 col = pj[j]; 1557 rtmp1[col] -= mul1 * pv[j]; 1558 rtmp2[col] -= mul2 * pv[j]; 1559 rtmp3[col] -= mul3 * pv[j]; 1560 } 1561 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1562 } 1563 row = *bjtmp++; 1564 } 1565 1566 /* finished row i; check zero pivot, then stick row i into b->a */ 1567 rs = 0.0; 1568 /* L part */ 1569 pc1 = b->a + bi[i]; 1570 pj = b->j + bi[i]; 1571 nz = bi[i + 1] - bi[i]; 1572 for (j = 0; j < nz; j++) { 1573 col = pj[j]; 1574 pc1[j] = rtmp1[col]; 1575 rs += PetscAbsScalar(pc1[j]); 1576 } 1577 /* U part */ 1578 pc1 = b->a + bdiag[i + 1] + 1; 1579 pj = b->j + bdiag[i + 1] + 1; 1580 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1581 for (j = 0; j < nz; j++) { 1582 col = pj[j]; 1583 pc1[j] = rtmp1[col]; 1584 rs += PetscAbsScalar(pc1[j]); 1585 } 1586 1587 sctx.rs = rs; 1588 sctx.pv = rtmp1[i]; 1589 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1590 if (sctx.newshift) break; 1591 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1592 *pc1 = 1.0 / sctx.pv; 1593 1594 /* Now take care of 1st column of diagonal 3x3 block. */ 1595 pc2 = rtmp2 + i; 1596 pc3 = rtmp3 + i; 1597 if (*pc2 != 0.0 || *pc3 != 0.0) { 1598 mul2 = (*pc2) * (*pc1); 1599 *pc2 = mul2; 1600 mul3 = (*pc3) * (*pc1); 1601 *pc3 = mul3; 1602 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1603 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1604 for (j = 0; j < nz; j++) { 1605 col = pj[j]; 1606 rtmp2[col] -= mul2 * rtmp1[col]; 1607 rtmp3[col] -= mul3 * rtmp1[col]; 1608 } 1609 PetscCall(PetscLogFlops(2 + 4.0 * nz)); 1610 } 1611 1612 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1613 rs = 0.0; 1614 /* L part */ 1615 pc2 = b->a + bi[i + 1]; 1616 pj = b->j + bi[i + 1]; 1617 nz = bi[i + 2] - bi[i + 1]; 1618 for (j = 0; j < nz; j++) { 1619 col = pj[j]; 1620 pc2[j] = rtmp2[col]; 1621 rs += PetscAbsScalar(pc2[j]); 1622 } 1623 /* U part */ 1624 pc2 = b->a + bdiag[i + 2] + 1; 1625 pj = b->j + bdiag[i + 2] + 1; 1626 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1627 for (j = 0; j < nz; j++) { 1628 col = pj[j]; 1629 pc2[j] = rtmp2[col]; 1630 rs += PetscAbsScalar(pc2[j]); 1631 } 1632 1633 sctx.rs = rs; 1634 sctx.pv = rtmp2[i + 1]; 1635 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1636 if (sctx.newshift) break; 1637 pc2 = b->a + bdiag[i + 1]; 1638 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1639 1640 /* Now take care of 2nd column of diagonal 3x3 block. */ 1641 pc3 = rtmp3 + i + 1; 1642 if (*pc3 != 0.0) { 1643 mul3 = (*pc3) * (*pc2); 1644 *pc3 = mul3; 1645 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1646 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1647 for (j = 0; j < nz; j++) { 1648 col = pj[j]; 1649 rtmp3[col] -= mul3 * rtmp2[col]; 1650 } 1651 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1652 } 1653 1654 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1655 rs = 0.0; 1656 /* L part */ 1657 pc3 = b->a + bi[i + 2]; 1658 pj = b->j + bi[i + 2]; 1659 nz = bi[i + 3] - bi[i + 2]; 1660 for (j = 0; j < nz; j++) { 1661 col = pj[j]; 1662 pc3[j] = rtmp3[col]; 1663 rs += PetscAbsScalar(pc3[j]); 1664 } 1665 /* U part */ 1666 pc3 = b->a + bdiag[i + 3] + 1; 1667 pj = b->j + bdiag[i + 3] + 1; 1668 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1669 for (j = 0; j < nz; j++) { 1670 col = pj[j]; 1671 pc3[j] = rtmp3[col]; 1672 rs += PetscAbsScalar(pc3[j]); 1673 } 1674 1675 sctx.rs = rs; 1676 sctx.pv = rtmp3[i + 2]; 1677 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1678 if (sctx.newshift) break; 1679 pc3 = b->a + bdiag[i + 2]; 1680 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1681 break; 1682 case 4: 1683 /* zero rtmp */ 1684 /* L part */ 1685 nz = bi[i + 1] - bi[i]; 1686 bjtmp = bj + bi[i]; 1687 for (j = 0; j < nz; j++) { 1688 col = bjtmp[j]; 1689 rtmp1[col] = 0.0; 1690 rtmp2[col] = 0.0; 1691 rtmp3[col] = 0.0; 1692 rtmp4[col] = 0.0; 1693 } 1694 1695 /* U part */ 1696 nz = bdiag[i] - bdiag[i + 1]; 1697 bjtmp = bj + bdiag[i + 1] + 1; 1698 for (j = 0; j < nz; j++) { 1699 col = bjtmp[j]; 1700 rtmp1[col] = 0.0; 1701 rtmp2[col] = 0.0; 1702 rtmp3[col] = 0.0; 1703 rtmp4[col] = 0.0; 1704 } 1705 1706 /* load in initial (unfactored row) */ 1707 nz = ai[r[i] + 1] - ai[r[i]]; 1708 ajtmp = aj + ai[r[i]]; 1709 v1 = aa + ai[r[i]]; 1710 v2 = aa + ai[r[i + 1]]; 1711 v3 = aa + ai[r[i + 2]]; 1712 v4 = aa + ai[r[i + 3]]; 1713 for (j = 0; j < nz; j++) { 1714 col = ics[ajtmp[j]]; 1715 rtmp1[col] = v1[j]; 1716 rtmp2[col] = v2[j]; 1717 rtmp3[col] = v3[j]; 1718 rtmp4[col] = v4[j]; 1719 } 1720 /* ZeropivotApply(): shift the diagonal of the matrix */ 1721 rtmp1[i] += sctx.shift_amount; 1722 rtmp2[i + 1] += sctx.shift_amount; 1723 rtmp3[i + 2] += sctx.shift_amount; 1724 rtmp4[i + 3] += sctx.shift_amount; 1725 1726 /* elimination */ 1727 bjtmp = bj + bi[i]; 1728 row = *bjtmp++; /* pivot row */ 1729 nzL = bi[i + 1] - bi[i]; 1730 for (k = 0; k < nzL; k++) { 1731 pc1 = rtmp1 + row; 1732 pc2 = rtmp2 + row; 1733 pc3 = rtmp3 + row; 1734 pc4 = rtmp4 + row; 1735 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1736 pv = b->a + bdiag[row]; 1737 mul1 = *pc1 * (*pv); 1738 mul2 = *pc2 * (*pv); 1739 mul3 = *pc3 * (*pv); 1740 mul4 = *pc4 * (*pv); 1741 *pc1 = mul1; 1742 *pc2 = mul2; 1743 *pc3 = mul3; 1744 *pc4 = mul4; 1745 1746 pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 1747 pv = b->a + bdiag[row + 1] + 1; 1748 nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 1749 for (j = 0; j < nz; j++) { 1750 col = pj[j]; 1751 rtmp1[col] -= mul1 * pv[j]; 1752 rtmp2[col] -= mul2 * pv[j]; 1753 rtmp3[col] -= mul3 * pv[j]; 1754 rtmp4[col] -= mul4 * pv[j]; 1755 } 1756 PetscCall(PetscLogFlops(4 + 8.0 * nz)); 1757 } 1758 row = *bjtmp++; 1759 } 1760 1761 /* finished row i; check zero pivot, then stick row i into b->a */ 1762 rs = 0.0; 1763 /* L part */ 1764 pc1 = b->a + bi[i]; 1765 pj = b->j + bi[i]; 1766 nz = bi[i + 1] - bi[i]; 1767 for (j = 0; j < nz; j++) { 1768 col = pj[j]; 1769 pc1[j] = rtmp1[col]; 1770 rs += PetscAbsScalar(pc1[j]); 1771 } 1772 /* U part */ 1773 pc1 = b->a + bdiag[i + 1] + 1; 1774 pj = b->j + bdiag[i + 1] + 1; 1775 nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */ 1776 for (j = 0; j < nz; j++) { 1777 col = pj[j]; 1778 pc1[j] = rtmp1[col]; 1779 rs += PetscAbsScalar(pc1[j]); 1780 } 1781 1782 sctx.rs = rs; 1783 sctx.pv = rtmp1[i]; 1784 PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 1785 if (sctx.newshift) break; 1786 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1787 *pc1 = 1.0 / sctx.pv; 1788 1789 /* Now take care of 1st column of diagonal 4x4 block. */ 1790 pc2 = rtmp2 + i; 1791 pc3 = rtmp3 + i; 1792 pc4 = rtmp4 + i; 1793 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1794 mul2 = (*pc2) * (*pc1); 1795 *pc2 = mul2; 1796 mul3 = (*pc3) * (*pc1); 1797 *pc3 = mul3; 1798 mul4 = (*pc4) * (*pc1); 1799 *pc4 = mul4; 1800 pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */ 1801 nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */ 1802 for (j = 0; j < nz; j++) { 1803 col = pj[j]; 1804 rtmp2[col] -= mul2 * rtmp1[col]; 1805 rtmp3[col] -= mul3 * rtmp1[col]; 1806 rtmp4[col] -= mul4 * rtmp1[col]; 1807 } 1808 PetscCall(PetscLogFlops(3 + 6.0 * nz)); 1809 } 1810 1811 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1812 rs = 0.0; 1813 /* L part */ 1814 pc2 = b->a + bi[i + 1]; 1815 pj = b->j + bi[i + 1]; 1816 nz = bi[i + 2] - bi[i + 1]; 1817 for (j = 0; j < nz; j++) { 1818 col = pj[j]; 1819 pc2[j] = rtmp2[col]; 1820 rs += PetscAbsScalar(pc2[j]); 1821 } 1822 /* U part */ 1823 pc2 = b->a + bdiag[i + 2] + 1; 1824 pj = b->j + bdiag[i + 2] + 1; 1825 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */ 1826 for (j = 0; j < nz; j++) { 1827 col = pj[j]; 1828 pc2[j] = rtmp2[col]; 1829 rs += PetscAbsScalar(pc2[j]); 1830 } 1831 1832 sctx.rs = rs; 1833 sctx.pv = rtmp2[i + 1]; 1834 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1)); 1835 if (sctx.newshift) break; 1836 pc2 = b->a + bdiag[i + 1]; 1837 *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */ 1838 1839 /* Now take care of 2nd column of diagonal 4x4 block. */ 1840 pc3 = rtmp3 + i + 1; 1841 pc4 = rtmp4 + i + 1; 1842 if (*pc3 != 0.0 || *pc4 != 0.0) { 1843 mul3 = (*pc3) * (*pc2); 1844 *pc3 = mul3; 1845 mul4 = (*pc4) * (*pc2); 1846 *pc4 = mul4; 1847 pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */ 1848 nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */ 1849 for (j = 0; j < nz; j++) { 1850 col = pj[j]; 1851 rtmp3[col] -= mul3 * rtmp2[col]; 1852 rtmp4[col] -= mul4 * rtmp2[col]; 1853 } 1854 PetscCall(PetscLogFlops(4.0 * nz)); 1855 } 1856 1857 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1858 rs = 0.0; 1859 /* L part */ 1860 pc3 = b->a + bi[i + 2]; 1861 pj = b->j + bi[i + 2]; 1862 nz = bi[i + 3] - bi[i + 2]; 1863 for (j = 0; j < nz; j++) { 1864 col = pj[j]; 1865 pc3[j] = rtmp3[col]; 1866 rs += PetscAbsScalar(pc3[j]); 1867 } 1868 /* U part */ 1869 pc3 = b->a + bdiag[i + 3] + 1; 1870 pj = b->j + bdiag[i + 3] + 1; 1871 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */ 1872 for (j = 0; j < nz; j++) { 1873 col = pj[j]; 1874 pc3[j] = rtmp3[col]; 1875 rs += PetscAbsScalar(pc3[j]); 1876 } 1877 1878 sctx.rs = rs; 1879 sctx.pv = rtmp3[i + 2]; 1880 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2)); 1881 if (sctx.newshift) break; 1882 pc3 = b->a + bdiag[i + 2]; 1883 *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */ 1884 1885 /* Now take care of 3rd column of diagonal 4x4 block. */ 1886 pc4 = rtmp4 + i + 2; 1887 if (*pc4 != 0.0) { 1888 mul4 = (*pc4) * (*pc3); 1889 *pc4 = mul4; 1890 pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */ 1891 nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */ 1892 for (j = 0; j < nz; j++) { 1893 col = pj[j]; 1894 rtmp4[col] -= mul4 * rtmp3[col]; 1895 } 1896 PetscCall(PetscLogFlops(1 + 2.0 * nz)); 1897 } 1898 1899 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1900 rs = 0.0; 1901 /* L part */ 1902 pc4 = b->a + bi[i + 3]; 1903 pj = b->j + bi[i + 3]; 1904 nz = bi[i + 4] - bi[i + 3]; 1905 for (j = 0; j < nz; j++) { 1906 col = pj[j]; 1907 pc4[j] = rtmp4[col]; 1908 rs += PetscAbsScalar(pc4[j]); 1909 } 1910 /* U part */ 1911 pc4 = b->a + bdiag[i + 4] + 1; 1912 pj = b->j + bdiag[i + 4] + 1; 1913 nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */ 1914 for (j = 0; j < nz; j++) { 1915 col = pj[j]; 1916 pc4[j] = rtmp4[col]; 1917 rs += PetscAbsScalar(pc4[j]); 1918 } 1919 1920 sctx.rs = rs; 1921 sctx.pv = rtmp4[i + 3]; 1922 PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3)); 1923 if (sctx.newshift) break; 1924 pc4 = b->a + bdiag[i + 3]; 1925 *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */ 1926 break; 1927 1928 default: 1929 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported "); 1930 } 1931 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1932 i += nodesz; /* Update the row */ 1933 } 1934 1935 /* MatPivotRefine() */ 1936 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 1937 /* 1938 * if no shift in this attempt & shifting & started shifting & can refine, 1939 * then try lower shift 1940 */ 1941 sctx.shift_hi = sctx.shift_fraction; 1942 sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 1943 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1944 sctx.newshift = PETSC_TRUE; 1945 sctx.nshift++; 1946 } 1947 } while (sctx.newshift); 1948 1949 PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4)); 1950 PetscCall(PetscFree(tmp_vec2)); 1951 PetscCall(ISRestoreIndices(isicol, &ic)); 1952 PetscCall(ISRestoreIndices(isrow, &r)); 1953 1954 if (b->inode.size_csr) { 1955 C->ops->solve = MatSolve_SeqAIJ_Inode; 1956 } else { 1957 C->ops->solve = MatSolve_SeqAIJ; 1958 } 1959 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1960 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1961 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1962 C->ops->matsolve = MatMatSolve_SeqAIJ; 1963 C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ; 1964 C->assembled = PETSC_TRUE; 1965 C->preallocated = PETSC_TRUE; 1966 1967 PetscCall(PetscLogFlops(C->cmap->n)); 1968 1969 /* MatShiftView(A,info,&sctx) */ 1970 if (sctx.nshift) { 1971 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 1972 PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 1973 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1974 PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 1975 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1976 PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 1977 } 1978 } 1979 PetscFunctionReturn(PETSC_SUCCESS); 1980 } 1981 1982 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 1983 { 1984 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 1985 IS iscol = a->col, isrow = a->row; 1986 const PetscInt *r, *c, *rout, *cout; 1987 PetscInt i, j; 1988 PetscInt node_max, row, nsz, aii, i0, i1, nz; 1989 const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj; 1990 PetscScalar *x, *tmp, *tmps, tmp0, tmp1; 1991 PetscScalar sum1, sum2, sum3, sum4, sum5; 1992 const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa; 1993 const PetscScalar *b; 1994 1995 PetscFunctionBegin; 1996 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 1997 node_max = a->inode.node_count; 1998 ns = a->inode.size_csr; /* Node Size array */ 1999 2000 PetscCall(VecGetArrayRead(bb, &b)); 2001 PetscCall(VecGetArrayWrite(xx, &x)); 2002 tmp = a->solve_work; 2003 2004 PetscCall(ISGetIndices(isrow, &rout)); 2005 r = rout; 2006 PetscCall(ISGetIndices(iscol, &cout)); 2007 c = cout; 2008 2009 /* forward solve the lower triangular */ 2010 tmps = tmp; 2011 aa = a_a; 2012 aj = a_j; 2013 ad = a->diag; 2014 2015 for (i = 0; i < node_max; ++i) { 2016 row = ns[i]; 2017 nsz = ns[i + 1] - ns[i]; 2018 aii = ai[row]; 2019 v1 = aa + aii; 2020 vi = aj + aii; 2021 nz = ai[row + 1] - ai[row]; 2022 2023 if (i < node_max - 1) { 2024 /* Prefetch the indices for the next block */ 2025 PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */ 2026 /* Prefetch the data for the next block */ 2027 PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); 2028 } 2029 2030 switch (nsz) { /* Each loop in 'case' is unrolled */ 2031 case 1: 2032 sum1 = b[r[row]]; 2033 for (j = 0; j < nz - 1; j += 2) { 2034 i0 = vi[j]; 2035 i1 = vi[j + 1]; 2036 tmp0 = tmps[i0]; 2037 tmp1 = tmps[i1]; 2038 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2039 } 2040 if (j == nz - 1) { 2041 tmp0 = tmps[vi[j]]; 2042 sum1 -= v1[j] * tmp0; 2043 } 2044 tmp[row++] = sum1; 2045 break; 2046 case 2: 2047 sum1 = b[r[row]]; 2048 sum2 = b[r[row + 1]]; 2049 v2 = aa + ai[row + 1]; 2050 2051 for (j = 0; j < nz - 1; j += 2) { 2052 i0 = vi[j]; 2053 i1 = vi[j + 1]; 2054 tmp0 = tmps[i0]; 2055 tmp1 = tmps[i1]; 2056 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2057 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2058 } 2059 if (j == nz - 1) { 2060 tmp0 = tmps[vi[j]]; 2061 sum1 -= v1[j] * tmp0; 2062 sum2 -= v2[j] * tmp0; 2063 } 2064 sum2 -= v2[nz] * sum1; 2065 tmp[row++] = sum1; 2066 tmp[row++] = sum2; 2067 break; 2068 case 3: 2069 sum1 = b[r[row]]; 2070 sum2 = b[r[row + 1]]; 2071 sum3 = b[r[row + 2]]; 2072 v2 = aa + ai[row + 1]; 2073 v3 = aa + ai[row + 2]; 2074 2075 for (j = 0; j < nz - 1; j += 2) { 2076 i0 = vi[j]; 2077 i1 = vi[j + 1]; 2078 tmp0 = tmps[i0]; 2079 tmp1 = tmps[i1]; 2080 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2081 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2082 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2083 } 2084 if (j == nz - 1) { 2085 tmp0 = tmps[vi[j]]; 2086 sum1 -= v1[j] * tmp0; 2087 sum2 -= v2[j] * tmp0; 2088 sum3 -= v3[j] * tmp0; 2089 } 2090 sum2 -= v2[nz] * sum1; 2091 sum3 -= v3[nz] * sum1; 2092 sum3 -= v3[nz + 1] * sum2; 2093 tmp[row++] = sum1; 2094 tmp[row++] = sum2; 2095 tmp[row++] = sum3; 2096 break; 2097 2098 case 4: 2099 sum1 = b[r[row]]; 2100 sum2 = b[r[row + 1]]; 2101 sum3 = b[r[row + 2]]; 2102 sum4 = b[r[row + 3]]; 2103 v2 = aa + ai[row + 1]; 2104 v3 = aa + ai[row + 2]; 2105 v4 = aa + ai[row + 3]; 2106 2107 for (j = 0; j < nz - 1; j += 2) { 2108 i0 = vi[j]; 2109 i1 = vi[j + 1]; 2110 tmp0 = tmps[i0]; 2111 tmp1 = tmps[i1]; 2112 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2113 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2114 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2115 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2116 } 2117 if (j == nz - 1) { 2118 tmp0 = tmps[vi[j]]; 2119 sum1 -= v1[j] * tmp0; 2120 sum2 -= v2[j] * tmp0; 2121 sum3 -= v3[j] * tmp0; 2122 sum4 -= v4[j] * tmp0; 2123 } 2124 sum2 -= v2[nz] * sum1; 2125 sum3 -= v3[nz] * sum1; 2126 sum4 -= v4[nz] * sum1; 2127 sum3 -= v3[nz + 1] * sum2; 2128 sum4 -= v4[nz + 1] * sum2; 2129 sum4 -= v4[nz + 2] * sum3; 2130 2131 tmp[row++] = sum1; 2132 tmp[row++] = sum2; 2133 tmp[row++] = sum3; 2134 tmp[row++] = sum4; 2135 break; 2136 case 5: 2137 sum1 = b[r[row]]; 2138 sum2 = b[r[row + 1]]; 2139 sum3 = b[r[row + 2]]; 2140 sum4 = b[r[row + 3]]; 2141 sum5 = b[r[row + 4]]; 2142 v2 = aa + ai[row + 1]; 2143 v3 = aa + ai[row + 2]; 2144 v4 = aa + ai[row + 3]; 2145 v5 = aa + ai[row + 4]; 2146 2147 for (j = 0; j < nz - 1; j += 2) { 2148 i0 = vi[j]; 2149 i1 = vi[j + 1]; 2150 tmp0 = tmps[i0]; 2151 tmp1 = tmps[i1]; 2152 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2153 sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1; 2154 sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1; 2155 sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1; 2156 sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1; 2157 } 2158 if (j == nz - 1) { 2159 tmp0 = tmps[vi[j]]; 2160 sum1 -= v1[j] * tmp0; 2161 sum2 -= v2[j] * tmp0; 2162 sum3 -= v3[j] * tmp0; 2163 sum4 -= v4[j] * tmp0; 2164 sum5 -= v5[j] * tmp0; 2165 } 2166 2167 sum2 -= v2[nz] * sum1; 2168 sum3 -= v3[nz] * sum1; 2169 sum4 -= v4[nz] * sum1; 2170 sum5 -= v5[nz] * sum1; 2171 sum3 -= v3[nz + 1] * sum2; 2172 sum4 -= v4[nz + 1] * sum2; 2173 sum5 -= v5[nz + 1] * sum2; 2174 sum4 -= v4[nz + 2] * sum3; 2175 sum5 -= v5[nz + 2] * sum3; 2176 sum5 -= v5[nz + 3] * sum4; 2177 2178 tmp[row++] = sum1; 2179 tmp[row++] = sum2; 2180 tmp[row++] = sum3; 2181 tmp[row++] = sum4; 2182 tmp[row++] = sum5; 2183 break; 2184 default: 2185 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2186 } 2187 } 2188 /* backward solve the upper triangular */ 2189 for (i = node_max - 1; i >= 0; i--) { 2190 row = ns[i + 1] - 1; 2191 nsz = ns[i + 1] - ns[i]; 2192 aii = ad[row + 1] + 1; 2193 v1 = aa + aii; 2194 vi = aj + aii; 2195 nz = ad[row] - ad[row + 1] - 1; 2196 2197 if (i > 0) { 2198 /* Prefetch the indices for the next block */ 2199 PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2200 /* Prefetch the data for the next block */ 2201 PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA); 2202 } 2203 2204 switch (nsz) { /* Each loop in 'case' is unrolled */ 2205 case 1: 2206 sum1 = tmp[row]; 2207 2208 for (j = 0; j < nz - 1; j += 2) { 2209 i0 = vi[j]; 2210 i1 = vi[j + 1]; 2211 tmp0 = tmps[i0]; 2212 tmp1 = tmps[i1]; 2213 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2214 } 2215 if (j == nz - 1) { 2216 tmp0 = tmps[vi[j]]; 2217 sum1 -= v1[j] * tmp0; 2218 } 2219 x[c[row]] = tmp[row] = sum1 * v1[nz]; 2220 row--; 2221 break; 2222 case 2: 2223 sum1 = tmp[row]; 2224 sum2 = tmp[row - 1]; 2225 v2 = aa + ad[row] + 1; 2226 for (j = 0; j < nz - 1; j += 2) { 2227 i0 = vi[j]; 2228 i1 = vi[j + 1]; 2229 tmp0 = tmps[i0]; 2230 tmp1 = tmps[i1]; 2231 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2232 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2233 } 2234 if (j == nz - 1) { 2235 tmp0 = tmps[vi[j]]; 2236 sum1 -= v1[j] * tmp0; 2237 sum2 -= v2[j + 1] * tmp0; 2238 } 2239 2240 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2241 row--; 2242 sum2 -= v2[0] * tmp0; 2243 x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2244 row--; 2245 break; 2246 case 3: 2247 sum1 = tmp[row]; 2248 sum2 = tmp[row - 1]; 2249 sum3 = tmp[row - 2]; 2250 v2 = aa + ad[row] + 1; 2251 v3 = aa + ad[row - 1] + 1; 2252 for (j = 0; j < nz - 1; j += 2) { 2253 i0 = vi[j]; 2254 i1 = vi[j + 1]; 2255 tmp0 = tmps[i0]; 2256 tmp1 = tmps[i1]; 2257 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2258 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2259 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2260 } 2261 if (j == nz - 1) { 2262 tmp0 = tmps[vi[j]]; 2263 sum1 -= v1[j] * tmp0; 2264 sum2 -= v2[j + 1] * tmp0; 2265 sum3 -= v3[j + 2] * tmp0; 2266 } 2267 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2268 row--; 2269 sum2 -= v2[0] * tmp0; 2270 sum3 -= v3[1] * tmp0; 2271 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2272 row--; 2273 sum3 -= v3[0] * tmp0; 2274 x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2275 row--; 2276 2277 break; 2278 case 4: 2279 sum1 = tmp[row]; 2280 sum2 = tmp[row - 1]; 2281 sum3 = tmp[row - 2]; 2282 sum4 = tmp[row - 3]; 2283 v2 = aa + ad[row] + 1; 2284 v3 = aa + ad[row - 1] + 1; 2285 v4 = aa + ad[row - 2] + 1; 2286 2287 for (j = 0; j < nz - 1; j += 2) { 2288 i0 = vi[j]; 2289 i1 = vi[j + 1]; 2290 tmp0 = tmps[i0]; 2291 tmp1 = tmps[i1]; 2292 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2293 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2294 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2295 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2296 } 2297 if (j == nz - 1) { 2298 tmp0 = tmps[vi[j]]; 2299 sum1 -= v1[j] * tmp0; 2300 sum2 -= v2[j + 1] * tmp0; 2301 sum3 -= v3[j + 2] * tmp0; 2302 sum4 -= v4[j + 3] * tmp0; 2303 } 2304 2305 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2306 row--; 2307 sum2 -= v2[0] * tmp0; 2308 sum3 -= v3[1] * tmp0; 2309 sum4 -= v4[2] * tmp0; 2310 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2311 row--; 2312 sum3 -= v3[0] * tmp0; 2313 sum4 -= v4[1] * tmp0; 2314 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2315 row--; 2316 sum4 -= v4[0] * tmp0; 2317 x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2318 row--; 2319 break; 2320 case 5: 2321 sum1 = tmp[row]; 2322 sum2 = tmp[row - 1]; 2323 sum3 = tmp[row - 2]; 2324 sum4 = tmp[row - 3]; 2325 sum5 = tmp[row - 4]; 2326 v2 = aa + ad[row] + 1; 2327 v3 = aa + ad[row - 1] + 1; 2328 v4 = aa + ad[row - 2] + 1; 2329 v5 = aa + ad[row - 3] + 1; 2330 for (j = 0; j < nz - 1; j += 2) { 2331 i0 = vi[j]; 2332 i1 = vi[j + 1]; 2333 tmp0 = tmps[i0]; 2334 tmp1 = tmps[i1]; 2335 sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1; 2336 sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1; 2337 sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1; 2338 sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1; 2339 sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1; 2340 } 2341 if (j == nz - 1) { 2342 tmp0 = tmps[vi[j]]; 2343 sum1 -= v1[j] * tmp0; 2344 sum2 -= v2[j + 1] * tmp0; 2345 sum3 -= v3[j + 2] * tmp0; 2346 sum4 -= v4[j + 3] * tmp0; 2347 sum5 -= v5[j + 4] * tmp0; 2348 } 2349 2350 tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz]; 2351 row--; 2352 sum2 -= v2[0] * tmp0; 2353 sum3 -= v3[1] * tmp0; 2354 sum4 -= v4[2] * tmp0; 2355 sum5 -= v5[3] * tmp0; 2356 tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1]; 2357 row--; 2358 sum3 -= v3[0] * tmp0; 2359 sum4 -= v4[1] * tmp0; 2360 sum5 -= v5[2] * tmp0; 2361 tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2]; 2362 row--; 2363 sum4 -= v4[0] * tmp0; 2364 sum5 -= v5[1] * tmp0; 2365 tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3]; 2366 row--; 2367 sum5 -= v5[0] * tmp0; 2368 x[c[row]] = tmp[row] = sum5 * v5[nz + 4]; 2369 row--; 2370 break; 2371 default: 2372 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported "); 2373 } 2374 } 2375 PetscCall(ISRestoreIndices(isrow, &rout)); 2376 PetscCall(ISRestoreIndices(iscol, &cout)); 2377 PetscCall(VecRestoreArrayRead(bb, &b)); 2378 PetscCall(VecRestoreArrayWrite(xx, &x)); 2379 PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 2380 PetscFunctionReturn(PETSC_SUCCESS); 2381 } 2382 2383 /* 2384 Makes a longer coloring[] array and calls the usual code with that 2385 */ 2386 static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring) 2387 { 2388 Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data; 2389 PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row; 2390 PetscInt *colorused, i; 2391 ISColoringValue *newcolor; 2392 2393 PetscFunctionBegin; 2394 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2395 PetscCall(PetscMalloc1(n + 1, &newcolor)); 2396 /* loop over inodes, marking a color for each column*/ 2397 row = 0; 2398 for (i = 0; i < m; i++) { 2399 for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++)); 2400 } 2401 2402 /* eliminate unneeded colors */ 2403 PetscCall(PetscCalloc1(5 * ncolors, &colorused)); 2404 for (i = 0; i < n; i++) colorused[newcolor[i]] = 1; 2405 2406 for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1]; 2407 ncolors = colorused[5 * ncolors - 1]; 2408 for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i)); 2409 PetscCall(PetscFree(colorused)); 2410 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring)); 2411 PetscCall(PetscFree(coloring)); 2412 PetscFunctionReturn(PETSC_SUCCESS); 2413 } 2414 2415 #include <petsc/private/kernels/blockinvert.h> 2416 2417 /* 2418 Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways 2419 */ 2420 static PetscErrorCode MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A, PetscScalar omega, PetscScalar fshift) 2421 { 2422 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2423 MatScalar *ibdiag, *bdiag, work[25]; 2424 const MatScalar *v = a->a; 2425 PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0; 2426 PetscInt m = a->inode.node_count, cnt = 0, i, j, row, nodesz; 2427 PetscInt k, ipvt[5]; 2428 PetscBool allowzeropivot = PetscNot(A->erroriffailure), zeropivotdetected; 2429 const PetscInt *sizes = a->inode.size_csr, *diag; 2430 2431 PetscFunctionBegin; 2432 if (a->idiagState == ((PetscObject)A)->state) PetscFunctionReturn(PETSC_SUCCESS); 2433 PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &diag, NULL)); 2434 if (!a->inode.ibdiag) { 2435 /* calculate space needed for diagonal blocks */ 2436 for (i = 0; i < m; i++) { 2437 nodesz = sizes[i + 1] - sizes[i]; 2438 cnt += nodesz * nodesz; 2439 } 2440 a->inode.bdiagsize = cnt; 2441 PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work)); 2442 } 2443 2444 /* copy over the diagonal blocks and invert them */ 2445 ibdiag = a->inode.ibdiag; 2446 bdiag = a->inode.bdiag; 2447 cnt = 0; 2448 for (i = 0, row = 0; i < m; i++) { 2449 nodesz = sizes[i + 1] - sizes[i]; 2450 for (j = 0; j < nodesz; j++) { 2451 for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k]; 2452 } 2453 PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz)); 2454 2455 switch (nodesz) { 2456 case 1: 2457 /* Create matrix data structure */ 2458 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2459 PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row); 2460 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2461 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2462 A->factorerror_zeropivot_row = row; 2463 PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row)); 2464 } 2465 ibdiag[cnt] = 1.0 / ibdiag[cnt]; 2466 break; 2467 case 2: 2468 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2469 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2470 break; 2471 case 3: 2472 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2473 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2474 break; 2475 case 4: 2476 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected)); 2477 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2478 break; 2479 case 5: 2480 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected)); 2481 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2482 break; 2483 default: 2484 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 2485 } 2486 cnt += nodesz * nodesz; 2487 row += nodesz; 2488 } 2489 a->inode.ibdiagState = ((PetscObject)A)->state; 2490 PetscFunctionReturn(PETSC_SUCCESS); 2491 } 2492 2493 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx) 2494 { 2495 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 2496 PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3; 2497 MatScalar *ibdiag, *bdiag, *t; 2498 PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5; 2499 const MatScalar *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL; 2500 const PetscScalar *xb, *b; 2501 PetscInt n, m = a->inode.node_count, cnt = 0, i, row, i1, i2, nodesz; 2502 PetscInt sz; 2503 const PetscInt *sizes = a->inode.size_csr, *idx, *diag, *ii = a->i; 2504 2505 PetscFunctionBegin; 2506 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 2507 PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode"); 2508 PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode"); 2509 PetscCall(MatInvertDiagonalForSOR_SeqAIJ_Inode(A, omega, fshift)); 2510 diag = a->diag; 2511 2512 ibdiag = a->inode.ibdiag; 2513 bdiag = a->inode.bdiag; 2514 t = a->inode.ssor_work; 2515 2516 PetscCall(VecGetArray(xx, &x)); 2517 PetscCall(VecGetArrayRead(bb, &b)); 2518 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2519 if (flag & SOR_ZERO_INITIAL_GUESS) { 2520 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2521 for (i = 0, row = 0; i < m; i++) { 2522 sz = diag[row] - ii[row]; 2523 v1 = a->a + ii[row]; 2524 idx = a->j + ii[row]; 2525 2526 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2527 nodesz = sizes[i + 1] - sizes[i]; 2528 switch (nodesz) { 2529 case 1: 2530 2531 sum1 = b[row]; 2532 for (n = 0; n < sz - 1; n += 2) { 2533 i1 = idx[0]; 2534 i2 = idx[1]; 2535 idx += 2; 2536 tmp0 = x[i1]; 2537 tmp1 = x[i2]; 2538 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2539 v1 += 2; 2540 } 2541 2542 if (n == sz - 1) { 2543 tmp0 = x[*idx]; 2544 sum1 -= *v1 * tmp0; 2545 } 2546 t[row] = sum1; 2547 x[row++] = sum1 * (*ibdiag++); 2548 break; 2549 case 2: 2550 v2 = a->a + ii[row + 1]; 2551 sum1 = b[row]; 2552 sum2 = b[row + 1]; 2553 for (n = 0; n < sz - 1; n += 2) { 2554 i1 = idx[0]; 2555 i2 = idx[1]; 2556 idx += 2; 2557 tmp0 = x[i1]; 2558 tmp1 = x[i2]; 2559 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2560 v1 += 2; 2561 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2562 v2 += 2; 2563 } 2564 2565 if (n == sz - 1) { 2566 tmp0 = x[*idx]; 2567 sum1 -= v1[0] * tmp0; 2568 sum2 -= v2[0] * tmp0; 2569 } 2570 t[row] = sum1; 2571 t[row + 1] = sum2; 2572 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2573 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2574 ibdiag += 4; 2575 break; 2576 case 3: 2577 v2 = a->a + ii[row + 1]; 2578 v3 = a->a + ii[row + 2]; 2579 sum1 = b[row]; 2580 sum2 = b[row + 1]; 2581 sum3 = b[row + 2]; 2582 for (n = 0; n < sz - 1; n += 2) { 2583 i1 = idx[0]; 2584 i2 = idx[1]; 2585 idx += 2; 2586 tmp0 = x[i1]; 2587 tmp1 = x[i2]; 2588 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2589 v1 += 2; 2590 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2591 v2 += 2; 2592 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2593 v3 += 2; 2594 } 2595 2596 if (n == sz - 1) { 2597 tmp0 = x[*idx]; 2598 sum1 -= v1[0] * tmp0; 2599 sum2 -= v2[0] * tmp0; 2600 sum3 -= v3[0] * tmp0; 2601 } 2602 t[row] = sum1; 2603 t[row + 1] = sum2; 2604 t[row + 2] = sum3; 2605 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 2606 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 2607 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 2608 ibdiag += 9; 2609 break; 2610 case 4: 2611 v2 = a->a + ii[row + 1]; 2612 v3 = a->a + ii[row + 2]; 2613 v4 = a->a + ii[row + 3]; 2614 sum1 = b[row]; 2615 sum2 = b[row + 1]; 2616 sum3 = b[row + 2]; 2617 sum4 = b[row + 3]; 2618 for (n = 0; n < sz - 1; n += 2) { 2619 i1 = idx[0]; 2620 i2 = idx[1]; 2621 idx += 2; 2622 tmp0 = x[i1]; 2623 tmp1 = x[i2]; 2624 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2625 v1 += 2; 2626 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2627 v2 += 2; 2628 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2629 v3 += 2; 2630 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 2631 v4 += 2; 2632 } 2633 2634 if (n == sz - 1) { 2635 tmp0 = x[*idx]; 2636 sum1 -= v1[0] * tmp0; 2637 sum2 -= v2[0] * tmp0; 2638 sum3 -= v3[0] * tmp0; 2639 sum4 -= v4[0] * tmp0; 2640 } 2641 t[row] = sum1; 2642 t[row + 1] = sum2; 2643 t[row + 2] = sum3; 2644 t[row + 3] = sum4; 2645 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 2646 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 2647 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 2648 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 2649 ibdiag += 16; 2650 break; 2651 case 5: 2652 v2 = a->a + ii[row + 1]; 2653 v3 = a->a + ii[row + 2]; 2654 v4 = a->a + ii[row + 3]; 2655 v5 = a->a + ii[row + 4]; 2656 sum1 = b[row]; 2657 sum2 = b[row + 1]; 2658 sum3 = b[row + 2]; 2659 sum4 = b[row + 3]; 2660 sum5 = b[row + 4]; 2661 for (n = 0; n < sz - 1; n += 2) { 2662 i1 = idx[0]; 2663 i2 = idx[1]; 2664 idx += 2; 2665 tmp0 = x[i1]; 2666 tmp1 = x[i2]; 2667 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2668 v1 += 2; 2669 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2670 v2 += 2; 2671 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2672 v3 += 2; 2673 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 2674 v4 += 2; 2675 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 2676 v5 += 2; 2677 } 2678 2679 if (n == sz - 1) { 2680 tmp0 = x[*idx]; 2681 sum1 -= v1[0] * tmp0; 2682 sum2 -= v2[0] * tmp0; 2683 sum3 -= v3[0] * tmp0; 2684 sum4 -= v4[0] * tmp0; 2685 sum5 -= v5[0] * tmp0; 2686 } 2687 t[row] = sum1; 2688 t[row + 1] = sum2; 2689 t[row + 2] = sum3; 2690 t[row + 3] = sum4; 2691 t[row + 4] = sum5; 2692 x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 2693 x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 2694 x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 2695 x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 2696 x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 2697 ibdiag += 25; 2698 break; 2699 default: 2700 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 2701 } 2702 } 2703 2704 xb = t; 2705 PetscCall(PetscLogFlops(a->nz)); 2706 } else xb = b; 2707 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 2708 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 2709 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 2710 nodesz = sizes[i + 1] - sizes[i]; 2711 ibdiag -= nodesz * nodesz; 2712 sz = ii[row + 1] - diag[row] - 1; 2713 v1 = a->a + diag[row] + 1; 2714 idx = a->j + diag[row] + 1; 2715 2716 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2717 switch (nodesz) { 2718 case 1: 2719 2720 sum1 = xb[row]; 2721 for (n = 0; n < sz - 1; n += 2) { 2722 i1 = idx[0]; 2723 i2 = idx[1]; 2724 idx += 2; 2725 tmp0 = x[i1]; 2726 tmp1 = x[i2]; 2727 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2728 v1 += 2; 2729 } 2730 2731 if (n == sz - 1) { 2732 tmp0 = x[*idx]; 2733 sum1 -= *v1 * tmp0; 2734 } 2735 x[row--] = sum1 * (*ibdiag); 2736 break; 2737 2738 case 2: 2739 2740 sum1 = xb[row]; 2741 sum2 = xb[row - 1]; 2742 /* note that sum1 is associated with the second of the two rows */ 2743 v2 = a->a + diag[row - 1] + 2; 2744 for (n = 0; n < sz - 1; n += 2) { 2745 i1 = idx[0]; 2746 i2 = idx[1]; 2747 idx += 2; 2748 tmp0 = x[i1]; 2749 tmp1 = x[i2]; 2750 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2751 v1 += 2; 2752 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2753 v2 += 2; 2754 } 2755 2756 if (n == sz - 1) { 2757 tmp0 = x[*idx]; 2758 sum1 -= *v1 * tmp0; 2759 sum2 -= *v2 * tmp0; 2760 } 2761 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 2762 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 2763 break; 2764 case 3: 2765 2766 sum1 = xb[row]; 2767 sum2 = xb[row - 1]; 2768 sum3 = xb[row - 2]; 2769 v2 = a->a + diag[row - 1] + 2; 2770 v3 = a->a + diag[row - 2] + 3; 2771 for (n = 0; n < sz - 1; n += 2) { 2772 i1 = idx[0]; 2773 i2 = idx[1]; 2774 idx += 2; 2775 tmp0 = x[i1]; 2776 tmp1 = x[i2]; 2777 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2778 v1 += 2; 2779 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2780 v2 += 2; 2781 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2782 v3 += 2; 2783 } 2784 2785 if (n == sz - 1) { 2786 tmp0 = x[*idx]; 2787 sum1 -= *v1 * tmp0; 2788 sum2 -= *v2 * tmp0; 2789 sum3 -= *v3 * tmp0; 2790 } 2791 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 2792 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 2793 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 2794 break; 2795 case 4: 2796 2797 sum1 = xb[row]; 2798 sum2 = xb[row - 1]; 2799 sum3 = xb[row - 2]; 2800 sum4 = xb[row - 3]; 2801 v2 = a->a + diag[row - 1] + 2; 2802 v3 = a->a + diag[row - 2] + 3; 2803 v4 = a->a + diag[row - 3] + 4; 2804 for (n = 0; n < sz - 1; n += 2) { 2805 i1 = idx[0]; 2806 i2 = idx[1]; 2807 idx += 2; 2808 tmp0 = x[i1]; 2809 tmp1 = x[i2]; 2810 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2811 v1 += 2; 2812 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2813 v2 += 2; 2814 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2815 v3 += 2; 2816 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 2817 v4 += 2; 2818 } 2819 2820 if (n == sz - 1) { 2821 tmp0 = x[*idx]; 2822 sum1 -= *v1 * tmp0; 2823 sum2 -= *v2 * tmp0; 2824 sum3 -= *v3 * tmp0; 2825 sum4 -= *v4 * tmp0; 2826 } 2827 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 2828 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 2829 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 2830 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 2831 break; 2832 case 5: 2833 2834 sum1 = xb[row]; 2835 sum2 = xb[row - 1]; 2836 sum3 = xb[row - 2]; 2837 sum4 = xb[row - 3]; 2838 sum5 = xb[row - 4]; 2839 v2 = a->a + diag[row - 1] + 2; 2840 v3 = a->a + diag[row - 2] + 3; 2841 v4 = a->a + diag[row - 3] + 4; 2842 v5 = a->a + diag[row - 4] + 5; 2843 for (n = 0; n < sz - 1; n += 2) { 2844 i1 = idx[0]; 2845 i2 = idx[1]; 2846 idx += 2; 2847 tmp0 = x[i1]; 2848 tmp1 = x[i2]; 2849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2850 v1 += 2; 2851 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2852 v2 += 2; 2853 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 2854 v3 += 2; 2855 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 2856 v4 += 2; 2857 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 2858 v5 += 2; 2859 } 2860 2861 if (n == sz - 1) { 2862 tmp0 = x[*idx]; 2863 sum1 -= *v1 * tmp0; 2864 sum2 -= *v2 * tmp0; 2865 sum3 -= *v3 * tmp0; 2866 sum4 -= *v4 * tmp0; 2867 sum5 -= *v5 * tmp0; 2868 } 2869 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 2870 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 2871 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 2872 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 2873 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 2874 break; 2875 default: 2876 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 2877 } 2878 } 2879 2880 PetscCall(PetscLogFlops(a->nz)); 2881 } 2882 its--; 2883 } 2884 while (its--) { 2885 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2886 for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) { 2887 nodesz = sizes[i + 1] - sizes[i]; 2888 sz = diag[row] - ii[row]; 2889 v1 = a->a + ii[row]; 2890 idx = a->j + ii[row]; 2891 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2892 switch (nodesz) { 2893 case 1: 2894 sum1 = b[row]; 2895 for (n = 0; n < sz - 1; n += 2) { 2896 i1 = idx[0]; 2897 i2 = idx[1]; 2898 idx += 2; 2899 tmp0 = x[i1]; 2900 tmp1 = x[i2]; 2901 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2902 v1 += 2; 2903 } 2904 if (n == sz - 1) { 2905 tmp0 = x[*idx++]; 2906 sum1 -= *v1 * tmp0; 2907 v1++; 2908 } 2909 t[row] = sum1; 2910 sz = ii[row + 1] - diag[row] - 1; 2911 idx = a->j + diag[row] + 1; 2912 v1 += 1; 2913 for (n = 0; n < sz - 1; n += 2) { 2914 i1 = idx[0]; 2915 i2 = idx[1]; 2916 idx += 2; 2917 tmp0 = x[i1]; 2918 tmp1 = x[i2]; 2919 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2920 v1 += 2; 2921 } 2922 if (n == sz - 1) { 2923 tmp0 = x[*idx++]; 2924 sum1 -= *v1 * tmp0; 2925 } 2926 /* in MatSOR_SeqAIJ this line would be 2927 * 2928 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 2929 * 2930 * but omega == 1, so this becomes 2931 * 2932 * x[row] = sum1*(*ibdiag++); 2933 * 2934 */ 2935 x[row] = sum1 * (*ibdiag); 2936 break; 2937 case 2: 2938 v2 = a->a + ii[row + 1]; 2939 sum1 = b[row]; 2940 sum2 = b[row + 1]; 2941 for (n = 0; n < sz - 1; n += 2) { 2942 i1 = idx[0]; 2943 i2 = idx[1]; 2944 idx += 2; 2945 tmp0 = x[i1]; 2946 tmp1 = x[i2]; 2947 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2948 v1 += 2; 2949 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2950 v2 += 2; 2951 } 2952 if (n == sz - 1) { 2953 tmp0 = x[*idx++]; 2954 sum1 -= v1[0] * tmp0; 2955 sum2 -= v2[0] * tmp0; 2956 v1++; 2957 v2++; 2958 } 2959 t[row] = sum1; 2960 t[row + 1] = sum2; 2961 sz = ii[row + 1] - diag[row] - 2; 2962 idx = a->j + diag[row] + 2; 2963 v1 += 2; 2964 v2 += 2; 2965 for (n = 0; n < sz - 1; n += 2) { 2966 i1 = idx[0]; 2967 i2 = idx[1]; 2968 idx += 2; 2969 tmp0 = x[i1]; 2970 tmp1 = x[i2]; 2971 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2972 v1 += 2; 2973 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2974 v2 += 2; 2975 } 2976 if (n == sz - 1) { 2977 tmp0 = x[*idx]; 2978 sum1 -= v1[0] * tmp0; 2979 sum2 -= v2[0] * tmp0; 2980 } 2981 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 2982 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 2983 break; 2984 case 3: 2985 v2 = a->a + ii[row + 1]; 2986 v3 = a->a + ii[row + 2]; 2987 sum1 = b[row]; 2988 sum2 = b[row + 1]; 2989 sum3 = b[row + 2]; 2990 for (n = 0; n < sz - 1; n += 2) { 2991 i1 = idx[0]; 2992 i2 = idx[1]; 2993 idx += 2; 2994 tmp0 = x[i1]; 2995 tmp1 = x[i2]; 2996 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 2997 v1 += 2; 2998 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 2999 v2 += 2; 3000 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3001 v3 += 2; 3002 } 3003 if (n == sz - 1) { 3004 tmp0 = x[*idx++]; 3005 sum1 -= v1[0] * tmp0; 3006 sum2 -= v2[0] * tmp0; 3007 sum3 -= v3[0] * tmp0; 3008 v1++; 3009 v2++; 3010 v3++; 3011 } 3012 t[row] = sum1; 3013 t[row + 1] = sum2; 3014 t[row + 2] = sum3; 3015 sz = ii[row + 1] - diag[row] - 3; 3016 idx = a->j + diag[row] + 3; 3017 v1 += 3; 3018 v2 += 3; 3019 v3 += 3; 3020 for (n = 0; n < sz - 1; n += 2) { 3021 i1 = idx[0]; 3022 i2 = idx[1]; 3023 idx += 2; 3024 tmp0 = x[i1]; 3025 tmp1 = x[i2]; 3026 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3027 v1 += 2; 3028 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3029 v2 += 2; 3030 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3031 v3 += 2; 3032 } 3033 if (n == sz - 1) { 3034 tmp0 = x[*idx]; 3035 sum1 -= v1[0] * tmp0; 3036 sum2 -= v2[0] * tmp0; 3037 sum3 -= v3[0] * tmp0; 3038 } 3039 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3040 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3041 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3042 break; 3043 case 4: 3044 v2 = a->a + ii[row + 1]; 3045 v3 = a->a + ii[row + 2]; 3046 v4 = a->a + ii[row + 3]; 3047 sum1 = b[row]; 3048 sum2 = b[row + 1]; 3049 sum3 = b[row + 2]; 3050 sum4 = b[row + 3]; 3051 for (n = 0; n < sz - 1; n += 2) { 3052 i1 = idx[0]; 3053 i2 = idx[1]; 3054 idx += 2; 3055 tmp0 = x[i1]; 3056 tmp1 = x[i2]; 3057 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3058 v1 += 2; 3059 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3060 v2 += 2; 3061 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3062 v3 += 2; 3063 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3064 v4 += 2; 3065 } 3066 if (n == sz - 1) { 3067 tmp0 = x[*idx++]; 3068 sum1 -= v1[0] * tmp0; 3069 sum2 -= v2[0] * tmp0; 3070 sum3 -= v3[0] * tmp0; 3071 sum4 -= v4[0] * tmp0; 3072 v1++; 3073 v2++; 3074 v3++; 3075 v4++; 3076 } 3077 t[row] = sum1; 3078 t[row + 1] = sum2; 3079 t[row + 2] = sum3; 3080 t[row + 3] = sum4; 3081 sz = ii[row + 1] - diag[row] - 4; 3082 idx = a->j + diag[row] + 4; 3083 v1 += 4; 3084 v2 += 4; 3085 v3 += 4; 3086 v4 += 4; 3087 for (n = 0; n < sz - 1; n += 2) { 3088 i1 = idx[0]; 3089 i2 = idx[1]; 3090 idx += 2; 3091 tmp0 = x[i1]; 3092 tmp1 = x[i2]; 3093 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3094 v1 += 2; 3095 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3096 v2 += 2; 3097 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3098 v3 += 2; 3099 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3100 v4 += 2; 3101 } 3102 if (n == sz - 1) { 3103 tmp0 = x[*idx]; 3104 sum1 -= v1[0] * tmp0; 3105 sum2 -= v2[0] * tmp0; 3106 sum3 -= v3[0] * tmp0; 3107 sum4 -= v4[0] * tmp0; 3108 } 3109 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3110 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3111 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3112 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3113 break; 3114 case 5: 3115 v2 = a->a + ii[row + 1]; 3116 v3 = a->a + ii[row + 2]; 3117 v4 = a->a + ii[row + 3]; 3118 v5 = a->a + ii[row + 4]; 3119 sum1 = b[row]; 3120 sum2 = b[row + 1]; 3121 sum3 = b[row + 2]; 3122 sum4 = b[row + 3]; 3123 sum5 = b[row + 4]; 3124 for (n = 0; n < sz - 1; n += 2) { 3125 i1 = idx[0]; 3126 i2 = idx[1]; 3127 idx += 2; 3128 tmp0 = x[i1]; 3129 tmp1 = x[i2]; 3130 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3131 v1 += 2; 3132 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3133 v2 += 2; 3134 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3135 v3 += 2; 3136 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3137 v4 += 2; 3138 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3139 v5 += 2; 3140 } 3141 if (n == sz - 1) { 3142 tmp0 = x[*idx++]; 3143 sum1 -= v1[0] * tmp0; 3144 sum2 -= v2[0] * tmp0; 3145 sum3 -= v3[0] * tmp0; 3146 sum4 -= v4[0] * tmp0; 3147 sum5 -= v5[0] * tmp0; 3148 v1++; 3149 v2++; 3150 v3++; 3151 v4++; 3152 v5++; 3153 } 3154 t[row] = sum1; 3155 t[row + 1] = sum2; 3156 t[row + 2] = sum3; 3157 t[row + 3] = sum4; 3158 t[row + 4] = sum5; 3159 sz = ii[row + 1] - diag[row] - 5; 3160 idx = a->j + diag[row] + 5; 3161 v1 += 5; 3162 v2 += 5; 3163 v3 += 5; 3164 v4 += 5; 3165 v5 += 5; 3166 for (n = 0; n < sz - 1; n += 2) { 3167 i1 = idx[0]; 3168 i2 = idx[1]; 3169 idx += 2; 3170 tmp0 = x[i1]; 3171 tmp1 = x[i2]; 3172 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3173 v1 += 2; 3174 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3175 v2 += 2; 3176 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3177 v3 += 2; 3178 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3179 v4 += 2; 3180 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3181 v5 += 2; 3182 } 3183 if (n == sz - 1) { 3184 tmp0 = x[*idx]; 3185 sum1 -= v1[0] * tmp0; 3186 sum2 -= v2[0] * tmp0; 3187 sum3 -= v3[0] * tmp0; 3188 sum4 -= v4[0] * tmp0; 3189 sum5 -= v5[0] * tmp0; 3190 } 3191 x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3192 x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3193 x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3194 x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3195 x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3196 break; 3197 default: 3198 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3199 } 3200 } 3201 xb = t; 3202 PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */ 3203 } else xb = b; 3204 3205 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3206 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3207 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3208 nodesz = sizes[i + 1] - sizes[i]; 3209 ibdiag -= nodesz * nodesz; 3210 3211 /* set RHS */ 3212 if (xb == b) { 3213 /* whole (old way) */ 3214 sz = ii[row + 1] - ii[row]; 3215 idx = a->j + ii[row]; 3216 switch (nodesz) { 3217 case 5: 3218 v5 = a->a + ii[row - 4]; /* fall through */ 3219 case 4: 3220 v4 = a->a + ii[row - 3]; /* fall through */ 3221 case 3: 3222 v3 = a->a + ii[row - 2]; /* fall through */ 3223 case 2: 3224 v2 = a->a + ii[row - 1]; /* fall through */ 3225 case 1: 3226 v1 = a->a + ii[row]; 3227 break; 3228 default: 3229 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3230 } 3231 } else { 3232 /* upper, no diag */ 3233 sz = ii[row + 1] - diag[row] - 1; 3234 idx = a->j + diag[row] + 1; 3235 switch (nodesz) { 3236 case 5: 3237 v5 = a->a + diag[row - 4] + 5; /* fall through */ 3238 case 4: 3239 v4 = a->a + diag[row - 3] + 4; /* fall through */ 3240 case 3: 3241 v3 = a->a + diag[row - 2] + 3; /* fall through */ 3242 case 2: 3243 v2 = a->a + diag[row - 1] + 2; /* fall through */ 3244 case 1: 3245 v1 = a->a + diag[row] + 1; 3246 } 3247 } 3248 /* set sum */ 3249 switch (nodesz) { 3250 case 5: 3251 sum5 = xb[row - 4]; /* fall through */ 3252 case 4: 3253 sum4 = xb[row - 3]; /* fall through */ 3254 case 3: 3255 sum3 = xb[row - 2]; /* fall through */ 3256 case 2: 3257 sum2 = xb[row - 1]; /* fall through */ 3258 case 1: 3259 /* note that sum1 is associated with the last row */ 3260 sum1 = xb[row]; 3261 } 3262 /* do sums */ 3263 for (n = 0; n < sz - 1; n += 2) { 3264 i1 = idx[0]; 3265 i2 = idx[1]; 3266 idx += 2; 3267 tmp0 = x[i1]; 3268 tmp1 = x[i2]; 3269 switch (nodesz) { 3270 case 5: 3271 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3272 v5 += 2; /* fall through */ 3273 case 4: 3274 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3275 v4 += 2; /* fall through */ 3276 case 3: 3277 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3278 v3 += 2; /* fall through */ 3279 case 2: 3280 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3281 v2 += 2; /* fall through */ 3282 case 1: 3283 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3284 v1 += 2; 3285 } 3286 } 3287 /* ragged edge */ 3288 if (n == sz - 1) { 3289 tmp0 = x[*idx]; 3290 switch (nodesz) { 3291 case 5: 3292 sum5 -= *v5 * tmp0; /* fall through */ 3293 case 4: 3294 sum4 -= *v4 * tmp0; /* fall through */ 3295 case 3: 3296 sum3 -= *v3 * tmp0; /* fall through */ 3297 case 2: 3298 sum2 -= *v2 * tmp0; /* fall through */ 3299 case 1: 3300 sum1 -= *v1 * tmp0; 3301 } 3302 } 3303 /* update */ 3304 if (xb == b) { 3305 /* whole (old way) w/ diag */ 3306 switch (nodesz) { 3307 case 5: 3308 x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3309 x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3310 x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3311 x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3312 x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3313 break; 3314 case 4: 3315 x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3316 x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3317 x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3318 x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3319 break; 3320 case 3: 3321 x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3322 x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3323 x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3324 break; 3325 case 2: 3326 x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3327 x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3328 break; 3329 case 1: 3330 x[row--] += sum1 * (*ibdiag); 3331 break; 3332 } 3333 } else { 3334 /* no diag so set = */ 3335 switch (nodesz) { 3336 case 5: 3337 x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3338 x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3339 x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3340 x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3341 x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3342 break; 3343 case 4: 3344 x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3345 x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3346 x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3347 x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3348 break; 3349 case 3: 3350 x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3351 x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3352 x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3353 break; 3354 case 2: 3355 x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3356 x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3357 break; 3358 case 1: 3359 x[row--] = sum1 * (*ibdiag); 3360 break; 3361 } 3362 } 3363 } 3364 if (xb == b) { 3365 PetscCall(PetscLogFlops(2.0 * a->nz)); 3366 } else { 3367 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3368 } 3369 } 3370 } 3371 if (flag & SOR_EISENSTAT) { 3372 /* 3373 Apply (U + D)^-1 where D is now the block diagonal 3374 */ 3375 ibdiag = a->inode.ibdiag + a->inode.bdiagsize; 3376 for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) { 3377 nodesz = sizes[i + 1] - sizes[i]; 3378 ibdiag -= nodesz * nodesz; 3379 sz = ii[row + 1] - diag[row] - 1; 3380 v1 = a->a + diag[row] + 1; 3381 idx = a->j + diag[row] + 1; 3382 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3383 switch (nodesz) { 3384 case 1: 3385 3386 sum1 = b[row]; 3387 for (n = 0; n < sz - 1; n += 2) { 3388 i1 = idx[0]; 3389 i2 = idx[1]; 3390 idx += 2; 3391 tmp0 = x[i1]; 3392 tmp1 = x[i2]; 3393 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3394 v1 += 2; 3395 } 3396 3397 if (n == sz - 1) { 3398 tmp0 = x[*idx]; 3399 sum1 -= *v1 * tmp0; 3400 } 3401 x[row] = sum1 * (*ibdiag); 3402 row--; 3403 break; 3404 3405 case 2: 3406 3407 sum1 = b[row]; 3408 sum2 = b[row - 1]; 3409 /* note that sum1 is associated with the second of the two rows */ 3410 v2 = a->a + diag[row - 1] + 2; 3411 for (n = 0; n < sz - 1; n += 2) { 3412 i1 = idx[0]; 3413 i2 = idx[1]; 3414 idx += 2; 3415 tmp0 = x[i1]; 3416 tmp1 = x[i2]; 3417 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3418 v1 += 2; 3419 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3420 v2 += 2; 3421 } 3422 3423 if (n == sz - 1) { 3424 tmp0 = x[*idx]; 3425 sum1 -= *v1 * tmp0; 3426 sum2 -= *v2 * tmp0; 3427 } 3428 x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3]; 3429 x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2]; 3430 row -= 2; 3431 break; 3432 case 3: 3433 3434 sum1 = b[row]; 3435 sum2 = b[row - 1]; 3436 sum3 = b[row - 2]; 3437 v2 = a->a + diag[row - 1] + 2; 3438 v3 = a->a + diag[row - 2] + 3; 3439 for (n = 0; n < sz - 1; n += 2) { 3440 i1 = idx[0]; 3441 i2 = idx[1]; 3442 idx += 2; 3443 tmp0 = x[i1]; 3444 tmp1 = x[i2]; 3445 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3446 v1 += 2; 3447 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3448 v2 += 2; 3449 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3450 v3 += 2; 3451 } 3452 3453 if (n == sz - 1) { 3454 tmp0 = x[*idx]; 3455 sum1 -= *v1 * tmp0; 3456 sum2 -= *v2 * tmp0; 3457 sum3 -= *v3 * tmp0; 3458 } 3459 x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8]; 3460 x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7]; 3461 x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6]; 3462 row -= 3; 3463 break; 3464 case 4: 3465 3466 sum1 = b[row]; 3467 sum2 = b[row - 1]; 3468 sum3 = b[row - 2]; 3469 sum4 = b[row - 3]; 3470 v2 = a->a + diag[row - 1] + 2; 3471 v3 = a->a + diag[row - 2] + 3; 3472 v4 = a->a + diag[row - 3] + 4; 3473 for (n = 0; n < sz - 1; n += 2) { 3474 i1 = idx[0]; 3475 i2 = idx[1]; 3476 idx += 2; 3477 tmp0 = x[i1]; 3478 tmp1 = x[i2]; 3479 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3480 v1 += 2; 3481 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3482 v2 += 2; 3483 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3484 v3 += 2; 3485 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3486 v4 += 2; 3487 } 3488 3489 if (n == sz - 1) { 3490 tmp0 = x[*idx]; 3491 sum1 -= *v1 * tmp0; 3492 sum2 -= *v2 * tmp0; 3493 sum3 -= *v3 * tmp0; 3494 sum4 -= *v4 * tmp0; 3495 } 3496 x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15]; 3497 x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14]; 3498 x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13]; 3499 x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12]; 3500 row -= 4; 3501 break; 3502 case 5: 3503 3504 sum1 = b[row]; 3505 sum2 = b[row - 1]; 3506 sum3 = b[row - 2]; 3507 sum4 = b[row - 3]; 3508 sum5 = b[row - 4]; 3509 v2 = a->a + diag[row - 1] + 2; 3510 v3 = a->a + diag[row - 2] + 3; 3511 v4 = a->a + diag[row - 3] + 4; 3512 v5 = a->a + diag[row - 4] + 5; 3513 for (n = 0; n < sz - 1; n += 2) { 3514 i1 = idx[0]; 3515 i2 = idx[1]; 3516 idx += 2; 3517 tmp0 = x[i1]; 3518 tmp1 = x[i2]; 3519 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3520 v1 += 2; 3521 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3522 v2 += 2; 3523 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3524 v3 += 2; 3525 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3526 v4 += 2; 3527 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3528 v5 += 2; 3529 } 3530 3531 if (n == sz - 1) { 3532 tmp0 = x[*idx]; 3533 sum1 -= *v1 * tmp0; 3534 sum2 -= *v2 * tmp0; 3535 sum3 -= *v3 * tmp0; 3536 sum4 -= *v4 * tmp0; 3537 sum5 -= *v5 * tmp0; 3538 } 3539 x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24]; 3540 x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23]; 3541 x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22]; 3542 x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21]; 3543 x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20]; 3544 row -= 5; 3545 break; 3546 default: 3547 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3548 } 3549 } 3550 PetscCall(PetscLogFlops(a->nz)); 3551 3552 /* 3553 t = b - D x where D is the block diagonal 3554 */ 3555 cnt = 0; 3556 for (i = 0, row = 0; i < m; i++) { 3557 nodesz = sizes[i + 1] - sizes[i]; 3558 switch (nodesz) { 3559 case 1: 3560 t[row] = b[row] - bdiag[cnt++] * x[row]; 3561 row++; 3562 break; 3563 case 2: 3564 x1 = x[row]; 3565 x2 = x[row + 1]; 3566 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3567 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3568 t[row] = b[row] - tmp1; 3569 t[row + 1] = b[row + 1] - tmp2; 3570 row += 2; 3571 cnt += 4; 3572 break; 3573 case 3: 3574 x1 = x[row]; 3575 x2 = x[row + 1]; 3576 x3 = x[row + 2]; 3577 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3578 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3579 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3580 t[row] = b[row] - tmp1; 3581 t[row + 1] = b[row + 1] - tmp2; 3582 t[row + 2] = b[row + 2] - tmp3; 3583 row += 3; 3584 cnt += 9; 3585 break; 3586 case 4: 3587 x1 = x[row]; 3588 x2 = x[row + 1]; 3589 x3 = x[row + 2]; 3590 x4 = x[row + 3]; 3591 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3592 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3593 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3594 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3595 t[row] = b[row] - tmp1; 3596 t[row + 1] = b[row + 1] - tmp2; 3597 t[row + 2] = b[row + 2] - tmp3; 3598 t[row + 3] = b[row + 3] - tmp4; 3599 row += 4; 3600 cnt += 16; 3601 break; 3602 case 5: 3603 x1 = x[row]; 3604 x2 = x[row + 1]; 3605 x3 = x[row + 2]; 3606 x4 = x[row + 3]; 3607 x5 = x[row + 4]; 3608 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 3609 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 3610 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 3611 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 3612 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 3613 t[row] = b[row] - tmp1; 3614 t[row + 1] = b[row + 1] - tmp2; 3615 t[row + 2] = b[row + 2] - tmp3; 3616 t[row + 3] = b[row + 3] - tmp4; 3617 t[row + 4] = b[row + 4] - tmp5; 3618 row += 5; 3619 cnt += 25; 3620 break; 3621 default: 3622 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3623 } 3624 } 3625 PetscCall(PetscLogFlops(m)); 3626 3627 /* 3628 Apply (L + D)^-1 where D is the block diagonal 3629 */ 3630 for (i = 0, row = 0; i < m; i++) { 3631 nodesz = sizes[i + 1] - sizes[i]; 3632 sz = diag[row] - ii[row]; 3633 v1 = a->a + ii[row]; 3634 idx = a->j + ii[row]; 3635 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3636 switch (nodesz) { 3637 case 1: 3638 3639 sum1 = t[row]; 3640 for (n = 0; n < sz - 1; n += 2) { 3641 i1 = idx[0]; 3642 i2 = idx[1]; 3643 idx += 2; 3644 tmp0 = t[i1]; 3645 tmp1 = t[i2]; 3646 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3647 v1 += 2; 3648 } 3649 3650 if (n == sz - 1) { 3651 tmp0 = t[*idx]; 3652 sum1 -= *v1 * tmp0; 3653 } 3654 x[row] += t[row] = sum1 * (*ibdiag++); 3655 row++; 3656 break; 3657 case 2: 3658 v2 = a->a + ii[row + 1]; 3659 sum1 = t[row]; 3660 sum2 = t[row + 1]; 3661 for (n = 0; n < sz - 1; n += 2) { 3662 i1 = idx[0]; 3663 i2 = idx[1]; 3664 idx += 2; 3665 tmp0 = t[i1]; 3666 tmp1 = t[i2]; 3667 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3668 v1 += 2; 3669 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3670 v2 += 2; 3671 } 3672 3673 if (n == sz - 1) { 3674 tmp0 = t[*idx]; 3675 sum1 -= v1[0] * tmp0; 3676 sum2 -= v2[0] * tmp0; 3677 } 3678 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2]; 3679 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3]; 3680 ibdiag += 4; 3681 row += 2; 3682 break; 3683 case 3: 3684 v2 = a->a + ii[row + 1]; 3685 v3 = a->a + ii[row + 2]; 3686 sum1 = t[row]; 3687 sum2 = t[row + 1]; 3688 sum3 = t[row + 2]; 3689 for (n = 0; n < sz - 1; n += 2) { 3690 i1 = idx[0]; 3691 i2 = idx[1]; 3692 idx += 2; 3693 tmp0 = t[i1]; 3694 tmp1 = t[i2]; 3695 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3696 v1 += 2; 3697 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3698 v2 += 2; 3699 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3700 v3 += 2; 3701 } 3702 3703 if (n == sz - 1) { 3704 tmp0 = t[*idx]; 3705 sum1 -= v1[0] * tmp0; 3706 sum2 -= v2[0] * tmp0; 3707 sum3 -= v3[0] * tmp0; 3708 } 3709 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6]; 3710 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7]; 3711 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8]; 3712 ibdiag += 9; 3713 row += 3; 3714 break; 3715 case 4: 3716 v2 = a->a + ii[row + 1]; 3717 v3 = a->a + ii[row + 2]; 3718 v4 = a->a + ii[row + 3]; 3719 sum1 = t[row]; 3720 sum2 = t[row + 1]; 3721 sum3 = t[row + 2]; 3722 sum4 = t[row + 3]; 3723 for (n = 0; n < sz - 1; n += 2) { 3724 i1 = idx[0]; 3725 i2 = idx[1]; 3726 idx += 2; 3727 tmp0 = t[i1]; 3728 tmp1 = t[i2]; 3729 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3730 v1 += 2; 3731 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3732 v2 += 2; 3733 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3734 v3 += 2; 3735 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3736 v4 += 2; 3737 } 3738 3739 if (n == sz - 1) { 3740 tmp0 = t[*idx]; 3741 sum1 -= v1[0] * tmp0; 3742 sum2 -= v2[0] * tmp0; 3743 sum3 -= v3[0] * tmp0; 3744 sum4 -= v4[0] * tmp0; 3745 } 3746 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12]; 3747 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13]; 3748 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14]; 3749 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15]; 3750 ibdiag += 16; 3751 row += 4; 3752 break; 3753 case 5: 3754 v2 = a->a + ii[row + 1]; 3755 v3 = a->a + ii[row + 2]; 3756 v4 = a->a + ii[row + 3]; 3757 v5 = a->a + ii[row + 4]; 3758 sum1 = t[row]; 3759 sum2 = t[row + 1]; 3760 sum3 = t[row + 2]; 3761 sum4 = t[row + 3]; 3762 sum5 = t[row + 4]; 3763 for (n = 0; n < sz - 1; n += 2) { 3764 i1 = idx[0]; 3765 i2 = idx[1]; 3766 idx += 2; 3767 tmp0 = t[i1]; 3768 tmp1 = t[i2]; 3769 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; 3770 v1 += 2; 3771 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; 3772 v2 += 2; 3773 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; 3774 v3 += 2; 3775 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; 3776 v4 += 2; 3777 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; 3778 v5 += 2; 3779 } 3780 3781 if (n == sz - 1) { 3782 tmp0 = t[*idx]; 3783 sum1 -= v1[0] * tmp0; 3784 sum2 -= v2[0] * tmp0; 3785 sum3 -= v3[0] * tmp0; 3786 sum4 -= v4[0] * tmp0; 3787 sum5 -= v5[0] * tmp0; 3788 } 3789 x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20]; 3790 x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21]; 3791 x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22]; 3792 x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23]; 3793 x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24]; 3794 ibdiag += 25; 3795 row += 5; 3796 break; 3797 default: 3798 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3799 } 3800 } 3801 PetscCall(PetscLogFlops(a->nz)); 3802 } 3803 PetscCall(VecRestoreArray(xx, &x)); 3804 PetscCall(VecRestoreArrayRead(bb, &b)); 3805 PetscFunctionReturn(PETSC_SUCCESS); 3806 } 3807 3808 static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx) 3809 { 3810 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3811 PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5; 3812 const MatScalar *bdiag = a->inode.bdiag; 3813 const PetscScalar *b; 3814 PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz; 3815 const PetscInt *sizes = a->inode.size_csr; 3816 3817 PetscFunctionBegin; 3818 PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure"); 3819 PetscCall(VecGetArray(xx, &x)); 3820 PetscCall(VecGetArrayRead(bb, &b)); 3821 cnt = 0; 3822 for (i = 0, row = 0; i < m; i++) { 3823 nodesz = sizes[i + 1] - sizes[i]; 3824 switch (nodesz) { 3825 case 1: 3826 x[row] = b[row] * bdiag[cnt++]; 3827 row++; 3828 break; 3829 case 2: 3830 x1 = b[row]; 3831 x2 = b[row + 1]; 3832 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2]; 3833 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3]; 3834 x[row++] = tmp1; 3835 x[row++] = tmp2; 3836 cnt += 4; 3837 break; 3838 case 3: 3839 x1 = b[row]; 3840 x2 = b[row + 1]; 3841 x3 = b[row + 2]; 3842 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6]; 3843 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7]; 3844 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8]; 3845 x[row++] = tmp1; 3846 x[row++] = tmp2; 3847 x[row++] = tmp3; 3848 cnt += 9; 3849 break; 3850 case 4: 3851 x1 = b[row]; 3852 x2 = b[row + 1]; 3853 x3 = b[row + 2]; 3854 x4 = b[row + 3]; 3855 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12]; 3856 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13]; 3857 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14]; 3858 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15]; 3859 x[row++] = tmp1; 3860 x[row++] = tmp2; 3861 x[row++] = tmp3; 3862 x[row++] = tmp4; 3863 cnt += 16; 3864 break; 3865 case 5: 3866 x1 = b[row]; 3867 x2 = b[row + 1]; 3868 x3 = b[row + 2]; 3869 x4 = b[row + 3]; 3870 x5 = b[row + 4]; 3871 tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20]; 3872 tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21]; 3873 tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22]; 3874 tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23]; 3875 tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24]; 3876 x[row++] = tmp1; 3877 x[row++] = tmp2; 3878 x[row++] = tmp3; 3879 x[row++] = tmp4; 3880 x[row++] = tmp5; 3881 cnt += 25; 3882 break; 3883 default: 3884 SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz); 3885 } 3886 } 3887 PetscCall(PetscLogFlops(2.0 * cnt)); 3888 PetscCall(VecRestoreArray(xx, &x)); 3889 PetscCall(VecRestoreArrayRead(bb, &b)); 3890 PetscFunctionReturn(PETSC_SUCCESS); 3891 } 3892 3893 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 3894 { 3895 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3896 3897 PetscFunctionBegin; 3898 a->inode.node_count = 0; 3899 a->inode.use = PETSC_FALSE; 3900 a->inode.checked = PETSC_FALSE; 3901 a->inode.mat_nonzerostate = -1; 3902 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 3903 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 3904 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 3905 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 3906 A->ops->coloringpatch = NULL; 3907 A->ops->multdiagonalblock = NULL; 3908 if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace; 3909 PetscFunctionReturn(PETSC_SUCCESS); 3910 } 3911 3912 /* 3913 samestructure indicates that the matrix has not changed its nonzero structure so we 3914 do not need to recompute the inodes 3915 */ 3916 PetscErrorCode MatSeqAIJCheckInode(Mat A) 3917 { 3918 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 3919 PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size; 3920 PetscBool flag; 3921 const PetscInt *idx, *idy, *ii; 3922 3923 PetscFunctionBegin; 3924 if (!a->inode.use) { 3925 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 3926 PetscCall(PetscFree(a->inode.size_csr)); 3927 PetscFunctionReturn(PETSC_SUCCESS); 3928 } 3929 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS); 3930 3931 m = A->rmap->n; 3932 if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr)); 3933 ns = a->inode.size_csr; 3934 ns[0] = 0; 3935 3936 i = 0; 3937 node_count = 0; 3938 idx = a->j; 3939 ii = a->i; 3940 if (idx) { 3941 while (i < m) { /* For each row */ 3942 nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */ 3943 /* Limits the number of elements in a node to 'a->inode.limit' */ 3944 for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 3945 nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */ 3946 if (nzy != nzx) break; 3947 idy += nzx; /* Same nonzero pattern */ 3948 PetscCall(PetscArraycmp(idx, idy, nzx, &flag)); 3949 if (!flag) break; 3950 } 3951 ns[node_count + 1] = ns[node_count] + blk_size; 3952 node_count++; 3953 idx += blk_size * nzx; 3954 i = j; 3955 } 3956 } 3957 /* If not enough inodes found,, do not use inode version of the routines */ 3958 if (!m || !idx || node_count > .8 * m) { 3959 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 3960 PetscCall(PetscFree(a->inode.size_csr)); 3961 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 3962 } else { 3963 if (!A->factortype) { 3964 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3965 if (A->rmap->n == A->cmap->n) { 3966 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3967 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3968 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3969 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3970 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3971 } 3972 } else { 3973 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 3974 } 3975 a->inode.node_count = node_count; 3976 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 3977 } 3978 a->inode.checked = PETSC_TRUE; 3979 a->inode.mat_nonzerostate = A->nonzerostate; 3980 PetscFunctionReturn(PETSC_SUCCESS); 3981 } 3982 3983 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C) 3984 { 3985 Mat B = *C; 3986 Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data; 3987 PetscInt m = A->rmap->n; 3988 3989 PetscFunctionBegin; 3990 c->inode.use = a->inode.use; 3991 c->inode.limit = a->inode.limit; 3992 c->inode.max_limit = a->inode.max_limit; 3993 c->inode.checked = PETSC_FALSE; 3994 c->inode.size_csr = NULL; 3995 c->inode.node_count = 0; 3996 c->inode.ibdiag = NULL; 3997 c->inode.bdiag = NULL; 3998 c->inode.mat_nonzerostate = -1; 3999 if (a->inode.use) { 4000 if (a->inode.checked && a->inode.size_csr) { 4001 PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr)); 4002 PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1)); 4003 4004 c->inode.checked = PETSC_TRUE; 4005 c->inode.node_count = a->inode.node_count; 4006 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4007 } 4008 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4009 if (!B->factortype) { 4010 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4011 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4012 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4013 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4014 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4015 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4016 } else { 4017 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4018 } 4019 } 4020 PetscFunctionReturn(PETSC_SUCCESS); 4021 } 4022 4023 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row) 4024 { 4025 PetscInt k; 4026 const PetscInt *vi; 4027 4028 PetscFunctionBegin; 4029 vi = aj + ai[row]; 4030 for (k = 0; k < nzl; k++) cols[k] = vi[k]; 4031 vi = aj + adiag[row]; 4032 cols[nzl] = vi[0]; 4033 vi = aj + adiag[row + 1] + 1; 4034 for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k]; 4035 PetscFunctionReturn(PETSC_SUCCESS); 4036 } 4037 /* 4038 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4039 Modified from MatSeqAIJCheckInode(). 4040 4041 Input Parameters: 4042 . Mat A - ILU or LU matrix factor 4043 4044 */ 4045 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4046 { 4047 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4048 PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size; 4049 PetscInt *cols1, *cols2, *ns; 4050 const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag; 4051 PetscBool flag; 4052 4053 PetscFunctionBegin; 4054 if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS); 4055 if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS); 4056 4057 m = A->rmap->n; 4058 if (a->inode.size_csr) ns = a->inode.size_csr; 4059 else PetscCall(PetscMalloc1(m + 1, &ns)); 4060 ns[0] = 0; 4061 4062 i = 0; 4063 node_count = 0; 4064 PetscCall(PetscMalloc2(m, &cols1, m, &cols2)); 4065 while (i < m) { /* For each row */ 4066 nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */ 4067 nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4068 nzx = nzl1 + nzu1 + 1; 4069 PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i)); 4070 4071 /* Limits the number of elements in a node to 'a->inode.limit' */ 4072 for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) { 4073 nzl2 = ai[j + 1] - ai[j]; 4074 nzu2 = adiag[j] - adiag[j + 1] - 1; 4075 nzy = nzl2 + nzu2 + 1; 4076 if (nzy != nzx) break; 4077 PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j)); 4078 PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag)); 4079 if (!flag) break; 4080 } 4081 ns[node_count + 1] = ns[node_count] + blk_size; 4082 node_count++; 4083 i = j; 4084 } 4085 PetscCall(PetscFree2(cols1, cols2)); 4086 /* If not enough inodes found,, do not use inode version of the routines */ 4087 if (!m || node_count > .8 * m) { 4088 PetscCall(PetscFree(ns)); 4089 4090 a->inode.node_count = 0; 4091 a->inode.size_csr = NULL; 4092 a->inode.use = PETSC_FALSE; 4093 4094 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m)); 4095 } else { 4096 A->ops->mult = NULL; 4097 A->ops->sor = NULL; 4098 A->ops->multadd = NULL; 4099 A->ops->getrowij = NULL; 4100 A->ops->restorerowij = NULL; 4101 A->ops->getcolumnij = NULL; 4102 A->ops->restorecolumnij = NULL; 4103 A->ops->coloringpatch = NULL; 4104 A->ops->multdiagonalblock = NULL; 4105 a->inode.node_count = node_count; 4106 a->inode.size_csr = ns; 4107 PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit)); 4108 } 4109 a->inode.checked = PETSC_TRUE; 4110 PetscFunctionReturn(PETSC_SUCCESS); 4111 } 4112 4113 /* 4114 This is really ugly. if inodes are used this replaces the 4115 permutations with ones that correspond to rows/cols of the matrix 4116 rather than inode blocks 4117 */ 4118 PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm) 4119 { 4120 PetscFunctionBegin; 4121 PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm)); 4122 PetscFunctionReturn(PETSC_SUCCESS); 4123 } 4124 4125 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm) 4126 { 4127 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4128 PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count; 4129 const PetscInt *ridx, *cidx; 4130 PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx; 4131 PetscInt nslim_col, *ns_col; 4132 IS ris = *rperm, cis = *cperm; 4133 4134 PetscFunctionBegin; 4135 if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */ 4136 if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */ 4137 4138 PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col)); 4139 PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns)); 4140 PetscCall(PetscMalloc2(m, &permr, n, &permc)); 4141 4142 PetscCall(ISGetIndices(ris, &ridx)); 4143 PetscCall(ISGetIndices(cis, &cidx)); 4144 4145 /* Form the inode structure for the rows of permuted matrix using inv perm*/ 4146 for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]); 4147 4148 /* Construct the permutations for rows*/ 4149 for (i = 0, row = 0; i < nslim_row; ++i) { 4150 indx = ridx[i]; 4151 start_val = tns[indx]; 4152 end_val = tns[indx + 1]; 4153 for (j = start_val; j < end_val; ++j, ++row) permr[row] = j; 4154 } 4155 4156 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4157 for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]); 4158 4159 /* Construct permutations for columns */ 4160 for (i = 0, col = 0; i < nslim_col; ++i) { 4161 indx = cidx[i]; 4162 start_val = tns[indx]; 4163 end_val = tns[indx + 1]; 4164 for (j = start_val; j < end_val; ++j, ++col) permc[col] = j; 4165 } 4166 4167 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm)); 4168 PetscCall(ISSetPermutation(*rperm)); 4169 PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm)); 4170 PetscCall(ISSetPermutation(*cperm)); 4171 4172 PetscCall(ISRestoreIndices(ris, &ridx)); 4173 PetscCall(ISRestoreIndices(cis, &cidx)); 4174 4175 PetscCall(PetscFree(ns_col)); 4176 PetscCall(PetscFree2(permr, permc)); 4177 PetscCall(ISDestroy(&cis)); 4178 PetscCall(ISDestroy(&ris)); 4179 PetscCall(PetscFree(tns)); 4180 PetscFunctionReturn(PETSC_SUCCESS); 4181 } 4182 4183 /*@C 4184 MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes 4185 4186 Not Collective 4187 4188 Input Parameter: 4189 . A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ` 4190 4191 Output Parameters: 4192 + node_count - no of inodes present in the matrix. 4193 . sizes - an array of size `node_count`, with the sizes of each inode. 4194 - limit - the max size used to generate the inodes. 4195 4196 Level: advanced 4197 4198 Note: 4199 It should be called after the matrix is assembled. 4200 The contents of the sizes[] array should not be changed. 4201 `NULL` may be passed for information not needed 4202 4203 .seealso: [](ch_matrices), `Mat`, `MatGetInfo()` 4204 @*/ 4205 PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4206 { 4207 PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *); 4208 4209 PetscFunctionBegin; 4210 PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix"); 4211 PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f)); 4212 if (f) PetscCall((*f)(A, node_count, sizes, limit)); 4213 PetscFunctionReturn(PETSC_SUCCESS); 4214 } 4215 4216 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit) 4217 { 4218 Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data; 4219 4220 PetscFunctionBegin; 4221 if (node_count) *node_count = a->inode.node_count; 4222 if (sizes) *sizes = a->inode.size_csr; 4223 if (limit) *limit = a->inode.limit; 4224 PetscFunctionReturn(PETSC_SUCCESS); 4225 } 4226