/*
  This file provides high performance routines for the Inode format (compressed sparse row)
  by taking advantage of rows with identical nonzero structure (I-nodes).
*/
5c6db04a5SJed Brown #include <../src/mat/impls/aij/seq/aij.h>
6fb56d528SJed Brown #if defined(PETSC_HAVE_XMMINTRIN_H)
7fb56d528SJed Brown #include <xmmintrin.h>
8fb56d528SJed Brown #endif
94c1414c8SBarry Smith
MatCreateColInode_Private(Mat A,PetscInt * size,PetscInt ** ns)10d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatCreateColInode_Private(Mat A, PetscInt *size, PetscInt **ns)
11d71ae5a4SJacob Faibussowitsch {
124c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
134c1414c8SBarry Smith PetscInt i, count, m, n, min_mn, *ns_row, *ns_col;
144c1414c8SBarry Smith
154c1414c8SBarry Smith PetscFunctionBegin;
16d0f46423SBarry Smith n = A->cmap->n;
17d0f46423SBarry Smith m = A->rmap->n;
184d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
194d12350bSJunchao Zhang ns_row = a->inode.size_csr;
204c1414c8SBarry Smith
214c1414c8SBarry Smith min_mn = (m < n) ? m : n;
224c1414c8SBarry Smith if (!ns) {
234d12350bSJunchao Zhang for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++);
24fbccb6d4SPierre Jolivet for (; count + 1 < n; count++, i++);
25ad540459SPierre Jolivet if (count < n) i++;
264c1414c8SBarry Smith *size = i;
273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
284c1414c8SBarry Smith }
299566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &ns_col));
304d12350bSJunchao Zhang ns_col[0] = 0;
314c1414c8SBarry Smith
324c1414c8SBarry Smith /* Use the same row structure wherever feasible. */
334d12350bSJunchao Zhang for (count = 0, i = 0; count < min_mn; count += (ns_row[i + 1] - ns_row[i]), i++) ns_col[i + 1] = ns_row[i + 1];
344c1414c8SBarry Smith
354c1414c8SBarry Smith /* if m < n; pad up the remainder with inode_limit */
364d12350bSJunchao Zhang for (; count + 1 < n; count++, i++) ns_col[i + 1] = ns_col[i] + 1;
37aaa8cc7dSPierre Jolivet /* The last node is the odd ball. pad it up with the remaining rows; */
384c1414c8SBarry Smith if (count < n) {
394d12350bSJunchao Zhang ns_col[i + 1] = ns_col[i] + (n - count);
404c1414c8SBarry Smith i++;
414c1414c8SBarry Smith } else if (count > n) {
424c1414c8SBarry Smith /* Adjust for the over estimation */
434d12350bSJunchao Zhang ns_col[i] += n - count;
444c1414c8SBarry Smith }
454c1414c8SBarry Smith *size = i;
464c1414c8SBarry Smith *ns = ns_col;
473ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
484c1414c8SBarry Smith }
494c1414c8SBarry Smith
/*
  This builds the symmetric version of the nonzero structure.
*/
MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)53d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
54d71ae5a4SJacob Faibussowitsch {
554c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
568758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, nslim_col, m, row, col, n;
574d12350bSJunchao Zhang PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, *ns_col, nsz, i1, i2;
588758e1faSBarry Smith const PetscInt *j, *jmax, *ai = a->i, *aj = a->j;
594c1414c8SBarry Smith
604c1414c8SBarry Smith PetscFunctionBegin;
614c1414c8SBarry Smith nslim_row = a->inode.node_count;
62d0f46423SBarry Smith m = A->rmap->n;
63d0f46423SBarry Smith n = A->cmap->n;
6408401ef6SPierre Jolivet PetscCheck(m == n, PETSC_COMM_SELF, PETSC_ERR_SUP, "MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square");
654d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
664c1414c8SBarry Smith
674c1414c8SBarry Smith /* Use the row_inode as column_inode */
684c1414c8SBarry Smith nslim_col = nslim_row;
694c1414c8SBarry Smith ns_col = ns_row;
704c1414c8SBarry Smith
7135cb6cd3SPierre Jolivet /* allocate space for reformatted inode structure */
729566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
734d12350bSJunchao Zhang for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_row[i1 + 1] - ns_row[i1]);
744c1414c8SBarry Smith
754c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
764d12350bSJunchao Zhang nsz = ns_col[i1 + 1] - ns_col[i1];
772205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
784c1414c8SBarry Smith }
794c1414c8SBarry Smith /* allocate space for row pointers */
809566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_row + 1, &ia));
814c1414c8SBarry Smith *iia = ia;
829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work));
834c1414c8SBarry Smith
844c1414c8SBarry Smith /* determine the number of columns in each row */
854c1414c8SBarry Smith ia[0] = oshift;
864d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) {
874d12350bSJunchao Zhang row = ns_row[i1];
884c1414c8SBarry Smith j = aj + ai[row] + ishift;
894c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift;
9083fed2edSSatish Balay if (j == jmax) continue; /* empty row */
914c1414c8SBarry Smith col = *j++ + ishift;
924c1414c8SBarry Smith i2 = tvc[col];
936aad120cSJose E. Roman while (i2 < i1 && j < jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */
944c1414c8SBarry Smith ia[i1 + 1]++;
954c1414c8SBarry Smith ia[i2 + 1]++;
964c1414c8SBarry Smith i2++; /* Start col of next node */
9790d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j;
984c1414c8SBarry Smith i2 = tvc[col];
994c1414c8SBarry Smith }
1004c1414c8SBarry Smith if (i2 == i1) ia[i2 + 1]++; /* now the diagonal element */
1014c1414c8SBarry Smith }
1024c1414c8SBarry Smith
1034c1414c8SBarry Smith /* shift ia[i] to point to next row */
1044c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) {
1054c1414c8SBarry Smith row = ia[i1 - 1];
1064c1414c8SBarry Smith ia[i1] += row;
1074c1414c8SBarry Smith work[i1 - 1] = row - oshift;
1084c1414c8SBarry Smith }
1094c1414c8SBarry Smith
1104c1414c8SBarry Smith /* allocate space for column pointers */
1114c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift);
1129566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja));
1134c1414c8SBarry Smith *jja = ja;
1144c1414c8SBarry Smith
1154c1414c8SBarry Smith /* loop over lower triangular part putting into ja */
1164d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) {
1174d12350bSJunchao Zhang row = ns_row[i1];
1184c1414c8SBarry Smith j = aj + ai[row] + ishift;
1194c1414c8SBarry Smith jmax = aj + ai[row + 1] + ishift;
12083fed2edSSatish Balay if (j == jmax) continue; /* empty row */
1214c1414c8SBarry Smith col = *j++ + ishift;
1224c1414c8SBarry Smith i2 = tvc[col];
1234c1414c8SBarry Smith while (i2 < i1 && j < jmax) {
1244c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift;
1254c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift;
1264c1414c8SBarry Smith ++i2;
12790d2dec7SBarry Smith while ((j < jmax) && ((col = *j + ishift) < tns[i2])) ++j; /* Skip rest col indices in this node */
1284c1414c8SBarry Smith i2 = tvc[col];
1294c1414c8SBarry Smith }
1304c1414c8SBarry Smith if (i2 == i1) ja[work[i1]++] = i2 + oshift;
1314c1414c8SBarry Smith }
1329566063dSJacob Faibussowitsch PetscCall(PetscFree(work));
1339566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc));
1343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
1354c1414c8SBarry Smith }
1364c1414c8SBarry Smith
/*
  This builds the nonsymmetric version of the nonzero structure.
*/
MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)140d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
141d71ae5a4SJacob Faibussowitsch {
1424c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1438758e1faSBarry Smith PetscInt *work, *ia, *ja, nz, nslim_row, n, row, col, *ns_col, nslim_col;
1448758e1faSBarry Smith PetscInt *tns, *tvc, nsz, i1, i2;
1454d12350bSJunchao Zhang const PetscInt *j, *ai = a->i, *aj = a->j, *ns_row = a->inode.size_csr;
1464c1414c8SBarry Smith
1474c1414c8SBarry Smith PetscFunctionBegin;
1484d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1494c1414c8SBarry Smith nslim_row = a->inode.node_count;
150d0f46423SBarry Smith n = A->cmap->n;
1514c1414c8SBarry Smith
1524c1414c8SBarry Smith /* Create The column_inode for this matrix */
1539566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
1544c1414c8SBarry Smith
15535cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */
1569566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
1574d12350bSJunchao Zhang for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
1584c1414c8SBarry Smith
1594c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
1604d12350bSJunchao Zhang nsz = ns_col[i1 + 1] - ns_col[i1];
1612205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
1624c1414c8SBarry Smith }
1634c1414c8SBarry Smith /* allocate space for row pointers */
1649566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_row + 1, &ia));
1654c1414c8SBarry Smith *iia = ia;
1669566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_row + 1, &work));
1674c1414c8SBarry Smith
1684c1414c8SBarry Smith /* determine the number of columns in each row */
1694c1414c8SBarry Smith ia[0] = oshift;
1704d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) {
1714d12350bSJunchao Zhang row = ns_row[i1];
1724c1414c8SBarry Smith j = aj + ai[row] + ishift;
17383fed2edSSatish Balay nz = ai[row + 1] - ai[row];
17483fed2edSSatish Balay if (!nz) continue; /* empty row */
1754c1414c8SBarry Smith col = *j++ + ishift;
1764c1414c8SBarry Smith i2 = tvc[col];
1776aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */
1784c1414c8SBarry Smith ia[i1 + 1]++;
1794c1414c8SBarry Smith i2++; /* Start col of next node */
180a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
1814c1414c8SBarry Smith if (nz > 0) i2 = tvc[col];
1824c1414c8SBarry Smith }
1834c1414c8SBarry Smith }
1844c1414c8SBarry Smith
1854c1414c8SBarry Smith /* shift ia[i] to point to next row */
1864c1414c8SBarry Smith for (i1 = 1; i1 < nslim_row + 1; i1++) {
1874c1414c8SBarry Smith row = ia[i1 - 1];
1884c1414c8SBarry Smith ia[i1] += row;
1894c1414c8SBarry Smith work[i1 - 1] = row - oshift;
1904c1414c8SBarry Smith }
1914c1414c8SBarry Smith
1924c1414c8SBarry Smith /* allocate space for column pointers */
1934c1414c8SBarry Smith nz = ia[nslim_row] + (!ishift);
1949566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja));
1954c1414c8SBarry Smith *jja = ja;
1964c1414c8SBarry Smith
1974c1414c8SBarry Smith /* loop over matrix putting into ja */
1984d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) {
1994d12350bSJunchao Zhang row = ns_row[i1];
2004c1414c8SBarry Smith j = aj + ai[row] + ishift;
20183fed2edSSatish Balay nz = ai[row + 1] - ai[row];
20283fed2edSSatish Balay if (!nz) continue; /* empty row */
2034c1414c8SBarry Smith col = *j++ + ishift;
2044c1414c8SBarry Smith i2 = tvc[col];
2054c1414c8SBarry Smith while (nz-- > 0) {
2064c1414c8SBarry Smith ja[work[i1]++] = i2 + oshift;
2074c1414c8SBarry Smith ++i2;
208a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2094c1414c8SBarry Smith if (nz > 0) i2 = tvc[col];
2104c1414c8SBarry Smith }
2114c1414c8SBarry Smith }
2129566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col));
2139566063dSJacob Faibussowitsch PetscCall(PetscFree(work));
2149566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc));
2153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
2164c1414c8SBarry Smith }
2174c1414c8SBarry Smith
MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)218d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
219d71ae5a4SJacob Faibussowitsch {
2204c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2214c1414c8SBarry Smith
2224c1414c8SBarry Smith PetscFunctionBegin;
22350ba90b4SBarry Smith if (n) *n = a->inode.node_count;
2243ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2258f7157efSSatish Balay if (!blockcompressed) {
2269566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2278f7157efSSatish Balay } else if (symmetric) {
2289566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
2294c1414c8SBarry Smith } else {
2309566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
2314c1414c8SBarry Smith }
2323ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
2334c1414c8SBarry Smith }
2344c1414c8SBarry Smith
MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)235d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
236d71ae5a4SJacob Faibussowitsch {
2374c1414c8SBarry Smith PetscFunctionBegin;
2383ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
2398f7157efSSatish Balay
2408f7157efSSatish Balay if (!blockcompressed) {
2419566063dSJacob Faibussowitsch PetscCall(MatRestoreRowIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
2428f7157efSSatish Balay } else {
2439566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia));
2449566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja));
2458f7157efSSatish Balay }
2463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
2474c1414c8SBarry Smith }
2484c1414c8SBarry Smith
MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt * iia[],const PetscInt * jja[],PetscInt ishift,PetscInt oshift)249d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A, const PetscInt *iia[], const PetscInt *jja[], PetscInt ishift, PetscInt oshift)
250d71ae5a4SJacob Faibussowitsch {
2514c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
2524c1414c8SBarry Smith PetscInt *work, *ia, *ja, *j, nz, nslim_row, n, row, col, *ns_col, nslim_col;
2534d12350bSJunchao Zhang PetscInt *tns, *tvc, *ns_row = a->inode.size_csr, nsz, i1, i2, *ai = a->i, *aj = a->j;
2544c1414c8SBarry Smith
2554c1414c8SBarry Smith PetscFunctionBegin;
2564d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
2574c1414c8SBarry Smith nslim_row = a->inode.node_count;
258d0f46423SBarry Smith n = A->cmap->n;
2594c1414c8SBarry Smith
2604c1414c8SBarry Smith /* Create The column_inode for this matrix */
2619566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
2624c1414c8SBarry Smith
26335cb6cd3SPierre Jolivet /* allocate space for reformatted column_inode structure */
2649566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(nslim_col + 1, &tns, n + 1, &tvc));
2654d12350bSJunchao Zhang for (i1 = 0, tns[0] = 0; i1 < nslim_col; ++i1) tns[i1 + 1] = tns[i1] + (ns_col[i1 + 1] - ns_col[i1]);
2664c1414c8SBarry Smith
2674c1414c8SBarry Smith for (i1 = 0, col = 0; i1 < nslim_col; ++i1) {
2684d12350bSJunchao Zhang nsz = ns_col[i1 + 1] - ns_col[i1];
2692205254eSKarl Rupp for (i2 = 0; i2 < nsz; ++i2, ++col) tvc[col] = i1;
2704c1414c8SBarry Smith }
2714c1414c8SBarry Smith /* allocate space for column pointers */
2729566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(nslim_col + 1, &ia));
2734c1414c8SBarry Smith *iia = ia;
2749566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nslim_col + 1, &work));
2754c1414c8SBarry Smith
2764c1414c8SBarry Smith /* determine the number of columns in each row */
2774c1414c8SBarry Smith ia[0] = oshift;
2784d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) {
2794d12350bSJunchao Zhang row = ns_row[i1];
2804c1414c8SBarry Smith j = aj + ai[row] + ishift;
2814c1414c8SBarry Smith col = *j++ + ishift;
2824c1414c8SBarry Smith i2 = tvc[col];
2834c1414c8SBarry Smith nz = ai[row + 1] - ai[row];
2846aad120cSJose E. Roman while (nz-- > 0) { /* off-diagonal elements */
2854c1414c8SBarry Smith /* ia[i1+1]++; */
2864c1414c8SBarry Smith ia[i2 + 1]++;
2874c1414c8SBarry Smith i2++;
288a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
2894c1414c8SBarry Smith if (nz > 0) i2 = tvc[col];
2904c1414c8SBarry Smith }
2914c1414c8SBarry Smith }
2924c1414c8SBarry Smith
2934c1414c8SBarry Smith /* shift ia[i] to point to next col */
2944c1414c8SBarry Smith for (i1 = 1; i1 < nslim_col + 1; i1++) {
2954c1414c8SBarry Smith col = ia[i1 - 1];
2964c1414c8SBarry Smith ia[i1] += col;
2974c1414c8SBarry Smith work[i1 - 1] = col - oshift;
2984c1414c8SBarry Smith }
2994c1414c8SBarry Smith
3004c1414c8SBarry Smith /* allocate space for column pointers */
3014c1414c8SBarry Smith nz = ia[nslim_col] + (!ishift);
3029566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nz, &ja));
3034c1414c8SBarry Smith *jja = ja;
3044c1414c8SBarry Smith
3054c1414c8SBarry Smith /* loop over matrix putting into ja */
3064d12350bSJunchao Zhang for (i1 = 0; i1 < nslim_row; i1++) {
3074d12350bSJunchao Zhang row = ns_row[i1];
3084c1414c8SBarry Smith j = aj + ai[row] + ishift;
3094c1414c8SBarry Smith col = *j++ + ishift;
3104c1414c8SBarry Smith i2 = tvc[col];
3114c1414c8SBarry Smith nz = ai[row + 1] - ai[row];
3124c1414c8SBarry Smith while (nz-- > 0) {
3134c1414c8SBarry Smith /* ja[work[i1]++] = i2 + oshift; */
3144c1414c8SBarry Smith ja[work[i2]++] = i1 + oshift;
3154c1414c8SBarry Smith i2++;
316a8e3a797SJed Brown while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--;
3174c1414c8SBarry Smith if (nz > 0) i2 = tvc[col];
3184c1414c8SBarry Smith }
3194c1414c8SBarry Smith }
3209566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col));
3219566063dSJacob Faibussowitsch PetscCall(PetscFree(work));
3229566063dSJacob Faibussowitsch PetscCall(PetscFree2(tns, tvc));
3233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
3244c1414c8SBarry Smith }
3254c1414c8SBarry Smith
MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)326d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
327d71ae5a4SJacob Faibussowitsch {
3284c1414c8SBarry Smith PetscFunctionBegin;
3299566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, n, NULL));
3303ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3314c1414c8SBarry Smith
3328f7157efSSatish Balay if (!blockcompressed) {
3339566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3348f7157efSSatish Balay } else if (symmetric) {
335a5b23f4aSJose E. Roman /* Since the indices are symmetric it doesn't matter */
3369566063dSJacob Faibussowitsch PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A, ia, ja, 0, oshift));
3374c1414c8SBarry Smith } else {
3389566063dSJacob Faibussowitsch PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A, ia, ja, 0, oshift));
3394c1414c8SBarry Smith }
3403ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
3414c1414c8SBarry Smith }
3424c1414c8SBarry Smith
MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt * n,const PetscInt * ia[],const PetscInt * ja[],PetscBool * done)343d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A, PetscInt oshift, PetscBool symmetric, PetscBool blockcompressed, PetscInt *n, const PetscInt *ia[], const PetscInt *ja[], PetscBool *done)
344d71ae5a4SJacob Faibussowitsch {
3454c1414c8SBarry Smith PetscFunctionBegin;
3463ba16761SJacob Faibussowitsch if (!ia) PetscFunctionReturn(PETSC_SUCCESS);
3478f7157efSSatish Balay if (!blockcompressed) {
3489566063dSJacob Faibussowitsch PetscCall(MatRestoreColumnIJ_SeqAIJ(A, oshift, symmetric, blockcompressed, n, ia, ja, done));
3498f7157efSSatish Balay } else {
3509566063dSJacob Faibussowitsch PetscCall(PetscFree(*ia));
3519566063dSJacob Faibussowitsch PetscCall(PetscFree(*ja));
3528f7157efSSatish Balay }
3533ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
3544c1414c8SBarry Smith }
3554c1414c8SBarry Smith
MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy)356d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMult_SeqAIJ_Inode(Mat A, Vec xx, Vec yy)
357d71ae5a4SJacob Faibussowitsch {
3584c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
359d9fead3dSBarry Smith PetscScalar *y;
360dd6ea824SBarry Smith const PetscScalar *x;
361708a0e70SJunchao Zhang PetscInt row, node_max, nonzerorow = 0;
362708a0e70SJunchao Zhang PetscInt *ns;
3634c1414c8SBarry Smith
3644c1414c8SBarry Smith PetscFunctionBegin;
3654d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
3664c1414c8SBarry Smith node_max = a->inode.node_count;
3674d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */
3689566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x));
3699566063dSJacob Faibussowitsch PetscCall(VecGetArray(yy, &y));
3704c1414c8SBarry Smith
371708a0e70SJunchao Zhang PetscPragmaUseOMPKernels(parallel for private(row) reduction(+:nonzerorow))
372708a0e70SJunchao Zhang for (PetscInt i = 0; i < node_max; ++i) {
373708a0e70SJunchao Zhang PetscInt i1, i2, nsz, n, sz;
374708a0e70SJunchao Zhang const MatScalar *v1, *v2, *v3, *v4, *v5;
375708a0e70SJunchao Zhang PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
376708a0e70SJunchao Zhang const PetscInt *idx;
377708a0e70SJunchao Zhang
378708a0e70SJunchao Zhang #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
379708a0e70SJunchao Zhang #pragma disjoint(*x, *y, *v1, *v2, *v3, *v4, *v5)
380708a0e70SJunchao Zhang #endif
381708a0e70SJunchao Zhang row = ns[i];
3824d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i];
383708a0e70SJunchao Zhang n = a->i[row + 1] - a->i[row];
38498c9bda7SSatish Balay nonzerorow += (n > 0) * nsz;
385708a0e70SJunchao Zhang
386708a0e70SJunchao Zhang idx = &a->j[a->i[row]];
387708a0e70SJunchao Zhang v1 = &a->a[a->i[row]];
38850d8bf02SJed Brown PetscPrefetchBlock(idx + nsz * n, n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */
38950d8bf02SJed Brown PetscPrefetchBlock(v1 + nsz * n, nsz * n, 0, PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */
3904c1414c8SBarry Smith sz = n; /* No of non zeros in this row */
3914c1414c8SBarry Smith /* Switch on the size of Node */
3924c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */
3934c1414c8SBarry Smith case 1:
39475567043SBarry Smith sum1 = 0.;
3954c1414c8SBarry Smith
3964c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
3974c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */
3984c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */
3994c1414c8SBarry Smith idx += 2;
4004c1414c8SBarry Smith tmp0 = x[i1];
4014c1414c8SBarry Smith tmp1 = x[i2];
4029371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4039371c9d4SSatish Balay v1 += 2;
4044c1414c8SBarry Smith }
4054c1414c8SBarry Smith
4064c1414c8SBarry Smith if (n == sz - 1) { /* Take care of the last nonzero */
4074c1414c8SBarry Smith tmp0 = x[*idx++];
4084c1414c8SBarry Smith sum1 += *v1++ * tmp0;
4094c1414c8SBarry Smith }
4104c1414c8SBarry Smith y[row++] = sum1;
4114c1414c8SBarry Smith break;
4124c1414c8SBarry Smith case 2:
41375567043SBarry Smith sum1 = 0.;
41475567043SBarry Smith sum2 = 0.;
4154c1414c8SBarry Smith v2 = v1 + n;
4164c1414c8SBarry Smith
4174c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
4184c1414c8SBarry Smith i1 = idx[0];
4194c1414c8SBarry Smith i2 = idx[1];
4204c1414c8SBarry Smith idx += 2;
4214c1414c8SBarry Smith tmp0 = x[i1];
4224c1414c8SBarry Smith tmp1 = x[i2];
4239371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4249371c9d4SSatish Balay v1 += 2;
4259371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4269371c9d4SSatish Balay v2 += 2;
4274c1414c8SBarry Smith }
4284c1414c8SBarry Smith if (n == sz - 1) {
4294c1414c8SBarry Smith tmp0 = x[*idx++];
4304c1414c8SBarry Smith sum1 += *v1++ * tmp0;
4314c1414c8SBarry Smith sum2 += *v2++ * tmp0;
4324c1414c8SBarry Smith }
4334c1414c8SBarry Smith y[row++] = sum1;
4344c1414c8SBarry Smith y[row++] = sum2;
4354c1414c8SBarry Smith v1 = v2; /* Since the next block to be processed starts there*/
4364c1414c8SBarry Smith idx += sz;
4374c1414c8SBarry Smith break;
4384c1414c8SBarry Smith case 3:
43975567043SBarry Smith sum1 = 0.;
44075567043SBarry Smith sum2 = 0.;
44175567043SBarry Smith sum3 = 0.;
4424c1414c8SBarry Smith v2 = v1 + n;
4434c1414c8SBarry Smith v3 = v2 + n;
4444c1414c8SBarry Smith
4454c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
4464c1414c8SBarry Smith i1 = idx[0];
4474c1414c8SBarry Smith i2 = idx[1];
4484c1414c8SBarry Smith idx += 2;
4494c1414c8SBarry Smith tmp0 = x[i1];
4504c1414c8SBarry Smith tmp1 = x[i2];
4519371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4529371c9d4SSatish Balay v1 += 2;
4539371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4549371c9d4SSatish Balay v2 += 2;
4559371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4569371c9d4SSatish Balay v3 += 2;
4574c1414c8SBarry Smith }
4584c1414c8SBarry Smith if (n == sz - 1) {
4594c1414c8SBarry Smith tmp0 = x[*idx++];
4604c1414c8SBarry Smith sum1 += *v1++ * tmp0;
4614c1414c8SBarry Smith sum2 += *v2++ * tmp0;
4624c1414c8SBarry Smith sum3 += *v3++ * tmp0;
4634c1414c8SBarry Smith }
4644c1414c8SBarry Smith y[row++] = sum1;
4654c1414c8SBarry Smith y[row++] = sum2;
4664c1414c8SBarry Smith y[row++] = sum3;
4674c1414c8SBarry Smith v1 = v3; /* Since the next block to be processed starts there*/
4684c1414c8SBarry Smith idx += 2 * sz;
4694c1414c8SBarry Smith break;
4704c1414c8SBarry Smith case 4:
47175567043SBarry Smith sum1 = 0.;
47275567043SBarry Smith sum2 = 0.;
47375567043SBarry Smith sum3 = 0.;
47475567043SBarry Smith sum4 = 0.;
4754c1414c8SBarry Smith v2 = v1 + n;
4764c1414c8SBarry Smith v3 = v2 + n;
4774c1414c8SBarry Smith v4 = v3 + n;
4784c1414c8SBarry Smith
4794c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
4804c1414c8SBarry Smith i1 = idx[0];
4814c1414c8SBarry Smith i2 = idx[1];
4824c1414c8SBarry Smith idx += 2;
4834c1414c8SBarry Smith tmp0 = x[i1];
4844c1414c8SBarry Smith tmp1 = x[i2];
4859371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
4869371c9d4SSatish Balay v1 += 2;
4879371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1;
4889371c9d4SSatish Balay v2 += 2;
4899371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1;
4909371c9d4SSatish Balay v3 += 2;
4919371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1;
4929371c9d4SSatish Balay v4 += 2;
4934c1414c8SBarry Smith }
4944c1414c8SBarry Smith if (n == sz - 1) {
4954c1414c8SBarry Smith tmp0 = x[*idx++];
4964c1414c8SBarry Smith sum1 += *v1++ * tmp0;
4974c1414c8SBarry Smith sum2 += *v2++ * tmp0;
4984c1414c8SBarry Smith sum3 += *v3++ * tmp0;
4994c1414c8SBarry Smith sum4 += *v4++ * tmp0;
5004c1414c8SBarry Smith }
5014c1414c8SBarry Smith y[row++] = sum1;
5024c1414c8SBarry Smith y[row++] = sum2;
5034c1414c8SBarry Smith y[row++] = sum3;
5044c1414c8SBarry Smith y[row++] = sum4;
5054c1414c8SBarry Smith v1 = v4; /* Since the next block to be processed starts there*/
5064c1414c8SBarry Smith idx += 3 * sz;
5074c1414c8SBarry Smith break;
5084c1414c8SBarry Smith case 5:
50975567043SBarry Smith sum1 = 0.;
51075567043SBarry Smith sum2 = 0.;
51175567043SBarry Smith sum3 = 0.;
51275567043SBarry Smith sum4 = 0.;
51375567043SBarry Smith sum5 = 0.;
5144c1414c8SBarry Smith v2 = v1 + n;
5154c1414c8SBarry Smith v3 = v2 + n;
5164c1414c8SBarry Smith v4 = v3 + n;
5174c1414c8SBarry Smith v5 = v4 + n;
5184c1414c8SBarry Smith
5194c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
5204c1414c8SBarry Smith i1 = idx[0];
5214c1414c8SBarry Smith i2 = idx[1];
5224c1414c8SBarry Smith idx += 2;
5234c1414c8SBarry Smith tmp0 = x[i1];
5244c1414c8SBarry Smith tmp1 = x[i2];
5259371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
5269371c9d4SSatish Balay v1 += 2;
5279371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1;
5289371c9d4SSatish Balay v2 += 2;
5299371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1;
5309371c9d4SSatish Balay v3 += 2;
5319371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1;
5329371c9d4SSatish Balay v4 += 2;
5339371c9d4SSatish Balay sum5 += v5[0] * tmp0 + v5[1] * tmp1;
5349371c9d4SSatish Balay v5 += 2;
5354c1414c8SBarry Smith }
5364c1414c8SBarry Smith if (n == sz - 1) {
5374c1414c8SBarry Smith tmp0 = x[*idx++];
5384c1414c8SBarry Smith sum1 += *v1++ * tmp0;
5394c1414c8SBarry Smith sum2 += *v2++ * tmp0;
5404c1414c8SBarry Smith sum3 += *v3++ * tmp0;
5414c1414c8SBarry Smith sum4 += *v4++ * tmp0;
5424c1414c8SBarry Smith sum5 += *v5++ * tmp0;
5434c1414c8SBarry Smith }
5444c1414c8SBarry Smith y[row++] = sum1;
5454c1414c8SBarry Smith y[row++] = sum2;
5464c1414c8SBarry Smith y[row++] = sum3;
5474c1414c8SBarry Smith y[row++] = sum4;
5484c1414c8SBarry Smith y[row++] = sum5;
5494c1414c8SBarry Smith v1 = v5; /* Since the next block to be processed starts there */
5504c1414c8SBarry Smith idx += 4 * sz;
5514c1414c8SBarry Smith break;
552d71ae5a4SJacob Faibussowitsch default:
553708a0e70SJunchao Zhang SETERRABORT(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nsz);
5544c1414c8SBarry Smith }
5554c1414c8SBarry Smith }
5569566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x));
5579566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(yy, &y));
5589566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - nonzerorow));
5593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
5604c1414c8SBarry Smith }
5612ef1f0ffSBarry Smith
5624108e4d5SBarry Smith /* Almost same code as the MatMult_SeqAIJ_Inode() */
MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy)563d71ae5a4SJacob Faibussowitsch PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A, Vec xx, Vec zz, Vec yy)
564d71ae5a4SJacob Faibussowitsch {
5654c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
5664c1414c8SBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5, tmp0, tmp1;
5678758e1faSBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5;
5688758e1faSBarry Smith const PetscScalar *x;
5698758e1faSBarry Smith PetscScalar *y, *z, *zt;
5708758e1faSBarry Smith PetscInt i1, i2, n, i, row, node_max, nsz, sz;
5718758e1faSBarry Smith const PetscInt *idx, *ns, *ii;
5724c1414c8SBarry Smith
5734c1414c8SBarry Smith PetscFunctionBegin;
5744d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
5754c1414c8SBarry Smith node_max = a->inode.node_count;
5764d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */
5772205254eSKarl Rupp
5789566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x));
5799566063dSJacob Faibussowitsch PetscCall(VecGetArrayPair(zz, yy, &z, &y));
5804c1414c8SBarry Smith zt = z;
5814c1414c8SBarry Smith
5824c1414c8SBarry Smith idx = a->j;
5834c1414c8SBarry Smith v1 = a->a;
5844c1414c8SBarry Smith ii = a->i;
5854c1414c8SBarry Smith
5864d12350bSJunchao Zhang for (i = 0; i < node_max; ++i) {
5874d12350bSJunchao Zhang row = ns[i];
5884d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i];
5894c1414c8SBarry Smith n = ii[1] - ii[0];
5904c1414c8SBarry Smith ii += nsz;
5914c1414c8SBarry Smith sz = n; /* No of non zeros in this row */
5924c1414c8SBarry Smith /* Switch on the size of Node */
5934c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */
5944c1414c8SBarry Smith case 1:
5954c1414c8SBarry Smith sum1 = *zt++;
5964c1414c8SBarry Smith
5974c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
5984c1414c8SBarry Smith i1 = idx[0]; /* The instructions are ordered to */
5994c1414c8SBarry Smith i2 = idx[1]; /* make the compiler's job easy */
6004c1414c8SBarry Smith idx += 2;
6014c1414c8SBarry Smith tmp0 = x[i1];
6024c1414c8SBarry Smith tmp1 = x[i2];
6039371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6049371c9d4SSatish Balay v1 += 2;
6054c1414c8SBarry Smith }
6064c1414c8SBarry Smith
6074c1414c8SBarry Smith if (n == sz - 1) { /* Take care of the last nonzero */
6084c1414c8SBarry Smith tmp0 = x[*idx++];
6094c1414c8SBarry Smith sum1 += *v1++ * tmp0;
6104c1414c8SBarry Smith }
6114c1414c8SBarry Smith y[row++] = sum1;
6124c1414c8SBarry Smith break;
6134c1414c8SBarry Smith case 2:
6144c1414c8SBarry Smith sum1 = *zt++;
6154c1414c8SBarry Smith sum2 = *zt++;
6164c1414c8SBarry Smith v2 = v1 + n;
6174c1414c8SBarry Smith
6184c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
6194c1414c8SBarry Smith i1 = idx[0];
6204c1414c8SBarry Smith i2 = idx[1];
6214c1414c8SBarry Smith idx += 2;
6224c1414c8SBarry Smith tmp0 = x[i1];
6234c1414c8SBarry Smith tmp1 = x[i2];
6249371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6259371c9d4SSatish Balay v1 += 2;
6269371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6279371c9d4SSatish Balay v2 += 2;
6284c1414c8SBarry Smith }
6294c1414c8SBarry Smith if (n == sz - 1) {
6304c1414c8SBarry Smith tmp0 = x[*idx++];
6314c1414c8SBarry Smith sum1 += *v1++ * tmp0;
6324c1414c8SBarry Smith sum2 += *v2++ * tmp0;
6334c1414c8SBarry Smith }
6344c1414c8SBarry Smith y[row++] = sum1;
6354c1414c8SBarry Smith y[row++] = sum2;
6364c1414c8SBarry Smith v1 = v2; /* Since the next block to be processed starts there*/
6374c1414c8SBarry Smith idx += sz;
6384c1414c8SBarry Smith break;
6394c1414c8SBarry Smith case 3:
6404c1414c8SBarry Smith sum1 = *zt++;
6414c1414c8SBarry Smith sum2 = *zt++;
6424c1414c8SBarry Smith sum3 = *zt++;
6434c1414c8SBarry Smith v2 = v1 + n;
6444c1414c8SBarry Smith v3 = v2 + n;
6454c1414c8SBarry Smith
6464c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
6474c1414c8SBarry Smith i1 = idx[0];
6484c1414c8SBarry Smith i2 = idx[1];
6494c1414c8SBarry Smith idx += 2;
6504c1414c8SBarry Smith tmp0 = x[i1];
6514c1414c8SBarry Smith tmp1 = x[i2];
6529371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6539371c9d4SSatish Balay v1 += 2;
6549371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6559371c9d4SSatish Balay v2 += 2;
6569371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6579371c9d4SSatish Balay v3 += 2;
6584c1414c8SBarry Smith }
6594c1414c8SBarry Smith if (n == sz - 1) {
6604c1414c8SBarry Smith tmp0 = x[*idx++];
6614c1414c8SBarry Smith sum1 += *v1++ * tmp0;
6624c1414c8SBarry Smith sum2 += *v2++ * tmp0;
6634c1414c8SBarry Smith sum3 += *v3++ * tmp0;
6644c1414c8SBarry Smith }
6654c1414c8SBarry Smith y[row++] = sum1;
6664c1414c8SBarry Smith y[row++] = sum2;
6674c1414c8SBarry Smith y[row++] = sum3;
6684c1414c8SBarry Smith v1 = v3; /* Since the next block to be processed starts there*/
6694c1414c8SBarry Smith idx += 2 * sz;
6704c1414c8SBarry Smith break;
6714c1414c8SBarry Smith case 4:
6724c1414c8SBarry Smith sum1 = *zt++;
6734c1414c8SBarry Smith sum2 = *zt++;
6744c1414c8SBarry Smith sum3 = *zt++;
6754c1414c8SBarry Smith sum4 = *zt++;
6764c1414c8SBarry Smith v2 = v1 + n;
6774c1414c8SBarry Smith v3 = v2 + n;
6784c1414c8SBarry Smith v4 = v3 + n;
6794c1414c8SBarry Smith
6804c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
6814c1414c8SBarry Smith i1 = idx[0];
6824c1414c8SBarry Smith i2 = idx[1];
6834c1414c8SBarry Smith idx += 2;
6844c1414c8SBarry Smith tmp0 = x[i1];
6854c1414c8SBarry Smith tmp1 = x[i2];
6869371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
6879371c9d4SSatish Balay v1 += 2;
6889371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1;
6899371c9d4SSatish Balay v2 += 2;
6909371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1;
6919371c9d4SSatish Balay v3 += 2;
6929371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1;
6939371c9d4SSatish Balay v4 += 2;
6944c1414c8SBarry Smith }
6954c1414c8SBarry Smith if (n == sz - 1) {
6964c1414c8SBarry Smith tmp0 = x[*idx++];
6974c1414c8SBarry Smith sum1 += *v1++ * tmp0;
6984c1414c8SBarry Smith sum2 += *v2++ * tmp0;
6994c1414c8SBarry Smith sum3 += *v3++ * tmp0;
7004c1414c8SBarry Smith sum4 += *v4++ * tmp0;
7014c1414c8SBarry Smith }
7024c1414c8SBarry Smith y[row++] = sum1;
7034c1414c8SBarry Smith y[row++] = sum2;
7044c1414c8SBarry Smith y[row++] = sum3;
7054c1414c8SBarry Smith y[row++] = sum4;
7064c1414c8SBarry Smith v1 = v4; /* Since the next block to be processed starts there*/
7074c1414c8SBarry Smith idx += 3 * sz;
7084c1414c8SBarry Smith break;
7094c1414c8SBarry Smith case 5:
7104c1414c8SBarry Smith sum1 = *zt++;
7114c1414c8SBarry Smith sum2 = *zt++;
7124c1414c8SBarry Smith sum3 = *zt++;
7134c1414c8SBarry Smith sum4 = *zt++;
7144c1414c8SBarry Smith sum5 = *zt++;
7154c1414c8SBarry Smith v2 = v1 + n;
7164c1414c8SBarry Smith v3 = v2 + n;
7174c1414c8SBarry Smith v4 = v3 + n;
7184c1414c8SBarry Smith v5 = v4 + n;
7194c1414c8SBarry Smith
7204c1414c8SBarry Smith for (n = 0; n < sz - 1; n += 2) {
7214c1414c8SBarry Smith i1 = idx[0];
7224c1414c8SBarry Smith i2 = idx[1];
7234c1414c8SBarry Smith idx += 2;
7244c1414c8SBarry Smith tmp0 = x[i1];
7254c1414c8SBarry Smith tmp1 = x[i2];
7269371c9d4SSatish Balay sum1 += v1[0] * tmp0 + v1[1] * tmp1;
7279371c9d4SSatish Balay v1 += 2;
7289371c9d4SSatish Balay sum2 += v2[0] * tmp0 + v2[1] * tmp1;
7299371c9d4SSatish Balay v2 += 2;
7309371c9d4SSatish Balay sum3 += v3[0] * tmp0 + v3[1] * tmp1;
7319371c9d4SSatish Balay v3 += 2;
7329371c9d4SSatish Balay sum4 += v4[0] * tmp0 + v4[1] * tmp1;
7339371c9d4SSatish Balay v4 += 2;
7349371c9d4SSatish Balay sum5 += v5[0] * tmp0 + v5[1] * tmp1;
7359371c9d4SSatish Balay v5 += 2;
7364c1414c8SBarry Smith }
7374c1414c8SBarry Smith if (n == sz - 1) {
7384c1414c8SBarry Smith tmp0 = x[*idx++];
7394c1414c8SBarry Smith sum1 += *v1++ * tmp0;
7404c1414c8SBarry Smith sum2 += *v2++ * tmp0;
7414c1414c8SBarry Smith sum3 += *v3++ * tmp0;
7424c1414c8SBarry Smith sum4 += *v4++ * tmp0;
7434c1414c8SBarry Smith sum5 += *v5++ * tmp0;
7444c1414c8SBarry Smith }
7454c1414c8SBarry Smith y[row++] = sum1;
7464c1414c8SBarry Smith y[row++] = sum2;
7474c1414c8SBarry Smith y[row++] = sum3;
7484c1414c8SBarry Smith y[row++] = sum4;
7494c1414c8SBarry Smith y[row++] = sum5;
7504c1414c8SBarry Smith v1 = v5; /* Since the next block to be processed starts there */
7514c1414c8SBarry Smith idx += 4 * sz;
7524c1414c8SBarry Smith break;
753d71ae5a4SJacob Faibussowitsch default:
754d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported");
7554c1414c8SBarry Smith }
7564c1414c8SBarry Smith }
7579566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x));
7589566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayPair(zz, yy, &z, &y));
7599566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz));
7603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
7614c1414c8SBarry Smith }
7624c1414c8SBarry Smith
MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx)763ff6a9541SJacob Faibussowitsch static PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A, Vec bb, Vec xx)
764d71ae5a4SJacob Faibussowitsch {
7654c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
7664c1414c8SBarry Smith IS iscol = a->col, isrow = a->row;
7675d0c19d7SBarry Smith const PetscInt *r, *c, *rout, *cout;
7688758e1faSBarry Smith PetscInt i, j, n = A->rmap->n, nz;
7698758e1faSBarry Smith PetscInt node_max, *ns, row, nsz, aii, i0, i1;
7708758e1faSBarry Smith const PetscInt *ai = a->i, *a_j = a->j, *vi, *ad, *aj;
771d9fead3dSBarry Smith PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
772d9fead3dSBarry Smith PetscScalar sum1, sum2, sum3, sum4, sum5;
773dd6ea824SBarry Smith const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
774dd6ea824SBarry Smith const PetscScalar *b;
7754c1414c8SBarry Smith
7764c1414c8SBarry Smith PetscFunctionBegin;
7774d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
7784c1414c8SBarry Smith node_max = a->inode.node_count;
7794d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */
7804c1414c8SBarry Smith
7819566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b));
7829566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(xx, &x));
7834c1414c8SBarry Smith tmp = a->solve_work;
7844c1414c8SBarry Smith
7859371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout));
7869371c9d4SSatish Balay r = rout;
7879371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout));
7889371c9d4SSatish Balay c = cout + (n - 1);
7894c1414c8SBarry Smith
7904c1414c8SBarry Smith /* forward solve the lower triangular */
7914c1414c8SBarry Smith tmps = tmp;
7924c1414c8SBarry Smith aa = a_a;
7934c1414c8SBarry Smith aj = a_j;
7944c1414c8SBarry Smith ad = a->diag;
7954c1414c8SBarry Smith
7964c1414c8SBarry Smith for (i = 0, row = 0; i < node_max; ++i) {
7974d12350bSJunchao Zhang row = ns[i];
7984d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i];
7994c1414c8SBarry Smith aii = ai[row];
8004c1414c8SBarry Smith v1 = aa + aii;
8014c1414c8SBarry Smith vi = aj + aii;
8024c1414c8SBarry Smith nz = ad[row] - aii;
80326549573SJed Brown if (i < node_max - 1) {
80426549573SJed Brown /* Prefetch the block after the current one, the prefetch itself can't cause a memory error,
80591c35059SPierre Jolivet * but our indexing to determine its size could. */
80650d8bf02SJed Brown PetscPrefetchBlock(aj + ai[row + nsz], ad[row + nsz] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
80726549573SJed Brown /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */
8084d12350bSJunchao Zhang PetscPrefetchBlock(aa + ai[row + nsz], ad[ns[i + 2] - 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
80926549573SJed Brown /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */
81026549573SJed Brown }
8114c1414c8SBarry Smith
8124c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */
8134c1414c8SBarry Smith case 1:
8144c1414c8SBarry Smith sum1 = b[*r++];
8154c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) {
8164c1414c8SBarry Smith i0 = vi[0];
8174c1414c8SBarry Smith i1 = vi[1];
8184c1414c8SBarry Smith vi += 2;
8194c1414c8SBarry Smith tmp0 = tmps[i0];
8204c1414c8SBarry Smith tmp1 = tmps[i1];
8219371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8229371c9d4SSatish Balay v1 += 2;
8234c1414c8SBarry Smith }
8244c1414c8SBarry Smith if (j == nz - 1) {
8254c1414c8SBarry Smith tmp0 = tmps[*vi++];
8264c1414c8SBarry Smith sum1 -= *v1++ * tmp0;
8274c1414c8SBarry Smith }
8284c1414c8SBarry Smith tmp[row++] = sum1;
8294c1414c8SBarry Smith break;
8304c1414c8SBarry Smith case 2:
8314c1414c8SBarry Smith sum1 = b[*r++];
8324c1414c8SBarry Smith sum2 = b[*r++];
8334c1414c8SBarry Smith v2 = aa + ai[row + 1];
8344c1414c8SBarry Smith
8354c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) {
8364c1414c8SBarry Smith i0 = vi[0];
8374c1414c8SBarry Smith i1 = vi[1];
8384c1414c8SBarry Smith vi += 2;
8394c1414c8SBarry Smith tmp0 = tmps[i0];
8404c1414c8SBarry Smith tmp1 = tmps[i1];
8419371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8429371c9d4SSatish Balay v1 += 2;
8439371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8449371c9d4SSatish Balay v2 += 2;
8454c1414c8SBarry Smith }
8464c1414c8SBarry Smith if (j == nz - 1) {
8474c1414c8SBarry Smith tmp0 = tmps[*vi++];
8484c1414c8SBarry Smith sum1 -= *v1++ * tmp0;
8494c1414c8SBarry Smith sum2 -= *v2++ * tmp0;
8504c1414c8SBarry Smith }
8514c1414c8SBarry Smith sum2 -= *v2++ * sum1;
8524c1414c8SBarry Smith tmp[row++] = sum1;
8534c1414c8SBarry Smith tmp[row++] = sum2;
8544c1414c8SBarry Smith break;
8554c1414c8SBarry Smith case 3:
8564c1414c8SBarry Smith sum1 = b[*r++];
8574c1414c8SBarry Smith sum2 = b[*r++];
8584c1414c8SBarry Smith sum3 = b[*r++];
8594c1414c8SBarry Smith v2 = aa + ai[row + 1];
8604c1414c8SBarry Smith v3 = aa + ai[row + 2];
8614c1414c8SBarry Smith
8624c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) {
8634c1414c8SBarry Smith i0 = vi[0];
8644c1414c8SBarry Smith i1 = vi[1];
8654c1414c8SBarry Smith vi += 2;
8664c1414c8SBarry Smith tmp0 = tmps[i0];
8674c1414c8SBarry Smith tmp1 = tmps[i1];
8689371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
8699371c9d4SSatish Balay v1 += 2;
8709371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
8719371c9d4SSatish Balay v2 += 2;
8729371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
8739371c9d4SSatish Balay v3 += 2;
8744c1414c8SBarry Smith }
8754c1414c8SBarry Smith if (j == nz - 1) {
8764c1414c8SBarry Smith tmp0 = tmps[*vi++];
8774c1414c8SBarry Smith sum1 -= *v1++ * tmp0;
8784c1414c8SBarry Smith sum2 -= *v2++ * tmp0;
8794c1414c8SBarry Smith sum3 -= *v3++ * tmp0;
8804c1414c8SBarry Smith }
8814c1414c8SBarry Smith sum2 -= *v2++ * sum1;
8824c1414c8SBarry Smith sum3 -= *v3++ * sum1;
8834c1414c8SBarry Smith sum3 -= *v3++ * sum2;
8842205254eSKarl Rupp
8854c1414c8SBarry Smith tmp[row++] = sum1;
8864c1414c8SBarry Smith tmp[row++] = sum2;
8874c1414c8SBarry Smith tmp[row++] = sum3;
8884c1414c8SBarry Smith break;
8894c1414c8SBarry Smith
8904c1414c8SBarry Smith case 4:
8914c1414c8SBarry Smith sum1 = b[*r++];
8924c1414c8SBarry Smith sum2 = b[*r++];
8934c1414c8SBarry Smith sum3 = b[*r++];
8944c1414c8SBarry Smith sum4 = b[*r++];
8954c1414c8SBarry Smith v2 = aa + ai[row + 1];
8964c1414c8SBarry Smith v3 = aa + ai[row + 2];
8974c1414c8SBarry Smith v4 = aa + ai[row + 3];
8984c1414c8SBarry Smith
8994c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) {
9004c1414c8SBarry Smith i0 = vi[0];
9014c1414c8SBarry Smith i1 = vi[1];
9024c1414c8SBarry Smith vi += 2;
9034c1414c8SBarry Smith tmp0 = tmps[i0];
9044c1414c8SBarry Smith tmp1 = tmps[i1];
9059371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
9069371c9d4SSatish Balay v1 += 2;
9079371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9089371c9d4SSatish Balay v2 += 2;
9099371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9109371c9d4SSatish Balay v3 += 2;
9119371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9129371c9d4SSatish Balay v4 += 2;
9134c1414c8SBarry Smith }
9144c1414c8SBarry Smith if (j == nz - 1) {
9154c1414c8SBarry Smith tmp0 = tmps[*vi++];
9164c1414c8SBarry Smith sum1 -= *v1++ * tmp0;
9174c1414c8SBarry Smith sum2 -= *v2++ * tmp0;
9184c1414c8SBarry Smith sum3 -= *v3++ * tmp0;
9194c1414c8SBarry Smith sum4 -= *v4++ * tmp0;
9204c1414c8SBarry Smith }
9214c1414c8SBarry Smith sum2 -= *v2++ * sum1;
9224c1414c8SBarry Smith sum3 -= *v3++ * sum1;
9234c1414c8SBarry Smith sum4 -= *v4++ * sum1;
9244c1414c8SBarry Smith sum3 -= *v3++ * sum2;
9254c1414c8SBarry Smith sum4 -= *v4++ * sum2;
9264c1414c8SBarry Smith sum4 -= *v4++ * sum3;
9274c1414c8SBarry Smith
9284c1414c8SBarry Smith tmp[row++] = sum1;
9294c1414c8SBarry Smith tmp[row++] = sum2;
9304c1414c8SBarry Smith tmp[row++] = sum3;
9314c1414c8SBarry Smith tmp[row++] = sum4;
9324c1414c8SBarry Smith break;
9334c1414c8SBarry Smith case 5:
9344c1414c8SBarry Smith sum1 = b[*r++];
9354c1414c8SBarry Smith sum2 = b[*r++];
9364c1414c8SBarry Smith sum3 = b[*r++];
9374c1414c8SBarry Smith sum4 = b[*r++];
9384c1414c8SBarry Smith sum5 = b[*r++];
9394c1414c8SBarry Smith v2 = aa + ai[row + 1];
9404c1414c8SBarry Smith v3 = aa + ai[row + 2];
9414c1414c8SBarry Smith v4 = aa + ai[row + 3];
9424c1414c8SBarry Smith v5 = aa + ai[row + 4];
9434c1414c8SBarry Smith
9444c1414c8SBarry Smith for (j = 0; j < nz - 1; j += 2) {
9454c1414c8SBarry Smith i0 = vi[0];
9464c1414c8SBarry Smith i1 = vi[1];
9474c1414c8SBarry Smith vi += 2;
9484c1414c8SBarry Smith tmp0 = tmps[i0];
9494c1414c8SBarry Smith tmp1 = tmps[i1];
9509371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
9519371c9d4SSatish Balay v1 += 2;
9529371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
9539371c9d4SSatish Balay v2 += 2;
9549371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
9559371c9d4SSatish Balay v3 += 2;
9569371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
9579371c9d4SSatish Balay v4 += 2;
9589371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
9599371c9d4SSatish Balay v5 += 2;
9604c1414c8SBarry Smith }
9614c1414c8SBarry Smith if (j == nz - 1) {
9624c1414c8SBarry Smith tmp0 = tmps[*vi++];
9634c1414c8SBarry Smith sum1 -= *v1++ * tmp0;
9644c1414c8SBarry Smith sum2 -= *v2++ * tmp0;
9654c1414c8SBarry Smith sum3 -= *v3++ * tmp0;
9664c1414c8SBarry Smith sum4 -= *v4++ * tmp0;
9674c1414c8SBarry Smith sum5 -= *v5++ * tmp0;
9684c1414c8SBarry Smith }
9694c1414c8SBarry Smith
9704c1414c8SBarry Smith sum2 -= *v2++ * sum1;
9714c1414c8SBarry Smith sum3 -= *v3++ * sum1;
9724c1414c8SBarry Smith sum4 -= *v4++ * sum1;
9734c1414c8SBarry Smith sum5 -= *v5++ * sum1;
9744c1414c8SBarry Smith sum3 -= *v3++ * sum2;
9754c1414c8SBarry Smith sum4 -= *v4++ * sum2;
9764c1414c8SBarry Smith sum5 -= *v5++ * sum2;
9774c1414c8SBarry Smith sum4 -= *v4++ * sum3;
9784c1414c8SBarry Smith sum5 -= *v5++ * sum3;
9794c1414c8SBarry Smith sum5 -= *v5++ * sum4;
9804c1414c8SBarry Smith
9814c1414c8SBarry Smith tmp[row++] = sum1;
9824c1414c8SBarry Smith tmp[row++] = sum2;
9834c1414c8SBarry Smith tmp[row++] = sum3;
9844c1414c8SBarry Smith tmp[row++] = sum4;
9854c1414c8SBarry Smith tmp[row++] = sum5;
9864c1414c8SBarry Smith break;
987d71ae5a4SJacob Faibussowitsch default:
988d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
9894c1414c8SBarry Smith }
9904c1414c8SBarry Smith }
9914c1414c8SBarry Smith /* backward solve the upper triangular */
9924d12350bSJunchao Zhang for (i = node_max - 1; i >= 0; i--) {
9934d12350bSJunchao Zhang row = ns[i + 1];
9944d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i];
9954c1414c8SBarry Smith aii = ai[row + 1] - 1;
9964c1414c8SBarry Smith v1 = aa + aii;
9974c1414c8SBarry Smith vi = aj + aii;
9984c1414c8SBarry Smith nz = aii - ad[row];
9994c1414c8SBarry Smith switch (nsz) { /* Each loop in 'case' is unrolled */
10004c1414c8SBarry Smith case 1:
10014c1414c8SBarry Smith sum1 = tmp[row];
10024c1414c8SBarry Smith
10034c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) {
10044c1414c8SBarry Smith vi -= 2;
10054c1414c8SBarry Smith i0 = vi[2];
10064c1414c8SBarry Smith i1 = vi[1];
10074c1414c8SBarry Smith tmp0 = tmps[i0];
10084c1414c8SBarry Smith tmp1 = tmps[i1];
10094c1414c8SBarry Smith v1 -= 2;
10104c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10114c1414c8SBarry Smith }
10124c1414c8SBarry Smith if (j == 1) {
10134c1414c8SBarry Smith tmp0 = tmps[*vi--];
10144c1414c8SBarry Smith sum1 -= *v1-- * tmp0;
10154c1414c8SBarry Smith }
10169371c9d4SSatish Balay x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10179371c9d4SSatish Balay row--;
10184c1414c8SBarry Smith break;
10194c1414c8SBarry Smith case 2:
10204c1414c8SBarry Smith sum1 = tmp[row];
10214c1414c8SBarry Smith sum2 = tmp[row - 1];
10224c1414c8SBarry Smith v2 = aa + ai[row] - 1;
10234c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) {
10244c1414c8SBarry Smith vi -= 2;
10254c1414c8SBarry Smith i0 = vi[2];
10264c1414c8SBarry Smith i1 = vi[1];
10274c1414c8SBarry Smith tmp0 = tmps[i0];
10284c1414c8SBarry Smith tmp1 = tmps[i1];
10294c1414c8SBarry Smith v1 -= 2;
10304c1414c8SBarry Smith v2 -= 2;
10314c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10324c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10334c1414c8SBarry Smith }
10344c1414c8SBarry Smith if (j == 1) {
10354c1414c8SBarry Smith tmp0 = tmps[*vi--];
10364c1414c8SBarry Smith sum1 -= *v1-- * tmp0;
10374c1414c8SBarry Smith sum2 -= *v2-- * tmp0;
10384c1414c8SBarry Smith }
10394c1414c8SBarry Smith
10409371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10419371c9d4SSatish Balay row--;
10424c1414c8SBarry Smith sum2 -= *v2-- * tmp0;
10439371c9d4SSatish Balay x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10449371c9d4SSatish Balay row--;
10454c1414c8SBarry Smith break;
10464c1414c8SBarry Smith case 3:
10474c1414c8SBarry Smith sum1 = tmp[row];
10484c1414c8SBarry Smith sum2 = tmp[row - 1];
10494c1414c8SBarry Smith sum3 = tmp[row - 2];
10504c1414c8SBarry Smith v2 = aa + ai[row] - 1;
10514c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1;
10524c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) {
10534c1414c8SBarry Smith vi -= 2;
10544c1414c8SBarry Smith i0 = vi[2];
10554c1414c8SBarry Smith i1 = vi[1];
10564c1414c8SBarry Smith tmp0 = tmps[i0];
10574c1414c8SBarry Smith tmp1 = tmps[i1];
10584c1414c8SBarry Smith v1 -= 2;
10594c1414c8SBarry Smith v2 -= 2;
10604c1414c8SBarry Smith v3 -= 2;
10614c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
10624c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
10634c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
10644c1414c8SBarry Smith }
10654c1414c8SBarry Smith if (j == 1) {
10664c1414c8SBarry Smith tmp0 = tmps[*vi--];
10674c1414c8SBarry Smith sum1 -= *v1-- * tmp0;
10684c1414c8SBarry Smith sum2 -= *v2-- * tmp0;
10694c1414c8SBarry Smith sum3 -= *v3-- * tmp0;
10704c1414c8SBarry Smith }
10719371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
10729371c9d4SSatish Balay row--;
10734c1414c8SBarry Smith sum2 -= *v2-- * tmp0;
10744c1414c8SBarry Smith sum3 -= *v3-- * tmp0;
10759371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
10769371c9d4SSatish Balay row--;
10774c1414c8SBarry Smith sum3 -= *v3-- * tmp0;
10789371c9d4SSatish Balay x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
10799371c9d4SSatish Balay row--;
10804c1414c8SBarry Smith
10814c1414c8SBarry Smith break;
10824c1414c8SBarry Smith case 4:
10834c1414c8SBarry Smith sum1 = tmp[row];
10844c1414c8SBarry Smith sum2 = tmp[row - 1];
10854c1414c8SBarry Smith sum3 = tmp[row - 2];
10864c1414c8SBarry Smith sum4 = tmp[row - 3];
10874c1414c8SBarry Smith v2 = aa + ai[row] - 1;
10884c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1;
10894c1414c8SBarry Smith v4 = aa + ai[row - 2] - 1;
10904c1414c8SBarry Smith
10914c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) {
10924c1414c8SBarry Smith vi -= 2;
10934c1414c8SBarry Smith i0 = vi[2];
10944c1414c8SBarry Smith i1 = vi[1];
10954c1414c8SBarry Smith tmp0 = tmps[i0];
10964c1414c8SBarry Smith tmp1 = tmps[i1];
10974c1414c8SBarry Smith v1 -= 2;
10984c1414c8SBarry Smith v2 -= 2;
10994c1414c8SBarry Smith v3 -= 2;
11004c1414c8SBarry Smith v4 -= 2;
11014c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
11024c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
11034c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
11044c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
11054c1414c8SBarry Smith }
11064c1414c8SBarry Smith if (j == 1) {
11074c1414c8SBarry Smith tmp0 = tmps[*vi--];
11084c1414c8SBarry Smith sum1 -= *v1-- * tmp0;
11094c1414c8SBarry Smith sum2 -= *v2-- * tmp0;
11104c1414c8SBarry Smith sum3 -= *v3-- * tmp0;
11114c1414c8SBarry Smith sum4 -= *v4-- * tmp0;
11124c1414c8SBarry Smith }
11134c1414c8SBarry Smith
11149371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11159371c9d4SSatish Balay row--;
11164c1414c8SBarry Smith sum2 -= *v2-- * tmp0;
11174c1414c8SBarry Smith sum3 -= *v3-- * tmp0;
11184c1414c8SBarry Smith sum4 -= *v4-- * tmp0;
11199371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11209371c9d4SSatish Balay row--;
11214c1414c8SBarry Smith sum3 -= *v3-- * tmp0;
11224c1414c8SBarry Smith sum4 -= *v4-- * tmp0;
11239371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11249371c9d4SSatish Balay row--;
11254c1414c8SBarry Smith sum4 -= *v4-- * tmp0;
11269371c9d4SSatish Balay x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11279371c9d4SSatish Balay row--;
11284c1414c8SBarry Smith break;
11294c1414c8SBarry Smith case 5:
11304c1414c8SBarry Smith sum1 = tmp[row];
11314c1414c8SBarry Smith sum2 = tmp[row - 1];
11324c1414c8SBarry Smith sum3 = tmp[row - 2];
11334c1414c8SBarry Smith sum4 = tmp[row - 3];
11344c1414c8SBarry Smith sum5 = tmp[row - 4];
11354c1414c8SBarry Smith v2 = aa + ai[row] - 1;
11364c1414c8SBarry Smith v3 = aa + ai[row - 1] - 1;
11374c1414c8SBarry Smith v4 = aa + ai[row - 2] - 1;
11384c1414c8SBarry Smith v5 = aa + ai[row - 3] - 1;
11394c1414c8SBarry Smith for (j = nz; j > 1; j -= 2) {
11404c1414c8SBarry Smith vi -= 2;
11414c1414c8SBarry Smith i0 = vi[2];
11424c1414c8SBarry Smith i1 = vi[1];
11434c1414c8SBarry Smith tmp0 = tmps[i0];
11444c1414c8SBarry Smith tmp1 = tmps[i1];
11454c1414c8SBarry Smith v1 -= 2;
11464c1414c8SBarry Smith v2 -= 2;
11474c1414c8SBarry Smith v3 -= 2;
11484c1414c8SBarry Smith v4 -= 2;
11494c1414c8SBarry Smith v5 -= 2;
11504c1414c8SBarry Smith sum1 -= v1[2] * tmp0 + v1[1] * tmp1;
11514c1414c8SBarry Smith sum2 -= v2[2] * tmp0 + v2[1] * tmp1;
11524c1414c8SBarry Smith sum3 -= v3[2] * tmp0 + v3[1] * tmp1;
11534c1414c8SBarry Smith sum4 -= v4[2] * tmp0 + v4[1] * tmp1;
11544c1414c8SBarry Smith sum5 -= v5[2] * tmp0 + v5[1] * tmp1;
11554c1414c8SBarry Smith }
11564c1414c8SBarry Smith if (j == 1) {
11574c1414c8SBarry Smith tmp0 = tmps[*vi--];
11584c1414c8SBarry Smith sum1 -= *v1-- * tmp0;
11594c1414c8SBarry Smith sum2 -= *v2-- * tmp0;
11604c1414c8SBarry Smith sum3 -= *v3-- * tmp0;
11614c1414c8SBarry Smith sum4 -= *v4-- * tmp0;
11624c1414c8SBarry Smith sum5 -= *v5-- * tmp0;
11634c1414c8SBarry Smith }
11644c1414c8SBarry Smith
11659371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum1 * a_a[ad[row]];
11669371c9d4SSatish Balay row--;
11674c1414c8SBarry Smith sum2 -= *v2-- * tmp0;
11684c1414c8SBarry Smith sum3 -= *v3-- * tmp0;
11694c1414c8SBarry Smith sum4 -= *v4-- * tmp0;
11704c1414c8SBarry Smith sum5 -= *v5-- * tmp0;
11719371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum2 * a_a[ad[row]];
11729371c9d4SSatish Balay row--;
11734c1414c8SBarry Smith sum3 -= *v3-- * tmp0;
11744c1414c8SBarry Smith sum4 -= *v4-- * tmp0;
11754c1414c8SBarry Smith sum5 -= *v5-- * tmp0;
11769371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum3 * a_a[ad[row]];
11779371c9d4SSatish Balay row--;
11784c1414c8SBarry Smith sum4 -= *v4-- * tmp0;
11794c1414c8SBarry Smith sum5 -= *v5-- * tmp0;
11809371c9d4SSatish Balay tmp0 = x[*c--] = tmp[row] = sum4 * a_a[ad[row]];
11819371c9d4SSatish Balay row--;
11824c1414c8SBarry Smith sum5 -= *v5-- * tmp0;
11839371c9d4SSatish Balay x[*c--] = tmp[row] = sum5 * a_a[ad[row]];
11849371c9d4SSatish Balay row--;
11854c1414c8SBarry Smith break;
1186d71ae5a4SJacob Faibussowitsch default:
1187d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
11884c1414c8SBarry Smith }
11894c1414c8SBarry Smith }
11909566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout));
11919566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout));
11929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b));
11939566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(xx, &x));
11949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
11953ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
11964c1414c8SBarry Smith }
11974c1414c8SBarry Smith
MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo * info)1198d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B, Mat A, const MatFactorInfo *info)
1199d71ae5a4SJacob Faibussowitsch {
120028f1b45aSHong Zhang Mat C = B;
120128f1b45aSHong Zhang Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data, *b = (Mat_SeqAIJ *)C->data;
120228f1b45aSHong Zhang IS isrow = b->row, isicol = b->icol;
120328f1b45aSHong Zhang const PetscInt *r, *ic, *ics;
120428f1b45aSHong Zhang const PetscInt n = A->rmap->n, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
120528f1b45aSHong Zhang PetscInt i, j, k, nz, nzL, row, *pj;
120628f1b45aSHong Zhang const PetscInt *ajtmp, *bjtmp;
12079877982aSShri Abhyankar MatScalar *pc, *pc1, *pc2, *pc3, *pc4, mul1, mul2, mul3, mul4, *pv, *rtmp1, *rtmp2, *rtmp3, *rtmp4;
12089877982aSShri Abhyankar const MatScalar *aa = a->a, *v, *v1, *v2, *v3, *v4;
120928f1b45aSHong Zhang FactorShiftCtx sctx;
12104f81c4b7SBarry Smith const PetscInt *ddiag;
121128f1b45aSHong Zhang PetscReal rs;
121228f1b45aSHong Zhang MatScalar d;
12134f81c4b7SBarry Smith PetscInt inod, nodesz, node_max, col;
12144f81c4b7SBarry Smith const PetscInt *ns;
121507b50cabSHong Zhang PetscInt *tmp_vec1, *tmp_vec2, *nsmap;
12160e95ead3SHong Zhang
121728f1b45aSHong Zhang PetscFunctionBegin;
121828f1b45aSHong Zhang /* MatPivotSetUp(): initialize shift context sctx */
12199566063dSJacob Faibussowitsch PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
122028f1b45aSHong Zhang
1221f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
122228f1b45aSHong Zhang ddiag = a->diag;
122328f1b45aSHong Zhang sctx.shift_top = info->zeropivot;
122428f1b45aSHong Zhang for (i = 0; i < n; i++) {
122528f1b45aSHong Zhang /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
122628f1b45aSHong Zhang d = (aa)[ddiag[i]];
122728f1b45aSHong Zhang rs = -PetscAbsScalar(d) - PetscRealPart(d);
122828f1b45aSHong Zhang v = aa + ai[i];
122928f1b45aSHong Zhang nz = ai[i + 1] - ai[i];
12302205254eSKarl Rupp for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
123128f1b45aSHong Zhang if (rs > sctx.shift_top) sctx.shift_top = rs;
123228f1b45aSHong Zhang }
123328f1b45aSHong Zhang sctx.shift_top *= 1.1;
123428f1b45aSHong Zhang sctx.nshift_max = 5;
123528f1b45aSHong Zhang sctx.shift_lo = 0.;
123628f1b45aSHong Zhang sctx.shift_hi = 1.;
123728f1b45aSHong Zhang }
123828f1b45aSHong Zhang
12399566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r));
12409566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic));
124168785679SHong Zhang
12429566063dSJacob Faibussowitsch PetscCall(PetscCalloc4(n, &rtmp1, n, &rtmp2, n, &rtmp3, n, &rtmp4));
124328f1b45aSHong Zhang ics = ic;
124428f1b45aSHong Zhang
124528f1b45aSHong Zhang node_max = a->inode.node_count;
12464d12350bSJunchao Zhang ns = a->inode.size_csr;
124728b400f6SJacob Faibussowitsch PetscCheck(ns, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Matrix without inode information");
124828f1b45aSHong Zhang
12499877982aSShri Abhyankar /* If max inode size > 4, split it into two inodes.*/
125068785679SHong Zhang /* also map the inode sizes according to the ordering */
12519566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &tmp_vec1));
125268785679SHong Zhang for (i = 0, j = 0; i < node_max; ++i, ++j) {
12534d12350bSJunchao Zhang nodesz = ns[i + 1] - ns[i];
12544d12350bSJunchao Zhang if (nodesz > 4) {
1255048b5e81SShri Abhyankar tmp_vec1[j] = 4;
125668785679SHong Zhang ++j;
12574d12350bSJunchao Zhang tmp_vec1[j] = nodesz - tmp_vec1[j - 1];
125868785679SHong Zhang } else {
12594d12350bSJunchao Zhang tmp_vec1[j] = nodesz;
126068785679SHong Zhang }
126168785679SHong Zhang }
126268785679SHong Zhang /* Use the correct node_max */
126368785679SHong Zhang node_max = j;
126468785679SHong Zhang
126568785679SHong Zhang /* Now reorder the inode info based on mat re-ordering info */
126668785679SHong Zhang /* First create a row -> inode_size_array_index map */
12679566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &nsmap));
12689566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(node_max + 1, &tmp_vec2));
12694d12350bSJunchao Zhang tmp_vec2[0] = 0;
127068785679SHong Zhang for (i = 0, row = 0; i < node_max; i++) {
127168785679SHong Zhang nodesz = tmp_vec1[i];
1272ad540459SPierre Jolivet for (j = 0; j < nodesz; j++, row++) nsmap[row] = i;
127368785679SHong Zhang }
127468785679SHong Zhang /* Using nsmap, create a reordered ns structure */
127568785679SHong Zhang for (i = 0, j = 0; i < node_max; i++) {
127668785679SHong Zhang nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */
12774d12350bSJunchao Zhang tmp_vec2[i + 1] = tmp_vec2[i] + nodesz;
127868785679SHong Zhang j += nodesz;
127968785679SHong Zhang }
12809566063dSJacob Faibussowitsch PetscCall(PetscFree(nsmap));
12819566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec1));
1282b89f182dSHong Zhang
128368785679SHong Zhang /* Now use the correct ns */
128468785679SHong Zhang ns = tmp_vec2;
128568785679SHong Zhang
128628f1b45aSHong Zhang do {
128707b50cabSHong Zhang sctx.newshift = PETSC_FALSE;
128828f1b45aSHong Zhang /* Now loop over each block-row, and do the factorization */
128928f1b45aSHong Zhang for (inod = 0, i = 0; inod < node_max; inod++) { /* i: row index; inod: inode index */
12904d12350bSJunchao Zhang nodesz = ns[inod + 1] - ns[inod];
129128f1b45aSHong Zhang
129228f1b45aSHong Zhang switch (nodesz) {
129328f1b45aSHong Zhang case 1:
1294b89f182dSHong Zhang /* zero rtmp1 */
129528f1b45aSHong Zhang /* L part */
129628f1b45aSHong Zhang nz = bi[i + 1] - bi[i];
129728f1b45aSHong Zhang bjtmp = bj + bi[i];
1298b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
129928f1b45aSHong Zhang
130028f1b45aSHong Zhang /* U part */
130128f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1];
130228f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1;
1303b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[bjtmp[j]] = 0.0;
130428f1b45aSHong Zhang
130528f1b45aSHong Zhang /* load in initial (unfactored row) */
130628f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]];
130728f1b45aSHong Zhang ajtmp = aj + ai[r[i]];
130828f1b45aSHong Zhang v = aa + ai[r[i]];
13092205254eSKarl Rupp for (j = 0; j < nz; j++) rtmp1[ics[ajtmp[j]]] = v[j];
13102205254eSKarl Rupp
131128f1b45aSHong Zhang /* ZeropivotApply() */
1312b89f182dSHong Zhang rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
131328f1b45aSHong Zhang
131428f1b45aSHong Zhang /* elimination */
131528f1b45aSHong Zhang bjtmp = bj + bi[i];
131628f1b45aSHong Zhang row = *bjtmp++;
131728f1b45aSHong Zhang nzL = bi[i + 1] - bi[i];
131828f1b45aSHong Zhang for (k = 0; k < nzL; k++) {
1319b89f182dSHong Zhang pc = rtmp1 + row;
132028f1b45aSHong Zhang if (*pc != 0.0) {
132128f1b45aSHong Zhang pv = b->a + bdiag[row];
1322b89f182dSHong Zhang mul1 = *pc * (*pv);
1323b89f182dSHong Zhang *pc = mul1;
132428f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
132528f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1;
132628f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
1327b89f182dSHong Zhang for (j = 0; j < nz; j++) rtmp1[pj[j]] -= mul1 * pv[j];
13289566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz));
132928f1b45aSHong Zhang }
133028f1b45aSHong Zhang row = *bjtmp++;
133128f1b45aSHong Zhang }
133228f1b45aSHong Zhang
133328f1b45aSHong Zhang /* finished row so stick it into b->a */
133428f1b45aSHong Zhang rs = 0.0;
133528f1b45aSHong Zhang /* L part */
133628f1b45aSHong Zhang pv = b->a + bi[i];
133728f1b45aSHong Zhang pj = b->j + bi[i];
133828f1b45aSHong Zhang nz = bi[i + 1] - bi[i];
133928f1b45aSHong Zhang for (j = 0; j < nz; j++) {
13409371c9d4SSatish Balay pv[j] = rtmp1[pj[j]];
13419371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]);
134228f1b45aSHong Zhang }
134328f1b45aSHong Zhang
134428f1b45aSHong Zhang /* U part */
134528f1b45aSHong Zhang pv = b->a + bdiag[i + 1] + 1;
134628f1b45aSHong Zhang pj = b->j + bdiag[i + 1] + 1;
134728f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1;
134828f1b45aSHong Zhang for (j = 0; j < nz; j++) {
13499371c9d4SSatish Balay pv[j] = rtmp1[pj[j]];
13509371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]);
135128f1b45aSHong Zhang }
135228f1b45aSHong Zhang
1353b89f182dSHong Zhang /* Check zero pivot */
135428f1b45aSHong Zhang sctx.rs = rs;
1355b89f182dSHong Zhang sctx.pv = rtmp1[i];
13569566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i));
135707b50cabSHong Zhang if (sctx.newshift) break;
135828f1b45aSHong Zhang
1359a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */
136028f1b45aSHong Zhang pv = b->a + bdiag[i];
1361b89f182dSHong Zhang *pv = 1.0 / sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */
136228f1b45aSHong Zhang break;
136328f1b45aSHong Zhang
136428f1b45aSHong Zhang case 2:
1365b89f182dSHong Zhang /* zero rtmp1 and rtmp2 */
136628f1b45aSHong Zhang /* L part */
136728f1b45aSHong Zhang nz = bi[i + 1] - bi[i];
136828f1b45aSHong Zhang bjtmp = bj + bi[i];
136928f1b45aSHong Zhang for (j = 0; j < nz; j++) {
137068785679SHong Zhang col = bjtmp[j];
13719371c9d4SSatish Balay rtmp1[col] = 0.0;
13729371c9d4SSatish Balay rtmp2[col] = 0.0;
137328f1b45aSHong Zhang }
137428f1b45aSHong Zhang
137528f1b45aSHong Zhang /* U part */
137628f1b45aSHong Zhang nz = bdiag[i] - bdiag[i + 1];
137728f1b45aSHong Zhang bjtmp = bj + bdiag[i + 1] + 1;
137828f1b45aSHong Zhang for (j = 0; j < nz; j++) {
137968785679SHong Zhang col = bjtmp[j];
13809371c9d4SSatish Balay rtmp1[col] = 0.0;
13819371c9d4SSatish Balay rtmp2[col] = 0.0;
138228f1b45aSHong Zhang }
138328f1b45aSHong Zhang
138428f1b45aSHong Zhang /* load in initial (unfactored row) */
138528f1b45aSHong Zhang nz = ai[r[i] + 1] - ai[r[i]];
138628f1b45aSHong Zhang ajtmp = aj + ai[r[i]];
13879371c9d4SSatish Balay v1 = aa + ai[r[i]];
13881a303e4dSPierre Jolivet v2 = aa + ai[r[i + 1]];
138928f1b45aSHong Zhang for (j = 0; j < nz; j++) {
139068785679SHong Zhang col = ics[ajtmp[j]];
13919371c9d4SSatish Balay rtmp1[col] = v1[j];
13929371c9d4SSatish Balay rtmp2[col] = v2[j];
139328f1b45aSHong Zhang }
139428f1b45aSHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */
13959371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount;
13969371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount;
139728f1b45aSHong Zhang
139828f1b45aSHong Zhang /* elimination */
139928f1b45aSHong Zhang bjtmp = bj + bi[i];
140028f1b45aSHong Zhang row = *bjtmp++; /* pivot row */
140128f1b45aSHong Zhang nzL = bi[i + 1] - bi[i];
140228f1b45aSHong Zhang for (k = 0; k < nzL; k++) {
1403b89f182dSHong Zhang pc1 = rtmp1 + row;
1404b89f182dSHong Zhang pc2 = rtmp2 + row;
140528f1b45aSHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0) {
140628f1b45aSHong Zhang pv = b->a + bdiag[row];
14079371c9d4SSatish Balay mul1 = *pc1 * (*pv);
14089371c9d4SSatish Balay mul2 = *pc2 * (*pv);
14099371c9d4SSatish Balay *pc1 = mul1;
14109371c9d4SSatish Balay *pc2 = mul2;
141128f1b45aSHong Zhang
141228f1b45aSHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
141328f1b45aSHong Zhang pv = b->a + bdiag[row + 1] + 1;
141428f1b45aSHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
141528f1b45aSHong Zhang for (j = 0; j < nz; j++) {
141668785679SHong Zhang col = pj[j];
1417b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j];
1418b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j];
141928f1b45aSHong Zhang }
14209566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz));
142128f1b45aSHong Zhang }
142228f1b45aSHong Zhang row = *bjtmp++;
142328f1b45aSHong Zhang }
142428f1b45aSHong Zhang
1425b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */
142628f1b45aSHong Zhang rs = 0.0;
142728f1b45aSHong Zhang /* L part */
1428b89f182dSHong Zhang pc1 = b->a + bi[i];
142928f1b45aSHong Zhang pj = b->j + bi[i];
143028f1b45aSHong Zhang nz = bi[i + 1] - bi[i];
143128f1b45aSHong Zhang for (j = 0; j < nz; j++) {
143268785679SHong Zhang col = pj[j];
14339371c9d4SSatish Balay pc1[j] = rtmp1[col];
14349371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]);
143528f1b45aSHong Zhang }
143628f1b45aSHong Zhang /* U part */
1437b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1;
143828f1b45aSHong Zhang pj = b->j + bdiag[i + 1] + 1;
14390e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
144028f1b45aSHong Zhang for (j = 0; j < nz; j++) {
144168785679SHong Zhang col = pj[j];
14429371c9d4SSatish Balay pc1[j] = rtmp1[col];
14439371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]);
144428f1b45aSHong Zhang }
144528f1b45aSHong Zhang
144628f1b45aSHong Zhang sctx.rs = rs;
1447b89f182dSHong Zhang sctx.pv = rtmp1[i];
14489566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i));
144907b50cabSHong Zhang if (sctx.newshift) break;
1450b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diagonal */
1451b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv;
1452b89f182dSHong Zhang
1453b89f182dSHong Zhang /* Now take care of diagonal 2x2 block. */
1454b89f182dSHong Zhang pc2 = rtmp2 + i;
1455b89f182dSHong Zhang if (*pc2 != 0.0) {
1456b89f182dSHong Zhang mul1 = (*pc2) * (*pc1); /* *pc1=diag[i] is inverted! */
1457b89f182dSHong Zhang *pc2 = mul1; /* insert L entry */
1458b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
1459b89f182dSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
1460b89f182dSHong Zhang for (j = 0; j < nz; j++) {
14619371c9d4SSatish Balay col = pj[j];
14629371c9d4SSatish Balay rtmp2[col] -= mul1 * rtmp1[col];
146328f1b45aSHong Zhang }
14649566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz));
1465b89f182dSHong Zhang }
1466b89f182dSHong Zhang
1467b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1468b89f182dSHong Zhang rs = 0.0;
1469b89f182dSHong Zhang /* L part */
1470b89f182dSHong Zhang pc2 = b->a + bi[i + 1];
1471b89f182dSHong Zhang pj = b->j + bi[i + 1];
1472b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1];
1473b89f182dSHong Zhang for (j = 0; j < nz; j++) {
1474b89f182dSHong Zhang col = pj[j];
14759371c9d4SSatish Balay pc2[j] = rtmp2[col];
14769371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]);
1477b89f182dSHong Zhang }
1478b89f182dSHong Zhang /* U part */
1479b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1;
14800e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1;
14810e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1482b89f182dSHong Zhang for (j = 0; j < nz; j++) {
1483b89f182dSHong Zhang col = pj[j];
14849371c9d4SSatish Balay pc2[j] = rtmp2[col];
14859371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]);
1486b89f182dSHong Zhang }
1487b89f182dSHong Zhang
148828f1b45aSHong Zhang sctx.rs = rs;
1489b89f182dSHong Zhang sctx.pv = rtmp2[i + 1];
14909566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
149107b50cabSHong Zhang if (sctx.newshift) break;
149228f1b45aSHong Zhang pc2 = b->a + bdiag[i + 1];
1493b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv;
149428f1b45aSHong Zhang break;
1495b89f182dSHong Zhang
149668785679SHong Zhang case 3:
149768785679SHong Zhang /* zero rtmp */
149868785679SHong Zhang /* L part */
149968785679SHong Zhang nz = bi[i + 1] - bi[i];
150068785679SHong Zhang bjtmp = bj + bi[i];
150168785679SHong Zhang for (j = 0; j < nz; j++) {
150268785679SHong Zhang col = bjtmp[j];
15039371c9d4SSatish Balay rtmp1[col] = 0.0;
15049371c9d4SSatish Balay rtmp2[col] = 0.0;
15059371c9d4SSatish Balay rtmp3[col] = 0.0;
150668785679SHong Zhang }
150768785679SHong Zhang
150868785679SHong Zhang /* U part */
150968785679SHong Zhang nz = bdiag[i] - bdiag[i + 1];
151068785679SHong Zhang bjtmp = bj + bdiag[i + 1] + 1;
151168785679SHong Zhang for (j = 0; j < nz; j++) {
151268785679SHong Zhang col = bjtmp[j];
15139371c9d4SSatish Balay rtmp1[col] = 0.0;
15149371c9d4SSatish Balay rtmp2[col] = 0.0;
15159371c9d4SSatish Balay rtmp3[col] = 0.0;
151668785679SHong Zhang }
151768785679SHong Zhang
151868785679SHong Zhang /* load in initial (unfactored row) */
151968785679SHong Zhang nz = ai[r[i] + 1] - ai[r[i]];
152068785679SHong Zhang ajtmp = aj + ai[r[i]];
15219371c9d4SSatish Balay v1 = aa + ai[r[i]];
15221a303e4dSPierre Jolivet v2 = aa + ai[r[i + 1]];
15231a303e4dSPierre Jolivet v3 = aa + ai[r[i + 2]];
152468785679SHong Zhang for (j = 0; j < nz; j++) {
152568785679SHong Zhang col = ics[ajtmp[j]];
15269371c9d4SSatish Balay rtmp1[col] = v1[j];
15279371c9d4SSatish Balay rtmp2[col] = v2[j];
15289371c9d4SSatish Balay rtmp3[col] = v3[j];
152968785679SHong Zhang }
153068785679SHong Zhang /* ZeropivotApply(): shift the diagonal of the matrix */
15319371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount;
15329371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount;
15339371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount;
153468785679SHong Zhang
153568785679SHong Zhang /* elimination */
153668785679SHong Zhang bjtmp = bj + bi[i];
153768785679SHong Zhang row = *bjtmp++; /* pivot row */
153868785679SHong Zhang nzL = bi[i + 1] - bi[i];
153968785679SHong Zhang for (k = 0; k < nzL; k++) {
1540b89f182dSHong Zhang pc1 = rtmp1 + row;
1541b89f182dSHong Zhang pc2 = rtmp2 + row;
1542b89f182dSHong Zhang pc3 = rtmp3 + row;
154368785679SHong Zhang if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) {
154468785679SHong Zhang pv = b->a + bdiag[row];
15459371c9d4SSatish Balay mul1 = *pc1 * (*pv);
15469371c9d4SSatish Balay mul2 = *pc2 * (*pv);
15479371c9d4SSatish Balay mul3 = *pc3 * (*pv);
15489371c9d4SSatish Balay *pc1 = mul1;
15499371c9d4SSatish Balay *pc2 = mul2;
15509371c9d4SSatish Balay *pc3 = mul3;
155168785679SHong Zhang
155268785679SHong Zhang pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
155368785679SHong Zhang pv = b->a + bdiag[row + 1] + 1;
155468785679SHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
155568785679SHong Zhang for (j = 0; j < nz; j++) {
155668785679SHong Zhang col = pj[j];
1557b89f182dSHong Zhang rtmp1[col] -= mul1 * pv[j];
1558b89f182dSHong Zhang rtmp2[col] -= mul2 * pv[j];
1559b89f182dSHong Zhang rtmp3[col] -= mul3 * pv[j];
156068785679SHong Zhang }
15619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz));
156268785679SHong Zhang }
156368785679SHong Zhang row = *bjtmp++;
156468785679SHong Zhang }
156568785679SHong Zhang
1566b89f182dSHong Zhang /* finished row i; check zero pivot, then stick row i into b->a */
1567b89f182dSHong Zhang rs = 0.0;
1568b89f182dSHong Zhang /* L part */
1569b89f182dSHong Zhang pc1 = b->a + bi[i];
1570b89f182dSHong Zhang pj = b->j + bi[i];
1571b89f182dSHong Zhang nz = bi[i + 1] - bi[i];
1572b89f182dSHong Zhang for (j = 0; j < nz; j++) {
1573b89f182dSHong Zhang col = pj[j];
15749371c9d4SSatish Balay pc1[j] = rtmp1[col];
15759371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]);
1576b89f182dSHong Zhang }
1577b89f182dSHong Zhang /* U part */
1578b89f182dSHong Zhang pc1 = b->a + bdiag[i + 1] + 1;
1579b89f182dSHong Zhang pj = b->j + bdiag[i + 1] + 1;
15800e7a5c2bSHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
1581b89f182dSHong Zhang for (j = 0; j < nz; j++) {
1582b89f182dSHong Zhang col = pj[j];
15839371c9d4SSatish Balay pc1[j] = rtmp1[col];
15849371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]);
1585b89f182dSHong Zhang }
158668785679SHong Zhang
1587b89f182dSHong Zhang sctx.rs = rs;
1588b89f182dSHong Zhang sctx.pv = rtmp1[i];
15899566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i));
159007b50cabSHong Zhang if (sctx.newshift) break;
1591b89f182dSHong Zhang pc1 = b->a + bdiag[i]; /* Mark diag[i] */
1592b89f182dSHong Zhang *pc1 = 1.0 / sctx.pv;
1593b89f182dSHong Zhang
1594b89f182dSHong Zhang /* Now take care of 1st column of diagonal 3x3 block. */
1595b89f182dSHong Zhang pc2 = rtmp2 + i;
1596b89f182dSHong Zhang pc3 = rtmp3 + i;
1597b89f182dSHong Zhang if (*pc2 != 0.0 || *pc3 != 0.0) {
15989371c9d4SSatish Balay mul2 = (*pc2) * (*pc1);
15999371c9d4SSatish Balay *pc2 = mul2;
16009371c9d4SSatish Balay mul3 = (*pc3) * (*pc1);
16019371c9d4SSatish Balay *pc3 = mul3;
160268785679SHong Zhang pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
160368785679SHong Zhang nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
160468785679SHong Zhang for (j = 0; j < nz; j++) {
160568785679SHong Zhang col = pj[j];
1606b89f182dSHong Zhang rtmp2[col] -= mul2 * rtmp1[col];
1607b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp1[col];
160868785679SHong Zhang }
16099566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2 + 4.0 * nz));
161068785679SHong Zhang }
161168785679SHong Zhang
1612b89f182dSHong Zhang /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
1613b89f182dSHong Zhang rs = 0.0;
1614b89f182dSHong Zhang /* L part */
1615b89f182dSHong Zhang pc2 = b->a + bi[i + 1];
1616b89f182dSHong Zhang pj = b->j + bi[i + 1];
1617b89f182dSHong Zhang nz = bi[i + 2] - bi[i + 1];
1618b89f182dSHong Zhang for (j = 0; j < nz; j++) {
1619b89f182dSHong Zhang col = pj[j];
16209371c9d4SSatish Balay pc2[j] = rtmp2[col];
16219371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]);
1622b89f182dSHong Zhang }
1623b89f182dSHong Zhang /* U part */
1624b89f182dSHong Zhang pc2 = b->a + bdiag[i + 2] + 1;
16250e7a5c2bSHong Zhang pj = b->j + bdiag[i + 2] + 1;
16260e7a5c2bSHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
1627b89f182dSHong Zhang for (j = 0; j < nz; j++) {
1628b89f182dSHong Zhang col = pj[j];
16299371c9d4SSatish Balay pc2[j] = rtmp2[col];
16309371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]);
1631b89f182dSHong Zhang }
1632b89f182dSHong Zhang
1633b89f182dSHong Zhang sctx.rs = rs;
1634b89f182dSHong Zhang sctx.pv = rtmp2[i + 1];
16359566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
163607b50cabSHong Zhang if (sctx.newshift) break;
1637b89f182dSHong Zhang pc2 = b->a + bdiag[i + 1];
1638b89f182dSHong Zhang *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
1639b89f182dSHong Zhang
1640b89f182dSHong Zhang /* Now take care of 2nd column of diagonal 3x3 block. */
1641b89f182dSHong Zhang pc3 = rtmp3 + i + 1;
164268785679SHong Zhang if (*pc3 != 0.0) {
16439371c9d4SSatish Balay mul3 = (*pc3) * (*pc2);
16449371c9d4SSatish Balay *pc3 = mul3;
164568785679SHong Zhang pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
164668785679SHong Zhang nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
164768785679SHong Zhang for (j = 0; j < nz; j++) {
164868785679SHong Zhang col = pj[j];
1649b89f182dSHong Zhang rtmp3[col] -= mul3 * rtmp2[col];
165068785679SHong Zhang }
16519566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz));
165268785679SHong Zhang }
165368785679SHong Zhang
1654b89f182dSHong Zhang /* finished i+2; check zero pivot, then stick row i+2 into b->a */
165568785679SHong Zhang rs = 0.0;
165668785679SHong Zhang /* L part */
1657b89f182dSHong Zhang pc3 = b->a + bi[i + 2];
1658b89f182dSHong Zhang pj = b->j + bi[i + 2];
1659b89f182dSHong Zhang nz = bi[i + 3] - bi[i + 2];
166068785679SHong Zhang for (j = 0; j < nz; j++) {
166168785679SHong Zhang col = pj[j];
16629371c9d4SSatish Balay pc3[j] = rtmp3[col];
16639371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]);
166468785679SHong Zhang }
166568785679SHong Zhang /* U part */
1666b89f182dSHong Zhang pc3 = b->a + bdiag[i + 3] + 1;
16670e7a5c2bSHong Zhang pj = b->j + bdiag[i + 3] + 1;
16680e7a5c2bSHong Zhang nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
166968785679SHong Zhang for (j = 0; j < nz; j++) {
167068785679SHong Zhang col = pj[j];
16719371c9d4SSatish Balay pc3[j] = rtmp3[col];
16729371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]);
167368785679SHong Zhang }
167468785679SHong Zhang
167568785679SHong Zhang sctx.rs = rs;
1676b89f182dSHong Zhang sctx.pv = rtmp3[i + 2];
16779566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
167807b50cabSHong Zhang if (sctx.newshift) break;
167968785679SHong Zhang pc3 = b->a + bdiag[i + 2];
1680b89f182dSHong Zhang *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
168168785679SHong Zhang break;
16829877982aSShri Abhyankar case 4:
16839877982aSShri Abhyankar /* zero rtmp */
16849877982aSShri Abhyankar /* L part */
16859877982aSShri Abhyankar nz = bi[i + 1] - bi[i];
16869877982aSShri Abhyankar bjtmp = bj + bi[i];
16879877982aSShri Abhyankar for (j = 0; j < nz; j++) {
16889877982aSShri Abhyankar col = bjtmp[j];
16899371c9d4SSatish Balay rtmp1[col] = 0.0;
16909371c9d4SSatish Balay rtmp2[col] = 0.0;
16919371c9d4SSatish Balay rtmp3[col] = 0.0;
16929371c9d4SSatish Balay rtmp4[col] = 0.0;
16939877982aSShri Abhyankar }
16949877982aSShri Abhyankar
16959877982aSShri Abhyankar /* U part */
16969877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1];
16979877982aSShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1;
16989877982aSShri Abhyankar for (j = 0; j < nz; j++) {
16999877982aSShri Abhyankar col = bjtmp[j];
17009371c9d4SSatish Balay rtmp1[col] = 0.0;
17019371c9d4SSatish Balay rtmp2[col] = 0.0;
17029371c9d4SSatish Balay rtmp3[col] = 0.0;
17039371c9d4SSatish Balay rtmp4[col] = 0.0;
17049877982aSShri Abhyankar }
17059877982aSShri Abhyankar
17069877982aSShri Abhyankar /* load in initial (unfactored row) */
17079877982aSShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]];
17089877982aSShri Abhyankar ajtmp = aj + ai[r[i]];
17099371c9d4SSatish Balay v1 = aa + ai[r[i]];
17101a303e4dSPierre Jolivet v2 = aa + ai[r[i + 1]];
17111a303e4dSPierre Jolivet v3 = aa + ai[r[i + 2]];
17121a303e4dSPierre Jolivet v4 = aa + ai[r[i + 3]];
17139877982aSShri Abhyankar for (j = 0; j < nz; j++) {
17149877982aSShri Abhyankar col = ics[ajtmp[j]];
17159371c9d4SSatish Balay rtmp1[col] = v1[j];
17169371c9d4SSatish Balay rtmp2[col] = v2[j];
17179371c9d4SSatish Balay rtmp3[col] = v3[j];
17189371c9d4SSatish Balay rtmp4[col] = v4[j];
17199877982aSShri Abhyankar }
17209877982aSShri Abhyankar /* ZeropivotApply(): shift the diagonal of the matrix */
17219371c9d4SSatish Balay rtmp1[i] += sctx.shift_amount;
17229371c9d4SSatish Balay rtmp2[i + 1] += sctx.shift_amount;
17239371c9d4SSatish Balay rtmp3[i + 2] += sctx.shift_amount;
17249371c9d4SSatish Balay rtmp4[i + 3] += sctx.shift_amount;
17259877982aSShri Abhyankar
17269877982aSShri Abhyankar /* elimination */
17279877982aSShri Abhyankar bjtmp = bj + bi[i];
17289877982aSShri Abhyankar row = *bjtmp++; /* pivot row */
17299877982aSShri Abhyankar nzL = bi[i + 1] - bi[i];
17309877982aSShri Abhyankar for (k = 0; k < nzL; k++) {
17319877982aSShri Abhyankar pc1 = rtmp1 + row;
17329877982aSShri Abhyankar pc2 = rtmp2 + row;
17339877982aSShri Abhyankar pc3 = rtmp3 + row;
17349877982aSShri Abhyankar pc4 = rtmp4 + row;
17359877982aSShri Abhyankar if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17369877982aSShri Abhyankar pv = b->a + bdiag[row];
17379371c9d4SSatish Balay mul1 = *pc1 * (*pv);
17389371c9d4SSatish Balay mul2 = *pc2 * (*pv);
17399371c9d4SSatish Balay mul3 = *pc3 * (*pv);
17409371c9d4SSatish Balay mul4 = *pc4 * (*pv);
17419371c9d4SSatish Balay *pc1 = mul1;
17429371c9d4SSatish Balay *pc2 = mul2;
17439371c9d4SSatish Balay *pc3 = mul3;
17449371c9d4SSatish Balay *pc4 = mul4;
17459877982aSShri Abhyankar
17469877982aSShri Abhyankar pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
17479877982aSShri Abhyankar pv = b->a + bdiag[row + 1] + 1;
17489877982aSShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
17499877982aSShri Abhyankar for (j = 0; j < nz; j++) {
17509877982aSShri Abhyankar col = pj[j];
17519877982aSShri Abhyankar rtmp1[col] -= mul1 * pv[j];
17529877982aSShri Abhyankar rtmp2[col] -= mul2 * pv[j];
17539877982aSShri Abhyankar rtmp3[col] -= mul3 * pv[j];
17549877982aSShri Abhyankar rtmp4[col] -= mul4 * pv[j];
17559877982aSShri Abhyankar }
17569566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4 + 8.0 * nz));
17579877982aSShri Abhyankar }
17589877982aSShri Abhyankar row = *bjtmp++;
17599877982aSShri Abhyankar }
17609877982aSShri Abhyankar
17619877982aSShri Abhyankar /* finished row i; check zero pivot, then stick row i into b->a */
17629877982aSShri Abhyankar rs = 0.0;
17639877982aSShri Abhyankar /* L part */
17649877982aSShri Abhyankar pc1 = b->a + bi[i];
17659877982aSShri Abhyankar pj = b->j + bi[i];
17669877982aSShri Abhyankar nz = bi[i + 1] - bi[i];
17679877982aSShri Abhyankar for (j = 0; j < nz; j++) {
17689877982aSShri Abhyankar col = pj[j];
17699371c9d4SSatish Balay pc1[j] = rtmp1[col];
17709371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]);
17719877982aSShri Abhyankar }
17729877982aSShri Abhyankar /* U part */
17739877982aSShri Abhyankar pc1 = b->a + bdiag[i + 1] + 1;
17749877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1;
17759877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* exclude diagonal */
17769877982aSShri Abhyankar for (j = 0; j < nz; j++) {
17779877982aSShri Abhyankar col = pj[j];
17789371c9d4SSatish Balay pc1[j] = rtmp1[col];
17799371c9d4SSatish Balay rs += PetscAbsScalar(pc1[j]);
17809877982aSShri Abhyankar }
17819877982aSShri Abhyankar
17829877982aSShri Abhyankar sctx.rs = rs;
17839877982aSShri Abhyankar sctx.pv = rtmp1[i];
17849566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i));
178507b50cabSHong Zhang if (sctx.newshift) break;
17869877982aSShri Abhyankar pc1 = b->a + bdiag[i]; /* Mark diag[i] */
17879877982aSShri Abhyankar *pc1 = 1.0 / sctx.pv;
17889877982aSShri Abhyankar
17899877982aSShri Abhyankar /* Now take care of 1st column of diagonal 4x4 block. */
17909877982aSShri Abhyankar pc2 = rtmp2 + i;
17919877982aSShri Abhyankar pc3 = rtmp3 + i;
17929877982aSShri Abhyankar pc4 = rtmp4 + i;
17939877982aSShri Abhyankar if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) {
17949371c9d4SSatish Balay mul2 = (*pc2) * (*pc1);
17959371c9d4SSatish Balay *pc2 = mul2;
17969371c9d4SSatish Balay mul3 = (*pc3) * (*pc1);
17979371c9d4SSatish Balay *pc3 = mul3;
17989371c9d4SSatish Balay mul4 = (*pc4) * (*pc1);
17999371c9d4SSatish Balay *pc4 = mul4;
18009877982aSShri Abhyankar pj = b->j + bdiag[i + 1] + 1; /* beginning of U(i,:) */
18019877982aSShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; /* num of entries in U(i,:) excluding diag */
18029877982aSShri Abhyankar for (j = 0; j < nz; j++) {
18039877982aSShri Abhyankar col = pj[j];
18049877982aSShri Abhyankar rtmp2[col] -= mul2 * rtmp1[col];
18059877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp1[col];
18069877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp1[col];
18079877982aSShri Abhyankar }
18089566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(3 + 6.0 * nz));
18099877982aSShri Abhyankar }
18109877982aSShri Abhyankar
18119877982aSShri Abhyankar /* finished row i+1; check zero pivot, then stick row i+1 into b->a */
18129877982aSShri Abhyankar rs = 0.0;
18139877982aSShri Abhyankar /* L part */
18149877982aSShri Abhyankar pc2 = b->a + bi[i + 1];
18159877982aSShri Abhyankar pj = b->j + bi[i + 1];
18169877982aSShri Abhyankar nz = bi[i + 2] - bi[i + 1];
18179877982aSShri Abhyankar for (j = 0; j < nz; j++) {
18189877982aSShri Abhyankar col = pj[j];
18199371c9d4SSatish Balay pc2[j] = rtmp2[col];
18209371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]);
18219877982aSShri Abhyankar }
18229877982aSShri Abhyankar /* U part */
18239877982aSShri Abhyankar pc2 = b->a + bdiag[i + 2] + 1;
18249877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1;
18259877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* exclude diagonal */
18269877982aSShri Abhyankar for (j = 0; j < nz; j++) {
18279877982aSShri Abhyankar col = pj[j];
18289371c9d4SSatish Balay pc2[j] = rtmp2[col];
18299371c9d4SSatish Balay rs += PetscAbsScalar(pc2[j]);
18309877982aSShri Abhyankar }
18319877982aSShri Abhyankar
18329877982aSShri Abhyankar sctx.rs = rs;
18339877982aSShri Abhyankar sctx.pv = rtmp2[i + 1];
18349566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 1));
183507b50cabSHong Zhang if (sctx.newshift) break;
18369877982aSShri Abhyankar pc2 = b->a + bdiag[i + 1];
18379877982aSShri Abhyankar *pc2 = 1.0 / sctx.pv; /* Mark diag[i+1] */
18389877982aSShri Abhyankar
18399877982aSShri Abhyankar /* Now take care of 2nd column of diagonal 4x4 block. */
18409877982aSShri Abhyankar pc3 = rtmp3 + i + 1;
18419877982aSShri Abhyankar pc4 = rtmp4 + i + 1;
18429877982aSShri Abhyankar if (*pc3 != 0.0 || *pc4 != 0.0) {
18439371c9d4SSatish Balay mul3 = (*pc3) * (*pc2);
18449371c9d4SSatish Balay *pc3 = mul3;
18459371c9d4SSatish Balay mul4 = (*pc4) * (*pc2);
18469371c9d4SSatish Balay *pc4 = mul4;
18479877982aSShri Abhyankar pj = b->j + bdiag[i + 2] + 1; /* beginning of U(i+1,:) */
18489877982aSShri Abhyankar nz = bdiag[i + 1] - bdiag[i + 2] - 1; /* num of entries in U(i+1,:) excluding diag */
18499877982aSShri Abhyankar for (j = 0; j < nz; j++) {
18509877982aSShri Abhyankar col = pj[j];
18519877982aSShri Abhyankar rtmp3[col] -= mul3 * rtmp2[col];
18529877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp2[col];
18539877982aSShri Abhyankar }
18549566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * nz));
18559877982aSShri Abhyankar }
18569877982aSShri Abhyankar
18579877982aSShri Abhyankar /* finished i+2; check zero pivot, then stick row i+2 into b->a */
18589877982aSShri Abhyankar rs = 0.0;
18599877982aSShri Abhyankar /* L part */
18609877982aSShri Abhyankar pc3 = b->a + bi[i + 2];
18619877982aSShri Abhyankar pj = b->j + bi[i + 2];
18629877982aSShri Abhyankar nz = bi[i + 3] - bi[i + 2];
18639877982aSShri Abhyankar for (j = 0; j < nz; j++) {
18649877982aSShri Abhyankar col = pj[j];
18659371c9d4SSatish Balay pc3[j] = rtmp3[col];
18669371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]);
18679877982aSShri Abhyankar }
18689877982aSShri Abhyankar /* U part */
18699877982aSShri Abhyankar pc3 = b->a + bdiag[i + 3] + 1;
18709877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1;
18719877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* exclude diagonal */
18729877982aSShri Abhyankar for (j = 0; j < nz; j++) {
18739877982aSShri Abhyankar col = pj[j];
18749371c9d4SSatish Balay pc3[j] = rtmp3[col];
18759371c9d4SSatish Balay rs += PetscAbsScalar(pc3[j]);
18769877982aSShri Abhyankar }
18779877982aSShri Abhyankar
18789877982aSShri Abhyankar sctx.rs = rs;
18799877982aSShri Abhyankar sctx.pv = rtmp3[i + 2];
18809566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 2));
188107b50cabSHong Zhang if (sctx.newshift) break;
18829877982aSShri Abhyankar pc3 = b->a + bdiag[i + 2];
18839877982aSShri Abhyankar *pc3 = 1.0 / sctx.pv; /* Mark diag[i+2] */
18849877982aSShri Abhyankar
18859877982aSShri Abhyankar /* Now take care of 3rd column of diagonal 4x4 block. */
18869877982aSShri Abhyankar pc4 = rtmp4 + i + 2;
18879877982aSShri Abhyankar if (*pc4 != 0.0) {
18889371c9d4SSatish Balay mul4 = (*pc4) * (*pc3);
18899371c9d4SSatish Balay *pc4 = mul4;
18909877982aSShri Abhyankar pj = b->j + bdiag[i + 3] + 1; /* beginning of U(i+2,:) */
18919877982aSShri Abhyankar nz = bdiag[i + 2] - bdiag[i + 3] - 1; /* num of entries in U(i+2,:) excluding diag */
18929877982aSShri Abhyankar for (j = 0; j < nz; j++) {
18939877982aSShri Abhyankar col = pj[j];
18949877982aSShri Abhyankar rtmp4[col] -= mul4 * rtmp3[col];
18959877982aSShri Abhyankar }
18969566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1 + 2.0 * nz));
18979877982aSShri Abhyankar }
18989877982aSShri Abhyankar
18999877982aSShri Abhyankar /* finished i+3; check zero pivot, then stick row i+3 into b->a */
19009877982aSShri Abhyankar rs = 0.0;
19019877982aSShri Abhyankar /* L part */
19029877982aSShri Abhyankar pc4 = b->a + bi[i + 3];
19039877982aSShri Abhyankar pj = b->j + bi[i + 3];
19049877982aSShri Abhyankar nz = bi[i + 4] - bi[i + 3];
19059877982aSShri Abhyankar for (j = 0; j < nz; j++) {
19069877982aSShri Abhyankar col = pj[j];
19079371c9d4SSatish Balay pc4[j] = rtmp4[col];
19089371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]);
19099877982aSShri Abhyankar }
19109877982aSShri Abhyankar /* U part */
19119877982aSShri Abhyankar pc4 = b->a + bdiag[i + 4] + 1;
19129877982aSShri Abhyankar pj = b->j + bdiag[i + 4] + 1;
19139877982aSShri Abhyankar nz = bdiag[i + 3] - bdiag[i + 4] - 1; /* exclude diagonal */
19149877982aSShri Abhyankar for (j = 0; j < nz; j++) {
19159877982aSShri Abhyankar col = pj[j];
19169371c9d4SSatish Balay pc4[j] = rtmp4[col];
19179371c9d4SSatish Balay rs += PetscAbsScalar(pc4[j]);
19189877982aSShri Abhyankar }
19199877982aSShri Abhyankar
19209877982aSShri Abhyankar sctx.rs = rs;
19219877982aSShri Abhyankar sctx.pv = rtmp4[i + 3];
19229566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i + 3));
192307b50cabSHong Zhang if (sctx.newshift) break;
19249877982aSShri Abhyankar pc4 = b->a + bdiag[i + 3];
19259877982aSShri Abhyankar *pc4 = 1.0 / sctx.pv; /* Mark diag[i+3] */
19269877982aSShri Abhyankar break;
192768785679SHong Zhang
1928d71ae5a4SJacob Faibussowitsch default:
1929d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Node size not yet supported ");
193028f1b45aSHong Zhang }
1931c2b86aeeSHong Zhang if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */
193228f1b45aSHong Zhang i += nodesz; /* Update the row */
193368785679SHong Zhang }
193428f1b45aSHong Zhang
193528f1b45aSHong Zhang /* MatPivotRefine() */
193607b50cabSHong Zhang if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
193728f1b45aSHong Zhang /*
193828f1b45aSHong Zhang * if no shift in this attempt & shifting & started shifting & can refine,
193928f1b45aSHong Zhang * then try lower shift
194028f1b45aSHong Zhang */
194128f1b45aSHong Zhang sctx.shift_hi = sctx.shift_fraction;
194228f1b45aSHong Zhang sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
194328f1b45aSHong Zhang sctx.shift_amount = sctx.shift_fraction * sctx.shift_top;
194407b50cabSHong Zhang sctx.newshift = PETSC_TRUE;
194528f1b45aSHong Zhang sctx.nshift++;
194628f1b45aSHong Zhang }
194707b50cabSHong Zhang } while (sctx.newshift);
194828f1b45aSHong Zhang
19499566063dSJacob Faibussowitsch PetscCall(PetscFree4(rtmp1, rtmp2, rtmp3, rtmp4));
19509566063dSJacob Faibussowitsch PetscCall(PetscFree(tmp_vec2));
19519566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic));
19529566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r));
195328f1b45aSHong Zhang
19544d12350bSJunchao Zhang if (b->inode.size_csr) {
1955abb87a52SBarry Smith C->ops->solve = MatSolve_SeqAIJ_Inode;
1956abb87a52SBarry Smith } else {
1957d3ac4fa3SBarry Smith C->ops->solve = MatSolve_SeqAIJ;
1958abb87a52SBarry Smith }
195928f1b45aSHong Zhang C->ops->solveadd = MatSolveAdd_SeqAIJ;
196028f1b45aSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqAIJ;
196128f1b45aSHong Zhang C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ;
196228f1b45aSHong Zhang C->ops->matsolve = MatMatSolve_SeqAIJ;
1963a3d9026eSPierre Jolivet C->ops->matsolvetranspose = MatMatSolveTranspose_SeqAIJ;
196428f1b45aSHong Zhang C->assembled = PETSC_TRUE;
196528f1b45aSHong Zhang C->preallocated = PETSC_TRUE;
19662205254eSKarl Rupp
19679566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->cmap->n));
196828f1b45aSHong Zhang
196928f1b45aSHong Zhang /* MatShiftView(A,info,&sctx) */
197028f1b45aSHong Zhang if (sctx.nshift) {
1971f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
19729566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
1973f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
19749566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
1975f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
19769566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
197728f1b45aSHong Zhang }
197828f1b45aSHong Zhang }
19793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
198028f1b45aSHong Zhang }
1981628f99d7SShri Abhyankar
MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)1982d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
1983d71ae5a4SJacob Faibussowitsch {
1984019b515eSShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
1985019b515eSShri Abhyankar IS iscol = a->col, isrow = a->row;
1986019b515eSShri Abhyankar const PetscInt *r, *c, *rout, *cout;
19874d12350bSJunchao Zhang PetscInt i, j;
19888758e1faSBarry Smith PetscInt node_max, row, nsz, aii, i0, i1, nz;
19898758e1faSBarry Smith const PetscInt *ai = a->i, *a_j = a->j, *ns, *vi, *ad, *aj;
1990019b515eSShri Abhyankar PetscScalar *x, *tmp, *tmps, tmp0, tmp1;
1991019b515eSShri Abhyankar PetscScalar sum1, sum2, sum3, sum4, sum5;
1992019b515eSShri Abhyankar const MatScalar *v1, *v2, *v3, *v4, *v5, *a_a = a->a, *aa;
1993019b515eSShri Abhyankar const PetscScalar *b;
1994019b515eSShri Abhyankar
1995019b515eSShri Abhyankar PetscFunctionBegin;
19964d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
1997019b515eSShri Abhyankar node_max = a->inode.node_count;
19984d12350bSJunchao Zhang ns = a->inode.size_csr; /* Node Size array */
1999019b515eSShri Abhyankar
20009566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b));
20019566063dSJacob Faibussowitsch PetscCall(VecGetArrayWrite(xx, &x));
2002019b515eSShri Abhyankar tmp = a->solve_work;
2003019b515eSShri Abhyankar
20049371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout));
20059371c9d4SSatish Balay r = rout;
20069371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout));
20079371c9d4SSatish Balay c = cout;
2008019b515eSShri Abhyankar
2009019b515eSShri Abhyankar /* forward solve the lower triangular */
2010019b515eSShri Abhyankar tmps = tmp;
2011019b515eSShri Abhyankar aa = a_a;
2012019b515eSShri Abhyankar aj = a_j;
2013019b515eSShri Abhyankar ad = a->diag;
2014019b515eSShri Abhyankar
20154d12350bSJunchao Zhang for (i = 0; i < node_max; ++i) {
20164d12350bSJunchao Zhang row = ns[i];
20174d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i];
2018019b515eSShri Abhyankar aii = ai[row];
2019019b515eSShri Abhyankar v1 = aa + aii;
2020019b515eSShri Abhyankar vi = aj + aii;
2021019b515eSShri Abhyankar nz = ai[row + 1] - ai[row];
2022019b515eSShri Abhyankar
202398991853SShri Abhyankar if (i < node_max - 1) {
202498991853SShri Abhyankar /* Prefetch the indices for the next block */
202550d8bf02SJed Brown PetscPrefetchBlock(aj + ai[row + nsz], ai[row + nsz + 1] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA); /* indices */
202698991853SShri Abhyankar /* Prefetch the data for the next block */
20274d12350bSJunchao Zhang PetscPrefetchBlock(aa + ai[row + nsz], ai[ns[i + 2]] - ai[row + nsz], 0, PETSC_PREFETCH_HINT_NTA);
202898991853SShri Abhyankar }
202998991853SShri Abhyankar
2030019b515eSShri Abhyankar switch (nsz) { /* Each loop in 'case' is unrolled */
2031019b515eSShri Abhyankar case 1:
2032019b515eSShri Abhyankar sum1 = b[r[row]];
2033019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2034019b515eSShri Abhyankar i0 = vi[j];
2035019b515eSShri Abhyankar i1 = vi[j + 1];
2036019b515eSShri Abhyankar tmp0 = tmps[i0];
2037019b515eSShri Abhyankar tmp1 = tmps[i1];
2038019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2039019b515eSShri Abhyankar }
2040019b515eSShri Abhyankar if (j == nz - 1) {
2041019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2042019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2043019b515eSShri Abhyankar }
2044019b515eSShri Abhyankar tmp[row++] = sum1;
2045019b515eSShri Abhyankar break;
2046019b515eSShri Abhyankar case 2:
2047019b515eSShri Abhyankar sum1 = b[r[row]];
2048019b515eSShri Abhyankar sum2 = b[r[row + 1]];
2049019b515eSShri Abhyankar v2 = aa + ai[row + 1];
2050019b515eSShri Abhyankar
2051019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2052019b515eSShri Abhyankar i0 = vi[j];
2053019b515eSShri Abhyankar i1 = vi[j + 1];
2054019b515eSShri Abhyankar tmp0 = tmps[i0];
2055019b515eSShri Abhyankar tmp1 = tmps[i1];
2056019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2057019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2058019b515eSShri Abhyankar }
2059019b515eSShri Abhyankar if (j == nz - 1) {
2060019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2061019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2062019b515eSShri Abhyankar sum2 -= v2[j] * tmp0;
2063019b515eSShri Abhyankar }
2064019b515eSShri Abhyankar sum2 -= v2[nz] * sum1;
2065019b515eSShri Abhyankar tmp[row++] = sum1;
2066019b515eSShri Abhyankar tmp[row++] = sum2;
2067019b515eSShri Abhyankar break;
2068019b515eSShri Abhyankar case 3:
2069019b515eSShri Abhyankar sum1 = b[r[row]];
2070019b515eSShri Abhyankar sum2 = b[r[row + 1]];
2071019b515eSShri Abhyankar sum3 = b[r[row + 2]];
2072019b515eSShri Abhyankar v2 = aa + ai[row + 1];
2073019b515eSShri Abhyankar v3 = aa + ai[row + 2];
2074019b515eSShri Abhyankar
2075019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2076019b515eSShri Abhyankar i0 = vi[j];
2077019b515eSShri Abhyankar i1 = vi[j + 1];
2078019b515eSShri Abhyankar tmp0 = tmps[i0];
2079019b515eSShri Abhyankar tmp1 = tmps[i1];
2080019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2081019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2082019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2083019b515eSShri Abhyankar }
2084019b515eSShri Abhyankar if (j == nz - 1) {
2085019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2086019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2087019b515eSShri Abhyankar sum2 -= v2[j] * tmp0;
2088019b515eSShri Abhyankar sum3 -= v3[j] * tmp0;
2089019b515eSShri Abhyankar }
2090019b515eSShri Abhyankar sum2 -= v2[nz] * sum1;
2091019b515eSShri Abhyankar sum3 -= v3[nz] * sum1;
2092019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2;
2093019b515eSShri Abhyankar tmp[row++] = sum1;
2094019b515eSShri Abhyankar tmp[row++] = sum2;
2095019b515eSShri Abhyankar tmp[row++] = sum3;
2096019b515eSShri Abhyankar break;
2097019b515eSShri Abhyankar
2098019b515eSShri Abhyankar case 4:
2099019b515eSShri Abhyankar sum1 = b[r[row]];
2100019b515eSShri Abhyankar sum2 = b[r[row + 1]];
2101019b515eSShri Abhyankar sum3 = b[r[row + 2]];
2102019b515eSShri Abhyankar sum4 = b[r[row + 3]];
2103019b515eSShri Abhyankar v2 = aa + ai[row + 1];
2104019b515eSShri Abhyankar v3 = aa + ai[row + 2];
2105019b515eSShri Abhyankar v4 = aa + ai[row + 3];
2106019b515eSShri Abhyankar
2107019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2108019b515eSShri Abhyankar i0 = vi[j];
2109019b515eSShri Abhyankar i1 = vi[j + 1];
2110019b515eSShri Abhyankar tmp0 = tmps[i0];
2111019b515eSShri Abhyankar tmp1 = tmps[i1];
2112019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2113019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2114019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2115019b515eSShri Abhyankar sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2116019b515eSShri Abhyankar }
2117019b515eSShri Abhyankar if (j == nz - 1) {
2118019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2119019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2120019b515eSShri Abhyankar sum2 -= v2[j] * tmp0;
2121019b515eSShri Abhyankar sum3 -= v3[j] * tmp0;
2122019b515eSShri Abhyankar sum4 -= v4[j] * tmp0;
2123019b515eSShri Abhyankar }
2124019b515eSShri Abhyankar sum2 -= v2[nz] * sum1;
2125019b515eSShri Abhyankar sum3 -= v3[nz] * sum1;
2126019b515eSShri Abhyankar sum4 -= v4[nz] * sum1;
2127019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2;
2128019b515eSShri Abhyankar sum4 -= v4[nz + 1] * sum2;
2129019b515eSShri Abhyankar sum4 -= v4[nz + 2] * sum3;
2130019b515eSShri Abhyankar
2131019b515eSShri Abhyankar tmp[row++] = sum1;
2132019b515eSShri Abhyankar tmp[row++] = sum2;
2133019b515eSShri Abhyankar tmp[row++] = sum3;
2134019b515eSShri Abhyankar tmp[row++] = sum4;
2135019b515eSShri Abhyankar break;
2136019b515eSShri Abhyankar case 5:
2137019b515eSShri Abhyankar sum1 = b[r[row]];
2138019b515eSShri Abhyankar sum2 = b[r[row + 1]];
2139019b515eSShri Abhyankar sum3 = b[r[row + 2]];
2140019b515eSShri Abhyankar sum4 = b[r[row + 3]];
2141019b515eSShri Abhyankar sum5 = b[r[row + 4]];
2142019b515eSShri Abhyankar v2 = aa + ai[row + 1];
2143019b515eSShri Abhyankar v3 = aa + ai[row + 2];
2144019b515eSShri Abhyankar v4 = aa + ai[row + 3];
2145019b515eSShri Abhyankar v5 = aa + ai[row + 4];
2146019b515eSShri Abhyankar
2147019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2148019b515eSShri Abhyankar i0 = vi[j];
2149019b515eSShri Abhyankar i1 = vi[j + 1];
2150019b515eSShri Abhyankar tmp0 = tmps[i0];
2151019b515eSShri Abhyankar tmp1 = tmps[i1];
2152019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2153019b515eSShri Abhyankar sum2 -= v2[j] * tmp0 + v2[j + 1] * tmp1;
2154019b515eSShri Abhyankar sum3 -= v3[j] * tmp0 + v3[j + 1] * tmp1;
2155019b515eSShri Abhyankar sum4 -= v4[j] * tmp0 + v4[j + 1] * tmp1;
2156019b515eSShri Abhyankar sum5 -= v5[j] * tmp0 + v5[j + 1] * tmp1;
2157019b515eSShri Abhyankar }
2158019b515eSShri Abhyankar if (j == nz - 1) {
2159019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2160019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2161019b515eSShri Abhyankar sum2 -= v2[j] * tmp0;
2162019b515eSShri Abhyankar sum3 -= v3[j] * tmp0;
2163019b515eSShri Abhyankar sum4 -= v4[j] * tmp0;
2164019b515eSShri Abhyankar sum5 -= v5[j] * tmp0;
2165019b515eSShri Abhyankar }
2166019b515eSShri Abhyankar
2167019b515eSShri Abhyankar sum2 -= v2[nz] * sum1;
2168019b515eSShri Abhyankar sum3 -= v3[nz] * sum1;
2169019b515eSShri Abhyankar sum4 -= v4[nz] * sum1;
2170019b515eSShri Abhyankar sum5 -= v5[nz] * sum1;
2171019b515eSShri Abhyankar sum3 -= v3[nz + 1] * sum2;
2172019b515eSShri Abhyankar sum4 -= v4[nz + 1] * sum2;
2173019b515eSShri Abhyankar sum5 -= v5[nz + 1] * sum2;
2174019b515eSShri Abhyankar sum4 -= v4[nz + 2] * sum3;
2175019b515eSShri Abhyankar sum5 -= v5[nz + 2] * sum3;
2176019b515eSShri Abhyankar sum5 -= v5[nz + 3] * sum4;
2177019b515eSShri Abhyankar
2178019b515eSShri Abhyankar tmp[row++] = sum1;
2179019b515eSShri Abhyankar tmp[row++] = sum2;
2180019b515eSShri Abhyankar tmp[row++] = sum3;
2181019b515eSShri Abhyankar tmp[row++] = sum4;
2182019b515eSShri Abhyankar tmp[row++] = sum5;
2183019b515eSShri Abhyankar break;
2184d71ae5a4SJacob Faibussowitsch default:
2185d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2186019b515eSShri Abhyankar }
2187019b515eSShri Abhyankar }
2188019b515eSShri Abhyankar /* backward solve the upper triangular */
21894d12350bSJunchao Zhang for (i = node_max - 1; i >= 0; i--) {
21904d12350bSJunchao Zhang row = ns[i + 1] - 1;
21914d12350bSJunchao Zhang nsz = ns[i + 1] - ns[i];
2192019b515eSShri Abhyankar aii = ad[row + 1] + 1;
2193019b515eSShri Abhyankar v1 = aa + aii;
2194019b515eSShri Abhyankar vi = aj + aii;
2195019b515eSShri Abhyankar nz = ad[row] - ad[row + 1] - 1;
219698991853SShri Abhyankar
219798991853SShri Abhyankar if (i > 0) {
219898991853SShri Abhyankar /* Prefetch the indices for the next block */
219950d8bf02SJed Brown PetscPrefetchBlock(aj + ad[row - nsz + 1] + 1, ad[row - nsz] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
220098991853SShri Abhyankar /* Prefetch the data for the next block */
22014d12350bSJunchao Zhang PetscPrefetchBlock(aa + ad[row - nsz + 1] + 1, ad[ns[i - 1] + 1] - ad[row - nsz + 1], 0, PETSC_PREFETCH_HINT_NTA);
220298991853SShri Abhyankar }
220398991853SShri Abhyankar
2204019b515eSShri Abhyankar switch (nsz) { /* Each loop in 'case' is unrolled */
2205019b515eSShri Abhyankar case 1:
2206019b515eSShri Abhyankar sum1 = tmp[row];
2207019b515eSShri Abhyankar
2208019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2209019b515eSShri Abhyankar i0 = vi[j];
2210019b515eSShri Abhyankar i1 = vi[j + 1];
2211019b515eSShri Abhyankar tmp0 = tmps[i0];
2212019b515eSShri Abhyankar tmp1 = tmps[i1];
2213019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2214019b515eSShri Abhyankar }
2215019b515eSShri Abhyankar if (j == nz - 1) {
2216019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2217019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2218019b515eSShri Abhyankar }
22199371c9d4SSatish Balay x[c[row]] = tmp[row] = sum1 * v1[nz];
22209371c9d4SSatish Balay row--;
2221019b515eSShri Abhyankar break;
2222019b515eSShri Abhyankar case 2:
2223019b515eSShri Abhyankar sum1 = tmp[row];
2224019b515eSShri Abhyankar sum2 = tmp[row - 1];
2225019b515eSShri Abhyankar v2 = aa + ad[row] + 1;
2226019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2227019b515eSShri Abhyankar i0 = vi[j];
2228019b515eSShri Abhyankar i1 = vi[j + 1];
2229019b515eSShri Abhyankar tmp0 = tmps[i0];
2230019b515eSShri Abhyankar tmp1 = tmps[i1];
2231019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2232019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2233019b515eSShri Abhyankar }
2234019b515eSShri Abhyankar if (j == nz - 1) {
2235019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2236019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2237019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0;
2238019b515eSShri Abhyankar }
2239019b515eSShri Abhyankar
22409371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
22419371c9d4SSatish Balay row--;
2242019b515eSShri Abhyankar sum2 -= v2[0] * tmp0;
22439371c9d4SSatish Balay x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
22449371c9d4SSatish Balay row--;
2245019b515eSShri Abhyankar break;
2246019b515eSShri Abhyankar case 3:
2247019b515eSShri Abhyankar sum1 = tmp[row];
2248019b515eSShri Abhyankar sum2 = tmp[row - 1];
2249019b515eSShri Abhyankar sum3 = tmp[row - 2];
2250019b515eSShri Abhyankar v2 = aa + ad[row] + 1;
2251019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1;
2252019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2253019b515eSShri Abhyankar i0 = vi[j];
2254019b515eSShri Abhyankar i1 = vi[j + 1];
2255019b515eSShri Abhyankar tmp0 = tmps[i0];
2256019b515eSShri Abhyankar tmp1 = tmps[i1];
2257019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2258019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2259019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2260019b515eSShri Abhyankar }
2261019b515eSShri Abhyankar if (j == nz - 1) {
2262019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2263019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2264019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0;
2265019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0;
2266019b515eSShri Abhyankar }
22679371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
22689371c9d4SSatish Balay row--;
2269019b515eSShri Abhyankar sum2 -= v2[0] * tmp0;
2270019b515eSShri Abhyankar sum3 -= v3[1] * tmp0;
22719371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
22729371c9d4SSatish Balay row--;
2273019b515eSShri Abhyankar sum3 -= v3[0] * tmp0;
22749371c9d4SSatish Balay x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
22759371c9d4SSatish Balay row--;
2276019b515eSShri Abhyankar
2277019b515eSShri Abhyankar break;
2278019b515eSShri Abhyankar case 4:
2279019b515eSShri Abhyankar sum1 = tmp[row];
2280019b515eSShri Abhyankar sum2 = tmp[row - 1];
2281019b515eSShri Abhyankar sum3 = tmp[row - 2];
2282019b515eSShri Abhyankar sum4 = tmp[row - 3];
2283019b515eSShri Abhyankar v2 = aa + ad[row] + 1;
2284019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1;
2285019b515eSShri Abhyankar v4 = aa + ad[row - 2] + 1;
2286019b515eSShri Abhyankar
2287019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2288019b515eSShri Abhyankar i0 = vi[j];
2289019b515eSShri Abhyankar i1 = vi[j + 1];
2290019b515eSShri Abhyankar tmp0 = tmps[i0];
2291019b515eSShri Abhyankar tmp1 = tmps[i1];
2292019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2293019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2294019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2295019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2296019b515eSShri Abhyankar }
2297019b515eSShri Abhyankar if (j == nz - 1) {
2298019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2299019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2300019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0;
2301019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0;
2302019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0;
2303019b515eSShri Abhyankar }
2304019b515eSShri Abhyankar
23059371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
23069371c9d4SSatish Balay row--;
2307019b515eSShri Abhyankar sum2 -= v2[0] * tmp0;
2308019b515eSShri Abhyankar sum3 -= v3[1] * tmp0;
2309019b515eSShri Abhyankar sum4 -= v4[2] * tmp0;
23109371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
23119371c9d4SSatish Balay row--;
2312019b515eSShri Abhyankar sum3 -= v3[0] * tmp0;
2313019b515eSShri Abhyankar sum4 -= v4[1] * tmp0;
23149371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
23159371c9d4SSatish Balay row--;
2316019b515eSShri Abhyankar sum4 -= v4[0] * tmp0;
23179371c9d4SSatish Balay x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
23189371c9d4SSatish Balay row--;
2319019b515eSShri Abhyankar break;
2320019b515eSShri Abhyankar case 5:
2321019b515eSShri Abhyankar sum1 = tmp[row];
2322019b515eSShri Abhyankar sum2 = tmp[row - 1];
2323019b515eSShri Abhyankar sum3 = tmp[row - 2];
2324019b515eSShri Abhyankar sum4 = tmp[row - 3];
2325019b515eSShri Abhyankar sum5 = tmp[row - 4];
2326019b515eSShri Abhyankar v2 = aa + ad[row] + 1;
2327019b515eSShri Abhyankar v3 = aa + ad[row - 1] + 1;
2328019b515eSShri Abhyankar v4 = aa + ad[row - 2] + 1;
2329019b515eSShri Abhyankar v5 = aa + ad[row - 3] + 1;
2330019b515eSShri Abhyankar for (j = 0; j < nz - 1; j += 2) {
2331019b515eSShri Abhyankar i0 = vi[j];
2332019b515eSShri Abhyankar i1 = vi[j + 1];
2333019b515eSShri Abhyankar tmp0 = tmps[i0];
2334019b515eSShri Abhyankar tmp1 = tmps[i1];
2335019b515eSShri Abhyankar sum1 -= v1[j] * tmp0 + v1[j + 1] * tmp1;
2336019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0 + v2[j + 2] * tmp1;
2337019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0 + v3[j + 3] * tmp1;
2338019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0 + v4[j + 4] * tmp1;
2339019b515eSShri Abhyankar sum5 -= v5[j + 4] * tmp0 + v5[j + 5] * tmp1;
2340019b515eSShri Abhyankar }
2341019b515eSShri Abhyankar if (j == nz - 1) {
2342019b515eSShri Abhyankar tmp0 = tmps[vi[j]];
2343019b515eSShri Abhyankar sum1 -= v1[j] * tmp0;
2344019b515eSShri Abhyankar sum2 -= v2[j + 1] * tmp0;
2345019b515eSShri Abhyankar sum3 -= v3[j + 2] * tmp0;
2346019b515eSShri Abhyankar sum4 -= v4[j + 3] * tmp0;
2347019b515eSShri Abhyankar sum5 -= v5[j + 4] * tmp0;
2348019b515eSShri Abhyankar }
2349019b515eSShri Abhyankar
23509371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum1 * v1[nz];
23519371c9d4SSatish Balay row--;
2352019b515eSShri Abhyankar sum2 -= v2[0] * tmp0;
2353019b515eSShri Abhyankar sum3 -= v3[1] * tmp0;
2354019b515eSShri Abhyankar sum4 -= v4[2] * tmp0;
2355019b515eSShri Abhyankar sum5 -= v5[3] * tmp0;
23569371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum2 * v2[nz + 1];
23579371c9d4SSatish Balay row--;
2358019b515eSShri Abhyankar sum3 -= v3[0] * tmp0;
2359019b515eSShri Abhyankar sum4 -= v4[1] * tmp0;
2360019b515eSShri Abhyankar sum5 -= v5[2] * tmp0;
23619371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum3 * v3[nz + 2];
23629371c9d4SSatish Balay row--;
2363019b515eSShri Abhyankar sum4 -= v4[0] * tmp0;
2364019b515eSShri Abhyankar sum5 -= v5[1] * tmp0;
23659371c9d4SSatish Balay tmp0 = x[c[row]] = tmp[row] = sum4 * v4[nz + 3];
23669371c9d4SSatish Balay row--;
2367019b515eSShri Abhyankar sum5 -= v5[0] * tmp0;
23689371c9d4SSatish Balay x[c[row]] = tmp[row] = sum5 * v5[nz + 4];
23699371c9d4SSatish Balay row--;
2370019b515eSShri Abhyankar break;
2371d71ae5a4SJacob Faibussowitsch default:
2372d71ae5a4SJacob Faibussowitsch SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not yet supported ");
2373019b515eSShri Abhyankar }
2374019b515eSShri Abhyankar }
23759566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout));
23769566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout));
23779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b));
23789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayWrite(xx, &x));
23799566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
23803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
2381019b515eSShri Abhyankar }
2382019b515eSShri Abhyankar
23834c1414c8SBarry Smith /*
23844c1414c8SBarry Smith Makes a longer coloring[] array and calls the usual code with that
23854c1414c8SBarry Smith */
MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring * iscoloring)238666976f2fSJacob Faibussowitsch static PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat, PetscInt ncolors, PetscInt nin, ISColoringValue coloring[], ISColoring *iscoloring)
2387d71ae5a4SJacob Faibussowitsch {
23884c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)mat->data;
23894d12350bSJunchao Zhang PetscInt n = mat->cmap->n, m = a->inode.node_count, j, *ns = a->inode.size_csr, row;
23904c1414c8SBarry Smith PetscInt *colorused, i;
23914c1414c8SBarry Smith ISColoringValue *newcolor;
23924c1414c8SBarry Smith
23934c1414c8SBarry Smith PetscFunctionBegin;
23944d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
23959566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &newcolor));
23964c1414c8SBarry Smith /* loop over inodes, marking a color for each column*/
23974c1414c8SBarry Smith row = 0;
23984c1414c8SBarry Smith for (i = 0; i < m; i++) {
23994d12350bSJunchao Zhang for (j = 0; j < (ns[i + 1] - ns[i]); j++) PetscCall(ISColoringValueCast(coloring[i] + j * ncolors, newcolor + row++));
24004c1414c8SBarry Smith }
24014c1414c8SBarry Smith
24024c1414c8SBarry Smith /* eliminate unneeded colors */
24039566063dSJacob Faibussowitsch PetscCall(PetscCalloc1(5 * ncolors, &colorused));
2404ad540459SPierre Jolivet for (i = 0; i < n; i++) colorused[newcolor[i]] = 1;
24054c1414c8SBarry Smith
2406ad540459SPierre Jolivet for (i = 1; i < 5 * ncolors; i++) colorused[i] += colorused[i - 1];
24074c1414c8SBarry Smith ncolors = colorused[5 * ncolors - 1];
24086497c311SBarry Smith for (i = 0; i < n; i++) PetscCall(ISColoringValueCast(colorused[newcolor[i]] - 1, newcolor + i));
24099566063dSJacob Faibussowitsch PetscCall(PetscFree(colorused));
24109566063dSJacob Faibussowitsch PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat), ncolors, n, newcolor, PETSC_OWN_POINTER, iscoloring));
24119566063dSJacob Faibussowitsch PetscCall(PetscFree(coloring));
24123ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
24134c1414c8SBarry Smith }
24144c1414c8SBarry Smith
2415af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
24162af78befSBarry Smith
241707425a8dSBarry Smith /*
241807425a8dSBarry Smith Negative shift indicates do not generate an error if there is a zero diagonal, just invert it anyways
241907425a8dSBarry Smith */
MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A,PetscScalar omega,PetscScalar fshift)242007425a8dSBarry Smith static PetscErrorCode MatInvertDiagonalForSOR_SeqAIJ_Inode(Mat A, PetscScalar omega, PetscScalar fshift)
2421d71ae5a4SJacob Faibussowitsch {
24222af78befSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
242307425a8dSBarry Smith MatScalar *ibdiag, *bdiag, work[25];
242407425a8dSBarry Smith const MatScalar *v = a->a;
24257b6c816cSBarry Smith PetscReal zeropivot = 100. * PETSC_MACHINE_EPSILON, shift = 0.0;
242607425a8dSBarry Smith PetscInt m = a->inode.node_count, cnt = 0, i, j, row, nodesz;
242707425a8dSBarry Smith PetscInt k, ipvt[5];
242807425a8dSBarry Smith PetscBool allowzeropivot = PetscNot(A->erroriffailure), zeropivotdetected;
2429*421480d9SBarry Smith const PetscInt *sizes = a->inode.size_csr, *diag;
24302af78befSBarry Smith
24312af78befSBarry Smith PetscFunctionBegin;
243207425a8dSBarry Smith if (a->idiagState == ((PetscObject)A)->state) PetscFunctionReturn(PETSC_SUCCESS);
2433*421480d9SBarry Smith PetscCall(MatGetDiagonalMarkers_SeqAIJ(A, &diag, NULL));
24342af78befSBarry Smith if (!a->inode.ibdiag) {
24352af78befSBarry Smith /* calculate space needed for diagonal blocks */
24364d12350bSJunchao Zhang for (i = 0; i < m; i++) {
24374d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
24384d12350bSJunchao Zhang cnt += nodesz * nodesz;
24394d12350bSJunchao Zhang }
2440f0d39aaaSBarry Smith a->inode.bdiagsize = cnt;
24419566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(cnt, &a->inode.ibdiag, cnt, &a->inode.bdiag, A->rmap->n, &a->inode.ssor_work));
244271f1c65dSBarry Smith }
244371f1c65dSBarry Smith
244471f1c65dSBarry Smith /* copy over the diagonal blocks and invert them */
24452af78befSBarry Smith ibdiag = a->inode.ibdiag;
24462af78befSBarry Smith bdiag = a->inode.bdiag;
24472af78befSBarry Smith cnt = 0;
24482af78befSBarry Smith for (i = 0, row = 0; i < m; i++) {
24494d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
24504d12350bSJunchao Zhang for (j = 0; j < nodesz; j++) {
24514d12350bSJunchao Zhang for (k = 0; k < nodesz; k++) bdiag[cnt + k * nodesz + j] = v[diag[row + j] - j + k];
24522af78befSBarry Smith }
24534d12350bSJunchao Zhang PetscCall(PetscArraycpy(ibdiag + cnt, bdiag + cnt, nodesz * nodesz));
24542af78befSBarry Smith
24554d12350bSJunchao Zhang switch (nodesz) {
24562af78befSBarry Smith case 1:
24572af78befSBarry Smith /* Create matrix data structure */
24588e0e2a9aSHong Zhang if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) {
2459966bd95aSPierre Jolivet PetscCheck(allowzeropivot, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot on row %" PetscInt_FMT, row);
24607b6c816cSBarry Smith A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
24617b6c816cSBarry Smith A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]);
24627b6c816cSBarry Smith A->factorerror_zeropivot_row = row;
24639566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Zero pivot, row %" PetscInt_FMT "\n", row));
24648e0e2a9aSHong Zhang }
246564c62002SMatthew Knepley ibdiag[cnt] = 1.0 / ibdiag[cnt];
24662af78befSBarry Smith break;
24672af78befSBarry Smith case 2:
24689566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
24697b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
24702af78befSBarry Smith break;
24712af78befSBarry Smith case 3:
24729566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
24737b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
24742af78befSBarry Smith break;
24752af78befSBarry Smith case 4:
24769566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag + cnt, shift, allowzeropivot, &zeropivotdetected));
24777b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
24782af78befSBarry Smith break;
24792af78befSBarry Smith case 5:
24809566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag + cnt, ipvt, work, shift, allowzeropivot, &zeropivotdetected));
24817b6c816cSBarry Smith if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
24822af78befSBarry Smith break;
2483d71ae5a4SJacob Faibussowitsch default:
24844d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
24852af78befSBarry Smith }
24864d12350bSJunchao Zhang cnt += nodesz * nodesz;
24874d12350bSJunchao Zhang row += nodesz;
24882af78befSBarry Smith }
248907425a8dSBarry Smith a->inode.ibdiagState = ((PetscObject)A)->state;
249007425a8dSBarry Smith PetscFunctionReturn(PETSC_SUCCESS);
24912af78befSBarry Smith }
249207425a8dSBarry Smith
MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)249307425a8dSBarry Smith PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
249407425a8dSBarry Smith {
249507425a8dSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
249607425a8dSBarry Smith PetscScalar sum1 = 0.0, sum2 = 0.0, sum3 = 0.0, sum4 = 0.0, sum5 = 0.0, tmp0, tmp1, tmp2, tmp3;
249707425a8dSBarry Smith MatScalar *ibdiag, *bdiag, *t;
249807425a8dSBarry Smith PetscScalar *x, tmp4, tmp5, x1, x2, x3, x4, x5;
249907425a8dSBarry Smith const MatScalar *v1 = NULL, *v2 = NULL, *v3 = NULL, *v4 = NULL, *v5 = NULL;
250007425a8dSBarry Smith const PetscScalar *xb, *b;
250107425a8dSBarry Smith PetscInt n, m = a->inode.node_count, cnt = 0, i, row, i1, i2, nodesz;
250207425a8dSBarry Smith PetscInt sz;
250307425a8dSBarry Smith const PetscInt *sizes = a->inode.size_csr, *idx, *diag, *ii = a->i;
250407425a8dSBarry Smith
250507425a8dSBarry Smith PetscFunctionBegin;
250607425a8dSBarry Smith PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
250707425a8dSBarry Smith PetscCheck(omega == 1.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for omega != 1.0; use -mat_no_inode");
250807425a8dSBarry Smith PetscCheck(fshift == 0.0, PETSC_COMM_SELF, PETSC_ERR_SUP, "No support for fshift != 0.0; use -mat_no_inode");
250907425a8dSBarry Smith PetscCall(MatInvertDiagonalForSOR_SeqAIJ_Inode(A, omega, fshift));
251007425a8dSBarry Smith diag = a->diag;
251107425a8dSBarry Smith
25122af78befSBarry Smith ibdiag = a->inode.ibdiag;
25132af78befSBarry Smith bdiag = a->inode.bdiag;
25145850ef23SBarry Smith t = a->inode.ssor_work;
25152af78befSBarry Smith
25169566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x));
25179566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b));
25185850ef23SBarry Smith /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */
25195850ef23SBarry Smith if (flag & SOR_ZERO_INITIAL_GUESS) {
25202af78befSBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
25218862d2efSBarry Smith for (i = 0, row = 0; i < m; i++) {
25228862d2efSBarry Smith sz = diag[row] - ii[row];
25238862d2efSBarry Smith v1 = a->a + ii[row];
25248862d2efSBarry Smith idx = a->j + ii[row];
25258862d2efSBarry Smith
25264108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
25274d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
25284d12350bSJunchao Zhang switch (nodesz) {
25298862d2efSBarry Smith case 1:
25308862d2efSBarry Smith
25318862d2efSBarry Smith sum1 = b[row];
25328862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) {
25338862d2efSBarry Smith i1 = idx[0];
25348862d2efSBarry Smith i2 = idx[1];
25358862d2efSBarry Smith idx += 2;
25368862d2efSBarry Smith tmp0 = x[i1];
25378862d2efSBarry Smith tmp1 = x[i2];
25389371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
25399371c9d4SSatish Balay v1 += 2;
25408862d2efSBarry Smith }
25418862d2efSBarry Smith
25428862d2efSBarry Smith if (n == sz - 1) {
2543f0d39aaaSBarry Smith tmp0 = x[*idx];
2544f0d39aaaSBarry Smith sum1 -= *v1 * tmp0;
25458862d2efSBarry Smith }
25465850ef23SBarry Smith t[row] = sum1;
25478862d2efSBarry Smith x[row++] = sum1 * (*ibdiag++);
25488862d2efSBarry Smith break;
2549f0d39aaaSBarry Smith case 2:
2550f0d39aaaSBarry Smith v2 = a->a + ii[row + 1];
2551f0d39aaaSBarry Smith sum1 = b[row];
2552f0d39aaaSBarry Smith sum2 = b[row + 1];
2553f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) {
2554f0d39aaaSBarry Smith i1 = idx[0];
2555f0d39aaaSBarry Smith i2 = idx[1];
2556f0d39aaaSBarry Smith idx += 2;
2557f0d39aaaSBarry Smith tmp0 = x[i1];
2558f0d39aaaSBarry Smith tmp1 = x[i2];
25599371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
25609371c9d4SSatish Balay v1 += 2;
25619371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
25629371c9d4SSatish Balay v2 += 2;
2563f0d39aaaSBarry Smith }
2564f0d39aaaSBarry Smith
2565f0d39aaaSBarry Smith if (n == sz - 1) {
2566f0d39aaaSBarry Smith tmp0 = x[*idx];
2567f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0;
2568f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0;
2569f0d39aaaSBarry Smith }
25705850ef23SBarry Smith t[row] = sum1;
25715850ef23SBarry Smith t[row + 1] = sum2;
2572f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2573f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
2574f0d39aaaSBarry Smith ibdiag += 4;
2575f0d39aaaSBarry Smith break;
2576f0d39aaaSBarry Smith case 3:
2577f0d39aaaSBarry Smith v2 = a->a + ii[row + 1];
2578f0d39aaaSBarry Smith v3 = a->a + ii[row + 2];
2579f0d39aaaSBarry Smith sum1 = b[row];
2580f0d39aaaSBarry Smith sum2 = b[row + 1];
2581f0d39aaaSBarry Smith sum3 = b[row + 2];
2582f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) {
2583f0d39aaaSBarry Smith i1 = idx[0];
2584f0d39aaaSBarry Smith i2 = idx[1];
2585f0d39aaaSBarry Smith idx += 2;
2586f0d39aaaSBarry Smith tmp0 = x[i1];
2587f0d39aaaSBarry Smith tmp1 = x[i2];
25889371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
25899371c9d4SSatish Balay v1 += 2;
25909371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
25919371c9d4SSatish Balay v2 += 2;
25929371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
25939371c9d4SSatish Balay v3 += 2;
2594f0d39aaaSBarry Smith }
2595f0d39aaaSBarry Smith
2596f0d39aaaSBarry Smith if (n == sz - 1) {
2597f0d39aaaSBarry Smith tmp0 = x[*idx];
2598f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0;
2599f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0;
2600f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0;
2601f0d39aaaSBarry Smith }
26025850ef23SBarry Smith t[row] = sum1;
26035850ef23SBarry Smith t[row + 1] = sum2;
26045850ef23SBarry Smith t[row + 2] = sum3;
2605f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
2606f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
2607f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
2608f0d39aaaSBarry Smith ibdiag += 9;
2609f0d39aaaSBarry Smith break;
2610f0d39aaaSBarry Smith case 4:
2611f0d39aaaSBarry Smith v2 = a->a + ii[row + 1];
2612f0d39aaaSBarry Smith v3 = a->a + ii[row + 2];
2613f0d39aaaSBarry Smith v4 = a->a + ii[row + 3];
2614f0d39aaaSBarry Smith sum1 = b[row];
2615f0d39aaaSBarry Smith sum2 = b[row + 1];
2616f0d39aaaSBarry Smith sum3 = b[row + 2];
2617f0d39aaaSBarry Smith sum4 = b[row + 3];
2618f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) {
2619f0d39aaaSBarry Smith i1 = idx[0];
2620f0d39aaaSBarry Smith i2 = idx[1];
2621f0d39aaaSBarry Smith idx += 2;
2622f0d39aaaSBarry Smith tmp0 = x[i1];
2623f0d39aaaSBarry Smith tmp1 = x[i2];
26249371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
26259371c9d4SSatish Balay v1 += 2;
26269371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
26279371c9d4SSatish Balay v2 += 2;
26289371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
26299371c9d4SSatish Balay v3 += 2;
26309371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
26319371c9d4SSatish Balay v4 += 2;
2632f0d39aaaSBarry Smith }
2633f0d39aaaSBarry Smith
2634f0d39aaaSBarry Smith if (n == sz - 1) {
2635f0d39aaaSBarry Smith tmp0 = x[*idx];
2636f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0;
2637f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0;
2638f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0;
2639f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0;
2640f0d39aaaSBarry Smith }
26415850ef23SBarry Smith t[row] = sum1;
26425850ef23SBarry Smith t[row + 1] = sum2;
26435850ef23SBarry Smith t[row + 2] = sum3;
26445850ef23SBarry Smith t[row + 3] = sum4;
2645f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
2646f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
2647f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
2648f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
2649f0d39aaaSBarry Smith ibdiag += 16;
2650f0d39aaaSBarry Smith break;
2651f0d39aaaSBarry Smith case 5:
2652f0d39aaaSBarry Smith v2 = a->a + ii[row + 1];
2653f0d39aaaSBarry Smith v3 = a->a + ii[row + 2];
2654f0d39aaaSBarry Smith v4 = a->a + ii[row + 3];
2655f0d39aaaSBarry Smith v5 = a->a + ii[row + 4];
2656f0d39aaaSBarry Smith sum1 = b[row];
2657f0d39aaaSBarry Smith sum2 = b[row + 1];
2658f0d39aaaSBarry Smith sum3 = b[row + 2];
2659f0d39aaaSBarry Smith sum4 = b[row + 3];
2660f0d39aaaSBarry Smith sum5 = b[row + 4];
2661f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) {
2662f0d39aaaSBarry Smith i1 = idx[0];
2663f0d39aaaSBarry Smith i2 = idx[1];
2664f0d39aaaSBarry Smith idx += 2;
2665f0d39aaaSBarry Smith tmp0 = x[i1];
2666f0d39aaaSBarry Smith tmp1 = x[i2];
26679371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
26689371c9d4SSatish Balay v1 += 2;
26699371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
26709371c9d4SSatish Balay v2 += 2;
26719371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
26729371c9d4SSatish Balay v3 += 2;
26739371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
26749371c9d4SSatish Balay v4 += 2;
26759371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
26769371c9d4SSatish Balay v5 += 2;
2677f0d39aaaSBarry Smith }
2678f0d39aaaSBarry Smith
2679f0d39aaaSBarry Smith if (n == sz - 1) {
2680f0d39aaaSBarry Smith tmp0 = x[*idx];
2681f0d39aaaSBarry Smith sum1 -= v1[0] * tmp0;
2682f0d39aaaSBarry Smith sum2 -= v2[0] * tmp0;
2683f0d39aaaSBarry Smith sum3 -= v3[0] * tmp0;
2684f0d39aaaSBarry Smith sum4 -= v4[0] * tmp0;
2685f0d39aaaSBarry Smith sum5 -= v5[0] * tmp0;
2686f0d39aaaSBarry Smith }
26875850ef23SBarry Smith t[row] = sum1;
26885850ef23SBarry Smith t[row + 1] = sum2;
26895850ef23SBarry Smith t[row + 2] = sum3;
26905850ef23SBarry Smith t[row + 3] = sum4;
26915850ef23SBarry Smith t[row + 4] = sum5;
2692f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
2693f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
2694f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
2695f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
2696f0d39aaaSBarry Smith x[row++] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
2697f0d39aaaSBarry Smith ibdiag += 25;
2698f0d39aaaSBarry Smith break;
2699d71ae5a4SJacob Faibussowitsch default:
27004d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
27018862d2efSBarry Smith }
27022af78befSBarry Smith }
27032af78befSBarry Smith
27045850ef23SBarry Smith xb = t;
27059566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz));
27062af78befSBarry Smith } else xb = b;
27072af78befSBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2708f0d39aaaSBarry Smith ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
2709d0f46423SBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
27104d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
27114d12350bSJunchao Zhang ibdiag -= nodesz * nodesz;
27128862d2efSBarry Smith sz = ii[row + 1] - diag[row] - 1;
27138862d2efSBarry Smith v1 = a->a + diag[row] + 1;
27148862d2efSBarry Smith idx = a->j + diag[row] + 1;
27152af78befSBarry Smith
27164108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
27174d12350bSJunchao Zhang switch (nodesz) {
27188862d2efSBarry Smith case 1:
27198862d2efSBarry Smith
27208862d2efSBarry Smith sum1 = xb[row];
27218862d2efSBarry Smith for (n = 0; n < sz - 1; n += 2) {
27228862d2efSBarry Smith i1 = idx[0];
27238862d2efSBarry Smith i2 = idx[1];
27248862d2efSBarry Smith idx += 2;
27258862d2efSBarry Smith tmp0 = x[i1];
27268862d2efSBarry Smith tmp1 = x[i2];
27279371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
27289371c9d4SSatish Balay v1 += 2;
27298862d2efSBarry Smith }
27308862d2efSBarry Smith
27318862d2efSBarry Smith if (n == sz - 1) {
2732f0d39aaaSBarry Smith tmp0 = x[*idx];
2733f0d39aaaSBarry Smith sum1 -= *v1 * tmp0;
27348862d2efSBarry Smith }
2735f0d39aaaSBarry Smith x[row--] = sum1 * (*ibdiag);
2736f0d39aaaSBarry Smith break;
2737f0d39aaaSBarry Smith
2738f0d39aaaSBarry Smith case 2:
2739f0d39aaaSBarry Smith
2740f0d39aaaSBarry Smith sum1 = xb[row];
2741f0d39aaaSBarry Smith sum2 = xb[row - 1];
2742f0d39aaaSBarry Smith /* note that sum1 is associated with the second of the two rows */
2743f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2;
2744f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) {
2745f0d39aaaSBarry Smith i1 = idx[0];
2746f0d39aaaSBarry Smith i2 = idx[1];
2747f0d39aaaSBarry Smith idx += 2;
2748f0d39aaaSBarry Smith tmp0 = x[i1];
2749f0d39aaaSBarry Smith tmp1 = x[i2];
27509371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
27519371c9d4SSatish Balay v1 += 2;
27529371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
27539371c9d4SSatish Balay v2 += 2;
2754f0d39aaaSBarry Smith }
2755f0d39aaaSBarry Smith
2756f0d39aaaSBarry Smith if (n == sz - 1) {
2757f0d39aaaSBarry Smith tmp0 = x[*idx];
2758f0d39aaaSBarry Smith sum1 -= *v1 * tmp0;
2759f0d39aaaSBarry Smith sum2 -= *v2 * tmp0;
2760f0d39aaaSBarry Smith }
2761f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
2762f0d39aaaSBarry Smith x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
2763f0d39aaaSBarry Smith break;
2764f0d39aaaSBarry Smith case 3:
2765f0d39aaaSBarry Smith
2766f0d39aaaSBarry Smith sum1 = xb[row];
2767f0d39aaaSBarry Smith sum2 = xb[row - 1];
2768f0d39aaaSBarry Smith sum3 = xb[row - 2];
2769f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2;
2770f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3;
2771f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) {
2772f0d39aaaSBarry Smith i1 = idx[0];
2773f0d39aaaSBarry Smith i2 = idx[1];
2774f0d39aaaSBarry Smith idx += 2;
2775f0d39aaaSBarry Smith tmp0 = x[i1];
2776f0d39aaaSBarry Smith tmp1 = x[i2];
27779371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
27789371c9d4SSatish Balay v1 += 2;
27799371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
27809371c9d4SSatish Balay v2 += 2;
27819371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
27829371c9d4SSatish Balay v3 += 2;
2783f0d39aaaSBarry Smith }
2784f0d39aaaSBarry Smith
2785f0d39aaaSBarry Smith if (n == sz - 1) {
2786f0d39aaaSBarry Smith tmp0 = x[*idx];
2787f0d39aaaSBarry Smith sum1 -= *v1 * tmp0;
2788f0d39aaaSBarry Smith sum2 -= *v2 * tmp0;
2789f0d39aaaSBarry Smith sum3 -= *v3 * tmp0;
2790f0d39aaaSBarry Smith }
2791f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
2792f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
2793f0d39aaaSBarry Smith x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
2794f0d39aaaSBarry Smith break;
2795f0d39aaaSBarry Smith case 4:
2796f0d39aaaSBarry Smith
2797f0d39aaaSBarry Smith sum1 = xb[row];
2798f0d39aaaSBarry Smith sum2 = xb[row - 1];
2799f0d39aaaSBarry Smith sum3 = xb[row - 2];
2800f0d39aaaSBarry Smith sum4 = xb[row - 3];
2801f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2;
2802f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3;
2803f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4;
2804f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) {
2805f0d39aaaSBarry Smith i1 = idx[0];
2806f0d39aaaSBarry Smith i2 = idx[1];
2807f0d39aaaSBarry Smith idx += 2;
2808f0d39aaaSBarry Smith tmp0 = x[i1];
2809f0d39aaaSBarry Smith tmp1 = x[i2];
28109371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
28119371c9d4SSatish Balay v1 += 2;
28129371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
28139371c9d4SSatish Balay v2 += 2;
28149371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
28159371c9d4SSatish Balay v3 += 2;
28169371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
28179371c9d4SSatish Balay v4 += 2;
2818f0d39aaaSBarry Smith }
2819f0d39aaaSBarry Smith
2820f0d39aaaSBarry Smith if (n == sz - 1) {
2821f0d39aaaSBarry Smith tmp0 = x[*idx];
2822f0d39aaaSBarry Smith sum1 -= *v1 * tmp0;
2823f0d39aaaSBarry Smith sum2 -= *v2 * tmp0;
2824f0d39aaaSBarry Smith sum3 -= *v3 * tmp0;
2825f0d39aaaSBarry Smith sum4 -= *v4 * tmp0;
2826f0d39aaaSBarry Smith }
2827f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
2828f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
2829f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
2830f0d39aaaSBarry Smith x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
2831f0d39aaaSBarry Smith break;
2832f0d39aaaSBarry Smith case 5:
2833f0d39aaaSBarry Smith
2834f0d39aaaSBarry Smith sum1 = xb[row];
2835f0d39aaaSBarry Smith sum2 = xb[row - 1];
2836f0d39aaaSBarry Smith sum3 = xb[row - 2];
2837f0d39aaaSBarry Smith sum4 = xb[row - 3];
2838f0d39aaaSBarry Smith sum5 = xb[row - 4];
2839f0d39aaaSBarry Smith v2 = a->a + diag[row - 1] + 2;
2840f0d39aaaSBarry Smith v3 = a->a + diag[row - 2] + 3;
2841f0d39aaaSBarry Smith v4 = a->a + diag[row - 3] + 4;
2842f0d39aaaSBarry Smith v5 = a->a + diag[row - 4] + 5;
2843f0d39aaaSBarry Smith for (n = 0; n < sz - 1; n += 2) {
2844f0d39aaaSBarry Smith i1 = idx[0];
2845f0d39aaaSBarry Smith i2 = idx[1];
2846f0d39aaaSBarry Smith idx += 2;
2847f0d39aaaSBarry Smith tmp0 = x[i1];
2848f0d39aaaSBarry Smith tmp1 = x[i2];
28499371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
28509371c9d4SSatish Balay v1 += 2;
28519371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
28529371c9d4SSatish Balay v2 += 2;
28539371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
28549371c9d4SSatish Balay v3 += 2;
28559371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
28569371c9d4SSatish Balay v4 += 2;
28579371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
28589371c9d4SSatish Balay v5 += 2;
2859f0d39aaaSBarry Smith }
2860f0d39aaaSBarry Smith
2861f0d39aaaSBarry Smith if (n == sz - 1) {
2862f0d39aaaSBarry Smith tmp0 = x[*idx];
2863f0d39aaaSBarry Smith sum1 -= *v1 * tmp0;
2864f0d39aaaSBarry Smith sum2 -= *v2 * tmp0;
2865f0d39aaaSBarry Smith sum3 -= *v3 * tmp0;
2866f0d39aaaSBarry Smith sum4 -= *v4 * tmp0;
2867f0d39aaaSBarry Smith sum5 -= *v5 * tmp0;
2868f0d39aaaSBarry Smith }
2869f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
2870f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
2871f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
2872f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
2873f0d39aaaSBarry Smith x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
28748862d2efSBarry Smith break;
2875d71ae5a4SJacob Faibussowitsch default:
28764d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
28778862d2efSBarry Smith }
28782af78befSBarry Smith }
28792af78befSBarry Smith
28809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz));
28812af78befSBarry Smith }
28822af78befSBarry Smith its--;
28835850ef23SBarry Smith }
28845850ef23SBarry Smith while (its--) {
28855850ef23SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
28864d12350bSJunchao Zhang for (i = 0, row = 0, ibdiag = a->inode.ibdiag; i < m; row += nodesz, ibdiag += nodesz * nodesz, i++) {
28874d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
2888d876e2b0SMark Adams sz = diag[row] - ii[row];
28895850ef23SBarry Smith v1 = a->a + ii[row];
28905850ef23SBarry Smith idx = a->j + ii[row];
28915850ef23SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
28924d12350bSJunchao Zhang switch (nodesz) {
28935850ef23SBarry Smith case 1:
28945850ef23SBarry Smith sum1 = b[row];
28955850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) {
28965850ef23SBarry Smith i1 = idx[0];
28975850ef23SBarry Smith i2 = idx[1];
28985850ef23SBarry Smith idx += 2;
28995850ef23SBarry Smith tmp0 = x[i1];
29005850ef23SBarry Smith tmp1 = x[i2];
29019371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29029371c9d4SSatish Balay v1 += 2;
29035850ef23SBarry Smith }
29045850ef23SBarry Smith if (n == sz - 1) {
2905d876e2b0SMark Adams tmp0 = x[*idx++];
2906d876e2b0SMark Adams sum1 -= *v1 * tmp0;
2907d876e2b0SMark Adams v1++;
2908d876e2b0SMark Adams }
2909d876e2b0SMark Adams t[row] = sum1;
2910d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1;
2911d876e2b0SMark Adams idx = a->j + diag[row] + 1;
2912d876e2b0SMark Adams v1 += 1;
2913d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) {
2914d876e2b0SMark Adams i1 = idx[0];
2915d876e2b0SMark Adams i2 = idx[1];
2916d876e2b0SMark Adams idx += 2;
2917d876e2b0SMark Adams tmp0 = x[i1];
2918d876e2b0SMark Adams tmp1 = x[i2];
29199371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29209371c9d4SSatish Balay v1 += 2;
2921d876e2b0SMark Adams }
2922d876e2b0SMark Adams if (n == sz - 1) {
2923d876e2b0SMark Adams tmp0 = x[*idx++];
29245850ef23SBarry Smith sum1 -= *v1 * tmp0;
29255850ef23SBarry Smith }
29265850ef23SBarry Smith /* in MatSOR_SeqAIJ this line would be
29275850ef23SBarry Smith *
29285850ef23SBarry Smith * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++);
29295850ef23SBarry Smith *
29305850ef23SBarry Smith * but omega == 1, so this becomes
29315850ef23SBarry Smith *
2932d876e2b0SMark Adams * x[row] = sum1*(*ibdiag++);
29335850ef23SBarry Smith *
29345850ef23SBarry Smith */
2935d876e2b0SMark Adams x[row] = sum1 * (*ibdiag);
29365850ef23SBarry Smith break;
29375850ef23SBarry Smith case 2:
29385850ef23SBarry Smith v2 = a->a + ii[row + 1];
29395850ef23SBarry Smith sum1 = b[row];
29405850ef23SBarry Smith sum2 = b[row + 1];
29415850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) {
29425850ef23SBarry Smith i1 = idx[0];
29435850ef23SBarry Smith i2 = idx[1];
29445850ef23SBarry Smith idx += 2;
29455850ef23SBarry Smith tmp0 = x[i1];
29465850ef23SBarry Smith tmp1 = x[i2];
29479371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29489371c9d4SSatish Balay v1 += 2;
29499371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29509371c9d4SSatish Balay v2 += 2;
29515850ef23SBarry Smith }
2952d876e2b0SMark Adams if (n == sz - 1) {
2953d876e2b0SMark Adams tmp0 = x[*idx++];
2954d876e2b0SMark Adams sum1 -= v1[0] * tmp0;
2955d876e2b0SMark Adams sum2 -= v2[0] * tmp0;
29569371c9d4SSatish Balay v1++;
29579371c9d4SSatish Balay v2++;
2958d876e2b0SMark Adams }
2959d876e2b0SMark Adams t[row] = sum1;
2960d876e2b0SMark Adams t[row + 1] = sum2;
2961d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 2;
2962d876e2b0SMark Adams idx = a->j + diag[row] + 2;
2963d876e2b0SMark Adams v1 += 2;
2964d876e2b0SMark Adams v2 += 2;
2965d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) {
2966d876e2b0SMark Adams i1 = idx[0];
2967d876e2b0SMark Adams i2 = idx[1];
2968d876e2b0SMark Adams idx += 2;
2969d876e2b0SMark Adams tmp0 = x[i1];
2970d876e2b0SMark Adams tmp1 = x[i2];
29719371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29729371c9d4SSatish Balay v1 += 2;
29739371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29749371c9d4SSatish Balay v2 += 2;
2975d876e2b0SMark Adams }
29765850ef23SBarry Smith if (n == sz - 1) {
29775850ef23SBarry Smith tmp0 = x[*idx];
29785850ef23SBarry Smith sum1 -= v1[0] * tmp0;
29795850ef23SBarry Smith sum2 -= v2[0] * tmp0;
29805850ef23SBarry Smith }
2981d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
2982d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
29835850ef23SBarry Smith break;
29845850ef23SBarry Smith case 3:
29855850ef23SBarry Smith v2 = a->a + ii[row + 1];
29865850ef23SBarry Smith v3 = a->a + ii[row + 2];
29875850ef23SBarry Smith sum1 = b[row];
29885850ef23SBarry Smith sum2 = b[row + 1];
29895850ef23SBarry Smith sum3 = b[row + 2];
29905850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) {
29915850ef23SBarry Smith i1 = idx[0];
29925850ef23SBarry Smith i2 = idx[1];
29935850ef23SBarry Smith idx += 2;
29945850ef23SBarry Smith tmp0 = x[i1];
29955850ef23SBarry Smith tmp1 = x[i2];
29969371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
29979371c9d4SSatish Balay v1 += 2;
29989371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
29999371c9d4SSatish Balay v2 += 2;
30009371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30019371c9d4SSatish Balay v3 += 2;
30025850ef23SBarry Smith }
3003d876e2b0SMark Adams if (n == sz - 1) {
3004d876e2b0SMark Adams tmp0 = x[*idx++];
3005d876e2b0SMark Adams sum1 -= v1[0] * tmp0;
3006d876e2b0SMark Adams sum2 -= v2[0] * tmp0;
3007d876e2b0SMark Adams sum3 -= v3[0] * tmp0;
30089371c9d4SSatish Balay v1++;
30099371c9d4SSatish Balay v2++;
30109371c9d4SSatish Balay v3++;
3011d876e2b0SMark Adams }
3012d876e2b0SMark Adams t[row] = sum1;
3013d876e2b0SMark Adams t[row + 1] = sum2;
3014d876e2b0SMark Adams t[row + 2] = sum3;
3015d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 3;
3016d876e2b0SMark Adams idx = a->j + diag[row] + 3;
3017d876e2b0SMark Adams v1 += 3;
3018d876e2b0SMark Adams v2 += 3;
3019d876e2b0SMark Adams v3 += 3;
3020d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) {
3021d876e2b0SMark Adams i1 = idx[0];
3022d876e2b0SMark Adams i2 = idx[1];
3023d876e2b0SMark Adams idx += 2;
3024d876e2b0SMark Adams tmp0 = x[i1];
3025d876e2b0SMark Adams tmp1 = x[i2];
30269371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30279371c9d4SSatish Balay v1 += 2;
30289371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30299371c9d4SSatish Balay v2 += 2;
30309371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30319371c9d4SSatish Balay v3 += 2;
3032d876e2b0SMark Adams }
30335850ef23SBarry Smith if (n == sz - 1) {
30345850ef23SBarry Smith tmp0 = x[*idx];
30355850ef23SBarry Smith sum1 -= v1[0] * tmp0;
30365850ef23SBarry Smith sum2 -= v2[0] * tmp0;
30375850ef23SBarry Smith sum3 -= v3[0] * tmp0;
30385850ef23SBarry Smith }
3039d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
3040d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
3041d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
30425850ef23SBarry Smith break;
30435850ef23SBarry Smith case 4:
30445850ef23SBarry Smith v2 = a->a + ii[row + 1];
30455850ef23SBarry Smith v3 = a->a + ii[row + 2];
30465850ef23SBarry Smith v4 = a->a + ii[row + 3];
30475850ef23SBarry Smith sum1 = b[row];
30485850ef23SBarry Smith sum2 = b[row + 1];
30495850ef23SBarry Smith sum3 = b[row + 2];
30505850ef23SBarry Smith sum4 = b[row + 3];
30515850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) {
30525850ef23SBarry Smith i1 = idx[0];
30535850ef23SBarry Smith i2 = idx[1];
30545850ef23SBarry Smith idx += 2;
30555850ef23SBarry Smith tmp0 = x[i1];
30565850ef23SBarry Smith tmp1 = x[i2];
30579371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30589371c9d4SSatish Balay v1 += 2;
30599371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30609371c9d4SSatish Balay v2 += 2;
30619371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30629371c9d4SSatish Balay v3 += 2;
30639371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
30649371c9d4SSatish Balay v4 += 2;
30655850ef23SBarry Smith }
3066d876e2b0SMark Adams if (n == sz - 1) {
3067d876e2b0SMark Adams tmp0 = x[*idx++];
3068d876e2b0SMark Adams sum1 -= v1[0] * tmp0;
3069d876e2b0SMark Adams sum2 -= v2[0] * tmp0;
3070d876e2b0SMark Adams sum3 -= v3[0] * tmp0;
3071d876e2b0SMark Adams sum4 -= v4[0] * tmp0;
30729371c9d4SSatish Balay v1++;
30739371c9d4SSatish Balay v2++;
30749371c9d4SSatish Balay v3++;
30759371c9d4SSatish Balay v4++;
3076d876e2b0SMark Adams }
3077d876e2b0SMark Adams t[row] = sum1;
3078d876e2b0SMark Adams t[row + 1] = sum2;
3079d876e2b0SMark Adams t[row + 2] = sum3;
3080d876e2b0SMark Adams t[row + 3] = sum4;
3081d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 4;
3082d876e2b0SMark Adams idx = a->j + diag[row] + 4;
3083d876e2b0SMark Adams v1 += 4;
3084d876e2b0SMark Adams v2 += 4;
3085d876e2b0SMark Adams v3 += 4;
3086d876e2b0SMark Adams v4 += 4;
3087d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) {
3088d876e2b0SMark Adams i1 = idx[0];
3089d876e2b0SMark Adams i2 = idx[1];
3090d876e2b0SMark Adams idx += 2;
3091d876e2b0SMark Adams tmp0 = x[i1];
3092d876e2b0SMark Adams tmp1 = x[i2];
30939371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
30949371c9d4SSatish Balay v1 += 2;
30959371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
30969371c9d4SSatish Balay v2 += 2;
30979371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
30989371c9d4SSatish Balay v3 += 2;
30999371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
31009371c9d4SSatish Balay v4 += 2;
3101d876e2b0SMark Adams }
31025850ef23SBarry Smith if (n == sz - 1) {
31035850ef23SBarry Smith tmp0 = x[*idx];
31045850ef23SBarry Smith sum1 -= v1[0] * tmp0;
31055850ef23SBarry Smith sum2 -= v2[0] * tmp0;
31065850ef23SBarry Smith sum3 -= v3[0] * tmp0;
31075850ef23SBarry Smith sum4 -= v4[0] * tmp0;
31085850ef23SBarry Smith }
3109d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
3110d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
3111d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
3112d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
31135850ef23SBarry Smith break;
31145850ef23SBarry Smith case 5:
31155850ef23SBarry Smith v2 = a->a + ii[row + 1];
31165850ef23SBarry Smith v3 = a->a + ii[row + 2];
31175850ef23SBarry Smith v4 = a->a + ii[row + 3];
31185850ef23SBarry Smith v5 = a->a + ii[row + 4];
31195850ef23SBarry Smith sum1 = b[row];
31205850ef23SBarry Smith sum2 = b[row + 1];
31215850ef23SBarry Smith sum3 = b[row + 2];
31225850ef23SBarry Smith sum4 = b[row + 3];
31235850ef23SBarry Smith sum5 = b[row + 4];
31245850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) {
31255850ef23SBarry Smith i1 = idx[0];
31265850ef23SBarry Smith i2 = idx[1];
31275850ef23SBarry Smith idx += 2;
31285850ef23SBarry Smith tmp0 = x[i1];
31295850ef23SBarry Smith tmp1 = x[i2];
31309371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31319371c9d4SSatish Balay v1 += 2;
31329371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31339371c9d4SSatish Balay v2 += 2;
31349371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
31359371c9d4SSatish Balay v3 += 2;
31369371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
31379371c9d4SSatish Balay v4 += 2;
31389371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
31399371c9d4SSatish Balay v5 += 2;
31405850ef23SBarry Smith }
31415850ef23SBarry Smith if (n == sz - 1) {
3142d876e2b0SMark Adams tmp0 = x[*idx++];
31435850ef23SBarry Smith sum1 -= v1[0] * tmp0;
31445850ef23SBarry Smith sum2 -= v2[0] * tmp0;
31455850ef23SBarry Smith sum3 -= v3[0] * tmp0;
31465850ef23SBarry Smith sum4 -= v4[0] * tmp0;
31475850ef23SBarry Smith sum5 -= v5[0] * tmp0;
31489371c9d4SSatish Balay v1++;
31499371c9d4SSatish Balay v2++;
31509371c9d4SSatish Balay v3++;
31519371c9d4SSatish Balay v4++;
31529371c9d4SSatish Balay v5++;
31535850ef23SBarry Smith }
3154d876e2b0SMark Adams t[row] = sum1;
3155d876e2b0SMark Adams t[row + 1] = sum2;
3156d876e2b0SMark Adams t[row + 2] = sum3;
3157d876e2b0SMark Adams t[row + 3] = sum4;
3158d876e2b0SMark Adams t[row + 4] = sum5;
3159d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 5;
3160d876e2b0SMark Adams idx = a->j + diag[row] + 5;
3161d876e2b0SMark Adams v1 += 5;
3162d876e2b0SMark Adams v2 += 5;
3163d876e2b0SMark Adams v3 += 5;
3164d876e2b0SMark Adams v4 += 5;
3165d876e2b0SMark Adams v5 += 5;
31665850ef23SBarry Smith for (n = 0; n < sz - 1; n += 2) {
31675850ef23SBarry Smith i1 = idx[0];
31685850ef23SBarry Smith i2 = idx[1];
31695850ef23SBarry Smith idx += 2;
31705850ef23SBarry Smith tmp0 = x[i1];
31715850ef23SBarry Smith tmp1 = x[i2];
31729371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
31739371c9d4SSatish Balay v1 += 2;
31749371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
31759371c9d4SSatish Balay v2 += 2;
31769371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
31779371c9d4SSatish Balay v3 += 2;
31789371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
31799371c9d4SSatish Balay v4 += 2;
31809371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
31819371c9d4SSatish Balay v5 += 2;
31825850ef23SBarry Smith }
31835850ef23SBarry Smith if (n == sz - 1) {
31845850ef23SBarry Smith tmp0 = x[*idx];
3185d876e2b0SMark Adams sum1 -= v1[0] * tmp0;
3186d876e2b0SMark Adams sum2 -= v2[0] * tmp0;
3187d876e2b0SMark Adams sum3 -= v3[0] * tmp0;
3188d876e2b0SMark Adams sum4 -= v4[0] * tmp0;
3189d876e2b0SMark Adams sum5 -= v5[0] * tmp0;
31905850ef23SBarry Smith }
3191d876e2b0SMark Adams x[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
3192d876e2b0SMark Adams x[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
3193d876e2b0SMark Adams x[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
3194d876e2b0SMark Adams x[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
3195d876e2b0SMark Adams x[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
3196d876e2b0SMark Adams break;
3197d71ae5a4SJacob Faibussowitsch default:
31984d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3199d876e2b0SMark Adams }
3200d876e2b0SMark Adams }
3201d876e2b0SMark Adams xb = t;
32029566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz)); /* undercounts diag inverse */
3203d876e2b0SMark Adams } else xb = b;
3204d876e2b0SMark Adams
3205d876e2b0SMark Adams if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
3206d876e2b0SMark Adams ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
3207d876e2b0SMark Adams for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
32084d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
32094d12350bSJunchao Zhang ibdiag -= nodesz * nodesz;
3210d876e2b0SMark Adams
3211d876e2b0SMark Adams /* set RHS */
3212d876e2b0SMark Adams if (xb == b) {
3213d876e2b0SMark Adams /* whole (old way) */
3214d876e2b0SMark Adams sz = ii[row + 1] - ii[row];
3215d876e2b0SMark Adams idx = a->j + ii[row];
32164d12350bSJunchao Zhang switch (nodesz) {
3217d71ae5a4SJacob Faibussowitsch case 5:
3218d71ae5a4SJacob Faibussowitsch v5 = a->a + ii[row - 4]; /* fall through */
3219d71ae5a4SJacob Faibussowitsch case 4:
3220d71ae5a4SJacob Faibussowitsch v4 = a->a + ii[row - 3]; /* fall through */
3221d71ae5a4SJacob Faibussowitsch case 3:
3222d71ae5a4SJacob Faibussowitsch v3 = a->a + ii[row - 2]; /* fall through */
3223d71ae5a4SJacob Faibussowitsch case 2:
3224d71ae5a4SJacob Faibussowitsch v2 = a->a + ii[row - 1]; /* fall through */
3225d71ae5a4SJacob Faibussowitsch case 1:
3226d71ae5a4SJacob Faibussowitsch v1 = a->a + ii[row];
3227d71ae5a4SJacob Faibussowitsch break;
3228d71ae5a4SJacob Faibussowitsch default:
32294d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
3230d876e2b0SMark Adams }
3231d876e2b0SMark Adams } else {
3232d876e2b0SMark Adams /* upper, no diag */
3233d876e2b0SMark Adams sz = ii[row + 1] - diag[row] - 1;
3234d876e2b0SMark Adams idx = a->j + diag[row] + 1;
32354d12350bSJunchao Zhang switch (nodesz) {
3236d71ae5a4SJacob Faibussowitsch case 5:
3237d71ae5a4SJacob Faibussowitsch v5 = a->a + diag[row - 4] + 5; /* fall through */
3238d71ae5a4SJacob Faibussowitsch case 4:
3239d71ae5a4SJacob Faibussowitsch v4 = a->a + diag[row - 3] + 4; /* fall through */
3240d71ae5a4SJacob Faibussowitsch case 3:
3241d71ae5a4SJacob Faibussowitsch v3 = a->a + diag[row - 2] + 3; /* fall through */
3242d71ae5a4SJacob Faibussowitsch case 2:
3243d71ae5a4SJacob Faibussowitsch v2 = a->a + diag[row - 1] + 2; /* fall through */
3244d71ae5a4SJacob Faibussowitsch case 1:
3245d71ae5a4SJacob Faibussowitsch v1 = a->a + diag[row] + 1;
3246d876e2b0SMark Adams }
3247d876e2b0SMark Adams }
3248d876e2b0SMark Adams /* set sum */
32494d12350bSJunchao Zhang switch (nodesz) {
3250d71ae5a4SJacob Faibussowitsch case 5:
3251d71ae5a4SJacob Faibussowitsch sum5 = xb[row - 4]; /* fall through */
3252d71ae5a4SJacob Faibussowitsch case 4:
3253d71ae5a4SJacob Faibussowitsch sum4 = xb[row - 3]; /* fall through */
3254d71ae5a4SJacob Faibussowitsch case 3:
3255d71ae5a4SJacob Faibussowitsch sum3 = xb[row - 2]; /* fall through */
3256d71ae5a4SJacob Faibussowitsch case 2:
3257d71ae5a4SJacob Faibussowitsch sum2 = xb[row - 1]; /* fall through */
3258d876e2b0SMark Adams case 1:
3259d876e2b0SMark Adams /* note that sum1 is associated with the last row */
3260d876e2b0SMark Adams sum1 = xb[row];
3261d876e2b0SMark Adams }
3262d876e2b0SMark Adams /* do sums */
3263d876e2b0SMark Adams for (n = 0; n < sz - 1; n += 2) {
3264d876e2b0SMark Adams i1 = idx[0];
3265d876e2b0SMark Adams i2 = idx[1];
3266d876e2b0SMark Adams idx += 2;
3267d876e2b0SMark Adams tmp0 = x[i1];
3268d876e2b0SMark Adams tmp1 = x[i2];
32694d12350bSJunchao Zhang switch (nodesz) {
3270d71ae5a4SJacob Faibussowitsch case 5:
3271d71ae5a4SJacob Faibussowitsch sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
3272d71ae5a4SJacob Faibussowitsch v5 += 2; /* fall through */
3273d71ae5a4SJacob Faibussowitsch case 4:
3274d71ae5a4SJacob Faibussowitsch sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
3275d71ae5a4SJacob Faibussowitsch v4 += 2; /* fall through */
3276d71ae5a4SJacob Faibussowitsch case 3:
3277d71ae5a4SJacob Faibussowitsch sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
3278d71ae5a4SJacob Faibussowitsch v3 += 2; /* fall through */
3279d71ae5a4SJacob Faibussowitsch case 2:
3280d71ae5a4SJacob Faibussowitsch sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
3281d71ae5a4SJacob Faibussowitsch v2 += 2; /* fall through */
3282d71ae5a4SJacob Faibussowitsch case 1:
3283d71ae5a4SJacob Faibussowitsch sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
3284d71ae5a4SJacob Faibussowitsch v1 += 2;
3285d876e2b0SMark Adams }
3286d876e2b0SMark Adams }
3287d876e2b0SMark Adams /* ragged edge */
3288d876e2b0SMark Adams if (n == sz - 1) {
3289d876e2b0SMark Adams tmp0 = x[*idx];
32904d12350bSJunchao Zhang switch (nodesz) {
3291d71ae5a4SJacob Faibussowitsch case 5:
3292d71ae5a4SJacob Faibussowitsch sum5 -= *v5 * tmp0; /* fall through */
3293d71ae5a4SJacob Faibussowitsch case 4:
3294d71ae5a4SJacob Faibussowitsch sum4 -= *v4 * tmp0; /* fall through */
3295d71ae5a4SJacob Faibussowitsch case 3:
3296d71ae5a4SJacob Faibussowitsch sum3 -= *v3 * tmp0; /* fall through */
3297d71ae5a4SJacob Faibussowitsch case 2:
3298d71ae5a4SJacob Faibussowitsch sum2 -= *v2 * tmp0; /* fall through */
3299d71ae5a4SJacob Faibussowitsch case 1:
3300d71ae5a4SJacob Faibussowitsch sum1 -= *v1 * tmp0;
3301d876e2b0SMark Adams }
3302d876e2b0SMark Adams }
3303d876e2b0SMark Adams /* update */
3304d876e2b0SMark Adams if (xb == b) {
3305d876e2b0SMark Adams /* whole (old way) w/ diag */
33064d12350bSJunchao Zhang switch (nodesz) {
3307d876e2b0SMark Adams case 5:
33085850ef23SBarry Smith x[row--] += sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
33095850ef23SBarry Smith x[row--] += sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
33105850ef23SBarry Smith x[row--] += sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
33115850ef23SBarry Smith x[row--] += sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
33125850ef23SBarry Smith x[row--] += sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
33135850ef23SBarry Smith break;
3314d876e2b0SMark Adams case 4:
3315d876e2b0SMark Adams x[row--] += sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3316d876e2b0SMark Adams x[row--] += sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3317d876e2b0SMark Adams x[row--] += sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3318d876e2b0SMark Adams x[row--] += sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3319d876e2b0SMark Adams break;
3320d876e2b0SMark Adams case 3:
3321d876e2b0SMark Adams x[row--] += sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3322d876e2b0SMark Adams x[row--] += sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3323d876e2b0SMark Adams x[row--] += sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3324d876e2b0SMark Adams break;
3325d876e2b0SMark Adams case 2:
3326d876e2b0SMark Adams x[row--] += sum2 * ibdiag[1] + sum1 * ibdiag[3];
3327d876e2b0SMark Adams x[row--] += sum2 * ibdiag[0] + sum1 * ibdiag[2];
3328d876e2b0SMark Adams break;
3329d71ae5a4SJacob Faibussowitsch case 1:
3330d71ae5a4SJacob Faibussowitsch x[row--] += sum1 * (*ibdiag);
3331d71ae5a4SJacob Faibussowitsch break;
3332d876e2b0SMark Adams }
3333d876e2b0SMark Adams } else {
3334d876e2b0SMark Adams /* no diag so set = */
33354d12350bSJunchao Zhang switch (nodesz) {
3336d876e2b0SMark Adams case 5:
3337d876e2b0SMark Adams x[row--] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3338d876e2b0SMark Adams x[row--] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3339d876e2b0SMark Adams x[row--] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3340d876e2b0SMark Adams x[row--] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3341d876e2b0SMark Adams x[row--] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3342d876e2b0SMark Adams break;
3343d876e2b0SMark Adams case 4:
3344d876e2b0SMark Adams x[row--] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3345d876e2b0SMark Adams x[row--] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3346d876e2b0SMark Adams x[row--] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3347d876e2b0SMark Adams x[row--] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3348d876e2b0SMark Adams break;
3349d876e2b0SMark Adams case 3:
3350d876e2b0SMark Adams x[row--] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3351d876e2b0SMark Adams x[row--] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3352d876e2b0SMark Adams x[row--] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3353d876e2b0SMark Adams break;
3354d876e2b0SMark Adams case 2:
3355d876e2b0SMark Adams x[row--] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3356d876e2b0SMark Adams x[row--] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3357d876e2b0SMark Adams break;
3358d71ae5a4SJacob Faibussowitsch case 1:
3359d71ae5a4SJacob Faibussowitsch x[row--] = sum1 * (*ibdiag);
3360d71ae5a4SJacob Faibussowitsch break;
33615850ef23SBarry Smith }
33625850ef23SBarry Smith }
3363d876e2b0SMark Adams }
3364d876e2b0SMark Adams if (xb == b) {
33659566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz));
3366d876e2b0SMark Adams } else {
33679566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */
3368d876e2b0SMark Adams }
33695850ef23SBarry Smith }
33702af78befSBarry Smith }
337189c6957cSBarry Smith if (flag & SOR_EISENSTAT) {
337289c6957cSBarry Smith /*
337389c6957cSBarry Smith Apply (U + D)^-1 where D is now the block diagonal
337489c6957cSBarry Smith */
337589c6957cSBarry Smith ibdiag = a->inode.ibdiag + a->inode.bdiagsize;
337689c6957cSBarry Smith for (i = m - 1, row = A->rmap->n - 1; i >= 0; i--) {
33774d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
33784d12350bSJunchao Zhang ibdiag -= nodesz * nodesz;
337989c6957cSBarry Smith sz = ii[row + 1] - diag[row] - 1;
338089c6957cSBarry Smith v1 = a->a + diag[row] + 1;
338189c6957cSBarry Smith idx = a->j + diag[row] + 1;
33824108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
33834d12350bSJunchao Zhang switch (nodesz) {
338489c6957cSBarry Smith case 1:
338589c6957cSBarry Smith
338689c6957cSBarry Smith sum1 = b[row];
338789c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
338889c6957cSBarry Smith i1 = idx[0];
338989c6957cSBarry Smith i2 = idx[1];
339089c6957cSBarry Smith idx += 2;
339189c6957cSBarry Smith tmp0 = x[i1];
339289c6957cSBarry Smith tmp1 = x[i2];
33939371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
33949371c9d4SSatish Balay v1 += 2;
339589c6957cSBarry Smith }
339689c6957cSBarry Smith
339789c6957cSBarry Smith if (n == sz - 1) {
339889c6957cSBarry Smith tmp0 = x[*idx];
339989c6957cSBarry Smith sum1 -= *v1 * tmp0;
340089c6957cSBarry Smith }
34019371c9d4SSatish Balay x[row] = sum1 * (*ibdiag);
34029371c9d4SSatish Balay row--;
340389c6957cSBarry Smith break;
340489c6957cSBarry Smith
340589c6957cSBarry Smith case 2:
340689c6957cSBarry Smith
340789c6957cSBarry Smith sum1 = b[row];
340889c6957cSBarry Smith sum2 = b[row - 1];
340989c6957cSBarry Smith /* note that sum1 is associated with the second of the two rows */
341089c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2;
341189c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
341289c6957cSBarry Smith i1 = idx[0];
341389c6957cSBarry Smith i2 = idx[1];
341489c6957cSBarry Smith idx += 2;
341589c6957cSBarry Smith tmp0 = x[i1];
341689c6957cSBarry Smith tmp1 = x[i2];
34179371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34189371c9d4SSatish Balay v1 += 2;
34199371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34209371c9d4SSatish Balay v2 += 2;
342189c6957cSBarry Smith }
342289c6957cSBarry Smith
342389c6957cSBarry Smith if (n == sz - 1) {
342489c6957cSBarry Smith tmp0 = x[*idx];
342589c6957cSBarry Smith sum1 -= *v1 * tmp0;
342689c6957cSBarry Smith sum2 -= *v2 * tmp0;
342789c6957cSBarry Smith }
3428938d4eb3SBarry Smith x[row] = sum2 * ibdiag[1] + sum1 * ibdiag[3];
3429938d4eb3SBarry Smith x[row - 1] = sum2 * ibdiag[0] + sum1 * ibdiag[2];
3430938d4eb3SBarry Smith row -= 2;
343189c6957cSBarry Smith break;
343289c6957cSBarry Smith case 3:
343389c6957cSBarry Smith
343489c6957cSBarry Smith sum1 = b[row];
343589c6957cSBarry Smith sum2 = b[row - 1];
343689c6957cSBarry Smith sum3 = b[row - 2];
343789c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2;
343889c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3;
343989c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
344089c6957cSBarry Smith i1 = idx[0];
344189c6957cSBarry Smith i2 = idx[1];
344289c6957cSBarry Smith idx += 2;
344389c6957cSBarry Smith tmp0 = x[i1];
344489c6957cSBarry Smith tmp1 = x[i2];
34459371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34469371c9d4SSatish Balay v1 += 2;
34479371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34489371c9d4SSatish Balay v2 += 2;
34499371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34509371c9d4SSatish Balay v3 += 2;
345189c6957cSBarry Smith }
345289c6957cSBarry Smith
345389c6957cSBarry Smith if (n == sz - 1) {
345489c6957cSBarry Smith tmp0 = x[*idx];
345589c6957cSBarry Smith sum1 -= *v1 * tmp0;
345689c6957cSBarry Smith sum2 -= *v2 * tmp0;
345789c6957cSBarry Smith sum3 -= *v3 * tmp0;
345889c6957cSBarry Smith }
3459938d4eb3SBarry Smith x[row] = sum3 * ibdiag[2] + sum2 * ibdiag[5] + sum1 * ibdiag[8];
3460938d4eb3SBarry Smith x[row - 1] = sum3 * ibdiag[1] + sum2 * ibdiag[4] + sum1 * ibdiag[7];
3461938d4eb3SBarry Smith x[row - 2] = sum3 * ibdiag[0] + sum2 * ibdiag[3] + sum1 * ibdiag[6];
3462938d4eb3SBarry Smith row -= 3;
346389c6957cSBarry Smith break;
346489c6957cSBarry Smith case 4:
346589c6957cSBarry Smith
346689c6957cSBarry Smith sum1 = b[row];
346789c6957cSBarry Smith sum2 = b[row - 1];
346889c6957cSBarry Smith sum3 = b[row - 2];
346989c6957cSBarry Smith sum4 = b[row - 3];
347089c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2;
347189c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3;
347289c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4;
347389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
347489c6957cSBarry Smith i1 = idx[0];
347589c6957cSBarry Smith i2 = idx[1];
347689c6957cSBarry Smith idx += 2;
347789c6957cSBarry Smith tmp0 = x[i1];
347889c6957cSBarry Smith tmp1 = x[i2];
34799371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
34809371c9d4SSatish Balay v1 += 2;
34819371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
34829371c9d4SSatish Balay v2 += 2;
34839371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
34849371c9d4SSatish Balay v3 += 2;
34859371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
34869371c9d4SSatish Balay v4 += 2;
348789c6957cSBarry Smith }
348889c6957cSBarry Smith
348989c6957cSBarry Smith if (n == sz - 1) {
349089c6957cSBarry Smith tmp0 = x[*idx];
349189c6957cSBarry Smith sum1 -= *v1 * tmp0;
349289c6957cSBarry Smith sum2 -= *v2 * tmp0;
349389c6957cSBarry Smith sum3 -= *v3 * tmp0;
349489c6957cSBarry Smith sum4 -= *v4 * tmp0;
349589c6957cSBarry Smith }
3496938d4eb3SBarry Smith x[row] = sum4 * ibdiag[3] + sum3 * ibdiag[7] + sum2 * ibdiag[11] + sum1 * ibdiag[15];
3497938d4eb3SBarry Smith x[row - 1] = sum4 * ibdiag[2] + sum3 * ibdiag[6] + sum2 * ibdiag[10] + sum1 * ibdiag[14];
3498938d4eb3SBarry Smith x[row - 2] = sum4 * ibdiag[1] + sum3 * ibdiag[5] + sum2 * ibdiag[9] + sum1 * ibdiag[13];
3499938d4eb3SBarry Smith x[row - 3] = sum4 * ibdiag[0] + sum3 * ibdiag[4] + sum2 * ibdiag[8] + sum1 * ibdiag[12];
3500938d4eb3SBarry Smith row -= 4;
350189c6957cSBarry Smith break;
350289c6957cSBarry Smith case 5:
350389c6957cSBarry Smith
350489c6957cSBarry Smith sum1 = b[row];
350589c6957cSBarry Smith sum2 = b[row - 1];
350689c6957cSBarry Smith sum3 = b[row - 2];
350789c6957cSBarry Smith sum4 = b[row - 3];
350889c6957cSBarry Smith sum5 = b[row - 4];
350989c6957cSBarry Smith v2 = a->a + diag[row - 1] + 2;
351089c6957cSBarry Smith v3 = a->a + diag[row - 2] + 3;
351189c6957cSBarry Smith v4 = a->a + diag[row - 3] + 4;
351289c6957cSBarry Smith v5 = a->a + diag[row - 4] + 5;
351389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
351489c6957cSBarry Smith i1 = idx[0];
351589c6957cSBarry Smith i2 = idx[1];
351689c6957cSBarry Smith idx += 2;
351789c6957cSBarry Smith tmp0 = x[i1];
351889c6957cSBarry Smith tmp1 = x[i2];
35199371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
35209371c9d4SSatish Balay v1 += 2;
35219371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
35229371c9d4SSatish Balay v2 += 2;
35239371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
35249371c9d4SSatish Balay v3 += 2;
35259371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
35269371c9d4SSatish Balay v4 += 2;
35279371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
35289371c9d4SSatish Balay v5 += 2;
352989c6957cSBarry Smith }
353089c6957cSBarry Smith
353189c6957cSBarry Smith if (n == sz - 1) {
353289c6957cSBarry Smith tmp0 = x[*idx];
353389c6957cSBarry Smith sum1 -= *v1 * tmp0;
353489c6957cSBarry Smith sum2 -= *v2 * tmp0;
353589c6957cSBarry Smith sum3 -= *v3 * tmp0;
353689c6957cSBarry Smith sum4 -= *v4 * tmp0;
353789c6957cSBarry Smith sum5 -= *v5 * tmp0;
353889c6957cSBarry Smith }
3539938d4eb3SBarry Smith x[row] = sum5 * ibdiag[4] + sum4 * ibdiag[9] + sum3 * ibdiag[14] + sum2 * ibdiag[19] + sum1 * ibdiag[24];
3540938d4eb3SBarry Smith x[row - 1] = sum5 * ibdiag[3] + sum4 * ibdiag[8] + sum3 * ibdiag[13] + sum2 * ibdiag[18] + sum1 * ibdiag[23];
3541938d4eb3SBarry Smith x[row - 2] = sum5 * ibdiag[2] + sum4 * ibdiag[7] + sum3 * ibdiag[12] + sum2 * ibdiag[17] + sum1 * ibdiag[22];
3542938d4eb3SBarry Smith x[row - 3] = sum5 * ibdiag[1] + sum4 * ibdiag[6] + sum3 * ibdiag[11] + sum2 * ibdiag[16] + sum1 * ibdiag[21];
3543938d4eb3SBarry Smith x[row - 4] = sum5 * ibdiag[0] + sum4 * ibdiag[5] + sum3 * ibdiag[10] + sum2 * ibdiag[15] + sum1 * ibdiag[20];
3544938d4eb3SBarry Smith row -= 5;
354589c6957cSBarry Smith break;
3546d71ae5a4SJacob Faibussowitsch default:
35474d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
354889c6957cSBarry Smith }
354989c6957cSBarry Smith }
35509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz));
355189c6957cSBarry Smith
355289c6957cSBarry Smith /*
355389c6957cSBarry Smith t = b - D x where D is the block diagonal
355489c6957cSBarry Smith */
355589c6957cSBarry Smith cnt = 0;
355689c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) {
35574d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
35584d12350bSJunchao Zhang switch (nodesz) {
355989c6957cSBarry Smith case 1:
35609371c9d4SSatish Balay t[row] = b[row] - bdiag[cnt++] * x[row];
35619371c9d4SSatish Balay row++;
356289c6957cSBarry Smith break;
356389c6957cSBarry Smith case 2:
35649371c9d4SSatish Balay x1 = x[row];
35659371c9d4SSatish Balay x2 = x[row + 1];
356689c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
356789c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
356889c6957cSBarry Smith t[row] = b[row] - tmp1;
35699371c9d4SSatish Balay t[row + 1] = b[row + 1] - tmp2;
35709371c9d4SSatish Balay row += 2;
357189c6957cSBarry Smith cnt += 4;
357289c6957cSBarry Smith break;
357389c6957cSBarry Smith case 3:
35749371c9d4SSatish Balay x1 = x[row];
35759371c9d4SSatish Balay x2 = x[row + 1];
35769371c9d4SSatish Balay x3 = x[row + 2];
357789c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
357889c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
357989c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
358089c6957cSBarry Smith t[row] = b[row] - tmp1;
358189c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2;
35829371c9d4SSatish Balay t[row + 2] = b[row + 2] - tmp3;
35839371c9d4SSatish Balay row += 3;
358489c6957cSBarry Smith cnt += 9;
358589c6957cSBarry Smith break;
358689c6957cSBarry Smith case 4:
35879371c9d4SSatish Balay x1 = x[row];
35889371c9d4SSatish Balay x2 = x[row + 1];
35899371c9d4SSatish Balay x3 = x[row + 2];
35909371c9d4SSatish Balay x4 = x[row + 3];
359189c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
359289c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
359389c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
359489c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
359589c6957cSBarry Smith t[row] = b[row] - tmp1;
359689c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2;
359789c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3;
35989371c9d4SSatish Balay t[row + 3] = b[row + 3] - tmp4;
35999371c9d4SSatish Balay row += 4;
360089c6957cSBarry Smith cnt += 16;
360189c6957cSBarry Smith break;
360289c6957cSBarry Smith case 5:
36039371c9d4SSatish Balay x1 = x[row];
36049371c9d4SSatish Balay x2 = x[row + 1];
36059371c9d4SSatish Balay x3 = x[row + 2];
36069371c9d4SSatish Balay x4 = x[row + 3];
36079371c9d4SSatish Balay x5 = x[row + 4];
360889c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
360989c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
361089c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
361189c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
361289c6957cSBarry Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
361389c6957cSBarry Smith t[row] = b[row] - tmp1;
361489c6957cSBarry Smith t[row + 1] = b[row + 1] - tmp2;
361589c6957cSBarry Smith t[row + 2] = b[row + 2] - tmp3;
361689c6957cSBarry Smith t[row + 3] = b[row + 3] - tmp4;
36179371c9d4SSatish Balay t[row + 4] = b[row + 4] - tmp5;
36189371c9d4SSatish Balay row += 5;
361989c6957cSBarry Smith cnt += 25;
362089c6957cSBarry Smith break;
3621d71ae5a4SJacob Faibussowitsch default:
36224d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
362389c6957cSBarry Smith }
362489c6957cSBarry Smith }
36259566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(m));
362689c6957cSBarry Smith
362789c6957cSBarry Smith /*
362889c6957cSBarry Smith Apply (L + D)^-1 where D is the block diagonal
362989c6957cSBarry Smith */
363089c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) {
36314d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
363289c6957cSBarry Smith sz = diag[row] - ii[row];
363389c6957cSBarry Smith v1 = a->a + ii[row];
363489c6957cSBarry Smith idx = a->j + ii[row];
36354108e4d5SBarry Smith /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */
36364d12350bSJunchao Zhang switch (nodesz) {
363789c6957cSBarry Smith case 1:
363889c6957cSBarry Smith
363989c6957cSBarry Smith sum1 = t[row];
364089c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
364189c6957cSBarry Smith i1 = idx[0];
364289c6957cSBarry Smith i2 = idx[1];
364389c6957cSBarry Smith idx += 2;
364489c6957cSBarry Smith tmp0 = t[i1];
364589c6957cSBarry Smith tmp1 = t[i2];
36469371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
36479371c9d4SSatish Balay v1 += 2;
364889c6957cSBarry Smith }
364989c6957cSBarry Smith
365089c6957cSBarry Smith if (n == sz - 1) {
365189c6957cSBarry Smith tmp0 = t[*idx];
365289c6957cSBarry Smith sum1 -= *v1 * tmp0;
365389c6957cSBarry Smith }
36549371c9d4SSatish Balay x[row] += t[row] = sum1 * (*ibdiag++);
36559371c9d4SSatish Balay row++;
365689c6957cSBarry Smith break;
365789c6957cSBarry Smith case 2:
365889c6957cSBarry Smith v2 = a->a + ii[row + 1];
365989c6957cSBarry Smith sum1 = t[row];
366089c6957cSBarry Smith sum2 = t[row + 1];
366189c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
366289c6957cSBarry Smith i1 = idx[0];
366389c6957cSBarry Smith i2 = idx[1];
366489c6957cSBarry Smith idx += 2;
366589c6957cSBarry Smith tmp0 = t[i1];
366689c6957cSBarry Smith tmp1 = t[i2];
36679371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
36689371c9d4SSatish Balay v1 += 2;
36699371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
36709371c9d4SSatish Balay v2 += 2;
367189c6957cSBarry Smith }
367289c6957cSBarry Smith
367389c6957cSBarry Smith if (n == sz - 1) {
367489c6957cSBarry Smith tmp0 = t[*idx];
367589c6957cSBarry Smith sum1 -= v1[0] * tmp0;
367689c6957cSBarry Smith sum2 -= v2[0] * tmp0;
367789c6957cSBarry Smith }
367889c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[2];
367989c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[3];
36809371c9d4SSatish Balay ibdiag += 4;
36819371c9d4SSatish Balay row += 2;
368289c6957cSBarry Smith break;
368389c6957cSBarry Smith case 3:
368489c6957cSBarry Smith v2 = a->a + ii[row + 1];
368589c6957cSBarry Smith v3 = a->a + ii[row + 2];
368689c6957cSBarry Smith sum1 = t[row];
368789c6957cSBarry Smith sum2 = t[row + 1];
368889c6957cSBarry Smith sum3 = t[row + 2];
368989c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
369089c6957cSBarry Smith i1 = idx[0];
369189c6957cSBarry Smith i2 = idx[1];
369289c6957cSBarry Smith idx += 2;
369389c6957cSBarry Smith tmp0 = t[i1];
369489c6957cSBarry Smith tmp1 = t[i2];
36959371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
36969371c9d4SSatish Balay v1 += 2;
36979371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
36989371c9d4SSatish Balay v2 += 2;
36999371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
37009371c9d4SSatish Balay v3 += 2;
370189c6957cSBarry Smith }
370289c6957cSBarry Smith
370389c6957cSBarry Smith if (n == sz - 1) {
370489c6957cSBarry Smith tmp0 = t[*idx];
370589c6957cSBarry Smith sum1 -= v1[0] * tmp0;
370689c6957cSBarry Smith sum2 -= v2[0] * tmp0;
370789c6957cSBarry Smith sum3 -= v3[0] * tmp0;
370889c6957cSBarry Smith }
370989c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[3] + sum3 * ibdiag[6];
371089c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[4] + sum3 * ibdiag[7];
371189c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[5] + sum3 * ibdiag[8];
37129371c9d4SSatish Balay ibdiag += 9;
37139371c9d4SSatish Balay row += 3;
371489c6957cSBarry Smith break;
371589c6957cSBarry Smith case 4:
371689c6957cSBarry Smith v2 = a->a + ii[row + 1];
371789c6957cSBarry Smith v3 = a->a + ii[row + 2];
371889c6957cSBarry Smith v4 = a->a + ii[row + 3];
371989c6957cSBarry Smith sum1 = t[row];
372089c6957cSBarry Smith sum2 = t[row + 1];
372189c6957cSBarry Smith sum3 = t[row + 2];
372289c6957cSBarry Smith sum4 = t[row + 3];
372389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
372489c6957cSBarry Smith i1 = idx[0];
372589c6957cSBarry Smith i2 = idx[1];
372689c6957cSBarry Smith idx += 2;
372789c6957cSBarry Smith tmp0 = t[i1];
372889c6957cSBarry Smith tmp1 = t[i2];
37299371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
37309371c9d4SSatish Balay v1 += 2;
37319371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
37329371c9d4SSatish Balay v2 += 2;
37339371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
37349371c9d4SSatish Balay v3 += 2;
37359371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
37369371c9d4SSatish Balay v4 += 2;
373789c6957cSBarry Smith }
373889c6957cSBarry Smith
373989c6957cSBarry Smith if (n == sz - 1) {
374089c6957cSBarry Smith tmp0 = t[*idx];
374189c6957cSBarry Smith sum1 -= v1[0] * tmp0;
374289c6957cSBarry Smith sum2 -= v2[0] * tmp0;
374389c6957cSBarry Smith sum3 -= v3[0] * tmp0;
374489c6957cSBarry Smith sum4 -= v4[0] * tmp0;
374589c6957cSBarry Smith }
374689c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[4] + sum3 * ibdiag[8] + sum4 * ibdiag[12];
374789c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[5] + sum3 * ibdiag[9] + sum4 * ibdiag[13];
374889c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[6] + sum3 * ibdiag[10] + sum4 * ibdiag[14];
374989c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[7] + sum3 * ibdiag[11] + sum4 * ibdiag[15];
37509371c9d4SSatish Balay ibdiag += 16;
37519371c9d4SSatish Balay row += 4;
375289c6957cSBarry Smith break;
375389c6957cSBarry Smith case 5:
375489c6957cSBarry Smith v2 = a->a + ii[row + 1];
375589c6957cSBarry Smith v3 = a->a + ii[row + 2];
375689c6957cSBarry Smith v4 = a->a + ii[row + 3];
375789c6957cSBarry Smith v5 = a->a + ii[row + 4];
375889c6957cSBarry Smith sum1 = t[row];
375989c6957cSBarry Smith sum2 = t[row + 1];
376089c6957cSBarry Smith sum3 = t[row + 2];
376189c6957cSBarry Smith sum4 = t[row + 3];
376289c6957cSBarry Smith sum5 = t[row + 4];
376389c6957cSBarry Smith for (n = 0; n < sz - 1; n += 2) {
376489c6957cSBarry Smith i1 = idx[0];
376589c6957cSBarry Smith i2 = idx[1];
376689c6957cSBarry Smith idx += 2;
376789c6957cSBarry Smith tmp0 = t[i1];
376889c6957cSBarry Smith tmp1 = t[i2];
37699371c9d4SSatish Balay sum1 -= v1[0] * tmp0 + v1[1] * tmp1;
37709371c9d4SSatish Balay v1 += 2;
37719371c9d4SSatish Balay sum2 -= v2[0] * tmp0 + v2[1] * tmp1;
37729371c9d4SSatish Balay v2 += 2;
37739371c9d4SSatish Balay sum3 -= v3[0] * tmp0 + v3[1] * tmp1;
37749371c9d4SSatish Balay v3 += 2;
37759371c9d4SSatish Balay sum4 -= v4[0] * tmp0 + v4[1] * tmp1;
37769371c9d4SSatish Balay v4 += 2;
37779371c9d4SSatish Balay sum5 -= v5[0] * tmp0 + v5[1] * tmp1;
37789371c9d4SSatish Balay v5 += 2;
377989c6957cSBarry Smith }
378089c6957cSBarry Smith
378189c6957cSBarry Smith if (n == sz - 1) {
378289c6957cSBarry Smith tmp0 = t[*idx];
378389c6957cSBarry Smith sum1 -= v1[0] * tmp0;
378489c6957cSBarry Smith sum2 -= v2[0] * tmp0;
378589c6957cSBarry Smith sum3 -= v3[0] * tmp0;
378689c6957cSBarry Smith sum4 -= v4[0] * tmp0;
378789c6957cSBarry Smith sum5 -= v5[0] * tmp0;
378889c6957cSBarry Smith }
378989c6957cSBarry Smith x[row] += t[row] = sum1 * ibdiag[0] + sum2 * ibdiag[5] + sum3 * ibdiag[10] + sum4 * ibdiag[15] + sum5 * ibdiag[20];
379089c6957cSBarry Smith x[row + 1] += t[row + 1] = sum1 * ibdiag[1] + sum2 * ibdiag[6] + sum3 * ibdiag[11] + sum4 * ibdiag[16] + sum5 * ibdiag[21];
379189c6957cSBarry Smith x[row + 2] += t[row + 2] = sum1 * ibdiag[2] + sum2 * ibdiag[7] + sum3 * ibdiag[12] + sum4 * ibdiag[17] + sum5 * ibdiag[22];
379289c6957cSBarry Smith x[row + 3] += t[row + 3] = sum1 * ibdiag[3] + sum2 * ibdiag[8] + sum3 * ibdiag[13] + sum4 * ibdiag[18] + sum5 * ibdiag[23];
379389c6957cSBarry Smith x[row + 4] += t[row + 4] = sum1 * ibdiag[4] + sum2 * ibdiag[9] + sum3 * ibdiag[14] + sum4 * ibdiag[19] + sum5 * ibdiag[24];
37949371c9d4SSatish Balay ibdiag += 25;
37959371c9d4SSatish Balay row += 5;
379689c6957cSBarry Smith break;
3797d71ae5a4SJacob Faibussowitsch default:
37984d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
379989c6957cSBarry Smith }
380089c6957cSBarry Smith }
38019566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz));
38025850ef23SBarry Smith }
38039566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x));
38049566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b));
38053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
38062af78befSBarry Smith }
38072af78befSBarry Smith
MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx)3808ff6a9541SJacob Faibussowitsch static PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A, Vec bb, Vec xx)
3809d71ae5a4SJacob Faibussowitsch {
381089c6957cSBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
381189c6957cSBarry Smith PetscScalar *x, tmp1, tmp2, tmp3, tmp4, tmp5, x1, x2, x3, x4, x5;
381289c6957cSBarry Smith const MatScalar *bdiag = a->inode.bdiag;
381389c6957cSBarry Smith const PetscScalar *b;
38144d12350bSJunchao Zhang PetscInt m = a->inode.node_count, cnt = 0, i, row, nodesz;
38154d12350bSJunchao Zhang const PetscInt *sizes = a->inode.size_csr;
38162af78befSBarry Smith
381789c6957cSBarry Smith PetscFunctionBegin;
38184d12350bSJunchao Zhang PetscCheck(a->inode.size_csr, PETSC_COMM_SELF, PETSC_ERR_COR, "Missing Inode Structure");
38199566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x));
38209566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b));
382189c6957cSBarry Smith cnt = 0;
382289c6957cSBarry Smith for (i = 0, row = 0; i < m; i++) {
38234d12350bSJunchao Zhang nodesz = sizes[i + 1] - sizes[i];
38244d12350bSJunchao Zhang switch (nodesz) {
382589c6957cSBarry Smith case 1:
38269371c9d4SSatish Balay x[row] = b[row] * bdiag[cnt++];
38279371c9d4SSatish Balay row++;
382889c6957cSBarry Smith break;
382989c6957cSBarry Smith case 2:
38309371c9d4SSatish Balay x1 = b[row];
38319371c9d4SSatish Balay x2 = b[row + 1];
383289c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 2];
383389c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 3];
383489c6957cSBarry Smith x[row++] = tmp1;
383589c6957cSBarry Smith x[row++] = tmp2;
383689c6957cSBarry Smith cnt += 4;
383789c6957cSBarry Smith break;
383889c6957cSBarry Smith case 3:
38399371c9d4SSatish Balay x1 = b[row];
38409371c9d4SSatish Balay x2 = b[row + 1];
38419371c9d4SSatish Balay x3 = b[row + 2];
384289c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 3] + x3 * bdiag[cnt + 6];
384389c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 7];
384489c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 8];
384589c6957cSBarry Smith x[row++] = tmp1;
384689c6957cSBarry Smith x[row++] = tmp2;
384789c6957cSBarry Smith x[row++] = tmp3;
384889c6957cSBarry Smith cnt += 9;
384989c6957cSBarry Smith break;
385089c6957cSBarry Smith case 4:
38519371c9d4SSatish Balay x1 = b[row];
38529371c9d4SSatish Balay x2 = b[row + 1];
38539371c9d4SSatish Balay x3 = b[row + 2];
38549371c9d4SSatish Balay x4 = b[row + 3];
385589c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 4] + x3 * bdiag[cnt + 8] + x4 * bdiag[cnt + 12];
385689c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 9] + x4 * bdiag[cnt + 13];
385789c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 14];
385889c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 15];
385989c6957cSBarry Smith x[row++] = tmp1;
386089c6957cSBarry Smith x[row++] = tmp2;
386189c6957cSBarry Smith x[row++] = tmp3;
386289c6957cSBarry Smith x[row++] = tmp4;
386389c6957cSBarry Smith cnt += 16;
386489c6957cSBarry Smith break;
386589c6957cSBarry Smith case 5:
38669371c9d4SSatish Balay x1 = b[row];
38679371c9d4SSatish Balay x2 = b[row + 1];
38689371c9d4SSatish Balay x3 = b[row + 2];
38699371c9d4SSatish Balay x4 = b[row + 3];
38709371c9d4SSatish Balay x5 = b[row + 4];
387189c6957cSBarry Smith tmp1 = x1 * bdiag[cnt] + x2 * bdiag[cnt + 5] + x3 * bdiag[cnt + 10] + x4 * bdiag[cnt + 15] + x5 * bdiag[cnt + 20];
387289c6957cSBarry Smith tmp2 = x1 * bdiag[cnt + 1] + x2 * bdiag[cnt + 6] + x3 * bdiag[cnt + 11] + x4 * bdiag[cnt + 16] + x5 * bdiag[cnt + 21];
387389c6957cSBarry Smith tmp3 = x1 * bdiag[cnt + 2] + x2 * bdiag[cnt + 7] + x3 * bdiag[cnt + 12] + x4 * bdiag[cnt + 17] + x5 * bdiag[cnt + 22];
387489c6957cSBarry Smith tmp4 = x1 * bdiag[cnt + 3] + x2 * bdiag[cnt + 8] + x3 * bdiag[cnt + 13] + x4 * bdiag[cnt + 18] + x5 * bdiag[cnt + 23];
387589c6957cSBarry Smith tmp5 = x1 * bdiag[cnt + 4] + x2 * bdiag[cnt + 9] + x3 * bdiag[cnt + 14] + x4 * bdiag[cnt + 19] + x5 * bdiag[cnt + 24];
387689c6957cSBarry Smith x[row++] = tmp1;
387789c6957cSBarry Smith x[row++] = tmp2;
387889c6957cSBarry Smith x[row++] = tmp3;
387989c6957cSBarry Smith x[row++] = tmp4;
388089c6957cSBarry Smith x[row++] = tmp5;
388189c6957cSBarry Smith cnt += 25;
388289c6957cSBarry Smith break;
3883d71ae5a4SJacob Faibussowitsch default:
38844d12350bSJunchao Zhang SETERRQ(PETSC_COMM_SELF, PETSC_ERR_COR, "Node size not supported, node row %" PetscInt_FMT " size %" PetscInt_FMT, row, nodesz);
388589c6957cSBarry Smith }
388689c6957cSBarry Smith }
38879566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * cnt));
38889566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x));
38899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b));
38903ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
389189c6957cSBarry Smith }
389289c6957cSBarry Smith
MatSeqAIJ_Inode_ResetOps(Mat A)3893d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A)
3894d71ae5a4SJacob Faibussowitsch {
3895b215bc84SStefano Zampini Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
3896b215bc84SStefano Zampini
3897b215bc84SStefano Zampini PetscFunctionBegin;
3898b215bc84SStefano Zampini a->inode.node_count = 0;
3899b215bc84SStefano Zampini a->inode.use = PETSC_FALSE;
3900b215bc84SStefano Zampini a->inode.checked = PETSC_FALSE;
3901b215bc84SStefano Zampini a->inode.mat_nonzerostate = -1;
3902b215bc84SStefano Zampini A->ops->getrowij = MatGetRowIJ_SeqAIJ;
3903b215bc84SStefano Zampini A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ;
3904b215bc84SStefano Zampini A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ;
3905b215bc84SStefano Zampini A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ;
3906b215bc84SStefano Zampini A->ops->coloringpatch = NULL;
3907b215bc84SStefano Zampini A->ops->multdiagonalblock = NULL;
3908ad540459SPierre Jolivet if (A->factortype) A->ops->solve = MatSolve_SeqAIJ_inplace;
39093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
3910b215bc84SStefano Zampini }
3911b215bc84SStefano Zampini
39124c1414c8SBarry Smith /*
39134c1414c8SBarry Smith samestructure indicates that the matrix has not changed its nonzero structure so we
39144c1414c8SBarry Smith do not need to recompute the inodes
39154c1414c8SBarry Smith */
MatSeqAIJCheckInode(Mat A)3916d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode(Mat A)
3917d71ae5a4SJacob Faibussowitsch {
39184c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
39198758e1faSBarry Smith PetscInt i, j, m, nzx, nzy, *ns, node_count, blk_size;
3920ace3abfcSBarry Smith PetscBool flag;
39218758e1faSBarry Smith const PetscInt *idx, *idy, *ii;
39224c1414c8SBarry Smith
39234c1414c8SBarry Smith PetscFunctionBegin;
3924b215bc84SStefano Zampini if (!a->inode.use) {
39259566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A));
39264d12350bSJunchao Zhang PetscCall(PetscFree(a->inode.size_csr));
39273ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
3928b215bc84SStefano Zampini }
39293ba16761SJacob Faibussowitsch if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(PETSC_SUCCESS);
39304c1414c8SBarry Smith
3931d0f46423SBarry Smith m = A->rmap->n;
39324d12350bSJunchao Zhang if (!a->inode.size_csr) PetscCall(PetscMalloc1(m + 1, &a->inode.size_csr));
39334d12350bSJunchao Zhang ns = a->inode.size_csr;
39344d12350bSJunchao Zhang ns[0] = 0;
39354c1414c8SBarry Smith
39364c1414c8SBarry Smith i = 0;
39374c1414c8SBarry Smith node_count = 0;
39384c1414c8SBarry Smith idx = a->j;
39394c1414c8SBarry Smith ii = a->i;
39406f2c871aSStefano Zampini if (idx) {
39414c1414c8SBarry Smith while (i < m) { /* For each row */
39424c1414c8SBarry Smith nzx = ii[i + 1] - ii[i]; /* Number of nonzeros */
39434c1414c8SBarry Smith /* Limits the number of elements in a node to 'a->inode.limit' */
39444c1414c8SBarry Smith for (j = i + 1, idy = idx, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
39454c1414c8SBarry Smith nzy = ii[j + 1] - ii[j]; /* Same number of nonzeros */
39464c1414c8SBarry Smith if (nzy != nzx) break;
39474c1414c8SBarry Smith idy += nzx; /* Same nonzero pattern */
39489566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(idx, idy, nzx, &flag));
39494c1414c8SBarry Smith if (!flag) break;
39504c1414c8SBarry Smith }
39514d12350bSJunchao Zhang ns[node_count + 1] = ns[node_count] + blk_size;
39524d12350bSJunchao Zhang node_count++;
39534c1414c8SBarry Smith idx += blk_size * nzx;
39544c1414c8SBarry Smith i = j;
39554c1414c8SBarry Smith }
39566f2c871aSStefano Zampini }
39574c1414c8SBarry Smith /* If not enough inodes found,, do not use inode version of the routines */
39586f2c871aSStefano Zampini if (!m || !idx || node_count > .8 * m) {
39599566063dSJacob Faibussowitsch PetscCall(MatSeqAIJ_Inode_ResetOps(A));
39604d12350bSJunchao Zhang PetscCall(PetscFree(a->inode.size_csr));
39619566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
39624c1414c8SBarry Smith } else {
3963d5f3da31SBarry Smith if (!A->factortype) {
3964375a6242SBarry Smith A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
3965375a6242SBarry Smith if (A->rmap->n == A->cmap->n) {
39664108e4d5SBarry Smith A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
39674108e4d5SBarry Smith A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
39684108e4d5SBarry Smith A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
39694108e4d5SBarry Smith A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
39704108e4d5SBarry Smith A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
3971375a6242SBarry Smith }
3972d3ac4fa3SBarry Smith } else {
3973d3ac4fa3SBarry Smith A->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
3974d3ac4fa3SBarry Smith }
39754c1414c8SBarry Smith a->inode.node_count = node_count;
39769566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
39774c1414c8SBarry Smith }
3978be6adb11SBarry Smith a->inode.checked = PETSC_TRUE;
3979a02bda8eSBarry Smith a->inode.mat_nonzerostate = A->nonzerostate;
39803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
39814c1414c8SBarry Smith }
39824c1414c8SBarry Smith
MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat * C)3983d71ae5a4SJacob Faibussowitsch PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A, MatDuplicateOption cpvalues, Mat *C)
3984d71ae5a4SJacob Faibussowitsch {
3985150f0143SBarry Smith Mat B = *C;
3986150f0143SBarry Smith Mat_SeqAIJ *c = (Mat_SeqAIJ *)B->data, *a = (Mat_SeqAIJ *)A->data;
3987150f0143SBarry Smith PetscInt m = A->rmap->n;
3988150f0143SBarry Smith
3989150f0143SBarry Smith PetscFunctionBegin;
3990150f0143SBarry Smith c->inode.use = a->inode.use;
3991150f0143SBarry Smith c->inode.limit = a->inode.limit;
3992150f0143SBarry Smith c->inode.max_limit = a->inode.max_limit;
3993ec710b6aSStefano Zampini c->inode.checked = PETSC_FALSE;
39944d12350bSJunchao Zhang c->inode.size_csr = NULL;
3995ec710b6aSStefano Zampini c->inode.node_count = 0;
3996ec710b6aSStefano Zampini c->inode.ibdiag = NULL;
3997ec710b6aSStefano Zampini c->inode.bdiag = NULL;
3998ec710b6aSStefano Zampini c->inode.mat_nonzerostate = -1;
3999b215bc84SStefano Zampini if (a->inode.use) {
40004d12350bSJunchao Zhang if (a->inode.checked && a->inode.size_csr) {
40014d12350bSJunchao Zhang PetscCall(PetscMalloc1(m + 1, &c->inode.size_csr));
40024d12350bSJunchao Zhang PetscCall(PetscArraycpy(c->inode.size_csr, a->inode.size_csr, m + 1));
4003ec710b6aSStefano Zampini
4004ec710b6aSStefano Zampini c->inode.checked = PETSC_TRUE;
4005ec710b6aSStefano Zampini c->inode.node_count = a->inode.node_count;
4006ec710b6aSStefano Zampini c->inode.mat_nonzerostate = (*C)->nonzerostate;
4007ec710b6aSStefano Zampini }
4008a02bda8eSBarry Smith /* note the table of functions below should match that in MatSeqAIJCheckInode() */
40092c451681SBarry Smith if (!B->factortype) {
40102c451681SBarry Smith B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode;
40112c451681SBarry Smith B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode;
40122c451681SBarry Smith B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode;
40132c451681SBarry Smith B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode;
40142c451681SBarry Smith B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode;
40152c451681SBarry Smith B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode;
4016150f0143SBarry Smith } else {
40172c451681SBarry Smith B->ops->solve = MatSolve_SeqAIJ_Inode_inplace;
4018150f0143SBarry Smith }
4019150f0143SBarry Smith }
40203ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
4021150f0143SBarry Smith }
4022150f0143SBarry Smith
MatGetRow_FactoredLU(PetscInt * cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt * ai,const PetscInt * aj,const PetscInt * adiag,PetscInt row)4023d71ae5a4SJacob Faibussowitsch static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols, PetscInt nzl, PetscInt nzu, PetscInt nz, const PetscInt *ai, const PetscInt *aj, const PetscInt *adiag, PetscInt row)
4024d71ae5a4SJacob Faibussowitsch {
40258758e1faSBarry Smith PetscInt k;
40268758e1faSBarry Smith const PetscInt *vi;
40276e111a19SKarl Rupp
402817454e89SShri Abhyankar PetscFunctionBegin;
402917454e89SShri Abhyankar vi = aj + ai[row];
403017454e89SShri Abhyankar for (k = 0; k < nzl; k++) cols[k] = vi[k];
403117454e89SShri Abhyankar vi = aj + adiag[row];
403217454e89SShri Abhyankar cols[nzl] = vi[0];
403317454e89SShri Abhyankar vi = aj + adiag[row + 1] + 1;
403417454e89SShri Abhyankar for (k = 0; k < nzu; k++) cols[nzl + 1 + k] = vi[k];
40353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
403617454e89SShri Abhyankar }
40376936b636SHong Zhang /*
4038a02bda8eSBarry Smith MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix.
4039a02bda8eSBarry Smith Modified from MatSeqAIJCheckInode().
40406936b636SHong Zhang
40416936b636SHong Zhang Input Parameters:
4042abb87a52SBarry Smith . Mat A - ILU or LU matrix factor
4043abb87a52SBarry Smith
40446936b636SHong Zhang */
MatSeqAIJCheckInode_FactorLU(Mat A)4045d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A)
4046d71ae5a4SJacob Faibussowitsch {
4047019b515eSShri Abhyankar Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
4048019b515eSShri Abhyankar PetscInt i, j, m, nzl1, nzu1, nzl2, nzu2, nzx, nzy, node_count, blk_size;
40498758e1faSBarry Smith PetscInt *cols1, *cols2, *ns;
40508758e1faSBarry Smith const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag;
4051ace3abfcSBarry Smith PetscBool flag;
4052019b515eSShri Abhyankar
4053019b515eSShri Abhyankar PetscFunctionBegin;
40543ba16761SJacob Faibussowitsch if (!a->inode.use) PetscFunctionReturn(PETSC_SUCCESS);
40553ba16761SJacob Faibussowitsch if (a->inode.checked) PetscFunctionReturn(PETSC_SUCCESS);
4056019b515eSShri Abhyankar
4057019b515eSShri Abhyankar m = A->rmap->n;
40584d12350bSJunchao Zhang if (a->inode.size_csr) ns = a->inode.size_csr;
405948a46eb9SPierre Jolivet else PetscCall(PetscMalloc1(m + 1, &ns));
40604d12350bSJunchao Zhang ns[0] = 0;
4061019b515eSShri Abhyankar
4062019b515eSShri Abhyankar i = 0;
4063019b515eSShri Abhyankar node_count = 0;
40649566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &cols1, m, &cols2));
4065019b515eSShri Abhyankar while (i < m) { /* For each row */
4066019b515eSShri Abhyankar nzl1 = ai[i + 1] - ai[i]; /* Number of nonzeros in L */
4067019b515eSShri Abhyankar nzu1 = adiag[i] - adiag[i + 1] - 1; /* Number of nonzeros in U excluding diagonal*/
4068019b515eSShri Abhyankar nzx = nzl1 + nzu1 + 1;
40693ba16761SJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols1, nzl1, nzu1, nzx, ai, aj, adiag, i));
4070019b515eSShri Abhyankar
4071019b515eSShri Abhyankar /* Limits the number of elements in a node to 'a->inode.limit' */
4072019b515eSShri Abhyankar for (j = i + 1, blk_size = 1; j < m && blk_size < a->inode.limit; ++j, ++blk_size) {
4073019b515eSShri Abhyankar nzl2 = ai[j + 1] - ai[j];
4074019b515eSShri Abhyankar nzu2 = adiag[j] - adiag[j + 1] - 1;
4075019b515eSShri Abhyankar nzy = nzl2 + nzu2 + 1;
4076019b515eSShri Abhyankar if (nzy != nzx) break;
40779566063dSJacob Faibussowitsch PetscCall(MatGetRow_FactoredLU(cols2, nzl2, nzu2, nzy, ai, aj, adiag, j));
40789566063dSJacob Faibussowitsch PetscCall(PetscArraycmp(cols1, cols2, nzx, &flag));
40798758e1faSBarry Smith if (!flag) break;
4080019b515eSShri Abhyankar }
40814d12350bSJunchao Zhang ns[node_count + 1] = ns[node_count] + blk_size;
40824d12350bSJunchao Zhang node_count++;
4083019b515eSShri Abhyankar i = j;
4084019b515eSShri Abhyankar }
40859566063dSJacob Faibussowitsch PetscCall(PetscFree2(cols1, cols2));
4086019b515eSShri Abhyankar /* If not enough inodes found,, do not use inode version of the routines */
4087be6adb11SBarry Smith if (!m || node_count > .8 * m) {
40889566063dSJacob Faibussowitsch PetscCall(PetscFree(ns));
40892205254eSKarl Rupp
4090019b515eSShri Abhyankar a->inode.node_count = 0;
40914d12350bSJunchao Zhang a->inode.size_csr = NULL;
4092019b515eSShri Abhyankar a->inode.use = PETSC_FALSE;
40932205254eSKarl Rupp
40949566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n", node_count, m));
4095019b515eSShri Abhyankar } else {
4096f4259b30SLisandro Dalcin A->ops->mult = NULL;
4097f4259b30SLisandro Dalcin A->ops->sor = NULL;
4098f4259b30SLisandro Dalcin A->ops->multadd = NULL;
4099f4259b30SLisandro Dalcin A->ops->getrowij = NULL;
4100f4259b30SLisandro Dalcin A->ops->restorerowij = NULL;
4101f4259b30SLisandro Dalcin A->ops->getcolumnij = NULL;
4102f4259b30SLisandro Dalcin A->ops->restorecolumnij = NULL;
4103f4259b30SLisandro Dalcin A->ops->coloringpatch = NULL;
4104f4259b30SLisandro Dalcin A->ops->multdiagonalblock = NULL;
4105019b515eSShri Abhyankar a->inode.node_count = node_count;
41064d12350bSJunchao Zhang a->inode.size_csr = ns;
41079566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n", node_count, m, a->inode.limit));
4108019b515eSShri Abhyankar }
4109be6adb11SBarry Smith a->inode.checked = PETSC_TRUE;
41103ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
4111019b515eSShri Abhyankar }
4112019b515eSShri Abhyankar
41134c1414c8SBarry Smith /*
41144c1414c8SBarry Smith This is really ugly. if inodes are used this replaces the
41154c1414c8SBarry Smith permutations with ones that correspond to rows/cols of the matrix
4116467446fbSPierre Jolivet rather than inode blocks
41174c1414c8SBarry Smith */
MatInodeAdjustForInodes(Mat A,IS * rperm,IS * cperm)4118d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes(Mat A, IS *rperm, IS *cperm)
4119d71ae5a4SJacob Faibussowitsch {
41204c1414c8SBarry Smith PetscFunctionBegin;
4121cac4c232SBarry Smith PetscTryMethod(A, "MatInodeAdjustForInodes_C", (Mat, IS *, IS *), (A, rperm, cperm));
41223ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
41234c1414c8SBarry Smith }
41244c1414c8SBarry Smith
MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS * rperm,IS * cperm)4125d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A, IS *rperm, IS *cperm)
4126d71ae5a4SJacob Faibussowitsch {
41274c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
41285d0c19d7SBarry Smith PetscInt m = A->rmap->n, n = A->cmap->n, i, j, nslim_row = a->inode.node_count;
41295d0c19d7SBarry Smith const PetscInt *ridx, *cidx;
41304d12350bSJunchao Zhang PetscInt row, col, *permr, *permc, *ns_row = a->inode.size_csr, *tns, start_val, end_val, indx;
41314c1414c8SBarry Smith PetscInt nslim_col, *ns_col;
41324c1414c8SBarry Smith IS ris = *rperm, cis = *cperm;
41334c1414c8SBarry Smith
41344c1414c8SBarry Smith PetscFunctionBegin;
41354d12350bSJunchao Zhang if (!a->inode.size_csr) PetscFunctionReturn(PETSC_SUCCESS); /* no inodes so return */
41363ba16761SJacob Faibussowitsch if (a->inode.node_count == m) PetscFunctionReturn(PETSC_SUCCESS); /* all inodes are of size 1 */
41374c1414c8SBarry Smith
41389566063dSJacob Faibussowitsch PetscCall(MatCreateColInode_Private(A, &nslim_col, &ns_col));
413932603206SJames Wright PetscCall(PetscMalloc1(((nslim_row > nslim_col ? nslim_row : nslim_col) + 1), &tns));
41409566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(m, &permr, n, &permc));
41414c1414c8SBarry Smith
41429566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ris, &ridx));
41439566063dSJacob Faibussowitsch PetscCall(ISGetIndices(cis, &cidx));
41444c1414c8SBarry Smith
4145baca6076SPierre Jolivet /* Form the inode structure for the rows of permuted matrix using inv perm*/
41464d12350bSJunchao Zhang for (i = 0, tns[0] = 0; i < nslim_row; ++i) tns[i + 1] = tns[i] + (ns_row[i + 1] - ns_row[i]);
41474c1414c8SBarry Smith
41484c1414c8SBarry Smith /* Construct the permutations for rows*/
41494c1414c8SBarry Smith for (i = 0, row = 0; i < nslim_row; ++i) {
41504c1414c8SBarry Smith indx = ridx[i];
41514c1414c8SBarry Smith start_val = tns[indx];
41524c1414c8SBarry Smith end_val = tns[indx + 1];
41534c1414c8SBarry Smith for (j = start_val; j < end_val; ++j, ++row) permr[row] = j;
41544c1414c8SBarry Smith }
41554c1414c8SBarry Smith
41564c1414c8SBarry Smith /* Form the inode structure for the columns of permuted matrix using inv perm*/
41574d12350bSJunchao Zhang for (i = 0, tns[0] = 0; i < nslim_col; ++i) tns[i + 1] = tns[i] + (ns_col[i + 1] - ns_col[i]);
41584c1414c8SBarry Smith
41594c1414c8SBarry Smith /* Construct permutations for columns */
41604c1414c8SBarry Smith for (i = 0, col = 0; i < nslim_col; ++i) {
41614c1414c8SBarry Smith indx = cidx[i];
41624c1414c8SBarry Smith start_val = tns[indx];
41634c1414c8SBarry Smith end_val = tns[indx + 1];
41644c1414c8SBarry Smith for (j = start_val; j < end_val; ++j, ++col) permc[col] = j;
41654c1414c8SBarry Smith }
41664c1414c8SBarry Smith
41679566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permr, PETSC_COPY_VALUES, rperm));
41689566063dSJacob Faibussowitsch PetscCall(ISSetPermutation(*rperm));
41699566063dSJacob Faibussowitsch PetscCall(ISCreateGeneral(PETSC_COMM_SELF, n, permc, PETSC_COPY_VALUES, cperm));
41709566063dSJacob Faibussowitsch PetscCall(ISSetPermutation(*cperm));
41714c1414c8SBarry Smith
41729566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ris, &ridx));
41739566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(cis, &cidx));
41744c1414c8SBarry Smith
41759566063dSJacob Faibussowitsch PetscCall(PetscFree(ns_col));
41769566063dSJacob Faibussowitsch PetscCall(PetscFree2(permr, permc));
41779566063dSJacob Faibussowitsch PetscCall(ISDestroy(&cis));
41789566063dSJacob Faibussowitsch PetscCall(ISDestroy(&ris));
41799566063dSJacob Faibussowitsch PetscCall(PetscFree(tns));
41803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
41814c1414c8SBarry Smith }
41824c1414c8SBarry Smith
/*@C
  MatInodeGetInodeSizes - Returns the inode information of a matrix with inodes

  Not Collective

  Input Parameter:
. A - the Inode matrix or matrix derived from the Inode class -- e.g., `MATSEQAIJ`

  Output Parameters:
+ node_count - number of inodes present in the matrix.
. sizes      - an array of length `node_count` + 1 holding the inode row offsets in CSR form, starting at 0; inode `i` has size `sizes[i+1]` - `sizes[i]`.
- limit      - the max size used to generate the inodes.

  Level: advanced

  Notes:
  It should be called after the matrix is assembled.

  The contents of the `sizes` array should not be changed.

  `NULL` may be passed for information not needed

.seealso: [](ch_matrices), `Mat`, `MatGetInfo()`
@*/
MatInodeGetInodeSizes(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4205d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4206d71ae5a4SJacob Faibussowitsch {
42075f80ce2aSJacob Faibussowitsch PetscErrorCode (*f)(Mat, PetscInt *, PetscInt **, PetscInt *);
42084c1414c8SBarry Smith
42094c1414c8SBarry Smith PetscFunctionBegin;
42105f80ce2aSJacob Faibussowitsch PetscCheck(A->assembled, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Not for unassembled matrix");
42119566063dSJacob Faibussowitsch PetscCall(PetscObjectQueryFunction((PetscObject)A, "MatInodeGetInodeSizes_C", &f));
42129566063dSJacob Faibussowitsch if (f) PetscCall((*f)(A, node_count, sizes, limit));
42133ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
42144c1414c8SBarry Smith }
42154c1414c8SBarry Smith
MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt * node_count,PetscInt * sizes[],PetscInt * limit)4216d71ae5a4SJacob Faibussowitsch PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A, PetscInt *node_count, PetscInt *sizes[], PetscInt *limit)
4217d71ae5a4SJacob Faibussowitsch {
42184c1414c8SBarry Smith Mat_SeqAIJ *a = (Mat_SeqAIJ *)A->data;
42194c1414c8SBarry Smith
42204c1414c8SBarry Smith PetscFunctionBegin;
42214c1414c8SBarry Smith if (node_count) *node_count = a->inode.node_count;
42224d12350bSJunchao Zhang if (sizes) *sizes = a->inode.size_csr;
42234c1414c8SBarry Smith if (limit) *limit = a->inode.limit;
42243ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
42254c1414c8SBarry Smith }
4226