183287d42SBarry Smith /*
283287d42SBarry Smith Factorization code for BAIJ format.
383287d42SBarry Smith */
4c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
5af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
683287d42SBarry Smith
783287d42SBarry Smith /*
883287d42SBarry Smith Version for when blocks are 3 by 3
983287d42SBarry Smith */
MatILUFactorNumeric_SeqBAIJ_3_inplace(Mat C,Mat A,const MatFactorInfo * info)10*421480d9SBarry Smith PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info)
11d71ae5a4SJacob Faibussowitsch {
1283287d42SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1383287d42SBarry Smith IS isrow = b->row, isicol = b->icol;
145d0c19d7SBarry Smith const PetscInt *r, *ic;
155d0c19d7SBarry Smith PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16690b6cddSBarry Smith PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
17*421480d9SBarry Smith const PetscInt *diag_offset;
18*421480d9SBarry Smith PetscInt idx, *pj;
1983287d42SBarry Smith MatScalar *pv, *v, *rtmp, *pc, *w, *x;
2083287d42SBarry Smith MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
2183287d42SBarry Smith MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
2283287d42SBarry Smith MatScalar *ba = b->a, *aa = a->a;
23182b8fbaSHong Zhang PetscReal shift = info->shiftamount;
24a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected;
2583287d42SBarry Smith
2683287d42SBarry Smith PetscFunctionBegin;
27*421480d9SBarry Smith /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
28*421480d9SBarry Smith A->factortype = MAT_FACTOR_NONE;
29*421480d9SBarry Smith PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
30*421480d9SBarry Smith A->factortype = MAT_FACTOR_ILU;
319566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r));
329566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic));
339566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
345f8bbccaSHong Zhang allowzeropivot = PetscNot(A->erroriffailure);
3583287d42SBarry Smith
3683287d42SBarry Smith for (i = 0; i < n; i++) {
3783287d42SBarry Smith nz = bi[i + 1] - bi[i];
3883287d42SBarry Smith ajtmp = bj + bi[i];
3983287d42SBarry Smith for (j = 0; j < nz; j++) {
4083287d42SBarry Smith x = rtmp + 9 * ajtmp[j];
4183287d42SBarry Smith x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
4283287d42SBarry Smith }
4383287d42SBarry Smith /* load in initial (unfactored row) */
4483287d42SBarry Smith idx = r[i];
4583287d42SBarry Smith nz = ai[idx + 1] - ai[idx];
4683287d42SBarry Smith ajtmpold = aj + ai[idx];
4783287d42SBarry Smith v = aa + 9 * ai[idx];
4883287d42SBarry Smith for (j = 0; j < nz; j++) {
4983287d42SBarry Smith x = rtmp + 9 * ic[ajtmpold[j]];
509371c9d4SSatish Balay x[0] = v[0];
519371c9d4SSatish Balay x[1] = v[1];
529371c9d4SSatish Balay x[2] = v[2];
539371c9d4SSatish Balay x[3] = v[3];
549371c9d4SSatish Balay x[4] = v[4];
559371c9d4SSatish Balay x[5] = v[5];
569371c9d4SSatish Balay x[6] = v[6];
579371c9d4SSatish Balay x[7] = v[7];
589371c9d4SSatish Balay x[8] = v[8];
5983287d42SBarry Smith v += 9;
6083287d42SBarry Smith }
6183287d42SBarry Smith row = *ajtmp++;
6283287d42SBarry Smith while (row < i) {
6383287d42SBarry Smith pc = rtmp + 9 * row;
649371c9d4SSatish Balay p1 = pc[0];
659371c9d4SSatish Balay p2 = pc[1];
669371c9d4SSatish Balay p3 = pc[2];
679371c9d4SSatish Balay p4 = pc[3];
689371c9d4SSatish Balay p5 = pc[4];
699371c9d4SSatish Balay p6 = pc[5];
709371c9d4SSatish Balay p7 = pc[6];
719371c9d4SSatish Balay p8 = pc[7];
729371c9d4SSatish Balay p9 = pc[8];
739371c9d4SSatish Balay if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
7483287d42SBarry Smith pv = ba + 9 * diag_offset[row];
7583287d42SBarry Smith pj = bj + diag_offset[row] + 1;
769371c9d4SSatish Balay x1 = pv[0];
779371c9d4SSatish Balay x2 = pv[1];
789371c9d4SSatish Balay x3 = pv[2];
799371c9d4SSatish Balay x4 = pv[3];
809371c9d4SSatish Balay x5 = pv[4];
819371c9d4SSatish Balay x6 = pv[5];
829371c9d4SSatish Balay x7 = pv[6];
839371c9d4SSatish Balay x8 = pv[7];
849371c9d4SSatish Balay x9 = pv[8];
8583287d42SBarry Smith pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
8683287d42SBarry Smith pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
8783287d42SBarry Smith pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
8883287d42SBarry Smith
8983287d42SBarry Smith pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
9083287d42SBarry Smith pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
9183287d42SBarry Smith pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
9283287d42SBarry Smith
9383287d42SBarry Smith pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
9483287d42SBarry Smith pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
9583287d42SBarry Smith pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
9683287d42SBarry Smith nz = bi[row + 1] - diag_offset[row] - 1;
9783287d42SBarry Smith pv += 9;
9883287d42SBarry Smith for (j = 0; j < nz; j++) {
999371c9d4SSatish Balay x1 = pv[0];
1009371c9d4SSatish Balay x2 = pv[1];
1019371c9d4SSatish Balay x3 = pv[2];
1029371c9d4SSatish Balay x4 = pv[3];
1039371c9d4SSatish Balay x5 = pv[4];
1049371c9d4SSatish Balay x6 = pv[5];
1059371c9d4SSatish Balay x7 = pv[6];
1069371c9d4SSatish Balay x8 = pv[7];
1079371c9d4SSatish Balay x9 = pv[8];
10883287d42SBarry Smith x = rtmp + 9 * pj[j];
10983287d42SBarry Smith x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
11083287d42SBarry Smith x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
11183287d42SBarry Smith x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
11283287d42SBarry Smith
11383287d42SBarry Smith x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
11483287d42SBarry Smith x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
11583287d42SBarry Smith x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
11683287d42SBarry Smith
11783287d42SBarry Smith x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
11883287d42SBarry Smith x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
11983287d42SBarry Smith x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
12083287d42SBarry Smith pv += 9;
12183287d42SBarry Smith }
1229566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(54.0 * nz + 36.0));
12383287d42SBarry Smith }
12483287d42SBarry Smith row = *ajtmp++;
12583287d42SBarry Smith }
12683287d42SBarry Smith /* finished row so stick it into b->a */
12783287d42SBarry Smith pv = ba + 9 * bi[i];
12883287d42SBarry Smith pj = bj + bi[i];
12983287d42SBarry Smith nz = bi[i + 1] - bi[i];
13083287d42SBarry Smith for (j = 0; j < nz; j++) {
13183287d42SBarry Smith x = rtmp + 9 * pj[j];
1329371c9d4SSatish Balay pv[0] = x[0];
1339371c9d4SSatish Balay pv[1] = x[1];
1349371c9d4SSatish Balay pv[2] = x[2];
1359371c9d4SSatish Balay pv[3] = x[3];
1369371c9d4SSatish Balay pv[4] = x[4];
1379371c9d4SSatish Balay pv[5] = x[5];
1389371c9d4SSatish Balay pv[6] = x[6];
1399371c9d4SSatish Balay pv[7] = x[7];
1409371c9d4SSatish Balay pv[8] = x[8];
14183287d42SBarry Smith pv += 9;
14283287d42SBarry Smith }
14383287d42SBarry Smith /* invert diagonal block */
14483287d42SBarry Smith w = ba + 9 * diag_offset[i];
1459566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
1467b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
14783287d42SBarry Smith }
14883287d42SBarry Smith
1499566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp));
1509566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic));
1519566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r));
15226fbe8dcSKarl Rupp
15306e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_3_inplace;
15406e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
15583287d42SBarry Smith C->assembled = PETSC_TRUE;
15626fbe8dcSKarl Rupp
1579566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
1583ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
15983287d42SBarry Smith }
16017542729SShri Abhyankar
1614dd39f65SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_3 -
1624dd39f65SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
16396b95a6bSBarry Smith PetscKernel_A_gets_A_times_B()
16496b95a6bSBarry Smith PetscKernel_A_gets_A_minus_B_times_C()
16596b95a6bSBarry Smith PetscKernel_A_gets_inverse_A()
16617542729SShri Abhyankar */
MatLUFactorNumeric_SeqBAIJ_3(Mat B,Mat A,const MatFactorInfo * info)167d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info)
168d71ae5a4SJacob Faibussowitsch {
16917542729SShri Abhyankar Mat C = B;
17017542729SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
17117542729SShri Abhyankar IS isrow = b->row, isicol = b->icol;
1725a586d82SBarry Smith const PetscInt *r, *ic;
173bbd65245SShri Abhyankar PetscInt i, j, k, nz, nzL, row;
174bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
175bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
17617542729SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
177bbd65245SShri Abhyankar PetscInt flg;
178182b8fbaSHong Zhang PetscReal shift = info->shiftamount;
179a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected;
18017542729SShri Abhyankar
18117542729SShri Abhyankar PetscFunctionBegin;
1829566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r));
1839566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic));
1840164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure);
18517542729SShri Abhyankar
18617542729SShri Abhyankar /* generate work space needed by the factorization */
1879566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
1889566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n));
18917542729SShri Abhyankar
19017542729SShri Abhyankar for (i = 0; i < n; i++) {
19117542729SShri Abhyankar /* zero rtmp */
19217542729SShri Abhyankar /* L part */
19317542729SShri Abhyankar nz = bi[i + 1] - bi[i];
19417542729SShri Abhyankar bjtmp = bj + bi[i];
19548a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
19617542729SShri Abhyankar
19717542729SShri Abhyankar /* U part */
1980c4413a7SShri Abhyankar nz = bdiag[i] - bdiag[i + 1];
1990c4413a7SShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1;
20048a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
2010c4413a7SShri Abhyankar
2020c4413a7SShri Abhyankar /* load in initial (unfactored row) */
2030c4413a7SShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]];
2040c4413a7SShri Abhyankar ajtmp = aj + ai[r[i]];
2050c4413a7SShri Abhyankar v = aa + bs2 * ai[r[i]];
20648a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
2070c4413a7SShri Abhyankar
2080c4413a7SShri Abhyankar /* elimination */
2090c4413a7SShri Abhyankar bjtmp = bj + bi[i];
2100c4413a7SShri Abhyankar nzL = bi[i + 1] - bi[i];
2110c4413a7SShri Abhyankar for (k = 0; k < nzL; k++) {
2120c4413a7SShri Abhyankar row = bjtmp[k];
2130c4413a7SShri Abhyankar pc = rtmp + bs2 * row;
214c35f09e5SBarry Smith for (flg = 0, j = 0; j < bs2; j++) {
215c35f09e5SBarry Smith if (pc[j] != 0.0) {
216c35f09e5SBarry Smith flg = 1;
217c35f09e5SBarry Smith break;
218c35f09e5SBarry Smith }
219c35f09e5SBarry Smith }
2200c4413a7SShri Abhyankar if (flg) {
2210c4413a7SShri Abhyankar pv = b->a + bs2 * bdiag[row];
22296b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
2239566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
2240c4413a7SShri Abhyankar
2250c4413a7SShri Abhyankar pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
2260c4413a7SShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1);
2270c4413a7SShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
2280c4413a7SShri Abhyankar for (j = 0; j < nz; j++) {
22996b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
2300c4413a7SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
2310c4413a7SShri Abhyankar v = rtmp + bs2 * pj[j];
2329566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
2330c4413a7SShri Abhyankar pv += bs2;
2340c4413a7SShri Abhyankar }
2359566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
2360c4413a7SShri Abhyankar }
2370c4413a7SShri Abhyankar }
2380c4413a7SShri Abhyankar
2390c4413a7SShri Abhyankar /* finished row so stick it into b->a */
2400c4413a7SShri Abhyankar /* L part */
2410c4413a7SShri Abhyankar pv = b->a + bs2 * bi[i];
2420c4413a7SShri Abhyankar pj = b->j + bi[i];
2430c4413a7SShri Abhyankar nz = bi[i + 1] - bi[i];
24448a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
2450c4413a7SShri Abhyankar
246a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */
2470c4413a7SShri Abhyankar pv = b->a + bs2 * bdiag[i];
2480c4413a7SShri Abhyankar pj = b->j + bdiag[i];
2499566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
2509566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
2517b6c816cSBarry Smith if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2520c4413a7SShri Abhyankar
2530c4413a7SShri Abhyankar /* U part */
2540c4413a7SShri Abhyankar pj = b->j + bdiag[i + 1] + 1;
2550c4413a7SShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1);
2560c4413a7SShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1;
25748a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
2580c4413a7SShri Abhyankar }
2590c4413a7SShri Abhyankar
2609566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork));
2619566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic));
2629566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r));
26326fbe8dcSKarl Rupp
2644dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_3;
2654dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
2660c4413a7SShri Abhyankar C->assembled = PETSC_TRUE;
26726fbe8dcSKarl Rupp
2689566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
2693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
2700c4413a7SShri Abhyankar }
2710c4413a7SShri Abhyankar
MatILUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C,Mat A,const MatFactorInfo * info)272*421480d9SBarry Smith PetscErrorCode MatILUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
273d71ae5a4SJacob Faibussowitsch {
27417542729SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
27517542729SShri Abhyankar PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j;
27617542729SShri Abhyankar PetscInt *ajtmpold, *ajtmp, nz, row;
277*421480d9SBarry Smith const PetscInt *diag_offset;
278*421480d9SBarry Smith PetscInt *ai = a->i, *aj = a->j, *pj;
27917542729SShri Abhyankar MatScalar *pv, *v, *rtmp, *pc, *w, *x;
28017542729SShri Abhyankar MatScalar p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
28117542729SShri Abhyankar MatScalar p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
28217542729SShri Abhyankar MatScalar *ba = b->a, *aa = a->a;
283182b8fbaSHong Zhang PetscReal shift = info->shiftamount;
284a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected;
28517542729SShri Abhyankar
28617542729SShri Abhyankar PetscFunctionBegin;
287*421480d9SBarry Smith /* Since A is C and C is labeled as a factored matrix we need to lie to MatGetDiagonalMarkers_SeqBAIJ() to get it to compute the diagonals */
288*421480d9SBarry Smith A->factortype = MAT_FACTOR_NONE;
289*421480d9SBarry Smith PetscCall(MatGetDiagonalMarkers_SeqBAIJ(A, &diag_offset, NULL));
290*421480d9SBarry Smith A->factortype = MAT_FACTOR_ILU;
2919566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
2920164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure);
29317542729SShri Abhyankar
29417542729SShri Abhyankar for (i = 0; i < n; i++) {
29517542729SShri Abhyankar nz = bi[i + 1] - bi[i];
29617542729SShri Abhyankar ajtmp = bj + bi[i];
29717542729SShri Abhyankar for (j = 0; j < nz; j++) {
29817542729SShri Abhyankar x = rtmp + 9 * ajtmp[j];
29917542729SShri Abhyankar x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
30017542729SShri Abhyankar }
30117542729SShri Abhyankar /* load in initial (unfactored row) */
30217542729SShri Abhyankar nz = ai[i + 1] - ai[i];
30317542729SShri Abhyankar ajtmpold = aj + ai[i];
30417542729SShri Abhyankar v = aa + 9 * ai[i];
30517542729SShri Abhyankar for (j = 0; j < nz; j++) {
30617542729SShri Abhyankar x = rtmp + 9 * ajtmpold[j];
3079371c9d4SSatish Balay x[0] = v[0];
3089371c9d4SSatish Balay x[1] = v[1];
3099371c9d4SSatish Balay x[2] = v[2];
3109371c9d4SSatish Balay x[3] = v[3];
3119371c9d4SSatish Balay x[4] = v[4];
3129371c9d4SSatish Balay x[5] = v[5];
3139371c9d4SSatish Balay x[6] = v[6];
3149371c9d4SSatish Balay x[7] = v[7];
3159371c9d4SSatish Balay x[8] = v[8];
31617542729SShri Abhyankar v += 9;
31717542729SShri Abhyankar }
31817542729SShri Abhyankar row = *ajtmp++;
31917542729SShri Abhyankar while (row < i) {
32017542729SShri Abhyankar pc = rtmp + 9 * row;
3219371c9d4SSatish Balay p1 = pc[0];
3229371c9d4SSatish Balay p2 = pc[1];
3239371c9d4SSatish Balay p3 = pc[2];
3249371c9d4SSatish Balay p4 = pc[3];
3259371c9d4SSatish Balay p5 = pc[4];
3269371c9d4SSatish Balay p6 = pc[5];
3279371c9d4SSatish Balay p7 = pc[6];
3289371c9d4SSatish Balay p8 = pc[7];
3299371c9d4SSatish Balay p9 = pc[8];
3309371c9d4SSatish Balay if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
33117542729SShri Abhyankar pv = ba + 9 * diag_offset[row];
33217542729SShri Abhyankar pj = bj + diag_offset[row] + 1;
3339371c9d4SSatish Balay x1 = pv[0];
3349371c9d4SSatish Balay x2 = pv[1];
3359371c9d4SSatish Balay x3 = pv[2];
3369371c9d4SSatish Balay x4 = pv[3];
3379371c9d4SSatish Balay x5 = pv[4];
3389371c9d4SSatish Balay x6 = pv[5];
3399371c9d4SSatish Balay x7 = pv[6];
3409371c9d4SSatish Balay x8 = pv[7];
3419371c9d4SSatish Balay x9 = pv[8];
34217542729SShri Abhyankar pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
34317542729SShri Abhyankar pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
34417542729SShri Abhyankar pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
34517542729SShri Abhyankar
34617542729SShri Abhyankar pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
34717542729SShri Abhyankar pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
34817542729SShri Abhyankar pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
34917542729SShri Abhyankar
35017542729SShri Abhyankar pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
35117542729SShri Abhyankar pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
35217542729SShri Abhyankar pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
35317542729SShri Abhyankar
35417542729SShri Abhyankar nz = bi[row + 1] - diag_offset[row] - 1;
35517542729SShri Abhyankar pv += 9;
35617542729SShri Abhyankar for (j = 0; j < nz; j++) {
3579371c9d4SSatish Balay x1 = pv[0];
3589371c9d4SSatish Balay x2 = pv[1];
3599371c9d4SSatish Balay x3 = pv[2];
3609371c9d4SSatish Balay x4 = pv[3];
3619371c9d4SSatish Balay x5 = pv[4];
3629371c9d4SSatish Balay x6 = pv[5];
3639371c9d4SSatish Balay x7 = pv[6];
3649371c9d4SSatish Balay x8 = pv[7];
3659371c9d4SSatish Balay x9 = pv[8];
36617542729SShri Abhyankar x = rtmp + 9 * pj[j];
36717542729SShri Abhyankar x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
36817542729SShri Abhyankar x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
36917542729SShri Abhyankar x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
37017542729SShri Abhyankar
37117542729SShri Abhyankar x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
37217542729SShri Abhyankar x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
37317542729SShri Abhyankar x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
37417542729SShri Abhyankar
37517542729SShri Abhyankar x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
37617542729SShri Abhyankar x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
37717542729SShri Abhyankar x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
37817542729SShri Abhyankar pv += 9;
37917542729SShri Abhyankar }
3809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(54.0 * nz + 36.0));
38117542729SShri Abhyankar }
38217542729SShri Abhyankar row = *ajtmp++;
38317542729SShri Abhyankar }
38417542729SShri Abhyankar /* finished row so stick it into b->a */
38517542729SShri Abhyankar pv = ba + 9 * bi[i];
38617542729SShri Abhyankar pj = bj + bi[i];
38717542729SShri Abhyankar nz = bi[i + 1] - bi[i];
38817542729SShri Abhyankar for (j = 0; j < nz; j++) {
38917542729SShri Abhyankar x = rtmp + 9 * pj[j];
3909371c9d4SSatish Balay pv[0] = x[0];
3919371c9d4SSatish Balay pv[1] = x[1];
3929371c9d4SSatish Balay pv[2] = x[2];
3939371c9d4SSatish Balay pv[3] = x[3];
3949371c9d4SSatish Balay pv[4] = x[4];
3959371c9d4SSatish Balay pv[5] = x[5];
3969371c9d4SSatish Balay pv[6] = x[6];
3979371c9d4SSatish Balay pv[7] = x[7];
3989371c9d4SSatish Balay pv[8] = x[8];
39917542729SShri Abhyankar pv += 9;
40017542729SShri Abhyankar }
40117542729SShri Abhyankar /* invert diagonal block */
40217542729SShri Abhyankar w = ba + 9 * diag_offset[i];
4039566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
4047b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
40517542729SShri Abhyankar }
40617542729SShri Abhyankar
4079566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp));
40826fbe8dcSKarl Rupp
40906e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
41006e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
41117542729SShri Abhyankar C->assembled = PETSC_TRUE;
41226fbe8dcSKarl Rupp
4139566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
4143ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
41517542729SShri Abhyankar }
41617542729SShri Abhyankar
41717542729SShri Abhyankar /*
4184dd39f65SShri Abhyankar MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
4194dd39f65SShri Abhyankar copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
42017542729SShri Abhyankar */
MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B,Mat A,const MatFactorInfo * info)421d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
422d71ae5a4SJacob Faibussowitsch {
42317542729SShri Abhyankar Mat C = B;
42417542729SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
425bbd65245SShri Abhyankar PetscInt i, j, k, nz, nzL, row;
426bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
427bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
42817542729SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
429bbd65245SShri Abhyankar PetscInt flg;
430182b8fbaSHong Zhang PetscReal shift = info->shiftamount;
431a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected;
43217542729SShri Abhyankar
43317542729SShri Abhyankar PetscFunctionBegin;
4340164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure);
4350164db54SHong Zhang
43617542729SShri Abhyankar /* generate work space needed by the factorization */
4379566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
4389566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n));
43917542729SShri Abhyankar
44017542729SShri Abhyankar for (i = 0; i < n; i++) {
44117542729SShri Abhyankar /* zero rtmp */
44217542729SShri Abhyankar /* L part */
44317542729SShri Abhyankar nz = bi[i + 1] - bi[i];
44417542729SShri Abhyankar bjtmp = bj + bi[i];
44548a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
44617542729SShri Abhyankar
44717542729SShri Abhyankar /* U part */
448b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i + 1];
449b2b2dd24SShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1;
45048a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
451b2b2dd24SShri Abhyankar
452b2b2dd24SShri Abhyankar /* load in initial (unfactored row) */
453b2b2dd24SShri Abhyankar nz = ai[i + 1] - ai[i];
454b2b2dd24SShri Abhyankar ajtmp = aj + ai[i];
455b2b2dd24SShri Abhyankar v = aa + bs2 * ai[i];
45648a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
457b2b2dd24SShri Abhyankar
458b2b2dd24SShri Abhyankar /* elimination */
459b2b2dd24SShri Abhyankar bjtmp = bj + bi[i];
460b2b2dd24SShri Abhyankar nzL = bi[i + 1] - bi[i];
461b2b2dd24SShri Abhyankar for (k = 0; k < nzL; k++) {
462b2b2dd24SShri Abhyankar row = bjtmp[k];
463b2b2dd24SShri Abhyankar pc = rtmp + bs2 * row;
464c35f09e5SBarry Smith for (flg = 0, j = 0; j < bs2; j++) {
465c35f09e5SBarry Smith if (pc[j] != 0.0) {
466c35f09e5SBarry Smith flg = 1;
467c35f09e5SBarry Smith break;
468c35f09e5SBarry Smith }
469c35f09e5SBarry Smith }
470b2b2dd24SShri Abhyankar if (flg) {
471b2b2dd24SShri Abhyankar pv = b->a + bs2 * bdiag[row];
47296b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
4739566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
474b2b2dd24SShri Abhyankar
475b2b2dd24SShri Abhyankar pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
476b2b2dd24SShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1);
477b2b2dd24SShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
478b2b2dd24SShri Abhyankar for (j = 0; j < nz; j++) {
47996b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
480b2b2dd24SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
481b2b2dd24SShri Abhyankar v = rtmp + bs2 * pj[j];
4829566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
483b2b2dd24SShri Abhyankar pv += bs2;
484b2b2dd24SShri Abhyankar }
4859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
486b2b2dd24SShri Abhyankar }
487b2b2dd24SShri Abhyankar }
488b2b2dd24SShri Abhyankar
489b2b2dd24SShri Abhyankar /* finished row so stick it into b->a */
490b2b2dd24SShri Abhyankar /* L part */
491b2b2dd24SShri Abhyankar pv = b->a + bs2 * bi[i];
492b2b2dd24SShri Abhyankar pj = b->j + bi[i];
493b2b2dd24SShri Abhyankar nz = bi[i + 1] - bi[i];
49448a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
495b2b2dd24SShri Abhyankar
496a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */
497b2b2dd24SShri Abhyankar pv = b->a + bs2 * bdiag[i];
498b2b2dd24SShri Abhyankar pj = b->j + bdiag[i];
4999566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
5009566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
5017b6c816cSBarry Smith if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
502b2b2dd24SShri Abhyankar
503b2b2dd24SShri Abhyankar /* U part */
504b2b2dd24SShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1);
505b2b2dd24SShri Abhyankar pj = b->j + bdiag[i + 1] + 1;
506b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1;
50748a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
508b2b2dd24SShri Abhyankar }
5099566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork));
51026fbe8dcSKarl Rupp
5114dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering;
5129f5c0bcdSBarry Smith C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
5139f5c0bcdSBarry Smith C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
5144dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
515b2b2dd24SShri Abhyankar C->assembled = PETSC_TRUE;
51626fbe8dcSKarl Rupp
5179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
5183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
519b2b2dd24SShri Abhyankar }
520