1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3
MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)4 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
5 {
6 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
7 const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
8 PetscInt i, nz, idx, idt, jdx;
9 const MatScalar *aa = a->a, *v;
10 PetscScalar *x, s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7;
11 const PetscScalar *b;
12
13 PetscFunctionBegin;
14 PetscCall(VecGetArrayRead(bb, &b));
15 PetscCall(VecGetArray(xx, &x));
16 /* forward solve the lower triangular */
17 idx = 0;
18 x[0] = b[idx];
19 x[1] = b[1 + idx];
20 x[2] = b[2 + idx];
21 x[3] = b[3 + idx];
22 x[4] = b[4 + idx];
23 x[5] = b[5 + idx];
24 x[6] = b[6 + idx];
25 for (i = 1; i < n; i++) {
26 v = aa + 49 * ai[i];
27 vi = aj + ai[i];
28 nz = diag[i] - ai[i];
29 idx = 7 * i;
30 s1 = b[idx];
31 s2 = b[1 + idx];
32 s3 = b[2 + idx];
33 s4 = b[3 + idx];
34 s5 = b[4 + idx];
35 s6 = b[5 + idx];
36 s7 = b[6 + idx];
37 while (nz--) {
38 jdx = 7 * (*vi++);
39 x1 = x[jdx];
40 x2 = x[1 + jdx];
41 x3 = x[2 + jdx];
42 x4 = x[3 + jdx];
43 x5 = x[4 + jdx];
44 x6 = x[5 + jdx];
45 x7 = x[6 + jdx];
46 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
47 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
48 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
49 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
50 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
51 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
52 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
53 v += 49;
54 }
55 x[idx] = s1;
56 x[1 + idx] = s2;
57 x[2 + idx] = s3;
58 x[3 + idx] = s4;
59 x[4 + idx] = s5;
60 x[5 + idx] = s6;
61 x[6 + idx] = s7;
62 }
63 /* backward solve the upper triangular */
64 for (i = n - 1; i >= 0; i--) {
65 v = aa + 49 * diag[i] + 49;
66 vi = aj + diag[i] + 1;
67 nz = ai[i + 1] - diag[i] - 1;
68 idt = 7 * i;
69 s1 = x[idt];
70 s2 = x[1 + idt];
71 s3 = x[2 + idt];
72 s4 = x[3 + idt];
73 s5 = x[4 + idt];
74 s6 = x[5 + idt];
75 s7 = x[6 + idt];
76 while (nz--) {
77 idx = 7 * (*vi++);
78 x1 = x[idx];
79 x2 = x[1 + idx];
80 x3 = x[2 + idx];
81 x4 = x[3 + idx];
82 x5 = x[4 + idx];
83 x6 = x[5 + idx];
84 x7 = x[6 + idx];
85 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
86 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
87 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
88 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
89 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
90 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
91 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
92 v += 49;
93 }
94 v = aa + 49 * diag[i];
95 x[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
96 x[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
97 x[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
98 x[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
99 x[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
100 x[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
101 x[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
102 }
103
104 PetscCall(VecRestoreArrayRead(bb, &b));
105 PetscCall(VecRestoreArray(xx, &x));
106 PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
107 PetscFunctionReturn(PETSC_SUCCESS);
108 }
109
MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)110 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A, Vec bb, Vec xx)
111 {
112 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
113 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
114 PetscInt i, k, nz, idx, jdx, idt;
115 const PetscInt bs = A->rmap->bs, bs2 = a->bs2;
116 const MatScalar *aa = a->a, *v;
117 PetscScalar *x;
118 const PetscScalar *b;
119 PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7;
120
121 PetscFunctionBegin;
122 PetscCall(VecGetArrayRead(bb, &b));
123 PetscCall(VecGetArray(xx, &x));
124 /* forward solve the lower triangular */
125 idx = 0;
126 x[0] = b[idx];
127 x[1] = b[1 + idx];
128 x[2] = b[2 + idx];
129 x[3] = b[3 + idx];
130 x[4] = b[4 + idx];
131 x[5] = b[5 + idx];
132 x[6] = b[6 + idx];
133 for (i = 1; i < n; i++) {
134 v = aa + bs2 * ai[i];
135 vi = aj + ai[i];
136 nz = ai[i + 1] - ai[i];
137 idx = bs * i;
138 s1 = b[idx];
139 s2 = b[1 + idx];
140 s3 = b[2 + idx];
141 s4 = b[3 + idx];
142 s5 = b[4 + idx];
143 s6 = b[5 + idx];
144 s7 = b[6 + idx];
145 for (k = 0; k < nz; k++) {
146 jdx = bs * vi[k];
147 x1 = x[jdx];
148 x2 = x[1 + jdx];
149 x3 = x[2 + jdx];
150 x4 = x[3 + jdx];
151 x5 = x[4 + jdx];
152 x6 = x[5 + jdx];
153 x7 = x[6 + jdx];
154 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
155 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
156 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
157 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
158 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
159 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
160 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
161 v += bs2;
162 }
163
164 x[idx] = s1;
165 x[1 + idx] = s2;
166 x[2 + idx] = s3;
167 x[3 + idx] = s4;
168 x[4 + idx] = s5;
169 x[5 + idx] = s6;
170 x[6 + idx] = s7;
171 }
172
173 /* backward solve the upper triangular */
174 for (i = n - 1; i >= 0; i--) {
175 v = aa + bs2 * (adiag[i + 1] + 1);
176 vi = aj + adiag[i + 1] + 1;
177 nz = adiag[i] - adiag[i + 1] - 1;
178 idt = bs * i;
179 s1 = x[idt];
180 s2 = x[1 + idt];
181 s3 = x[2 + idt];
182 s4 = x[3 + idt];
183 s5 = x[4 + idt];
184 s6 = x[5 + idt];
185 s7 = x[6 + idt];
186 for (k = 0; k < nz; k++) {
187 idx = bs * vi[k];
188 x1 = x[idx];
189 x2 = x[1 + idx];
190 x3 = x[2 + idx];
191 x4 = x[3 + idx];
192 x5 = x[4 + idx];
193 x6 = x[5 + idx];
194 x7 = x[6 + idx];
195 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
196 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
197 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
198 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
199 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
200 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
201 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
202 v += bs2;
203 }
204 /* x = inv_diagonal*x */
205 x[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
206 x[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
207 x[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
208 x[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
209 x[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
210 x[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
211 x[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
212 }
213
214 PetscCall(VecRestoreArrayRead(bb, &b));
215 PetscCall(VecRestoreArray(xx, &x));
216 PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n));
217 PetscFunctionReturn(PETSC_SUCCESS);
218 }
219