1 #include <../src/mat/impls/baij/seq/baij.h>
2 #include <petsc/private/kernels/blockinvert.h>
3
4 #if defined(PETSC_HAVE_XMMINTRIN_H)
5 #include <xmmintrin.h>
6 #endif
7
8 /*
9 Special case where the matrix was ILU(0) factored in the natural
10 ordering. This eliminates the need for the column and row permutation.
11 */
MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)12 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
13 {
14 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
15 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *diag = a->diag;
16 const MatScalar *aa = a->a, *v;
17 PetscScalar *x, s1, s2, x1, x2;
18 const PetscScalar *b;
19 PetscInt jdx, idt, idx, nz, i;
20
21 PetscFunctionBegin;
22 PetscCall(VecGetArrayRead(bb, &b));
23 PetscCall(VecGetArray(xx, &x));
24
25 /* forward solve the lower triangular */
26 idx = 0;
27 x[0] = b[0];
28 x[1] = b[1];
29 for (i = 1; i < n; i++) {
30 v = aa + 4 * ai[i];
31 vi = aj + ai[i];
32 nz = diag[i] - ai[i];
33 idx += 2;
34 s1 = b[idx];
35 s2 = b[1 + idx];
36 while (nz--) {
37 jdx = 2 * (*vi++);
38 x1 = x[jdx];
39 x2 = x[1 + jdx];
40 s1 -= v[0] * x1 + v[2] * x2;
41 s2 -= v[1] * x1 + v[3] * x2;
42 v += 4;
43 }
44 x[idx] = s1;
45 x[1 + idx] = s2;
46 }
47 /* backward solve the upper triangular */
48 for (i = n - 1; i >= 0; i--) {
49 v = aa + 4 * diag[i] + 4;
50 vi = aj + diag[i] + 1;
51 nz = ai[i + 1] - diag[i] - 1;
52 idt = 2 * i;
53 s1 = x[idt];
54 s2 = x[1 + idt];
55 while (nz--) {
56 idx = 2 * (*vi++);
57 x1 = x[idx];
58 x2 = x[1 + idx];
59 s1 -= v[0] * x1 + v[2] * x2;
60 s2 -= v[1] * x1 + v[3] * x2;
61 v += 4;
62 }
63 v = aa + 4 * diag[i];
64 x[idt] = v[0] * s1 + v[2] * s2;
65 x[1 + idt] = v[1] * s1 + v[3] * s2;
66 }
67
68 PetscCall(VecRestoreArrayRead(bb, &b));
69 PetscCall(VecRestoreArray(xx, &x));
70 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
71 PetscFunctionReturn(PETSC_SUCCESS);
72 }
73
MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)74 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx)
75 {
76 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
77 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
78 PetscInt i, k, nz, idx, idt, jdx;
79 const MatScalar *aa = a->a, *v;
80 PetscScalar *x, s1, s2, x1, x2;
81 const PetscScalar *b;
82
83 PetscFunctionBegin;
84 PetscCall(VecGetArrayRead(bb, &b));
85 PetscCall(VecGetArray(xx, &x));
86 /* forward solve the lower triangular */
87 idx = 0;
88 x[0] = b[idx];
89 x[1] = b[1 + idx];
90 for (i = 1; i < n; i++) {
91 v = aa + 4 * ai[i];
92 vi = aj + ai[i];
93 nz = ai[i + 1] - ai[i];
94 idx = 2 * i;
95 s1 = b[idx];
96 s2 = b[1 + idx];
97 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
98 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
99 for (k = 0; k < nz; k++) {
100 jdx = 2 * vi[k];
101 x1 = x[jdx];
102 x2 = x[1 + jdx];
103 s1 -= v[0] * x1 + v[2] * x2;
104 s2 -= v[1] * x1 + v[3] * x2;
105 v += 4;
106 }
107 x[idx] = s1;
108 x[1 + idx] = s2;
109 }
110
111 /* backward solve the upper triangular */
112 for (i = n - 1; i >= 0; i--) {
113 v = aa + 4 * (adiag[i + 1] + 1);
114 vi = aj + adiag[i + 1] + 1;
115 nz = adiag[i] - adiag[i + 1] - 1;
116 idt = 2 * i;
117 s1 = x[idt];
118 s2 = x[1 + idt];
119 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
120 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
121 for (k = 0; k < nz; k++) {
122 idx = 2 * vi[k];
123 x1 = x[idx];
124 x2 = x[1 + idx];
125 s1 -= v[0] * x1 + v[2] * x2;
126 s2 -= v[1] * x1 + v[3] * x2;
127 v += 4;
128 }
129 /* x = inv_diagonal*x */
130 x[idt] = v[0] * s1 + v[2] * s2;
131 x[1 + idt] = v[1] * s1 + v[3] * s2;
132 }
133
134 PetscCall(VecRestoreArrayRead(bb, &b));
135 PetscCall(VecRestoreArray(xx, &x));
136 PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
137 PetscFunctionReturn(PETSC_SUCCESS);
138 }
139
MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)140 PetscErrorCode MatForwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx)
141 {
142 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
143 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j;
144 PetscInt i, k, nz, idx, jdx;
145 const MatScalar *aa = a->a, *v;
146 PetscScalar *x, s1, s2, x1, x2;
147 const PetscScalar *b;
148
149 PetscFunctionBegin;
150 PetscCall(VecGetArrayRead(bb, &b));
151 PetscCall(VecGetArray(xx, &x));
152 /* forward solve the lower triangular */
153 idx = 0;
154 x[0] = b[idx];
155 x[1] = b[1 + idx];
156 for (i = 1; i < n; i++) {
157 v = aa + 4 * ai[i];
158 vi = aj + ai[i];
159 nz = ai[i + 1] - ai[i];
160 idx = 2 * i;
161 s1 = b[idx];
162 s2 = b[1 + idx];
163 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
164 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
165 for (k = 0; k < nz; k++) {
166 jdx = 2 * vi[k];
167 x1 = x[jdx];
168 x2 = x[1 + jdx];
169 s1 -= v[0] * x1 + v[2] * x2;
170 s2 -= v[1] * x1 + v[3] * x2;
171 v += 4;
172 }
173 x[idx] = s1;
174 x[1 + idx] = s2;
175 }
176
177 PetscCall(VecRestoreArrayRead(bb, &b));
178 PetscCall(VecRestoreArray(xx, &x));
179 PetscCall(PetscLogFlops(4.0 * (a->nz) - A->cmap->n));
180 PetscFunctionReturn(PETSC_SUCCESS);
181 }
182
MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)183 PetscErrorCode MatBackwardSolve_SeqBAIJ_2_NaturalOrdering(Mat A, Vec bb, Vec xx)
184 {
185 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
186 const PetscInt n = a->mbs, *vi, *aj = a->j, *adiag = a->diag;
187 PetscInt i, k, nz, idx, idt;
188 const MatScalar *aa = a->a, *v;
189 PetscScalar *x, s1, s2, x1, x2;
190 const PetscScalar *b;
191
192 PetscFunctionBegin;
193 PetscCall(VecGetArrayRead(bb, &b));
194 PetscCall(VecGetArray(xx, &x));
195
196 /* backward solve the upper triangular */
197 for (i = n - 1; i >= 0; i--) {
198 v = aa + 4 * (adiag[i + 1] + 1);
199 vi = aj + adiag[i + 1] + 1;
200 nz = adiag[i] - adiag[i + 1] - 1;
201 idt = 2 * i;
202 s1 = b[idt];
203 s2 = b[1 + idt];
204 PetscPrefetchBlock(vi + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);
205 PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA);
206 for (k = 0; k < nz; k++) {
207 idx = 2 * vi[k];
208 x1 = x[idx];
209 x2 = x[1 + idx];
210 s1 -= v[0] * x1 + v[2] * x2;
211 s2 -= v[1] * x1 + v[3] * x2;
212 v += 4;
213 }
214 /* x = inv_diagonal*x */
215 x[idt] = v[0] * s1 + v[2] * s2;
216 x[1 + idt] = v[1] * s1 + v[3] * s2;
217 }
218
219 PetscCall(VecRestoreArrayRead(bb, &b));
220 PetscCall(VecRestoreArray(xx, &x));
221 PetscCall(PetscLogFlops(4.0 * a->nz - A->cmap->n));
222 PetscFunctionReturn(PETSC_SUCCESS);
223 }
224