1 /*
2 This is included by sbaij.c to generate unsigned short and regular versions of these two functions
3 */
4
5 /* We cut-and-past below from aij.h to make a "no_function" version of PetscSparseDensePlusDot().
6 * This is necessary because the USESHORT case cannot use the inlined functions that may be employed. */
7
8 #if defined(PETSC_KERNEL_USE_UNROLL_4)
9 #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \
10 do { \
11 if (nnz > 0) { \
12 PetscInt nnz2 = nnz, rem = nnz & 0x3; \
13 switch (rem) { \
14 case 3: \
15 sum += *xv++ * r[*xi++]; \
16 case 2: \
17 sum += *xv++ * r[*xi++]; \
18 case 1: \
19 sum += *xv++ * r[*xi++]; \
20 nnz2 -= rem; \
21 } \
22 while (nnz2 > 0) { \
23 sum += xv[0] * r[xi[0]] + xv[1] * r[xi[1]] + xv[2] * r[xi[2]] + xv[3] * r[xi[3]]; \
24 xv += 4; \
25 xi += 4; \
26 nnz2 -= 4; \
27 } \
28 xv -= nnz; \
29 xi -= nnz; \
30 } \
31 } while (0)
32
33 #elif defined(PETSC_KERNEL_USE_UNROLL_2)
34 #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \
35 do { \
36 PetscInt __i, __i1, __i2; \
37 for (__i = 0; __i < nnz - 1; __i += 2) { \
38 __i1 = xi[__i]; \
39 __i2 = xi[__i + 1]; \
40 sum += (xv[__i] * r[__i1] + xv[__i + 1] * r[__i2]); \
41 } \
42 if (nnz & 0x1) sum += xv[__i] * r[xi[__i]]; \
43 } while (0)
44
45 #else
46 #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \
47 do { \
48 PetscInt __i; \
49 for (__i = 0; __i < nnz; __i++) sum += xv[__i] * r[xi[__i]]; \
50 } while (0)
51 #endif
52
53 #if defined(USESHORT)
MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz)54 PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A, Vec xx, Vec zz)
55 #else
56 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A, Vec xx, Vec zz)
57 #endif
58 {
59 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
60 const PetscScalar *x;
61 PetscScalar *z, x1, sum;
62 const MatScalar *v;
63 MatScalar vj;
64 PetscInt mbs = a->mbs, i, j, nz;
65 const PetscInt *ai = a->i;
66 #if defined(USESHORT)
67 const unsigned short *ib = a->jshort;
68 unsigned short ibt;
69 #else
70 const PetscInt *ib = a->j;
71 PetscInt ibt;
72 #endif
73 PetscInt nonzerorow = 0, jmin;
74 const int aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
75
76 PetscFunctionBegin;
77 PetscCall(VecSet(zz, 0.0));
78 PetscCall(VecGetArrayRead(xx, &x));
79 PetscCall(VecGetArray(zz, &z));
80
81 v = a->a;
82 for (i = 0; i < mbs; i++) {
83 nz = ai[i + 1] - ai[i]; /* length of i_th row of A */
84 if (!nz) continue; /* Move to the next row if the current row is empty */
85 nonzerorow++;
86 sum = 0.0;
87 jmin = 0;
88 x1 = x[i];
89 if (ib[0] == i) {
90 sum = v[0] * x1; /* diagonal term */
91 jmin++;
92 }
93 PetscPrefetchBlock(ib + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
94 PetscPrefetchBlock(v + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
95 if (aconj) {
96 for (j = jmin; j < nz; j++) {
97 ibt = ib[j];
98 vj = v[j];
99 z[ibt] += PetscConj(vj) * x1; /* (strict lower triangular part of A)*x */
100 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */
101 }
102 } else {
103 for (j = jmin; j < nz; j++) {
104 ibt = ib[j];
105 vj = v[j];
106 z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */
107 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */
108 }
109 }
110 z[i] += sum;
111 v += nz;
112 ib += nz;
113 }
114
115 PetscCall(VecRestoreArrayRead(xx, &x));
116 PetscCall(VecRestoreArray(zz, &z));
117 PetscCall(PetscLogFlops(2.0 * (2.0 * a->nz - nonzerorow) - nonzerorow));
118 PetscFunctionReturn(PETSC_SUCCESS);
119 }
120
121 #if defined(USESHORT)
MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)122 PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
123 #else
124 PetscErrorCode MatSOR_SeqSBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
125 #endif
126 {
127 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
128 const MatScalar *aa = a->a, *v, *v1, *aidiag;
129 PetscScalar *x, *t, sum;
130 const PetscScalar *b;
131 MatScalar tmp;
132 PetscInt m = a->mbs, bs = A->rmap->bs, j;
133 const PetscInt *ai = a->i;
134 #if defined(USESHORT)
135 const unsigned short *aj = a->jshort, *vj, *vj1;
136 #else
137 const PetscInt *aj = a->j, *vj, *vj1;
138 #endif
139 PetscInt nz, nz1, i;
140
141 PetscFunctionBegin;
142 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
143 PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
144
145 its = its * lits;
146 PetscCheck(its > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Relaxation requires global its %" PetscInt_FMT " and local its %" PetscInt_FMT " both positive", its, lits);
147
148 PetscCheck(bs <= 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "SSOR for block size > 1 is not yet implemented");
149
150 PetscCall(VecGetArray(xx, &x));
151 PetscCall(VecGetArrayRead(bb, &b));
152
153 if (!a->idiagvalid) {
154 if (!a->idiag) PetscCall(PetscMalloc1(m, &a->idiag));
155 for (i = 0; i < a->mbs; i++) a->idiag[i] = 1.0 / a->a[a->i[i]];
156 a->idiagvalid = PETSC_TRUE;
157 }
158
159 if (!a->sor_work) PetscCall(PetscMalloc1(m, &a->sor_work));
160 t = a->sor_work;
161
162 aidiag = a->idiag;
163
164 if (flag == SOR_APPLY_UPPER) {
165 /* apply (U + D/omega) to the vector */
166 PetscScalar d;
167 for (i = 0; i < m; i++) {
168 d = fshift + aa[ai[i]];
169 nz = ai[i + 1] - ai[i] - 1;
170 vj = aj + ai[i] + 1;
171 v = aa + ai[i] + 1;
172 sum = b[i] * d / omega;
173 #ifdef USESHORT
174 PetscSparseDensePlusDot_no_function(sum, b, v, vj, nz);
175 #else
176 PetscSparseDensePlusDot(sum, b, v, vj, nz);
177 #endif
178 x[i] = sum;
179 }
180 PetscCall(PetscLogFlops(a->nz));
181 }
182
183 if (flag & SOR_ZERO_INITIAL_GUESS) {
184 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
185 PetscCall(PetscArraycpy(t, b, m));
186
187 v = aa + 1;
188 vj = aj + 1;
189 for (i = 0; i < m; i++) {
190 nz = ai[i + 1] - ai[i] - 1;
191 tmp = -(x[i] = omega * t[i] * aidiag[i]);
192 for (j = 0; j < nz; j++) t[vj[j]] += tmp * v[j];
193 v += nz + 1;
194 vj += nz + 1;
195 }
196 PetscCall(PetscLogFlops(2.0 * a->nz));
197 }
198
199 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
200 PetscInt nz2;
201 if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)) {
202 v = aa + ai[m] - 1;
203 vj = aj + ai[m] - 1;
204 for (i = m - 1; i >= 0; i--) {
205 sum = b[i];
206 nz = ai[i + 1] - ai[i] - 1;
207 {
208 PetscInt __i;
209 for (__i = 0; __i < nz; __i++) sum -= v[-__i] * x[vj[-__i]];
210 }
211 x[i] = omega * sum * aidiag[i];
212 v -= nz + 1;
213 vj -= nz + 1;
214 }
215 PetscCall(PetscLogFlops(2.0 * a->nz));
216 } else {
217 v = aa + ai[m - 1] + 1;
218 vj = aj + ai[m - 1] + 1;
219 nz = 0;
220 for (i = m - 1; i >= 0; i--) {
221 sum = t[i];
222 nz2 = ai[i] - ai[PetscMax(i - 1, 0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */
223 PETSC_Prefetch(v - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA);
224 PETSC_Prefetch(vj - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA);
225 PetscSparseDenseMinusDot(sum, x, v, vj, nz);
226 x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i];
227 nz = nz2;
228 v -= nz + 1;
229 vj -= nz + 1;
230 }
231 PetscCall(PetscLogFlops(2.0 * a->nz));
232 }
233 }
234 its--;
235 }
236
237 while (its--) {
238 /*
239 forward sweep:
240 for i=0,...,m-1:
241 sum[i] = (b[i] - U(i,:)x)/d[i];
242 x[i] = (1-omega)x[i] + omega*sum[i];
243 b = b - x[i]*U^T(i,:);
244
245 */
246 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
247 PetscCall(PetscArraycpy(t, b, m));
248
249 for (i = 0; i < m; i++) {
250 v = aa + ai[i] + 1;
251 v1 = v;
252 vj = aj + ai[i] + 1;
253 vj1 = vj;
254 nz = ai[i + 1] - ai[i] - 1;
255 nz1 = nz;
256 sum = t[i];
257 while (nz1--) sum -= (*v1++) * x[*vj1++];
258 x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i];
259 while (nz--) t[*vj++] -= x[i] * (*v++);
260 }
261 PetscCall(PetscLogFlops(4.0 * a->nz));
262 }
263
264 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
265 /*
266 backward sweep:
267 b = b - x[i]*U^T(i,:), i=0,...,n-2
268 for i=m-1,...,0:
269 sum[i] = (b[i] - U(i,:)x)/d[i];
270 x[i] = (1-omega)x[i] + omega*sum[i];
271 */
272 /* if there was a forward sweep done above then I thing the next two for loops are not needed */
273 PetscCall(PetscArraycpy(t, b, m));
274
275 for (i = 0; i < m - 1; i++) { /* update rhs */
276 v = aa + ai[i] + 1;
277 vj = aj + ai[i] + 1;
278 nz = ai[i + 1] - ai[i] - 1;
279 while (nz--) t[*vj++] -= x[i] * (*v++);
280 }
281 PetscCall(PetscLogFlops(2.0 * (a->nz - m)));
282 for (i = m - 1; i >= 0; i--) {
283 v = aa + ai[i] + 1;
284 vj = aj + ai[i] + 1;
285 nz = ai[i + 1] - ai[i] - 1;
286 sum = t[i];
287 while (nz--) sum -= x[*vj++] * (*v++);
288 x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i];
289 }
290 PetscCall(PetscLogFlops(2.0 * (a->nz + m)));
291 }
292 }
293
294 PetscCall(VecRestoreArray(xx, &x));
295 PetscCall(VecRestoreArrayRead(bb, &b));
296 PetscFunctionReturn(PETSC_SUCCESS);
297 }
298