1b5b17502SBarry Smith /*
2b5b17502SBarry Smith This is included by sbaij.c to generate unsigned short and regular versions of these two functions
3b5b17502SBarry Smith */
454e8760dSRichard Tran Mills
554e8760dSRichard Tran Mills /* We cut-and-past below from aij.h to make a "no_function" version of PetscSparseDensePlusDot().
654e8760dSRichard Tran Mills * This is necessary because the USESHORT case cannot use the inlined functions that may be employed. */
754e8760dSRichard Tran Mills
854e8760dSRichard Tran Mills #if defined(PETSC_KERNEL_USE_UNROLL_4)
99371c9d4SSatish Balay #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \
10a8f51744SPierre Jolivet do { \
1154e8760dSRichard Tran Mills if (nnz > 0) { \
12a9ce9505SJunchao Zhang PetscInt nnz2 = nnz, rem = nnz & 0x3; \
13a9ce9505SJunchao Zhang switch (rem) { \
14d71ae5a4SJacob Faibussowitsch case 3: \
15d71ae5a4SJacob Faibussowitsch sum += *xv++ * r[*xi++]; \
16d71ae5a4SJacob Faibussowitsch case 2: \
17d71ae5a4SJacob Faibussowitsch sum += *xv++ * r[*xi++]; \
18d71ae5a4SJacob Faibussowitsch case 1: \
19d71ae5a4SJacob Faibussowitsch sum += *xv++ * r[*xi++]; \
20d71ae5a4SJacob Faibussowitsch nnz2 -= rem; \
21a9ce9505SJunchao Zhang } \
229371c9d4SSatish Balay while (nnz2 > 0) { \
239371c9d4SSatish Balay sum += xv[0] * r[xi[0]] + xv[1] * r[xi[1]] + xv[2] * r[xi[2]] + xv[3] * r[xi[3]]; \
249371c9d4SSatish Balay xv += 4; \
259371c9d4SSatish Balay xi += 4; \
269371c9d4SSatish Balay nnz2 -= 4; \
279371c9d4SSatish Balay } \
289371c9d4SSatish Balay xv -= nnz; \
299371c9d4SSatish Balay xi -= nnz; \
30a9ce9505SJunchao Zhang } \
31a8f51744SPierre Jolivet } while (0)
3254e8760dSRichard Tran Mills
3354e8760dSRichard Tran Mills #elif defined(PETSC_KERNEL_USE_UNROLL_2)
349371c9d4SSatish Balay #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \
35a8f51744SPierre Jolivet do { \
3654e8760dSRichard Tran Mills PetscInt __i, __i1, __i2; \
379371c9d4SSatish Balay for (__i = 0; __i < nnz - 1; __i += 2) { \
389371c9d4SSatish Balay __i1 = xi[__i]; \
399371c9d4SSatish Balay __i2 = xi[__i + 1]; \
409371c9d4SSatish Balay sum += (xv[__i] * r[__i1] + xv[__i + 1] * r[__i2]); \
419371c9d4SSatish Balay } \
429371c9d4SSatish Balay if (nnz & 0x1) sum += xv[__i] * r[xi[__i]]; \
43a8f51744SPierre Jolivet } while (0)
4454e8760dSRichard Tran Mills
4554e8760dSRichard Tran Mills #else
469371c9d4SSatish Balay #define PetscSparseDensePlusDot_no_function(sum, r, xv, xi, nnz) \
47a8f51744SPierre Jolivet do { \
4854e8760dSRichard Tran Mills PetscInt __i; \
499371c9d4SSatish Balay for (__i = 0; __i < nnz; __i++) sum += xv[__i] * r[xi[__i]]; \
50a8f51744SPierre Jolivet } while (0)
5154e8760dSRichard Tran Mills #endif
5254e8760dSRichard Tran Mills
53018dd85eSSatish Balay #if defined(USESHORT)
MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz)54018dd85eSSatish Balay PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A, Vec xx, Vec zz)
55018dd85eSSatish Balay #else
56018dd85eSSatish Balay PetscErrorCode MatMult_SeqSBAIJ_1(Mat A, Vec xx, Vec zz)
57018dd85eSSatish Balay #endif
58b5b17502SBarry Smith {
59b5b17502SBarry Smith Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
60b5b17502SBarry Smith const PetscScalar *x;
61b5b17502SBarry Smith PetscScalar *z, x1, sum;
62b5b17502SBarry Smith const MatScalar *v;
63b5b17502SBarry Smith MatScalar vj;
64b5b17502SBarry Smith PetscInt mbs = a->mbs, i, j, nz;
65b5b17502SBarry Smith const PetscInt *ai = a->i;
66b5b17502SBarry Smith #if defined(USESHORT)
67b5b17502SBarry Smith const unsigned short *ib = a->jshort;
68b5b17502SBarry Smith unsigned short ibt;
69b5b17502SBarry Smith #else
70b5b17502SBarry Smith const PetscInt *ib = a->j;
71b5b17502SBarry Smith PetscInt ibt;
72b5b17502SBarry Smith #endif
738dca527aSShri Abhyankar PetscInt nonzerorow = 0, jmin;
74*fc2fb351SPierre Jolivet const int aconj = PetscDefined(USE_COMPLEX) && A->hermitian == PETSC_BOOL3_TRUE ? 1 : 0;
75b5b17502SBarry Smith
76b5b17502SBarry Smith PetscFunctionBegin;
779566063dSJacob Faibussowitsch PetscCall(VecSet(zz, 0.0));
789566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(xx, &x));
799566063dSJacob Faibussowitsch PetscCall(VecGetArray(zz, &z));
80b5b17502SBarry Smith
81b5b17502SBarry Smith v = a->a;
82b5b17502SBarry Smith for (i = 0; i < mbs; i++) {
83b5b17502SBarry Smith nz = ai[i + 1] - ai[i]; /* length of i_th row of A */
8403d09022SShri Abhyankar if (!nz) continue; /* Move to the next row if the current row is empty */
8503d09022SShri Abhyankar nonzerorow++;
868dca527aSShri Abhyankar sum = 0.0;
878dca527aSShri Abhyankar jmin = 0;
88b5b17502SBarry Smith x1 = x[i];
898dca527aSShri Abhyankar if (ib[0] == i) {
90b5b17502SBarry Smith sum = v[0] * x1; /* diagonal term */
918dca527aSShri Abhyankar jmin++;
928dca527aSShri Abhyankar }
93444d8c10SJed Brown PetscPrefetchBlock(ib + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
94444d8c10SJed Brown PetscPrefetchBlock(v + nz, nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
95eb1ec7c1SStefano Zampini if (aconj) {
96eb1ec7c1SStefano Zampini for (j = jmin; j < nz; j++) {
97eb1ec7c1SStefano Zampini ibt = ib[j];
98eb1ec7c1SStefano Zampini vj = v[j];
99eb1ec7c1SStefano Zampini z[ibt] += PetscConj(vj) * x1; /* (strict lower triangular part of A)*x */
100eb1ec7c1SStefano Zampini sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */
101eb1ec7c1SStefano Zampini }
102eb1ec7c1SStefano Zampini } else {
1038dca527aSShri Abhyankar for (j = jmin; j < nz; j++) {
104b5b17502SBarry Smith ibt = ib[j];
105b5b17502SBarry Smith vj = v[j];
106b5b17502SBarry Smith z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */
107b5b17502SBarry Smith sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */
108b5b17502SBarry Smith }
109eb1ec7c1SStefano Zampini }
110b5b17502SBarry Smith z[i] += sum;
111b5b17502SBarry Smith v += nz;
112b5b17502SBarry Smith ib += nz;
113b5b17502SBarry Smith }
114b5b17502SBarry Smith
1159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(xx, &x));
1169566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(zz, &z));
1179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (2.0 * a->nz - nonzerorow) - nonzerorow));
1183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
119b5b17502SBarry Smith }
120b5b17502SBarry Smith
121018dd85eSSatish Balay #if defined(USESHORT)
MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)122018dd85eSSatish Balay PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
123018dd85eSSatish Balay #else
124018dd85eSSatish Balay PetscErrorCode MatSOR_SeqSBAIJ(Mat A, Vec bb, PetscReal omega, MatSORType flag, PetscReal fshift, PetscInt its, PetscInt lits, Vec xx)
125018dd85eSSatish Balay #endif
126b5b17502SBarry Smith {
127b5b17502SBarry Smith Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
128b5b17502SBarry Smith const MatScalar *aa = a->a, *v, *v1, *aidiag;
129b5b17502SBarry Smith PetscScalar *x, *t, sum;
130b5b17502SBarry Smith const PetscScalar *b;
131b5b17502SBarry Smith MatScalar tmp;
132b5b17502SBarry Smith PetscInt m = a->mbs, bs = A->rmap->bs, j;
133b5b17502SBarry Smith const PetscInt *ai = a->i;
134b5b17502SBarry Smith #if defined(USESHORT)
135b5b17502SBarry Smith const unsigned short *aj = a->jshort, *vj, *vj1;
136b5b17502SBarry Smith #else
137b5b17502SBarry Smith const PetscInt *aj = a->j, *vj, *vj1;
138b5b17502SBarry Smith #endif
139b5b17502SBarry Smith PetscInt nz, nz1, i;
140b5b17502SBarry Smith
141b5b17502SBarry Smith PetscFunctionBegin;
142422a814eSBarry Smith if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */
143aed4548fSBarry Smith PetscCheck(!(flag & SOR_EISENSTAT), PETSC_COMM_SELF, PETSC_ERR_SUP, "No support yet for Eisenstat");
144b5b17502SBarry Smith
145b5b17502SBarry Smith its = its * lits;
14608401ef6SPierre Jolivet PetscCheck(its > 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Relaxation requires global its %" PetscInt_FMT " and local its %" PetscInt_FMT " both positive", its, lits);
147b5b17502SBarry Smith
14808401ef6SPierre Jolivet PetscCheck(bs <= 1, PETSC_COMM_SELF, PETSC_ERR_SUP, "SSOR for block size > 1 is not yet implemented");
149b5b17502SBarry Smith
1509566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x));
1519566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b));
152b5b17502SBarry Smith
153b5b17502SBarry Smith if (!a->idiagvalid) {
15448a46eb9SPierre Jolivet if (!a->idiag) PetscCall(PetscMalloc1(m, &a->idiag));
155b5b17502SBarry Smith for (i = 0; i < a->mbs; i++) a->idiag[i] = 1.0 / a->a[a->i[i]];
156b5b17502SBarry Smith a->idiagvalid = PETSC_TRUE;
157b5b17502SBarry Smith }
158b5b17502SBarry Smith
15948a46eb9SPierre Jolivet if (!a->sor_work) PetscCall(PetscMalloc1(m, &a->sor_work));
16041f059aeSBarry Smith t = a->sor_work;
161b5b17502SBarry Smith
162b5b17502SBarry Smith aidiag = a->idiag;
163b5b17502SBarry Smith
164b5b17502SBarry Smith if (flag == SOR_APPLY_UPPER) {
165b5b17502SBarry Smith /* apply (U + D/omega) to the vector */
166b5b17502SBarry Smith PetscScalar d;
167b5b17502SBarry Smith for (i = 0; i < m; i++) {
168b5b17502SBarry Smith d = fshift + aa[ai[i]];
169b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1;
170b5b17502SBarry Smith vj = aj + ai[i] + 1;
171b5b17502SBarry Smith v = aa + ai[i] + 1;
172b5b17502SBarry Smith sum = b[i] * d / omega;
17354e8760dSRichard Tran Mills #ifdef USESHORT
17454e8760dSRichard Tran Mills PetscSparseDensePlusDot_no_function(sum, b, v, vj, nz);
17554e8760dSRichard Tran Mills #else
176b5b17502SBarry Smith PetscSparseDensePlusDot(sum, b, v, vj, nz);
17754e8760dSRichard Tran Mills #endif
178b5b17502SBarry Smith x[i] = sum;
179b5b17502SBarry Smith }
1809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(a->nz));
181b5b17502SBarry Smith }
182b5b17502SBarry Smith
183b5b17502SBarry Smith if (flag & SOR_ZERO_INITIAL_GUESS) {
184b5b17502SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
1859566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b, m));
186b5b17502SBarry Smith
187b5b17502SBarry Smith v = aa + 1;
188b5b17502SBarry Smith vj = aj + 1;
189b5b17502SBarry Smith for (i = 0; i < m; i++) {
190b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1;
191b5b17502SBarry Smith tmp = -(x[i] = omega * t[i] * aidiag[i]);
19226fbe8dcSKarl Rupp for (j = 0; j < nz; j++) t[vj[j]] += tmp * v[j];
193b5b17502SBarry Smith v += nz + 1;
194b5b17502SBarry Smith vj += nz + 1;
195b5b17502SBarry Smith }
1969566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz));
197b5b17502SBarry Smith }
198b5b17502SBarry Smith
199b5b17502SBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
2006497c311SBarry Smith PetscInt nz2;
201b5b17502SBarry Smith if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)) {
202b5b17502SBarry Smith v = aa + ai[m] - 1;
203b5b17502SBarry Smith vj = aj + ai[m] - 1;
204b5b17502SBarry Smith for (i = m - 1; i >= 0; i--) {
205b5b17502SBarry Smith sum = b[i];
206b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1;
2079371c9d4SSatish Balay {
2089371c9d4SSatish Balay PetscInt __i;
2099371c9d4SSatish Balay for (__i = 0; __i < nz; __i++) sum -= v[-__i] * x[vj[-__i]];
2109371c9d4SSatish Balay }
211b5b17502SBarry Smith x[i] = omega * sum * aidiag[i];
212b5b17502SBarry Smith v -= nz + 1;
213b5b17502SBarry Smith vj -= nz + 1;
214b5b17502SBarry Smith }
2159566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz));
216b5b17502SBarry Smith } else {
217b5b17502SBarry Smith v = aa + ai[m - 1] + 1;
218b5b17502SBarry Smith vj = aj + ai[m - 1] + 1;
219b5b17502SBarry Smith nz = 0;
220b5b17502SBarry Smith for (i = m - 1; i >= 0; i--) {
221b5b17502SBarry Smith sum = t[i];
2226c11f900SJed Brown nz2 = ai[i] - ai[PetscMax(i - 1, 0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */
22350d8bf02SJed Brown PETSC_Prefetch(v - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA);
22450d8bf02SJed Brown PETSC_Prefetch(vj - nz2 - 1, 0, PETSC_PREFETCH_HINT_NTA);
225b5b17502SBarry Smith PetscSparseDenseMinusDot(sum, x, v, vj, nz);
226b5b17502SBarry Smith x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i];
227b5b17502SBarry Smith nz = nz2;
228b5b17502SBarry Smith v -= nz + 1;
229b5b17502SBarry Smith vj -= nz + 1;
230b5b17502SBarry Smith }
2319566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz));
232b5b17502SBarry Smith }
233b5b17502SBarry Smith }
234b5b17502SBarry Smith its--;
235b5b17502SBarry Smith }
236b5b17502SBarry Smith
237b5b17502SBarry Smith while (its--) {
238b5b17502SBarry Smith /*
239b5b17502SBarry Smith forward sweep:
240b5b17502SBarry Smith for i=0,...,m-1:
241b5b17502SBarry Smith sum[i] = (b[i] - U(i,:)x)/d[i];
242b5b17502SBarry Smith x[i] = (1-omega)x[i] + omega*sum[i];
243b5b17502SBarry Smith b = b - x[i]*U^T(i,:);
244b5b17502SBarry Smith
245b5b17502SBarry Smith */
246b5b17502SBarry Smith if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
2479566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b, m));
248b5b17502SBarry Smith
249b5b17502SBarry Smith for (i = 0; i < m; i++) {
2509371c9d4SSatish Balay v = aa + ai[i] + 1;
2519371c9d4SSatish Balay v1 = v;
2529371c9d4SSatish Balay vj = aj + ai[i] + 1;
2539371c9d4SSatish Balay vj1 = vj;
2549371c9d4SSatish Balay nz = ai[i + 1] - ai[i] - 1;
2559371c9d4SSatish Balay nz1 = nz;
256b5b17502SBarry Smith sum = t[i];
257b5b17502SBarry Smith while (nz1--) sum -= (*v1++) * x[*vj1++];
258b5b17502SBarry Smith x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i];
259b5b17502SBarry Smith while (nz--) t[*vj++] -= x[i] * (*v++);
260b5b17502SBarry Smith }
2619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(4.0 * a->nz));
262b5b17502SBarry Smith }
263b5b17502SBarry Smith
264b5b17502SBarry Smith if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
265b5b17502SBarry Smith /*
266b5b17502SBarry Smith backward sweep:
267b5b17502SBarry Smith b = b - x[i]*U^T(i,:), i=0,...,n-2
268b5b17502SBarry Smith for i=m-1,...,0:
269b5b17502SBarry Smith sum[i] = (b[i] - U(i,:)x)/d[i];
270b5b17502SBarry Smith x[i] = (1-omega)x[i] + omega*sum[i];
271b5b17502SBarry Smith */
272b5b17502SBarry Smith /* if there was a forward sweep done above then I thing the next two for loops are not needed */
2739566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b, m));
274b5b17502SBarry Smith
275b5b17502SBarry Smith for (i = 0; i < m - 1; i++) { /* update rhs */
276b5b17502SBarry Smith v = aa + ai[i] + 1;
277b5b17502SBarry Smith vj = aj + ai[i] + 1;
278b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1;
279b5b17502SBarry Smith while (nz--) t[*vj++] -= x[i] * (*v++);
280b5b17502SBarry Smith }
2819566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz - m)));
282b5b17502SBarry Smith for (i = m - 1; i >= 0; i--) {
283b5b17502SBarry Smith v = aa + ai[i] + 1;
284b5b17502SBarry Smith vj = aj + ai[i] + 1;
285b5b17502SBarry Smith nz = ai[i + 1] - ai[i] - 1;
286b5b17502SBarry Smith sum = t[i];
287b5b17502SBarry Smith while (nz--) sum -= x[*vj++] * (*v++);
288b5b17502SBarry Smith x[i] = (1 - omega) * x[i] + omega * sum * aidiag[i];
289b5b17502SBarry Smith }
2909566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->nz + m)));
291b5b17502SBarry Smith }
292b5b17502SBarry Smith }
293b5b17502SBarry Smith
2949566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x));
2959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b));
2963ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS);
297b5b17502SBarry Smith }
298