#include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
/*
 This file implements the RODDEC algorithm: its purpose is to orthogonalize a set of vectors distributed across several processes.
 These processes are organized in a virtual ring.

 Reference: [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331

 Initial author: R. B. Sidje
 Modified: G.-A. Atenekeng-Kahou, 2008
 Modified: D. Nuentsa Wakam, 2011

*/

/*
 * Take the processes that own the vectors and organize them on a virtual ring.
 */
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *, PetscReal *, PetscReal *, PetscInt);

KSPAGMRESRoddecInitNeighboor(KSP ksp)19 PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
20 {
21 MPI_Comm comm;
22 KSP_AGMRES *agmres = (KSP_AGMRES *)ksp->data;
23 PetscMPIInt First, Last, rank, size;
24
25 PetscFunctionBegin;
26 PetscCall(PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm));
27 PetscCallMPI(MPI_Comm_rank(comm, &rank));
28 PetscCallMPI(MPI_Comm_size(comm, &size));
29 PetscCallMPI(MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm));
30 PetscCallMPI(MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm));
31
32 if ((rank != Last) && (rank == 0)) {
33 agmres->Ileft = rank - 1;
34 agmres->Iright = rank + 1;
35 } else {
36 if (rank == Last) {
37 agmres->Ileft = rank - 1;
38 agmres->Iright = First;
39 } else {
40 {
41 agmres->Ileft = Last;
42 agmres->Iright = rank + 1;
43 }
44 }
45 }
46 agmres->rank = rank;
47 agmres->size = size;
48 agmres->First = First;
49 agmres->Last = Last;
50 PetscFunctionReturn(PETSC_SUCCESS);
51 }
/*
  Compute (and optionally encode) a Givens rotation.

  If make_r == 1: on entry *c and *s hold the pair (a, b) whose second entry
  is to be annihilated; on exit (*c, *s) is the rotation cosine/sine and *r
  holds one scalar that encodes the rotation (drotg-style "z" encoding), so
  the rotation can be reconstructed later from *r alone.

  The second half always decodes *r back into (*c, *s); with make_r == 0 it
  reapplies a previously stored rotation.
*/
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  PetscFunctionBegin;
  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      /* nothing to annihilate: identity rotation */
      *c = 1.e0;
      *s = 0.e0;
    } else {
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        /* |b| dominates: form t = -a/b so 1 + t*t cannot overflow */
        t = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    /* encode the rotation into the single scalar *r:
         r == 1   -> (c, s) = (0, 1)
         |r| < 1  -> s = 2*r, c recovered from sqrt
         |r| >= 1 -> c = 2/r, s recovered from sqrt */
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);
      }
    }
  }

  /* decode *r into (c, s); this also runs right after encoding, so the
     returned pair is exactly what a later decode of *r will reproduce */
  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
  Compute the QR factorization of the Krylov basis vectors.
  Input:
   - the vectors are available in agmres->vecs (alias VEC_V)
   - nvec : the number of vectors
  Output:
   - agmres->Qloc : product of elementary reflectors for the QR of the (local part) of the vectors
   - agmres->sgn  : signs of the rotations
   - agmres->tloc : scalar factors of the elementary reflectors
*/
KSPAGMRESRoddec(KSP ksp,PetscInt nvec)112 PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
113 {
114 KSP_AGMRES *agmres = (KSP_AGMRES *)ksp->data;
115 MPI_Comm comm;
116 PetscScalar *Qloc = agmres->Qloc;
117 PetscScalar *sgn = agmres->sgn;
118 PetscScalar *tloc = agmres->tloc;
119 PetscReal *wbufptr = agmres->wbufptr;
120 PetscMPIInt rank = agmres->rank;
121 PetscMPIInt First = agmres->First;
122 PetscMPIInt Last = agmres->Last;
123 PetscBLASInt pas, len, bnloc, bpos;
124 PetscInt nloc, d, i, j, k;
125 PetscInt pos;
126 PetscReal c, s, rho, Ajj, val, tt, old;
127 PetscScalar *col;
128 MPI_Status status;
129 PetscBLASInt N;
130
131 PetscFunctionBegin;
132 PetscCall(PetscBLASIntCast(MAXKSPSIZE + 1, &N));
133 PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
134 PetscCall(PetscLogEventBegin(KSP_AGMRESRoddec, ksp, 0, 0, 0));
135 PetscCall(PetscArrayzero(agmres->Rloc, N * N));
136 /* check input arguments */
137 PetscCheck(nvec >= 1, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
138 PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
139 PetscCall(PetscBLASIntCast(nloc, &bnloc));
140 PetscCheck(nvec <= nloc, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater or equal to the number of columns");
141 pas = 1;
142 /* Copy the vectors of the basis */
143 for (j = 0; j < nvec; j++) {
144 PetscCall(VecGetArray(VEC_V(j), &col));
145 PetscCallBLAS("BLAScopy", BLAScopy_(&bnloc, col, &pas, &Qloc[j * nloc], &pas));
146 PetscCall(VecRestoreArray(VEC_V(j), &col));
147 }
148 /* Each process performs a local QR on its own block */
149 for (j = 0; j < nvec; j++) {
150 PetscCall(PetscBLASIntCast(nloc - j, &len));
151 Ajj = Qloc[j * nloc + j];
152 PetscCallBLAS("BLASnrm2", rho = -PetscSign(Ajj) * BLASnrm2_(&len, &Qloc[j * nloc + j], &pas));
153 if (rho == 0.0) tloc[j] = 0.0;
154 else {
155 tloc[j] = (Ajj - rho) / rho;
156 len = len - 1;
157 val = 1.0 / (Ajj - rho);
158 PetscCallBLAS("BLASscal", BLASscal_(&len, &val, &Qloc[j * nloc + j + 1], &pas));
159 Qloc[j * nloc + j] = 1.0;
160 len = len + 1;
161 for (k = j + 1; k < nvec; k++) {
162 PetscCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&len, &Qloc[j * nloc + j], &pas, &Qloc[k * nloc + j], &pas));
163 PetscCallBLAS("BLASaxpy", BLASaxpy_(&len, &tt, &Qloc[j * nloc + j], &pas, &Qloc[k * nloc + j], &pas));
164 }
165 Qloc[j * nloc + j] = rho;
166 }
167 }
168 /* annihilate undesirable Rloc, diagonal by diagonal*/
169 for (d = 0; d < nvec; d++) {
170 PetscCall(PetscBLASIntCast(nloc - j, &len));
171 if (rank == First) {
172 PetscCallBLAS("BLAScopy", BLAScopy_(&len, &Qloc[d * nloc + d], &bnloc, &wbufptr[d], &pas));
173 PetscCallMPI(MPI_Send(&wbufptr[d], len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
174 } else {
175 PetscCallMPI(MPI_Recv(&wbufptr[d], len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status));
176 /* Elimination of Rloc(1,d)*/
177 c = wbufptr[d];
178 s = Qloc[d * nloc];
179 PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
180 /* Apply Givens Rotation*/
181 for (k = d; k < nvec; k++) {
182 old = wbufptr[k];
183 wbufptr[k] = c * old - s * Qloc[k * nloc];
184 Qloc[k * nloc] = s * old + c * Qloc[k * nloc];
185 }
186 Qloc[d * nloc] = rho;
187 if (rank != Last) PetscCallMPI(MPI_Send(&wbufptr[d], len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
188 /* zero-out the d-th diagonal of Rloc ...*/
189 for (j = d + 1; j < nvec; j++) {
190 /* elimination of Rloc[i][j]*/
191 i = j - d;
192 c = Qloc[j * nloc + i - 1];
193 s = Qloc[j * nloc + i];
194 PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
195 for (k = j; k < nvec; k++) {
196 old = Qloc[k * nloc + i - 1];
197 Qloc[k * nloc + i - 1] = c * old - s * Qloc[k * nloc + i];
198 Qloc[k * nloc + i] = s * old + c * Qloc[k * nloc + i];
199 }
200 Qloc[j * nloc + i] = rho;
201 }
202 if (rank == Last) {
203 PetscCallBLAS("BLAScopy", BLAScopy_(&len, &wbufptr[d], &pas, RLOC(d, d), &N));
204 for (k = d + 1; k < nvec; k++) *RLOC(k, d) = 0.0;
205 }
206 }
207 }
208
209 if (rank == Last) {
210 for (d = 0; d < nvec; d++) {
211 pos = nvec - d;
212 PetscCall(PetscBLASIntCast(pos, &bpos));
213 sgn[d] = PetscSign(*RLOC(d, d));
214 PetscCallBLAS("BLASscal", BLASscal_(&bpos, &sgn[d], RLOC(d, d), &N));
215 }
216 }
217 /* BroadCast Rloc to all other processes
218 * NWD : should not be needed
219 */
220 PetscCallMPI(MPI_Bcast(agmres->Rloc, N * N, MPIU_SCALAR, Last, comm));
221 PetscCall(PetscLogEventEnd(KSP_AGMRESRoddec, ksp, 0, 0, 0));
222 PetscFunctionReturn(PETSC_SUCCESS);
223 }

/*
  Computes Out <-- Q * In, where Q is the orthogonal matrix built by KSPAGMRESRoddec.
  Input:
   - Qloc, sgn, tloc, nvec (see KSPAGMRESRoddec above)
   - In : input vector (of size nvec)
  Output:
   - Out : PETSc vector (distributed like the basis vectors)
*/
/* Apply Q (implicitly stored as local Householder reflectors in Qloc/tloc plus
   ring-encoded Givens rotations) to the small coefficient vector In, producing
   the distributed vector Out = Q * In. */
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES *agmres = (KSP_AGMRES *)ksp->data;
  MPI_Comm comm;
  PetscScalar *Qloc = agmres->Qloc;
  PetscScalar *sgn = agmres->sgn;   /* signs applied to R's diagonal in KSPAGMRESRoddec */
  PetscScalar *tloc = agmres->tloc; /* scalar factors of the local Householder reflectors */
  PetscMPIInt rank = agmres->rank;
  PetscMPIInt First = agmres->First, Last = agmres->Last;
  PetscMPIInt Iright = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar *y, *zloc;
  PetscInt nloc, d, len, i, j;
  PetscBLASInt bnvec, pas, blen;
  PetscInt dpt;
  PetscReal c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status status;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(nvec, &bnvec));
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  pas = 1;
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  /* work on a private copy of In; y is rotated in place below */
  PetscCall(PetscMalloc1(nvec, &y));
  PetscCall(PetscArraycpy(y, In, nvec));
  PetscCall(VecGetArray(Out, &zloc));

  if (rank == Last) {
    /* undo the sign scaling that made R's diagonal nonnegative */
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  if (agmres->size == 1) PetscCallBLAS("BLAScopy", BLAScopy_(&bnvec, y, &pas, &zloc[0], &pas));
  else {
    /* apply the ring Givens rotations in reverse order; each rotation is
       reconstructed from its encoded scalar (KSPAGMRESRoddecGivens, make_r == 0) */
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        /* First only collects the coefficient that has traveled up the ring */
        PetscCallMPI(MPI_Recv(&zloc[d], 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
      } else {
        /* undo the local triangle-annihilation rotations for diagonal d */
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          PetscCall(KSPAGMRESRoddecGivens(&c, &s, &Qloc[j * nloc + i], 0));
          zp = zloc[i - 1];
          zq = zloc[i];
          zloc[i - 1] = c * zp + s * zq;
          zloc[i] = -s * zp + c * zq;
        }
        /* undo the inter-process rotation stored at Qloc[d * nloc] */
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &Qloc[d * nloc], 0));
        if (rank == Last) {
          zp = y[d];
          zq = zloc[0];
          y[d] = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          /* forward the rotated coefficient towards First */
          PetscCallMPI(MPI_Send(&y[d], 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        } else {
          PetscCallMPI(MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
          zp = yd;
          zq = zloc[0];
          yd = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        }
      }
    }
  }
  /* apply the local Householder reflectors in reverse order */
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) { /* tloc[j] == 0 marks a trivial reflector (see KSPAGMRESRoddec) */
      len = nloc - j;
      PetscCall(PetscBLASIntCast(len, &blen));
      rho = Qloc[dpt];
      Qloc[dpt] = 1.0; /* temporarily restore the reflector's implicit leading 1 */
      tt = tloc[j] * (BLASdot_(&blen, &Qloc[dpt], &pas, &zloc[j], &pas));
      PetscCallBLAS("BLASaxpy", BLASaxpy_(&blen, &tt, &Qloc[dpt], &pas, &zloc[j], &pas));
      Qloc[dpt] = rho; /* put the diagonal of R back */
    }
  }
  PetscCall(VecRestoreArray(Out, &zloc));
  PetscCall(PetscFree(y));
  PetscFunctionReturn(PETSC_SUCCESS);
}