xref: /petsc/src/ksp/ksp/impls/gmres/agmres/agmresorthog.c (revision 07c2e4feb6773e78bda63e3a89d5b841667f9670)
1 #include <../src/ksp/ksp/impls/gmres/agmres/agmresimpl.h>
2 /*
3   This file implements the RODDEC algorithm : its purpose is to orthogonalize a set of vectors distributed across several processes.
4   These processes are organized in a virtual ring.
5 
6   References : [1] Sidje, Roger B. Alternatives for parallel Krylov subspace basis computation. Numer. Linear Algebra Appl. 4 (1997), no. 4, 305-331
7 
8   initial author R. B. SIDJE,
9   modified : G.-A Atenekeng-Kahou, 2008
10   modified : D. NUENTSA WAKAM, 2011
11 
12 */
13 
14 /*
15 * Take the processes that own the vectors and organize them on a virtual ring.
16 */
17 static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *, PetscReal *, PetscReal *, PetscInt);
18 
KSPAGMRESRoddecInitNeighboor(KSP ksp)19 PetscErrorCode KSPAGMRESRoddecInitNeighboor(KSP ksp)
20 {
21   MPI_Comm    comm;
22   KSP_AGMRES *agmres = (KSP_AGMRES *)ksp->data;
23   PetscMPIInt First, Last, rank, size;
24 
25   PetscFunctionBegin;
26   PetscCall(PetscObjectGetComm((PetscObject)agmres->vecs[0], &comm));
27   PetscCallMPI(MPI_Comm_rank(comm, &rank));
28   PetscCallMPI(MPI_Comm_size(comm, &size));
29   PetscCallMPI(MPIU_Allreduce(&rank, &First, 1, MPI_INT, MPI_MIN, comm));
30   PetscCallMPI(MPIU_Allreduce(&rank, &Last, 1, MPI_INT, MPI_MAX, comm));
31 
32   if ((rank != Last) && (rank == 0)) {
33     agmres->Ileft  = rank - 1;
34     agmres->Iright = rank + 1;
35   } else {
36     if (rank == Last) {
37       agmres->Ileft  = rank - 1;
38       agmres->Iright = First;
39     } else {
40       {
41         agmres->Ileft  = Last;
42         agmres->Iright = rank + 1;
43       }
44     }
45   }
46   agmres->rank  = rank;
47   agmres->size  = size;
48   agmres->First = First;
49   agmres->Last  = Last;
50   PetscFunctionReturn(PETSC_SUCCESS);
51 }
52 
/*
  Build and/or apply a plane (Givens) rotation with compact single-scalar storage.

  Input/Output:
   - c, s    : on entry with make_r == 1, *c and *s hold the pair (a, b) whose second
               component is to be annihilated; on exit they hold the rotation cosine
               and sine reconstructed from *r.
   - r       : compact encoding of the rotation (one scalar instead of the (c, s) pair);
               written when make_r == 1, always read by the reconstruction step below.
   - make_r  : 1 to compute the rotation and its encoding from (a, b); 0 to only
               decode an existing *r back into (c, s).

  NOTE(review): the encoding convention (r == 1 -> (c, s) = (0, 1); |r| < 1 -> s = 2r;
  otherwise c = 2/r) appears to follow Stewart's compact-storage scheme for Givens
  rotations — the decode branch mirrors the encode branch exactly.
*/
static PetscErrorCode KSPAGMRESRoddecGivens(PetscReal *c, PetscReal *s, PetscReal *r, PetscInt make_r)
{
  PetscReal a, b, t;

  PetscFunctionBegin;
  if (make_r == 1) {
    a = *c;
    b = *s;
    if (b == 0.e0) {
      /* nothing to annihilate: identity rotation */
      *c = 1.e0;
      *s = 0.e0;
    } else {
      /* divide by the larger magnitude to avoid overflow in t*t */
      if (PetscAbsReal(b) > PetscAbsReal(a)) {
        t  = -a / b;
        *s = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *c = (*s) * t;
      } else {
        t  = -b / a;
        *c = 1.e0 / PetscSqrtReal(1.e0 + t * t);
        *s = (*c) * t;
      }
    }
    /* encode (c, s) into the single scalar *r; the branch taken is recoverable
       from |*r| alone, so the rotation can be rebuilt from *r later */
    if (*c == 0.e0) {
      *r = 1.e0;
    } else {
      if (PetscAbsReal(*s) < PetscAbsReal(*c)) {
        *r = PetscSign(*c) * (*s) / 2.e0;  /* |r| < 1 encodes the sine */
      } else {
        *r = PetscSign(*s) * 2.e0 / (*c);  /* |r| >= 1 encodes the cosine */
      }
    }
  }

  /* reconstruct (c, s) from the compact encoding *r (inverse of the block above) */
  if (*r == 1.e0) {
    *c = 0.e0;
    *s = 1.e0;
  } else {
    if (PetscAbsReal(*r) < 1.e0) {
      *s = 2.e0 * (*r);
      *c = PetscSqrtReal(1.e0 - (*s) * (*s));
    } else {
      *c = 2.e0 / (*r);
      *s = PetscSqrtReal(1.e0 - (*c) * (*c));
    }
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
100 
101 /*
102   Compute the QR factorization of the Krylov basis vectors
103   Input :
104    - the vectors are available in agmres->vecs (alias VEC_V)
105    - nvec :  the number of vectors
106   Output :
107    - agmres->Qloc : product of elementary reflectors for the QR of the (local part) of the vectors.
108    - agmres->sgn :  Sign of the rotations
109    - agmres->tloc : scalar factors of the elementary reflectors.
110 
111 */
KSPAGMRESRoddec(KSP ksp,PetscInt nvec)112 PetscErrorCode KSPAGMRESRoddec(KSP ksp, PetscInt nvec)
113 {
114   KSP_AGMRES  *agmres = (KSP_AGMRES *)ksp->data;
115   MPI_Comm     comm;
116   PetscScalar *Qloc    = agmres->Qloc;
117   PetscScalar *sgn     = agmres->sgn;
118   PetscScalar *tloc    = agmres->tloc;
119   PetscReal   *wbufptr = agmres->wbufptr;
120   PetscMPIInt  rank    = agmres->rank;
121   PetscMPIInt  First   = agmres->First;
122   PetscMPIInt  Last    = agmres->Last;
123   PetscBLASInt pas, len, bnloc, bpos;
124   PetscInt     nloc, d, i, j, k;
125   PetscInt     pos;
126   PetscReal    c, s, rho, Ajj, val, tt, old;
127   PetscScalar *col;
128   MPI_Status   status;
129   PetscBLASInt N;
130 
131   PetscFunctionBegin;
132   PetscCall(PetscBLASIntCast(MAXKSPSIZE + 1, &N));
133   PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
134   PetscCall(PetscLogEventBegin(KSP_AGMRESRoddec, ksp, 0, 0, 0));
135   PetscCall(PetscArrayzero(agmres->Rloc, N * N));
136   /* check input arguments */
137   PetscCheck(nvec >= 1, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_OUTOFRANGE, "The number of input vectors should be positive");
138   PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
139   PetscCall(PetscBLASIntCast(nloc, &bnloc));
140   PetscCheck(nvec <= nloc, PetscObjectComm((PetscObject)ksp), PETSC_ERR_ARG_WRONG, "In QR factorization, the number of local rows should be greater or equal to the number of columns");
141   pas = 1;
142   /* Copy the vectors of the basis */
143   for (j = 0; j < nvec; j++) {
144     PetscCall(VecGetArray(VEC_V(j), &col));
145     PetscCallBLAS("BLAScopy", BLAScopy_(&bnloc, col, &pas, &Qloc[j * nloc], &pas));
146     PetscCall(VecRestoreArray(VEC_V(j), &col));
147   }
148   /* Each process performs a local QR on its own block */
149   for (j = 0; j < nvec; j++) {
150     PetscCall(PetscBLASIntCast(nloc - j, &len));
151     Ajj = Qloc[j * nloc + j];
152     PetscCallBLAS("BLASnrm2", rho = -PetscSign(Ajj) * BLASnrm2_(&len, &Qloc[j * nloc + j], &pas));
153     if (rho == 0.0) tloc[j] = 0.0;
154     else {
155       tloc[j] = (Ajj - rho) / rho;
156       len     = len - 1;
157       val     = 1.0 / (Ajj - rho);
158       PetscCallBLAS("BLASscal", BLASscal_(&len, &val, &Qloc[j * nloc + j + 1], &pas));
159       Qloc[j * nloc + j] = 1.0;
160       len                = len + 1;
161       for (k = j + 1; k < nvec; k++) {
162         PetscCallBLAS("BLASdot", tt = tloc[j] * BLASdot_(&len, &Qloc[j * nloc + j], &pas, &Qloc[k * nloc + j], &pas));
163         PetscCallBLAS("BLASaxpy", BLASaxpy_(&len, &tt, &Qloc[j * nloc + j], &pas, &Qloc[k * nloc + j], &pas));
164       }
165       Qloc[j * nloc + j] = rho;
166     }
167   }
168   /* annihilate undesirable Rloc, diagonal by diagonal*/
169   for (d = 0; d < nvec; d++) {
170     PetscCall(PetscBLASIntCast(nloc - j, &len));
171     if (rank == First) {
172       PetscCallBLAS("BLAScopy", BLAScopy_(&len, &Qloc[d * nloc + d], &bnloc, &wbufptr[d], &pas));
173       PetscCallMPI(MPI_Send(&wbufptr[d], len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
174     } else {
175       PetscCallMPI(MPI_Recv(&wbufptr[d], len, MPIU_SCALAR, rank - 1, agmres->tag, comm, &status));
176       /* Elimination of Rloc(1,d)*/
177       c = wbufptr[d];
178       s = Qloc[d * nloc];
179       PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
180       /* Apply Givens Rotation*/
181       for (k = d; k < nvec; k++) {
182         old            = wbufptr[k];
183         wbufptr[k]     = c * old - s * Qloc[k * nloc];
184         Qloc[k * nloc] = s * old + c * Qloc[k * nloc];
185       }
186       Qloc[d * nloc] = rho;
187       if (rank != Last) PetscCallMPI(MPI_Send(&wbufptr[d], len, MPIU_SCALAR, rank + 1, agmres->tag, comm));
188       /* zero-out the d-th diagonal of Rloc ...*/
189       for (j = d + 1; j < nvec; j++) {
190         /* elimination of Rloc[i][j]*/
191         i = j - d;
192         c = Qloc[j * nloc + i - 1];
193         s = Qloc[j * nloc + i];
194         PetscCall(KSPAGMRESRoddecGivens(&c, &s, &rho, 1));
195         for (k = j; k < nvec; k++) {
196           old                    = Qloc[k * nloc + i - 1];
197           Qloc[k * nloc + i - 1] = c * old - s * Qloc[k * nloc + i];
198           Qloc[k * nloc + i]     = s * old + c * Qloc[k * nloc + i];
199         }
200         Qloc[j * nloc + i] = rho;
201       }
202       if (rank == Last) {
203         PetscCallBLAS("BLAScopy", BLAScopy_(&len, &wbufptr[d], &pas, RLOC(d, d), &N));
204         for (k = d + 1; k < nvec; k++) *RLOC(k, d) = 0.0;
205       }
206     }
207   }
208 
209   if (rank == Last) {
210     for (d = 0; d < nvec; d++) {
211       pos = nvec - d;
212       PetscCall(PetscBLASIntCast(pos, &bpos));
213       sgn[d] = PetscSign(*RLOC(d, d));
214       PetscCallBLAS("BLASscal", BLASscal_(&bpos, &sgn[d], RLOC(d, d), &N));
215     }
216   }
217   /* BroadCast Rloc to all other processes
218    * NWD : should not be needed
219    */
220   PetscCallMPI(MPI_Bcast(agmres->Rloc, N * N, MPIU_SCALAR, Last, comm));
221   PetscCall(PetscLogEventEnd(KSP_AGMRESRoddec, ksp, 0, 0, 0));
222   PetscFunctionReturn(PETSC_SUCCESS);
223 }
224 
225 /*
226    Computes Out <-- Q * In where Q is the orthogonal matrix from AGMRESRoddec
227    Input
228     - Qloc, sgn, tloc, nvec (see AGMRESRoddec above)
229     - In : input vector (size nvec)
230    Output :
231     - Out : PETSc vector (distributed as the basis vectors)
232 */
PetscErrorCode KSPAGMRESRodvec(KSP ksp, PetscInt nvec, PetscScalar *In, Vec Out)
{
  KSP_AGMRES  *agmres = (KSP_AGMRES *)ksp->data;
  MPI_Comm     comm;
  PetscScalar *Qloc  = agmres->Qloc;  /* compact Householder + Givens factors from KSPAGMRESRoddec */
  PetscScalar *sgn   = agmres->sgn;   /* diagonal signs applied to R in KSPAGMRESRoddec */
  PetscScalar *tloc  = agmres->tloc;  /* Householder scalar factors from KSPAGMRESRoddec */
  PetscMPIInt  rank  = agmres->rank;
  PetscMPIInt  First = agmres->First, Last = agmres->Last;
  PetscMPIInt  Iright = agmres->Iright, Ileft = agmres->Ileft;
  PetscScalar *y, *zloc;
  PetscInt     nloc, d, len, i, j;
  PetscBLASInt bnvec, pas, blen;
  PetscInt     dpt;
  PetscReal    c, s, rho, zp, zq, yd = 0.0, tt;
  MPI_Status   status;

  PetscFunctionBegin;
  PetscCall(PetscBLASIntCast(nvec, &bnvec));
  PetscCall(PetscObjectGetComm((PetscObject)ksp, &comm));
  pas = 1;
  PetscCall(VecGetLocalSize(VEC_V(0), &nloc));
  /* work on a private copy of In so the caller's buffer is untouched */
  PetscCall(PetscMalloc1(nvec, &y));
  PetscCall(PetscArraycpy(y, In, nvec));
  PetscCall(VecGetArray(Out, &zloc));

  /* undo the diagonal sign normalization applied on the last process in KSPAGMRESRoddec */
  if (rank == Last) {
    for (i = 0; i < nvec; i++) y[i] = sgn[i] * y[i];
  }
  for (i = 0; i < nloc; i++) zloc[i] = 0.0;
  /* single process: Q is the local product of reflectors only, so just seed zloc with y */
  if (agmres->size == 1) PetscCallBLAS("BLAScopy", BLAScopy_(&bnvec, y, &pas, &zloc[0], &pas));
  else {
    /* Apply the transposes of the ring Givens rotations in reverse order (d = nvec-1 .. 0).
       The rotations are decoded from their compact storage in Qloc via
       KSPAGMRESRoddecGivens(..., 0); the send/recv pattern retraces the
       annihilation pipeline of KSPAGMRESRoddec backwards (Last -> ... -> First). */
    for (d = nvec - 1; d >= 0; d--) {
      if (rank == First) {
        /* First only collects the component travelling down the ring into zloc[d] */
        PetscCallMPI(MPI_Recv(&zloc[d], 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
      } else {
        /* undo the intra-process eliminations of diagonal d (rows i-1, i of zloc) */
        for (j = nvec - 1; j >= d + 1; j--) {
          i = j - d;
          PetscCall(KSPAGMRESRoddecGivens(&c, &s, &Qloc[j * nloc + i], 0));
          zp          = zloc[i - 1];
          zq          = zloc[i];
          zloc[i - 1] = c * zp + s * zq;
          zloc[i]     = -s * zp + c * zq;
        }
        /* undo the inter-process elimination stored in Qloc[d * nloc] */
        PetscCall(KSPAGMRESRoddecGivens(&c, &s, &Qloc[d * nloc], 0));
        if (rank == Last) {
          /* Last starts the backward pipeline from y[d] */
          zp      = y[d];
          zq      = zloc[0];
          y[d]    = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&y[d], 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        } else {
          /* interior ranks rotate the incoming scalar against zloc[0] and pass it on */
          PetscCallMPI(MPI_Recv(&yd, 1, MPIU_SCALAR, Iright, agmres->tag, comm, &status));
          zp      = yd;
          zq      = zloc[0];
          yd      = c * zp + s * zq;
          zloc[0] = -s * zp + c * zq;
          PetscCallMPI(MPI_Send(&yd, 1, MPIU_SCALAR, Ileft, agmres->tag, comm));
        }
      }
    }
  }
  /* Apply the local Householder reflectors H_{nvec-1} ... H_0 (reverse of the local QR);
     the implicit leading 1 of each reflector is temporarily restored over R's diagonal */
  for (j = nvec - 1; j >= 0; j--) {
    dpt = j * nloc + j;
    if (tloc[j] != 0.0) { /* tloc[j] == 0 means H_j is the identity */
      len = nloc - j;
      PetscCall(PetscBLASIntCast(len, &blen));
      rho       = Qloc[dpt];
      Qloc[dpt] = 1.0;
      tt        = tloc[j] * (BLASdot_(&blen, &Qloc[dpt], &pas, &zloc[j], &pas));
      PetscCallBLAS("BLASaxpy", BLASaxpy_(&blen, &tt, &Qloc[dpt], &pas, &zloc[j], &pas));
      Qloc[dpt] = rho; /* put R's diagonal entry back */
    }
  }
  PetscCall(VecRestoreArray(Out, &zloc));
  PetscCall(PetscFree(y));
  PetscFunctionReturn(PETSC_SUCCESS);
}
311