xref: /petsc/src/mat/impls/sbaij/seq/relax.h (revision cf1aed2ce99d23e50336629af3ca8cf096900abb)
1 
2 /*
3     This is included by sbaij.c to generate unsigned short and regular versions of these two functions
4 */
5 #undef __FUNCT__
6 #if defined(USESHORT)
7 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian_ushort"
8 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian_ushort(Mat A,Vec xx,Vec zz)
9 #else
10 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian"
11 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian(Mat A,Vec xx,Vec zz)
12 #endif
13 {
14   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
15   const PetscScalar *x;
16   PetscScalar       *z,x1,sum;
17   const MatScalar   *v;
18   MatScalar         vj;
19   PetscErrorCode    ierr;
20   PetscInt          mbs=a->mbs,i,j,nz;
21   const PetscInt    *ai=a->i;
22 #if defined(USESHORT)
23   const unsigned short *ib=a->jshort;
24   unsigned short       ibt;
25 #else
26   const PetscInt *ib=a->j;
27   PetscInt       ibt;
28 #endif
29   PetscInt nonzerorow = 0,jmin;
30 
31   PetscFunctionBegin;
32   ierr = VecSet(zz,0.0);CHKERRQ(ierr);
33   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
34   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
35 
36   v = a->a;
37   for (i=0; i<mbs; i++) {
38     nz = ai[i+1] - ai[i];    /* length of i_th row of A */
39     if (!nz) continue; /* Move to the next row if the current row is empty */
40     nonzerorow++;
41     x1   = x[i];
42     sum  = 0.0;
43     jmin = 0;
44     if (ib[0] == i) {
45       sum = v[0]*x1;           /* diagonal term */
46       jmin++;
47     }
48     for (j=jmin; j<nz; j++) {
49       ibt     = ib[j];
50       vj      = v[j];
51       sum    += vj * x[ibt]; /* (strict upper triangular part of A)*x  */
52       z[ibt] += PetscConj(v[j]) * x1;    /* (strict lower triangular part of A)*x  */
53     }
54     z[i] += sum;
55     v    +=    nz;
56     ib   += nz;
57   }
58 
59   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
60   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
61   ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr);
62   PetscFunctionReturn(0);
63 }
64 
65 #undef __FUNCT__
66 #if defined(USESHORT)
67 #define __FUNCT__ "MatMult_SeqSBAIJ_1_ushort"
68 PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz)
69 #else
70 #define __FUNCT__ "MatMult_SeqSBAIJ_1"
71 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz)
72 #endif
73 {
74   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
75   const PetscScalar *x;
76   PetscScalar       *z,x1,sum;
77   const MatScalar   *v;
78   MatScalar         vj;
79   PetscErrorCode    ierr;
80   PetscInt          mbs=a->mbs,i,j,nz;
81   const PetscInt    *ai=a->i;
82 #if defined(USESHORT)
83   const unsigned short *ib=a->jshort;
84   unsigned short       ibt;
85 #else
86   const PetscInt *ib=a->j;
87   PetscInt       ibt;
88 #endif
89   PetscInt nonzerorow=0,jmin;
90 
91   PetscFunctionBegin;
92   ierr = VecSet(zz,0.0);CHKERRQ(ierr);
93   ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr);
94   ierr = VecGetArray(zz,&z);CHKERRQ(ierr);
95 
96   v = a->a;
97   for (i=0; i<mbs; i++) {
98     nz = ai[i+1] - ai[i];          /* length of i_th row of A */
99     if (!nz) continue; /* Move to the next row if the current row is empty */
100     nonzerorow++;
101     sum  = 0.0;
102     jmin = 0;
103     x1   = x[i];
104     if (ib[0] == i) {
105       sum = v[0]*x1;                 /* diagonal term */
106       jmin++;
107     }
108     PetscPrefetchBlock(ib+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
109     PetscPrefetchBlock(v+nz,nz,0,PETSC_PREFETCH_HINT_NTA);  /* Entries for the next row */
110     for (j=jmin; j<nz; j++) {
111       ibt     = ib[j];
112       vj      = v[j];
113       z[ibt] += vj * x1;       /* (strict lower triangular part of A)*x  */
114       sum    += vj * x[ibt];   /* (strict upper triangular part of A)*x  */
115     }
116     z[i] += sum;
117     v    += nz;
118     ib   += nz;
119   }
120 
121   ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr);
122   ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr);
123   ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr);
124   PetscFunctionReturn(0);
125 }
126 
127 #undef __FUNCT__
128 #if defined(USESHORT)
129 #define __FUNCT__ "MatSOR_SeqSBAIJ_ushort"
130 PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
131 #else
132 #define __FUNCT__ "MatSOR_SeqSBAIJ"
133 PetscErrorCode MatSOR_SeqSBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx)
134 #endif
135 {
136   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
137   const MatScalar   *aa=a->a,*v,*v1,*aidiag;
138   PetscScalar       *x,*t,sum;
139   const PetscScalar *b;
140   MatScalar         tmp;
141   PetscErrorCode    ierr;
142   PetscInt          m  =a->mbs,bs=A->rmap->bs,j;
143   const PetscInt    *ai=a->i;
144 #if defined(USESHORT)
145   const unsigned short *aj=a->jshort,*vj,*vj1;
146 #else
147   const PetscInt *aj=a->j,*vj,*vj1;
148 #endif
149   PetscInt nz,nz1,i;
150 
151   PetscFunctionBegin;
152   if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat");
153 
154   its = its*lits;
155   if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits);
156 
157   if (bs > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented");
158 
159   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
160   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
161 
162   if (!a->idiagvalid) {
163     if (!a->idiag) {
164       ierr = PetscMalloc1(m,&a->idiag);CHKERRQ(ierr);
165     }
166     for (i=0; i<a->mbs; i++) a->idiag[i] = 1.0/a->a[a->i[i]];
167     a->idiagvalid = PETSC_TRUE;
168   }
169 
170   if (!a->sor_work) {
171     ierr = PetscMalloc1(m,&a->sor_work);CHKERRQ(ierr);
172   }
173   t = a->sor_work;
174 
175   aidiag = a->idiag;
176 
177   if (flag == SOR_APPLY_UPPER) {
178     /* apply (U + D/omega) to the vector */
179     PetscScalar d;
180     for (i=0; i<m; i++) {
181       d   = fshift + aa[ai[i]];
182       nz  = ai[i+1] - ai[i] - 1;
183       vj  = aj + ai[i] + 1;
184       v   = aa + ai[i] + 1;
185       sum = b[i]*d/omega;
186       PetscSparseDensePlusDot(sum,b,v,vj,nz);
187       x[i] = sum;
188     }
189     ierr = PetscLogFlops(a->nz);CHKERRQ(ierr);
190   }
191 
192   if (flag & SOR_ZERO_INITIAL_GUESS) {
193     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
194       ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr);
195 
196       v  = aa + 1;
197       vj = aj + 1;
198       for (i=0; i<m; i++) {
199         nz  = ai[i+1] - ai[i] - 1;
200         tmp = -(x[i] = omega*t[i]*aidiag[i]);
201         for (j=0; j<nz; j++) t[vj[j]] += tmp*v[j];
202         v  += nz + 1;
203         vj += nz + 1;
204       }
205       ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
206     }
207 
208     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
209       int nz2;
210       if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)) {
211 #if defined(PETSC_USE_BACKWARD_LOOP)
212         v  = aa + ai[m] - 1;
213         vj = aj + ai[m] - 1;
214         for (i=m-1; i>=0; i--) {
215           sum = b[i];
216           nz  = ai[i+1] - ai[i] - 1;
217           {PetscInt __i;for (__i=0; __i<nz; __i++) sum -= v[-__i] * x[vj[-__i]];}
218 #else
219         v  = aa + ai[m-1] + 1;
220         vj = aj + ai[m-1] + 1;
221         nz = 0;
222         for (i=m-1; i>=0; i--) {
223           sum = b[i];
224           nz2 = ai[i] - ai[PetscMax(i-1,0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */
225           PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA);
226           PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA);
227           PetscSparseDenseMinusDot(sum,x,v,vj,nz);
228           nz = nz2;
229 #endif
230           x[i] = omega*sum*aidiag[i];
231           v   -= nz + 1;
232           vj  -= nz + 1;
233         }
234         ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
235       } else {
236         v  = aa + ai[m-1] + 1;
237         vj = aj + ai[m-1] + 1;
238         nz = 0;
239         for (i=m-1; i>=0; i--) {
240           sum = t[i];
241           nz2 = ai[i] - ai[PetscMax(i-1,0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */
242           PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA);
243           PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA);
244           PetscSparseDenseMinusDot(sum,x,v,vj,nz);
245           x[i] = (1-omega)*x[i] + omega*sum*aidiag[i];
246           nz   = nz2;
247           v   -= nz + 1;
248           vj  -= nz + 1;
249         }
250         ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr);
251       }
252     }
253     its--;
254   }
255 
256   while (its--) {
257     /*
258        forward sweep:
259        for i=0,...,m-1:
260          sum[i] = (b[i] - U(i,:)x)/d[i];
261          x[i]   = (1-omega)x[i] + omega*sum[i];
262          b      = b - x[i]*U^T(i,:);
263 
264     */
265     if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) {
266       ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr);
267 
268       for (i=0; i<m; i++) {
269         v    = aa + ai[i] + 1; v1=v;
270         vj   = aj + ai[i] + 1; vj1=vj;
271         nz   = ai[i+1] - ai[i] - 1; nz1=nz;
272         sum  = t[i];
273         while (nz1--) sum -= (*v1++)*x[*vj1++];
274         x[i] = (1-omega)*x[i] + omega*sum*aidiag[i];
275         while (nz--) t[*vj++] -= x[i]*(*v++);
276       }
277       ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
278     }
279 
280     if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) {
281       /*
282        backward sweep:
283        b = b - x[i]*U^T(i,:), i=0,...,n-2
284        for i=m-1,...,0:
285          sum[i] = (b[i] - U(i,:)x)/d[i];
286          x[i]   = (1-omega)x[i] + omega*sum[i];
287       */
288       /* if there was a forward sweep done above then I thing the next two for loops are not needed */
289       ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr);
290 
291       for (i=0; i<m-1; i++) {  /* update rhs */
292         v    = aa + ai[i] + 1;
293         vj   = aj + ai[i] + 1;
294         nz   = ai[i+1] - ai[i] - 1;
295         while (nz--) t[*vj++] -= x[i]*(*v++);
296       }
297       ierr = PetscLogFlops(2.0*(a->nz - m));CHKERRQ(ierr);
298       for (i=m-1; i>=0; i--) {
299         v    = aa + ai[i] + 1;
300         vj   = aj + ai[i] + 1;
301         nz   = ai[i+1] - ai[i] - 1;
302         sum  = t[i];
303         while (nz--) sum -= x[*vj++]*(*v++);
304         x[i] =   (1-omega)*x[i] + omega*sum*aidiag[i];
305       }
306       ierr = PetscLogFlops(2.0*(a->nz + m));CHKERRQ(ierr);
307     }
308   }
309 
310   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
311   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
312   PetscFunctionReturn(0);
313 }
314