1 2 /* 3 This is included by sbaij.c to generate unsigned short and regular versions of these two functions 4 */ 5 6 /* We cut-and-past below from aij.h to make a "no_function" version of PetscSparseDensePlusDot(). 7 * This is necessary because the USESHORT case cannot use the inlined functions that may be employed. */ 8 9 #if defined(PETSC_KERNEL_USE_UNROLL_4) 10 #define PetscSparseDensePlusDot_no_function(sum,r,xv,xi,nnz) { \ 11 if (nnz > 0) { \ 12 PetscInt nnz2=nnz,rem=nnz&0x3; \ 13 switch (rem) { \ 14 case 3: sum += *xv++ *r[*xi++]; \ 15 case 2: sum += *xv++ *r[*xi++]; \ 16 case 1: sum += *xv++ *r[*xi++]; \ 17 nnz2 -= rem;} \ 18 while (nnz2 > 0) { \ 19 sum += xv[0] * r[xi[0]] + xv[1] * r[xi[1]] + \ 20 xv[2] * r[xi[2]] + xv[3] * r[xi[3]]; \ 21 xv += 4; xi += 4; nnz2 -= 4; \ 22 } \ 23 xv -= nnz; xi -= nnz; \ 24 } \ 25 } 26 27 #elif defined(PETSC_KERNEL_USE_UNROLL_2) 28 #define PetscSparseDensePlusDot_no_function(sum,r,xv,xi,nnz) { \ 29 PetscInt __i,__i1,__i2; \ 30 for (__i=0; __i<nnz-1; __i+=2) {__i1 = xi[__i]; __i2=xi[__i+1]; \ 31 sum += (xv[__i]*r[__i1] + xv[__i+1]*r[__i2]);} \ 32 if (nnz & 0x1) sum += xv[__i] * r[xi[__i]];} 33 34 #else 35 #define PetscSparseDensePlusDot_no_function(sum,r,xv,xi,nnz) { \ 36 PetscInt __i; \ 37 for (__i=0; __i<nnz; __i++) sum += xv[__i] * r[xi[__i]];} 38 #endif 39 40 41 #if defined(USESHORT) 42 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian_ushort(Mat A,Vec xx,Vec zz) 43 #else 44 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian(Mat A,Vec xx,Vec zz) 45 #endif 46 { 47 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 48 const PetscScalar *x; 49 PetscScalar *z,x1,sum; 50 const MatScalar *v; 51 MatScalar vj; 52 PetscErrorCode ierr; 53 PetscInt mbs=a->mbs,i,j,nz; 54 const PetscInt *ai=a->i; 55 #if defined(USESHORT) 56 const unsigned short *ib=a->jshort; 57 unsigned short ibt; 58 #else 59 const PetscInt *ib=a->j; 60 PetscInt ibt; 61 #endif 62 PetscInt nonzerorow = 0,jmin; 63 64 PetscFunctionBegin; 65 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 66 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 67 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 68 69 v = a->a; 70 for (i=0; i<mbs; i++) { 71 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 72 if (!nz) continue; /* Move to the next row if the current row is empty */ 73 nonzerorow++; 74 x1 = x[i]; 75 sum = 0.0; 76 jmin = 0; 77 if (ib[0] == i) { 78 sum = v[0]*x1; /* diagonal term */ 79 jmin++; 80 } 81 for (j=jmin; j<nz; j++) { 82 ibt = ib[j]; 83 vj = v[j]; 84 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 85 z[ibt] += PetscConj(v[j]) * x1; /* (strict lower triangular part of A)*x */ 86 } 87 z[i] += sum; 88 v += nz; 89 ib += nz; 90 } 91 92 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 93 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 94 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 95 PetscFunctionReturn(0); 96 } 97 98 #if defined(USESHORT) 99 PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz) 100 #else 101 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 102 #endif 103 { 104 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 105 const PetscScalar *x; 106 PetscScalar *z,x1,sum; 107 const MatScalar *v; 108 MatScalar vj; 109 PetscErrorCode ierr; 110 PetscInt mbs=a->mbs,i,j,nz; 111 const PetscInt *ai=a->i; 112 #if defined(USESHORT) 113 const unsigned short *ib=a->jshort; 114 unsigned short ibt; 115 #else 116 const PetscInt *ib=a->j; 117 PetscInt ibt; 118 #endif 119 PetscInt nonzerorow=0,jmin; 120 121 PetscFunctionBegin; 122 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 123 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 124 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 125 126 v = a->a; 127 for (i=0; i<mbs; i++) { 128 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 129 if (!nz) continue; /* Move to the next row if the current row is empty */ 130 nonzerorow++; 131 sum = 0.0; 132 jmin = 0; 133 x1 = x[i]; 134 if (ib[0] == i) { 135 sum = v[0]*x1; /* diagonal term */ 136 jmin++; 137 } 138 PetscPrefetchBlock(ib+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 139 PetscPrefetchBlock(v+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 140 for (j=jmin; j<nz; j++) { 141 ibt = ib[j]; 142 vj = v[j]; 143 z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */ 144 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 145 } 146 z[i] += sum; 147 v += nz; 148 ib += nz; 149 } 150 151 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 152 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 153 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 154 PetscFunctionReturn(0); 155 } 156 157 #if defined(USESHORT) 158 PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 159 #else 160 PetscErrorCode MatSOR_SeqSBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 161 #endif 162 { 163 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 164 const MatScalar *aa=a->a,*v,*v1,*aidiag; 165 PetscScalar *x,*t,sum; 166 const PetscScalar *b; 167 MatScalar tmp; 168 PetscErrorCode ierr; 169 PetscInt m =a->mbs,bs=A->rmap->bs,j; 170 const PetscInt *ai=a->i; 171 #if defined(USESHORT) 172 const unsigned short *aj=a->jshort,*vj,*vj1; 173 #else 174 const PetscInt *aj=a->j,*vj,*vj1; 175 #endif 176 PetscInt nz,nz1,i; 177 178 PetscFunctionBegin; 179 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 180 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 181 182 its = its*lits; 183 if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits); 184 185 if (bs > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented"); 186 187 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 188 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 189 190 if (!a->idiagvalid) { 191 if (!a->idiag) { 192 ierr = PetscMalloc1(m,&a->idiag);CHKERRQ(ierr); 193 } 194 for (i=0; i<a->mbs; i++) a->idiag[i] = 1.0/a->a[a->i[i]]; 195 a->idiagvalid = PETSC_TRUE; 196 } 197 198 if (!a->sor_work) { 199 ierr = PetscMalloc1(m,&a->sor_work);CHKERRQ(ierr); 200 } 201 t = a->sor_work; 202 203 aidiag = a->idiag; 204 205 if (flag == SOR_APPLY_UPPER) { 206 /* apply (U + D/omega) to the vector */ 207 PetscScalar d; 208 for (i=0; i<m; i++) { 209 d = fshift + aa[ai[i]]; 210 nz = ai[i+1] - ai[i] - 1; 211 vj = aj + ai[i] + 1; 212 v = aa + ai[i] + 1; 213 sum = b[i]*d/omega; 214 #ifdef USESHORT 215 PetscSparseDensePlusDot_no_function(sum,b,v,vj,nz); 216 #else 217 PetscSparseDensePlusDot(sum,b,v,vj,nz); 218 #endif 219 x[i] = sum; 220 } 221 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 222 } 223 224 if (flag & SOR_ZERO_INITIAL_GUESS) { 225 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 226 ierr = PetscArraycpy(t,b,m);CHKERRQ(ierr); 227 228 v = aa + 1; 229 vj = aj + 1; 230 for (i=0; i<m; i++) { 231 nz = ai[i+1] - ai[i] - 1; 232 tmp = -(x[i] = omega*t[i]*aidiag[i]); 233 for (j=0; j<nz; j++) t[vj[j]] += tmp*v[j]; 234 v += nz + 1; 235 vj += nz + 1; 236 } 237 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 238 } 239 240 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 241 int nz2; 242 if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)) { 243 #if defined(PETSC_USE_BACKWARD_LOOP) 244 v = aa + ai[m] - 1; 245 vj = aj + ai[m] - 1; 246 for (i=m-1; i>=0; i--) { 247 sum = b[i]; 248 nz = ai[i+1] - ai[i] - 1; 249 {PetscInt __i;for (__i=0; __i<nz; __i++) sum -= v[-__i] * x[vj[-__i]];} 250 #else 251 v = aa + ai[m-1] + 1; 252 vj = aj + ai[m-1] + 1; 253 nz = 0; 254 for (i=m-1; i>=0; i--) { 255 sum = b[i]; 256 nz2 = ai[i] - ai[PetscMax(i-1,0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */ 257 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 258 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 259 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 260 nz = nz2; 261 #endif 262 x[i] = omega*sum*aidiag[i]; 263 v -= nz + 1; 264 vj -= nz + 1; 265 } 266 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 267 } else { 268 v = aa + ai[m-1] + 1; 269 vj = aj + ai[m-1] + 1; 270 nz = 0; 271 for (i=m-1; i>=0; i--) { 272 sum = t[i]; 273 nz2 = ai[i] - ai[PetscMax(i-1,0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */ 274 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 275 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 276 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 277 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 278 nz = nz2; 279 v -= nz + 1; 280 vj -= nz + 1; 281 } 282 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 283 } 284 } 285 its--; 286 } 287 288 while (its--) { 289 /* 290 forward sweep: 291 for i=0,...,m-1: 292 sum[i] = (b[i] - U(i,:)x)/d[i]; 293 x[i] = (1-omega)x[i] + omega*sum[i]; 294 b = b - x[i]*U^T(i,:); 295 296 */ 297 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 298 ierr = PetscArraycpy(t,b,m);CHKERRQ(ierr); 299 300 for (i=0; i<m; i++) { 301 v = aa + ai[i] + 1; v1=v; 302 vj = aj + ai[i] + 1; vj1=vj; 303 nz = ai[i+1] - ai[i] - 1; nz1=nz; 304 sum = t[i]; 305 while (nz1--) sum -= (*v1++)*x[*vj1++]; 306 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 307 while (nz--) t[*vj++] -= x[i]*(*v++); 308 } 309 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 310 } 311 312 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 313 /* 314 backward sweep: 315 b = b - x[i]*U^T(i,:), i=0,...,n-2 316 for i=m-1,...,0: 317 sum[i] = (b[i] - U(i,:)x)/d[i]; 318 x[i] = (1-omega)x[i] + omega*sum[i]; 319 */ 320 /* if there was a forward sweep done above then I thing the next two for loops are not needed */ 321 ierr = PetscArraycpy(t,b,m);CHKERRQ(ierr); 322 323 for (i=0; i<m-1; i++) { /* update rhs */ 324 v = aa + ai[i] + 1; 325 vj = aj + ai[i] + 1; 326 nz = ai[i+1] - ai[i] - 1; 327 while (nz--) t[*vj++] -= x[i]*(*v++); 328 } 329 ierr = PetscLogFlops(2.0*(a->nz - m));CHKERRQ(ierr); 330 for (i=m-1; i>=0; i--) { 331 v = aa + ai[i] + 1; 332 vj = aj + ai[i] + 1; 333 nz = ai[i+1] - ai[i] - 1; 334 sum = t[i]; 335 while (nz--) sum -= x[*vj++]*(*v++); 336 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 337 } 338 ierr = PetscLogFlops(2.0*(a->nz + m));CHKERRQ(ierr); 339 } 340 } 341 342 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 343 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 344 PetscFunctionReturn(0); 345 } 346