1 2 /* 3 This is included by sbaij.c to generate unsigned short and regular versions of these two functions 4 */ 5 #if defined(USESHORT) 6 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian_ushort(Mat A,Vec xx,Vec zz) 7 #else 8 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian(Mat A,Vec xx,Vec zz) 9 #endif 10 { 11 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 12 const PetscScalar *x; 13 PetscScalar *z,x1,sum; 14 const MatScalar *v; 15 MatScalar vj; 16 PetscErrorCode ierr; 17 PetscInt mbs=a->mbs,i,j,nz; 18 const PetscInt *ai=a->i; 19 #if defined(USESHORT) 20 const unsigned short *ib=a->jshort; 21 unsigned short ibt; 22 #else 23 const PetscInt *ib=a->j; 24 PetscInt ibt; 25 #endif 26 PetscInt nonzerorow = 0,jmin; 27 28 PetscFunctionBegin; 29 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 30 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 31 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 32 33 v = a->a; 34 for (i=0; i<mbs; i++) { 35 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 36 if (!nz) continue; /* Move to the next row if the current row is empty */ 37 nonzerorow++; 38 x1 = x[i]; 39 sum = 0.0; 40 jmin = 0; 41 if (ib[0] == i) { 42 sum = v[0]*x1; /* diagonal term */ 43 jmin++; 44 } 45 for (j=jmin; j<nz; j++) { 46 ibt = ib[j]; 47 vj = v[j]; 48 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 49 z[ibt] += PetscConj(v[j]) * x1; /* (strict lower triangular part of A)*x */ 50 } 51 z[i] += sum; 52 v += nz; 53 ib += nz; 54 } 55 56 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 57 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 58 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 59 PetscFunctionReturn(0); 60 } 61 62 #if defined(USESHORT) 63 PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz) 64 #else 65 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 66 #endif 67 { 68 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 69 const PetscScalar *x; 70 PetscScalar *z,x1,sum; 71 const MatScalar *v; 72 MatScalar vj; 73 PetscErrorCode ierr; 74 PetscInt mbs=a->mbs,i,j,nz; 75 const PetscInt *ai=a->i; 76 #if defined(USESHORT) 77 const unsigned short *ib=a->jshort; 78 unsigned short ibt; 79 #else 80 const PetscInt *ib=a->j; 81 PetscInt ibt; 82 #endif 83 PetscInt nonzerorow=0,jmin; 84 85 PetscFunctionBegin; 86 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 87 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 88 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 89 90 v = a->a; 91 for (i=0; i<mbs; i++) { 92 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 93 if (!nz) continue; /* Move to the next row if the current row is empty */ 94 nonzerorow++; 95 sum = 0.0; 96 jmin = 0; 97 x1 = x[i]; 98 if (ib[0] == i) { 99 sum = v[0]*x1; /* diagonal term */ 100 jmin++; 101 } 102 PetscPrefetchBlock(ib+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 103 PetscPrefetchBlock(v+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 104 for (j=jmin; j<nz; j++) { 105 ibt = ib[j]; 106 vj = v[j]; 107 z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */ 108 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 109 } 110 z[i] += sum; 111 v += nz; 112 ib += nz; 113 } 114 115 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 116 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 117 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 118 PetscFunctionReturn(0); 119 } 120 121 #if defined(USESHORT) 122 PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 123 #else 124 PetscErrorCode MatSOR_SeqSBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 125 #endif 126 { 127 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 128 const MatScalar *aa=a->a,*v,*v1,*aidiag; 129 PetscScalar *x,*t,sum; 130 const PetscScalar *b; 131 MatScalar tmp; 132 PetscErrorCode ierr; 133 PetscInt m =a->mbs,bs=A->rmap->bs,j; 134 const PetscInt *ai=a->i; 135 #if defined(USESHORT) 136 const unsigned short *aj=a->jshort,*vj,*vj1; 137 #else 138 const PetscInt *aj=a->j,*vj,*vj1; 139 #endif 140 PetscInt nz,nz1,i; 141 142 PetscFunctionBegin; 143 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 144 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 145 146 its = its*lits; 147 if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits); 148 149 if (bs > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented"); 150 151 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 152 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 153 154 if (!a->idiagvalid) { 155 if (!a->idiag) { 156 ierr = PetscMalloc1(m,&a->idiag);CHKERRQ(ierr); 157 } 158 for (i=0; i<a->mbs; i++) a->idiag[i] = 1.0/a->a[a->i[i]]; 159 a->idiagvalid = PETSC_TRUE; 160 } 161 162 if (!a->sor_work) { 163 ierr = PetscMalloc1(m,&a->sor_work);CHKERRQ(ierr); 164 } 165 t = a->sor_work; 166 167 aidiag = a->idiag; 168 169 if (flag == SOR_APPLY_UPPER) { 170 /* apply (U + D/omega) to the vector */ 171 PetscScalar d; 172 for (i=0; i<m; i++) { 173 d = fshift + aa[ai[i]]; 174 nz = ai[i+1] - ai[i] - 1; 175 vj = aj + ai[i] + 1; 176 v = aa + ai[i] + 1; 177 sum = b[i]*d/omega; 178 PetscSparseDensePlusDot(sum,b,v,vj,nz); 179 x[i] = sum; 180 } 181 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 182 } 183 184 if (flag & SOR_ZERO_INITIAL_GUESS) { 185 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 186 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 187 188 v = aa + 1; 189 vj = aj + 1; 190 for (i=0; i<m; i++) { 191 nz = ai[i+1] - ai[i] - 1; 192 tmp = -(x[i] = omega*t[i]*aidiag[i]); 193 for (j=0; j<nz; j++) t[vj[j]] += tmp*v[j]; 194 v += nz + 1; 195 vj += nz + 1; 196 } 197 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 198 } 199 200 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 201 int nz2; 202 if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)) { 203 #if defined(PETSC_USE_BACKWARD_LOOP) 204 v = aa + ai[m] - 1; 205 vj = aj + ai[m] - 1; 206 for (i=m-1; i>=0; i--) { 207 sum = b[i]; 208 nz = ai[i+1] - ai[i] - 1; 209 {PetscInt __i;for (__i=0; __i<nz; __i++) sum -= v[-__i] * x[vj[-__i]];} 210 #else 211 v = aa + ai[m-1] + 1; 212 vj = aj + ai[m-1] + 1; 213 nz = 0; 214 for (i=m-1; i>=0; i--) { 215 sum = b[i]; 216 nz2 = ai[i] - ai[PetscMax(i-1,0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */ 217 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 218 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 219 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 220 nz = nz2; 221 #endif 222 x[i] = omega*sum*aidiag[i]; 223 v -= nz + 1; 224 vj -= nz + 1; 225 } 226 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 227 } else { 228 v = aa + ai[m-1] + 1; 229 vj = aj + ai[m-1] + 1; 230 nz = 0; 231 for (i=m-1; i>=0; i--) { 232 sum = t[i]; 233 nz2 = ai[i] - ai[PetscMax(i-1,0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */ 234 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 235 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 236 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 237 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 238 nz = nz2; 239 v -= nz + 1; 240 vj -= nz + 1; 241 } 242 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 243 } 244 } 245 its--; 246 } 247 248 while (its--) { 249 /* 250 forward sweep: 251 for i=0,...,m-1: 252 sum[i] = (b[i] - U(i,:)x)/d[i]; 253 x[i] = (1-omega)x[i] + omega*sum[i]; 254 b = b - x[i]*U^T(i,:); 255 256 */ 257 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 258 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 259 260 for (i=0; i<m; i++) { 261 v = aa + ai[i] + 1; v1=v; 262 vj = aj + ai[i] + 1; vj1=vj; 263 nz = ai[i+1] - ai[i] - 1; nz1=nz; 264 sum = t[i]; 265 while (nz1--) sum -= (*v1++)*x[*vj1++]; 266 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 267 while (nz--) t[*vj++] -= x[i]*(*v++); 268 } 269 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 270 } 271 272 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 273 /* 274 backward sweep: 275 b = b - x[i]*U^T(i,:), i=0,...,n-2 276 for i=m-1,...,0: 277 sum[i] = (b[i] - U(i,:)x)/d[i]; 278 x[i] = (1-omega)x[i] + omega*sum[i]; 279 */ 280 /* if there was a forward sweep done above then I thing the next two for loops are not needed */ 281 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 282 283 for (i=0; i<m-1; i++) { /* update rhs */ 284 v = aa + ai[i] + 1; 285 vj = aj + ai[i] + 1; 286 nz = ai[i+1] - ai[i] - 1; 287 while (nz--) t[*vj++] -= x[i]*(*v++); 288 } 289 ierr = PetscLogFlops(2.0*(a->nz - m));CHKERRQ(ierr); 290 for (i=m-1; i>=0; i--) { 291 v = aa + ai[i] + 1; 292 vj = aj + ai[i] + 1; 293 nz = ai[i+1] - ai[i] - 1; 294 sum = t[i]; 295 while (nz--) sum -= x[*vj++]*(*v++); 296 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 297 } 298 ierr = PetscLogFlops(2.0*(a->nz + m));CHKERRQ(ierr); 299 } 300 } 301 302 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 303 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 304 PetscFunctionReturn(0); 305 } 306