1 2 /* 3 This is included by sbaij.c to generate unsigned short and regular versions of these two functions 4 */ 5 #undef __FUNCT__ 6 #if defined(USESHORT) 7 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian_ushort" 8 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian_ushort(Mat A,Vec xx,Vec zz) 9 #else 10 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian" 11 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian(Mat A,Vec xx,Vec zz) 12 #endif 13 { 14 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 15 const PetscScalar *x; 16 PetscScalar *z,x1,sum; 17 const MatScalar *v; 18 MatScalar vj; 19 PetscErrorCode ierr; 20 PetscInt mbs=a->mbs,i,j,nz; 21 const PetscInt *ai=a->i; 22 #if defined(USESHORT) 23 const unsigned short *ib=a->jshort; 24 unsigned short ibt; 25 #else 26 const PetscInt *ib=a->j; 27 PetscInt ibt; 28 #endif 29 PetscInt nonzerorow = 0,jmin; 30 31 PetscFunctionBegin; 32 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 33 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 34 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 35 36 v = a->a; 37 for (i=0; i<mbs; i++) { 38 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 39 if (!nz) continue; /* Move to the next row if the current row is empty */ 40 nonzerorow++; 41 x1 = x[i]; 42 sum = 0.0; 43 jmin = 0; 44 if (ib[0] == i) { 45 sum = v[0]*x1; /* diagonal term */ 46 jmin++; 47 } 48 for (j=jmin; j<nz; j++) { 49 ibt = ib[j]; 50 vj = v[j]; 51 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 52 z[ibt] += PetscConj(v[j]) * x1; /* (strict lower triangular part of A)*x */ 53 } 54 z[i] += sum; 55 v += nz; 56 ib += nz; 57 } 58 59 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 60 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 61 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 62 PetscFunctionReturn(0); 63 } 64 65 #undef __FUNCT__ 66 #if defined(USESHORT) 67 #define __FUNCT__ "MatMult_SeqSBAIJ_1_ushort" 68 PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz) 69 #else 70 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 71 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 72 #endif 73 { 74 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 75 const PetscScalar *x; 76 PetscScalar *z,x1,sum; 77 const MatScalar *v; 78 MatScalar vj; 79 PetscErrorCode ierr; 80 PetscInt mbs=a->mbs,i,j,nz; 81 const PetscInt *ai=a->i; 82 #if defined(USESHORT) 83 const unsigned short *ib=a->jshort; 84 unsigned short ibt; 85 #else 86 const PetscInt *ib=a->j; 87 PetscInt ibt; 88 #endif 89 PetscInt nonzerorow=0,jmin; 90 91 PetscFunctionBegin; 92 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 93 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 94 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 95 96 v = a->a; 97 for (i=0; i<mbs; i++) { 98 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 99 if (!nz) continue; /* Move to the next row if the current row is empty */ 100 nonzerorow++; 101 sum = 0.0; 102 jmin = 0; 103 x1 = x[i]; 104 if (ib[0] == i) { 105 sum = v[0]*x1; /* diagonal term */ 106 jmin++; 107 } 108 PetscPrefetchBlock(ib+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */ 109 PetscPrefetchBlock(v+nz,nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */ 110 for (j=jmin; j<nz; j++) { 111 ibt = ib[j]; 112 vj = v[j]; 113 z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */ 114 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 115 } 116 z[i] += sum; 117 v += nz; 118 ib += nz; 119 } 120 121 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 122 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 123 ierr = PetscLogFlops(2.0*(2.0*a->nz - nonzerorow) - nonzerorow);CHKERRQ(ierr); 124 PetscFunctionReturn(0); 125 } 126 127 #undef __FUNCT__ 128 #if defined(USESHORT) 129 #define __FUNCT__ "MatSOR_SeqSBAIJ_ushort" 130 PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 131 #else 132 #define __FUNCT__ "MatSOR_SeqSBAIJ" 133 PetscErrorCode MatSOR_SeqSBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 134 #endif 135 { 136 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 137 const MatScalar *aa=a->a,*v,*v1,*aidiag; 138 PetscScalar *x,*t,sum; 139 const PetscScalar *b; 140 MatScalar tmp; 141 PetscErrorCode ierr; 142 PetscInt m =a->mbs,bs=A->rmap->bs,j; 143 const PetscInt *ai=a->i; 144 #if defined(USESHORT) 145 const unsigned short *aj=a->jshort,*vj,*vj1; 146 #else 147 const PetscInt *aj=a->j,*vj,*vj1; 148 #endif 149 PetscInt nz,nz1,i; 150 151 PetscFunctionBegin; 152 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 153 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 154 155 its = its*lits; 156 if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits); 157 158 if (bs > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented"); 159 160 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 161 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 162 163 if (!a->idiagvalid) { 164 if (!a->idiag) { 165 ierr = PetscMalloc1(m,&a->idiag);CHKERRQ(ierr); 166 } 167 for (i=0; i<a->mbs; i++) a->idiag[i] = 1.0/a->a[a->i[i]]; 168 a->idiagvalid = PETSC_TRUE; 169 } 170 171 if (!a->sor_work) { 172 ierr = PetscMalloc1(m,&a->sor_work);CHKERRQ(ierr); 173 } 174 t = a->sor_work; 175 176 aidiag = a->idiag; 177 178 if (flag == SOR_APPLY_UPPER) { 179 /* apply (U + D/omega) to the vector */ 180 PetscScalar d; 181 for (i=0; i<m; i++) { 182 d = fshift + aa[ai[i]]; 183 nz = ai[i+1] - ai[i] - 1; 184 vj = aj + ai[i] + 1; 185 v = aa + ai[i] + 1; 186 sum = b[i]*d/omega; 187 PetscSparseDensePlusDot(sum,b,v,vj,nz); 188 x[i] = sum; 189 } 190 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 191 } 192 193 if (flag & SOR_ZERO_INITIAL_GUESS) { 194 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 195 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 196 197 v = aa + 1; 198 vj = aj + 1; 199 for (i=0; i<m; i++) { 200 nz = ai[i+1] - ai[i] - 1; 201 tmp = -(x[i] = omega*t[i]*aidiag[i]); 202 for (j=0; j<nz; j++) t[vj[j]] += tmp*v[j]; 203 v += nz + 1; 204 vj += nz + 1; 205 } 206 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 207 } 208 209 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 210 int nz2; 211 if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)) { 212 #if defined(PETSC_USE_BACKWARD_LOOP) 213 v = aa + ai[m] - 1; 214 vj = aj + ai[m] - 1; 215 for (i=m-1; i>=0; i--) { 216 sum = b[i]; 217 nz = ai[i+1] - ai[i] - 1; 218 {PetscInt __i;for (__i=0; __i<nz; __i++) sum -= v[-__i] * x[vj[-__i]];} 219 #else 220 v = aa + ai[m-1] + 1; 221 vj = aj + ai[m-1] + 1; 222 nz = 0; 223 for (i=m-1; i>=0; i--) { 224 sum = b[i]; 225 nz2 = ai[i] - ai[PetscMax(i-1,0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */ 226 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 227 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 228 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 229 nz = nz2; 230 #endif 231 x[i] = omega*sum*aidiag[i]; 232 v -= nz + 1; 233 vj -= nz + 1; 234 } 235 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 236 } else { 237 v = aa + ai[m-1] + 1; 238 vj = aj + ai[m-1] + 1; 239 nz = 0; 240 for (i=m-1; i>=0; i--) { 241 sum = t[i]; 242 nz2 = ai[i] - ai[PetscMax(i-1,0)] - 1; /* avoid referencing ai[-1], nonsense nz2 is okay on last iteration */ 243 PETSC_Prefetch(v-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 244 PETSC_Prefetch(vj-nz2-1,0,PETSC_PREFETCH_HINT_NTA); 245 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 246 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 247 nz = nz2; 248 v -= nz + 1; 249 vj -= nz + 1; 250 } 251 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 252 } 253 } 254 its--; 255 } 256 257 while (its--) { 258 /* 259 forward sweep: 260 for i=0,...,m-1: 261 sum[i] = (b[i] - U(i,:)x)/d[i]; 262 x[i] = (1-omega)x[i] + omega*sum[i]; 263 b = b - x[i]*U^T(i,:); 264 265 */ 266 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 267 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 268 269 for (i=0; i<m; i++) { 270 v = aa + ai[i] + 1; v1=v; 271 vj = aj + ai[i] + 1; vj1=vj; 272 nz = ai[i+1] - ai[i] - 1; nz1=nz; 273 sum = t[i]; 274 while (nz1--) sum -= (*v1++)*x[*vj1++]; 275 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 276 while (nz--) t[*vj++] -= x[i]*(*v++); 277 } 278 ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr); 279 } 280 281 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 282 /* 283 backward sweep: 284 b = b - x[i]*U^T(i,:), i=0,...,n-2 285 for i=m-1,...,0: 286 sum[i] = (b[i] - U(i,:)x)/d[i]; 287 x[i] = (1-omega)x[i] + omega*sum[i]; 288 */ 289 /* if there was a forward sweep done above then I thing the next two for loops are not needed */ 290 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 291 292 for (i=0; i<m-1; i++) { /* update rhs */ 293 v = aa + ai[i] + 1; 294 vj = aj + ai[i] + 1; 295 nz = ai[i+1] - ai[i] - 1; 296 while (nz--) t[*vj++] -= x[i]*(*v++); 297 } 298 ierr = PetscLogFlops(2.0*(a->nz - m));CHKERRQ(ierr); 299 for (i=m-1; i>=0; i--) { 300 v = aa + ai[i] + 1; 301 vj = aj + ai[i] + 1; 302 nz = ai[i+1] - ai[i] - 1; 303 sum = t[i]; 304 while (nz--) sum -= x[*vj++]*(*v++); 305 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 306 } 307 ierr = PetscLogFlops(2.0*(a->nz + m));CHKERRQ(ierr); 308 } 309 } 310 311 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 312 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 313 PetscFunctionReturn(0); 314 } 315