1 2 /* 3 This is included by sbaij.c to generate unsigned short and regular versions of these two functions 4 */ 5 #undef __FUNCT__ 6 #define __FUNCT__ "MatMult_SeqSBAIJ_1_Hermitian" 7 #if defined(USESHORT) 8 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian_ushort(Mat A,Vec xx,Vec zz) 9 #else 10 PetscErrorCode MatMult_SeqSBAIJ_1_Hermitian(Mat A,Vec xx,Vec zz) 11 #endif 12 { 13 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 14 const PetscScalar *x; 15 PetscScalar *z,x1,sum; 16 const MatScalar *v; 17 MatScalar vj; 18 PetscErrorCode ierr; 19 PetscInt mbs=a->mbs,i,j,nz; 20 const PetscInt *ai=a->i; 21 #if defined(USESHORT) 22 const unsigned short *ib=a->jshort; 23 unsigned short ibt; 24 #else 25 const PetscInt *ib=a->j; 26 PetscInt ibt; 27 #endif 28 29 PetscFunctionBegin; 30 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 31 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 32 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 33 34 v = a->a; 35 for (i=0; i<mbs; i++) { 36 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 37 x1 = x[i]; 38 sum = v[0]*x1; /* diagonal term */ 39 for (j=1; j<nz; j++) { 40 ibt = ib[j]; 41 vj = v[j]; 42 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 43 z[ibt] += PetscConj(v[j]) * x1; /* (strict lower triangular part of A)*x */ 44 } 45 z[i] += sum; 46 v += nz; 47 ib += nz; 48 } 49 50 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 51 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 52 ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr); 53 PetscFunctionReturn(0); 54 } 55 56 #undef __FUNCT__ 57 #define __FUNCT__ "MatMult_SeqSBAIJ_1" 58 #if defined(USESHORT) 59 PetscErrorCode MatMult_SeqSBAIJ_1_ushort(Mat A,Vec xx,Vec zz) 60 #else 61 PetscErrorCode MatMult_SeqSBAIJ_1(Mat A,Vec xx,Vec zz) 62 #endif 63 { 64 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 65 const PetscScalar *x; 66 PetscScalar *z,x1,sum; 67 const MatScalar *v; 68 MatScalar vj; 69 PetscErrorCode ierr; 70 PetscInt mbs=a->mbs,i,j,nz; 71 const PetscInt *ai=a->i; 72 #if defined(USESHORT) 73 const unsigned short *ib=a->jshort; 74 unsigned short ibt; 75 #else 76 const PetscInt *ib=a->j; 77 PetscInt ibt; 78 #endif 79 80 PetscFunctionBegin; 81 ierr = VecSet(zz,0.0);CHKERRQ(ierr); 82 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 83 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 84 85 v = a->a; 86 for (i=0; i<mbs; i++) { 87 nz = ai[i+1] - ai[i]; /* length of i_th row of A */ 88 x1 = x[i]; 89 sum = v[0]*x1; /* diagonal term */ 90 for (j=1; j<nz; j++) { 91 ibt = ib[j]; 92 vj = v[j]; 93 z[ibt] += vj * x1; /* (strict lower triangular part of A)*x */ 94 sum += vj * x[ibt]; /* (strict upper triangular part of A)*x */ 95 } 96 z[i] += sum; 97 v += nz; 98 ib += nz; 99 } 100 101 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 102 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 103 ierr = PetscLogFlops(2.0*(2.0*a->nz - mbs) - mbs);CHKERRQ(ierr); 104 PetscFunctionReturn(0); 105 } 106 107 #undef __FUNCT__ 108 #define __FUNCT__ "MatSOR_SeqSBAIJ" 109 #if defined(USESHORT) 110 PetscErrorCode MatSOR_SeqSBAIJ_ushort(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 111 #else 112 PetscErrorCode MatSOR_SeqSBAIJ(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 113 #endif 114 { 115 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data; 116 const MatScalar *aa=a->a,*v,*v1,*aidiag; 117 PetscScalar *x,*t,sum; 118 const PetscScalar *b; 119 MatScalar tmp; 120 PetscErrorCode ierr; 121 PetscInt m=a->mbs,bs=A->rmap->bs,j; 122 const PetscInt *ai=a->i; 123 #if defined(USESHORT) 124 const unsigned short *aj=a->jshort,*vj,*vj1; 125 #else 126 const PetscInt *aj=a->j,*vj,*vj1; 127 #endif 128 PetscInt nz,nz1,i; 129 130 PetscFunctionBegin; 131 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support yet for Eisenstat"); 132 133 its = its*lits; 134 if (its <= 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Relaxation requires global its %D and local its %D both positive",its,lits); 135 136 if (bs > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"SSOR for block size > 1 is not yet implemented"); 137 138 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 139 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 140 141 if (!a->idiagvalid) { 142 if (!a->idiag) { 143 ierr = PetscMalloc(m*sizeof(PetscScalar),&a->idiag);CHKERRQ(ierr); 144 } 145 for (i=0; i<a->mbs; i++) a->idiag[i] = 1.0/a->a[a->i[i]]; 146 a->idiagvalid = PETSC_TRUE; 147 } 148 149 if (!a->sor_work) { 150 ierr = PetscMalloc(m*sizeof(PetscScalar),&a->sor_work);CHKERRQ(ierr); 151 } 152 t = a->sor_work; 153 154 aidiag = a->idiag; 155 156 if (flag == SOR_APPLY_UPPER) { 157 /* apply (U + D/omega) to the vector */ 158 PetscScalar d; 159 for (i=0; i<m; i++) { 160 d = fshift + aa[ai[i]]; 161 nz = ai[i+1] - ai[i] - 1; 162 vj = aj + ai[i] + 1; 163 v = aa + ai[i] + 1; 164 sum = b[i]*d/omega; 165 PetscSparseDensePlusDot(sum,b,v,vj,nz); 166 x[i] = sum; 167 } 168 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 169 } 170 171 if (flag & SOR_ZERO_INITIAL_GUESS) { 172 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 173 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 174 175 v = aa + 1; 176 vj = aj + 1; 177 for (i=0; i<m; i++){ 178 nz = ai[i+1] - ai[i] - 1; 179 tmp = - (x[i] = omega*t[i]*aidiag[i]); 180 for (j=0; j<nz; j++) { 181 t[vj[j]] += tmp*v[j]; 182 } 183 v += nz + 1; 184 vj += nz + 1; 185 } 186 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 187 } 188 189 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 190 int nz2; 191 if (!(flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP)){ 192 #if defined(PETSC_USE_BACKWARD_LOOP) 193 v = aa + ai[m] - 1; 194 vj = aj + ai[m] - 1; 195 for (i=m-1; i>=0; i--){ 196 sum = b[i]; 197 nz = ai[i+1] - ai[i] - 1; 198 {PetscInt __i;for(__i=0;__i<nz;__i++) sum -= v[-__i] * x[vj[-__i]];} 199 #else 200 v = aa + ai[m-1] + 1; 201 vj = aj + ai[m-1] + 1; 202 nz = 0; 203 for (i=m-1; i>=0; i--){ 204 sum = b[i]; 205 nz2 = ai[i] - ai[i-1] - 1; 206 PETSC_Prefetch(v-nz2-1,0,1); 207 PETSC_Prefetch(vj-nz2-1,0,1); 208 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 209 nz = nz2; 210 #endif 211 x[i] = omega*sum*aidiag[i]; 212 v -= nz + 1; 213 vj -= nz + 1; 214 } 215 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 216 } else { 217 v = aa + ai[m-1] + 1; 218 vj = aj + ai[m-1] + 1; 219 nz = 0; 220 for (i=m-1; i>=0; i--){ 221 sum = t[i]; 222 nz2 = ai[i] - ai[i-1] - 1; 223 PETSC_Prefetch(v-nz2-1,0,1); 224 PETSC_Prefetch(vj-nz2-1,0,1); 225 PetscSparseDenseMinusDot(sum,x,v,vj,nz); 226 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 227 nz = nz2; 228 v -= nz + 1; 229 vj -= nz + 1; 230 } 231 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 232 } 233 } 234 its--; 235 } 236 237 while (its--) { 238 /* 239 forward sweep: 240 for i=0,...,m-1: 241 sum[i] = (b[i] - U(i,:)x )/d[i]; 242 x[i] = (1-omega)x[i] + omega*sum[i]; 243 b = b - x[i]*U^T(i,:); 244 245 */ 246 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 247 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 248 249 for (i=0; i<m; i++){ 250 v = aa + ai[i] + 1; v1=v; 251 vj = aj + ai[i] + 1; vj1=vj; 252 nz = ai[i+1] - ai[i] - 1; nz1=nz; 253 sum = t[i]; 254 ierr = PetscLogFlops(4.0*nz-2);CHKERRQ(ierr); 255 while (nz1--) sum -= (*v1++)*x[*vj1++]; 256 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 257 while (nz--) t[*vj++] -= x[i]*(*v++); 258 } 259 } 260 261 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 262 /* 263 backward sweep: 264 b = b - x[i]*U^T(i,:), i=0,...,n-2 265 for i=m-1,...,0: 266 sum[i] = (b[i] - U(i,:)x )/d[i]; 267 x[i] = (1-omega)x[i] + omega*sum[i]; 268 */ 269 /* if there was a forward sweep done above then I thing the next two for loops are not needed */ 270 ierr = PetscMemcpy(t,b,m*sizeof(PetscScalar));CHKERRQ(ierr); 271 272 for (i=0; i<m-1; i++){ /* update rhs */ 273 v = aa + ai[i] + 1; 274 vj = aj + ai[i] + 1; 275 nz = ai[i+1] - ai[i] - 1; 276 ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 277 while (nz--) t[*vj++] -= x[i]*(*v++); 278 } 279 for (i=m-1; i>=0; i--){ 280 v = aa + ai[i] + 1; 281 vj = aj + ai[i] + 1; 282 nz = ai[i+1] - ai[i] - 1; 283 ierr = PetscLogFlops(2.0*nz-1);CHKERRQ(ierr); 284 sum = t[i]; 285 while (nz--) sum -= x[*vj++]*(*v++); 286 x[i] = (1-omega)*x[i] + omega*sum*aidiag[i]; 287 } 288 } 289 } 290 291 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 292 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 293 PetscFunctionReturn(0); 294 } 295