1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 /* Block operations are done by accessing one column at at time */ 5 /* Default MatSolve for block size 14 */ 6 7 PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx) 8 { 9 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 10 PetscErrorCode ierr; 11 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 12 PetscInt i,k,nz,idx,idt,m; 13 const MatScalar *aa=a->a,*v; 14 PetscScalar s[14]; 15 PetscScalar *x,xv; 16 const PetscScalar *b; 17 18 PetscFunctionBegin; 19 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 20 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21 22 /* forward solve the lower triangular */ 23 for (i=0; i<n; i++) { 24 v = aa + bs2*ai[i]; 25 vi = aj + ai[i]; 26 nz = ai[i+1] - ai[i]; 27 idt = bs*i; 28 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 29 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 30 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; 31 for (m=0; m<nz; m++) { 32 idx = bs*vi[m]; 33 for (k=0; k<bs; k++) { 34 xv = x[k + idx]; 35 x[idt] -= v[0]*xv; 36 x[1+idt] -= v[1]*xv; 37 x[2+idt] -= v[2]*xv; 38 x[3+idt] -= v[3]*xv; 39 x[4+idt] -= v[4]*xv; 40 x[5+idt] -= v[5]*xv; 41 x[6+idt] -= v[6]*xv; 42 x[7+idt] -= v[7]*xv; 43 x[8+idt] -= v[8]*xv; 44 x[9+idt] -= v[9]*xv; 45 x[10+idt] -= v[10]*xv; 46 x[11+idt] -= v[11]*xv; 47 x[12+idt] -= v[12]*xv; 48 x[13+idt] -= v[13]*xv; 49 v += 14; 50 } 51 } 52 } 53 /* backward solve the upper triangular */ 54 for (i=n-1; i>=0; i--) { 55 v = aa + bs2*(adiag[i+1]+1); 56 vi = aj + adiag[i+1]+1; 57 nz = adiag[i] - adiag[i+1] - 1; 58 idt = bs*i; 59 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 60 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 61 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; 62 63 for (m=0; m<nz; m++) { 64 idx = bs*vi[m]; 65 for (k=0; k<bs; k++) { 66 xv = x[k + idx]; 67 s[0] -= v[0]*xv; 68 s[1] -= v[1]*xv; 69 s[2] -= v[2]*xv; 70 s[3] -= v[3]*xv; 71 s[4] -= v[4]*xv; 72 s[5] -= v[5]*xv; 73 s[6] -= v[6]*xv; 74 s[7] -= v[7]*xv; 75 s[8] -= v[8]*xv; 76 s[9] -= v[9]*xv; 77 s[10] -= v[10]*xv; 78 s[11] -= v[11]*xv; 79 s[12] -= v[12]*xv; 80 s[13] -= v[13]*xv; 81 v += 14; 82 } 83 } 84 ierr = PetscArrayzero(x+idt,bs);CHKERRQ(ierr); 85 for (k=0; k<bs; k++) { 86 x[idt] += v[0]*s[k]; 87 x[1+idt] += v[1]*s[k]; 88 x[2+idt] += v[2]*s[k]; 89 x[3+idt] += v[3]*s[k]; 90 x[4+idt] += v[4]*s[k]; 91 x[5+idt] += v[5]*s[k]; 92 x[6+idt] += v[6]*s[k]; 93 x[7+idt] += v[7]*s[k]; 94 x[8+idt] += v[8]*s[k]; 95 x[9+idt] += v[9]*s[k]; 96 x[10+idt] += v[10]*s[k]; 97 x[11+idt] += v[11]*s[k]; 98 x[12+idt] += v[12]*s[k]; 99 x[13+idt] += v[13]*s[k]; 100 v += 14; 101 } 102 } 103 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 104 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 105 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 106 PetscFunctionReturn(0); 107 } 108 109 /* Block operations are done by accessing one column at at time */ 110 /* Default MatSolve for block size 13 */ 111 112 PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx) 113 { 114 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 115 PetscErrorCode ierr; 116 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 117 PetscInt i,k,nz,idx,idt,m; 118 const MatScalar *aa=a->a,*v; 119 PetscScalar s[13]; 120 PetscScalar *x,xv; 121 const PetscScalar *b; 122 123 PetscFunctionBegin; 124 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 125 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 126 127 /* forward solve the lower triangular */ 128 for (i=0; i<n; i++) { 129 v = aa + bs2*ai[i]; 130 vi = aj + ai[i]; 131 nz = ai[i+1] - ai[i]; 132 idt = bs*i; 133 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 134 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 135 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; 136 for (m=0; m<nz; m++) { 137 idx = bs*vi[m]; 138 for (k=0; k<bs; k++) { 139 xv = x[k + idx]; 140 x[idt] -= v[0]*xv; 141 x[1+idt] -= v[1]*xv; 142 x[2+idt] -= v[2]*xv; 143 x[3+idt] -= v[3]*xv; 144 x[4+idt] -= v[4]*xv; 145 x[5+idt] -= v[5]*xv; 146 x[6+idt] -= v[6]*xv; 147 x[7+idt] -= v[7]*xv; 148 x[8+idt] -= v[8]*xv; 149 x[9+idt] -= v[9]*xv; 150 x[10+idt] -= v[10]*xv; 151 x[11+idt] -= v[11]*xv; 152 x[12+idt] -= v[12]*xv; 153 v += 13; 154 } 155 } 156 } 157 /* backward solve the upper triangular */ 158 for (i=n-1; i>=0; i--) { 159 v = aa + bs2*(adiag[i+1]+1); 160 vi = aj + adiag[i+1]+1; 161 nz = adiag[i] - adiag[i+1] - 1; 162 idt = bs*i; 163 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 164 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 165 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; 166 167 for (m=0; m<nz; m++) { 168 idx = bs*vi[m]; 169 for (k=0; k<bs; k++) { 170 xv = x[k + idx]; 171 s[0] -= v[0]*xv; 172 s[1] -= v[1]*xv; 173 s[2] -= v[2]*xv; 174 s[3] -= v[3]*xv; 175 s[4] -= v[4]*xv; 176 s[5] -= v[5]*xv; 177 s[6] -= v[6]*xv; 178 s[7] -= v[7]*xv; 179 s[8] -= v[8]*xv; 180 s[9] -= v[9]*xv; 181 s[10] -= v[10]*xv; 182 s[11] -= v[11]*xv; 183 s[12] -= v[12]*xv; 184 v += 13; 185 } 186 } 187 ierr = PetscArrayzero(x+idt,bs);CHKERRQ(ierr); 188 for (k=0; k<bs; k++) { 189 x[idt] += v[0]*s[k]; 190 x[1+idt] += v[1]*s[k]; 191 x[2+idt] += v[2]*s[k]; 192 x[3+idt] += v[3]*s[k]; 193 x[4+idt] += v[4]*s[k]; 194 x[5+idt] += v[5]*s[k]; 195 x[6+idt] += v[6]*s[k]; 196 x[7+idt] += v[7]*s[k]; 197 x[8+idt] += v[8]*s[k]; 198 x[9+idt] += v[9]*s[k]; 199 x[10+idt] += v[10]*s[k]; 200 x[11+idt] += v[11]*s[k]; 201 x[12+idt] += v[12]*s[k]; 202 v += 13; 203 } 204 } 205 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 206 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 207 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 208 PetscFunctionReturn(0); 209 } 210 211 /* Block operations are done by accessing one column at at time */ 212 /* Default MatSolve for block size 12 */ 213 214 PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx) 215 { 216 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 217 PetscErrorCode ierr; 218 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 219 PetscInt i,k,nz,idx,idt,m; 220 const MatScalar *aa=a->a,*v; 221 PetscScalar s[12]; 222 PetscScalar *x,xv; 223 const PetscScalar *b; 224 225 PetscFunctionBegin; 226 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 227 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 228 229 /* forward solve the lower triangular */ 230 for (i=0; i<n; i++) { 231 v = aa + bs2*ai[i]; 232 vi = aj + ai[i]; 233 nz = ai[i+1] - ai[i]; 234 idt = bs*i; 235 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 236 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 237 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; 238 for (m=0; m<nz; m++) { 239 idx = bs*vi[m]; 240 for (k=0; k<bs; k++) { 241 xv = x[k + idx]; 242 x[idt] -= v[0]*xv; 243 x[1+idt] -= v[1]*xv; 244 x[2+idt] -= v[2]*xv; 245 x[3+idt] -= v[3]*xv; 246 x[4+idt] -= v[4]*xv; 247 x[5+idt] -= v[5]*xv; 248 x[6+idt] -= v[6]*xv; 249 x[7+idt] -= v[7]*xv; 250 x[8+idt] -= v[8]*xv; 251 x[9+idt] -= v[9]*xv; 252 x[10+idt] -= v[10]*xv; 253 x[11+idt] -= v[11]*xv; 254 v += 12; 255 } 256 } 257 } 258 /* backward solve the upper triangular */ 259 for (i=n-1; i>=0; i--) { 260 v = aa + bs2*(adiag[i+1]+1); 261 vi = aj + adiag[i+1]+1; 262 nz = adiag[i] - adiag[i+1] - 1; 263 idt = bs*i; 264 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 265 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 266 s[10] = x[10+idt]; s[11] = x[11+idt]; 267 268 for (m=0; m<nz; m++) { 269 idx = bs*vi[m]; 270 for (k=0; k<bs; k++) { 271 xv = x[k + idx]; 272 s[0] -= v[0]*xv; 273 s[1] -= v[1]*xv; 274 s[2] -= v[2]*xv; 275 s[3] -= v[3]*xv; 276 s[4] -= v[4]*xv; 277 s[5] -= v[5]*xv; 278 s[6] -= v[6]*xv; 279 s[7] -= v[7]*xv; 280 s[8] -= v[8]*xv; 281 s[9] -= v[9]*xv; 282 s[10] -= v[10]*xv; 283 s[11] -= v[11]*xv; 284 v += 12; 285 } 286 } 287 ierr = PetscArrayzero(x+idt,bs);CHKERRQ(ierr); 288 for (k=0; k<bs; k++) { 289 x[idt] += v[0]*s[k]; 290 x[1+idt] += v[1]*s[k]; 291 x[2+idt] += v[2]*s[k]; 292 x[3+idt] += v[3]*s[k]; 293 x[4+idt] += v[4]*s[k]; 294 x[5+idt] += v[5]*s[k]; 295 x[6+idt] += v[6]*s[k]; 296 x[7+idt] += v[7]*s[k]; 297 x[8+idt] += v[8]*s[k]; 298 x[9+idt] += v[9]*s[k]; 299 x[10+idt] += v[10]*s[k]; 300 x[11+idt] += v[11]*s[k]; 301 v += 12; 302 } 303 } 304 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 305 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 306 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 307 PetscFunctionReturn(0); 308 } 309