1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns) 9 { 10 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 11 PetscErrorCode ierr; 12 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 13 14 PetscFunctionBegin; 15 n = A->cmap->n; 16 m = A->rmap->n; 17 ns_row = a->inode.size; 18 19 min_mn = (m < n) ? m : n; 20 if (!ns) { 21 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 22 for (; count+1 < n; count++,i++) ; 23 if (count < n) { 24 i++; 25 } 26 *size = i; 27 PetscFunctionReturn(0); 28 } 29 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 30 31 /* Use the same row structure wherever feasible. */ 32 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 33 ns_col[i] = ns_row[i]; 34 } 35 36 /* if m < n; pad up the remainder with inode_limit */ 37 for (; count+1 < n; count++,i++) { 38 ns_col[i] = 1; 39 } 40 /* The last node is the odd ball. padd it up with the remaining rows; */ 41 if (count < n) { 42 ns_col[i] = n - count; 43 i++; 44 } else if (count > n) { 45 /* Adjust for the over estimation */ 46 ns_col[i-1] += n - count; 47 } 48 *size = i; 49 *ns = ns_col; 50 PetscFunctionReturn(0); 51 } 52 53 54 /* 55 This builds symmetric version of nonzero structure, 56 */ 57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 58 { 59 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 60 PetscErrorCode ierr; 61 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 62 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 63 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 64 65 PetscFunctionBegin; 66 nslim_row = a->inode.node_count; 67 m = A->rmap->n; 68 n = A->cmap->n; 69 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 70 71 /* Use the row_inode as column_inode */ 72 nslim_col = nslim_row; 73 ns_col = ns_row; 74 75 /* allocate space for reformated inode structure */ 76 ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr); 77 ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr); 78 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 79 80 for (i1=0,col=0; i1<nslim_col; ++i1) { 81 nsz = ns_col[i1]; 82 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 83 } 84 /* allocate space for row pointers */ 85 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 86 *iia = ia; 87 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 88 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 89 90 /* determine the number of columns in each row */ 91 ia[0] = oshift; 92 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 93 94 j = aj + ai[row] + ishift; 95 jmax = aj + ai[row+1] + ishift; 96 if (j==jmax) continue; /* empty row */ 97 col = *j++ + ishift; 98 i2 = tvc[col]; 99 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 100 ia[i1+1]++; 101 ia[i2+1]++; 102 i2++; /* Start col of next node */ 103 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 104 i2 = tvc[col]; 105 } 106 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 107 } 108 109 /* shift ia[i] to point to next row */ 110 for (i1=1; i1<nslim_row+1; i1++) { 111 row = ia[i1-1]; 112 ia[i1] += row; 113 work[i1-1] = row - oshift; 114 } 115 116 /* allocate space for column pointers */ 117 nz = ia[nslim_row] + (!ishift); 118 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 119 *jja = ja; 120 121 /* loop over lower triangular part putting into ja */ 122 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 123 j = aj + ai[row] + ishift; 124 jmax = aj + ai[row+1] + ishift; 125 if (j==jmax) continue; /* empty row */ 126 col = *j++ + ishift; 127 i2 = tvc[col]; 128 while (i2<i1 && j<jmax) { 129 ja[work[i2]++] = i1 + oshift; 130 ja[work[i1]++] = i2 + oshift; 131 ++i2; 132 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 133 i2 = tvc[col]; 134 } 135 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 136 137 } 138 ierr = PetscFree(work);CHKERRQ(ierr); 139 ierr = PetscFree(tns);CHKERRQ(ierr); 140 ierr = PetscFree(tvc);CHKERRQ(ierr); 141 PetscFunctionReturn(0); 142 } 143 144 /* 145 This builds nonsymmetric version of nonzero structure, 146 */ 147 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 148 { 149 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 150 PetscErrorCode ierr; 151 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 152 PetscInt *tns,*tvc,nsz,i1,i2; 153 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 154 155 PetscFunctionBegin; 156 nslim_row = a->inode.node_count; 157 n = A->cmap->n; 158 159 /* Create The column_inode for this matrix */ 160 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 161 162 /* allocate space for reformated column_inode structure */ 163 ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr); 164 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 165 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 166 167 for (i1=0,col=0; i1<nslim_col; ++i1) { 168 nsz = ns_col[i1]; 169 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 170 } 171 /* allocate space for row pointers */ 172 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 173 *iia = ia; 174 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 175 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 176 177 /* determine the number of columns in each row */ 178 ia[0] = oshift; 179 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 180 j = aj + ai[row] + ishift; 181 nz = ai[row+1] - ai[row]; 182 if (!nz) continue; /* empty row */ 183 col = *j++ + ishift; 184 i2 = tvc[col]; 185 while (nz-- > 0) { /* off-diagonal elemets */ 186 ia[i1+1]++; 187 i2++; /* Start col of next node */ 188 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 189 if (nz > 0) i2 = tvc[col]; 190 } 191 } 192 193 /* shift ia[i] to point to next row */ 194 for (i1=1; i1<nslim_row+1; i1++) { 195 row = ia[i1-1]; 196 ia[i1] += row; 197 work[i1-1] = row - oshift; 198 } 199 200 /* allocate space for column pointers */ 201 nz = ia[nslim_row] + (!ishift); 202 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 203 *jja = ja; 204 205 /* loop over matrix putting into ja */ 206 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 207 j = aj + ai[row] + ishift; 208 nz = ai[row+1] - ai[row]; 209 if (!nz) continue; /* empty row */ 210 col = *j++ + ishift; 211 i2 = tvc[col]; 212 while (nz-- > 0) { 213 ja[work[i1]++] = i2 + oshift; 214 ++i2; 215 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 216 if (nz > 0) i2 = tvc[col]; 217 } 218 } 219 ierr = PetscFree(ns_col);CHKERRQ(ierr); 220 ierr = PetscFree(work);CHKERRQ(ierr); 221 ierr = PetscFree(tns);CHKERRQ(ierr); 222 ierr = PetscFree(tvc);CHKERRQ(ierr); 223 PetscFunctionReturn(0); 224 } 225 226 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 227 { 228 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 229 PetscErrorCode ierr; 230 231 PetscFunctionBegin; 232 *n = a->inode.node_count; 233 if (!ia) PetscFunctionReturn(0); 234 if (!blockcompressed) { 235 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 236 } else if (symmetric) { 237 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 238 } else { 239 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 240 } 241 PetscFunctionReturn(0); 242 } 243 244 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 245 { 246 PetscErrorCode ierr; 247 248 PetscFunctionBegin; 249 if (!ia) PetscFunctionReturn(0); 250 251 if (!blockcompressed) { 252 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 253 } else { 254 ierr = PetscFree(*ia);CHKERRQ(ierr); 255 ierr = PetscFree(*ja);CHKERRQ(ierr); 256 } 257 PetscFunctionReturn(0); 258 } 259 260 /* ----------------------------------------------------------- */ 261 262 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 263 { 264 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 265 PetscErrorCode ierr; 266 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 267 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 268 269 PetscFunctionBegin; 270 nslim_row = a->inode.node_count; 271 n = A->cmap->n; 272 273 /* Create The column_inode for this matrix */ 274 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 275 276 /* allocate space for reformated column_inode structure */ 277 ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr); 278 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 279 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 280 281 for (i1=0,col=0; i1<nslim_col; ++i1) { 282 nsz = ns_col[i1]; 283 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 284 } 285 /* allocate space for column pointers */ 286 ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 287 *iia = ia; 288 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 289 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 290 291 /* determine the number of columns in each row */ 292 ia[0] = oshift; 293 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 294 j = aj + ai[row] + ishift; 295 col = *j++ + ishift; 296 i2 = tvc[col]; 297 nz = ai[row+1] - ai[row]; 298 while (nz-- > 0) { /* off-diagonal elemets */ 299 /* ia[i1+1]++; */ 300 ia[i2+1]++; 301 i2++; 302 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 303 if (nz > 0) i2 = tvc[col]; 304 } 305 } 306 307 /* shift ia[i] to point to next col */ 308 for (i1=1; i1<nslim_col+1; i1++) { 309 col = ia[i1-1]; 310 ia[i1] += col; 311 work[i1-1] = col - oshift; 312 } 313 314 /* allocate space for column pointers */ 315 nz = ia[nslim_col] + (!ishift); 316 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 317 *jja = ja; 318 319 /* loop over matrix putting into ja */ 320 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 321 j = aj + ai[row] + ishift; 322 col = *j++ + ishift; 323 i2 = tvc[col]; 324 nz = ai[row+1] - ai[row]; 325 while (nz-- > 0) { 326 /* ja[work[i1]++] = i2 + oshift; */ 327 ja[work[i2]++] = i1 + oshift; 328 i2++; 329 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 330 if (nz > 0) i2 = tvc[col]; 331 } 332 } 333 ierr = PetscFree(ns_col);CHKERRQ(ierr); 334 ierr = PetscFree(work);CHKERRQ(ierr); 335 ierr = PetscFree(tns);CHKERRQ(ierr); 336 ierr = PetscFree(tvc);CHKERRQ(ierr); 337 PetscFunctionReturn(0); 338 } 339 340 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 341 { 342 PetscErrorCode ierr; 343 344 PetscFunctionBegin; 345 ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr); 346 if (!ia) PetscFunctionReturn(0); 347 348 if (!blockcompressed) { 349 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 350 } else if (symmetric) { 351 /* Since the indices are symmetric it does'nt matter */ 352 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 353 } else { 354 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 355 } 356 PetscFunctionReturn(0); 357 } 358 359 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 360 { 361 PetscErrorCode ierr; 362 363 PetscFunctionBegin; 364 if (!ia) PetscFunctionReturn(0); 365 if (!blockcompressed) { 366 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else { 368 ierr = PetscFree(*ia);CHKERRQ(ierr); 369 ierr = PetscFree(*ja);CHKERRQ(ierr); 370 } 371 PetscFunctionReturn(0); 372 } 373 374 /* ----------------------------------------------------------- */ 375 376 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 377 { 378 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 379 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 380 PetscScalar *y; 381 const PetscScalar *x; 382 const MatScalar *v1,*v2,*v3,*v4,*v5; 383 PetscErrorCode ierr; 384 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 385 const PetscInt *idx,*ns,*ii; 386 387 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 388 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 389 #endif 390 391 PetscFunctionBegin; 392 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 393 node_max = a->inode.node_count; 394 ns = a->inode.size; /* Node Size array */ 395 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 396 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 397 idx = a->j; 398 v1 = a->a; 399 ii = a->i; 400 401 for (i = 0,row = 0; i< node_max; ++i) { 402 nsz = ns[i]; 403 n = ii[1] - ii[0]; 404 nonzerorow += (n>0)*nsz; 405 ii += nsz; 406 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 407 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 408 sz = n; /* No of non zeros in this row */ 409 /* Switch on the size of Node */ 410 switch (nsz) { /* Each loop in 'case' is unrolled */ 411 case 1: 412 sum1 = 0.; 413 414 for (n = 0; n< sz-1; n+=2) { 415 i1 = idx[0]; /* The instructions are ordered to */ 416 i2 = idx[1]; /* make the compiler's job easy */ 417 idx += 2; 418 tmp0 = x[i1]; 419 tmp1 = x[i2]; 420 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 421 } 422 423 if (n == sz-1) { /* Take care of the last nonzero */ 424 tmp0 = x[*idx++]; 425 sum1 += *v1++ *tmp0; 426 } 427 y[row++]=sum1; 428 break; 429 case 2: 430 sum1 = 0.; 431 sum2 = 0.; 432 v2 = v1 + n; 433 434 for (n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; 436 i2 = idx[1]; 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 442 } 443 if (n == sz-1) { 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 sum2 += *v2++ * tmp0; 447 } 448 y[row++]=sum1; 449 y[row++]=sum2; 450 v1 =v2; /* Since the next block to be processed starts there*/ 451 idx +=sz; 452 break; 453 case 3: 454 sum1 = 0.; 455 sum2 = 0.; 456 sum3 = 0.; 457 v2 = v1 + n; 458 v3 = v2 + n; 459 460 for (n = 0; n< sz-1; n+=2) { 461 i1 = idx[0]; 462 i2 = idx[1]; 463 idx += 2; 464 tmp0 = x[i1]; 465 tmp1 = x[i2]; 466 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 467 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 468 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 469 } 470 if (n == sz-1) { 471 tmp0 = x[*idx++]; 472 sum1 += *v1++ * tmp0; 473 sum2 += *v2++ * tmp0; 474 sum3 += *v3++ * tmp0; 475 } 476 y[row++]=sum1; 477 y[row++]=sum2; 478 y[row++]=sum3; 479 v1 =v3; /* Since the next block to be processed starts there*/ 480 idx +=2*sz; 481 break; 482 case 4: 483 sum1 = 0.; 484 sum2 = 0.; 485 sum3 = 0.; 486 sum4 = 0.; 487 v2 = v1 + n; 488 v3 = v2 + n; 489 v4 = v3 + n; 490 491 for (n = 0; n< sz-1; n+=2) { 492 i1 = idx[0]; 493 i2 = idx[1]; 494 idx += 2; 495 tmp0 = x[i1]; 496 tmp1 = x[i2]; 497 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 498 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 499 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 500 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 501 } 502 if (n == sz-1) { 503 tmp0 = x[*idx++]; 504 sum1 += *v1++ * tmp0; 505 sum2 += *v2++ * tmp0; 506 sum3 += *v3++ * tmp0; 507 sum4 += *v4++ * tmp0; 508 } 509 y[row++]=sum1; 510 y[row++]=sum2; 511 y[row++]=sum3; 512 y[row++]=sum4; 513 v1 =v4; /* Since the next block to be processed starts there*/ 514 idx +=3*sz; 515 break; 516 case 5: 517 sum1 = 0.; 518 sum2 = 0.; 519 sum3 = 0.; 520 sum4 = 0.; 521 sum5 = 0.; 522 v2 = v1 + n; 523 v3 = v2 + n; 524 v4 = v3 + n; 525 v5 = v4 + n; 526 527 for (n = 0; n<sz-1; n+=2) { 528 i1 = idx[0]; 529 i2 = idx[1]; 530 idx += 2; 531 tmp0 = x[i1]; 532 tmp1 = x[i2]; 533 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 534 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 535 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 536 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 537 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 538 } 539 if (n == sz-1) { 540 tmp0 = x[*idx++]; 541 sum1 += *v1++ * tmp0; 542 sum2 += *v2++ * tmp0; 543 sum3 += *v3++ * tmp0; 544 sum4 += *v4++ * tmp0; 545 sum5 += *v5++ * tmp0; 546 } 547 y[row++]=sum1; 548 y[row++]=sum2; 549 y[row++]=sum3; 550 y[row++]=sum4; 551 y[row++]=sum5; 552 v1 =v5; /* Since the next block to be processed starts there */ 553 idx +=4*sz; 554 break; 555 default: 556 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 557 } 558 } 559 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 560 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 561 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } 564 /* ----------------------------------------------------------- */ 565 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 566 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 567 { 568 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 569 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 570 const MatScalar *v1,*v2,*v3,*v4,*v5; 571 const PetscScalar *x; 572 PetscScalar *y,*z,*zt; 573 PetscErrorCode ierr; 574 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 575 const PetscInt *idx,*ns,*ii; 576 577 PetscFunctionBegin; 578 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 579 node_max = a->inode.node_count; 580 ns = a->inode.size; /* Node Size array */ 581 582 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 583 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 584 zt = z; 585 586 idx = a->j; 587 v1 = a->a; 588 ii = a->i; 589 590 for (i = 0,row = 0; i< node_max; ++i) { 591 nsz = ns[i]; 592 n = ii[1] - ii[0]; 593 ii += nsz; 594 sz = n; /* No of non zeros in this row */ 595 /* Switch on the size of Node */ 596 switch (nsz) { /* Each loop in 'case' is unrolled */ 597 case 1: 598 sum1 = *zt++; 599 600 for (n = 0; n< sz-1; n+=2) { 601 i1 = idx[0]; /* The instructions are ordered to */ 602 i2 = idx[1]; /* make the compiler's job easy */ 603 idx += 2; 604 tmp0 = x[i1]; 605 tmp1 = x[i2]; 606 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 607 } 608 609 if (n == sz-1) { /* Take care of the last nonzero */ 610 tmp0 = x[*idx++]; 611 sum1 += *v1++ * tmp0; 612 } 613 y[row++]=sum1; 614 break; 615 case 2: 616 sum1 = *zt++; 617 sum2 = *zt++; 618 v2 = v1 + n; 619 620 for (n = 0; n< sz-1; n+=2) { 621 i1 = idx[0]; 622 i2 = idx[1]; 623 idx += 2; 624 tmp0 = x[i1]; 625 tmp1 = x[i2]; 626 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 627 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 628 } 629 if (n == sz-1) { 630 tmp0 = x[*idx++]; 631 sum1 += *v1++ * tmp0; 632 sum2 += *v2++ * tmp0; 633 } 634 y[row++]=sum1; 635 y[row++]=sum2; 636 v1 =v2; /* Since the next block to be processed starts there*/ 637 idx +=sz; 638 break; 639 case 3: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 sum3 = *zt++; 643 v2 = v1 + n; 644 v3 = v2 + n; 645 646 for (n = 0; n< sz-1; n+=2) { 647 i1 = idx[0]; 648 i2 = idx[1]; 649 idx += 2; 650 tmp0 = x[i1]; 651 tmp1 = x[i2]; 652 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 653 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 654 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 655 } 656 if (n == sz-1) { 657 tmp0 = x[*idx++]; 658 sum1 += *v1++ * tmp0; 659 sum2 += *v2++ * tmp0; 660 sum3 += *v3++ * tmp0; 661 } 662 y[row++]=sum1; 663 y[row++]=sum2; 664 y[row++]=sum3; 665 v1 =v3; /* Since the next block to be processed starts there*/ 666 idx +=2*sz; 667 break; 668 case 4: 669 sum1 = *zt++; 670 sum2 = *zt++; 671 sum3 = *zt++; 672 sum4 = *zt++; 673 v2 = v1 + n; 674 v3 = v2 + n; 675 v4 = v3 + n; 676 677 for (n = 0; n< sz-1; n+=2) { 678 i1 = idx[0]; 679 i2 = idx[1]; 680 idx += 2; 681 tmp0 = x[i1]; 682 tmp1 = x[i2]; 683 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 684 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 685 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 686 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 687 } 688 if (n == sz-1) { 689 tmp0 = x[*idx++]; 690 sum1 += *v1++ * tmp0; 691 sum2 += *v2++ * tmp0; 692 sum3 += *v3++ * tmp0; 693 sum4 += *v4++ * tmp0; 694 } 695 y[row++]=sum1; 696 y[row++]=sum2; 697 y[row++]=sum3; 698 y[row++]=sum4; 699 v1 =v4; /* Since the next block to be processed starts there*/ 700 idx +=3*sz; 701 break; 702 case 5: 703 sum1 = *zt++; 704 sum2 = *zt++; 705 sum3 = *zt++; 706 sum4 = *zt++; 707 sum5 = *zt++; 708 v2 = v1 + n; 709 v3 = v2 + n; 710 v4 = v3 + n; 711 v5 = v4 + n; 712 713 for (n = 0; n<sz-1; n+=2) { 714 i1 = idx[0]; 715 i2 = idx[1]; 716 idx += 2; 717 tmp0 = x[i1]; 718 tmp1 = x[i2]; 719 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 720 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 721 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 722 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 723 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 724 } 725 if (n == sz-1) { 726 tmp0 = x[*idx++]; 727 sum1 += *v1++ * tmp0; 728 sum2 += *v2++ * tmp0; 729 sum3 += *v3++ * tmp0; 730 sum4 += *v4++ * tmp0; 731 sum5 += *v5++ * tmp0; 732 } 733 y[row++]=sum1; 734 y[row++]=sum2; 735 y[row++]=sum3; 736 y[row++]=sum4; 737 y[row++]=sum5; 738 v1 =v5; /* Since the next block to be processed starts there */ 739 idx +=4*sz; 740 break; 741 default: 742 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 743 } 744 } 745 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 746 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 747 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 748 PetscFunctionReturn(0); 749 } 750 751 /* ----------------------------------------------------------- */ 752 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 753 { 754 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 755 IS iscol = a->col,isrow = a->row; 756 PetscErrorCode ierr; 757 const PetscInt *r,*c,*rout,*cout; 758 PetscInt i,j,n = A->rmap->n,nz; 759 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 760 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 761 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 762 PetscScalar sum1,sum2,sum3,sum4,sum5; 763 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 764 const PetscScalar *b; 765 766 PetscFunctionBegin; 767 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 768 node_max = a->inode.node_count; 769 ns = a->inode.size; /* Node Size array */ 770 771 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 772 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 773 tmp = a->solve_work; 774 775 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 776 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 777 778 /* forward solve the lower triangular */ 779 tmps = tmp; 780 aa = a_a; 781 aj = a_j; 782 ad = a->diag; 783 784 for (i = 0,row = 0; i< node_max; ++i) { 785 nsz = ns[i]; 786 aii = ai[row]; 787 v1 = aa + aii; 788 vi = aj + aii; 789 nz = ad[row]- aii; 790 if (i < node_max-1) { 791 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 792 * but our indexing to determine it's size could. */ 793 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 794 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 795 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 796 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 797 } 798 799 switch (nsz) { /* Each loop in 'case' is unrolled */ 800 case 1: 801 sum1 = b[*r++]; 802 for (j=0; j<nz-1; j+=2) { 803 i0 = vi[0]; 804 i1 = vi[1]; 805 vi +=2; 806 tmp0 = tmps[i0]; 807 tmp1 = tmps[i1]; 808 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 809 } 810 if (j == nz-1) { 811 tmp0 = tmps[*vi++]; 812 sum1 -= *v1++ *tmp0; 813 } 814 tmp[row++]=sum1; 815 break; 816 case 2: 817 sum1 = b[*r++]; 818 sum2 = b[*r++]; 819 v2 = aa + ai[row+1]; 820 821 for (j=0; j<nz-1; j+=2) { 822 i0 = vi[0]; 823 i1 = vi[1]; 824 vi +=2; 825 tmp0 = tmps[i0]; 826 tmp1 = tmps[i1]; 827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 828 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 829 } 830 if (j == nz-1) { 831 tmp0 = tmps[*vi++]; 832 sum1 -= *v1++ *tmp0; 833 sum2 -= *v2++ *tmp0; 834 } 835 sum2 -= *v2++ *sum1; 836 tmp[row++]=sum1; 837 tmp[row++]=sum2; 838 break; 839 case 3: 840 sum1 = b[*r++]; 841 sum2 = b[*r++]; 842 sum3 = b[*r++]; 843 v2 = aa + ai[row+1]; 844 v3 = aa + ai[row+2]; 845 846 for (j=0; j<nz-1; j+=2) { 847 i0 = vi[0]; 848 i1 = vi[1]; 849 vi +=2; 850 tmp0 = tmps[i0]; 851 tmp1 = tmps[i1]; 852 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 853 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 854 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 855 } 856 if (j == nz-1) { 857 tmp0 = tmps[*vi++]; 858 sum1 -= *v1++ *tmp0; 859 sum2 -= *v2++ *tmp0; 860 sum3 -= *v3++ *tmp0; 861 } 862 sum2 -= *v2++ * sum1; 863 sum3 -= *v3++ * sum1; 864 sum3 -= *v3++ * sum2; 865 866 tmp[row++]=sum1; 867 tmp[row++]=sum2; 868 tmp[row++]=sum3; 869 break; 870 871 case 4: 872 sum1 = b[*r++]; 873 sum2 = b[*r++]; 874 sum3 = b[*r++]; 875 sum4 = b[*r++]; 876 v2 = aa + ai[row+1]; 877 v3 = aa + ai[row+2]; 878 v4 = aa + ai[row+3]; 879 880 for (j=0; j<nz-1; j+=2) { 881 i0 = vi[0]; 882 i1 = vi[1]; 883 vi +=2; 884 tmp0 = tmps[i0]; 885 tmp1 = tmps[i1]; 886 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 887 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 888 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 889 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 890 } 891 if (j == nz-1) { 892 tmp0 = tmps[*vi++]; 893 sum1 -= *v1++ *tmp0; 894 sum2 -= *v2++ *tmp0; 895 sum3 -= *v3++ *tmp0; 896 sum4 -= *v4++ *tmp0; 897 } 898 sum2 -= *v2++ * sum1; 899 sum3 -= *v3++ * sum1; 900 sum4 -= *v4++ * sum1; 901 sum3 -= *v3++ * sum2; 902 sum4 -= *v4++ * sum2; 903 sum4 -= *v4++ * sum3; 904 905 tmp[row++]=sum1; 906 tmp[row++]=sum2; 907 tmp[row++]=sum3; 908 tmp[row++]=sum4; 909 break; 910 case 5: 911 sum1 = b[*r++]; 912 sum2 = b[*r++]; 913 sum3 = b[*r++]; 914 sum4 = b[*r++]; 915 sum5 = b[*r++]; 916 v2 = aa + ai[row+1]; 917 v3 = aa + ai[row+2]; 918 v4 = aa + ai[row+3]; 919 v5 = aa + ai[row+4]; 920 921 for (j=0; j<nz-1; j+=2) { 922 i0 = vi[0]; 923 i1 = vi[1]; 924 vi +=2; 925 tmp0 = tmps[i0]; 926 tmp1 = tmps[i1]; 927 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 928 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 929 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 930 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 931 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 932 } 933 if (j == nz-1) { 934 tmp0 = tmps[*vi++]; 935 sum1 -= *v1++ *tmp0; 936 sum2 -= *v2++ *tmp0; 937 sum3 -= *v3++ *tmp0; 938 sum4 -= *v4++ *tmp0; 939 sum5 -= *v5++ *tmp0; 940 } 941 942 sum2 -= *v2++ * sum1; 943 sum3 -= *v3++ * sum1; 944 sum4 -= *v4++ * sum1; 945 sum5 -= *v5++ * sum1; 946 sum3 -= *v3++ * sum2; 947 sum4 -= *v4++ * sum2; 948 sum5 -= *v5++ * sum2; 949 sum4 -= *v4++ * sum3; 950 sum5 -= *v5++ * sum3; 951 sum5 -= *v5++ * sum4; 952 953 tmp[row++]=sum1; 954 tmp[row++]=sum2; 955 tmp[row++]=sum3; 956 tmp[row++]=sum4; 957 tmp[row++]=sum5; 958 break; 959 default: 960 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 961 } 962 } 963 /* backward solve the upper triangular */ 964 for (i=node_max -1,row = n-1; i>=0; i--) { 965 nsz = ns[i]; 966 aii = ai[row+1] -1; 967 v1 = aa + aii; 968 vi = aj + aii; 969 nz = aii- ad[row]; 970 switch (nsz) { /* Each loop in 'case' is unrolled */ 971 case 1: 972 sum1 = tmp[row]; 973 974 for (j=nz; j>1; j-=2) { 975 vi -=2; 976 i0 = vi[2]; 977 i1 = vi[1]; 978 tmp0 = tmps[i0]; 979 tmp1 = tmps[i1]; 980 v1 -= 2; 981 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 982 } 983 if (j==1) { 984 tmp0 = tmps[*vi--]; 985 sum1 -= *v1-- * tmp0; 986 } 987 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 988 break; 989 case 2: 990 sum1 = tmp[row]; 991 sum2 = tmp[row -1]; 992 v2 = aa + ai[row]-1; 993 for (j=nz; j>1; j-=2) { 994 vi -=2; 995 i0 = vi[2]; 996 i1 = vi[1]; 997 tmp0 = tmps[i0]; 998 tmp1 = tmps[i1]; 999 v1 -= 2; 1000 v2 -= 2; 1001 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1002 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1003 } 1004 if (j==1) { 1005 tmp0 = tmps[*vi--]; 1006 sum1 -= *v1-- * tmp0; 1007 sum2 -= *v2-- * tmp0; 1008 } 1009 1010 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1011 sum2 -= *v2-- * tmp0; 1012 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1013 break; 1014 case 3: 1015 sum1 = tmp[row]; 1016 sum2 = tmp[row -1]; 1017 sum3 = tmp[row -2]; 1018 v2 = aa + ai[row]-1; 1019 v3 = aa + ai[row -1]-1; 1020 for (j=nz; j>1; j-=2) { 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 v3 -= 2; 1029 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1030 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1031 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1032 } 1033 if (j==1) { 1034 tmp0 = tmps[*vi--]; 1035 sum1 -= *v1-- * tmp0; 1036 sum2 -= *v2-- * tmp0; 1037 sum3 -= *v3-- * tmp0; 1038 } 1039 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1040 sum2 -= *v2-- * tmp0; 1041 sum3 -= *v3-- * tmp0; 1042 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1043 sum3 -= *v3-- * tmp0; 1044 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1045 1046 break; 1047 case 4: 1048 sum1 = tmp[row]; 1049 sum2 = tmp[row -1]; 1050 sum3 = tmp[row -2]; 1051 sum4 = tmp[row -3]; 1052 v2 = aa + ai[row]-1; 1053 v3 = aa + ai[row -1]-1; 1054 v4 = aa + ai[row -2]-1; 1055 1056 for (j=nz; j>1; j-=2) { 1057 vi -=2; 1058 i0 = vi[2]; 1059 i1 = vi[1]; 1060 tmp0 = tmps[i0]; 1061 tmp1 = tmps[i1]; 1062 v1 -= 2; 1063 v2 -= 2; 1064 v3 -= 2; 1065 v4 -= 2; 1066 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1067 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1068 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1069 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1070 } 1071 if (j==1) { 1072 tmp0 = tmps[*vi--]; 1073 sum1 -= *v1-- * tmp0; 1074 sum2 -= *v2-- * tmp0; 1075 sum3 -= *v3-- * tmp0; 1076 sum4 -= *v4-- * tmp0; 1077 } 1078 1079 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1080 sum2 -= *v2-- * tmp0; 1081 sum3 -= *v3-- * tmp0; 1082 sum4 -= *v4-- * tmp0; 1083 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1084 sum3 -= *v3-- * tmp0; 1085 sum4 -= *v4-- * tmp0; 1086 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1087 sum4 -= *v4-- * tmp0; 1088 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1089 break; 1090 case 5: 1091 sum1 = tmp[row]; 1092 sum2 = tmp[row -1]; 1093 sum3 = tmp[row -2]; 1094 sum4 = tmp[row -3]; 1095 sum5 = tmp[row -4]; 1096 v2 = aa + ai[row]-1; 1097 v3 = aa + ai[row -1]-1; 1098 v4 = aa + ai[row -2]-1; 1099 v5 = aa + ai[row -3]-1; 1100 for (j=nz; j>1; j-=2) { 1101 vi -= 2; 1102 i0 = vi[2]; 1103 i1 = vi[1]; 1104 tmp0 = tmps[i0]; 1105 tmp1 = tmps[i1]; 1106 v1 -= 2; 1107 v2 -= 2; 1108 v3 -= 2; 1109 v4 -= 2; 1110 v5 -= 2; 1111 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1112 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1113 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1114 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1115 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1116 } 1117 if (j==1) { 1118 tmp0 = tmps[*vi--]; 1119 sum1 -= *v1-- * tmp0; 1120 sum2 -= *v2-- * tmp0; 1121 sum3 -= *v3-- * tmp0; 1122 sum4 -= *v4-- * tmp0; 1123 sum5 -= *v5-- * tmp0; 1124 } 1125 1126 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1127 sum2 -= *v2-- * tmp0; 1128 sum3 -= *v3-- * tmp0; 1129 sum4 -= *v4-- * tmp0; 1130 sum5 -= *v5-- * tmp0; 1131 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1132 sum3 -= *v3-- * tmp0; 1133 sum4 -= *v4-- * tmp0; 1134 sum5 -= *v5-- * tmp0; 1135 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1136 sum4 -= *v4-- * tmp0; 1137 sum5 -= *v5-- * tmp0; 1138 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1139 sum5 -= *v5-- * tmp0; 1140 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1141 break; 1142 default: 1143 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1144 } 1145 } 1146 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1147 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1148 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1149 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1150 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1151 PetscFunctionReturn(0); 1152 } 1153 1154 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1155 { 1156 Mat C =B; 1157 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1158 IS isrow = b->row,isicol = b->icol; 1159 PetscErrorCode ierr; 1160 const PetscInt *r,*ic,*ics; 1161 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1162 PetscInt i,j,k,nz,nzL,row,*pj; 1163 const PetscInt *ajtmp,*bjtmp; 1164 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1165 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1166 FactorShiftCtx sctx; 1167 const PetscInt *ddiag; 1168 PetscReal rs; 1169 MatScalar d; 1170 PetscInt inod,nodesz,node_max,col; 1171 const PetscInt *ns; 1172 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1173 1174 PetscFunctionBegin; 1175 /* MatPivotSetUp(): initialize shift context sctx */ 1176 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1177 1178 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1179 ddiag = a->diag; 1180 sctx.shift_top = info->zeropivot; 1181 for (i=0; i<n; i++) { 1182 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1183 d = (aa)[ddiag[i]]; 1184 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1185 v = aa+ai[i]; 1186 nz = ai[i+1] - ai[i]; 1187 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1188 if (rs>sctx.shift_top) sctx.shift_top = rs; 1189 } 1190 sctx.shift_top *= 1.1; 1191 sctx.nshift_max = 5; 1192 sctx.shift_lo = 0.; 1193 sctx.shift_hi = 1.; 1194 } 1195 1196 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1197 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1198 1199 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1200 ics = ic; 1201 1202 node_max = a->inode.node_count; 1203 ns = a->inode.size; 1204 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1205 1206 /* If max inode size > 4, split it into two inodes.*/ 1207 /* also map the inode sizes according to the ordering */ 1208 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1209 for (i=0,j=0; i<node_max; ++i,++j) { 1210 if (ns[i] > 4) { 1211 tmp_vec1[j] = 4; 1212 ++j; 1213 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1214 } else { 1215 tmp_vec1[j] = ns[i]; 1216 } 1217 } 1218 /* Use the correct node_max */ 1219 node_max = j; 1220 1221 /* Now reorder the inode info based on mat re-ordering info */ 1222 /* First create a row -> inode_size_array_index map */ 1223 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1224 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1225 for (i=0,row=0; i<node_max; i++) { 1226 nodesz = tmp_vec1[i]; 1227 for (j=0; j<nodesz; j++,row++) { 1228 nsmap[row] = i; 1229 } 1230 } 1231 /* Using nsmap, create a reordered ns structure */ 1232 for (i=0,j=0; i< node_max; i++) { 1233 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1234 tmp_vec2[i] = nodesz; 1235 j += nodesz; 1236 } 1237 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1238 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1239 1240 /* Now use the correct ns */ 1241 ns = tmp_vec2; 1242 1243 do { 1244 sctx.newshift = PETSC_FALSE; 1245 /* Now loop over each block-row, and do the factorization */ 1246 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1247 nodesz = ns[inod]; 1248 1249 switch (nodesz) { 1250 case 1: 1251 /*----------*/ 1252 /* zero rtmp1 */ 1253 /* L part */ 1254 nz = bi[i+1] - bi[i]; 1255 bjtmp = bj + bi[i]; 1256 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1257 1258 /* U part */ 1259 nz = bdiag[i]-bdiag[i+1]; 1260 bjtmp = bj + bdiag[i+1]+1; 1261 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1262 1263 /* load in initial (unfactored row) */ 1264 nz = ai[r[i]+1] - ai[r[i]]; 1265 ajtmp = aj + ai[r[i]]; 1266 v = aa + ai[r[i]]; 1267 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1268 1269 /* ZeropivotApply() */ 1270 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1271 1272 /* elimination */ 1273 bjtmp = bj + bi[i]; 1274 row = *bjtmp++; 1275 nzL = bi[i+1] - bi[i]; 1276 for (k=0; k < nzL; k++) { 1277 pc = rtmp1 + row; 1278 if (*pc != 0.0) { 1279 pv = b->a + bdiag[row]; 1280 mul1 = *pc * (*pv); 1281 *pc = mul1; 1282 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1283 pv = b->a + bdiag[row+1]+1; 1284 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1285 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1286 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1287 } 1288 row = *bjtmp++; 1289 } 1290 1291 /* finished row so stick it into b->a */ 1292 rs = 0.0; 1293 /* L part */ 1294 pv = b->a + bi[i]; 1295 pj = b->j + bi[i]; 1296 nz = bi[i+1] - bi[i]; 1297 for (j=0; j<nz; j++) { 1298 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1299 } 1300 1301 /* U part */ 1302 pv = b->a + bdiag[i+1]+1; 1303 pj = b->j + bdiag[i+1]+1; 1304 nz = bdiag[i] - bdiag[i+1]-1; 1305 for (j=0; j<nz; j++) { 1306 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1307 } 1308 1309 /* Check zero pivot */ 1310 sctx.rs = rs; 1311 sctx.pv = rtmp1[i]; 1312 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1313 if (sctx.newshift) break; 1314 1315 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1316 pv = b->a + bdiag[i]; 1317 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1318 break; 1319 1320 case 2: 1321 /*----------*/ 1322 /* zero rtmp1 and rtmp2 */ 1323 /* L part */ 1324 nz = bi[i+1] - bi[i]; 1325 bjtmp = bj + bi[i]; 1326 for (j=0; j<nz; j++) { 1327 col = bjtmp[j]; 1328 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1329 } 1330 1331 /* U part */ 1332 nz = bdiag[i]-bdiag[i+1]; 1333 bjtmp = bj + bdiag[i+1]+1; 1334 for (j=0; j<nz; j++) { 1335 col = bjtmp[j]; 1336 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1337 } 1338 1339 /* load in initial (unfactored row) */ 1340 nz = ai[r[i]+1] - ai[r[i]]; 1341 ajtmp = aj + ai[r[i]]; 1342 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1343 for (j=0; j<nz; j++) { 1344 col = ics[ajtmp[j]]; 1345 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1346 } 1347 /* ZeropivotApply(): shift the diagonal of the matrix */ 1348 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1349 1350 /* elimination */ 1351 bjtmp = bj + bi[i]; 1352 row = *bjtmp++; /* pivot row */ 1353 nzL = bi[i+1] - bi[i]; 1354 for (k=0; k < nzL; k++) { 1355 pc1 = rtmp1 + row; 1356 pc2 = rtmp2 + row; 1357 if (*pc1 != 0.0 || *pc2 != 0.0) { 1358 pv = b->a + bdiag[row]; 1359 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1360 *pc1 = mul1; *pc2 = mul2; 1361 1362 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1363 pv = b->a + bdiag[row+1]+1; 1364 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1365 for (j=0; j<nz; j++) { 1366 col = pj[j]; 1367 rtmp1[col] -= mul1 * pv[j]; 1368 rtmp2[col] -= mul2 * pv[j]; 1369 } 1370 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1371 } 1372 row = *bjtmp++; 1373 } 1374 1375 /* finished row i; check zero pivot, then stick row i into b->a */ 1376 rs = 0.0; 1377 /* L part */ 1378 pc1 = b->a + bi[i]; 1379 pj = b->j + bi[i]; 1380 nz = bi[i+1] - bi[i]; 1381 for (j=0; j<nz; j++) { 1382 col = pj[j]; 1383 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1384 } 1385 /* U part */ 1386 pc1 = b->a + bdiag[i+1]+1; 1387 pj = b->j + bdiag[i+1]+1; 1388 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1389 for (j=0; j<nz; j++) { 1390 col = pj[j]; 1391 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1392 } 1393 1394 sctx.rs = rs; 1395 sctx.pv = rtmp1[i]; 1396 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1397 if (sctx.newshift) break; 1398 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1399 *pc1 = 1.0/sctx.pv; 1400 1401 /* Now take care of diagonal 2x2 block. */ 1402 pc2 = rtmp2 + i; 1403 if (*pc2 != 0.0) { 1404 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1405 *pc2 = mul1; /* insert L entry */ 1406 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1407 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1408 for (j=0; j<nz; j++) { 1409 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1410 } 1411 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1412 } 1413 1414 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1415 rs = 0.0; 1416 /* L part */ 1417 pc2 = b->a + bi[i+1]; 1418 pj = b->j + bi[i+1]; 1419 nz = bi[i+2] - bi[i+1]; 1420 for (j=0; j<nz; j++) { 1421 col = pj[j]; 1422 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1423 } 1424 /* U part */ 1425 pc2 = b->a + bdiag[i+2]+1; 1426 pj = b->j + bdiag[i+2]+1; 1427 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1428 for (j=0; j<nz; j++) { 1429 col = pj[j]; 1430 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1431 } 1432 1433 sctx.rs = rs; 1434 sctx.pv = rtmp2[i+1]; 1435 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1436 if (sctx.newshift) break; 1437 pc2 = b->a + bdiag[i+1]; 1438 *pc2 = 1.0/sctx.pv; 1439 break; 1440 1441 case 3: 1442 /*----------*/ 1443 /* zero rtmp */ 1444 /* L part */ 1445 nz = bi[i+1] - bi[i]; 1446 bjtmp = bj + bi[i]; 1447 for (j=0; j<nz; j++) { 1448 col = bjtmp[j]; 1449 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1450 } 1451 1452 /* U part */ 1453 nz = bdiag[i]-bdiag[i+1]; 1454 bjtmp = bj + bdiag[i+1]+1; 1455 for (j=0; j<nz; j++) { 1456 col = bjtmp[j]; 1457 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1458 } 1459 1460 /* load in initial (unfactored row) */ 1461 nz = ai[r[i]+1] - ai[r[i]]; 1462 ajtmp = aj + ai[r[i]]; 1463 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1464 for (j=0; j<nz; j++) { 1465 col = ics[ajtmp[j]]; 1466 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1467 } 1468 /* ZeropivotApply(): shift the diagonal of the matrix */ 1469 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1470 1471 /* elimination */ 1472 bjtmp = bj + bi[i]; 1473 row = *bjtmp++; /* pivot row */ 1474 nzL = bi[i+1] - bi[i]; 1475 for (k=0; k < nzL; k++) { 1476 pc1 = rtmp1 + row; 1477 pc2 = rtmp2 + row; 1478 pc3 = rtmp3 + row; 1479 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1480 pv = b->a + bdiag[row]; 1481 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1482 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1483 1484 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1485 pv = b->a + bdiag[row+1]+1; 1486 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1487 for (j=0; j<nz; j++) { 1488 col = pj[j]; 1489 rtmp1[col] -= mul1 * pv[j]; 1490 rtmp2[col] -= mul2 * pv[j]; 1491 rtmp3[col] -= mul3 * pv[j]; 1492 } 1493 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1494 } 1495 row = *bjtmp++; 1496 } 1497 1498 /* finished row i; check zero pivot, then stick row i into b->a */ 1499 rs = 0.0; 1500 /* L part */ 1501 pc1 = b->a + bi[i]; 1502 pj = b->j + bi[i]; 1503 nz = bi[i+1] - bi[i]; 1504 for (j=0; j<nz; j++) { 1505 col = pj[j]; 1506 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1507 } 1508 /* U part */ 1509 pc1 = b->a + bdiag[i+1]+1; 1510 pj = b->j + bdiag[i+1]+1; 1511 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1512 for (j=0; j<nz; j++) { 1513 col = pj[j]; 1514 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1515 } 1516 1517 sctx.rs = rs; 1518 sctx.pv = rtmp1[i]; 1519 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1520 if (sctx.newshift) break; 1521 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1522 *pc1 = 1.0/sctx.pv; 1523 1524 /* Now take care of 1st column of diagonal 3x3 block. */ 1525 pc2 = rtmp2 + i; 1526 pc3 = rtmp3 + i; 1527 if (*pc2 != 0.0 || *pc3 != 0.0) { 1528 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1529 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1530 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1531 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1532 for (j=0; j<nz; j++) { 1533 col = pj[j]; 1534 rtmp2[col] -= mul2 * rtmp1[col]; 1535 rtmp3[col] -= mul3 * rtmp1[col]; 1536 } 1537 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1538 } 1539 1540 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1541 rs = 0.0; 1542 /* L part */ 1543 pc2 = b->a + bi[i+1]; 1544 pj = b->j + bi[i+1]; 1545 nz = bi[i+2] - bi[i+1]; 1546 for (j=0; j<nz; j++) { 1547 col = pj[j]; 1548 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1549 } 1550 /* U part */ 1551 pc2 = b->a + bdiag[i+2]+1; 1552 pj = b->j + bdiag[i+2]+1; 1553 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1554 for (j=0; j<nz; j++) { 1555 col = pj[j]; 1556 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1557 } 1558 1559 sctx.rs = rs; 1560 sctx.pv = rtmp2[i+1]; 1561 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1562 if (sctx.newshift) break; 1563 pc2 = b->a + bdiag[i+1]; 1564 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1565 1566 /* Now take care of 2nd column of diagonal 3x3 block. */ 1567 pc3 = rtmp3 + i+1; 1568 if (*pc3 != 0.0) { 1569 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1570 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1571 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1572 for (j=0; j<nz; j++) { 1573 col = pj[j]; 1574 rtmp3[col] -= mul3 * rtmp2[col]; 1575 } 1576 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1577 } 1578 1579 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1580 rs = 0.0; 1581 /* L part */ 1582 pc3 = b->a + bi[i+2]; 1583 pj = b->j + bi[i+2]; 1584 nz = bi[i+3] - bi[i+2]; 1585 for (j=0; j<nz; j++) { 1586 col = pj[j]; 1587 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1588 } 1589 /* U part */ 1590 pc3 = b->a + bdiag[i+3]+1; 1591 pj = b->j + bdiag[i+3]+1; 1592 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1593 for (j=0; j<nz; j++) { 1594 col = pj[j]; 1595 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1596 } 1597 1598 sctx.rs = rs; 1599 sctx.pv = rtmp3[i+2]; 1600 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1601 if (sctx.newshift) break; 1602 pc3 = b->a + bdiag[i+2]; 1603 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1604 break; 1605 case 4: 1606 /*----------*/ 1607 /* zero rtmp */ 1608 /* L part */ 1609 nz = bi[i+1] - bi[i]; 1610 bjtmp = bj + bi[i]; 1611 for (j=0; j<nz; j++) { 1612 col = bjtmp[j]; 1613 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1614 } 1615 1616 /* U part */ 1617 nz = bdiag[i]-bdiag[i+1]; 1618 bjtmp = bj + bdiag[i+1]+1; 1619 for (j=0; j<nz; j++) { 1620 col = bjtmp[j]; 1621 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1622 } 1623 1624 /* load in initial (unfactored row) */ 1625 nz = ai[r[i]+1] - ai[r[i]]; 1626 ajtmp = aj + ai[r[i]]; 1627 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1628 for (j=0; j<nz; j++) { 1629 col = ics[ajtmp[j]]; 1630 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1631 } 1632 /* ZeropivotApply(): shift the diagonal of the matrix */ 1633 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1634 1635 /* elimination */ 1636 bjtmp = bj + bi[i]; 1637 row = *bjtmp++; /* pivot row */ 1638 nzL = bi[i+1] - bi[i]; 1639 for (k=0; k < nzL; k++) { 1640 pc1 = rtmp1 + row; 1641 pc2 = rtmp2 + row; 1642 pc3 = rtmp3 + row; 1643 pc4 = rtmp4 + row; 1644 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1645 pv = b->a + bdiag[row]; 1646 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1647 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1648 1649 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1650 pv = b->a + bdiag[row+1]+1; 1651 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1652 for (j=0; j<nz; j++) { 1653 col = pj[j]; 1654 rtmp1[col] -= mul1 * pv[j]; 1655 rtmp2[col] -= mul2 * pv[j]; 1656 rtmp3[col] -= mul3 * pv[j]; 1657 rtmp4[col] -= mul4 * pv[j]; 1658 } 1659 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1660 } 1661 row = *bjtmp++; 1662 } 1663 1664 /* finished row i; check zero pivot, then stick row i into b->a */ 1665 rs = 0.0; 1666 /* L part */ 1667 pc1 = b->a + bi[i]; 1668 pj = b->j + bi[i]; 1669 nz = bi[i+1] - bi[i]; 1670 for (j=0; j<nz; j++) { 1671 col = pj[j]; 1672 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1673 } 1674 /* U part */ 1675 pc1 = b->a + bdiag[i+1]+1; 1676 pj = b->j + bdiag[i+1]+1; 1677 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1678 for (j=0; j<nz; j++) { 1679 col = pj[j]; 1680 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1681 } 1682 1683 sctx.rs = rs; 1684 sctx.pv = rtmp1[i]; 1685 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1686 if (sctx.newshift) break; 1687 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1688 *pc1 = 1.0/sctx.pv; 1689 1690 /* Now take care of 1st column of diagonal 4x4 block. */ 1691 pc2 = rtmp2 + i; 1692 pc3 = rtmp3 + i; 1693 pc4 = rtmp4 + i; 1694 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1695 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1696 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1697 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1698 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1699 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1700 for (j=0; j<nz; j++) { 1701 col = pj[j]; 1702 rtmp2[col] -= mul2 * rtmp1[col]; 1703 rtmp3[col] -= mul3 * rtmp1[col]; 1704 rtmp4[col] -= mul4 * rtmp1[col]; 1705 } 1706 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1707 } 1708 1709 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1710 rs = 0.0; 1711 /* L part */ 1712 pc2 = b->a + bi[i+1]; 1713 pj = b->j + bi[i+1]; 1714 nz = bi[i+2] - bi[i+1]; 1715 for (j=0; j<nz; j++) { 1716 col = pj[j]; 1717 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1718 } 1719 /* U part */ 1720 pc2 = b->a + bdiag[i+2]+1; 1721 pj = b->j + bdiag[i+2]+1; 1722 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1723 for (j=0; j<nz; j++) { 1724 col = pj[j]; 1725 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1726 } 1727 1728 sctx.rs = rs; 1729 sctx.pv = rtmp2[i+1]; 1730 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1731 if (sctx.newshift) break; 1732 pc2 = b->a + bdiag[i+1]; 1733 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1734 1735 /* Now take care of 2nd column of diagonal 4x4 block. */ 1736 pc3 = rtmp3 + i+1; 1737 pc4 = rtmp4 + i+1; 1738 if (*pc3 != 0.0 || *pc4 != 0.0) { 1739 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1740 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1741 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1742 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1743 for (j=0; j<nz; j++) { 1744 col = pj[j]; 1745 rtmp3[col] -= mul3 * rtmp2[col]; 1746 rtmp4[col] -= mul4 * rtmp2[col]; 1747 } 1748 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1749 } 1750 1751 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1752 rs = 0.0; 1753 /* L part */ 1754 pc3 = b->a + bi[i+2]; 1755 pj = b->j + bi[i+2]; 1756 nz = bi[i+3] - bi[i+2]; 1757 for (j=0; j<nz; j++) { 1758 col = pj[j]; 1759 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1760 } 1761 /* U part */ 1762 pc3 = b->a + bdiag[i+3]+1; 1763 pj = b->j + bdiag[i+3]+1; 1764 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1765 for (j=0; j<nz; j++) { 1766 col = pj[j]; 1767 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1768 } 1769 1770 sctx.rs = rs; 1771 sctx.pv = rtmp3[i+2]; 1772 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1773 if (sctx.newshift) break; 1774 pc3 = b->a + bdiag[i+2]; 1775 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1776 1777 /* Now take care of 3rd column of diagonal 4x4 block. */ 1778 pc4 = rtmp4 + i+2; 1779 if (*pc4 != 0.0) { 1780 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1781 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1782 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1783 for (j=0; j<nz; j++) { 1784 col = pj[j]; 1785 rtmp4[col] -= mul4 * rtmp3[col]; 1786 } 1787 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1788 } 1789 1790 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1791 rs = 0.0; 1792 /* L part */ 1793 pc4 = b->a + bi[i+3]; 1794 pj = b->j + bi[i+3]; 1795 nz = bi[i+4] - bi[i+3]; 1796 for (j=0; j<nz; j++) { 1797 col = pj[j]; 1798 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1799 } 1800 /* U part */ 1801 pc4 = b->a + bdiag[i+4]+1; 1802 pj = b->j + bdiag[i+4]+1; 1803 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1804 for (j=0; j<nz; j++) { 1805 col = pj[j]; 1806 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1807 } 1808 1809 sctx.rs = rs; 1810 sctx.pv = rtmp4[i+3]; 1811 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr); 1812 if (sctx.newshift) break; 1813 pc4 = b->a + bdiag[i+3]; 1814 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1815 break; 1816 1817 default: 1818 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1819 } 1820 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1821 i += nodesz; /* Update the row */ 1822 } 1823 1824 /* MatPivotRefine() */ 1825 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1826 /* 1827 * if no shift in this attempt & shifting & started shifting & can refine, 1828 * then try lower shift 1829 */ 1830 sctx.shift_hi = sctx.shift_fraction; 1831 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1832 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1833 sctx.newshift = PETSC_TRUE; 1834 sctx.nshift++; 1835 } 1836 } while (sctx.newshift); 1837 1838 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1839 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1840 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1841 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1842 1843 if (b->inode.size) { 1844 C->ops->solve = MatSolve_SeqAIJ_Inode; 1845 } else { 1846 C->ops->solve = MatSolve_SeqAIJ; 1847 } 1848 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1849 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1850 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1851 C->ops->matsolve = MatMatSolve_SeqAIJ; 1852 C->assembled = PETSC_TRUE; 1853 C->preallocated = PETSC_TRUE; 1854 1855 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1856 1857 /* MatShiftView(A,info,&sctx) */ 1858 if (sctx.nshift) { 1859 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1860 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1861 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1862 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1863 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1864 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1865 } 1866 } 1867 PetscFunctionReturn(0); 1868 } 1869 1870 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1871 { 1872 Mat C = B; 1873 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1874 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1875 PetscErrorCode ierr; 1876 const PetscInt *r,*ic,*c,*ics; 1877 PetscInt n = A->rmap->n,*bi = b->i; 1878 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1879 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1880 PetscInt *ai = a->i,*aj = a->j; 1881 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1882 PetscScalar mul1,mul2,mul3,tmp; 1883 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1884 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1885 PetscReal rs=0.0; 1886 FactorShiftCtx sctx; 1887 1888 PetscFunctionBegin; 1889 sctx.shift_top = 0; 1890 sctx.nshift_max = 0; 1891 sctx.shift_lo = 0; 1892 sctx.shift_hi = 0; 1893 sctx.shift_fraction = 0; 1894 1895 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1896 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1897 sctx.shift_top = 0; 1898 for (i=0; i<n; i++) { 1899 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1900 rs = 0.0; 1901 ajtmp = aj + ai[i]; 1902 rtmp1 = aa + ai[i]; 1903 nz = ai[i+1] - ai[i]; 1904 for (j=0; j<nz; j++) { 1905 if (*ajtmp != i) { 1906 rs += PetscAbsScalar(*rtmp1++); 1907 } else { 1908 rs -= PetscRealPart(*rtmp1++); 1909 } 1910 ajtmp++; 1911 } 1912 if (rs>sctx.shift_top) sctx.shift_top = rs; 1913 } 1914 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1915 sctx.shift_top *= 1.1; 1916 sctx.nshift_max = 5; 1917 sctx.shift_lo = 0.; 1918 sctx.shift_hi = 1.; 1919 } 1920 sctx.shift_amount = 0; 1921 sctx.nshift = 0; 1922 1923 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1924 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1925 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1926 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1927 ics = ic; 1928 1929 node_max = a->inode.node_count; 1930 ns = a->inode.size; 1931 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1932 1933 /* If max inode size > 3, split it into two inodes.*/ 1934 /* also map the inode sizes according to the ordering */ 1935 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1936 for (i=0,j=0; i<node_max; ++i,++j) { 1937 if (ns[i]>3) { 1938 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1939 ++j; 1940 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1941 } else { 1942 tmp_vec1[j] = ns[i]; 1943 } 1944 } 1945 /* Use the correct node_max */ 1946 node_max = j; 1947 1948 /* Now reorder the inode info based on mat re-ordering info */ 1949 /* First create a row -> inode_size_array_index map */ 1950 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1951 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1952 for (i=0,row=0; i<node_max; i++) { 1953 nodesz = tmp_vec1[i]; 1954 for (j=0; j<nodesz; j++,row++) { 1955 nsmap[row] = i; 1956 } 1957 } 1958 /* Using nsmap, create a reordered ns structure */ 1959 for (i=0,j=0; i< node_max; i++) { 1960 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1961 tmp_vec2[i] = nodesz; 1962 j += nodesz; 1963 } 1964 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1965 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1966 /* Now use the correct ns */ 1967 ns = tmp_vec2; 1968 1969 do { 1970 sctx.newshift = PETSC_FALSE; 1971 /* Now loop over each block-row, and do the factorization */ 1972 for (i=0,row=0; i<node_max; i++) { 1973 nodesz = ns[i]; 1974 nz = bi[row+1] - bi[row]; 1975 bjtmp = bj + bi[row]; 1976 1977 switch (nodesz) { 1978 case 1: 1979 for (j=0; j<nz; j++) { 1980 idx = bjtmp[j]; 1981 rtmp11[idx] = 0.0; 1982 } 1983 1984 /* load in initial (unfactored row) */ 1985 idx = r[row]; 1986 nz_tmp = ai[idx+1] - ai[idx]; 1987 ajtmp = aj + ai[idx]; 1988 v1 = aa + ai[idx]; 1989 1990 for (j=0; j<nz_tmp; j++) { 1991 idx = ics[ajtmp[j]]; 1992 rtmp11[idx] = v1[j]; 1993 } 1994 rtmp11[ics[r[row]]] += sctx.shift_amount; 1995 1996 prow = *bjtmp++; 1997 while (prow < row) { 1998 pc1 = rtmp11 + prow; 1999 if (*pc1 != 0.0) { 2000 pv = ba + bd[prow]; 2001 pj = nbj + bd[prow]; 2002 mul1 = *pc1 * *pv++; 2003 *pc1 = mul1; 2004 nz_tmp = bi[prow+1] - bd[prow] - 1; 2005 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2006 for (j=0; j<nz_tmp; j++) { 2007 tmp = pv[j]; 2008 idx = pj[j]; 2009 rtmp11[idx] -= mul1 * tmp; 2010 } 2011 } 2012 prow = *bjtmp++; 2013 } 2014 pj = bj + bi[row]; 2015 pc1 = ba + bi[row]; 2016 2017 sctx.pv = rtmp11[row]; 2018 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2019 rs = 0.0; 2020 for (j=0; j<nz; j++) { 2021 idx = pj[j]; 2022 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2023 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2024 } 2025 sctx.rs = rs; 2026 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2027 if (sctx.newshift) goto endofwhile; 2028 break; 2029 2030 case 2: 2031 for (j=0; j<nz; j++) { 2032 idx = bjtmp[j]; 2033 rtmp11[idx] = 0.0; 2034 rtmp22[idx] = 0.0; 2035 } 2036 2037 /* load in initial (unfactored row) */ 2038 idx = r[row]; 2039 nz_tmp = ai[idx+1] - ai[idx]; 2040 ajtmp = aj + ai[idx]; 2041 v1 = aa + ai[idx]; 2042 v2 = aa + ai[idx+1]; 2043 for (j=0; j<nz_tmp; j++) { 2044 idx = ics[ajtmp[j]]; 2045 rtmp11[idx] = v1[j]; 2046 rtmp22[idx] = v2[j]; 2047 } 2048 rtmp11[ics[r[row]]] += sctx.shift_amount; 2049 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2050 2051 prow = *bjtmp++; 2052 while (prow < row) { 2053 pc1 = rtmp11 + prow; 2054 pc2 = rtmp22 + prow; 2055 if (*pc1 != 0.0 || *pc2 != 0.0) { 2056 pv = ba + bd[prow]; 2057 pj = nbj + bd[prow]; 2058 mul1 = *pc1 * *pv; 2059 mul2 = *pc2 * *pv; 2060 ++pv; 2061 *pc1 = mul1; 2062 *pc2 = mul2; 2063 2064 nz_tmp = bi[prow+1] - bd[prow] - 1; 2065 for (j=0; j<nz_tmp; j++) { 2066 tmp = pv[j]; 2067 idx = pj[j]; 2068 rtmp11[idx] -= mul1 * tmp; 2069 rtmp22[idx] -= mul2 * tmp; 2070 } 2071 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2072 } 2073 prow = *bjtmp++; 2074 } 2075 2076 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2077 pc1 = rtmp11 + prow; 2078 pc2 = rtmp22 + prow; 2079 2080 sctx.pv = *pc1; 2081 pj = bj + bi[prow]; 2082 rs = 0.0; 2083 for (j=0; j<nz; j++) { 2084 idx = pj[j]; 2085 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2086 } 2087 sctx.rs = rs; 2088 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2089 if (sctx.newshift) goto endofwhile; 2090 2091 if (*pc2 != 0.0) { 2092 pj = nbj + bd[prow]; 2093 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2094 *pc2 = mul2; 2095 nz_tmp = bi[prow+1] - bd[prow] - 1; 2096 for (j=0; j<nz_tmp; j++) { 2097 idx = pj[j]; 2098 tmp = rtmp11[idx]; 2099 rtmp22[idx] -= mul2 * tmp; 2100 } 2101 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2102 } 2103 2104 pj = bj + bi[row]; 2105 pc1 = ba + bi[row]; 2106 pc2 = ba + bi[row+1]; 2107 2108 sctx.pv = rtmp22[row+1]; 2109 rs = 0.0; 2110 rtmp11[row] = 1.0/rtmp11[row]; 2111 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2112 /* copy row entries from dense representation to sparse */ 2113 for (j=0; j<nz; j++) { 2114 idx = pj[j]; 2115 pc1[j] = rtmp11[idx]; 2116 pc2[j] = rtmp22[idx]; 2117 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2118 } 2119 sctx.rs = rs; 2120 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2121 if (sctx.newshift) goto endofwhile; 2122 break; 2123 2124 case 3: 2125 for (j=0; j<nz; j++) { 2126 idx = bjtmp[j]; 2127 rtmp11[idx] = 0.0; 2128 rtmp22[idx] = 0.0; 2129 rtmp33[idx] = 0.0; 2130 } 2131 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2132 idx = r[row]; 2133 nz_tmp = ai[idx+1] - ai[idx]; 2134 ajtmp = aj + ai[idx]; 2135 v1 = aa + ai[idx]; 2136 v2 = aa + ai[idx+1]; 2137 v3 = aa + ai[idx+2]; 2138 for (j=0; j<nz_tmp; j++) { 2139 idx = ics[ajtmp[j]]; 2140 rtmp11[idx] = v1[j]; 2141 rtmp22[idx] = v2[j]; 2142 rtmp33[idx] = v3[j]; 2143 } 2144 rtmp11[ics[r[row]]] += sctx.shift_amount; 2145 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2146 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2147 2148 /* loop over all pivot row blocks above this row block */ 2149 prow = *bjtmp++; 2150 while (prow < row) { 2151 pc1 = rtmp11 + prow; 2152 pc2 = rtmp22 + prow; 2153 pc3 = rtmp33 + prow; 2154 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2155 pv = ba + bd[prow]; 2156 pj = nbj + bd[prow]; 2157 mul1 = *pc1 * *pv; 2158 mul2 = *pc2 * *pv; 2159 mul3 = *pc3 * *pv; 2160 ++pv; 2161 *pc1 = mul1; 2162 *pc2 = mul2; 2163 *pc3 = mul3; 2164 2165 nz_tmp = bi[prow+1] - bd[prow] - 1; 2166 /* update this row based on pivot row */ 2167 for (j=0; j<nz_tmp; j++) { 2168 tmp = pv[j]; 2169 idx = pj[j]; 2170 rtmp11[idx] -= mul1 * tmp; 2171 rtmp22[idx] -= mul2 * tmp; 2172 rtmp33[idx] -= mul3 * tmp; 2173 } 2174 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2175 } 2176 prow = *bjtmp++; 2177 } 2178 2179 /* Now take care of diagonal 3x3 block in this set of rows */ 2180 /* note: prow = row here */ 2181 pc1 = rtmp11 + prow; 2182 pc2 = rtmp22 + prow; 2183 pc3 = rtmp33 + prow; 2184 2185 sctx.pv = *pc1; 2186 pj = bj + bi[prow]; 2187 rs = 0.0; 2188 for (j=0; j<nz; j++) { 2189 idx = pj[j]; 2190 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2191 } 2192 sctx.rs = rs; 2193 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2194 if (sctx.newshift) goto endofwhile; 2195 2196 if (*pc2 != 0.0 || *pc3 != 0.0) { 2197 mul2 = (*pc2)/(*pc1); 2198 mul3 = (*pc3)/(*pc1); 2199 *pc2 = mul2; 2200 *pc3 = mul3; 2201 nz_tmp = bi[prow+1] - bd[prow] - 1; 2202 pj = nbj + bd[prow]; 2203 for (j=0; j<nz_tmp; j++) { 2204 idx = pj[j]; 2205 tmp = rtmp11[idx]; 2206 rtmp22[idx] -= mul2 * tmp; 2207 rtmp33[idx] -= mul3 * tmp; 2208 } 2209 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2210 } 2211 ++prow; 2212 2213 pc2 = rtmp22 + prow; 2214 pc3 = rtmp33 + prow; 2215 sctx.pv = *pc2; 2216 pj = bj + bi[prow]; 2217 rs = 0.0; 2218 for (j=0; j<nz; j++) { 2219 idx = pj[j]; 2220 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2221 } 2222 sctx.rs = rs; 2223 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2224 if (sctx.newshift) goto endofwhile; 2225 2226 if (*pc3 != 0.0) { 2227 mul3 = (*pc3)/(*pc2); 2228 *pc3 = mul3; 2229 pj = nbj + bd[prow]; 2230 nz_tmp = bi[prow+1] - bd[prow] - 1; 2231 for (j=0; j<nz_tmp; j++) { 2232 idx = pj[j]; 2233 tmp = rtmp22[idx]; 2234 rtmp33[idx] -= mul3 * tmp; 2235 } 2236 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2237 } 2238 2239 pj = bj + bi[row]; 2240 pc1 = ba + bi[row]; 2241 pc2 = ba + bi[row+1]; 2242 pc3 = ba + bi[row+2]; 2243 2244 sctx.pv = rtmp33[row+2]; 2245 rs = 0.0; 2246 rtmp11[row] = 1.0/rtmp11[row]; 2247 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2248 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2249 /* copy row entries from dense representation to sparse */ 2250 for (j=0; j<nz; j++) { 2251 idx = pj[j]; 2252 pc1[j] = rtmp11[idx]; 2253 pc2[j] = rtmp22[idx]; 2254 pc3[j] = rtmp33[idx]; 2255 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2256 } 2257 2258 sctx.rs = rs; 2259 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr); 2260 if (sctx.newshift) goto endofwhile; 2261 break; 2262 2263 default: 2264 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2265 } 2266 row += nodesz; /* Update the row */ 2267 } 2268 endofwhile:; 2269 } while (sctx.newshift); 2270 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2271 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2272 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2273 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2274 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2275 2276 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2277 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2278 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2279 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2280 C->assembled = PETSC_TRUE; 2281 C->preallocated = PETSC_TRUE; 2282 if (sctx.nshift) { 2283 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2284 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2285 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2286 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2287 } 2288 } 2289 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2290 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2291 PetscFunctionReturn(0); 2292 } 2293 2294 2295 /* ----------------------------------------------------------- */ 2296 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2297 { 2298 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2299 IS iscol = a->col,isrow = a->row; 2300 PetscErrorCode ierr; 2301 const PetscInt *r,*c,*rout,*cout; 2302 PetscInt i,j,n = A->rmap->n; 2303 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2304 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2305 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2306 PetscScalar sum1,sum2,sum3,sum4,sum5; 2307 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2308 const PetscScalar *b; 2309 2310 PetscFunctionBegin; 2311 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2312 node_max = a->inode.node_count; 2313 ns = a->inode.size; /* Node Size array */ 2314 2315 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2316 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2317 tmp = a->solve_work; 2318 2319 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2320 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2321 2322 /* forward solve the lower triangular */ 2323 tmps = tmp; 2324 aa = a_a; 2325 aj = a_j; 2326 ad = a->diag; 2327 2328 for (i = 0,row = 0; i< node_max; ++i) { 2329 nsz = ns[i]; 2330 aii = ai[row]; 2331 v1 = aa + aii; 2332 vi = aj + aii; 2333 nz = ai[row+1]- ai[row]; 2334 2335 if (i < node_max-1) { 2336 /* Prefetch the indices for the next block */ 2337 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2338 /* Prefetch the data for the next block */ 2339 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2340 } 2341 2342 switch (nsz) { /* Each loop in 'case' is unrolled */ 2343 case 1: 2344 sum1 = b[r[row]]; 2345 for (j=0; j<nz-1; j+=2) { 2346 i0 = vi[j]; 2347 i1 = vi[j+1]; 2348 tmp0 = tmps[i0]; 2349 tmp1 = tmps[i1]; 2350 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2351 } 2352 if (j == nz-1) { 2353 tmp0 = tmps[vi[j]]; 2354 sum1 -= v1[j]*tmp0; 2355 } 2356 tmp[row++]=sum1; 2357 break; 2358 case 2: 2359 sum1 = b[r[row]]; 2360 sum2 = b[r[row+1]]; 2361 v2 = aa + ai[row+1]; 2362 2363 for (j=0; j<nz-1; j+=2) { 2364 i0 = vi[j]; 2365 i1 = vi[j+1]; 2366 tmp0 = tmps[i0]; 2367 tmp1 = tmps[i1]; 2368 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2369 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2370 } 2371 if (j == nz-1) { 2372 tmp0 = tmps[vi[j]]; 2373 sum1 -= v1[j] *tmp0; 2374 sum2 -= v2[j] *tmp0; 2375 } 2376 sum2 -= v2[nz] * sum1; 2377 tmp[row++]=sum1; 2378 tmp[row++]=sum2; 2379 break; 2380 case 3: 2381 sum1 = b[r[row]]; 2382 sum2 = b[r[row+1]]; 2383 sum3 = b[r[row+2]]; 2384 v2 = aa + ai[row+1]; 2385 v3 = aa + ai[row+2]; 2386 2387 for (j=0; j<nz-1; j+=2) { 2388 i0 = vi[j]; 2389 i1 = vi[j+1]; 2390 tmp0 = tmps[i0]; 2391 tmp1 = tmps[i1]; 2392 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2393 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2394 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2395 } 2396 if (j == nz-1) { 2397 tmp0 = tmps[vi[j]]; 2398 sum1 -= v1[j] *tmp0; 2399 sum2 -= v2[j] *tmp0; 2400 sum3 -= v3[j] *tmp0; 2401 } 2402 sum2 -= v2[nz] * sum1; 2403 sum3 -= v3[nz] * sum1; 2404 sum3 -= v3[nz+1] * sum2; 2405 tmp[row++]=sum1; 2406 tmp[row++]=sum2; 2407 tmp[row++]=sum3; 2408 break; 2409 2410 case 4: 2411 sum1 = b[r[row]]; 2412 sum2 = b[r[row+1]]; 2413 sum3 = b[r[row+2]]; 2414 sum4 = b[r[row+3]]; 2415 v2 = aa + ai[row+1]; 2416 v3 = aa + ai[row+2]; 2417 v4 = aa + ai[row+3]; 2418 2419 for (j=0; j<nz-1; j+=2) { 2420 i0 = vi[j]; 2421 i1 = vi[j+1]; 2422 tmp0 = tmps[i0]; 2423 tmp1 = tmps[i1]; 2424 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2425 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2426 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2427 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2428 } 2429 if (j == nz-1) { 2430 tmp0 = tmps[vi[j]]; 2431 sum1 -= v1[j] *tmp0; 2432 sum2 -= v2[j] *tmp0; 2433 sum3 -= v3[j] *tmp0; 2434 sum4 -= v4[j] *tmp0; 2435 } 2436 sum2 -= v2[nz] * sum1; 2437 sum3 -= v3[nz] * sum1; 2438 sum4 -= v4[nz] * sum1; 2439 sum3 -= v3[nz+1] * sum2; 2440 sum4 -= v4[nz+1] * sum2; 2441 sum4 -= v4[nz+2] * sum3; 2442 2443 tmp[row++]=sum1; 2444 tmp[row++]=sum2; 2445 tmp[row++]=sum3; 2446 tmp[row++]=sum4; 2447 break; 2448 case 5: 2449 sum1 = b[r[row]]; 2450 sum2 = b[r[row+1]]; 2451 sum3 = b[r[row+2]]; 2452 sum4 = b[r[row+3]]; 2453 sum5 = b[r[row+4]]; 2454 v2 = aa + ai[row+1]; 2455 v3 = aa + ai[row+2]; 2456 v4 = aa + ai[row+3]; 2457 v5 = aa + ai[row+4]; 2458 2459 for (j=0; j<nz-1; j+=2) { 2460 i0 = vi[j]; 2461 i1 = vi[j+1]; 2462 tmp0 = tmps[i0]; 2463 tmp1 = tmps[i1]; 2464 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2465 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2466 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2467 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2468 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2469 } 2470 if (j == nz-1) { 2471 tmp0 = tmps[vi[j]]; 2472 sum1 -= v1[j] *tmp0; 2473 sum2 -= v2[j] *tmp0; 2474 sum3 -= v3[j] *tmp0; 2475 sum4 -= v4[j] *tmp0; 2476 sum5 -= v5[j] *tmp0; 2477 } 2478 2479 sum2 -= v2[nz] * sum1; 2480 sum3 -= v3[nz] * sum1; 2481 sum4 -= v4[nz] * sum1; 2482 sum5 -= v5[nz] * sum1; 2483 sum3 -= v3[nz+1] * sum2; 2484 sum4 -= v4[nz+1] * sum2; 2485 sum5 -= v5[nz+1] * sum2; 2486 sum4 -= v4[nz+2] * sum3; 2487 sum5 -= v5[nz+2] * sum3; 2488 sum5 -= v5[nz+3] * sum4; 2489 2490 tmp[row++]=sum1; 2491 tmp[row++]=sum2; 2492 tmp[row++]=sum3; 2493 tmp[row++]=sum4; 2494 tmp[row++]=sum5; 2495 break; 2496 default: 2497 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2498 } 2499 } 2500 /* backward solve the upper triangular */ 2501 for (i=node_max -1,row = n-1; i>=0; i--) { 2502 nsz = ns[i]; 2503 aii = ad[row+1] + 1; 2504 v1 = aa + aii; 2505 vi = aj + aii; 2506 nz = ad[row]- ad[row+1] - 1; 2507 2508 if (i > 0) { 2509 /* Prefetch the indices for the next block */ 2510 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2511 /* Prefetch the data for the next block */ 2512 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2513 } 2514 2515 switch (nsz) { /* Each loop in 'case' is unrolled */ 2516 case 1: 2517 sum1 = tmp[row]; 2518 2519 for (j=0; j<nz-1; j+=2) { 2520 i0 = vi[j]; 2521 i1 = vi[j+1]; 2522 tmp0 = tmps[i0]; 2523 tmp1 = tmps[i1]; 2524 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2525 } 2526 if (j == nz-1) { 2527 tmp0 = tmps[vi[j]]; 2528 sum1 -= v1[j]*tmp0; 2529 } 2530 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2531 break; 2532 case 2: 2533 sum1 = tmp[row]; 2534 sum2 = tmp[row-1]; 2535 v2 = aa + ad[row] + 1; 2536 for (j=0; j<nz-1; j+=2) { 2537 i0 = vi[j]; 2538 i1 = vi[j+1]; 2539 tmp0 = tmps[i0]; 2540 tmp1 = tmps[i1]; 2541 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2542 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2543 } 2544 if (j == nz-1) { 2545 tmp0 = tmps[vi[j]]; 2546 sum1 -= v1[j]* tmp0; 2547 sum2 -= v2[j+1]* tmp0; 2548 } 2549 2550 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2551 sum2 -= v2[0] * tmp0; 2552 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2553 break; 2554 case 3: 2555 sum1 = tmp[row]; 2556 sum2 = tmp[row -1]; 2557 sum3 = tmp[row -2]; 2558 v2 = aa + ad[row] + 1; 2559 v3 = aa + ad[row -1] + 1; 2560 for (j=0; j<nz-1; j+=2) { 2561 i0 = vi[j]; 2562 i1 = vi[j+1]; 2563 tmp0 = tmps[i0]; 2564 tmp1 = tmps[i1]; 2565 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2566 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2567 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2568 } 2569 if (j== nz-1) { 2570 tmp0 = tmps[vi[j]]; 2571 sum1 -= v1[j] * tmp0; 2572 sum2 -= v2[j+1] * tmp0; 2573 sum3 -= v3[j+2] * tmp0; 2574 } 2575 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2576 sum2 -= v2[0]* tmp0; 2577 sum3 -= v3[1] * tmp0; 2578 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2579 sum3 -= v3[0]* tmp0; 2580 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2581 2582 break; 2583 case 4: 2584 sum1 = tmp[row]; 2585 sum2 = tmp[row -1]; 2586 sum3 = tmp[row -2]; 2587 sum4 = tmp[row -3]; 2588 v2 = aa + ad[row]+1; 2589 v3 = aa + ad[row -1]+1; 2590 v4 = aa + ad[row -2]+1; 2591 2592 for (j=0; j<nz-1; j+=2) { 2593 i0 = vi[j]; 2594 i1 = vi[j+1]; 2595 tmp0 = tmps[i0]; 2596 tmp1 = tmps[i1]; 2597 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2598 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2599 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2600 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2601 } 2602 if (j== nz-1) { 2603 tmp0 = tmps[vi[j]]; 2604 sum1 -= v1[j] * tmp0; 2605 sum2 -= v2[j+1] * tmp0; 2606 sum3 -= v3[j+2] * tmp0; 2607 sum4 -= v4[j+3] * tmp0; 2608 } 2609 2610 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2611 sum2 -= v2[0] * tmp0; 2612 sum3 -= v3[1] * tmp0; 2613 sum4 -= v4[2] * tmp0; 2614 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2615 sum3 -= v3[0] * tmp0; 2616 sum4 -= v4[1] * tmp0; 2617 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2618 sum4 -= v4[0] * tmp0; 2619 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2620 break; 2621 case 5: 2622 sum1 = tmp[row]; 2623 sum2 = tmp[row -1]; 2624 sum3 = tmp[row -2]; 2625 sum4 = tmp[row -3]; 2626 sum5 = tmp[row -4]; 2627 v2 = aa + ad[row]+1; 2628 v3 = aa + ad[row -1]+1; 2629 v4 = aa + ad[row -2]+1; 2630 v5 = aa + ad[row -3]+1; 2631 for (j=0; j<nz-1; j+=2) { 2632 i0 = vi[j]; 2633 i1 = vi[j+1]; 2634 tmp0 = tmps[i0]; 2635 tmp1 = tmps[i1]; 2636 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2637 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2638 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2639 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2640 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2641 } 2642 if (j==nz-1) { 2643 tmp0 = tmps[vi[j]]; 2644 sum1 -= v1[j] * tmp0; 2645 sum2 -= v2[j+1] * tmp0; 2646 sum3 -= v3[j+2] * tmp0; 2647 sum4 -= v4[j+3] * tmp0; 2648 sum5 -= v5[j+4] * tmp0; 2649 } 2650 2651 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2652 sum2 -= v2[0] * tmp0; 2653 sum3 -= v3[1] * tmp0; 2654 sum4 -= v4[2] * tmp0; 2655 sum5 -= v5[3] * tmp0; 2656 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2657 sum3 -= v3[0] * tmp0; 2658 sum4 -= v4[1] * tmp0; 2659 sum5 -= v5[2] * tmp0; 2660 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2661 sum4 -= v4[0] * tmp0; 2662 sum5 -= v5[1] * tmp0; 2663 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2664 sum5 -= v5[0] * tmp0; 2665 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2666 break; 2667 default: 2668 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2669 } 2670 } 2671 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2672 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2673 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2674 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2675 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2676 PetscFunctionReturn(0); 2677 } 2678 2679 2680 /* 2681 Makes a longer coloring[] array and calls the usual code with that 2682 */ 2683 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2684 { 2685 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2686 PetscErrorCode ierr; 2687 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2688 PetscInt *colorused,i; 2689 ISColoringValue *newcolor; 2690 2691 PetscFunctionBegin; 2692 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2693 /* loop over inodes, marking a color for each column*/ 2694 row = 0; 2695 for (i=0; i<m; i++) { 2696 for (j=0; j<ns[i]; j++) { 2697 newcolor[row++] = coloring[i] + j*ncolors; 2698 } 2699 } 2700 2701 /* eliminate unneeded colors */ 2702 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2703 for (i=0; i<n; i++) { 2704 colorused[newcolor[i]] = 1; 2705 } 2706 2707 for (i=1; i<5*ncolors; i++) { 2708 colorused[i] += colorused[i-1]; 2709 } 2710 ncolors = colorused[5*ncolors-1]; 2711 for (i=0; i<n; i++) { 2712 newcolor[i] = colorused[newcolor[i]]-1; 2713 } 2714 ierr = PetscFree(colorused);CHKERRQ(ierr); 2715 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2716 ierr = PetscFree(coloring);CHKERRQ(ierr); 2717 PetscFunctionReturn(0); 2718 } 2719 2720 #include <petsc/private/kernels/blockinvert.h> 2721 2722 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2723 { 2724 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2725 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2726 MatScalar *ibdiag,*bdiag,work[25],*t; 2727 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2728 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2729 const PetscScalar *xb, *b; 2730 PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0; 2731 PetscErrorCode ierr; 2732 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2733 PetscInt sz,k,ipvt[5]; 2734 PetscBool allowzeropivot,zeropivotdetected; 2735 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2736 2737 PetscFunctionBegin; 2738 allowzeropivot = PetscNot(A->erroriffailure); 2739 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2740 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2741 2742 if (!a->inode.ibdiagvalid) { 2743 if (!a->inode.ibdiag) { 2744 /* calculate space needed for diagonal blocks */ 2745 for (i=0; i<m; i++) { 2746 cnt += sizes[i]*sizes[i]; 2747 } 2748 a->inode.bdiagsize = cnt; 2749 2750 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2751 } 2752 2753 /* copy over the diagonal blocks and invert them */ 2754 ibdiag = a->inode.ibdiag; 2755 bdiag = a->inode.bdiag; 2756 cnt = 0; 2757 for (i=0, row = 0; i<m; i++) { 2758 for (j=0; j<sizes[i]; j++) { 2759 for (k=0; k<sizes[i]; k++) { 2760 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2761 } 2762 } 2763 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2764 2765 switch (sizes[i]) { 2766 case 1: 2767 /* Create matrix data structure */ 2768 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2769 if (allowzeropivot) { 2770 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2771 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2772 A->factorerror_zeropivot_row = row; 2773 ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr); 2774 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2775 } 2776 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2777 break; 2778 case 2: 2779 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2780 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2781 break; 2782 case 3: 2783 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2784 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2785 break; 2786 case 4: 2787 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2788 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2789 break; 2790 case 5: 2791 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2792 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2793 break; 2794 default: 2795 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2796 } 2797 cnt += sizes[i]*sizes[i]; 2798 row += sizes[i]; 2799 } 2800 a->inode.ibdiagvalid = PETSC_TRUE; 2801 } 2802 ibdiag = a->inode.ibdiag; 2803 bdiag = a->inode.bdiag; 2804 t = a->inode.ssor_work; 2805 2806 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2807 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2808 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2809 if (flag & SOR_ZERO_INITIAL_GUESS) { 2810 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2811 2812 for (i=0, row=0; i<m; i++) { 2813 sz = diag[row] - ii[row]; 2814 v1 = a->a + ii[row]; 2815 idx = a->j + ii[row]; 2816 2817 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2818 switch (sizes[i]) { 2819 case 1: 2820 2821 sum1 = b[row]; 2822 for (n = 0; n<sz-1; n+=2) { 2823 i1 = idx[0]; 2824 i2 = idx[1]; 2825 idx += 2; 2826 tmp0 = x[i1]; 2827 tmp1 = x[i2]; 2828 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2829 } 2830 2831 if (n == sz-1) { 2832 tmp0 = x[*idx]; 2833 sum1 -= *v1 * tmp0; 2834 } 2835 t[row] = sum1; 2836 x[row++] = sum1*(*ibdiag++); 2837 break; 2838 case 2: 2839 v2 = a->a + ii[row+1]; 2840 sum1 = b[row]; 2841 sum2 = b[row+1]; 2842 for (n = 0; n<sz-1; n+=2) { 2843 i1 = idx[0]; 2844 i2 = idx[1]; 2845 idx += 2; 2846 tmp0 = x[i1]; 2847 tmp1 = x[i2]; 2848 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2849 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2850 } 2851 2852 if (n == sz-1) { 2853 tmp0 = x[*idx]; 2854 sum1 -= v1[0] * tmp0; 2855 sum2 -= v2[0] * tmp0; 2856 } 2857 t[row] = sum1; 2858 t[row+1] = sum2; 2859 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2860 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2861 ibdiag += 4; 2862 break; 2863 case 3: 2864 v2 = a->a + ii[row+1]; 2865 v3 = a->a + ii[row+2]; 2866 sum1 = b[row]; 2867 sum2 = b[row+1]; 2868 sum3 = b[row+2]; 2869 for (n = 0; n<sz-1; n+=2) { 2870 i1 = idx[0]; 2871 i2 = idx[1]; 2872 idx += 2; 2873 tmp0 = x[i1]; 2874 tmp1 = x[i2]; 2875 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2876 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2877 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2878 } 2879 2880 if (n == sz-1) { 2881 tmp0 = x[*idx]; 2882 sum1 -= v1[0] * tmp0; 2883 sum2 -= v2[0] * tmp0; 2884 sum3 -= v3[0] * tmp0; 2885 } 2886 t[row] = sum1; 2887 t[row+1] = sum2; 2888 t[row+2] = sum3; 2889 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2890 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2891 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2892 ibdiag += 9; 2893 break; 2894 case 4: 2895 v2 = a->a + ii[row+1]; 2896 v3 = a->a + ii[row+2]; 2897 v4 = a->a + ii[row+3]; 2898 sum1 = b[row]; 2899 sum2 = b[row+1]; 2900 sum3 = b[row+2]; 2901 sum4 = b[row+3]; 2902 for (n = 0; n<sz-1; n+=2) { 2903 i1 = idx[0]; 2904 i2 = idx[1]; 2905 idx += 2; 2906 tmp0 = x[i1]; 2907 tmp1 = x[i2]; 2908 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2909 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2910 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2911 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2912 } 2913 2914 if (n == sz-1) { 2915 tmp0 = x[*idx]; 2916 sum1 -= v1[0] * tmp0; 2917 sum2 -= v2[0] * tmp0; 2918 sum3 -= v3[0] * tmp0; 2919 sum4 -= v4[0] * tmp0; 2920 } 2921 t[row] = sum1; 2922 t[row+1] = sum2; 2923 t[row+2] = sum3; 2924 t[row+3] = sum4; 2925 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2926 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2927 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2928 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2929 ibdiag += 16; 2930 break; 2931 case 5: 2932 v2 = a->a + ii[row+1]; 2933 v3 = a->a + ii[row+2]; 2934 v4 = a->a + ii[row+3]; 2935 v5 = a->a + ii[row+4]; 2936 sum1 = b[row]; 2937 sum2 = b[row+1]; 2938 sum3 = b[row+2]; 2939 sum4 = b[row+3]; 2940 sum5 = b[row+4]; 2941 for (n = 0; n<sz-1; n+=2) { 2942 i1 = idx[0]; 2943 i2 = idx[1]; 2944 idx += 2; 2945 tmp0 = x[i1]; 2946 tmp1 = x[i2]; 2947 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2948 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2949 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2950 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2951 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2952 } 2953 2954 if (n == sz-1) { 2955 tmp0 = x[*idx]; 2956 sum1 -= v1[0] * tmp0; 2957 sum2 -= v2[0] * tmp0; 2958 sum3 -= v3[0] * tmp0; 2959 sum4 -= v4[0] * tmp0; 2960 sum5 -= v5[0] * tmp0; 2961 } 2962 t[row] = sum1; 2963 t[row+1] = sum2; 2964 t[row+2] = sum3; 2965 t[row+3] = sum4; 2966 t[row+4] = sum5; 2967 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2968 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2969 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2970 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2971 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2972 ibdiag += 25; 2973 break; 2974 default: 2975 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2976 } 2977 } 2978 2979 xb = t; 2980 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2981 } else xb = b; 2982 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 2983 2984 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2985 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2986 ibdiag -= sizes[i]*sizes[i]; 2987 sz = ii[row+1] - diag[row] - 1; 2988 v1 = a->a + diag[row] + 1; 2989 idx = a->j + diag[row] + 1; 2990 2991 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2992 switch (sizes[i]) { 2993 case 1: 2994 2995 sum1 = xb[row]; 2996 for (n = 0; n<sz-1; n+=2) { 2997 i1 = idx[0]; 2998 i2 = idx[1]; 2999 idx += 2; 3000 tmp0 = x[i1]; 3001 tmp1 = x[i2]; 3002 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3003 } 3004 3005 if (n == sz-1) { 3006 tmp0 = x[*idx]; 3007 sum1 -= *v1*tmp0; 3008 } 3009 x[row--] = sum1*(*ibdiag); 3010 break; 3011 3012 case 2: 3013 3014 sum1 = xb[row]; 3015 sum2 = xb[row-1]; 3016 /* note that sum1 is associated with the second of the two rows */ 3017 v2 = a->a + diag[row-1] + 2; 3018 for (n = 0; n<sz-1; n+=2) { 3019 i1 = idx[0]; 3020 i2 = idx[1]; 3021 idx += 2; 3022 tmp0 = x[i1]; 3023 tmp1 = x[i2]; 3024 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3025 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3026 } 3027 3028 if (n == sz-1) { 3029 tmp0 = x[*idx]; 3030 sum1 -= *v1*tmp0; 3031 sum2 -= *v2*tmp0; 3032 } 3033 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3034 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3035 break; 3036 case 3: 3037 3038 sum1 = xb[row]; 3039 sum2 = xb[row-1]; 3040 sum3 = xb[row-2]; 3041 v2 = a->a + diag[row-1] + 2; 3042 v3 = a->a + diag[row-2] + 3; 3043 for (n = 0; n<sz-1; n+=2) { 3044 i1 = idx[0]; 3045 i2 = idx[1]; 3046 idx += 2; 3047 tmp0 = x[i1]; 3048 tmp1 = x[i2]; 3049 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3050 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3051 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3052 } 3053 3054 if (n == sz-1) { 3055 tmp0 = x[*idx]; 3056 sum1 -= *v1*tmp0; 3057 sum2 -= *v2*tmp0; 3058 sum3 -= *v3*tmp0; 3059 } 3060 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3061 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3062 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3063 break; 3064 case 4: 3065 3066 sum1 = xb[row]; 3067 sum2 = xb[row-1]; 3068 sum3 = xb[row-2]; 3069 sum4 = xb[row-3]; 3070 v2 = a->a + diag[row-1] + 2; 3071 v3 = a->a + diag[row-2] + 3; 3072 v4 = a->a + diag[row-3] + 4; 3073 for (n = 0; n<sz-1; n+=2) { 3074 i1 = idx[0]; 3075 i2 = idx[1]; 3076 idx += 2; 3077 tmp0 = x[i1]; 3078 tmp1 = x[i2]; 3079 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3080 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3081 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3082 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3083 } 3084 3085 if (n == sz-1) { 3086 tmp0 = x[*idx]; 3087 sum1 -= *v1*tmp0; 3088 sum2 -= *v2*tmp0; 3089 sum3 -= *v3*tmp0; 3090 sum4 -= *v4*tmp0; 3091 } 3092 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3093 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3094 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3095 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3096 break; 3097 case 5: 3098 3099 sum1 = xb[row]; 3100 sum2 = xb[row-1]; 3101 sum3 = xb[row-2]; 3102 sum4 = xb[row-3]; 3103 sum5 = xb[row-4]; 3104 v2 = a->a + diag[row-1] + 2; 3105 v3 = a->a + diag[row-2] + 3; 3106 v4 = a->a + diag[row-3] + 4; 3107 v5 = a->a + diag[row-4] + 5; 3108 for (n = 0; n<sz-1; n+=2) { 3109 i1 = idx[0]; 3110 i2 = idx[1]; 3111 idx += 2; 3112 tmp0 = x[i1]; 3113 tmp1 = x[i2]; 3114 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3115 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3116 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3117 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3118 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3119 } 3120 3121 if (n == sz-1) { 3122 tmp0 = x[*idx]; 3123 sum1 -= *v1*tmp0; 3124 sum2 -= *v2*tmp0; 3125 sum3 -= *v3*tmp0; 3126 sum4 -= *v4*tmp0; 3127 sum5 -= *v5*tmp0; 3128 } 3129 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3130 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3131 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3132 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3133 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3134 break; 3135 default: 3136 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3137 } 3138 } 3139 3140 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3141 } 3142 its--; 3143 } 3144 while (its--) { 3145 3146 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3147 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3148 i<m; 3149 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3150 3151 sz = diag[row] - ii[row]; 3152 v1 = a->a + ii[row]; 3153 idx = a->j + ii[row]; 3154 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3155 switch (sizes[i]) { 3156 case 1: 3157 sum1 = b[row]; 3158 for (n = 0; n<sz-1; n+=2) { 3159 i1 = idx[0]; 3160 i2 = idx[1]; 3161 idx += 2; 3162 tmp0 = x[i1]; 3163 tmp1 = x[i2]; 3164 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3165 } 3166 if (n == sz-1) { 3167 tmp0 = x[*idx++]; 3168 sum1 -= *v1 * tmp0; 3169 v1++; 3170 } 3171 t[row] = sum1; 3172 sz = ii[row+1] - diag[row] - 1; 3173 idx = a->j + diag[row] + 1; 3174 v1 += 1; 3175 for (n = 0; n<sz-1; n+=2) { 3176 i1 = idx[0]; 3177 i2 = idx[1]; 3178 idx += 2; 3179 tmp0 = x[i1]; 3180 tmp1 = x[i2]; 3181 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3182 } 3183 if (n == sz-1) { 3184 tmp0 = x[*idx++]; 3185 sum1 -= *v1 * tmp0; 3186 } 3187 /* in MatSOR_SeqAIJ this line would be 3188 * 3189 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3190 * 3191 * but omega == 1, so this becomes 3192 * 3193 * x[row] = sum1*(*ibdiag++); 3194 * 3195 */ 3196 x[row] = sum1*(*ibdiag); 3197 break; 3198 case 2: 3199 v2 = a->a + ii[row+1]; 3200 sum1 = b[row]; 3201 sum2 = b[row+1]; 3202 for (n = 0; n<sz-1; n+=2) { 3203 i1 = idx[0]; 3204 i2 = idx[1]; 3205 idx += 2; 3206 tmp0 = x[i1]; 3207 tmp1 = x[i2]; 3208 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3209 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3210 } 3211 if (n == sz-1) { 3212 tmp0 = x[*idx++]; 3213 sum1 -= v1[0] * tmp0; 3214 sum2 -= v2[0] * tmp0; 3215 v1++; v2++; 3216 } 3217 t[row] = sum1; 3218 t[row+1] = sum2; 3219 sz = ii[row+1] - diag[row] - 2; 3220 idx = a->j + diag[row] + 2; 3221 v1 += 2; 3222 v2 += 2; 3223 for (n = 0; n<sz-1; n+=2) { 3224 i1 = idx[0]; 3225 i2 = idx[1]; 3226 idx += 2; 3227 tmp0 = x[i1]; 3228 tmp1 = x[i2]; 3229 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3230 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3231 } 3232 if (n == sz-1) { 3233 tmp0 = x[*idx]; 3234 sum1 -= v1[0] * tmp0; 3235 sum2 -= v2[0] * tmp0; 3236 } 3237 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3238 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3239 break; 3240 case 3: 3241 v2 = a->a + ii[row+1]; 3242 v3 = a->a + ii[row+2]; 3243 sum1 = b[row]; 3244 sum2 = b[row+1]; 3245 sum3 = b[row+2]; 3246 for (n = 0; n<sz-1; n+=2) { 3247 i1 = idx[0]; 3248 i2 = idx[1]; 3249 idx += 2; 3250 tmp0 = x[i1]; 3251 tmp1 = x[i2]; 3252 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3253 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3254 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3255 } 3256 if (n == sz-1) { 3257 tmp0 = x[*idx++]; 3258 sum1 -= v1[0] * tmp0; 3259 sum2 -= v2[0] * tmp0; 3260 sum3 -= v3[0] * tmp0; 3261 v1++; v2++; v3++; 3262 } 3263 t[row] = sum1; 3264 t[row+1] = sum2; 3265 t[row+2] = sum3; 3266 sz = ii[row+1] - diag[row] - 3; 3267 idx = a->j + diag[row] + 3; 3268 v1 += 3; 3269 v2 += 3; 3270 v3 += 3; 3271 for (n = 0; n<sz-1; n+=2) { 3272 i1 = idx[0]; 3273 i2 = idx[1]; 3274 idx += 2; 3275 tmp0 = x[i1]; 3276 tmp1 = x[i2]; 3277 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3278 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3279 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3280 } 3281 if (n == sz-1) { 3282 tmp0 = x[*idx]; 3283 sum1 -= v1[0] * tmp0; 3284 sum2 -= v2[0] * tmp0; 3285 sum3 -= v3[0] * tmp0; 3286 } 3287 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3288 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3289 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3290 break; 3291 case 4: 3292 v2 = a->a + ii[row+1]; 3293 v3 = a->a + ii[row+2]; 3294 v4 = a->a + ii[row+3]; 3295 sum1 = b[row]; 3296 sum2 = b[row+1]; 3297 sum3 = b[row+2]; 3298 sum4 = b[row+3]; 3299 for (n = 0; n<sz-1; n+=2) { 3300 i1 = idx[0]; 3301 i2 = idx[1]; 3302 idx += 2; 3303 tmp0 = x[i1]; 3304 tmp1 = x[i2]; 3305 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3306 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3307 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3308 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3309 } 3310 if (n == sz-1) { 3311 tmp0 = x[*idx++]; 3312 sum1 -= v1[0] * tmp0; 3313 sum2 -= v2[0] * tmp0; 3314 sum3 -= v3[0] * tmp0; 3315 sum4 -= v4[0] * tmp0; 3316 v1++; v2++; v3++; v4++; 3317 } 3318 t[row] = sum1; 3319 t[row+1] = sum2; 3320 t[row+2] = sum3; 3321 t[row+3] = sum4; 3322 sz = ii[row+1] - diag[row] - 4; 3323 idx = a->j + diag[row] + 4; 3324 v1 += 4; 3325 v2 += 4; 3326 v3 += 4; 3327 v4 += 4; 3328 for (n = 0; n<sz-1; n+=2) { 3329 i1 = idx[0]; 3330 i2 = idx[1]; 3331 idx += 2; 3332 tmp0 = x[i1]; 3333 tmp1 = x[i2]; 3334 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3335 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3336 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3337 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3338 } 3339 if (n == sz-1) { 3340 tmp0 = x[*idx]; 3341 sum1 -= v1[0] * tmp0; 3342 sum2 -= v2[0] * tmp0; 3343 sum3 -= v3[0] * tmp0; 3344 sum4 -= v4[0] * tmp0; 3345 } 3346 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3347 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3348 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3349 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3350 break; 3351 case 5: 3352 v2 = a->a + ii[row+1]; 3353 v3 = a->a + ii[row+2]; 3354 v4 = a->a + ii[row+3]; 3355 v5 = a->a + ii[row+4]; 3356 sum1 = b[row]; 3357 sum2 = b[row+1]; 3358 sum3 = b[row+2]; 3359 sum4 = b[row+3]; 3360 sum5 = b[row+4]; 3361 for (n = 0; n<sz-1; n+=2) { 3362 i1 = idx[0]; 3363 i2 = idx[1]; 3364 idx += 2; 3365 tmp0 = x[i1]; 3366 tmp1 = x[i2]; 3367 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3368 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3369 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3370 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3371 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3372 } 3373 if (n == sz-1) { 3374 tmp0 = x[*idx++]; 3375 sum1 -= v1[0] * tmp0; 3376 sum2 -= v2[0] * tmp0; 3377 sum3 -= v3[0] * tmp0; 3378 sum4 -= v4[0] * tmp0; 3379 sum5 -= v5[0] * tmp0; 3380 v1++; v2++; v3++; v4++; v5++; 3381 } 3382 t[row] = sum1; 3383 t[row+1] = sum2; 3384 t[row+2] = sum3; 3385 t[row+3] = sum4; 3386 t[row+4] = sum5; 3387 sz = ii[row+1] - diag[row] - 5; 3388 idx = a->j + diag[row] + 5; 3389 v1 += 5; 3390 v2 += 5; 3391 v3 += 5; 3392 v4 += 5; 3393 v5 += 5; 3394 for (n = 0; n<sz-1; n+=2) { 3395 i1 = idx[0]; 3396 i2 = idx[1]; 3397 idx += 2; 3398 tmp0 = x[i1]; 3399 tmp1 = x[i2]; 3400 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3401 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3402 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3403 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3404 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3405 } 3406 if (n == sz-1) { 3407 tmp0 = x[*idx]; 3408 sum1 -= v1[0] * tmp0; 3409 sum2 -= v2[0] * tmp0; 3410 sum3 -= v3[0] * tmp0; 3411 sum4 -= v4[0] * tmp0; 3412 sum5 -= v5[0] * tmp0; 3413 } 3414 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3415 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3416 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3417 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3418 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3419 break; 3420 default: 3421 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3422 } 3423 } 3424 xb = t; 3425 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3426 } else xb = b; 3427 3428 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3429 3430 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3431 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3432 ibdiag -= sizes[i]*sizes[i]; 3433 3434 /* set RHS */ 3435 if (xb == b) { 3436 /* whole (old way) */ 3437 sz = ii[row+1] - ii[row]; 3438 idx = a->j + ii[row]; 3439 switch (sizes[i]) { 3440 case 5: 3441 v5 = a->a + ii[row-4]; 3442 case 4: /* fall through */ 3443 v4 = a->a + ii[row-3]; 3444 case 3: 3445 v3 = a->a + ii[row-2]; 3446 case 2: 3447 v2 = a->a + ii[row-1]; 3448 case 1: 3449 v1 = a->a + ii[row]; 3450 break; 3451 default: 3452 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3453 } 3454 } else { 3455 /* upper, no diag */ 3456 sz = ii[row+1] - diag[row] - 1; 3457 idx = a->j + diag[row] + 1; 3458 switch (sizes[i]) { 3459 case 5: 3460 v5 = a->a + diag[row-4] + 5; 3461 case 4: /* fall through */ 3462 v4 = a->a + diag[row-3] + 4; 3463 case 3: 3464 v3 = a->a + diag[row-2] + 3; 3465 case 2: 3466 v2 = a->a + diag[row-1] + 2; 3467 case 1: 3468 v1 = a->a + diag[row] + 1; 3469 } 3470 } 3471 /* set sum */ 3472 switch (sizes[i]) { 3473 case 5: 3474 sum5 = xb[row-4]; 3475 case 4: /* fall through */ 3476 sum4 = xb[row-3]; 3477 case 3: 3478 sum3 = xb[row-2]; 3479 case 2: 3480 sum2 = xb[row-1]; 3481 case 1: 3482 /* note that sum1 is associated with the last row */ 3483 sum1 = xb[row]; 3484 } 3485 /* do sums */ 3486 for (n = 0; n<sz-1; n+=2) { 3487 i1 = idx[0]; 3488 i2 = idx[1]; 3489 idx += 2; 3490 tmp0 = x[i1]; 3491 tmp1 = x[i2]; 3492 switch (sizes[i]) { 3493 case 5: 3494 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3495 case 4: /* fall through */ 3496 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3497 case 3: 3498 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3499 case 2: 3500 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3501 case 1: 3502 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3503 } 3504 } 3505 /* ragged edge */ 3506 if (n == sz-1) { 3507 tmp0 = x[*idx]; 3508 switch (sizes[i]) { 3509 case 5: 3510 sum5 -= *v5*tmp0; 3511 case 4: /* fall through */ 3512 sum4 -= *v4*tmp0; 3513 case 3: 3514 sum3 -= *v3*tmp0; 3515 case 2: 3516 sum2 -= *v2*tmp0; 3517 case 1: 3518 sum1 -= *v1*tmp0; 3519 } 3520 } 3521 /* update */ 3522 if (xb == b) { 3523 /* whole (old way) w/ diag */ 3524 switch (sizes[i]) { 3525 case 5: 3526 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3527 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3528 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3529 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3530 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3531 break; 3532 case 4: 3533 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3534 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3535 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3536 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3537 break; 3538 case 3: 3539 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3540 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3541 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3542 break; 3543 case 2: 3544 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3545 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3546 break; 3547 case 1: 3548 x[row--] += sum1*(*ibdiag); 3549 break; 3550 } 3551 } else { 3552 /* no diag so set = */ 3553 switch (sizes[i]) { 3554 case 5: 3555 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3556 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3557 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3558 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3559 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3560 break; 3561 case 4: 3562 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3563 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3564 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3565 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3566 break; 3567 case 3: 3568 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3569 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3570 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3571 break; 3572 case 2: 3573 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3574 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3575 break; 3576 case 1: 3577 x[row--] = sum1*(*ibdiag); 3578 break; 3579 } 3580 } 3581 } 3582 if (xb == b) { 3583 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3584 } else { 3585 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3586 } 3587 } 3588 } 3589 if (flag & SOR_EISENSTAT) { 3590 /* 3591 Apply (U + D)^-1 where D is now the block diagonal 3592 */ 3593 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3594 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3595 ibdiag -= sizes[i]*sizes[i]; 3596 sz = ii[row+1] - diag[row] - 1; 3597 v1 = a->a + diag[row] + 1; 3598 idx = a->j + diag[row] + 1; 3599 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3600 switch (sizes[i]) { 3601 case 1: 3602 3603 sum1 = b[row]; 3604 for (n = 0; n<sz-1; n+=2) { 3605 i1 = idx[0]; 3606 i2 = idx[1]; 3607 idx += 2; 3608 tmp0 = x[i1]; 3609 tmp1 = x[i2]; 3610 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3611 } 3612 3613 if (n == sz-1) { 3614 tmp0 = x[*idx]; 3615 sum1 -= *v1*tmp0; 3616 } 3617 x[row] = sum1*(*ibdiag);row--; 3618 break; 3619 3620 case 2: 3621 3622 sum1 = b[row]; 3623 sum2 = b[row-1]; 3624 /* note that sum1 is associated with the second of the two rows */ 3625 v2 = a->a + diag[row-1] + 2; 3626 for (n = 0; n<sz-1; n+=2) { 3627 i1 = idx[0]; 3628 i2 = idx[1]; 3629 idx += 2; 3630 tmp0 = x[i1]; 3631 tmp1 = x[i2]; 3632 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3633 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3634 } 3635 3636 if (n == sz-1) { 3637 tmp0 = x[*idx]; 3638 sum1 -= *v1*tmp0; 3639 sum2 -= *v2*tmp0; 3640 } 3641 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3642 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3643 row -= 2; 3644 break; 3645 case 3: 3646 3647 sum1 = b[row]; 3648 sum2 = b[row-1]; 3649 sum3 = b[row-2]; 3650 v2 = a->a + diag[row-1] + 2; 3651 v3 = a->a + diag[row-2] + 3; 3652 for (n = 0; n<sz-1; n+=2) { 3653 i1 = idx[0]; 3654 i2 = idx[1]; 3655 idx += 2; 3656 tmp0 = x[i1]; 3657 tmp1 = x[i2]; 3658 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3659 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3660 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3661 } 3662 3663 if (n == sz-1) { 3664 tmp0 = x[*idx]; 3665 sum1 -= *v1*tmp0; 3666 sum2 -= *v2*tmp0; 3667 sum3 -= *v3*tmp0; 3668 } 3669 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3670 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3671 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3672 row -= 3; 3673 break; 3674 case 4: 3675 3676 sum1 = b[row]; 3677 sum2 = b[row-1]; 3678 sum3 = b[row-2]; 3679 sum4 = b[row-3]; 3680 v2 = a->a + diag[row-1] + 2; 3681 v3 = a->a + diag[row-2] + 3; 3682 v4 = a->a + diag[row-3] + 4; 3683 for (n = 0; n<sz-1; n+=2) { 3684 i1 = idx[0]; 3685 i2 = idx[1]; 3686 idx += 2; 3687 tmp0 = x[i1]; 3688 tmp1 = x[i2]; 3689 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3690 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3691 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3692 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3693 } 3694 3695 if (n == sz-1) { 3696 tmp0 = x[*idx]; 3697 sum1 -= *v1*tmp0; 3698 sum2 -= *v2*tmp0; 3699 sum3 -= *v3*tmp0; 3700 sum4 -= *v4*tmp0; 3701 } 3702 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3703 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3704 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3705 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3706 row -= 4; 3707 break; 3708 case 5: 3709 3710 sum1 = b[row]; 3711 sum2 = b[row-1]; 3712 sum3 = b[row-2]; 3713 sum4 = b[row-3]; 3714 sum5 = b[row-4]; 3715 v2 = a->a + diag[row-1] + 2; 3716 v3 = a->a + diag[row-2] + 3; 3717 v4 = a->a + diag[row-3] + 4; 3718 v5 = a->a + diag[row-4] + 5; 3719 for (n = 0; n<sz-1; n+=2) { 3720 i1 = idx[0]; 3721 i2 = idx[1]; 3722 idx += 2; 3723 tmp0 = x[i1]; 3724 tmp1 = x[i2]; 3725 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3726 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3727 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3728 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3729 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3730 } 3731 3732 if (n == sz-1) { 3733 tmp0 = x[*idx]; 3734 sum1 -= *v1*tmp0; 3735 sum2 -= *v2*tmp0; 3736 sum3 -= *v3*tmp0; 3737 sum4 -= *v4*tmp0; 3738 sum5 -= *v5*tmp0; 3739 } 3740 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3741 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3742 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3743 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3744 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3745 row -= 5; 3746 break; 3747 default: 3748 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3749 } 3750 } 3751 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3752 3753 /* 3754 t = b - D x where D is the block diagonal 3755 */ 3756 cnt = 0; 3757 for (i=0, row=0; i<m; i++) { 3758 switch (sizes[i]) { 3759 case 1: 3760 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3761 break; 3762 case 2: 3763 x1 = x[row]; x2 = x[row+1]; 3764 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3765 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3766 t[row] = b[row] - tmp1; 3767 t[row+1] = b[row+1] - tmp2; row += 2; 3768 cnt += 4; 3769 break; 3770 case 3: 3771 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3772 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3773 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3774 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3775 t[row] = b[row] - tmp1; 3776 t[row+1] = b[row+1] - tmp2; 3777 t[row+2] = b[row+2] - tmp3; row += 3; 3778 cnt += 9; 3779 break; 3780 case 4: 3781 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3782 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3783 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3784 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3785 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3786 t[row] = b[row] - tmp1; 3787 t[row+1] = b[row+1] - tmp2; 3788 t[row+2] = b[row+2] - tmp3; 3789 t[row+3] = b[row+3] - tmp4; row += 4; 3790 cnt += 16; 3791 break; 3792 case 5: 3793 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3794 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3795 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3796 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3797 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3798 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3799 t[row] = b[row] - tmp1; 3800 t[row+1] = b[row+1] - tmp2; 3801 t[row+2] = b[row+2] - tmp3; 3802 t[row+3] = b[row+3] - tmp4; 3803 t[row+4] = b[row+4] - tmp5;row += 5; 3804 cnt += 25; 3805 break; 3806 default: 3807 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3808 } 3809 } 3810 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3811 3812 3813 3814 /* 3815 Apply (L + D)^-1 where D is the block diagonal 3816 */ 3817 for (i=0, row=0; i<m; i++) { 3818 sz = diag[row] - ii[row]; 3819 v1 = a->a + ii[row]; 3820 idx = a->j + ii[row]; 3821 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3822 switch (sizes[i]) { 3823 case 1: 3824 3825 sum1 = t[row]; 3826 for (n = 0; n<sz-1; n+=2) { 3827 i1 = idx[0]; 3828 i2 = idx[1]; 3829 idx += 2; 3830 tmp0 = t[i1]; 3831 tmp1 = t[i2]; 3832 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3833 } 3834 3835 if (n == sz-1) { 3836 tmp0 = t[*idx]; 3837 sum1 -= *v1 * tmp0; 3838 } 3839 x[row] += t[row] = sum1*(*ibdiag++); row++; 3840 break; 3841 case 2: 3842 v2 = a->a + ii[row+1]; 3843 sum1 = t[row]; 3844 sum2 = t[row+1]; 3845 for (n = 0; n<sz-1; n+=2) { 3846 i1 = idx[0]; 3847 i2 = idx[1]; 3848 idx += 2; 3849 tmp0 = t[i1]; 3850 tmp1 = t[i2]; 3851 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3852 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3853 } 3854 3855 if (n == sz-1) { 3856 tmp0 = t[*idx]; 3857 sum1 -= v1[0] * tmp0; 3858 sum2 -= v2[0] * tmp0; 3859 } 3860 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3861 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3862 ibdiag += 4; row += 2; 3863 break; 3864 case 3: 3865 v2 = a->a + ii[row+1]; 3866 v3 = a->a + ii[row+2]; 3867 sum1 = t[row]; 3868 sum2 = t[row+1]; 3869 sum3 = t[row+2]; 3870 for (n = 0; n<sz-1; n+=2) { 3871 i1 = idx[0]; 3872 i2 = idx[1]; 3873 idx += 2; 3874 tmp0 = t[i1]; 3875 tmp1 = t[i2]; 3876 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3877 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3878 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3879 } 3880 3881 if (n == sz-1) { 3882 tmp0 = t[*idx]; 3883 sum1 -= v1[0] * tmp0; 3884 sum2 -= v2[0] * tmp0; 3885 sum3 -= v3[0] * tmp0; 3886 } 3887 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3888 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3889 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3890 ibdiag += 9; row += 3; 3891 break; 3892 case 4: 3893 v2 = a->a + ii[row+1]; 3894 v3 = a->a + ii[row+2]; 3895 v4 = a->a + ii[row+3]; 3896 sum1 = t[row]; 3897 sum2 = t[row+1]; 3898 sum3 = t[row+2]; 3899 sum4 = t[row+3]; 3900 for (n = 0; n<sz-1; n+=2) { 3901 i1 = idx[0]; 3902 i2 = idx[1]; 3903 idx += 2; 3904 tmp0 = t[i1]; 3905 tmp1 = t[i2]; 3906 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3908 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3909 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3910 } 3911 3912 if (n == sz-1) { 3913 tmp0 = t[*idx]; 3914 sum1 -= v1[0] * tmp0; 3915 sum2 -= v2[0] * tmp0; 3916 sum3 -= v3[0] * tmp0; 3917 sum4 -= v4[0] * tmp0; 3918 } 3919 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3920 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3921 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3922 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3923 ibdiag += 16; row += 4; 3924 break; 3925 case 5: 3926 v2 = a->a + ii[row+1]; 3927 v3 = a->a + ii[row+2]; 3928 v4 = a->a + ii[row+3]; 3929 v5 = a->a + ii[row+4]; 3930 sum1 = t[row]; 3931 sum2 = t[row+1]; 3932 sum3 = t[row+2]; 3933 sum4 = t[row+3]; 3934 sum5 = t[row+4]; 3935 for (n = 0; n<sz-1; n+=2) { 3936 i1 = idx[0]; 3937 i2 = idx[1]; 3938 idx += 2; 3939 tmp0 = t[i1]; 3940 tmp1 = t[i2]; 3941 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3942 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3943 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3944 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3945 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3946 } 3947 3948 if (n == sz-1) { 3949 tmp0 = t[*idx]; 3950 sum1 -= v1[0] * tmp0; 3951 sum2 -= v2[0] * tmp0; 3952 sum3 -= v3[0] * tmp0; 3953 sum4 -= v4[0] * tmp0; 3954 sum5 -= v5[0] * tmp0; 3955 } 3956 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3957 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3958 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3959 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3960 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3961 ibdiag += 25; row += 5; 3962 break; 3963 default: 3964 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3965 } 3966 } 3967 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3968 } 3969 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3970 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3971 PetscFunctionReturn(0); 3972 } 3973 3974 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3975 { 3976 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3977 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3978 const MatScalar *bdiag = a->inode.bdiag; 3979 const PetscScalar *b; 3980 PetscErrorCode ierr; 3981 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3982 const PetscInt *sizes = a->inode.size; 3983 3984 PetscFunctionBegin; 3985 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3986 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3987 cnt = 0; 3988 for (i=0, row=0; i<m; i++) { 3989 switch (sizes[i]) { 3990 case 1: 3991 x[row] = b[row]*bdiag[cnt++];row++; 3992 break; 3993 case 2: 3994 x1 = b[row]; x2 = b[row+1]; 3995 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3996 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3997 x[row++] = tmp1; 3998 x[row++] = tmp2; 3999 cnt += 4; 4000 break; 4001 case 3: 4002 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4003 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4004 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4005 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4006 x[row++] = tmp1; 4007 x[row++] = tmp2; 4008 x[row++] = tmp3; 4009 cnt += 9; 4010 break; 4011 case 4: 4012 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4013 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4014 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4015 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4016 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4017 x[row++] = tmp1; 4018 x[row++] = tmp2; 4019 x[row++] = tmp3; 4020 x[row++] = tmp4; 4021 cnt += 16; 4022 break; 4023 case 5: 4024 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4025 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4026 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4027 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4028 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4029 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4030 x[row++] = tmp1; 4031 x[row++] = tmp2; 4032 x[row++] = tmp3; 4033 x[row++] = tmp4; 4034 x[row++] = tmp5; 4035 cnt += 25; 4036 break; 4037 default: 4038 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4039 } 4040 } 4041 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4042 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4043 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4044 PetscFunctionReturn(0); 4045 } 4046 4047 /* 4048 samestructure indicates that the matrix has not changed its nonzero structure so we 4049 do not need to recompute the inodes 4050 */ 4051 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4052 { 4053 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4054 PetscErrorCode ierr; 4055 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4056 PetscBool flag; 4057 const PetscInt *idx,*idy,*ii; 4058 4059 PetscFunctionBegin; 4060 if (!a->inode.use) PetscFunctionReturn(0); 4061 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4062 4063 m = A->rmap->n; 4064 if (a->inode.size) ns = a->inode.size; 4065 else { 4066 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4067 } 4068 4069 i = 0; 4070 node_count = 0; 4071 idx = a->j; 4072 ii = a->i; 4073 while (i < m) { /* For each row */ 4074 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4075 /* Limits the number of elements in a node to 'a->inode.limit' */ 4076 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4077 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4078 if (nzy != nzx) break; 4079 idy += nzx; /* Same nonzero pattern */ 4080 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4081 if (!flag) break; 4082 } 4083 ns[node_count++] = blk_size; 4084 idx += blk_size*nzx; 4085 i = j; 4086 } 4087 /* If not enough inodes found,, do not use inode version of the routines */ 4088 if (!m || node_count > .8*m) { 4089 ierr = PetscFree(ns);CHKERRQ(ierr); 4090 4091 a->inode.node_count = 0; 4092 a->inode.size = NULL; 4093 a->inode.use = PETSC_FALSE; 4094 A->ops->mult = MatMult_SeqAIJ; 4095 A->ops->sor = MatSOR_SeqAIJ; 4096 A->ops->multadd = MatMultAdd_SeqAIJ; 4097 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4098 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4099 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4100 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4101 A->ops->coloringpatch = 0; 4102 A->ops->multdiagonalblock = 0; 4103 4104 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4105 } else { 4106 if (!A->factortype) { 4107 A->ops->mult = MatMult_SeqAIJ_Inode; 4108 A->ops->sor = MatSOR_SeqAIJ_Inode; 4109 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4110 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4111 if (A->rmap->n == A->cmap->n) { 4112 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4113 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4114 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4115 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4116 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4117 } 4118 } else { 4119 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4120 } 4121 a->inode.node_count = node_count; 4122 a->inode.size = ns; 4123 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4124 } 4125 a->inode.checked = PETSC_TRUE; 4126 a->inode.mat_nonzerostate = A->nonzerostate; 4127 PetscFunctionReturn(0); 4128 } 4129 4130 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4131 { 4132 Mat B =*C; 4133 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4134 PetscErrorCode ierr; 4135 PetscInt m=A->rmap->n; 4136 4137 PetscFunctionBegin; 4138 c->inode.use = a->inode.use; 4139 c->inode.limit = a->inode.limit; 4140 c->inode.max_limit = a->inode.max_limit; 4141 if (a->inode.size) { 4142 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4143 c->inode.node_count = a->inode.node_count; 4144 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4145 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4146 if (!B->factortype) { 4147 B->ops->mult = MatMult_SeqAIJ_Inode; 4148 B->ops->sor = MatSOR_SeqAIJ_Inode; 4149 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4150 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4151 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4152 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4153 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4154 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4155 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4156 } else { 4157 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4158 } 4159 } else { 4160 c->inode.size = 0; 4161 c->inode.node_count = 0; 4162 } 4163 c->inode.ibdiagvalid = PETSC_FALSE; 4164 c->inode.ibdiag = 0; 4165 c->inode.bdiag = 0; 4166 PetscFunctionReturn(0); 4167 } 4168 4169 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4170 { 4171 PetscInt k; 4172 const PetscInt *vi; 4173 4174 PetscFunctionBegin; 4175 vi = aj + ai[row]; 4176 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4177 vi = aj + adiag[row]; 4178 cols[nzl] = vi[0]; 4179 vi = aj + adiag[row+1]+1; 4180 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4181 PetscFunctionReturn(0); 4182 } 4183 /* 4184 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4185 Modified from MatSeqAIJCheckInode(). 4186 4187 Input Parameters: 4188 . Mat A - ILU or LU matrix factor 4189 4190 */ 4191 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4192 { 4193 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4194 PetscErrorCode ierr; 4195 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4196 PetscInt *cols1,*cols2,*ns; 4197 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4198 PetscBool flag; 4199 4200 PetscFunctionBegin; 4201 if (!a->inode.use) PetscFunctionReturn(0); 4202 if (a->inode.checked) PetscFunctionReturn(0); 4203 4204 m = A->rmap->n; 4205 if (a->inode.size) ns = a->inode.size; 4206 else { 4207 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4208 } 4209 4210 i = 0; 4211 node_count = 0; 4212 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4213 while (i < m) { /* For each row */ 4214 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4215 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4216 nzx = nzl1 + nzu1 + 1; 4217 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4218 4219 /* Limits the number of elements in a node to 'a->inode.limit' */ 4220 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4221 nzl2 = ai[j+1] - ai[j]; 4222 nzu2 = adiag[j] - adiag[j+1] - 1; 4223 nzy = nzl2 + nzu2 + 1; 4224 if (nzy != nzx) break; 4225 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4226 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4227 if (!flag) break; 4228 } 4229 ns[node_count++] = blk_size; 4230 i = j; 4231 } 4232 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4233 /* If not enough inodes found,, do not use inode version of the routines */ 4234 if (!m || node_count > .8*m) { 4235 ierr = PetscFree(ns);CHKERRQ(ierr); 4236 4237 a->inode.node_count = 0; 4238 a->inode.size = NULL; 4239 a->inode.use = PETSC_FALSE; 4240 4241 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4242 } else { 4243 A->ops->mult = 0; 4244 A->ops->sor = 0; 4245 A->ops->multadd = 0; 4246 A->ops->getrowij = 0; 4247 A->ops->restorerowij = 0; 4248 A->ops->getcolumnij = 0; 4249 A->ops->restorecolumnij = 0; 4250 A->ops->coloringpatch = 0; 4251 A->ops->multdiagonalblock = 0; 4252 a->inode.node_count = node_count; 4253 a->inode.size = ns; 4254 4255 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4256 } 4257 a->inode.checked = PETSC_TRUE; 4258 PetscFunctionReturn(0); 4259 } 4260 4261 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4262 { 4263 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4264 4265 PetscFunctionBegin; 4266 a->inode.ibdiagvalid = PETSC_FALSE; 4267 PetscFunctionReturn(0); 4268 } 4269 4270 /* 4271 This is really ugly. if inodes are used this replaces the 4272 permutations with ones that correspond to rows/cols of the matrix 4273 rather then inode blocks 4274 */ 4275 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4276 { 4277 PetscErrorCode ierr; 4278 4279 PetscFunctionBegin; 4280 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4281 PetscFunctionReturn(0); 4282 } 4283 4284 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4285 { 4286 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4287 PetscErrorCode ierr; 4288 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4289 const PetscInt *ridx,*cidx; 4290 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4291 PetscInt nslim_col,*ns_col; 4292 IS ris = *rperm,cis = *cperm; 4293 4294 PetscFunctionBegin; 4295 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4296 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4297 4298 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4299 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4300 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4301 4302 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4303 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4304 4305 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4306 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4307 4308 /* Construct the permutations for rows*/ 4309 for (i=0,row = 0; i<nslim_row; ++i) { 4310 indx = ridx[i]; 4311 start_val = tns[indx]; 4312 end_val = tns[indx + 1]; 4313 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4314 } 4315 4316 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4317 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4318 4319 /* Construct permutations for columns */ 4320 for (i=0,col=0; i<nslim_col; ++i) { 4321 indx = cidx[i]; 4322 start_val = tns[indx]; 4323 end_val = tns[indx + 1]; 4324 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4325 } 4326 4327 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4328 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4329 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4330 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4331 4332 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4333 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4334 4335 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4336 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4337 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4338 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4339 ierr = PetscFree(tns);CHKERRQ(ierr); 4340 PetscFunctionReturn(0); 4341 } 4342 4343 /*@C 4344 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4345 4346 Not Collective 4347 4348 Input Parameter: 4349 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4350 4351 Output Parameter: 4352 + node_count - no of inodes present in the matrix. 4353 . sizes - an array of size node_count,with sizes of each inode. 4354 - limit - the max size used to generate the inodes. 4355 4356 Level: advanced 4357 4358 Notes: 4359 This routine returns some internal storage information 4360 of the matrix, it is intended to be used by advanced users. 4361 It should be called after the matrix is assembled. 4362 The contents of the sizes[] array should not be changed. 4363 NULL may be passed for information not requested. 4364 4365 .keywords: matrix, seqaij, get, inode 4366 4367 .seealso: MatGetInfo() 4368 @*/ 4369 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4370 { 4371 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4372 4373 PetscFunctionBegin; 4374 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4375 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4376 if (f) { 4377 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4378 } 4379 PetscFunctionReturn(0); 4380 } 4381 4382 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4383 { 4384 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4385 4386 PetscFunctionBegin; 4387 if (node_count) *node_count = a->inode.node_count; 4388 if (sizes) *sizes = a->inode.size; 4389 if (limit) *limit = a->inode.limit; 4390 PetscFunctionReturn(0); 4391 } 4392