1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns) 9 { 10 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 11 PetscErrorCode ierr; 12 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 13 14 PetscFunctionBegin; 15 n = A->cmap->n; 16 m = A->rmap->n; 17 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 18 ns_row = a->inode.size; 19 20 min_mn = (m < n) ? m : n; 21 if (!ns) { 22 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 23 for (; count+1 < n; count++,i++) ; 24 if (count < n) { 25 i++; 26 } 27 *size = i; 28 PetscFunctionReturn(0); 29 } 30 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 31 32 /* Use the same row structure wherever feasible. */ 33 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 34 ns_col[i] = ns_row[i]; 35 } 36 37 /* if m < n; pad up the remainder with inode_limit */ 38 for (; count+1 < n; count++,i++) { 39 ns_col[i] = 1; 40 } 41 /* The last node is the odd ball. padd it up with the remaining rows; */ 42 if (count < n) { 43 ns_col[i] = n - count; 44 i++; 45 } else if (count > n) { 46 /* Adjust for the over estimation */ 47 ns_col[i-1] += n - count; 48 } 49 *size = i; 50 *ns = ns_col; 51 PetscFunctionReturn(0); 52 } 53 54 /* 55 This builds symmetric version of nonzero structure, 56 */ 57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 58 { 59 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 60 PetscErrorCode ierr; 61 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 62 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 63 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 64 65 PetscFunctionBegin; 66 nslim_row = a->inode.node_count; 67 m = A->rmap->n; 68 n = A->cmap->n; 69 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 70 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 71 72 /* Use the row_inode as column_inode */ 73 nslim_col = nslim_row; 74 ns_col = ns_row; 75 76 /* allocate space for reformated inode structure */ 77 ierr = PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);CHKERRQ(ierr); 78 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 79 80 for (i1=0,col=0; i1<nslim_col; ++i1) { 81 nsz = ns_col[i1]; 82 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 83 } 84 /* allocate space for row pointers */ 85 ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 86 *iia = ia; 87 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 88 89 /* determine the number of columns in each row */ 90 ia[0] = oshift; 91 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 92 93 j = aj + ai[row] + ishift; 94 jmax = aj + ai[row+1] + ishift; 95 if (j==jmax) continue; /* empty row */ 96 col = *j++ + ishift; 97 i2 = tvc[col]; 98 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 99 ia[i1+1]++; 100 ia[i2+1]++; 101 i2++; /* Start col of next node */ 102 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 103 i2 = tvc[col]; 104 } 105 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 106 } 107 108 /* shift ia[i] to point to next row */ 109 for (i1=1; i1<nslim_row+1; i1++) { 110 row = ia[i1-1]; 111 ia[i1] += row; 112 work[i1-1] = row - oshift; 113 } 114 115 /* allocate space for column pointers */ 116 nz = ia[nslim_row] + (!ishift); 117 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 118 *jja = ja; 119 120 /* loop over lower triangular part putting into ja */ 121 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 122 j = aj + ai[row] + ishift; 123 jmax = aj + ai[row+1] + ishift; 124 if (j==jmax) continue; /* empty row */ 125 col = *j++ + ishift; 126 i2 = tvc[col]; 127 while (i2<i1 && j<jmax) { 128 ja[work[i2]++] = i1 + oshift; 129 ja[work[i1]++] = i2 + oshift; 130 ++i2; 131 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 132 i2 = tvc[col]; 133 } 134 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 135 136 } 137 ierr = PetscFree(work);CHKERRQ(ierr); 138 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); 139 PetscFunctionReturn(0); 140 } 141 142 /* 143 This builds nonsymmetric version of nonzero structure, 144 */ 145 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 146 { 147 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 148 PetscErrorCode ierr; 149 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 150 PetscInt *tns,*tvc,nsz,i1,i2; 151 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 152 153 PetscFunctionBegin; 154 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 155 nslim_row = a->inode.node_count; 156 n = A->cmap->n; 157 158 /* Create The column_inode for this matrix */ 159 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 160 161 /* allocate space for reformated column_inode structure */ 162 ierr = PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);CHKERRQ(ierr); 163 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 164 165 for (i1=0,col=0; i1<nslim_col; ++i1) { 166 nsz = ns_col[i1]; 167 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 168 } 169 /* allocate space for row pointers */ 170 ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 171 *iia = ia; 172 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 173 174 /* determine the number of columns in each row */ 175 ia[0] = oshift; 176 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 177 j = aj + ai[row] + ishift; 178 nz = ai[row+1] - ai[row]; 179 if (!nz) continue; /* empty row */ 180 col = *j++ + ishift; 181 i2 = tvc[col]; 182 while (nz-- > 0) { /* off-diagonal elemets */ 183 ia[i1+1]++; 184 i2++; /* Start col of next node */ 185 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 186 if (nz > 0) i2 = tvc[col]; 187 } 188 } 189 190 /* shift ia[i] to point to next row */ 191 for (i1=1; i1<nslim_row+1; i1++) { 192 row = ia[i1-1]; 193 ia[i1] += row; 194 work[i1-1] = row - oshift; 195 } 196 197 /* allocate space for column pointers */ 198 nz = ia[nslim_row] + (!ishift); 199 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 200 *jja = ja; 201 202 /* loop over matrix putting into ja */ 203 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 204 j = aj + ai[row] + ishift; 205 nz = ai[row+1] - ai[row]; 206 if (!nz) continue; /* empty row */ 207 col = *j++ + ishift; 208 i2 = tvc[col]; 209 while (nz-- > 0) { 210 ja[work[i1]++] = i2 + oshift; 211 ++i2; 212 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 213 if (nz > 0) i2 = tvc[col]; 214 } 215 } 216 ierr = PetscFree(ns_col);CHKERRQ(ierr); 217 ierr = PetscFree(work);CHKERRQ(ierr); 218 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); 219 PetscFunctionReturn(0); 220 } 221 222 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 223 { 224 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 225 PetscErrorCode ierr; 226 227 PetscFunctionBegin; 228 *n = a->inode.node_count; 229 if (!ia) PetscFunctionReturn(0); 230 if (!blockcompressed) { 231 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); 232 } else if (symmetric) { 233 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 234 } else { 235 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 236 } 237 PetscFunctionReturn(0); 238 } 239 240 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 241 { 242 PetscErrorCode ierr; 243 244 PetscFunctionBegin; 245 if (!ia) PetscFunctionReturn(0); 246 247 if (!blockcompressed) { 248 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); 249 } else { 250 ierr = PetscFree(*ia);CHKERRQ(ierr); 251 ierr = PetscFree(*ja);CHKERRQ(ierr); 252 } 253 PetscFunctionReturn(0); 254 } 255 256 /* ----------------------------------------------------------- */ 257 258 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 259 { 260 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 261 PetscErrorCode ierr; 262 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 263 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 264 265 PetscFunctionBegin; 266 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 267 nslim_row = a->inode.node_count; 268 n = A->cmap->n; 269 270 /* Create The column_inode for this matrix */ 271 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 272 273 /* allocate space for reformated column_inode structure */ 274 ierr = PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);CHKERRQ(ierr); 275 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 276 277 for (i1=0,col=0; i1<nslim_col; ++i1) { 278 nsz = ns_col[i1]; 279 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 280 } 281 /* allocate space for column pointers */ 282 ierr = PetscCalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 283 *iia = ia; 284 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 285 286 /* determine the number of columns in each row */ 287 ia[0] = oshift; 288 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 289 j = aj + ai[row] + ishift; 290 col = *j++ + ishift; 291 i2 = tvc[col]; 292 nz = ai[row+1] - ai[row]; 293 while (nz-- > 0) { /* off-diagonal elemets */ 294 /* ia[i1+1]++; */ 295 ia[i2+1]++; 296 i2++; 297 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 298 if (nz > 0) i2 = tvc[col]; 299 } 300 } 301 302 /* shift ia[i] to point to next col */ 303 for (i1=1; i1<nslim_col+1; i1++) { 304 col = ia[i1-1]; 305 ia[i1] += col; 306 work[i1-1] = col - oshift; 307 } 308 309 /* allocate space for column pointers */ 310 nz = ia[nslim_col] + (!ishift); 311 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 312 *jja = ja; 313 314 /* loop over matrix putting into ja */ 315 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 316 j = aj + ai[row] + ishift; 317 col = *j++ + ishift; 318 i2 = tvc[col]; 319 nz = ai[row+1] - ai[row]; 320 while (nz-- > 0) { 321 /* ja[work[i1]++] = i2 + oshift; */ 322 ja[work[i2]++] = i1 + oshift; 323 i2++; 324 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 325 if (nz > 0) i2 = tvc[col]; 326 } 327 } 328 ierr = PetscFree(ns_col);CHKERRQ(ierr); 329 ierr = PetscFree(work);CHKERRQ(ierr); 330 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); 331 PetscFunctionReturn(0); 332 } 333 334 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 335 { 336 PetscErrorCode ierr; 337 338 PetscFunctionBegin; 339 ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr); 340 if (!ia) PetscFunctionReturn(0); 341 342 if (!blockcompressed) { 343 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); 344 } else if (symmetric) { 345 /* Since the indices are symmetric it does'nt matter */ 346 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 347 } else { 348 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 349 } 350 PetscFunctionReturn(0); 351 } 352 353 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 354 { 355 PetscErrorCode ierr; 356 357 PetscFunctionBegin; 358 if (!ia) PetscFunctionReturn(0); 359 if (!blockcompressed) { 360 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); 361 } else { 362 ierr = PetscFree(*ia);CHKERRQ(ierr); 363 ierr = PetscFree(*ja);CHKERRQ(ierr); 364 } 365 PetscFunctionReturn(0); 366 } 367 368 /* ----------------------------------------------------------- */ 369 370 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 371 { 372 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 373 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 374 PetscScalar *y; 375 const PetscScalar *x; 376 const MatScalar *v1,*v2,*v3,*v4,*v5; 377 PetscErrorCode ierr; 378 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 379 const PetscInt *idx,*ns,*ii; 380 381 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 382 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 383 #endif 384 385 PetscFunctionBegin; 386 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 387 node_max = a->inode.node_count; 388 ns = a->inode.size; /* Node Size array */ 389 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 390 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 391 idx = a->j; 392 v1 = a->a; 393 ii = a->i; 394 395 for (i = 0,row = 0; i< node_max; ++i) { 396 nsz = ns[i]; 397 n = ii[1] - ii[0]; 398 nonzerorow += (n>0)*nsz; 399 ii += nsz; 400 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 401 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 402 sz = n; /* No of non zeros in this row */ 403 /* Switch on the size of Node */ 404 switch (nsz) { /* Each loop in 'case' is unrolled */ 405 case 1: 406 sum1 = 0.; 407 408 for (n = 0; n< sz-1; n+=2) { 409 i1 = idx[0]; /* The instructions are ordered to */ 410 i2 = idx[1]; /* make the compiler's job easy */ 411 idx += 2; 412 tmp0 = x[i1]; 413 tmp1 = x[i2]; 414 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 415 } 416 417 if (n == sz-1) { /* Take care of the last nonzero */ 418 tmp0 = x[*idx++]; 419 sum1 += *v1++ *tmp0; 420 } 421 y[row++]=sum1; 422 break; 423 case 2: 424 sum1 = 0.; 425 sum2 = 0.; 426 v2 = v1 + n; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; 430 i2 = idx[1]; 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 436 } 437 if (n == sz-1) { 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ * tmp0; 440 sum2 += *v2++ * tmp0; 441 } 442 y[row++]=sum1; 443 y[row++]=sum2; 444 v1 =v2; /* Since the next block to be processed starts there*/ 445 idx +=sz; 446 break; 447 case 3: 448 sum1 = 0.; 449 sum2 = 0.; 450 sum3 = 0.; 451 v2 = v1 + n; 452 v3 = v2 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 463 } 464 if (n == sz-1) { 465 tmp0 = x[*idx++]; 466 sum1 += *v1++ * tmp0; 467 sum2 += *v2++ * tmp0; 468 sum3 += *v3++ * tmp0; 469 } 470 y[row++]=sum1; 471 y[row++]=sum2; 472 y[row++]=sum3; 473 v1 =v3; /* Since the next block to be processed starts there*/ 474 idx +=2*sz; 475 break; 476 case 4: 477 sum1 = 0.; 478 sum2 = 0.; 479 sum3 = 0.; 480 sum4 = 0.; 481 v2 = v1 + n; 482 v3 = v2 + n; 483 v4 = v3 + n; 484 485 for (n = 0; n< sz-1; n+=2) { 486 i1 = idx[0]; 487 i2 = idx[1]; 488 idx += 2; 489 tmp0 = x[i1]; 490 tmp1 = x[i2]; 491 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 492 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 493 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 494 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 495 } 496 if (n == sz-1) { 497 tmp0 = x[*idx++]; 498 sum1 += *v1++ * tmp0; 499 sum2 += *v2++ * tmp0; 500 sum3 += *v3++ * tmp0; 501 sum4 += *v4++ * tmp0; 502 } 503 y[row++]=sum1; 504 y[row++]=sum2; 505 y[row++]=sum3; 506 y[row++]=sum4; 507 v1 =v4; /* Since the next block to be processed starts there*/ 508 idx +=3*sz; 509 break; 510 case 5: 511 sum1 = 0.; 512 sum2 = 0.; 513 sum3 = 0.; 514 sum4 = 0.; 515 sum5 = 0.; 516 v2 = v1 + n; 517 v3 = v2 + n; 518 v4 = v3 + n; 519 v5 = v4 + n; 520 521 for (n = 0; n<sz-1; n+=2) { 522 i1 = idx[0]; 523 i2 = idx[1]; 524 idx += 2; 525 tmp0 = x[i1]; 526 tmp1 = x[i2]; 527 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 528 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 529 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 530 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 531 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 532 } 533 if (n == sz-1) { 534 tmp0 = x[*idx++]; 535 sum1 += *v1++ * tmp0; 536 sum2 += *v2++ * tmp0; 537 sum3 += *v3++ * tmp0; 538 sum4 += *v4++ * tmp0; 539 sum5 += *v5++ * tmp0; 540 } 541 y[row++]=sum1; 542 y[row++]=sum2; 543 y[row++]=sum3; 544 y[row++]=sum4; 545 y[row++]=sum5; 546 v1 =v5; /* Since the next block to be processed starts there */ 547 idx +=4*sz; 548 break; 549 default: 550 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 551 } 552 } 553 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 554 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 555 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 556 PetscFunctionReturn(0); 557 } 558 /* ----------------------------------------------------------- */ 559 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 560 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 561 { 562 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 563 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 564 const MatScalar *v1,*v2,*v3,*v4,*v5; 565 const PetscScalar *x; 566 PetscScalar *y,*z,*zt; 567 PetscErrorCode ierr; 568 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 569 const PetscInt *idx,*ns,*ii; 570 571 PetscFunctionBegin; 572 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 573 node_max = a->inode.node_count; 574 ns = a->inode.size; /* Node Size array */ 575 576 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 577 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 578 zt = z; 579 580 idx = a->j; 581 v1 = a->a; 582 ii = a->i; 583 584 for (i = 0,row = 0; i< node_max; ++i) { 585 nsz = ns[i]; 586 n = ii[1] - ii[0]; 587 ii += nsz; 588 sz = n; /* No of non zeros in this row */ 589 /* Switch on the size of Node */ 590 switch (nsz) { /* Each loop in 'case' is unrolled */ 591 case 1: 592 sum1 = *zt++; 593 594 for (n = 0; n< sz-1; n+=2) { 595 i1 = idx[0]; /* The instructions are ordered to */ 596 i2 = idx[1]; /* make the compiler's job easy */ 597 idx += 2; 598 tmp0 = x[i1]; 599 tmp1 = x[i2]; 600 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 601 } 602 603 if (n == sz-1) { /* Take care of the last nonzero */ 604 tmp0 = x[*idx++]; 605 sum1 += *v1++ * tmp0; 606 } 607 y[row++]=sum1; 608 break; 609 case 2: 610 sum1 = *zt++; 611 sum2 = *zt++; 612 v2 = v1 + n; 613 614 for (n = 0; n< sz-1; n+=2) { 615 i1 = idx[0]; 616 i2 = idx[1]; 617 idx += 2; 618 tmp0 = x[i1]; 619 tmp1 = x[i2]; 620 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 621 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 622 } 623 if (n == sz-1) { 624 tmp0 = x[*idx++]; 625 sum1 += *v1++ * tmp0; 626 sum2 += *v2++ * tmp0; 627 } 628 y[row++]=sum1; 629 y[row++]=sum2; 630 v1 =v2; /* Since the next block to be processed starts there*/ 631 idx +=sz; 632 break; 633 case 3: 634 sum1 = *zt++; 635 sum2 = *zt++; 636 sum3 = *zt++; 637 v2 = v1 + n; 638 v3 = v2 + n; 639 640 for (n = 0; n< sz-1; n+=2) { 641 i1 = idx[0]; 642 i2 = idx[1]; 643 idx += 2; 644 tmp0 = x[i1]; 645 tmp1 = x[i2]; 646 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 647 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 648 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 649 } 650 if (n == sz-1) { 651 tmp0 = x[*idx++]; 652 sum1 += *v1++ * tmp0; 653 sum2 += *v2++ * tmp0; 654 sum3 += *v3++ * tmp0; 655 } 656 y[row++]=sum1; 657 y[row++]=sum2; 658 y[row++]=sum3; 659 v1 =v3; /* Since the next block to be processed starts there*/ 660 idx +=2*sz; 661 break; 662 case 4: 663 sum1 = *zt++; 664 sum2 = *zt++; 665 sum3 = *zt++; 666 sum4 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 v4 = v3 + n; 670 671 for (n = 0; n< sz-1; n+=2) { 672 i1 = idx[0]; 673 i2 = idx[1]; 674 idx += 2; 675 tmp0 = x[i1]; 676 tmp1 = x[i2]; 677 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 678 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 679 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 680 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 681 } 682 if (n == sz-1) { 683 tmp0 = x[*idx++]; 684 sum1 += *v1++ * tmp0; 685 sum2 += *v2++ * tmp0; 686 sum3 += *v3++ * tmp0; 687 sum4 += *v4++ * tmp0; 688 } 689 y[row++]=sum1; 690 y[row++]=sum2; 691 y[row++]=sum3; 692 y[row++]=sum4; 693 v1 =v4; /* Since the next block to be processed starts there*/ 694 idx +=3*sz; 695 break; 696 case 5: 697 sum1 = *zt++; 698 sum2 = *zt++; 699 sum3 = *zt++; 700 sum4 = *zt++; 701 sum5 = *zt++; 702 v2 = v1 + n; 703 v3 = v2 + n; 704 v4 = v3 + n; 705 v5 = v4 + n; 706 707 for (n = 0; n<sz-1; n+=2) { 708 i1 = idx[0]; 709 i2 = idx[1]; 710 idx += 2; 711 tmp0 = x[i1]; 712 tmp1 = x[i2]; 713 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 714 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 715 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 716 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 717 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 718 } 719 if (n == sz-1) { 720 tmp0 = x[*idx++]; 721 sum1 += *v1++ * tmp0; 722 sum2 += *v2++ * tmp0; 723 sum3 += *v3++ * tmp0; 724 sum4 += *v4++ * tmp0; 725 sum5 += *v5++ * tmp0; 726 } 727 y[row++]=sum1; 728 y[row++]=sum2; 729 y[row++]=sum3; 730 y[row++]=sum4; 731 y[row++]=sum5; 732 v1 =v5; /* Since the next block to be processed starts there */ 733 idx +=4*sz; 734 break; 735 default: 736 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 737 } 738 } 739 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 740 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 741 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 742 PetscFunctionReturn(0); 743 } 744 745 /* ----------------------------------------------------------- */ 746 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 747 { 748 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 749 IS iscol = a->col,isrow = a->row; 750 PetscErrorCode ierr; 751 const PetscInt *r,*c,*rout,*cout; 752 PetscInt i,j,n = A->rmap->n,nz; 753 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 754 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 755 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 756 PetscScalar sum1,sum2,sum3,sum4,sum5; 757 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 758 const PetscScalar *b; 759 760 PetscFunctionBegin; 761 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 762 node_max = a->inode.node_count; 763 ns = a->inode.size; /* Node Size array */ 764 765 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 766 ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr); 767 tmp = a->solve_work; 768 769 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 770 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 771 772 /* forward solve the lower triangular */ 773 tmps = tmp; 774 aa = a_a; 775 aj = a_j; 776 ad = a->diag; 777 778 for (i = 0,row = 0; i< node_max; ++i) { 779 nsz = ns[i]; 780 aii = ai[row]; 781 v1 = aa + aii; 782 vi = aj + aii; 783 nz = ad[row]- aii; 784 if (i < node_max-1) { 785 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 786 * but our indexing to determine it's size could. */ 787 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 788 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 789 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 790 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 791 } 792 793 switch (nsz) { /* Each loop in 'case' is unrolled */ 794 case 1: 795 sum1 = b[*r++]; 796 for (j=0; j<nz-1; j+=2) { 797 i0 = vi[0]; 798 i1 = vi[1]; 799 vi +=2; 800 tmp0 = tmps[i0]; 801 tmp1 = tmps[i1]; 802 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 803 } 804 if (j == nz-1) { 805 tmp0 = tmps[*vi++]; 806 sum1 -= *v1++ *tmp0; 807 } 808 tmp[row++]=sum1; 809 break; 810 case 2: 811 sum1 = b[*r++]; 812 sum2 = b[*r++]; 813 v2 = aa + ai[row+1]; 814 815 for (j=0; j<nz-1; j+=2) { 816 i0 = vi[0]; 817 i1 = vi[1]; 818 vi +=2; 819 tmp0 = tmps[i0]; 820 tmp1 = tmps[i1]; 821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 822 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 823 } 824 if (j == nz-1) { 825 tmp0 = tmps[*vi++]; 826 sum1 -= *v1++ *tmp0; 827 sum2 -= *v2++ *tmp0; 828 } 829 sum2 -= *v2++ *sum1; 830 tmp[row++]=sum1; 831 tmp[row++]=sum2; 832 break; 833 case 3: 834 sum1 = b[*r++]; 835 sum2 = b[*r++]; 836 sum3 = b[*r++]; 837 v2 = aa + ai[row+1]; 838 v3 = aa + ai[row+2]; 839 840 for (j=0; j<nz-1; j+=2) { 841 i0 = vi[0]; 842 i1 = vi[1]; 843 vi +=2; 844 tmp0 = tmps[i0]; 845 tmp1 = tmps[i1]; 846 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 847 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 848 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 849 } 850 if (j == nz-1) { 851 tmp0 = tmps[*vi++]; 852 sum1 -= *v1++ *tmp0; 853 sum2 -= *v2++ *tmp0; 854 sum3 -= *v3++ *tmp0; 855 } 856 sum2 -= *v2++ * sum1; 857 sum3 -= *v3++ * sum1; 858 sum3 -= *v3++ * sum2; 859 860 tmp[row++]=sum1; 861 tmp[row++]=sum2; 862 tmp[row++]=sum3; 863 break; 864 865 case 4: 866 sum1 = b[*r++]; 867 sum2 = b[*r++]; 868 sum3 = b[*r++]; 869 sum4 = b[*r++]; 870 v2 = aa + ai[row+1]; 871 v3 = aa + ai[row+2]; 872 v4 = aa + ai[row+3]; 873 874 for (j=0; j<nz-1; j+=2) { 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 884 } 885 if (j == nz-1) { 886 tmp0 = tmps[*vi++]; 887 sum1 -= *v1++ *tmp0; 888 sum2 -= *v2++ *tmp0; 889 sum3 -= *v3++ *tmp0; 890 sum4 -= *v4++ *tmp0; 891 } 892 sum2 -= *v2++ * sum1; 893 sum3 -= *v3++ * sum1; 894 sum4 -= *v4++ * sum1; 895 sum3 -= *v3++ * sum2; 896 sum4 -= *v4++ * sum2; 897 sum4 -= *v4++ * sum3; 898 899 tmp[row++]=sum1; 900 tmp[row++]=sum2; 901 tmp[row++]=sum3; 902 tmp[row++]=sum4; 903 break; 904 case 5: 905 sum1 = b[*r++]; 906 sum2 = b[*r++]; 907 sum3 = b[*r++]; 908 sum4 = b[*r++]; 909 sum5 = b[*r++]; 910 v2 = aa + ai[row+1]; 911 v3 = aa + ai[row+2]; 912 v4 = aa + ai[row+3]; 913 v5 = aa + ai[row+4]; 914 915 for (j=0; j<nz-1; j+=2) { 916 i0 = vi[0]; 917 i1 = vi[1]; 918 vi +=2; 919 tmp0 = tmps[i0]; 920 tmp1 = tmps[i1]; 921 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 922 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 923 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 924 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 925 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 926 } 927 if (j == nz-1) { 928 tmp0 = tmps[*vi++]; 929 sum1 -= *v1++ *tmp0; 930 sum2 -= *v2++ *tmp0; 931 sum3 -= *v3++ *tmp0; 932 sum4 -= *v4++ *tmp0; 933 sum5 -= *v5++ *tmp0; 934 } 935 936 sum2 -= *v2++ * sum1; 937 sum3 -= *v3++ * sum1; 938 sum4 -= *v4++ * sum1; 939 sum5 -= *v5++ * sum1; 940 sum3 -= *v3++ * sum2; 941 sum4 -= *v4++ * sum2; 942 sum5 -= *v5++ * sum2; 943 sum4 -= *v4++ * sum3; 944 sum5 -= *v5++ * sum3; 945 sum5 -= *v5++ * sum4; 946 947 tmp[row++]=sum1; 948 tmp[row++]=sum2; 949 tmp[row++]=sum3; 950 tmp[row++]=sum4; 951 tmp[row++]=sum5; 952 break; 953 default: 954 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 955 } 956 } 957 /* backward solve the upper triangular */ 958 for (i=node_max -1,row = n-1; i>=0; i--) { 959 nsz = ns[i]; 960 aii = ai[row+1] -1; 961 v1 = aa + aii; 962 vi = aj + aii; 963 nz = aii- ad[row]; 964 switch (nsz) { /* Each loop in 'case' is unrolled */ 965 case 1: 966 sum1 = tmp[row]; 967 968 for (j=nz; j>1; j-=2) { 969 vi -=2; 970 i0 = vi[2]; 971 i1 = vi[1]; 972 tmp0 = tmps[i0]; 973 tmp1 = tmps[i1]; 974 v1 -= 2; 975 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 976 } 977 if (j==1) { 978 tmp0 = tmps[*vi--]; 979 sum1 -= *v1-- * tmp0; 980 } 981 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 982 break; 983 case 2: 984 sum1 = tmp[row]; 985 sum2 = tmp[row -1]; 986 v2 = aa + ai[row]-1; 987 for (j=nz; j>1; j-=2) { 988 vi -=2; 989 i0 = vi[2]; 990 i1 = vi[1]; 991 tmp0 = tmps[i0]; 992 tmp1 = tmps[i1]; 993 v1 -= 2; 994 v2 -= 2; 995 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 996 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 997 } 998 if (j==1) { 999 tmp0 = tmps[*vi--]; 1000 sum1 -= *v1-- * tmp0; 1001 sum2 -= *v2-- * tmp0; 1002 } 1003 1004 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1005 sum2 -= *v2-- * tmp0; 1006 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1007 break; 1008 case 3: 1009 sum1 = tmp[row]; 1010 sum2 = tmp[row -1]; 1011 sum3 = tmp[row -2]; 1012 v2 = aa + ai[row]-1; 1013 v3 = aa + ai[row -1]-1; 1014 for (j=nz; j>1; j-=2) { 1015 vi -=2; 1016 i0 = vi[2]; 1017 i1 = vi[1]; 1018 tmp0 = tmps[i0]; 1019 tmp1 = tmps[i1]; 1020 v1 -= 2; 1021 v2 -= 2; 1022 v3 -= 2; 1023 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1024 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1025 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1026 } 1027 if (j==1) { 1028 tmp0 = tmps[*vi--]; 1029 sum1 -= *v1-- * tmp0; 1030 sum2 -= *v2-- * tmp0; 1031 sum3 -= *v3-- * tmp0; 1032 } 1033 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1034 sum2 -= *v2-- * tmp0; 1035 sum3 -= *v3-- * tmp0; 1036 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1037 sum3 -= *v3-- * tmp0; 1038 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1039 1040 break; 1041 case 4: 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 sum4 = tmp[row -3]; 1046 v2 = aa + ai[row]-1; 1047 v3 = aa + ai[row -1]-1; 1048 v4 = aa + ai[row -2]-1; 1049 1050 for (j=nz; j>1; j-=2) { 1051 vi -=2; 1052 i0 = vi[2]; 1053 i1 = vi[1]; 1054 tmp0 = tmps[i0]; 1055 tmp1 = tmps[i1]; 1056 v1 -= 2; 1057 v2 -= 2; 1058 v3 -= 2; 1059 v4 -= 2; 1060 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1061 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1062 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1063 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1064 } 1065 if (j==1) { 1066 tmp0 = tmps[*vi--]; 1067 sum1 -= *v1-- * tmp0; 1068 sum2 -= *v2-- * tmp0; 1069 sum3 -= *v3-- * tmp0; 1070 sum4 -= *v4-- * tmp0; 1071 } 1072 1073 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1074 sum2 -= *v2-- * tmp0; 1075 sum3 -= *v3-- * tmp0; 1076 sum4 -= *v4-- * tmp0; 1077 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1078 sum3 -= *v3-- * tmp0; 1079 sum4 -= *v4-- * tmp0; 1080 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1081 sum4 -= *v4-- * tmp0; 1082 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1083 break; 1084 case 5: 1085 sum1 = tmp[row]; 1086 sum2 = tmp[row -1]; 1087 sum3 = tmp[row -2]; 1088 sum4 = tmp[row -3]; 1089 sum5 = tmp[row -4]; 1090 v2 = aa + ai[row]-1; 1091 v3 = aa + ai[row -1]-1; 1092 v4 = aa + ai[row -2]-1; 1093 v5 = aa + ai[row -3]-1; 1094 for (j=nz; j>1; j-=2) { 1095 vi -= 2; 1096 i0 = vi[2]; 1097 i1 = vi[1]; 1098 tmp0 = tmps[i0]; 1099 tmp1 = tmps[i1]; 1100 v1 -= 2; 1101 v2 -= 2; 1102 v3 -= 2; 1103 v4 -= 2; 1104 v5 -= 2; 1105 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1106 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1107 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1108 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1109 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1110 } 1111 if (j==1) { 1112 tmp0 = tmps[*vi--]; 1113 sum1 -= *v1-- * tmp0; 1114 sum2 -= *v2-- * tmp0; 1115 sum3 -= *v3-- * tmp0; 1116 sum4 -= *v4-- * tmp0; 1117 sum5 -= *v5-- * tmp0; 1118 } 1119 1120 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1121 sum2 -= *v2-- * tmp0; 1122 sum3 -= *v3-- * tmp0; 1123 sum4 -= *v4-- * tmp0; 1124 sum5 -= *v5-- * tmp0; 1125 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1126 sum3 -= *v3-- * tmp0; 1127 sum4 -= *v4-- * tmp0; 1128 sum5 -= *v5-- * tmp0; 1129 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1130 sum4 -= *v4-- * tmp0; 1131 sum5 -= *v5-- * tmp0; 1132 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1133 sum5 -= *v5-- * tmp0; 1134 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1135 break; 1136 default: 1137 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1138 } 1139 } 1140 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1141 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1142 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1143 ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr); 1144 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1145 PetscFunctionReturn(0); 1146 } 1147 1148 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1149 { 1150 Mat C =B; 1151 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1152 IS isrow = b->row,isicol = b->icol; 1153 PetscErrorCode ierr; 1154 const PetscInt *r,*ic,*ics; 1155 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1156 PetscInt i,j,k,nz,nzL,row,*pj; 1157 const PetscInt *ajtmp,*bjtmp; 1158 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1159 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1160 FactorShiftCtx sctx; 1161 const PetscInt *ddiag; 1162 PetscReal rs; 1163 MatScalar d; 1164 PetscInt inod,nodesz,node_max,col; 1165 const PetscInt *ns; 1166 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1167 1168 PetscFunctionBegin; 1169 /* MatPivotSetUp(): initialize shift context sctx */ 1170 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1171 1172 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1173 ddiag = a->diag; 1174 sctx.shift_top = info->zeropivot; 1175 for (i=0; i<n; i++) { 1176 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1177 d = (aa)[ddiag[i]]; 1178 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1179 v = aa+ai[i]; 1180 nz = ai[i+1] - ai[i]; 1181 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1182 if (rs>sctx.shift_top) sctx.shift_top = rs; 1183 } 1184 sctx.shift_top *= 1.1; 1185 sctx.nshift_max = 5; 1186 sctx.shift_lo = 0.; 1187 sctx.shift_hi = 1.; 1188 } 1189 1190 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1191 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1192 1193 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1194 ics = ic; 1195 1196 node_max = a->inode.node_count; 1197 ns = a->inode.size; 1198 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1199 1200 /* If max inode size > 4, split it into two inodes.*/ 1201 /* also map the inode sizes according to the ordering */ 1202 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1203 for (i=0,j=0; i<node_max; ++i,++j) { 1204 if (ns[i] > 4) { 1205 tmp_vec1[j] = 4; 1206 ++j; 1207 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1208 } else { 1209 tmp_vec1[j] = ns[i]; 1210 } 1211 } 1212 /* Use the correct node_max */ 1213 node_max = j; 1214 1215 /* Now reorder the inode info based on mat re-ordering info */ 1216 /* First create a row -> inode_size_array_index map */ 1217 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1218 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1219 for (i=0,row=0; i<node_max; i++) { 1220 nodesz = tmp_vec1[i]; 1221 for (j=0; j<nodesz; j++,row++) { 1222 nsmap[row] = i; 1223 } 1224 } 1225 /* Using nsmap, create a reordered ns structure */ 1226 for (i=0,j=0; i< node_max; i++) { 1227 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1228 tmp_vec2[i] = nodesz; 1229 j += nodesz; 1230 } 1231 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1232 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1233 1234 /* Now use the correct ns */ 1235 ns = tmp_vec2; 1236 1237 do { 1238 sctx.newshift = PETSC_FALSE; 1239 /* Now loop over each block-row, and do the factorization */ 1240 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1241 nodesz = ns[inod]; 1242 1243 switch (nodesz) { 1244 case 1: 1245 /*----------*/ 1246 /* zero rtmp1 */ 1247 /* L part */ 1248 nz = bi[i+1] - bi[i]; 1249 bjtmp = bj + bi[i]; 1250 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1251 1252 /* U part */ 1253 nz = bdiag[i]-bdiag[i+1]; 1254 bjtmp = bj + bdiag[i+1]+1; 1255 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1256 1257 /* load in initial (unfactored row) */ 1258 nz = ai[r[i]+1] - ai[r[i]]; 1259 ajtmp = aj + ai[r[i]]; 1260 v = aa + ai[r[i]]; 1261 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1262 1263 /* ZeropivotApply() */ 1264 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1265 1266 /* elimination */ 1267 bjtmp = bj + bi[i]; 1268 row = *bjtmp++; 1269 nzL = bi[i+1] - bi[i]; 1270 for (k=0; k < nzL; k++) { 1271 pc = rtmp1 + row; 1272 if (*pc != 0.0) { 1273 pv = b->a + bdiag[row]; 1274 mul1 = *pc * (*pv); 1275 *pc = mul1; 1276 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1277 pv = b->a + bdiag[row+1]+1; 1278 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1279 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1280 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr); 1281 } 1282 row = *bjtmp++; 1283 } 1284 1285 /* finished row so stick it into b->a */ 1286 rs = 0.0; 1287 /* L part */ 1288 pv = b->a + bi[i]; 1289 pj = b->j + bi[i]; 1290 nz = bi[i+1] - bi[i]; 1291 for (j=0; j<nz; j++) { 1292 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1293 } 1294 1295 /* U part */ 1296 pv = b->a + bdiag[i+1]+1; 1297 pj = b->j + bdiag[i+1]+1; 1298 nz = bdiag[i] - bdiag[i+1]-1; 1299 for (j=0; j<nz; j++) { 1300 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1301 } 1302 1303 /* Check zero pivot */ 1304 sctx.rs = rs; 1305 sctx.pv = rtmp1[i]; 1306 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1307 if (sctx.newshift) break; 1308 1309 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1310 pv = b->a + bdiag[i]; 1311 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1312 break; 1313 1314 case 2: 1315 /*----------*/ 1316 /* zero rtmp1 and rtmp2 */ 1317 /* L part */ 1318 nz = bi[i+1] - bi[i]; 1319 bjtmp = bj + bi[i]; 1320 for (j=0; j<nz; j++) { 1321 col = bjtmp[j]; 1322 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1323 } 1324 1325 /* U part */ 1326 nz = bdiag[i]-bdiag[i+1]; 1327 bjtmp = bj + bdiag[i+1]+1; 1328 for (j=0; j<nz; j++) { 1329 col = bjtmp[j]; 1330 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1331 } 1332 1333 /* load in initial (unfactored row) */ 1334 nz = ai[r[i]+1] - ai[r[i]]; 1335 ajtmp = aj + ai[r[i]]; 1336 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1337 for (j=0; j<nz; j++) { 1338 col = ics[ajtmp[j]]; 1339 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1340 } 1341 /* ZeropivotApply(): shift the diagonal of the matrix */ 1342 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1343 1344 /* elimination */ 1345 bjtmp = bj + bi[i]; 1346 row = *bjtmp++; /* pivot row */ 1347 nzL = bi[i+1] - bi[i]; 1348 for (k=0; k < nzL; k++) { 1349 pc1 = rtmp1 + row; 1350 pc2 = rtmp2 + row; 1351 if (*pc1 != 0.0 || *pc2 != 0.0) { 1352 pv = b->a + bdiag[row]; 1353 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1354 *pc1 = mul1; *pc2 = mul2; 1355 1356 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1357 pv = b->a + bdiag[row+1]+1; 1358 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1359 for (j=0; j<nz; j++) { 1360 col = pj[j]; 1361 rtmp1[col] -= mul1 * pv[j]; 1362 rtmp2[col] -= mul2 * pv[j]; 1363 } 1364 ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr); 1365 } 1366 row = *bjtmp++; 1367 } 1368 1369 /* finished row i; check zero pivot, then stick row i into b->a */ 1370 rs = 0.0; 1371 /* L part */ 1372 pc1 = b->a + bi[i]; 1373 pj = b->j + bi[i]; 1374 nz = bi[i+1] - bi[i]; 1375 for (j=0; j<nz; j++) { 1376 col = pj[j]; 1377 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1378 } 1379 /* U part */ 1380 pc1 = b->a + bdiag[i+1]+1; 1381 pj = b->j + bdiag[i+1]+1; 1382 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1383 for (j=0; j<nz; j++) { 1384 col = pj[j]; 1385 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1386 } 1387 1388 sctx.rs = rs; 1389 sctx.pv = rtmp1[i]; 1390 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1391 if (sctx.newshift) break; 1392 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1393 *pc1 = 1.0/sctx.pv; 1394 1395 /* Now take care of diagonal 2x2 block. */ 1396 pc2 = rtmp2 + i; 1397 if (*pc2 != 0.0) { 1398 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1399 *pc2 = mul1; /* insert L entry */ 1400 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1401 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1402 for (j=0; j<nz; j++) { 1403 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1404 } 1405 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr); 1406 } 1407 1408 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1409 rs = 0.0; 1410 /* L part */ 1411 pc2 = b->a + bi[i+1]; 1412 pj = b->j + bi[i+1]; 1413 nz = bi[i+2] - bi[i+1]; 1414 for (j=0; j<nz; j++) { 1415 col = pj[j]; 1416 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1417 } 1418 /* U part */ 1419 pc2 = b->a + bdiag[i+2]+1; 1420 pj = b->j + bdiag[i+2]+1; 1421 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1422 for (j=0; j<nz; j++) { 1423 col = pj[j]; 1424 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1425 } 1426 1427 sctx.rs = rs; 1428 sctx.pv = rtmp2[i+1]; 1429 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1430 if (sctx.newshift) break; 1431 pc2 = b->a + bdiag[i+1]; 1432 *pc2 = 1.0/sctx.pv; 1433 break; 1434 1435 case 3: 1436 /*----------*/ 1437 /* zero rtmp */ 1438 /* L part */ 1439 nz = bi[i+1] - bi[i]; 1440 bjtmp = bj + bi[i]; 1441 for (j=0; j<nz; j++) { 1442 col = bjtmp[j]; 1443 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1444 } 1445 1446 /* U part */ 1447 nz = bdiag[i]-bdiag[i+1]; 1448 bjtmp = bj + bdiag[i+1]+1; 1449 for (j=0; j<nz; j++) { 1450 col = bjtmp[j]; 1451 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1452 } 1453 1454 /* load in initial (unfactored row) */ 1455 nz = ai[r[i]+1] - ai[r[i]]; 1456 ajtmp = aj + ai[r[i]]; 1457 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1458 for (j=0; j<nz; j++) { 1459 col = ics[ajtmp[j]]; 1460 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1461 } 1462 /* ZeropivotApply(): shift the diagonal of the matrix */ 1463 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1464 1465 /* elimination */ 1466 bjtmp = bj + bi[i]; 1467 row = *bjtmp++; /* pivot row */ 1468 nzL = bi[i+1] - bi[i]; 1469 for (k=0; k < nzL; k++) { 1470 pc1 = rtmp1 + row; 1471 pc2 = rtmp2 + row; 1472 pc3 = rtmp3 + row; 1473 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1474 pv = b->a + bdiag[row]; 1475 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1476 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1477 1478 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1479 pv = b->a + bdiag[row+1]+1; 1480 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1481 for (j=0; j<nz; j++) { 1482 col = pj[j]; 1483 rtmp1[col] -= mul1 * pv[j]; 1484 rtmp2[col] -= mul2 * pv[j]; 1485 rtmp3[col] -= mul3 * pv[j]; 1486 } 1487 ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr); 1488 } 1489 row = *bjtmp++; 1490 } 1491 1492 /* finished row i; check zero pivot, then stick row i into b->a */ 1493 rs = 0.0; 1494 /* L part */ 1495 pc1 = b->a + bi[i]; 1496 pj = b->j + bi[i]; 1497 nz = bi[i+1] - bi[i]; 1498 for (j=0; j<nz; j++) { 1499 col = pj[j]; 1500 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1501 } 1502 /* U part */ 1503 pc1 = b->a + bdiag[i+1]+1; 1504 pj = b->j + bdiag[i+1]+1; 1505 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1506 for (j=0; j<nz; j++) { 1507 col = pj[j]; 1508 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1509 } 1510 1511 sctx.rs = rs; 1512 sctx.pv = rtmp1[i]; 1513 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1514 if (sctx.newshift) break; 1515 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1516 *pc1 = 1.0/sctx.pv; 1517 1518 /* Now take care of 1st column of diagonal 3x3 block. */ 1519 pc2 = rtmp2 + i; 1520 pc3 = rtmp3 + i; 1521 if (*pc2 != 0.0 || *pc3 != 0.0) { 1522 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1523 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1524 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1525 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1526 for (j=0; j<nz; j++) { 1527 col = pj[j]; 1528 rtmp2[col] -= mul2 * rtmp1[col]; 1529 rtmp3[col] -= mul3 * rtmp1[col]; 1530 } 1531 ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr); 1532 } 1533 1534 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1535 rs = 0.0; 1536 /* L part */ 1537 pc2 = b->a + bi[i+1]; 1538 pj = b->j + bi[i+1]; 1539 nz = bi[i+2] - bi[i+1]; 1540 for (j=0; j<nz; j++) { 1541 col = pj[j]; 1542 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1543 } 1544 /* U part */ 1545 pc2 = b->a + bdiag[i+2]+1; 1546 pj = b->j + bdiag[i+2]+1; 1547 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1548 for (j=0; j<nz; j++) { 1549 col = pj[j]; 1550 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1551 } 1552 1553 sctx.rs = rs; 1554 sctx.pv = rtmp2[i+1]; 1555 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1556 if (sctx.newshift) break; 1557 pc2 = b->a + bdiag[i+1]; 1558 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1559 1560 /* Now take care of 2nd column of diagonal 3x3 block. */ 1561 pc3 = rtmp3 + i+1; 1562 if (*pc3 != 0.0) { 1563 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1564 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1565 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1566 for (j=0; j<nz; j++) { 1567 col = pj[j]; 1568 rtmp3[col] -= mul3 * rtmp2[col]; 1569 } 1570 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr); 1571 } 1572 1573 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1574 rs = 0.0; 1575 /* L part */ 1576 pc3 = b->a + bi[i+2]; 1577 pj = b->j + bi[i+2]; 1578 nz = bi[i+3] - bi[i+2]; 1579 for (j=0; j<nz; j++) { 1580 col = pj[j]; 1581 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1582 } 1583 /* U part */ 1584 pc3 = b->a + bdiag[i+3]+1; 1585 pj = b->j + bdiag[i+3]+1; 1586 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1587 for (j=0; j<nz; j++) { 1588 col = pj[j]; 1589 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1590 } 1591 1592 sctx.rs = rs; 1593 sctx.pv = rtmp3[i+2]; 1594 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1595 if (sctx.newshift) break; 1596 pc3 = b->a + bdiag[i+2]; 1597 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1598 break; 1599 case 4: 1600 /*----------*/ 1601 /* zero rtmp */ 1602 /* L part */ 1603 nz = bi[i+1] - bi[i]; 1604 bjtmp = bj + bi[i]; 1605 for (j=0; j<nz; j++) { 1606 col = bjtmp[j]; 1607 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1608 } 1609 1610 /* U part */ 1611 nz = bdiag[i]-bdiag[i+1]; 1612 bjtmp = bj + bdiag[i+1]+1; 1613 for (j=0; j<nz; j++) { 1614 col = bjtmp[j]; 1615 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1616 } 1617 1618 /* load in initial (unfactored row) */ 1619 nz = ai[r[i]+1] - ai[r[i]]; 1620 ajtmp = aj + ai[r[i]]; 1621 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1622 for (j=0; j<nz; j++) { 1623 col = ics[ajtmp[j]]; 1624 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1625 } 1626 /* ZeropivotApply(): shift the diagonal of the matrix */ 1627 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1628 1629 /* elimination */ 1630 bjtmp = bj + bi[i]; 1631 row = *bjtmp++; /* pivot row */ 1632 nzL = bi[i+1] - bi[i]; 1633 for (k=0; k < nzL; k++) { 1634 pc1 = rtmp1 + row; 1635 pc2 = rtmp2 + row; 1636 pc3 = rtmp3 + row; 1637 pc4 = rtmp4 + row; 1638 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1639 pv = b->a + bdiag[row]; 1640 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1641 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1642 1643 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1644 pv = b->a + bdiag[row+1]+1; 1645 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1646 for (j=0; j<nz; j++) { 1647 col = pj[j]; 1648 rtmp1[col] -= mul1 * pv[j]; 1649 rtmp2[col] -= mul2 * pv[j]; 1650 rtmp3[col] -= mul3 * pv[j]; 1651 rtmp4[col] -= mul4 * pv[j]; 1652 } 1653 ierr = PetscLogFlops(4+8.0*nz);CHKERRQ(ierr); 1654 } 1655 row = *bjtmp++; 1656 } 1657 1658 /* finished row i; check zero pivot, then stick row i into b->a */ 1659 rs = 0.0; 1660 /* L part */ 1661 pc1 = b->a + bi[i]; 1662 pj = b->j + bi[i]; 1663 nz = bi[i+1] - bi[i]; 1664 for (j=0; j<nz; j++) { 1665 col = pj[j]; 1666 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1667 } 1668 /* U part */ 1669 pc1 = b->a + bdiag[i+1]+1; 1670 pj = b->j + bdiag[i+1]+1; 1671 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1672 for (j=0; j<nz; j++) { 1673 col = pj[j]; 1674 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1675 } 1676 1677 sctx.rs = rs; 1678 sctx.pv = rtmp1[i]; 1679 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1680 if (sctx.newshift) break; 1681 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1682 *pc1 = 1.0/sctx.pv; 1683 1684 /* Now take care of 1st column of diagonal 4x4 block. */ 1685 pc2 = rtmp2 + i; 1686 pc3 = rtmp3 + i; 1687 pc4 = rtmp4 + i; 1688 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1689 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1690 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1691 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1692 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1693 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1694 for (j=0; j<nz; j++) { 1695 col = pj[j]; 1696 rtmp2[col] -= mul2 * rtmp1[col]; 1697 rtmp3[col] -= mul3 * rtmp1[col]; 1698 rtmp4[col] -= mul4 * rtmp1[col]; 1699 } 1700 ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr); 1701 } 1702 1703 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1704 rs = 0.0; 1705 /* L part */ 1706 pc2 = b->a + bi[i+1]; 1707 pj = b->j + bi[i+1]; 1708 nz = bi[i+2] - bi[i+1]; 1709 for (j=0; j<nz; j++) { 1710 col = pj[j]; 1711 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1712 } 1713 /* U part */ 1714 pc2 = b->a + bdiag[i+2]+1; 1715 pj = b->j + bdiag[i+2]+1; 1716 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1717 for (j=0; j<nz; j++) { 1718 col = pj[j]; 1719 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1720 } 1721 1722 sctx.rs = rs; 1723 sctx.pv = rtmp2[i+1]; 1724 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1725 if (sctx.newshift) break; 1726 pc2 = b->a + bdiag[i+1]; 1727 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1728 1729 /* Now take care of 2nd column of diagonal 4x4 block. */ 1730 pc3 = rtmp3 + i+1; 1731 pc4 = rtmp4 + i+1; 1732 if (*pc3 != 0.0 || *pc4 != 0.0) { 1733 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1734 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1735 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1736 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1737 for (j=0; j<nz; j++) { 1738 col = pj[j]; 1739 rtmp3[col] -= mul3 * rtmp2[col]; 1740 rtmp4[col] -= mul4 * rtmp2[col]; 1741 } 1742 ierr = PetscLogFlops(4.0*nz);CHKERRQ(ierr); 1743 } 1744 1745 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1746 rs = 0.0; 1747 /* L part */ 1748 pc3 = b->a + bi[i+2]; 1749 pj = b->j + bi[i+2]; 1750 nz = bi[i+3] - bi[i+2]; 1751 for (j=0; j<nz; j++) { 1752 col = pj[j]; 1753 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1754 } 1755 /* U part */ 1756 pc3 = b->a + bdiag[i+3]+1; 1757 pj = b->j + bdiag[i+3]+1; 1758 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1759 for (j=0; j<nz; j++) { 1760 col = pj[j]; 1761 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1762 } 1763 1764 sctx.rs = rs; 1765 sctx.pv = rtmp3[i+2]; 1766 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1767 if (sctx.newshift) break; 1768 pc3 = b->a + bdiag[i+2]; 1769 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1770 1771 /* Now take care of 3rd column of diagonal 4x4 block. */ 1772 pc4 = rtmp4 + i+2; 1773 if (*pc4 != 0.0) { 1774 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1775 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1776 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1777 for (j=0; j<nz; j++) { 1778 col = pj[j]; 1779 rtmp4[col] -= mul4 * rtmp3[col]; 1780 } 1781 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr); 1782 } 1783 1784 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1785 rs = 0.0; 1786 /* L part */ 1787 pc4 = b->a + bi[i+3]; 1788 pj = b->j + bi[i+3]; 1789 nz = bi[i+4] - bi[i+3]; 1790 for (j=0; j<nz; j++) { 1791 col = pj[j]; 1792 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1793 } 1794 /* U part */ 1795 pc4 = b->a + bdiag[i+4]+1; 1796 pj = b->j + bdiag[i+4]+1; 1797 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1798 for (j=0; j<nz; j++) { 1799 col = pj[j]; 1800 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1801 } 1802 1803 sctx.rs = rs; 1804 sctx.pv = rtmp4[i+3]; 1805 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr); 1806 if (sctx.newshift) break; 1807 pc4 = b->a + bdiag[i+3]; 1808 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1809 break; 1810 1811 default: 1812 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1813 } 1814 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1815 i += nodesz; /* Update the row */ 1816 } 1817 1818 /* MatPivotRefine() */ 1819 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1820 /* 1821 * if no shift in this attempt & shifting & started shifting & can refine, 1822 * then try lower shift 1823 */ 1824 sctx.shift_hi = sctx.shift_fraction; 1825 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1826 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1827 sctx.newshift = PETSC_TRUE; 1828 sctx.nshift++; 1829 } 1830 } while (sctx.newshift); 1831 1832 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1833 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1834 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1835 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1836 1837 if (b->inode.size) { 1838 C->ops->solve = MatSolve_SeqAIJ_Inode; 1839 } else { 1840 C->ops->solve = MatSolve_SeqAIJ; 1841 } 1842 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1843 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1844 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1845 C->ops->matsolve = MatMatSolve_SeqAIJ; 1846 C->assembled = PETSC_TRUE; 1847 C->preallocated = PETSC_TRUE; 1848 1849 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1850 1851 /* MatShiftView(A,info,&sctx) */ 1852 if (sctx.nshift) { 1853 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1854 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1855 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1856 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1857 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1858 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1859 } 1860 } 1861 PetscFunctionReturn(0); 1862 } 1863 1864 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1865 { 1866 Mat C = B; 1867 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1868 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1869 PetscErrorCode ierr; 1870 const PetscInt *r,*ic,*c,*ics; 1871 PetscInt n = A->rmap->n,*bi = b->i; 1872 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1873 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1874 PetscInt *ai = a->i,*aj = a->j; 1875 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1876 PetscScalar mul1,mul2,mul3,tmp; 1877 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1878 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1879 PetscReal rs=0.0; 1880 FactorShiftCtx sctx; 1881 1882 PetscFunctionBegin; 1883 sctx.shift_top = 0; 1884 sctx.nshift_max = 0; 1885 sctx.shift_lo = 0; 1886 sctx.shift_hi = 0; 1887 sctx.shift_fraction = 0; 1888 1889 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1890 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1891 sctx.shift_top = 0; 1892 for (i=0; i<n; i++) { 1893 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1894 rs = 0.0; 1895 ajtmp = aj + ai[i]; 1896 rtmp1 = aa + ai[i]; 1897 nz = ai[i+1] - ai[i]; 1898 for (j=0; j<nz; j++) { 1899 if (*ajtmp != i) { 1900 rs += PetscAbsScalar(*rtmp1++); 1901 } else { 1902 rs -= PetscRealPart(*rtmp1++); 1903 } 1904 ajtmp++; 1905 } 1906 if (rs>sctx.shift_top) sctx.shift_top = rs; 1907 } 1908 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1909 sctx.shift_top *= 1.1; 1910 sctx.nshift_max = 5; 1911 sctx.shift_lo = 0.; 1912 sctx.shift_hi = 1.; 1913 } 1914 sctx.shift_amount = 0; 1915 sctx.nshift = 0; 1916 1917 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1918 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1919 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1920 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1921 ics = ic; 1922 1923 node_max = a->inode.node_count; 1924 ns = a->inode.size; 1925 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1926 1927 /* If max inode size > 3, split it into two inodes.*/ 1928 /* also map the inode sizes according to the ordering */ 1929 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1930 for (i=0,j=0; i<node_max; ++i,++j) { 1931 if (ns[i]>3) { 1932 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1933 ++j; 1934 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1935 } else { 1936 tmp_vec1[j] = ns[i]; 1937 } 1938 } 1939 /* Use the correct node_max */ 1940 node_max = j; 1941 1942 /* Now reorder the inode info based on mat re-ordering info */ 1943 /* First create a row -> inode_size_array_index map */ 1944 ierr = PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);CHKERRQ(ierr); 1945 for (i=0,row=0; i<node_max; i++) { 1946 nodesz = tmp_vec1[i]; 1947 for (j=0; j<nodesz; j++,row++) { 1948 nsmap[row] = i; 1949 } 1950 } 1951 /* Using nsmap, create a reordered ns structure */ 1952 for (i=0,j=0; i< node_max; i++) { 1953 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1954 tmp_vec2[i] = nodesz; 1955 j += nodesz; 1956 } 1957 ierr = PetscFree2(nsmap,tmp_vec1);CHKERRQ(ierr); 1958 /* Now use the correct ns */ 1959 ns = tmp_vec2; 1960 1961 do { 1962 sctx.newshift = PETSC_FALSE; 1963 /* Now loop over each block-row, and do the factorization */ 1964 for (i=0,row=0; i<node_max; i++) { 1965 nodesz = ns[i]; 1966 nz = bi[row+1] - bi[row]; 1967 bjtmp = bj + bi[row]; 1968 1969 switch (nodesz) { 1970 case 1: 1971 for (j=0; j<nz; j++) { 1972 idx = bjtmp[j]; 1973 rtmp11[idx] = 0.0; 1974 } 1975 1976 /* load in initial (unfactored row) */ 1977 idx = r[row]; 1978 nz_tmp = ai[idx+1] - ai[idx]; 1979 ajtmp = aj + ai[idx]; 1980 v1 = aa + ai[idx]; 1981 1982 for (j=0; j<nz_tmp; j++) { 1983 idx = ics[ajtmp[j]]; 1984 rtmp11[idx] = v1[j]; 1985 } 1986 rtmp11[ics[r[row]]] += sctx.shift_amount; 1987 1988 prow = *bjtmp++; 1989 while (prow < row) { 1990 pc1 = rtmp11 + prow; 1991 if (*pc1 != 0.0) { 1992 pv = ba + bd[prow]; 1993 pj = nbj + bd[prow]; 1994 mul1 = *pc1 * *pv++; 1995 *pc1 = mul1; 1996 nz_tmp = bi[prow+1] - bd[prow] - 1; 1997 ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr); 1998 for (j=0; j<nz_tmp; j++) { 1999 tmp = pv[j]; 2000 idx = pj[j]; 2001 rtmp11[idx] -= mul1 * tmp; 2002 } 2003 } 2004 prow = *bjtmp++; 2005 } 2006 pj = bj + bi[row]; 2007 pc1 = ba + bi[row]; 2008 2009 sctx.pv = rtmp11[row]; 2010 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2011 rs = 0.0; 2012 for (j=0; j<nz; j++) { 2013 idx = pj[j]; 2014 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2015 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2016 } 2017 sctx.rs = rs; 2018 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2019 if (sctx.newshift) goto endofwhile; 2020 break; 2021 2022 case 2: 2023 for (j=0; j<nz; j++) { 2024 idx = bjtmp[j]; 2025 rtmp11[idx] = 0.0; 2026 rtmp22[idx] = 0.0; 2027 } 2028 2029 /* load in initial (unfactored row) */ 2030 idx = r[row]; 2031 nz_tmp = ai[idx+1] - ai[idx]; 2032 ajtmp = aj + ai[idx]; 2033 v1 = aa + ai[idx]; 2034 v2 = aa + ai[idx+1]; 2035 for (j=0; j<nz_tmp; j++) { 2036 idx = ics[ajtmp[j]]; 2037 rtmp11[idx] = v1[j]; 2038 rtmp22[idx] = v2[j]; 2039 } 2040 rtmp11[ics[r[row]]] += sctx.shift_amount; 2041 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2042 2043 prow = *bjtmp++; 2044 while (prow < row) { 2045 pc1 = rtmp11 + prow; 2046 pc2 = rtmp22 + prow; 2047 if (*pc1 != 0.0 || *pc2 != 0.0) { 2048 pv = ba + bd[prow]; 2049 pj = nbj + bd[prow]; 2050 mul1 = *pc1 * *pv; 2051 mul2 = *pc2 * *pv; 2052 ++pv; 2053 *pc1 = mul1; 2054 *pc2 = mul2; 2055 2056 nz_tmp = bi[prow+1] - bd[prow] - 1; 2057 for (j=0; j<nz_tmp; j++) { 2058 tmp = pv[j]; 2059 idx = pj[j]; 2060 rtmp11[idx] -= mul1 * tmp; 2061 rtmp22[idx] -= mul2 * tmp; 2062 } 2063 ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr); 2064 } 2065 prow = *bjtmp++; 2066 } 2067 2068 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2069 pc1 = rtmp11 + prow; 2070 pc2 = rtmp22 + prow; 2071 2072 sctx.pv = *pc1; 2073 pj = bj + bi[prow]; 2074 rs = 0.0; 2075 for (j=0; j<nz; j++) { 2076 idx = pj[j]; 2077 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2078 } 2079 sctx.rs = rs; 2080 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2081 if (sctx.newshift) goto endofwhile; 2082 2083 if (*pc2 != 0.0) { 2084 pj = nbj + bd[prow]; 2085 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2086 *pc2 = mul2; 2087 nz_tmp = bi[prow+1] - bd[prow] - 1; 2088 for (j=0; j<nz_tmp; j++) { 2089 idx = pj[j]; 2090 tmp = rtmp11[idx]; 2091 rtmp22[idx] -= mul2 * tmp; 2092 } 2093 ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr); 2094 } 2095 2096 pj = bj + bi[row]; 2097 pc1 = ba + bi[row]; 2098 pc2 = ba + bi[row+1]; 2099 2100 sctx.pv = rtmp22[row+1]; 2101 rs = 0.0; 2102 rtmp11[row] = 1.0/rtmp11[row]; 2103 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2104 /* copy row entries from dense representation to sparse */ 2105 for (j=0; j<nz; j++) { 2106 idx = pj[j]; 2107 pc1[j] = rtmp11[idx]; 2108 pc2[j] = rtmp22[idx]; 2109 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2110 } 2111 sctx.rs = rs; 2112 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2113 if (sctx.newshift) goto endofwhile; 2114 break; 2115 2116 case 3: 2117 for (j=0; j<nz; j++) { 2118 idx = bjtmp[j]; 2119 rtmp11[idx] = 0.0; 2120 rtmp22[idx] = 0.0; 2121 rtmp33[idx] = 0.0; 2122 } 2123 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2124 idx = r[row]; 2125 nz_tmp = ai[idx+1] - ai[idx]; 2126 ajtmp = aj + ai[idx]; 2127 v1 = aa + ai[idx]; 2128 v2 = aa + ai[idx+1]; 2129 v3 = aa + ai[idx+2]; 2130 for (j=0; j<nz_tmp; j++) { 2131 idx = ics[ajtmp[j]]; 2132 rtmp11[idx] = v1[j]; 2133 rtmp22[idx] = v2[j]; 2134 rtmp33[idx] = v3[j]; 2135 } 2136 rtmp11[ics[r[row]]] += sctx.shift_amount; 2137 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2138 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2139 2140 /* loop over all pivot row blocks above this row block */ 2141 prow = *bjtmp++; 2142 while (prow < row) { 2143 pc1 = rtmp11 + prow; 2144 pc2 = rtmp22 + prow; 2145 pc3 = rtmp33 + prow; 2146 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2147 pv = ba + bd[prow]; 2148 pj = nbj + bd[prow]; 2149 mul1 = *pc1 * *pv; 2150 mul2 = *pc2 * *pv; 2151 mul3 = *pc3 * *pv; 2152 ++pv; 2153 *pc1 = mul1; 2154 *pc2 = mul2; 2155 *pc3 = mul3; 2156 2157 nz_tmp = bi[prow+1] - bd[prow] - 1; 2158 /* update this row based on pivot row */ 2159 for (j=0; j<nz_tmp; j++) { 2160 tmp = pv[j]; 2161 idx = pj[j]; 2162 rtmp11[idx] -= mul1 * tmp; 2163 rtmp22[idx] -= mul2 * tmp; 2164 rtmp33[idx] -= mul3 * tmp; 2165 } 2166 ierr = PetscLogFlops(3+6.0*nz_tmp);CHKERRQ(ierr); 2167 } 2168 prow = *bjtmp++; 2169 } 2170 2171 /* Now take care of diagonal 3x3 block in this set of rows */ 2172 /* note: prow = row here */ 2173 pc1 = rtmp11 + prow; 2174 pc2 = rtmp22 + prow; 2175 pc3 = rtmp33 + prow; 2176 2177 sctx.pv = *pc1; 2178 pj = bj + bi[prow]; 2179 rs = 0.0; 2180 for (j=0; j<nz; j++) { 2181 idx = pj[j]; 2182 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2183 } 2184 sctx.rs = rs; 2185 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2186 if (sctx.newshift) goto endofwhile; 2187 2188 if (*pc2 != 0.0 || *pc3 != 0.0) { 2189 mul2 = (*pc2)/(*pc1); 2190 mul3 = (*pc3)/(*pc1); 2191 *pc2 = mul2; 2192 *pc3 = mul3; 2193 nz_tmp = bi[prow+1] - bd[prow] - 1; 2194 pj = nbj + bd[prow]; 2195 for (j=0; j<nz_tmp; j++) { 2196 idx = pj[j]; 2197 tmp = rtmp11[idx]; 2198 rtmp22[idx] -= mul2 * tmp; 2199 rtmp33[idx] -= mul3 * tmp; 2200 } 2201 ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr); 2202 } 2203 ++prow; 2204 2205 pc2 = rtmp22 + prow; 2206 pc3 = rtmp33 + prow; 2207 sctx.pv = *pc2; 2208 pj = bj + bi[prow]; 2209 rs = 0.0; 2210 for (j=0; j<nz; j++) { 2211 idx = pj[j]; 2212 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2213 } 2214 sctx.rs = rs; 2215 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2216 if (sctx.newshift) goto endofwhile; 2217 2218 if (*pc3 != 0.0) { 2219 mul3 = (*pc3)/(*pc2); 2220 *pc3 = mul3; 2221 pj = nbj + bd[prow]; 2222 nz_tmp = bi[prow+1] - bd[prow] - 1; 2223 for (j=0; j<nz_tmp; j++) { 2224 idx = pj[j]; 2225 tmp = rtmp22[idx]; 2226 rtmp33[idx] -= mul3 * tmp; 2227 } 2228 ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr); 2229 } 2230 2231 pj = bj + bi[row]; 2232 pc1 = ba + bi[row]; 2233 pc2 = ba + bi[row+1]; 2234 pc3 = ba + bi[row+2]; 2235 2236 sctx.pv = rtmp33[row+2]; 2237 rs = 0.0; 2238 rtmp11[row] = 1.0/rtmp11[row]; 2239 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2240 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2241 /* copy row entries from dense representation to sparse */ 2242 for (j=0; j<nz; j++) { 2243 idx = pj[j]; 2244 pc1[j] = rtmp11[idx]; 2245 pc2[j] = rtmp22[idx]; 2246 pc3[j] = rtmp33[idx]; 2247 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2248 } 2249 2250 sctx.rs = rs; 2251 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr); 2252 if (sctx.newshift) goto endofwhile; 2253 break; 2254 2255 default: 2256 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2257 } 2258 row += nodesz; /* Update the row */ 2259 } 2260 endofwhile:; 2261 } while (sctx.newshift); 2262 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2263 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2264 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2265 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2266 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2267 2268 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2269 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2270 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2271 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2272 C->assembled = PETSC_TRUE; 2273 C->preallocated = PETSC_TRUE; 2274 if (sctx.nshift) { 2275 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2276 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2277 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2278 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2279 } 2280 } 2281 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2282 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2283 PetscFunctionReturn(0); 2284 } 2285 2286 /* ----------------------------------------------------------- */ 2287 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2288 { 2289 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2290 IS iscol = a->col,isrow = a->row; 2291 PetscErrorCode ierr; 2292 const PetscInt *r,*c,*rout,*cout; 2293 PetscInt i,j,n = A->rmap->n; 2294 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2295 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2296 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2297 PetscScalar sum1,sum2,sum3,sum4,sum5; 2298 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2299 const PetscScalar *b; 2300 2301 PetscFunctionBegin; 2302 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2303 node_max = a->inode.node_count; 2304 ns = a->inode.size; /* Node Size array */ 2305 2306 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2307 ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr); 2308 tmp = a->solve_work; 2309 2310 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2311 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2312 2313 /* forward solve the lower triangular */ 2314 tmps = tmp; 2315 aa = a_a; 2316 aj = a_j; 2317 ad = a->diag; 2318 2319 for (i = 0,row = 0; i< node_max; ++i) { 2320 nsz = ns[i]; 2321 aii = ai[row]; 2322 v1 = aa + aii; 2323 vi = aj + aii; 2324 nz = ai[row+1]- ai[row]; 2325 2326 if (i < node_max-1) { 2327 /* Prefetch the indices for the next block */ 2328 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2329 /* Prefetch the data for the next block */ 2330 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2331 } 2332 2333 switch (nsz) { /* Each loop in 'case' is unrolled */ 2334 case 1: 2335 sum1 = b[r[row]]; 2336 for (j=0; j<nz-1; j+=2) { 2337 i0 = vi[j]; 2338 i1 = vi[j+1]; 2339 tmp0 = tmps[i0]; 2340 tmp1 = tmps[i1]; 2341 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2342 } 2343 if (j == nz-1) { 2344 tmp0 = tmps[vi[j]]; 2345 sum1 -= v1[j]*tmp0; 2346 } 2347 tmp[row++]=sum1; 2348 break; 2349 case 2: 2350 sum1 = b[r[row]]; 2351 sum2 = b[r[row+1]]; 2352 v2 = aa + ai[row+1]; 2353 2354 for (j=0; j<nz-1; j+=2) { 2355 i0 = vi[j]; 2356 i1 = vi[j+1]; 2357 tmp0 = tmps[i0]; 2358 tmp1 = tmps[i1]; 2359 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2360 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2361 } 2362 if (j == nz-1) { 2363 tmp0 = tmps[vi[j]]; 2364 sum1 -= v1[j] *tmp0; 2365 sum2 -= v2[j] *tmp0; 2366 } 2367 sum2 -= v2[nz] * sum1; 2368 tmp[row++]=sum1; 2369 tmp[row++]=sum2; 2370 break; 2371 case 3: 2372 sum1 = b[r[row]]; 2373 sum2 = b[r[row+1]]; 2374 sum3 = b[r[row+2]]; 2375 v2 = aa + ai[row+1]; 2376 v3 = aa + ai[row+2]; 2377 2378 for (j=0; j<nz-1; j+=2) { 2379 i0 = vi[j]; 2380 i1 = vi[j+1]; 2381 tmp0 = tmps[i0]; 2382 tmp1 = tmps[i1]; 2383 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2384 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2385 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2386 } 2387 if (j == nz-1) { 2388 tmp0 = tmps[vi[j]]; 2389 sum1 -= v1[j] *tmp0; 2390 sum2 -= v2[j] *tmp0; 2391 sum3 -= v3[j] *tmp0; 2392 } 2393 sum2 -= v2[nz] * sum1; 2394 sum3 -= v3[nz] * sum1; 2395 sum3 -= v3[nz+1] * sum2; 2396 tmp[row++]=sum1; 2397 tmp[row++]=sum2; 2398 tmp[row++]=sum3; 2399 break; 2400 2401 case 4: 2402 sum1 = b[r[row]]; 2403 sum2 = b[r[row+1]]; 2404 sum3 = b[r[row+2]]; 2405 sum4 = b[r[row+3]]; 2406 v2 = aa + ai[row+1]; 2407 v3 = aa + ai[row+2]; 2408 v4 = aa + ai[row+3]; 2409 2410 for (j=0; j<nz-1; j+=2) { 2411 i0 = vi[j]; 2412 i1 = vi[j+1]; 2413 tmp0 = tmps[i0]; 2414 tmp1 = tmps[i1]; 2415 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2416 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2417 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2418 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2419 } 2420 if (j == nz-1) { 2421 tmp0 = tmps[vi[j]]; 2422 sum1 -= v1[j] *tmp0; 2423 sum2 -= v2[j] *tmp0; 2424 sum3 -= v3[j] *tmp0; 2425 sum4 -= v4[j] *tmp0; 2426 } 2427 sum2 -= v2[nz] * sum1; 2428 sum3 -= v3[nz] * sum1; 2429 sum4 -= v4[nz] * sum1; 2430 sum3 -= v3[nz+1] * sum2; 2431 sum4 -= v4[nz+1] * sum2; 2432 sum4 -= v4[nz+2] * sum3; 2433 2434 tmp[row++]=sum1; 2435 tmp[row++]=sum2; 2436 tmp[row++]=sum3; 2437 tmp[row++]=sum4; 2438 break; 2439 case 5: 2440 sum1 = b[r[row]]; 2441 sum2 = b[r[row+1]]; 2442 sum3 = b[r[row+2]]; 2443 sum4 = b[r[row+3]]; 2444 sum5 = b[r[row+4]]; 2445 v2 = aa + ai[row+1]; 2446 v3 = aa + ai[row+2]; 2447 v4 = aa + ai[row+3]; 2448 v5 = aa + ai[row+4]; 2449 2450 for (j=0; j<nz-1; j+=2) { 2451 i0 = vi[j]; 2452 i1 = vi[j+1]; 2453 tmp0 = tmps[i0]; 2454 tmp1 = tmps[i1]; 2455 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2456 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2457 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2458 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2459 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2460 } 2461 if (j == nz-1) { 2462 tmp0 = tmps[vi[j]]; 2463 sum1 -= v1[j] *tmp0; 2464 sum2 -= v2[j] *tmp0; 2465 sum3 -= v3[j] *tmp0; 2466 sum4 -= v4[j] *tmp0; 2467 sum5 -= v5[j] *tmp0; 2468 } 2469 2470 sum2 -= v2[nz] * sum1; 2471 sum3 -= v3[nz] * sum1; 2472 sum4 -= v4[nz] * sum1; 2473 sum5 -= v5[nz] * sum1; 2474 sum3 -= v3[nz+1] * sum2; 2475 sum4 -= v4[nz+1] * sum2; 2476 sum5 -= v5[nz+1] * sum2; 2477 sum4 -= v4[nz+2] * sum3; 2478 sum5 -= v5[nz+2] * sum3; 2479 sum5 -= v5[nz+3] * sum4; 2480 2481 tmp[row++]=sum1; 2482 tmp[row++]=sum2; 2483 tmp[row++]=sum3; 2484 tmp[row++]=sum4; 2485 tmp[row++]=sum5; 2486 break; 2487 default: 2488 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2489 } 2490 } 2491 /* backward solve the upper triangular */ 2492 for (i=node_max -1,row = n-1; i>=0; i--) { 2493 nsz = ns[i]; 2494 aii = ad[row+1] + 1; 2495 v1 = aa + aii; 2496 vi = aj + aii; 2497 nz = ad[row]- ad[row+1] - 1; 2498 2499 if (i > 0) { 2500 /* Prefetch the indices for the next block */ 2501 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2502 /* Prefetch the data for the next block */ 2503 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2504 } 2505 2506 switch (nsz) { /* Each loop in 'case' is unrolled */ 2507 case 1: 2508 sum1 = tmp[row]; 2509 2510 for (j=0; j<nz-1; j+=2) { 2511 i0 = vi[j]; 2512 i1 = vi[j+1]; 2513 tmp0 = tmps[i0]; 2514 tmp1 = tmps[i1]; 2515 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2516 } 2517 if (j == nz-1) { 2518 tmp0 = tmps[vi[j]]; 2519 sum1 -= v1[j]*tmp0; 2520 } 2521 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2522 break; 2523 case 2: 2524 sum1 = tmp[row]; 2525 sum2 = tmp[row-1]; 2526 v2 = aa + ad[row] + 1; 2527 for (j=0; j<nz-1; j+=2) { 2528 i0 = vi[j]; 2529 i1 = vi[j+1]; 2530 tmp0 = tmps[i0]; 2531 tmp1 = tmps[i1]; 2532 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2533 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2534 } 2535 if (j == nz-1) { 2536 tmp0 = tmps[vi[j]]; 2537 sum1 -= v1[j]* tmp0; 2538 sum2 -= v2[j+1]* tmp0; 2539 } 2540 2541 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2542 sum2 -= v2[0] * tmp0; 2543 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2544 break; 2545 case 3: 2546 sum1 = tmp[row]; 2547 sum2 = tmp[row -1]; 2548 sum3 = tmp[row -2]; 2549 v2 = aa + ad[row] + 1; 2550 v3 = aa + ad[row -1] + 1; 2551 for (j=0; j<nz-1; j+=2) { 2552 i0 = vi[j]; 2553 i1 = vi[j+1]; 2554 tmp0 = tmps[i0]; 2555 tmp1 = tmps[i1]; 2556 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2557 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2558 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2559 } 2560 if (j== nz-1) { 2561 tmp0 = tmps[vi[j]]; 2562 sum1 -= v1[j] * tmp0; 2563 sum2 -= v2[j+1] * tmp0; 2564 sum3 -= v3[j+2] * tmp0; 2565 } 2566 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2567 sum2 -= v2[0]* tmp0; 2568 sum3 -= v3[1] * tmp0; 2569 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2570 sum3 -= v3[0]* tmp0; 2571 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2572 2573 break; 2574 case 4: 2575 sum1 = tmp[row]; 2576 sum2 = tmp[row -1]; 2577 sum3 = tmp[row -2]; 2578 sum4 = tmp[row -3]; 2579 v2 = aa + ad[row]+1; 2580 v3 = aa + ad[row -1]+1; 2581 v4 = aa + ad[row -2]+1; 2582 2583 for (j=0; j<nz-1; j+=2) { 2584 i0 = vi[j]; 2585 i1 = vi[j+1]; 2586 tmp0 = tmps[i0]; 2587 tmp1 = tmps[i1]; 2588 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2589 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2590 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2591 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2592 } 2593 if (j== nz-1) { 2594 tmp0 = tmps[vi[j]]; 2595 sum1 -= v1[j] * tmp0; 2596 sum2 -= v2[j+1] * tmp0; 2597 sum3 -= v3[j+2] * tmp0; 2598 sum4 -= v4[j+3] * tmp0; 2599 } 2600 2601 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2602 sum2 -= v2[0] * tmp0; 2603 sum3 -= v3[1] * tmp0; 2604 sum4 -= v4[2] * tmp0; 2605 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2606 sum3 -= v3[0] * tmp0; 2607 sum4 -= v4[1] * tmp0; 2608 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2609 sum4 -= v4[0] * tmp0; 2610 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2611 break; 2612 case 5: 2613 sum1 = tmp[row]; 2614 sum2 = tmp[row -1]; 2615 sum3 = tmp[row -2]; 2616 sum4 = tmp[row -3]; 2617 sum5 = tmp[row -4]; 2618 v2 = aa + ad[row]+1; 2619 v3 = aa + ad[row -1]+1; 2620 v4 = aa + ad[row -2]+1; 2621 v5 = aa + ad[row -3]+1; 2622 for (j=0; j<nz-1; j+=2) { 2623 i0 = vi[j]; 2624 i1 = vi[j+1]; 2625 tmp0 = tmps[i0]; 2626 tmp1 = tmps[i1]; 2627 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2628 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2629 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2630 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2631 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2632 } 2633 if (j==nz-1) { 2634 tmp0 = tmps[vi[j]]; 2635 sum1 -= v1[j] * tmp0; 2636 sum2 -= v2[j+1] * tmp0; 2637 sum3 -= v3[j+2] * tmp0; 2638 sum4 -= v4[j+3] * tmp0; 2639 sum5 -= v5[j+4] * tmp0; 2640 } 2641 2642 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2643 sum2 -= v2[0] * tmp0; 2644 sum3 -= v3[1] * tmp0; 2645 sum4 -= v4[2] * tmp0; 2646 sum5 -= v5[3] * tmp0; 2647 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2648 sum3 -= v3[0] * tmp0; 2649 sum4 -= v4[1] * tmp0; 2650 sum5 -= v5[2] * tmp0; 2651 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2652 sum4 -= v4[0] * tmp0; 2653 sum5 -= v5[1] * tmp0; 2654 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2655 sum5 -= v5[0] * tmp0; 2656 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2657 break; 2658 default: 2659 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2660 } 2661 } 2662 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2663 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2664 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2665 ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr); 2666 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2667 PetscFunctionReturn(0); 2668 } 2669 2670 /* 2671 Makes a longer coloring[] array and calls the usual code with that 2672 */ 2673 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2674 { 2675 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2676 PetscErrorCode ierr; 2677 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2678 PetscInt *colorused,i; 2679 ISColoringValue *newcolor; 2680 2681 PetscFunctionBegin; 2682 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2683 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2684 /* loop over inodes, marking a color for each column*/ 2685 row = 0; 2686 for (i=0; i<m; i++) { 2687 for (j=0; j<ns[i]; j++) { 2688 newcolor[row++] = coloring[i] + j*ncolors; 2689 } 2690 } 2691 2692 /* eliminate unneeded colors */ 2693 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2694 for (i=0; i<n; i++) { 2695 colorused[newcolor[i]] = 1; 2696 } 2697 2698 for (i=1; i<5*ncolors; i++) { 2699 colorused[i] += colorused[i-1]; 2700 } 2701 ncolors = colorused[5*ncolors-1]; 2702 for (i=0; i<n; i++) { 2703 newcolor[i] = colorused[newcolor[i]]-1; 2704 } 2705 ierr = PetscFree(colorused);CHKERRQ(ierr); 2706 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2707 ierr = PetscFree(coloring);CHKERRQ(ierr); 2708 PetscFunctionReturn(0); 2709 } 2710 2711 #include <petsc/private/kernels/blockinvert.h> 2712 2713 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2714 { 2715 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2716 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2717 MatScalar *ibdiag,*bdiag,work[25],*t; 2718 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2719 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2720 const PetscScalar *xb, *b; 2721 PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0; 2722 PetscErrorCode ierr; 2723 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2724 PetscInt sz,k,ipvt[5]; 2725 PetscBool allowzeropivot,zeropivotdetected; 2726 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2727 2728 PetscFunctionBegin; 2729 allowzeropivot = PetscNot(A->erroriffailure); 2730 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2731 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2732 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2733 2734 if (!a->inode.ibdiagvalid) { 2735 if (!a->inode.ibdiag) { 2736 /* calculate space needed for diagonal blocks */ 2737 for (i=0; i<m; i++) { 2738 cnt += sizes[i]*sizes[i]; 2739 } 2740 a->inode.bdiagsize = cnt; 2741 2742 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2743 } 2744 2745 /* copy over the diagonal blocks and invert them */ 2746 ibdiag = a->inode.ibdiag; 2747 bdiag = a->inode.bdiag; 2748 cnt = 0; 2749 for (i=0, row = 0; i<m; i++) { 2750 for (j=0; j<sizes[i]; j++) { 2751 for (k=0; k<sizes[i]; k++) { 2752 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2753 } 2754 } 2755 ierr = PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);CHKERRQ(ierr); 2756 2757 switch (sizes[i]) { 2758 case 1: 2759 /* Create matrix data structure */ 2760 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2761 if (allowzeropivot) { 2762 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2763 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2764 A->factorerror_zeropivot_row = row; 2765 ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr); 2766 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2767 } 2768 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2769 break; 2770 case 2: 2771 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2772 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2773 break; 2774 case 3: 2775 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2776 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2777 break; 2778 case 4: 2779 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2780 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2781 break; 2782 case 5: 2783 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2784 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2785 break; 2786 default: 2787 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2788 } 2789 cnt += sizes[i]*sizes[i]; 2790 row += sizes[i]; 2791 } 2792 a->inode.ibdiagvalid = PETSC_TRUE; 2793 } 2794 ibdiag = a->inode.ibdiag; 2795 bdiag = a->inode.bdiag; 2796 t = a->inode.ssor_work; 2797 2798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2799 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2800 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2801 if (flag & SOR_ZERO_INITIAL_GUESS) { 2802 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2803 2804 for (i=0, row=0; i<m; i++) { 2805 sz = diag[row] - ii[row]; 2806 v1 = a->a + ii[row]; 2807 idx = a->j + ii[row]; 2808 2809 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2810 switch (sizes[i]) { 2811 case 1: 2812 2813 sum1 = b[row]; 2814 for (n = 0; n<sz-1; n+=2) { 2815 i1 = idx[0]; 2816 i2 = idx[1]; 2817 idx += 2; 2818 tmp0 = x[i1]; 2819 tmp1 = x[i2]; 2820 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2821 } 2822 2823 if (n == sz-1) { 2824 tmp0 = x[*idx]; 2825 sum1 -= *v1 * tmp0; 2826 } 2827 t[row] = sum1; 2828 x[row++] = sum1*(*ibdiag++); 2829 break; 2830 case 2: 2831 v2 = a->a + ii[row+1]; 2832 sum1 = b[row]; 2833 sum2 = b[row+1]; 2834 for (n = 0; n<sz-1; n+=2) { 2835 i1 = idx[0]; 2836 i2 = idx[1]; 2837 idx += 2; 2838 tmp0 = x[i1]; 2839 tmp1 = x[i2]; 2840 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2841 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2842 } 2843 2844 if (n == sz-1) { 2845 tmp0 = x[*idx]; 2846 sum1 -= v1[0] * tmp0; 2847 sum2 -= v2[0] * tmp0; 2848 } 2849 t[row] = sum1; 2850 t[row+1] = sum2; 2851 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2852 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2853 ibdiag += 4; 2854 break; 2855 case 3: 2856 v2 = a->a + ii[row+1]; 2857 v3 = a->a + ii[row+2]; 2858 sum1 = b[row]; 2859 sum2 = b[row+1]; 2860 sum3 = b[row+2]; 2861 for (n = 0; n<sz-1; n+=2) { 2862 i1 = idx[0]; 2863 i2 = idx[1]; 2864 idx += 2; 2865 tmp0 = x[i1]; 2866 tmp1 = x[i2]; 2867 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2868 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2869 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2870 } 2871 2872 if (n == sz-1) { 2873 tmp0 = x[*idx]; 2874 sum1 -= v1[0] * tmp0; 2875 sum2 -= v2[0] * tmp0; 2876 sum3 -= v3[0] * tmp0; 2877 } 2878 t[row] = sum1; 2879 t[row+1] = sum2; 2880 t[row+2] = sum3; 2881 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2882 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2883 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2884 ibdiag += 9; 2885 break; 2886 case 4: 2887 v2 = a->a + ii[row+1]; 2888 v3 = a->a + ii[row+2]; 2889 v4 = a->a + ii[row+3]; 2890 sum1 = b[row]; 2891 sum2 = b[row+1]; 2892 sum3 = b[row+2]; 2893 sum4 = b[row+3]; 2894 for (n = 0; n<sz-1; n+=2) { 2895 i1 = idx[0]; 2896 i2 = idx[1]; 2897 idx += 2; 2898 tmp0 = x[i1]; 2899 tmp1 = x[i2]; 2900 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2901 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2902 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2903 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2904 } 2905 2906 if (n == sz-1) { 2907 tmp0 = x[*idx]; 2908 sum1 -= v1[0] * tmp0; 2909 sum2 -= v2[0] * tmp0; 2910 sum3 -= v3[0] * tmp0; 2911 sum4 -= v4[0] * tmp0; 2912 } 2913 t[row] = sum1; 2914 t[row+1] = sum2; 2915 t[row+2] = sum3; 2916 t[row+3] = sum4; 2917 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2918 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2919 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2920 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2921 ibdiag += 16; 2922 break; 2923 case 5: 2924 v2 = a->a + ii[row+1]; 2925 v3 = a->a + ii[row+2]; 2926 v4 = a->a + ii[row+3]; 2927 v5 = a->a + ii[row+4]; 2928 sum1 = b[row]; 2929 sum2 = b[row+1]; 2930 sum3 = b[row+2]; 2931 sum4 = b[row+3]; 2932 sum5 = b[row+4]; 2933 for (n = 0; n<sz-1; n+=2) { 2934 i1 = idx[0]; 2935 i2 = idx[1]; 2936 idx += 2; 2937 tmp0 = x[i1]; 2938 tmp1 = x[i2]; 2939 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2940 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2941 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2942 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2943 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2944 } 2945 2946 if (n == sz-1) { 2947 tmp0 = x[*idx]; 2948 sum1 -= v1[0] * tmp0; 2949 sum2 -= v2[0] * tmp0; 2950 sum3 -= v3[0] * tmp0; 2951 sum4 -= v4[0] * tmp0; 2952 sum5 -= v5[0] * tmp0; 2953 } 2954 t[row] = sum1; 2955 t[row+1] = sum2; 2956 t[row+2] = sum3; 2957 t[row+3] = sum4; 2958 t[row+4] = sum5; 2959 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2960 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2961 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2962 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2963 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2964 ibdiag += 25; 2965 break; 2966 default: 2967 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2968 } 2969 } 2970 2971 xb = t; 2972 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2973 } else xb = b; 2974 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 2975 2976 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2977 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2978 ibdiag -= sizes[i]*sizes[i]; 2979 sz = ii[row+1] - diag[row] - 1; 2980 v1 = a->a + diag[row] + 1; 2981 idx = a->j + diag[row] + 1; 2982 2983 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2984 switch (sizes[i]) { 2985 case 1: 2986 2987 sum1 = xb[row]; 2988 for (n = 0; n<sz-1; n+=2) { 2989 i1 = idx[0]; 2990 i2 = idx[1]; 2991 idx += 2; 2992 tmp0 = x[i1]; 2993 tmp1 = x[i2]; 2994 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2995 } 2996 2997 if (n == sz-1) { 2998 tmp0 = x[*idx]; 2999 sum1 -= *v1*tmp0; 3000 } 3001 x[row--] = sum1*(*ibdiag); 3002 break; 3003 3004 case 2: 3005 3006 sum1 = xb[row]; 3007 sum2 = xb[row-1]; 3008 /* note that sum1 is associated with the second of the two rows */ 3009 v2 = a->a + diag[row-1] + 2; 3010 for (n = 0; n<sz-1; n+=2) { 3011 i1 = idx[0]; 3012 i2 = idx[1]; 3013 idx += 2; 3014 tmp0 = x[i1]; 3015 tmp1 = x[i2]; 3016 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3017 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3018 } 3019 3020 if (n == sz-1) { 3021 tmp0 = x[*idx]; 3022 sum1 -= *v1*tmp0; 3023 sum2 -= *v2*tmp0; 3024 } 3025 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3026 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3027 break; 3028 case 3: 3029 3030 sum1 = xb[row]; 3031 sum2 = xb[row-1]; 3032 sum3 = xb[row-2]; 3033 v2 = a->a + diag[row-1] + 2; 3034 v3 = a->a + diag[row-2] + 3; 3035 for (n = 0; n<sz-1; n+=2) { 3036 i1 = idx[0]; 3037 i2 = idx[1]; 3038 idx += 2; 3039 tmp0 = x[i1]; 3040 tmp1 = x[i2]; 3041 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3042 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3043 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3044 } 3045 3046 if (n == sz-1) { 3047 tmp0 = x[*idx]; 3048 sum1 -= *v1*tmp0; 3049 sum2 -= *v2*tmp0; 3050 sum3 -= *v3*tmp0; 3051 } 3052 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3053 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3054 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3055 break; 3056 case 4: 3057 3058 sum1 = xb[row]; 3059 sum2 = xb[row-1]; 3060 sum3 = xb[row-2]; 3061 sum4 = xb[row-3]; 3062 v2 = a->a + diag[row-1] + 2; 3063 v3 = a->a + diag[row-2] + 3; 3064 v4 = a->a + diag[row-3] + 4; 3065 for (n = 0; n<sz-1; n+=2) { 3066 i1 = idx[0]; 3067 i2 = idx[1]; 3068 idx += 2; 3069 tmp0 = x[i1]; 3070 tmp1 = x[i2]; 3071 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3072 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3073 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3074 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3075 } 3076 3077 if (n == sz-1) { 3078 tmp0 = x[*idx]; 3079 sum1 -= *v1*tmp0; 3080 sum2 -= *v2*tmp0; 3081 sum3 -= *v3*tmp0; 3082 sum4 -= *v4*tmp0; 3083 } 3084 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3085 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3086 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3087 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3088 break; 3089 case 5: 3090 3091 sum1 = xb[row]; 3092 sum2 = xb[row-1]; 3093 sum3 = xb[row-2]; 3094 sum4 = xb[row-3]; 3095 sum5 = xb[row-4]; 3096 v2 = a->a + diag[row-1] + 2; 3097 v3 = a->a + diag[row-2] + 3; 3098 v4 = a->a + diag[row-3] + 4; 3099 v5 = a->a + diag[row-4] + 5; 3100 for (n = 0; n<sz-1; n+=2) { 3101 i1 = idx[0]; 3102 i2 = idx[1]; 3103 idx += 2; 3104 tmp0 = x[i1]; 3105 tmp1 = x[i2]; 3106 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3107 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3108 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3109 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3110 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3111 } 3112 3113 if (n == sz-1) { 3114 tmp0 = x[*idx]; 3115 sum1 -= *v1*tmp0; 3116 sum2 -= *v2*tmp0; 3117 sum3 -= *v3*tmp0; 3118 sum4 -= *v4*tmp0; 3119 sum5 -= *v5*tmp0; 3120 } 3121 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3122 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3123 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3124 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3125 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3126 break; 3127 default: 3128 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3129 } 3130 } 3131 3132 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3133 } 3134 its--; 3135 } 3136 while (its--) { 3137 3138 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3139 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3140 i<m; 3141 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3142 3143 sz = diag[row] - ii[row]; 3144 v1 = a->a + ii[row]; 3145 idx = a->j + ii[row]; 3146 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3147 switch (sizes[i]) { 3148 case 1: 3149 sum1 = b[row]; 3150 for (n = 0; n<sz-1; n+=2) { 3151 i1 = idx[0]; 3152 i2 = idx[1]; 3153 idx += 2; 3154 tmp0 = x[i1]; 3155 tmp1 = x[i2]; 3156 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3157 } 3158 if (n == sz-1) { 3159 tmp0 = x[*idx++]; 3160 sum1 -= *v1 * tmp0; 3161 v1++; 3162 } 3163 t[row] = sum1; 3164 sz = ii[row+1] - diag[row] - 1; 3165 idx = a->j + diag[row] + 1; 3166 v1 += 1; 3167 for (n = 0; n<sz-1; n+=2) { 3168 i1 = idx[0]; 3169 i2 = idx[1]; 3170 idx += 2; 3171 tmp0 = x[i1]; 3172 tmp1 = x[i2]; 3173 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3174 } 3175 if (n == sz-1) { 3176 tmp0 = x[*idx++]; 3177 sum1 -= *v1 * tmp0; 3178 } 3179 /* in MatSOR_SeqAIJ this line would be 3180 * 3181 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3182 * 3183 * but omega == 1, so this becomes 3184 * 3185 * x[row] = sum1*(*ibdiag++); 3186 * 3187 */ 3188 x[row] = sum1*(*ibdiag); 3189 break; 3190 case 2: 3191 v2 = a->a + ii[row+1]; 3192 sum1 = b[row]; 3193 sum2 = b[row+1]; 3194 for (n = 0; n<sz-1; n+=2) { 3195 i1 = idx[0]; 3196 i2 = idx[1]; 3197 idx += 2; 3198 tmp0 = x[i1]; 3199 tmp1 = x[i2]; 3200 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3201 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3202 } 3203 if (n == sz-1) { 3204 tmp0 = x[*idx++]; 3205 sum1 -= v1[0] * tmp0; 3206 sum2 -= v2[0] * tmp0; 3207 v1++; v2++; 3208 } 3209 t[row] = sum1; 3210 t[row+1] = sum2; 3211 sz = ii[row+1] - diag[row] - 2; 3212 idx = a->j + diag[row] + 2; 3213 v1 += 2; 3214 v2 += 2; 3215 for (n = 0; n<sz-1; n+=2) { 3216 i1 = idx[0]; 3217 i2 = idx[1]; 3218 idx += 2; 3219 tmp0 = x[i1]; 3220 tmp1 = x[i2]; 3221 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3222 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3223 } 3224 if (n == sz-1) { 3225 tmp0 = x[*idx]; 3226 sum1 -= v1[0] * tmp0; 3227 sum2 -= v2[0] * tmp0; 3228 } 3229 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3230 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3231 break; 3232 case 3: 3233 v2 = a->a + ii[row+1]; 3234 v3 = a->a + ii[row+2]; 3235 sum1 = b[row]; 3236 sum2 = b[row+1]; 3237 sum3 = b[row+2]; 3238 for (n = 0; n<sz-1; n+=2) { 3239 i1 = idx[0]; 3240 i2 = idx[1]; 3241 idx += 2; 3242 tmp0 = x[i1]; 3243 tmp1 = x[i2]; 3244 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3245 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3246 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3247 } 3248 if (n == sz-1) { 3249 tmp0 = x[*idx++]; 3250 sum1 -= v1[0] * tmp0; 3251 sum2 -= v2[0] * tmp0; 3252 sum3 -= v3[0] * tmp0; 3253 v1++; v2++; v3++; 3254 } 3255 t[row] = sum1; 3256 t[row+1] = sum2; 3257 t[row+2] = sum3; 3258 sz = ii[row+1] - diag[row] - 3; 3259 idx = a->j + diag[row] + 3; 3260 v1 += 3; 3261 v2 += 3; 3262 v3 += 3; 3263 for (n = 0; n<sz-1; n+=2) { 3264 i1 = idx[0]; 3265 i2 = idx[1]; 3266 idx += 2; 3267 tmp0 = x[i1]; 3268 tmp1 = x[i2]; 3269 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3270 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3271 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3272 } 3273 if (n == sz-1) { 3274 tmp0 = x[*idx]; 3275 sum1 -= v1[0] * tmp0; 3276 sum2 -= v2[0] * tmp0; 3277 sum3 -= v3[0] * tmp0; 3278 } 3279 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3280 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3281 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3282 break; 3283 case 4: 3284 v2 = a->a + ii[row+1]; 3285 v3 = a->a + ii[row+2]; 3286 v4 = a->a + ii[row+3]; 3287 sum1 = b[row]; 3288 sum2 = b[row+1]; 3289 sum3 = b[row+2]; 3290 sum4 = b[row+3]; 3291 for (n = 0; n<sz-1; n+=2) { 3292 i1 = idx[0]; 3293 i2 = idx[1]; 3294 idx += 2; 3295 tmp0 = x[i1]; 3296 tmp1 = x[i2]; 3297 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3298 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3299 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3300 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3301 } 3302 if (n == sz-1) { 3303 tmp0 = x[*idx++]; 3304 sum1 -= v1[0] * tmp0; 3305 sum2 -= v2[0] * tmp0; 3306 sum3 -= v3[0] * tmp0; 3307 sum4 -= v4[0] * tmp0; 3308 v1++; v2++; v3++; v4++; 3309 } 3310 t[row] = sum1; 3311 t[row+1] = sum2; 3312 t[row+2] = sum3; 3313 t[row+3] = sum4; 3314 sz = ii[row+1] - diag[row] - 4; 3315 idx = a->j + diag[row] + 4; 3316 v1 += 4; 3317 v2 += 4; 3318 v3 += 4; 3319 v4 += 4; 3320 for (n = 0; n<sz-1; n+=2) { 3321 i1 = idx[0]; 3322 i2 = idx[1]; 3323 idx += 2; 3324 tmp0 = x[i1]; 3325 tmp1 = x[i2]; 3326 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3327 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3328 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3329 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3330 } 3331 if (n == sz-1) { 3332 tmp0 = x[*idx]; 3333 sum1 -= v1[0] * tmp0; 3334 sum2 -= v2[0] * tmp0; 3335 sum3 -= v3[0] * tmp0; 3336 sum4 -= v4[0] * tmp0; 3337 } 3338 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3339 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3340 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3341 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3342 break; 3343 case 5: 3344 v2 = a->a + ii[row+1]; 3345 v3 = a->a + ii[row+2]; 3346 v4 = a->a + ii[row+3]; 3347 v5 = a->a + ii[row+4]; 3348 sum1 = b[row]; 3349 sum2 = b[row+1]; 3350 sum3 = b[row+2]; 3351 sum4 = b[row+3]; 3352 sum5 = b[row+4]; 3353 for (n = 0; n<sz-1; n+=2) { 3354 i1 = idx[0]; 3355 i2 = idx[1]; 3356 idx += 2; 3357 tmp0 = x[i1]; 3358 tmp1 = x[i2]; 3359 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3360 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3361 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3362 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3363 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3364 } 3365 if (n == sz-1) { 3366 tmp0 = x[*idx++]; 3367 sum1 -= v1[0] * tmp0; 3368 sum2 -= v2[0] * tmp0; 3369 sum3 -= v3[0] * tmp0; 3370 sum4 -= v4[0] * tmp0; 3371 sum5 -= v5[0] * tmp0; 3372 v1++; v2++; v3++; v4++; v5++; 3373 } 3374 t[row] = sum1; 3375 t[row+1] = sum2; 3376 t[row+2] = sum3; 3377 t[row+3] = sum4; 3378 t[row+4] = sum5; 3379 sz = ii[row+1] - diag[row] - 5; 3380 idx = a->j + diag[row] + 5; 3381 v1 += 5; 3382 v2 += 5; 3383 v3 += 5; 3384 v4 += 5; 3385 v5 += 5; 3386 for (n = 0; n<sz-1; n+=2) { 3387 i1 = idx[0]; 3388 i2 = idx[1]; 3389 idx += 2; 3390 tmp0 = x[i1]; 3391 tmp1 = x[i2]; 3392 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3393 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3394 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3395 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3396 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3397 } 3398 if (n == sz-1) { 3399 tmp0 = x[*idx]; 3400 sum1 -= v1[0] * tmp0; 3401 sum2 -= v2[0] * tmp0; 3402 sum3 -= v3[0] * tmp0; 3403 sum4 -= v4[0] * tmp0; 3404 sum5 -= v5[0] * tmp0; 3405 } 3406 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3407 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3408 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3409 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3410 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3411 break; 3412 default: 3413 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3414 } 3415 } 3416 xb = t; 3417 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3418 } else xb = b; 3419 3420 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3421 3422 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3423 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3424 ibdiag -= sizes[i]*sizes[i]; 3425 3426 /* set RHS */ 3427 if (xb == b) { 3428 /* whole (old way) */ 3429 sz = ii[row+1] - ii[row]; 3430 idx = a->j + ii[row]; 3431 switch (sizes[i]) { 3432 case 5: 3433 v5 = a->a + ii[row-4]; 3434 case 4: /* fall through */ 3435 v4 = a->a + ii[row-3]; 3436 case 3: 3437 v3 = a->a + ii[row-2]; 3438 case 2: 3439 v2 = a->a + ii[row-1]; 3440 case 1: 3441 v1 = a->a + ii[row]; 3442 break; 3443 default: 3444 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3445 } 3446 } else { 3447 /* upper, no diag */ 3448 sz = ii[row+1] - diag[row] - 1; 3449 idx = a->j + diag[row] + 1; 3450 switch (sizes[i]) { 3451 case 5: 3452 v5 = a->a + diag[row-4] + 5; 3453 case 4: /* fall through */ 3454 v4 = a->a + diag[row-3] + 4; 3455 case 3: 3456 v3 = a->a + diag[row-2] + 3; 3457 case 2: 3458 v2 = a->a + diag[row-1] + 2; 3459 case 1: 3460 v1 = a->a + diag[row] + 1; 3461 } 3462 } 3463 /* set sum */ 3464 switch (sizes[i]) { 3465 case 5: 3466 sum5 = xb[row-4]; 3467 case 4: /* fall through */ 3468 sum4 = xb[row-3]; 3469 case 3: 3470 sum3 = xb[row-2]; 3471 case 2: 3472 sum2 = xb[row-1]; 3473 case 1: 3474 /* note that sum1 is associated with the last row */ 3475 sum1 = xb[row]; 3476 } 3477 /* do sums */ 3478 for (n = 0; n<sz-1; n+=2) { 3479 i1 = idx[0]; 3480 i2 = idx[1]; 3481 idx += 2; 3482 tmp0 = x[i1]; 3483 tmp1 = x[i2]; 3484 switch (sizes[i]) { 3485 case 5: 3486 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3487 case 4: /* fall through */ 3488 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3489 case 3: 3490 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3491 case 2: 3492 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3493 case 1: 3494 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3495 } 3496 } 3497 /* ragged edge */ 3498 if (n == sz-1) { 3499 tmp0 = x[*idx]; 3500 switch (sizes[i]) { 3501 case 5: 3502 sum5 -= *v5*tmp0; 3503 case 4: /* fall through */ 3504 sum4 -= *v4*tmp0; 3505 case 3: 3506 sum3 -= *v3*tmp0; 3507 case 2: 3508 sum2 -= *v2*tmp0; 3509 case 1: 3510 sum1 -= *v1*tmp0; 3511 } 3512 } 3513 /* update */ 3514 if (xb == b) { 3515 /* whole (old way) w/ diag */ 3516 switch (sizes[i]) { 3517 case 5: 3518 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3519 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3520 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3521 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3522 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3523 break; 3524 case 4: 3525 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3526 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3527 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3528 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3529 break; 3530 case 3: 3531 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3532 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3533 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3534 break; 3535 case 2: 3536 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3537 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3538 break; 3539 case 1: 3540 x[row--] += sum1*(*ibdiag); 3541 break; 3542 } 3543 } else { 3544 /* no diag so set = */ 3545 switch (sizes[i]) { 3546 case 5: 3547 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3548 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3549 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3550 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3551 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3552 break; 3553 case 4: 3554 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3555 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3556 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3557 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3558 break; 3559 case 3: 3560 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3561 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3562 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3563 break; 3564 case 2: 3565 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3566 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3567 break; 3568 case 1: 3569 x[row--] = sum1*(*ibdiag); 3570 break; 3571 } 3572 } 3573 } 3574 if (xb == b) { 3575 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3576 } else { 3577 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3578 } 3579 } 3580 } 3581 if (flag & SOR_EISENSTAT) { 3582 /* 3583 Apply (U + D)^-1 where D is now the block diagonal 3584 */ 3585 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3586 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3587 ibdiag -= sizes[i]*sizes[i]; 3588 sz = ii[row+1] - diag[row] - 1; 3589 v1 = a->a + diag[row] + 1; 3590 idx = a->j + diag[row] + 1; 3591 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3592 switch (sizes[i]) { 3593 case 1: 3594 3595 sum1 = b[row]; 3596 for (n = 0; n<sz-1; n+=2) { 3597 i1 = idx[0]; 3598 i2 = idx[1]; 3599 idx += 2; 3600 tmp0 = x[i1]; 3601 tmp1 = x[i2]; 3602 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3603 } 3604 3605 if (n == sz-1) { 3606 tmp0 = x[*idx]; 3607 sum1 -= *v1*tmp0; 3608 } 3609 x[row] = sum1*(*ibdiag);row--; 3610 break; 3611 3612 case 2: 3613 3614 sum1 = b[row]; 3615 sum2 = b[row-1]; 3616 /* note that sum1 is associated with the second of the two rows */ 3617 v2 = a->a + diag[row-1] + 2; 3618 for (n = 0; n<sz-1; n+=2) { 3619 i1 = idx[0]; 3620 i2 = idx[1]; 3621 idx += 2; 3622 tmp0 = x[i1]; 3623 tmp1 = x[i2]; 3624 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3625 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3626 } 3627 3628 if (n == sz-1) { 3629 tmp0 = x[*idx]; 3630 sum1 -= *v1*tmp0; 3631 sum2 -= *v2*tmp0; 3632 } 3633 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3634 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3635 row -= 2; 3636 break; 3637 case 3: 3638 3639 sum1 = b[row]; 3640 sum2 = b[row-1]; 3641 sum3 = b[row-2]; 3642 v2 = a->a + diag[row-1] + 2; 3643 v3 = a->a + diag[row-2] + 3; 3644 for (n = 0; n<sz-1; n+=2) { 3645 i1 = idx[0]; 3646 i2 = idx[1]; 3647 idx += 2; 3648 tmp0 = x[i1]; 3649 tmp1 = x[i2]; 3650 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3651 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3652 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3653 } 3654 3655 if (n == sz-1) { 3656 tmp0 = x[*idx]; 3657 sum1 -= *v1*tmp0; 3658 sum2 -= *v2*tmp0; 3659 sum3 -= *v3*tmp0; 3660 } 3661 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3662 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3663 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3664 row -= 3; 3665 break; 3666 case 4: 3667 3668 sum1 = b[row]; 3669 sum2 = b[row-1]; 3670 sum3 = b[row-2]; 3671 sum4 = b[row-3]; 3672 v2 = a->a + diag[row-1] + 2; 3673 v3 = a->a + diag[row-2] + 3; 3674 v4 = a->a + diag[row-3] + 4; 3675 for (n = 0; n<sz-1; n+=2) { 3676 i1 = idx[0]; 3677 i2 = idx[1]; 3678 idx += 2; 3679 tmp0 = x[i1]; 3680 tmp1 = x[i2]; 3681 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3682 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3683 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3684 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3685 } 3686 3687 if (n == sz-1) { 3688 tmp0 = x[*idx]; 3689 sum1 -= *v1*tmp0; 3690 sum2 -= *v2*tmp0; 3691 sum3 -= *v3*tmp0; 3692 sum4 -= *v4*tmp0; 3693 } 3694 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3695 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3696 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3697 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3698 row -= 4; 3699 break; 3700 case 5: 3701 3702 sum1 = b[row]; 3703 sum2 = b[row-1]; 3704 sum3 = b[row-2]; 3705 sum4 = b[row-3]; 3706 sum5 = b[row-4]; 3707 v2 = a->a + diag[row-1] + 2; 3708 v3 = a->a + diag[row-2] + 3; 3709 v4 = a->a + diag[row-3] + 4; 3710 v5 = a->a + diag[row-4] + 5; 3711 for (n = 0; n<sz-1; n+=2) { 3712 i1 = idx[0]; 3713 i2 = idx[1]; 3714 idx += 2; 3715 tmp0 = x[i1]; 3716 tmp1 = x[i2]; 3717 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3718 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3719 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3720 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3721 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3722 } 3723 3724 if (n == sz-1) { 3725 tmp0 = x[*idx]; 3726 sum1 -= *v1*tmp0; 3727 sum2 -= *v2*tmp0; 3728 sum3 -= *v3*tmp0; 3729 sum4 -= *v4*tmp0; 3730 sum5 -= *v5*tmp0; 3731 } 3732 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3733 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3734 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3735 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3736 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3737 row -= 5; 3738 break; 3739 default: 3740 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3741 } 3742 } 3743 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3744 3745 /* 3746 t = b - D x where D is the block diagonal 3747 */ 3748 cnt = 0; 3749 for (i=0, row=0; i<m; i++) { 3750 switch (sizes[i]) { 3751 case 1: 3752 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3753 break; 3754 case 2: 3755 x1 = x[row]; x2 = x[row+1]; 3756 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3757 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3758 t[row] = b[row] - tmp1; 3759 t[row+1] = b[row+1] - tmp2; row += 2; 3760 cnt += 4; 3761 break; 3762 case 3: 3763 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3764 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3765 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3766 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3767 t[row] = b[row] - tmp1; 3768 t[row+1] = b[row+1] - tmp2; 3769 t[row+2] = b[row+2] - tmp3; row += 3; 3770 cnt += 9; 3771 break; 3772 case 4: 3773 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3774 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3775 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3776 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3777 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3778 t[row] = b[row] - tmp1; 3779 t[row+1] = b[row+1] - tmp2; 3780 t[row+2] = b[row+2] - tmp3; 3781 t[row+3] = b[row+3] - tmp4; row += 4; 3782 cnt += 16; 3783 break; 3784 case 5: 3785 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3786 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3787 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3788 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3789 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3790 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3791 t[row] = b[row] - tmp1; 3792 t[row+1] = b[row+1] - tmp2; 3793 t[row+2] = b[row+2] - tmp3; 3794 t[row+3] = b[row+3] - tmp4; 3795 t[row+4] = b[row+4] - tmp5;row += 5; 3796 cnt += 25; 3797 break; 3798 default: 3799 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3800 } 3801 } 3802 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3803 3804 /* 3805 Apply (L + D)^-1 where D is the block diagonal 3806 */ 3807 for (i=0, row=0; i<m; i++) { 3808 sz = diag[row] - ii[row]; 3809 v1 = a->a + ii[row]; 3810 idx = a->j + ii[row]; 3811 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3812 switch (sizes[i]) { 3813 case 1: 3814 3815 sum1 = t[row]; 3816 for (n = 0; n<sz-1; n+=2) { 3817 i1 = idx[0]; 3818 i2 = idx[1]; 3819 idx += 2; 3820 tmp0 = t[i1]; 3821 tmp1 = t[i2]; 3822 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3823 } 3824 3825 if (n == sz-1) { 3826 tmp0 = t[*idx]; 3827 sum1 -= *v1 * tmp0; 3828 } 3829 x[row] += t[row] = sum1*(*ibdiag++); row++; 3830 break; 3831 case 2: 3832 v2 = a->a + ii[row+1]; 3833 sum1 = t[row]; 3834 sum2 = t[row+1]; 3835 for (n = 0; n<sz-1; n+=2) { 3836 i1 = idx[0]; 3837 i2 = idx[1]; 3838 idx += 2; 3839 tmp0 = t[i1]; 3840 tmp1 = t[i2]; 3841 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3842 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3843 } 3844 3845 if (n == sz-1) { 3846 tmp0 = t[*idx]; 3847 sum1 -= v1[0] * tmp0; 3848 sum2 -= v2[0] * tmp0; 3849 } 3850 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3851 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3852 ibdiag += 4; row += 2; 3853 break; 3854 case 3: 3855 v2 = a->a + ii[row+1]; 3856 v3 = a->a + ii[row+2]; 3857 sum1 = t[row]; 3858 sum2 = t[row+1]; 3859 sum3 = t[row+2]; 3860 for (n = 0; n<sz-1; n+=2) { 3861 i1 = idx[0]; 3862 i2 = idx[1]; 3863 idx += 2; 3864 tmp0 = t[i1]; 3865 tmp1 = t[i2]; 3866 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3867 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3868 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3869 } 3870 3871 if (n == sz-1) { 3872 tmp0 = t[*idx]; 3873 sum1 -= v1[0] * tmp0; 3874 sum2 -= v2[0] * tmp0; 3875 sum3 -= v3[0] * tmp0; 3876 } 3877 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3878 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3879 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3880 ibdiag += 9; row += 3; 3881 break; 3882 case 4: 3883 v2 = a->a + ii[row+1]; 3884 v3 = a->a + ii[row+2]; 3885 v4 = a->a + ii[row+3]; 3886 sum1 = t[row]; 3887 sum2 = t[row+1]; 3888 sum3 = t[row+2]; 3889 sum4 = t[row+3]; 3890 for (n = 0; n<sz-1; n+=2) { 3891 i1 = idx[0]; 3892 i2 = idx[1]; 3893 idx += 2; 3894 tmp0 = t[i1]; 3895 tmp1 = t[i2]; 3896 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3897 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3898 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3899 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3900 } 3901 3902 if (n == sz-1) { 3903 tmp0 = t[*idx]; 3904 sum1 -= v1[0] * tmp0; 3905 sum2 -= v2[0] * tmp0; 3906 sum3 -= v3[0] * tmp0; 3907 sum4 -= v4[0] * tmp0; 3908 } 3909 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3910 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3911 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3912 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3913 ibdiag += 16; row += 4; 3914 break; 3915 case 5: 3916 v2 = a->a + ii[row+1]; 3917 v3 = a->a + ii[row+2]; 3918 v4 = a->a + ii[row+3]; 3919 v5 = a->a + ii[row+4]; 3920 sum1 = t[row]; 3921 sum2 = t[row+1]; 3922 sum3 = t[row+2]; 3923 sum4 = t[row+3]; 3924 sum5 = t[row+4]; 3925 for (n = 0; n<sz-1; n+=2) { 3926 i1 = idx[0]; 3927 i2 = idx[1]; 3928 idx += 2; 3929 tmp0 = t[i1]; 3930 tmp1 = t[i2]; 3931 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3932 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3933 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3934 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3935 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3936 } 3937 3938 if (n == sz-1) { 3939 tmp0 = t[*idx]; 3940 sum1 -= v1[0] * tmp0; 3941 sum2 -= v2[0] * tmp0; 3942 sum3 -= v3[0] * tmp0; 3943 sum4 -= v4[0] * tmp0; 3944 sum5 -= v5[0] * tmp0; 3945 } 3946 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3947 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3948 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3949 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3950 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3951 ibdiag += 25; row += 5; 3952 break; 3953 default: 3954 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3955 } 3956 } 3957 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3958 } 3959 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3960 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3961 PetscFunctionReturn(0); 3962 } 3963 3964 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3965 { 3966 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3967 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3968 const MatScalar *bdiag = a->inode.bdiag; 3969 const PetscScalar *b; 3970 PetscErrorCode ierr; 3971 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3972 const PetscInt *sizes = a->inode.size; 3973 3974 PetscFunctionBegin; 3975 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 3976 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3977 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3978 cnt = 0; 3979 for (i=0, row=0; i<m; i++) { 3980 switch (sizes[i]) { 3981 case 1: 3982 x[row] = b[row]*bdiag[cnt++];row++; 3983 break; 3984 case 2: 3985 x1 = b[row]; x2 = b[row+1]; 3986 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3987 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3988 x[row++] = tmp1; 3989 x[row++] = tmp2; 3990 cnt += 4; 3991 break; 3992 case 3: 3993 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3994 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3995 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3996 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3997 x[row++] = tmp1; 3998 x[row++] = tmp2; 3999 x[row++] = tmp3; 4000 cnt += 9; 4001 break; 4002 case 4: 4003 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4004 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4005 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4006 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4007 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4008 x[row++] = tmp1; 4009 x[row++] = tmp2; 4010 x[row++] = tmp3; 4011 x[row++] = tmp4; 4012 cnt += 16; 4013 break; 4014 case 5: 4015 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4016 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4017 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4018 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4019 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4020 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4021 x[row++] = tmp1; 4022 x[row++] = tmp2; 4023 x[row++] = tmp3; 4024 x[row++] = tmp4; 4025 x[row++] = tmp5; 4026 cnt += 25; 4027 break; 4028 default: 4029 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4030 } 4031 } 4032 ierr = PetscLogFlops(2.0*cnt);CHKERRQ(ierr); 4033 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4034 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4035 PetscFunctionReturn(0); 4036 } 4037 4038 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4039 { 4040 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4041 4042 PetscFunctionBegin; 4043 a->inode.node_count = 0; 4044 a->inode.use = PETSC_FALSE; 4045 a->inode.checked = PETSC_FALSE; 4046 a->inode.mat_nonzerostate = -1; 4047 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4048 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4049 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4050 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4051 A->ops->coloringpatch = NULL; 4052 A->ops->multdiagonalblock = NULL; 4053 if (A->factortype) { 4054 A->ops->solve = MatSolve_SeqAIJ_inplace; 4055 } 4056 PetscFunctionReturn(0); 4057 } 4058 4059 /* 4060 samestructure indicates that the matrix has not changed its nonzero structure so we 4061 do not need to recompute the inodes 4062 */ 4063 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4064 { 4065 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4066 PetscErrorCode ierr; 4067 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4068 PetscBool flag; 4069 const PetscInt *idx,*idy,*ii; 4070 4071 PetscFunctionBegin; 4072 if (!a->inode.use) { 4073 ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr); 4074 ierr = PetscFree(a->inode.size);CHKERRQ(ierr); 4075 PetscFunctionReturn(0); 4076 } 4077 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4078 4079 m = A->rmap->n; 4080 if (!a->inode.size) { ierr = PetscMalloc1(m+1,&a->inode.size);CHKERRQ(ierr); } 4081 ns = a->inode.size; 4082 4083 i = 0; 4084 node_count = 0; 4085 idx = a->j; 4086 ii = a->i; 4087 while (i < m) { /* For each row */ 4088 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4089 /* Limits the number of elements in a node to 'a->inode.limit' */ 4090 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4091 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4092 if (nzy != nzx) break; 4093 idy += nzx; /* Same nonzero pattern */ 4094 ierr = PetscArraycmp(idx,idy,nzx,&flag);CHKERRQ(ierr); 4095 if (!flag) break; 4096 } 4097 ns[node_count++] = blk_size; 4098 idx += blk_size*nzx; 4099 i = j; 4100 } 4101 4102 /* If not enough inodes found,, do not use inode version of the routines */ 4103 if (!m || node_count > .8*m) { 4104 ierr = MatSeqAIJ_Inode_ResetOps(A);CHKERRQ(ierr); 4105 ierr = PetscFree(a->inode.size);CHKERRQ(ierr); 4106 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4107 } else { 4108 if (!A->factortype) { 4109 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4110 if (A->rmap->n == A->cmap->n) { 4111 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4112 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4113 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4114 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4115 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4116 } 4117 } else { 4118 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4119 } 4120 a->inode.node_count = node_count; 4121 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4122 } 4123 a->inode.checked = PETSC_TRUE; 4124 a->inode.mat_nonzerostate = A->nonzerostate; 4125 PetscFunctionReturn(0); 4126 } 4127 4128 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4129 { 4130 Mat B =*C; 4131 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4132 PetscErrorCode ierr; 4133 PetscInt m=A->rmap->n; 4134 4135 PetscFunctionBegin; 4136 c->inode.use = a->inode.use; 4137 c->inode.limit = a->inode.limit; 4138 c->inode.max_limit = a->inode.max_limit; 4139 c->inode.checked = PETSC_FALSE; 4140 c->inode.size = NULL; 4141 c->inode.node_count = 0; 4142 c->inode.ibdiagvalid = PETSC_FALSE; 4143 c->inode.ibdiag = NULL; 4144 c->inode.bdiag = NULL; 4145 c->inode.mat_nonzerostate = -1; 4146 if (a->inode.use) { 4147 if (a->inode.checked && a->inode.size) { 4148 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4149 ierr = PetscArraycpy(c->inode.size,a->inode.size,m+1);CHKERRQ(ierr); 4150 4151 c->inode.checked = PETSC_TRUE; 4152 c->inode.node_count = a->inode.node_count; 4153 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4154 } 4155 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4156 if (!B->factortype) { 4157 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4158 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4159 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4160 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4161 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4162 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4163 } else { 4164 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4165 } 4166 } 4167 PetscFunctionReturn(0); 4168 } 4169 4170 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4171 { 4172 PetscInt k; 4173 const PetscInt *vi; 4174 4175 PetscFunctionBegin; 4176 vi = aj + ai[row]; 4177 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4178 vi = aj + adiag[row]; 4179 cols[nzl] = vi[0]; 4180 vi = aj + adiag[row+1]+1; 4181 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4182 PetscFunctionReturn(0); 4183 } 4184 /* 4185 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4186 Modified from MatSeqAIJCheckInode(). 4187 4188 Input Parameters: 4189 . Mat A - ILU or LU matrix factor 4190 4191 */ 4192 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4193 { 4194 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4195 PetscErrorCode ierr; 4196 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4197 PetscInt *cols1,*cols2,*ns; 4198 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4199 PetscBool flag; 4200 4201 PetscFunctionBegin; 4202 if (!a->inode.use) PetscFunctionReturn(0); 4203 if (a->inode.checked) PetscFunctionReturn(0); 4204 4205 m = A->rmap->n; 4206 if (a->inode.size) ns = a->inode.size; 4207 else { 4208 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4209 } 4210 4211 i = 0; 4212 node_count = 0; 4213 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4214 while (i < m) { /* For each row */ 4215 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4216 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4217 nzx = nzl1 + nzu1 + 1; 4218 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4219 4220 /* Limits the number of elements in a node to 'a->inode.limit' */ 4221 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4222 nzl2 = ai[j+1] - ai[j]; 4223 nzu2 = adiag[j] - adiag[j+1] - 1; 4224 nzy = nzl2 + nzu2 + 1; 4225 if (nzy != nzx) break; 4226 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4227 ierr = PetscArraycmp(cols1,cols2,nzx,&flag);CHKERRQ(ierr); 4228 if (!flag) break; 4229 } 4230 ns[node_count++] = blk_size; 4231 i = j; 4232 } 4233 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4234 /* If not enough inodes found,, do not use inode version of the routines */ 4235 if (!m || node_count > .8*m) { 4236 ierr = PetscFree(ns);CHKERRQ(ierr); 4237 4238 a->inode.node_count = 0; 4239 a->inode.size = NULL; 4240 a->inode.use = PETSC_FALSE; 4241 4242 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4243 } else { 4244 A->ops->mult = NULL; 4245 A->ops->sor = NULL; 4246 A->ops->multadd = NULL; 4247 A->ops->getrowij = NULL; 4248 A->ops->restorerowij = NULL; 4249 A->ops->getcolumnij = NULL; 4250 A->ops->restorecolumnij = NULL; 4251 A->ops->coloringpatch = NULL; 4252 A->ops->multdiagonalblock = NULL; 4253 a->inode.node_count = node_count; 4254 a->inode.size = ns; 4255 4256 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4257 } 4258 a->inode.checked = PETSC_TRUE; 4259 PetscFunctionReturn(0); 4260 } 4261 4262 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4263 { 4264 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4265 4266 PetscFunctionBegin; 4267 a->inode.ibdiagvalid = PETSC_FALSE; 4268 PetscFunctionReturn(0); 4269 } 4270 4271 /* 4272 This is really ugly. if inodes are used this replaces the 4273 permutations with ones that correspond to rows/cols of the matrix 4274 rather then inode blocks 4275 */ 4276 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4277 { 4278 PetscErrorCode ierr; 4279 4280 PetscFunctionBegin; 4281 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4282 PetscFunctionReturn(0); 4283 } 4284 4285 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4286 { 4287 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4288 PetscErrorCode ierr; 4289 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4290 const PetscInt *ridx,*cidx; 4291 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4292 PetscInt nslim_col,*ns_col; 4293 IS ris = *rperm,cis = *cperm; 4294 4295 PetscFunctionBegin; 4296 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4297 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4298 4299 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4300 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4301 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4302 4303 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4304 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4305 4306 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4307 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4308 4309 /* Construct the permutations for rows*/ 4310 for (i=0,row = 0; i<nslim_row; ++i) { 4311 indx = ridx[i]; 4312 start_val = tns[indx]; 4313 end_val = tns[indx + 1]; 4314 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4315 } 4316 4317 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4318 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4319 4320 /* Construct permutations for columns */ 4321 for (i=0,col=0; i<nslim_col; ++i) { 4322 indx = cidx[i]; 4323 start_val = tns[indx]; 4324 end_val = tns[indx + 1]; 4325 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4326 } 4327 4328 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4329 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4330 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4331 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4332 4333 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4334 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4335 4336 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4337 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4338 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4339 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4340 ierr = PetscFree(tns);CHKERRQ(ierr); 4341 PetscFunctionReturn(0); 4342 } 4343 4344 /*@C 4345 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4346 4347 Not Collective 4348 4349 Input Parameter: 4350 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4351 4352 Output Parameter: 4353 + node_count - no of inodes present in the matrix. 4354 . sizes - an array of size node_count,with sizes of each inode. 4355 - limit - the max size used to generate the inodes. 4356 4357 Level: advanced 4358 4359 Notes: 4360 This routine returns some internal storage information 4361 of the matrix, it is intended to be used by advanced users. 4362 It should be called after the matrix is assembled. 4363 The contents of the sizes[] array should not be changed. 4364 NULL may be passed for information not requested. 4365 4366 .seealso: MatGetInfo() 4367 @*/ 4368 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4369 { 4370 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4371 4372 PetscFunctionBegin; 4373 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4374 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4375 if (f) { 4376 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4377 } 4378 PetscFunctionReturn(0); 4379 } 4380 4381 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4382 { 4383 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4384 4385 PetscFunctionBegin; 4386 if (node_count) *node_count = a->inode.node_count; 4387 if (sizes) *sizes = a->inode.size; 4388 if (limit) *limit = a->inode.limit; 4389 PetscFunctionReturn(0); 4390 } 4391