1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns) 9 { 10 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 11 PetscErrorCode ierr; 12 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 13 14 PetscFunctionBegin; 15 n = A->cmap->n; 16 m = A->rmap->n; 17 ns_row = a->inode.size; 18 19 min_mn = (m < n) ? m : n; 20 if (!ns) { 21 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 22 for (; count+1 < n; count++,i++) ; 23 if (count < n) { 24 i++; 25 } 26 *size = i; 27 PetscFunctionReturn(0); 28 } 29 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 30 31 /* Use the same row structure wherever feasible. */ 32 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 33 ns_col[i] = ns_row[i]; 34 } 35 36 /* if m < n; pad up the remainder with inode_limit */ 37 for (; count+1 < n; count++,i++) { 38 ns_col[i] = 1; 39 } 40 /* The last node is the odd ball. padd it up with the remaining rows; */ 41 if (count < n) { 42 ns_col[i] = n - count; 43 i++; 44 } else if (count > n) { 45 /* Adjust for the over estimation */ 46 ns_col[i-1] += n - count; 47 } 48 *size = i; 49 *ns = ns_col; 50 PetscFunctionReturn(0); 51 } 52 53 54 /* 55 This builds symmetric version of nonzero structure, 56 */ 57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 58 { 59 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 60 PetscErrorCode ierr; 61 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 62 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 63 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 64 65 PetscFunctionBegin; 66 nslim_row = a->inode.node_count; 67 m = A->rmap->n; 68 n = A->cmap->n; 69 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 70 71 /* Use the row_inode as column_inode */ 72 nslim_col = nslim_row; 73 ns_col = ns_row; 74 75 /* allocate space for reformated inode structure */ 76 ierr = PetscMalloc2(nslim_col+1,&tns,n+1,&tvc);CHKERRQ(ierr); 77 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 78 79 for (i1=0,col=0; i1<nslim_col; ++i1) { 80 nsz = ns_col[i1]; 81 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 82 } 83 /* allocate space for row pointers */ 84 ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 85 *iia = ia; 86 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 87 88 /* determine the number of columns in each row */ 89 ia[0] = oshift; 90 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 91 92 j = aj + ai[row] + ishift; 93 jmax = aj + ai[row+1] + ishift; 94 if (j==jmax) continue; /* empty row */ 95 col = *j++ + ishift; 96 i2 = tvc[col]; 97 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 98 ia[i1+1]++; 99 ia[i2+1]++; 100 i2++; /* Start col of next node */ 101 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 102 i2 = tvc[col]; 103 } 104 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 105 } 106 107 /* shift ia[i] to point to next row */ 108 for (i1=1; i1<nslim_row+1; i1++) { 109 row = ia[i1-1]; 110 ia[i1] += row; 111 work[i1-1] = row - oshift; 112 } 113 114 /* allocate space for column pointers */ 115 nz = ia[nslim_row] + (!ishift); 116 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 117 *jja = ja; 118 119 /* loop over lower triangular part putting into ja */ 120 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 121 j = aj + ai[row] + ishift; 122 jmax = aj + ai[row+1] + ishift; 123 if (j==jmax) continue; /* empty row */ 124 col = *j++ + ishift; 125 i2 = tvc[col]; 126 while (i2<i1 && j<jmax) { 127 ja[work[i2]++] = i1 + oshift; 128 ja[work[i1]++] = i2 + oshift; 129 ++i2; 130 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 131 i2 = tvc[col]; 132 } 133 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 134 135 } 136 ierr = PetscFree(work);CHKERRQ(ierr); 137 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); 138 PetscFunctionReturn(0); 139 } 140 141 /* 142 This builds nonsymmetric version of nonzero structure, 143 */ 144 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 145 { 146 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 147 PetscErrorCode ierr; 148 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 149 PetscInt *tns,*tvc,nsz,i1,i2; 150 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 151 152 PetscFunctionBegin; 153 nslim_row = a->inode.node_count; 154 n = A->cmap->n; 155 156 /* Create The column_inode for this matrix */ 157 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 158 159 /* allocate space for reformated column_inode structure */ 160 ierr = PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc);CHKERRQ(ierr); 161 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 162 163 for (i1=0,col=0; i1<nslim_col; ++i1) { 164 nsz = ns_col[i1]; 165 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 166 } 167 /* allocate space for row pointers */ 168 ierr = PetscCalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 169 *iia = ia; 170 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 171 172 /* determine the number of columns in each row */ 173 ia[0] = oshift; 174 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 175 j = aj + ai[row] + ishift; 176 nz = ai[row+1] - ai[row]; 177 if (!nz) continue; /* empty row */ 178 col = *j++ + ishift; 179 i2 = tvc[col]; 180 while (nz-- > 0) { /* off-diagonal elemets */ 181 ia[i1+1]++; 182 i2++; /* Start col of next node */ 183 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 184 if (nz > 0) i2 = tvc[col]; 185 } 186 } 187 188 /* shift ia[i] to point to next row */ 189 for (i1=1; i1<nslim_row+1; i1++) { 190 row = ia[i1-1]; 191 ia[i1] += row; 192 work[i1-1] = row - oshift; 193 } 194 195 /* allocate space for column pointers */ 196 nz = ia[nslim_row] + (!ishift); 197 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 198 *jja = ja; 199 200 /* loop over matrix putting into ja */ 201 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 202 j = aj + ai[row] + ishift; 203 nz = ai[row+1] - ai[row]; 204 if (!nz) continue; /* empty row */ 205 col = *j++ + ishift; 206 i2 = tvc[col]; 207 while (nz-- > 0) { 208 ja[work[i1]++] = i2 + oshift; 209 ++i2; 210 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 211 if (nz > 0) i2 = tvc[col]; 212 } 213 } 214 ierr = PetscFree(ns_col);CHKERRQ(ierr); 215 ierr = PetscFree(work);CHKERRQ(ierr); 216 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); 217 PetscFunctionReturn(0); 218 } 219 220 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 221 { 222 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 223 PetscErrorCode ierr; 224 225 PetscFunctionBegin; 226 *n = a->inode.node_count; 227 if (!ia) PetscFunctionReturn(0); 228 if (!blockcompressed) { 229 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); 230 } else if (symmetric) { 231 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 232 } else { 233 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 234 } 235 PetscFunctionReturn(0); 236 } 237 238 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 239 { 240 PetscErrorCode ierr; 241 242 PetscFunctionBegin; 243 if (!ia) PetscFunctionReturn(0); 244 245 if (!blockcompressed) { 246 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); 247 } else { 248 ierr = PetscFree(*ia);CHKERRQ(ierr); 249 ierr = PetscFree(*ja);CHKERRQ(ierr); 250 } 251 PetscFunctionReturn(0); 252 } 253 254 /* ----------------------------------------------------------- */ 255 256 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 257 { 258 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 259 PetscErrorCode ierr; 260 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 261 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 262 263 PetscFunctionBegin; 264 nslim_row = a->inode.node_count; 265 n = A->cmap->n; 266 267 /* Create The column_inode for this matrix */ 268 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 269 270 /* allocate space for reformated column_inode structure */ 271 ierr = PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc);CHKERRQ(ierr); 272 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 273 274 for (i1=0,col=0; i1<nslim_col; ++i1) { 275 nsz = ns_col[i1]; 276 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 277 } 278 /* allocate space for column pointers */ 279 ierr = PetscCalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 280 *iia = ia; 281 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 282 283 /* determine the number of columns in each row */ 284 ia[0] = oshift; 285 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 286 j = aj + ai[row] + ishift; 287 col = *j++ + ishift; 288 i2 = tvc[col]; 289 nz = ai[row+1] - ai[row]; 290 while (nz-- > 0) { /* off-diagonal elemets */ 291 /* ia[i1+1]++; */ 292 ia[i2+1]++; 293 i2++; 294 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 295 if (nz > 0) i2 = tvc[col]; 296 } 297 } 298 299 /* shift ia[i] to point to next col */ 300 for (i1=1; i1<nslim_col+1; i1++) { 301 col = ia[i1-1]; 302 ia[i1] += col; 303 work[i1-1] = col - oshift; 304 } 305 306 /* allocate space for column pointers */ 307 nz = ia[nslim_col] + (!ishift); 308 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 309 *jja = ja; 310 311 /* loop over matrix putting into ja */ 312 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 313 j = aj + ai[row] + ishift; 314 col = *j++ + ishift; 315 i2 = tvc[col]; 316 nz = ai[row+1] - ai[row]; 317 while (nz-- > 0) { 318 /* ja[work[i1]++] = i2 + oshift; */ 319 ja[work[i2]++] = i1 + oshift; 320 i2++; 321 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 322 if (nz > 0) i2 = tvc[col]; 323 } 324 } 325 ierr = PetscFree(ns_col);CHKERRQ(ierr); 326 ierr = PetscFree(work);CHKERRQ(ierr); 327 ierr = PetscFree2(tns,tvc);CHKERRQ(ierr); 328 PetscFunctionReturn(0); 329 } 330 331 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 332 { 333 PetscErrorCode ierr; 334 335 PetscFunctionBegin; 336 ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr); 337 if (!ia) PetscFunctionReturn(0); 338 339 if (!blockcompressed) { 340 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); 341 } else if (symmetric) { 342 /* Since the indices are symmetric it does'nt matter */ 343 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 344 } else { 345 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 346 } 347 PetscFunctionReturn(0); 348 } 349 350 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 if (!ia) PetscFunctionReturn(0); 356 if (!blockcompressed) { 357 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr); 358 } else { 359 ierr = PetscFree(*ia);CHKERRQ(ierr); 360 ierr = PetscFree(*ja);CHKERRQ(ierr); 361 } 362 PetscFunctionReturn(0); 363 } 364 365 /* ----------------------------------------------------------- */ 366 367 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 368 { 369 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 370 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 371 PetscScalar *y; 372 const PetscScalar *x; 373 const MatScalar *v1,*v2,*v3,*v4,*v5; 374 PetscErrorCode ierr; 375 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 376 const PetscInt *idx,*ns,*ii; 377 378 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 379 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 380 #endif 381 382 PetscFunctionBegin; 383 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 384 node_max = a->inode.node_count; 385 ns = a->inode.size; /* Node Size array */ 386 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 387 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 388 idx = a->j; 389 v1 = a->a; 390 ii = a->i; 391 392 for (i = 0,row = 0; i< node_max; ++i) { 393 nsz = ns[i]; 394 n = ii[1] - ii[0]; 395 nonzerorow += (n>0)*nsz; 396 ii += nsz; 397 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 398 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 399 sz = n; /* No of non zeros in this row */ 400 /* Switch on the size of Node */ 401 switch (nsz) { /* Each loop in 'case' is unrolled */ 402 case 1: 403 sum1 = 0.; 404 405 for (n = 0; n< sz-1; n+=2) { 406 i1 = idx[0]; /* The instructions are ordered to */ 407 i2 = idx[1]; /* make the compiler's job easy */ 408 idx += 2; 409 tmp0 = x[i1]; 410 tmp1 = x[i2]; 411 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 412 } 413 414 if (n == sz-1) { /* Take care of the last nonzero */ 415 tmp0 = x[*idx++]; 416 sum1 += *v1++ *tmp0; 417 } 418 y[row++]=sum1; 419 break; 420 case 2: 421 sum1 = 0.; 422 sum2 = 0.; 423 v2 = v1 + n; 424 425 for (n = 0; n< sz-1; n+=2) { 426 i1 = idx[0]; 427 i2 = idx[1]; 428 idx += 2; 429 tmp0 = x[i1]; 430 tmp1 = x[i2]; 431 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 432 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 433 } 434 if (n == sz-1) { 435 tmp0 = x[*idx++]; 436 sum1 += *v1++ * tmp0; 437 sum2 += *v2++ * tmp0; 438 } 439 y[row++]=sum1; 440 y[row++]=sum2; 441 v1 =v2; /* Since the next block to be processed starts there*/ 442 idx +=sz; 443 break; 444 case 3: 445 sum1 = 0.; 446 sum2 = 0.; 447 sum3 = 0.; 448 v2 = v1 + n; 449 v3 = v2 + n; 450 451 for (n = 0; n< sz-1; n+=2) { 452 i1 = idx[0]; 453 i2 = idx[1]; 454 idx += 2; 455 tmp0 = x[i1]; 456 tmp1 = x[i2]; 457 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 458 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 459 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 460 } 461 if (n == sz-1) { 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 sum3 += *v3++ * tmp0; 466 } 467 y[row++]=sum1; 468 y[row++]=sum2; 469 y[row++]=sum3; 470 v1 =v3; /* Since the next block to be processed starts there*/ 471 idx +=2*sz; 472 break; 473 case 4: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 sum4 = 0.; 478 v2 = v1 + n; 479 v3 = v2 + n; 480 v4 = v3 + n; 481 482 for (n = 0; n< sz-1; n+=2) { 483 i1 = idx[0]; 484 i2 = idx[1]; 485 idx += 2; 486 tmp0 = x[i1]; 487 tmp1 = x[i2]; 488 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 489 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 490 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 491 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 492 } 493 if (n == sz-1) { 494 tmp0 = x[*idx++]; 495 sum1 += *v1++ * tmp0; 496 sum2 += *v2++ * tmp0; 497 sum3 += *v3++ * tmp0; 498 sum4 += *v4++ * tmp0; 499 } 500 y[row++]=sum1; 501 y[row++]=sum2; 502 y[row++]=sum3; 503 y[row++]=sum4; 504 v1 =v4; /* Since the next block to be processed starts there*/ 505 idx +=3*sz; 506 break; 507 case 5: 508 sum1 = 0.; 509 sum2 = 0.; 510 sum3 = 0.; 511 sum4 = 0.; 512 sum5 = 0.; 513 v2 = v1 + n; 514 v3 = v2 + n; 515 v4 = v3 + n; 516 v5 = v4 + n; 517 518 for (n = 0; n<sz-1; n+=2) { 519 i1 = idx[0]; 520 i2 = idx[1]; 521 idx += 2; 522 tmp0 = x[i1]; 523 tmp1 = x[i2]; 524 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 525 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 526 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 527 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 528 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 529 } 530 if (n == sz-1) { 531 tmp0 = x[*idx++]; 532 sum1 += *v1++ * tmp0; 533 sum2 += *v2++ * tmp0; 534 sum3 += *v3++ * tmp0; 535 sum4 += *v4++ * tmp0; 536 sum5 += *v5++ * tmp0; 537 } 538 y[row++]=sum1; 539 y[row++]=sum2; 540 y[row++]=sum3; 541 y[row++]=sum4; 542 y[row++]=sum5; 543 v1 =v5; /* Since the next block to be processed starts there */ 544 idx +=4*sz; 545 break; 546 default: 547 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 548 } 549 } 550 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 551 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 552 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 553 PetscFunctionReturn(0); 554 } 555 /* ----------------------------------------------------------- */ 556 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 557 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 558 { 559 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 560 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 561 const MatScalar *v1,*v2,*v3,*v4,*v5; 562 const PetscScalar *x; 563 PetscScalar *y,*z,*zt; 564 PetscErrorCode ierr; 565 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 566 const PetscInt *idx,*ns,*ii; 567 568 PetscFunctionBegin; 569 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 570 node_max = a->inode.node_count; 571 ns = a->inode.size; /* Node Size array */ 572 573 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 575 zt = z; 576 577 idx = a->j; 578 v1 = a->a; 579 ii = a->i; 580 581 for (i = 0,row = 0; i< node_max; ++i) { 582 nsz = ns[i]; 583 n = ii[1] - ii[0]; 584 ii += nsz; 585 sz = n; /* No of non zeros in this row */ 586 /* Switch on the size of Node */ 587 switch (nsz) { /* Each loop in 'case' is unrolled */ 588 case 1: 589 sum1 = *zt++; 590 591 for (n = 0; n< sz-1; n+=2) { 592 i1 = idx[0]; /* The instructions are ordered to */ 593 i2 = idx[1]; /* make the compiler's job easy */ 594 idx += 2; 595 tmp0 = x[i1]; 596 tmp1 = x[i2]; 597 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 598 } 599 600 if (n == sz-1) { /* Take care of the last nonzero */ 601 tmp0 = x[*idx++]; 602 sum1 += *v1++ * tmp0; 603 } 604 y[row++]=sum1; 605 break; 606 case 2: 607 sum1 = *zt++; 608 sum2 = *zt++; 609 v2 = v1 + n; 610 611 for (n = 0; n< sz-1; n+=2) { 612 i1 = idx[0]; 613 i2 = idx[1]; 614 idx += 2; 615 tmp0 = x[i1]; 616 tmp1 = x[i2]; 617 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 618 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 619 } 620 if (n == sz-1) { 621 tmp0 = x[*idx++]; 622 sum1 += *v1++ * tmp0; 623 sum2 += *v2++ * tmp0; 624 } 625 y[row++]=sum1; 626 y[row++]=sum2; 627 v1 =v2; /* Since the next block to be processed starts there*/ 628 idx +=sz; 629 break; 630 case 3: 631 sum1 = *zt++; 632 sum2 = *zt++; 633 sum3 = *zt++; 634 v2 = v1 + n; 635 v3 = v2 + n; 636 637 for (n = 0; n< sz-1; n+=2) { 638 i1 = idx[0]; 639 i2 = idx[1]; 640 idx += 2; 641 tmp0 = x[i1]; 642 tmp1 = x[i2]; 643 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 644 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 645 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 646 } 647 if (n == sz-1) { 648 tmp0 = x[*idx++]; 649 sum1 += *v1++ * tmp0; 650 sum2 += *v2++ * tmp0; 651 sum3 += *v3++ * tmp0; 652 } 653 y[row++]=sum1; 654 y[row++]=sum2; 655 y[row++]=sum3; 656 v1 =v3; /* Since the next block to be processed starts there*/ 657 idx +=2*sz; 658 break; 659 case 4: 660 sum1 = *zt++; 661 sum2 = *zt++; 662 sum3 = *zt++; 663 sum4 = *zt++; 664 v2 = v1 + n; 665 v3 = v2 + n; 666 v4 = v3 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 677 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 678 } 679 if (n == sz-1) { 680 tmp0 = x[*idx++]; 681 sum1 += *v1++ * tmp0; 682 sum2 += *v2++ * tmp0; 683 sum3 += *v3++ * tmp0; 684 sum4 += *v4++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 y[row++]=sum4; 690 v1 =v4; /* Since the next block to be processed starts there*/ 691 idx +=3*sz; 692 break; 693 case 5: 694 sum1 = *zt++; 695 sum2 = *zt++; 696 sum3 = *zt++; 697 sum4 = *zt++; 698 sum5 = *zt++; 699 v2 = v1 + n; 700 v3 = v2 + n; 701 v4 = v3 + n; 702 v5 = v4 + n; 703 704 for (n = 0; n<sz-1; n+=2) { 705 i1 = idx[0]; 706 i2 = idx[1]; 707 idx += 2; 708 tmp0 = x[i1]; 709 tmp1 = x[i2]; 710 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 711 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 712 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 713 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 714 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 715 } 716 if (n == sz-1) { 717 tmp0 = x[*idx++]; 718 sum1 += *v1++ * tmp0; 719 sum2 += *v2++ * tmp0; 720 sum3 += *v3++ * tmp0; 721 sum4 += *v4++ * tmp0; 722 sum5 += *v5++ * tmp0; 723 } 724 y[row++]=sum1; 725 y[row++]=sum2; 726 y[row++]=sum3; 727 y[row++]=sum4; 728 y[row++]=sum5; 729 v1 =v5; /* Since the next block to be processed starts there */ 730 idx +=4*sz; 731 break; 732 default: 733 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 734 } 735 } 736 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 737 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 738 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 739 PetscFunctionReturn(0); 740 } 741 742 /* ----------------------------------------------------------- */ 743 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 744 { 745 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 746 IS iscol = a->col,isrow = a->row; 747 PetscErrorCode ierr; 748 const PetscInt *r,*c,*rout,*cout; 749 PetscInt i,j,n = A->rmap->n,nz; 750 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 751 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 752 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 753 PetscScalar sum1,sum2,sum3,sum4,sum5; 754 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 755 const PetscScalar *b; 756 757 PetscFunctionBegin; 758 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 759 node_max = a->inode.node_count; 760 ns = a->inode.size; /* Node Size array */ 761 762 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 763 ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr); 764 tmp = a->solve_work; 765 766 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 767 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 768 769 /* forward solve the lower triangular */ 770 tmps = tmp; 771 aa = a_a; 772 aj = a_j; 773 ad = a->diag; 774 775 for (i = 0,row = 0; i< node_max; ++i) { 776 nsz = ns[i]; 777 aii = ai[row]; 778 v1 = aa + aii; 779 vi = aj + aii; 780 nz = ad[row]- aii; 781 if (i < node_max-1) { 782 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 783 * but our indexing to determine it's size could. */ 784 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 785 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 786 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 787 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 788 } 789 790 switch (nsz) { /* Each loop in 'case' is unrolled */ 791 case 1: 792 sum1 = b[*r++]; 793 for (j=0; j<nz-1; j+=2) { 794 i0 = vi[0]; 795 i1 = vi[1]; 796 vi +=2; 797 tmp0 = tmps[i0]; 798 tmp1 = tmps[i1]; 799 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 800 } 801 if (j == nz-1) { 802 tmp0 = tmps[*vi++]; 803 sum1 -= *v1++ *tmp0; 804 } 805 tmp[row++]=sum1; 806 break; 807 case 2: 808 sum1 = b[*r++]; 809 sum2 = b[*r++]; 810 v2 = aa + ai[row+1]; 811 812 for (j=0; j<nz-1; j+=2) { 813 i0 = vi[0]; 814 i1 = vi[1]; 815 vi +=2; 816 tmp0 = tmps[i0]; 817 tmp1 = tmps[i1]; 818 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 819 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 820 } 821 if (j == nz-1) { 822 tmp0 = tmps[*vi++]; 823 sum1 -= *v1++ *tmp0; 824 sum2 -= *v2++ *tmp0; 825 } 826 sum2 -= *v2++ *sum1; 827 tmp[row++]=sum1; 828 tmp[row++]=sum2; 829 break; 830 case 3: 831 sum1 = b[*r++]; 832 sum2 = b[*r++]; 833 sum3 = b[*r++]; 834 v2 = aa + ai[row+1]; 835 v3 = aa + ai[row+2]; 836 837 for (j=0; j<nz-1; j+=2) { 838 i0 = vi[0]; 839 i1 = vi[1]; 840 vi +=2; 841 tmp0 = tmps[i0]; 842 tmp1 = tmps[i1]; 843 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 844 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 845 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 846 } 847 if (j == nz-1) { 848 tmp0 = tmps[*vi++]; 849 sum1 -= *v1++ *tmp0; 850 sum2 -= *v2++ *tmp0; 851 sum3 -= *v3++ *tmp0; 852 } 853 sum2 -= *v2++ * sum1; 854 sum3 -= *v3++ * sum1; 855 sum3 -= *v3++ * sum2; 856 857 tmp[row++]=sum1; 858 tmp[row++]=sum2; 859 tmp[row++]=sum3; 860 break; 861 862 case 4: 863 sum1 = b[*r++]; 864 sum2 = b[*r++]; 865 sum3 = b[*r++]; 866 sum4 = b[*r++]; 867 v2 = aa + ai[row+1]; 868 v3 = aa + ai[row+2]; 869 v4 = aa + ai[row+3]; 870 871 for (j=0; j<nz-1; j+=2) { 872 i0 = vi[0]; 873 i1 = vi[1]; 874 vi +=2; 875 tmp0 = tmps[i0]; 876 tmp1 = tmps[i1]; 877 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 878 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 879 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 880 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 881 } 882 if (j == nz-1) { 883 tmp0 = tmps[*vi++]; 884 sum1 -= *v1++ *tmp0; 885 sum2 -= *v2++ *tmp0; 886 sum3 -= *v3++ *tmp0; 887 sum4 -= *v4++ *tmp0; 888 } 889 sum2 -= *v2++ * sum1; 890 sum3 -= *v3++ * sum1; 891 sum4 -= *v4++ * sum1; 892 sum3 -= *v3++ * sum2; 893 sum4 -= *v4++ * sum2; 894 sum4 -= *v4++ * sum3; 895 896 tmp[row++]=sum1; 897 tmp[row++]=sum2; 898 tmp[row++]=sum3; 899 tmp[row++]=sum4; 900 break; 901 case 5: 902 sum1 = b[*r++]; 903 sum2 = b[*r++]; 904 sum3 = b[*r++]; 905 sum4 = b[*r++]; 906 sum5 = b[*r++]; 907 v2 = aa + ai[row+1]; 908 v3 = aa + ai[row+2]; 909 v4 = aa + ai[row+3]; 910 v5 = aa + ai[row+4]; 911 912 for (j=0; j<nz-1; j+=2) { 913 i0 = vi[0]; 914 i1 = vi[1]; 915 vi +=2; 916 tmp0 = tmps[i0]; 917 tmp1 = tmps[i1]; 918 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 919 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 920 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 921 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 922 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 923 } 924 if (j == nz-1) { 925 tmp0 = tmps[*vi++]; 926 sum1 -= *v1++ *tmp0; 927 sum2 -= *v2++ *tmp0; 928 sum3 -= *v3++ *tmp0; 929 sum4 -= *v4++ *tmp0; 930 sum5 -= *v5++ *tmp0; 931 } 932 933 sum2 -= *v2++ * sum1; 934 sum3 -= *v3++ * sum1; 935 sum4 -= *v4++ * sum1; 936 sum5 -= *v5++ * sum1; 937 sum3 -= *v3++ * sum2; 938 sum4 -= *v4++ * sum2; 939 sum5 -= *v5++ * sum2; 940 sum4 -= *v4++ * sum3; 941 sum5 -= *v5++ * sum3; 942 sum5 -= *v5++ * sum4; 943 944 tmp[row++]=sum1; 945 tmp[row++]=sum2; 946 tmp[row++]=sum3; 947 tmp[row++]=sum4; 948 tmp[row++]=sum5; 949 break; 950 default: 951 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 952 } 953 } 954 /* backward solve the upper triangular */ 955 for (i=node_max -1,row = n-1; i>=0; i--) { 956 nsz = ns[i]; 957 aii = ai[row+1] -1; 958 v1 = aa + aii; 959 vi = aj + aii; 960 nz = aii- ad[row]; 961 switch (nsz) { /* Each loop in 'case' is unrolled */ 962 case 1: 963 sum1 = tmp[row]; 964 965 for (j=nz; j>1; j-=2) { 966 vi -=2; 967 i0 = vi[2]; 968 i1 = vi[1]; 969 tmp0 = tmps[i0]; 970 tmp1 = tmps[i1]; 971 v1 -= 2; 972 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 973 } 974 if (j==1) { 975 tmp0 = tmps[*vi--]; 976 sum1 -= *v1-- * tmp0; 977 } 978 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 979 break; 980 case 2: 981 sum1 = tmp[row]; 982 sum2 = tmp[row -1]; 983 v2 = aa + ai[row]-1; 984 for (j=nz; j>1; j-=2) { 985 vi -=2; 986 i0 = vi[2]; 987 i1 = vi[1]; 988 tmp0 = tmps[i0]; 989 tmp1 = tmps[i1]; 990 v1 -= 2; 991 v2 -= 2; 992 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 993 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 994 } 995 if (j==1) { 996 tmp0 = tmps[*vi--]; 997 sum1 -= *v1-- * tmp0; 998 sum2 -= *v2-- * tmp0; 999 } 1000 1001 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1002 sum2 -= *v2-- * tmp0; 1003 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1004 break; 1005 case 3: 1006 sum1 = tmp[row]; 1007 sum2 = tmp[row -1]; 1008 sum3 = tmp[row -2]; 1009 v2 = aa + ai[row]-1; 1010 v3 = aa + ai[row -1]-1; 1011 for (j=nz; j>1; j-=2) { 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 v3 -= 2; 1020 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1021 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1022 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1023 } 1024 if (j==1) { 1025 tmp0 = tmps[*vi--]; 1026 sum1 -= *v1-- * tmp0; 1027 sum2 -= *v2-- * tmp0; 1028 sum3 -= *v3-- * tmp0; 1029 } 1030 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1031 sum2 -= *v2-- * tmp0; 1032 sum3 -= *v3-- * tmp0; 1033 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1034 sum3 -= *v3-- * tmp0; 1035 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1036 1037 break; 1038 case 4: 1039 sum1 = tmp[row]; 1040 sum2 = tmp[row -1]; 1041 sum3 = tmp[row -2]; 1042 sum4 = tmp[row -3]; 1043 v2 = aa + ai[row]-1; 1044 v3 = aa + ai[row -1]-1; 1045 v4 = aa + ai[row -2]-1; 1046 1047 for (j=nz; j>1; j-=2) { 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 v4 -= 2; 1057 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1058 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1059 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1060 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1061 } 1062 if (j==1) { 1063 tmp0 = tmps[*vi--]; 1064 sum1 -= *v1-- * tmp0; 1065 sum2 -= *v2-- * tmp0; 1066 sum3 -= *v3-- * tmp0; 1067 sum4 -= *v4-- * tmp0; 1068 } 1069 1070 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1071 sum2 -= *v2-- * tmp0; 1072 sum3 -= *v3-- * tmp0; 1073 sum4 -= *v4-- * tmp0; 1074 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1075 sum3 -= *v3-- * tmp0; 1076 sum4 -= *v4-- * tmp0; 1077 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1078 sum4 -= *v4-- * tmp0; 1079 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1080 break; 1081 case 5: 1082 sum1 = tmp[row]; 1083 sum2 = tmp[row -1]; 1084 sum3 = tmp[row -2]; 1085 sum4 = tmp[row -3]; 1086 sum5 = tmp[row -4]; 1087 v2 = aa + ai[row]-1; 1088 v3 = aa + ai[row -1]-1; 1089 v4 = aa + ai[row -2]-1; 1090 v5 = aa + ai[row -3]-1; 1091 for (j=nz; j>1; j-=2) { 1092 vi -= 2; 1093 i0 = vi[2]; 1094 i1 = vi[1]; 1095 tmp0 = tmps[i0]; 1096 tmp1 = tmps[i1]; 1097 v1 -= 2; 1098 v2 -= 2; 1099 v3 -= 2; 1100 v4 -= 2; 1101 v5 -= 2; 1102 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1103 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1104 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1105 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1106 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1107 } 1108 if (j==1) { 1109 tmp0 = tmps[*vi--]; 1110 sum1 -= *v1-- * tmp0; 1111 sum2 -= *v2-- * tmp0; 1112 sum3 -= *v3-- * tmp0; 1113 sum4 -= *v4-- * tmp0; 1114 sum5 -= *v5-- * tmp0; 1115 } 1116 1117 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1118 sum2 -= *v2-- * tmp0; 1119 sum3 -= *v3-- * tmp0; 1120 sum4 -= *v4-- * tmp0; 1121 sum5 -= *v5-- * tmp0; 1122 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1123 sum3 -= *v3-- * tmp0; 1124 sum4 -= *v4-- * tmp0; 1125 sum5 -= *v5-- * tmp0; 1126 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1127 sum4 -= *v4-- * tmp0; 1128 sum5 -= *v5-- * tmp0; 1129 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1130 sum5 -= *v5-- * tmp0; 1131 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1132 break; 1133 default: 1134 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1135 } 1136 } 1137 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1138 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1139 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1140 ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr); 1141 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1142 PetscFunctionReturn(0); 1143 } 1144 1145 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1146 { 1147 Mat C =B; 1148 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1149 IS isrow = b->row,isicol = b->icol; 1150 PetscErrorCode ierr; 1151 const PetscInt *r,*ic,*ics; 1152 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1153 PetscInt i,j,k,nz,nzL,row,*pj; 1154 const PetscInt *ajtmp,*bjtmp; 1155 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1156 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1157 FactorShiftCtx sctx; 1158 const PetscInt *ddiag; 1159 PetscReal rs; 1160 MatScalar d; 1161 PetscInt inod,nodesz,node_max,col; 1162 const PetscInt *ns; 1163 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1164 1165 PetscFunctionBegin; 1166 /* MatPivotSetUp(): initialize shift context sctx */ 1167 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1168 1169 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1170 ddiag = a->diag; 1171 sctx.shift_top = info->zeropivot; 1172 for (i=0; i<n; i++) { 1173 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1174 d = (aa)[ddiag[i]]; 1175 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1176 v = aa+ai[i]; 1177 nz = ai[i+1] - ai[i]; 1178 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1179 if (rs>sctx.shift_top) sctx.shift_top = rs; 1180 } 1181 sctx.shift_top *= 1.1; 1182 sctx.nshift_max = 5; 1183 sctx.shift_lo = 0.; 1184 sctx.shift_hi = 1.; 1185 } 1186 1187 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1188 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1189 1190 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1191 ics = ic; 1192 1193 node_max = a->inode.node_count; 1194 ns = a->inode.size; 1195 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1196 1197 /* If max inode size > 4, split it into two inodes.*/ 1198 /* also map the inode sizes according to the ordering */ 1199 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1200 for (i=0,j=0; i<node_max; ++i,++j) { 1201 if (ns[i] > 4) { 1202 tmp_vec1[j] = 4; 1203 ++j; 1204 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1205 } else { 1206 tmp_vec1[j] = ns[i]; 1207 } 1208 } 1209 /* Use the correct node_max */ 1210 node_max = j; 1211 1212 /* Now reorder the inode info based on mat re-ordering info */ 1213 /* First create a row -> inode_size_array_index map */ 1214 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1215 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1216 for (i=0,row=0; i<node_max; i++) { 1217 nodesz = tmp_vec1[i]; 1218 for (j=0; j<nodesz; j++,row++) { 1219 nsmap[row] = i; 1220 } 1221 } 1222 /* Using nsmap, create a reordered ns structure */ 1223 for (i=0,j=0; i< node_max; i++) { 1224 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1225 tmp_vec2[i] = nodesz; 1226 j += nodesz; 1227 } 1228 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1229 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1230 1231 /* Now use the correct ns */ 1232 ns = tmp_vec2; 1233 1234 do { 1235 sctx.newshift = PETSC_FALSE; 1236 /* Now loop over each block-row, and do the factorization */ 1237 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1238 nodesz = ns[inod]; 1239 1240 switch (nodesz) { 1241 case 1: 1242 /*----------*/ 1243 /* zero rtmp1 */ 1244 /* L part */ 1245 nz = bi[i+1] - bi[i]; 1246 bjtmp = bj + bi[i]; 1247 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1248 1249 /* U part */ 1250 nz = bdiag[i]-bdiag[i+1]; 1251 bjtmp = bj + bdiag[i+1]+1; 1252 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1253 1254 /* load in initial (unfactored row) */ 1255 nz = ai[r[i]+1] - ai[r[i]]; 1256 ajtmp = aj + ai[r[i]]; 1257 v = aa + ai[r[i]]; 1258 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1259 1260 /* ZeropivotApply() */ 1261 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1262 1263 /* elimination */ 1264 bjtmp = bj + bi[i]; 1265 row = *bjtmp++; 1266 nzL = bi[i+1] - bi[i]; 1267 for (k=0; k < nzL; k++) { 1268 pc = rtmp1 + row; 1269 if (*pc != 0.0) { 1270 pv = b->a + bdiag[row]; 1271 mul1 = *pc * (*pv); 1272 *pc = mul1; 1273 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1274 pv = b->a + bdiag[row+1]+1; 1275 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1276 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1277 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr); 1278 } 1279 row = *bjtmp++; 1280 } 1281 1282 /* finished row so stick it into b->a */ 1283 rs = 0.0; 1284 /* L part */ 1285 pv = b->a + bi[i]; 1286 pj = b->j + bi[i]; 1287 nz = bi[i+1] - bi[i]; 1288 for (j=0; j<nz; j++) { 1289 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1290 } 1291 1292 /* U part */ 1293 pv = b->a + bdiag[i+1]+1; 1294 pj = b->j + bdiag[i+1]+1; 1295 nz = bdiag[i] - bdiag[i+1]-1; 1296 for (j=0; j<nz; j++) { 1297 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1298 } 1299 1300 /* Check zero pivot */ 1301 sctx.rs = rs; 1302 sctx.pv = rtmp1[i]; 1303 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1304 if (sctx.newshift) break; 1305 1306 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1307 pv = b->a + bdiag[i]; 1308 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1309 break; 1310 1311 case 2: 1312 /*----------*/ 1313 /* zero rtmp1 and rtmp2 */ 1314 /* L part */ 1315 nz = bi[i+1] - bi[i]; 1316 bjtmp = bj + bi[i]; 1317 for (j=0; j<nz; j++) { 1318 col = bjtmp[j]; 1319 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1320 } 1321 1322 /* U part */ 1323 nz = bdiag[i]-bdiag[i+1]; 1324 bjtmp = bj + bdiag[i+1]+1; 1325 for (j=0; j<nz; j++) { 1326 col = bjtmp[j]; 1327 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1328 } 1329 1330 /* load in initial (unfactored row) */ 1331 nz = ai[r[i]+1] - ai[r[i]]; 1332 ajtmp = aj + ai[r[i]]; 1333 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1334 for (j=0; j<nz; j++) { 1335 col = ics[ajtmp[j]]; 1336 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1337 } 1338 /* ZeropivotApply(): shift the diagonal of the matrix */ 1339 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1340 1341 /* elimination */ 1342 bjtmp = bj + bi[i]; 1343 row = *bjtmp++; /* pivot row */ 1344 nzL = bi[i+1] - bi[i]; 1345 for (k=0; k < nzL; k++) { 1346 pc1 = rtmp1 + row; 1347 pc2 = rtmp2 + row; 1348 if (*pc1 != 0.0 || *pc2 != 0.0) { 1349 pv = b->a + bdiag[row]; 1350 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1351 *pc1 = mul1; *pc2 = mul2; 1352 1353 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1354 pv = b->a + bdiag[row+1]+1; 1355 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1356 for (j=0; j<nz; j++) { 1357 col = pj[j]; 1358 rtmp1[col] -= mul1 * pv[j]; 1359 rtmp2[col] -= mul2 * pv[j]; 1360 } 1361 ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr); 1362 } 1363 row = *bjtmp++; 1364 } 1365 1366 /* finished row i; check zero pivot, then stick row i into b->a */ 1367 rs = 0.0; 1368 /* L part */ 1369 pc1 = b->a + bi[i]; 1370 pj = b->j + bi[i]; 1371 nz = bi[i+1] - bi[i]; 1372 for (j=0; j<nz; j++) { 1373 col = pj[j]; 1374 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1375 } 1376 /* U part */ 1377 pc1 = b->a + bdiag[i+1]+1; 1378 pj = b->j + bdiag[i+1]+1; 1379 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1380 for (j=0; j<nz; j++) { 1381 col = pj[j]; 1382 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1383 } 1384 1385 sctx.rs = rs; 1386 sctx.pv = rtmp1[i]; 1387 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1388 if (sctx.newshift) break; 1389 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1390 *pc1 = 1.0/sctx.pv; 1391 1392 /* Now take care of diagonal 2x2 block. */ 1393 pc2 = rtmp2 + i; 1394 if (*pc2 != 0.0) { 1395 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1396 *pc2 = mul1; /* insert L entry */ 1397 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1398 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1399 for (j=0; j<nz; j++) { 1400 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1401 } 1402 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr); 1403 } 1404 1405 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1406 rs = 0.0; 1407 /* L part */ 1408 pc2 = b->a + bi[i+1]; 1409 pj = b->j + bi[i+1]; 1410 nz = bi[i+2] - bi[i+1]; 1411 for (j=0; j<nz; j++) { 1412 col = pj[j]; 1413 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1414 } 1415 /* U part */ 1416 pc2 = b->a + bdiag[i+2]+1; 1417 pj = b->j + bdiag[i+2]+1; 1418 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1419 for (j=0; j<nz; j++) { 1420 col = pj[j]; 1421 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1422 } 1423 1424 sctx.rs = rs; 1425 sctx.pv = rtmp2[i+1]; 1426 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1427 if (sctx.newshift) break; 1428 pc2 = b->a + bdiag[i+1]; 1429 *pc2 = 1.0/sctx.pv; 1430 break; 1431 1432 case 3: 1433 /*----------*/ 1434 /* zero rtmp */ 1435 /* L part */ 1436 nz = bi[i+1] - bi[i]; 1437 bjtmp = bj + bi[i]; 1438 for (j=0; j<nz; j++) { 1439 col = bjtmp[j]; 1440 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1441 } 1442 1443 /* U part */ 1444 nz = bdiag[i]-bdiag[i+1]; 1445 bjtmp = bj + bdiag[i+1]+1; 1446 for (j=0; j<nz; j++) { 1447 col = bjtmp[j]; 1448 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1449 } 1450 1451 /* load in initial (unfactored row) */ 1452 nz = ai[r[i]+1] - ai[r[i]]; 1453 ajtmp = aj + ai[r[i]]; 1454 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1455 for (j=0; j<nz; j++) { 1456 col = ics[ajtmp[j]]; 1457 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1458 } 1459 /* ZeropivotApply(): shift the diagonal of the matrix */ 1460 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1461 1462 /* elimination */ 1463 bjtmp = bj + bi[i]; 1464 row = *bjtmp++; /* pivot row */ 1465 nzL = bi[i+1] - bi[i]; 1466 for (k=0; k < nzL; k++) { 1467 pc1 = rtmp1 + row; 1468 pc2 = rtmp2 + row; 1469 pc3 = rtmp3 + row; 1470 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1471 pv = b->a + bdiag[row]; 1472 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1473 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1474 1475 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1476 pv = b->a + bdiag[row+1]+1; 1477 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1478 for (j=0; j<nz; j++) { 1479 col = pj[j]; 1480 rtmp1[col] -= mul1 * pv[j]; 1481 rtmp2[col] -= mul2 * pv[j]; 1482 rtmp3[col] -= mul3 * pv[j]; 1483 } 1484 ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr); 1485 } 1486 row = *bjtmp++; 1487 } 1488 1489 /* finished row i; check zero pivot, then stick row i into b->a */ 1490 rs = 0.0; 1491 /* L part */ 1492 pc1 = b->a + bi[i]; 1493 pj = b->j + bi[i]; 1494 nz = bi[i+1] - bi[i]; 1495 for (j=0; j<nz; j++) { 1496 col = pj[j]; 1497 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1498 } 1499 /* U part */ 1500 pc1 = b->a + bdiag[i+1]+1; 1501 pj = b->j + bdiag[i+1]+1; 1502 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1503 for (j=0; j<nz; j++) { 1504 col = pj[j]; 1505 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1506 } 1507 1508 sctx.rs = rs; 1509 sctx.pv = rtmp1[i]; 1510 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1511 if (sctx.newshift) break; 1512 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1513 *pc1 = 1.0/sctx.pv; 1514 1515 /* Now take care of 1st column of diagonal 3x3 block. */ 1516 pc2 = rtmp2 + i; 1517 pc3 = rtmp3 + i; 1518 if (*pc2 != 0.0 || *pc3 != 0.0) { 1519 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1520 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1521 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1522 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1523 for (j=0; j<nz; j++) { 1524 col = pj[j]; 1525 rtmp2[col] -= mul2 * rtmp1[col]; 1526 rtmp3[col] -= mul3 * rtmp1[col]; 1527 } 1528 ierr = PetscLogFlops(2+4.0*nz);CHKERRQ(ierr); 1529 } 1530 1531 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1532 rs = 0.0; 1533 /* L part */ 1534 pc2 = b->a + bi[i+1]; 1535 pj = b->j + bi[i+1]; 1536 nz = bi[i+2] - bi[i+1]; 1537 for (j=0; j<nz; j++) { 1538 col = pj[j]; 1539 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1540 } 1541 /* U part */ 1542 pc2 = b->a + bdiag[i+2]+1; 1543 pj = b->j + bdiag[i+2]+1; 1544 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1545 for (j=0; j<nz; j++) { 1546 col = pj[j]; 1547 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1548 } 1549 1550 sctx.rs = rs; 1551 sctx.pv = rtmp2[i+1]; 1552 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1553 if (sctx.newshift) break; 1554 pc2 = b->a + bdiag[i+1]; 1555 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1556 1557 /* Now take care of 2nd column of diagonal 3x3 block. */ 1558 pc3 = rtmp3 + i+1; 1559 if (*pc3 != 0.0) { 1560 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1561 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1562 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1563 for (j=0; j<nz; j++) { 1564 col = pj[j]; 1565 rtmp3[col] -= mul3 * rtmp2[col]; 1566 } 1567 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr); 1568 } 1569 1570 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1571 rs = 0.0; 1572 /* L part */ 1573 pc3 = b->a + bi[i+2]; 1574 pj = b->j + bi[i+2]; 1575 nz = bi[i+3] - bi[i+2]; 1576 for (j=0; j<nz; j++) { 1577 col = pj[j]; 1578 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1579 } 1580 /* U part */ 1581 pc3 = b->a + bdiag[i+3]+1; 1582 pj = b->j + bdiag[i+3]+1; 1583 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1584 for (j=0; j<nz; j++) { 1585 col = pj[j]; 1586 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1587 } 1588 1589 sctx.rs = rs; 1590 sctx.pv = rtmp3[i+2]; 1591 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1592 if (sctx.newshift) break; 1593 pc3 = b->a + bdiag[i+2]; 1594 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1595 break; 1596 case 4: 1597 /*----------*/ 1598 /* zero rtmp */ 1599 /* L part */ 1600 nz = bi[i+1] - bi[i]; 1601 bjtmp = bj + bi[i]; 1602 for (j=0; j<nz; j++) { 1603 col = bjtmp[j]; 1604 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1605 } 1606 1607 /* U part */ 1608 nz = bdiag[i]-bdiag[i+1]; 1609 bjtmp = bj + bdiag[i+1]+1; 1610 for (j=0; j<nz; j++) { 1611 col = bjtmp[j]; 1612 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1613 } 1614 1615 /* load in initial (unfactored row) */ 1616 nz = ai[r[i]+1] - ai[r[i]]; 1617 ajtmp = aj + ai[r[i]]; 1618 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1619 for (j=0; j<nz; j++) { 1620 col = ics[ajtmp[j]]; 1621 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1622 } 1623 /* ZeropivotApply(): shift the diagonal of the matrix */ 1624 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1625 1626 /* elimination */ 1627 bjtmp = bj + bi[i]; 1628 row = *bjtmp++; /* pivot row */ 1629 nzL = bi[i+1] - bi[i]; 1630 for (k=0; k < nzL; k++) { 1631 pc1 = rtmp1 + row; 1632 pc2 = rtmp2 + row; 1633 pc3 = rtmp3 + row; 1634 pc4 = rtmp4 + row; 1635 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1636 pv = b->a + bdiag[row]; 1637 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1638 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1639 1640 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1641 pv = b->a + bdiag[row+1]+1; 1642 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1643 for (j=0; j<nz; j++) { 1644 col = pj[j]; 1645 rtmp1[col] -= mul1 * pv[j]; 1646 rtmp2[col] -= mul2 * pv[j]; 1647 rtmp3[col] -= mul3 * pv[j]; 1648 rtmp4[col] -= mul4 * pv[j]; 1649 } 1650 ierr = PetscLogFlops(4+8.0*nz);CHKERRQ(ierr); 1651 } 1652 row = *bjtmp++; 1653 } 1654 1655 /* finished row i; check zero pivot, then stick row i into b->a */ 1656 rs = 0.0; 1657 /* L part */ 1658 pc1 = b->a + bi[i]; 1659 pj = b->j + bi[i]; 1660 nz = bi[i+1] - bi[i]; 1661 for (j=0; j<nz; j++) { 1662 col = pj[j]; 1663 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1664 } 1665 /* U part */ 1666 pc1 = b->a + bdiag[i+1]+1; 1667 pj = b->j + bdiag[i+1]+1; 1668 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1669 for (j=0; j<nz; j++) { 1670 col = pj[j]; 1671 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1672 } 1673 1674 sctx.rs = rs; 1675 sctx.pv = rtmp1[i]; 1676 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1677 if (sctx.newshift) break; 1678 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1679 *pc1 = 1.0/sctx.pv; 1680 1681 /* Now take care of 1st column of diagonal 4x4 block. */ 1682 pc2 = rtmp2 + i; 1683 pc3 = rtmp3 + i; 1684 pc4 = rtmp4 + i; 1685 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1686 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1687 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1688 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1689 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1690 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1691 for (j=0; j<nz; j++) { 1692 col = pj[j]; 1693 rtmp2[col] -= mul2 * rtmp1[col]; 1694 rtmp3[col] -= mul3 * rtmp1[col]; 1695 rtmp4[col] -= mul4 * rtmp1[col]; 1696 } 1697 ierr = PetscLogFlops(3+6.0*nz);CHKERRQ(ierr); 1698 } 1699 1700 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1701 rs = 0.0; 1702 /* L part */ 1703 pc2 = b->a + bi[i+1]; 1704 pj = b->j + bi[i+1]; 1705 nz = bi[i+2] - bi[i+1]; 1706 for (j=0; j<nz; j++) { 1707 col = pj[j]; 1708 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1709 } 1710 /* U part */ 1711 pc2 = b->a + bdiag[i+2]+1; 1712 pj = b->j + bdiag[i+2]+1; 1713 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1714 for (j=0; j<nz; j++) { 1715 col = pj[j]; 1716 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1717 } 1718 1719 sctx.rs = rs; 1720 sctx.pv = rtmp2[i+1]; 1721 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1722 if (sctx.newshift) break; 1723 pc2 = b->a + bdiag[i+1]; 1724 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1725 1726 /* Now take care of 2nd column of diagonal 4x4 block. */ 1727 pc3 = rtmp3 + i+1; 1728 pc4 = rtmp4 + i+1; 1729 if (*pc3 != 0.0 || *pc4 != 0.0) { 1730 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1731 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1732 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1733 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1734 for (j=0; j<nz; j++) { 1735 col = pj[j]; 1736 rtmp3[col] -= mul3 * rtmp2[col]; 1737 rtmp4[col] -= mul4 * rtmp2[col]; 1738 } 1739 ierr = PetscLogFlops(4.0*nz);CHKERRQ(ierr); 1740 } 1741 1742 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1743 rs = 0.0; 1744 /* L part */ 1745 pc3 = b->a + bi[i+2]; 1746 pj = b->j + bi[i+2]; 1747 nz = bi[i+3] - bi[i+2]; 1748 for (j=0; j<nz; j++) { 1749 col = pj[j]; 1750 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1751 } 1752 /* U part */ 1753 pc3 = b->a + bdiag[i+3]+1; 1754 pj = b->j + bdiag[i+3]+1; 1755 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1756 for (j=0; j<nz; j++) { 1757 col = pj[j]; 1758 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1759 } 1760 1761 sctx.rs = rs; 1762 sctx.pv = rtmp3[i+2]; 1763 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1764 if (sctx.newshift) break; 1765 pc3 = b->a + bdiag[i+2]; 1766 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1767 1768 /* Now take care of 3rd column of diagonal 4x4 block. */ 1769 pc4 = rtmp4 + i+2; 1770 if (*pc4 != 0.0) { 1771 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1772 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1773 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1774 for (j=0; j<nz; j++) { 1775 col = pj[j]; 1776 rtmp4[col] -= mul4 * rtmp3[col]; 1777 } 1778 ierr = PetscLogFlops(1+2.0*nz);CHKERRQ(ierr); 1779 } 1780 1781 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1782 rs = 0.0; 1783 /* L part */ 1784 pc4 = b->a + bi[i+3]; 1785 pj = b->j + bi[i+3]; 1786 nz = bi[i+4] - bi[i+3]; 1787 for (j=0; j<nz; j++) { 1788 col = pj[j]; 1789 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1790 } 1791 /* U part */ 1792 pc4 = b->a + bdiag[i+4]+1; 1793 pj = b->j + bdiag[i+4]+1; 1794 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1795 for (j=0; j<nz; j++) { 1796 col = pj[j]; 1797 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1798 } 1799 1800 sctx.rs = rs; 1801 sctx.pv = rtmp4[i+3]; 1802 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr); 1803 if (sctx.newshift) break; 1804 pc4 = b->a + bdiag[i+3]; 1805 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1806 break; 1807 1808 default: 1809 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1810 } 1811 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1812 i += nodesz; /* Update the row */ 1813 } 1814 1815 /* MatPivotRefine() */ 1816 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1817 /* 1818 * if no shift in this attempt & shifting & started shifting & can refine, 1819 * then try lower shift 1820 */ 1821 sctx.shift_hi = sctx.shift_fraction; 1822 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1823 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1824 sctx.newshift = PETSC_TRUE; 1825 sctx.nshift++; 1826 } 1827 } while (sctx.newshift); 1828 1829 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1830 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1831 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1832 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1833 1834 if (b->inode.size) { 1835 C->ops->solve = MatSolve_SeqAIJ_Inode; 1836 } else { 1837 C->ops->solve = MatSolve_SeqAIJ; 1838 } 1839 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1840 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1841 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1842 C->ops->matsolve = MatMatSolve_SeqAIJ; 1843 C->assembled = PETSC_TRUE; 1844 C->preallocated = PETSC_TRUE; 1845 1846 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1847 1848 /* MatShiftView(A,info,&sctx) */ 1849 if (sctx.nshift) { 1850 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1851 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1852 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1853 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1854 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1855 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1856 } 1857 } 1858 PetscFunctionReturn(0); 1859 } 1860 1861 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1862 { 1863 Mat C = B; 1864 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1865 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1866 PetscErrorCode ierr; 1867 const PetscInt *r,*ic,*c,*ics; 1868 PetscInt n = A->rmap->n,*bi = b->i; 1869 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1870 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1871 PetscInt *ai = a->i,*aj = a->j; 1872 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1873 PetscScalar mul1,mul2,mul3,tmp; 1874 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1875 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1876 PetscReal rs=0.0; 1877 FactorShiftCtx sctx; 1878 1879 PetscFunctionBegin; 1880 sctx.shift_top = 0; 1881 sctx.nshift_max = 0; 1882 sctx.shift_lo = 0; 1883 sctx.shift_hi = 0; 1884 sctx.shift_fraction = 0; 1885 1886 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1887 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1888 sctx.shift_top = 0; 1889 for (i=0; i<n; i++) { 1890 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1891 rs = 0.0; 1892 ajtmp = aj + ai[i]; 1893 rtmp1 = aa + ai[i]; 1894 nz = ai[i+1] - ai[i]; 1895 for (j=0; j<nz; j++) { 1896 if (*ajtmp != i) { 1897 rs += PetscAbsScalar(*rtmp1++); 1898 } else { 1899 rs -= PetscRealPart(*rtmp1++); 1900 } 1901 ajtmp++; 1902 } 1903 if (rs>sctx.shift_top) sctx.shift_top = rs; 1904 } 1905 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1906 sctx.shift_top *= 1.1; 1907 sctx.nshift_max = 5; 1908 sctx.shift_lo = 0.; 1909 sctx.shift_hi = 1.; 1910 } 1911 sctx.shift_amount = 0; 1912 sctx.nshift = 0; 1913 1914 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1915 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1916 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1917 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1918 ics = ic; 1919 1920 node_max = a->inode.node_count; 1921 ns = a->inode.size; 1922 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1923 1924 /* If max inode size > 3, split it into two inodes.*/ 1925 /* also map the inode sizes according to the ordering */ 1926 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1927 for (i=0,j=0; i<node_max; ++i,++j) { 1928 if (ns[i]>3) { 1929 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1930 ++j; 1931 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1932 } else { 1933 tmp_vec1[j] = ns[i]; 1934 } 1935 } 1936 /* Use the correct node_max */ 1937 node_max = j; 1938 1939 /* Now reorder the inode info based on mat re-ordering info */ 1940 /* First create a row -> inode_size_array_index map */ 1941 ierr = PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2);CHKERRQ(ierr); 1942 for (i=0,row=0; i<node_max; i++) { 1943 nodesz = tmp_vec1[i]; 1944 for (j=0; j<nodesz; j++,row++) { 1945 nsmap[row] = i; 1946 } 1947 } 1948 /* Using nsmap, create a reordered ns structure */ 1949 for (i=0,j=0; i< node_max; i++) { 1950 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1951 tmp_vec2[i] = nodesz; 1952 j += nodesz; 1953 } 1954 ierr = PetscFree2(nsmap,tmp_vec1);CHKERRQ(ierr); 1955 /* Now use the correct ns */ 1956 ns = tmp_vec2; 1957 1958 do { 1959 sctx.newshift = PETSC_FALSE; 1960 /* Now loop over each block-row, and do the factorization */ 1961 for (i=0,row=0; i<node_max; i++) { 1962 nodesz = ns[i]; 1963 nz = bi[row+1] - bi[row]; 1964 bjtmp = bj + bi[row]; 1965 1966 switch (nodesz) { 1967 case 1: 1968 for (j=0; j<nz; j++) { 1969 idx = bjtmp[j]; 1970 rtmp11[idx] = 0.0; 1971 } 1972 1973 /* load in initial (unfactored row) */ 1974 idx = r[row]; 1975 nz_tmp = ai[idx+1] - ai[idx]; 1976 ajtmp = aj + ai[idx]; 1977 v1 = aa + ai[idx]; 1978 1979 for (j=0; j<nz_tmp; j++) { 1980 idx = ics[ajtmp[j]]; 1981 rtmp11[idx] = v1[j]; 1982 } 1983 rtmp11[ics[r[row]]] += sctx.shift_amount; 1984 1985 prow = *bjtmp++; 1986 while (prow < row) { 1987 pc1 = rtmp11 + prow; 1988 if (*pc1 != 0.0) { 1989 pv = ba + bd[prow]; 1990 pj = nbj + bd[prow]; 1991 mul1 = *pc1 * *pv++; 1992 *pc1 = mul1; 1993 nz_tmp = bi[prow+1] - bd[prow] - 1; 1994 ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr); 1995 for (j=0; j<nz_tmp; j++) { 1996 tmp = pv[j]; 1997 idx = pj[j]; 1998 rtmp11[idx] -= mul1 * tmp; 1999 } 2000 } 2001 prow = *bjtmp++; 2002 } 2003 pj = bj + bi[row]; 2004 pc1 = ba + bi[row]; 2005 2006 sctx.pv = rtmp11[row]; 2007 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2008 rs = 0.0; 2009 for (j=0; j<nz; j++) { 2010 idx = pj[j]; 2011 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2012 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2013 } 2014 sctx.rs = rs; 2015 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2016 if (sctx.newshift) goto endofwhile; 2017 break; 2018 2019 case 2: 2020 for (j=0; j<nz; j++) { 2021 idx = bjtmp[j]; 2022 rtmp11[idx] = 0.0; 2023 rtmp22[idx] = 0.0; 2024 } 2025 2026 /* load in initial (unfactored row) */ 2027 idx = r[row]; 2028 nz_tmp = ai[idx+1] - ai[idx]; 2029 ajtmp = aj + ai[idx]; 2030 v1 = aa + ai[idx]; 2031 v2 = aa + ai[idx+1]; 2032 for (j=0; j<nz_tmp; j++) { 2033 idx = ics[ajtmp[j]]; 2034 rtmp11[idx] = v1[j]; 2035 rtmp22[idx] = v2[j]; 2036 } 2037 rtmp11[ics[r[row]]] += sctx.shift_amount; 2038 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2039 2040 prow = *bjtmp++; 2041 while (prow < row) { 2042 pc1 = rtmp11 + prow; 2043 pc2 = rtmp22 + prow; 2044 if (*pc1 != 0.0 || *pc2 != 0.0) { 2045 pv = ba + bd[prow]; 2046 pj = nbj + bd[prow]; 2047 mul1 = *pc1 * *pv; 2048 mul2 = *pc2 * *pv; 2049 ++pv; 2050 *pc1 = mul1; 2051 *pc2 = mul2; 2052 2053 nz_tmp = bi[prow+1] - bd[prow] - 1; 2054 for (j=0; j<nz_tmp; j++) { 2055 tmp = pv[j]; 2056 idx = pj[j]; 2057 rtmp11[idx] -= mul1 * tmp; 2058 rtmp22[idx] -= mul2 * tmp; 2059 } 2060 ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr); 2061 } 2062 prow = *bjtmp++; 2063 } 2064 2065 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2066 pc1 = rtmp11 + prow; 2067 pc2 = rtmp22 + prow; 2068 2069 sctx.pv = *pc1; 2070 pj = bj + bi[prow]; 2071 rs = 0.0; 2072 for (j=0; j<nz; j++) { 2073 idx = pj[j]; 2074 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2075 } 2076 sctx.rs = rs; 2077 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2078 if (sctx.newshift) goto endofwhile; 2079 2080 if (*pc2 != 0.0) { 2081 pj = nbj + bd[prow]; 2082 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2083 *pc2 = mul2; 2084 nz_tmp = bi[prow+1] - bd[prow] - 1; 2085 for (j=0; j<nz_tmp; j++) { 2086 idx = pj[j]; 2087 tmp = rtmp11[idx]; 2088 rtmp22[idx] -= mul2 * tmp; 2089 } 2090 ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr); 2091 } 2092 2093 pj = bj + bi[row]; 2094 pc1 = ba + bi[row]; 2095 pc2 = ba + bi[row+1]; 2096 2097 sctx.pv = rtmp22[row+1]; 2098 rs = 0.0; 2099 rtmp11[row] = 1.0/rtmp11[row]; 2100 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2101 /* copy row entries from dense representation to sparse */ 2102 for (j=0; j<nz; j++) { 2103 idx = pj[j]; 2104 pc1[j] = rtmp11[idx]; 2105 pc2[j] = rtmp22[idx]; 2106 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2107 } 2108 sctx.rs = rs; 2109 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2110 if (sctx.newshift) goto endofwhile; 2111 break; 2112 2113 case 3: 2114 for (j=0; j<nz; j++) { 2115 idx = bjtmp[j]; 2116 rtmp11[idx] = 0.0; 2117 rtmp22[idx] = 0.0; 2118 rtmp33[idx] = 0.0; 2119 } 2120 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2121 idx = r[row]; 2122 nz_tmp = ai[idx+1] - ai[idx]; 2123 ajtmp = aj + ai[idx]; 2124 v1 = aa + ai[idx]; 2125 v2 = aa + ai[idx+1]; 2126 v3 = aa + ai[idx+2]; 2127 for (j=0; j<nz_tmp; j++) { 2128 idx = ics[ajtmp[j]]; 2129 rtmp11[idx] = v1[j]; 2130 rtmp22[idx] = v2[j]; 2131 rtmp33[idx] = v3[j]; 2132 } 2133 rtmp11[ics[r[row]]] += sctx.shift_amount; 2134 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2135 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2136 2137 /* loop over all pivot row blocks above this row block */ 2138 prow = *bjtmp++; 2139 while (prow < row) { 2140 pc1 = rtmp11 + prow; 2141 pc2 = rtmp22 + prow; 2142 pc3 = rtmp33 + prow; 2143 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2144 pv = ba + bd[prow]; 2145 pj = nbj + bd[prow]; 2146 mul1 = *pc1 * *pv; 2147 mul2 = *pc2 * *pv; 2148 mul3 = *pc3 * *pv; 2149 ++pv; 2150 *pc1 = mul1; 2151 *pc2 = mul2; 2152 *pc3 = mul3; 2153 2154 nz_tmp = bi[prow+1] - bd[prow] - 1; 2155 /* update this row based on pivot row */ 2156 for (j=0; j<nz_tmp; j++) { 2157 tmp = pv[j]; 2158 idx = pj[j]; 2159 rtmp11[idx] -= mul1 * tmp; 2160 rtmp22[idx] -= mul2 * tmp; 2161 rtmp33[idx] -= mul3 * tmp; 2162 } 2163 ierr = PetscLogFlops(3+6.0*nz_tmp);CHKERRQ(ierr); 2164 } 2165 prow = *bjtmp++; 2166 } 2167 2168 /* Now take care of diagonal 3x3 block in this set of rows */ 2169 /* note: prow = row here */ 2170 pc1 = rtmp11 + prow; 2171 pc2 = rtmp22 + prow; 2172 pc3 = rtmp33 + prow; 2173 2174 sctx.pv = *pc1; 2175 pj = bj + bi[prow]; 2176 rs = 0.0; 2177 for (j=0; j<nz; j++) { 2178 idx = pj[j]; 2179 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2180 } 2181 sctx.rs = rs; 2182 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2183 if (sctx.newshift) goto endofwhile; 2184 2185 if (*pc2 != 0.0 || *pc3 != 0.0) { 2186 mul2 = (*pc2)/(*pc1); 2187 mul3 = (*pc3)/(*pc1); 2188 *pc2 = mul2; 2189 *pc3 = mul3; 2190 nz_tmp = bi[prow+1] - bd[prow] - 1; 2191 pj = nbj + bd[prow]; 2192 for (j=0; j<nz_tmp; j++) { 2193 idx = pj[j]; 2194 tmp = rtmp11[idx]; 2195 rtmp22[idx] -= mul2 * tmp; 2196 rtmp33[idx] -= mul3 * tmp; 2197 } 2198 ierr = PetscLogFlops(2+4.0*nz_tmp);CHKERRQ(ierr); 2199 } 2200 ++prow; 2201 2202 pc2 = rtmp22 + prow; 2203 pc3 = rtmp33 + prow; 2204 sctx.pv = *pc2; 2205 pj = bj + bi[prow]; 2206 rs = 0.0; 2207 for (j=0; j<nz; j++) { 2208 idx = pj[j]; 2209 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2210 } 2211 sctx.rs = rs; 2212 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2213 if (sctx.newshift) goto endofwhile; 2214 2215 if (*pc3 != 0.0) { 2216 mul3 = (*pc3)/(*pc2); 2217 *pc3 = mul3; 2218 pj = nbj + bd[prow]; 2219 nz_tmp = bi[prow+1] - bd[prow] - 1; 2220 for (j=0; j<nz_tmp; j++) { 2221 idx = pj[j]; 2222 tmp = rtmp22[idx]; 2223 rtmp33[idx] -= mul3 * tmp; 2224 } 2225 ierr = PetscLogFlops(1+2.0*nz_tmp);CHKERRQ(ierr); 2226 } 2227 2228 pj = bj + bi[row]; 2229 pc1 = ba + bi[row]; 2230 pc2 = ba + bi[row+1]; 2231 pc3 = ba + bi[row+2]; 2232 2233 sctx.pv = rtmp33[row+2]; 2234 rs = 0.0; 2235 rtmp11[row] = 1.0/rtmp11[row]; 2236 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2237 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2238 /* copy row entries from dense representation to sparse */ 2239 for (j=0; j<nz; j++) { 2240 idx = pj[j]; 2241 pc1[j] = rtmp11[idx]; 2242 pc2[j] = rtmp22[idx]; 2243 pc3[j] = rtmp33[idx]; 2244 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2245 } 2246 2247 sctx.rs = rs; 2248 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr); 2249 if (sctx.newshift) goto endofwhile; 2250 break; 2251 2252 default: 2253 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2254 } 2255 row += nodesz; /* Update the row */ 2256 } 2257 endofwhile:; 2258 } while (sctx.newshift); 2259 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2260 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2261 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2262 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2263 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2264 2265 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2266 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2267 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2268 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2269 C->assembled = PETSC_TRUE; 2270 C->preallocated = PETSC_TRUE; 2271 if (sctx.nshift) { 2272 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2273 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2274 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2275 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2276 } 2277 } 2278 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2279 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2280 PetscFunctionReturn(0); 2281 } 2282 2283 2284 /* ----------------------------------------------------------- */ 2285 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2286 { 2287 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2288 IS iscol = a->col,isrow = a->row; 2289 PetscErrorCode ierr; 2290 const PetscInt *r,*c,*rout,*cout; 2291 PetscInt i,j,n = A->rmap->n; 2292 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2293 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2294 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2295 PetscScalar sum1,sum2,sum3,sum4,sum5; 2296 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2297 const PetscScalar *b; 2298 2299 PetscFunctionBegin; 2300 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2301 node_max = a->inode.node_count; 2302 ns = a->inode.size; /* Node Size array */ 2303 2304 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2305 ierr = VecGetArrayWrite(xx,&x);CHKERRQ(ierr); 2306 tmp = a->solve_work; 2307 2308 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2309 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2310 2311 /* forward solve the lower triangular */ 2312 tmps = tmp; 2313 aa = a_a; 2314 aj = a_j; 2315 ad = a->diag; 2316 2317 for (i = 0,row = 0; i< node_max; ++i) { 2318 nsz = ns[i]; 2319 aii = ai[row]; 2320 v1 = aa + aii; 2321 vi = aj + aii; 2322 nz = ai[row+1]- ai[row]; 2323 2324 if (i < node_max-1) { 2325 /* Prefetch the indices for the next block */ 2326 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2327 /* Prefetch the data for the next block */ 2328 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2329 } 2330 2331 switch (nsz) { /* Each loop in 'case' is unrolled */ 2332 case 1: 2333 sum1 = b[r[row]]; 2334 for (j=0; j<nz-1; j+=2) { 2335 i0 = vi[j]; 2336 i1 = vi[j+1]; 2337 tmp0 = tmps[i0]; 2338 tmp1 = tmps[i1]; 2339 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2340 } 2341 if (j == nz-1) { 2342 tmp0 = tmps[vi[j]]; 2343 sum1 -= v1[j]*tmp0; 2344 } 2345 tmp[row++]=sum1; 2346 break; 2347 case 2: 2348 sum1 = b[r[row]]; 2349 sum2 = b[r[row+1]]; 2350 v2 = aa + ai[row+1]; 2351 2352 for (j=0; j<nz-1; j+=2) { 2353 i0 = vi[j]; 2354 i1 = vi[j+1]; 2355 tmp0 = tmps[i0]; 2356 tmp1 = tmps[i1]; 2357 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2358 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2359 } 2360 if (j == nz-1) { 2361 tmp0 = tmps[vi[j]]; 2362 sum1 -= v1[j] *tmp0; 2363 sum2 -= v2[j] *tmp0; 2364 } 2365 sum2 -= v2[nz] * sum1; 2366 tmp[row++]=sum1; 2367 tmp[row++]=sum2; 2368 break; 2369 case 3: 2370 sum1 = b[r[row]]; 2371 sum2 = b[r[row+1]]; 2372 sum3 = b[r[row+2]]; 2373 v2 = aa + ai[row+1]; 2374 v3 = aa + ai[row+2]; 2375 2376 for (j=0; j<nz-1; j+=2) { 2377 i0 = vi[j]; 2378 i1 = vi[j+1]; 2379 tmp0 = tmps[i0]; 2380 tmp1 = tmps[i1]; 2381 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2382 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2383 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2384 } 2385 if (j == nz-1) { 2386 tmp0 = tmps[vi[j]]; 2387 sum1 -= v1[j] *tmp0; 2388 sum2 -= v2[j] *tmp0; 2389 sum3 -= v3[j] *tmp0; 2390 } 2391 sum2 -= v2[nz] * sum1; 2392 sum3 -= v3[nz] * sum1; 2393 sum3 -= v3[nz+1] * sum2; 2394 tmp[row++]=sum1; 2395 tmp[row++]=sum2; 2396 tmp[row++]=sum3; 2397 break; 2398 2399 case 4: 2400 sum1 = b[r[row]]; 2401 sum2 = b[r[row+1]]; 2402 sum3 = b[r[row+2]]; 2403 sum4 = b[r[row+3]]; 2404 v2 = aa + ai[row+1]; 2405 v3 = aa + ai[row+2]; 2406 v4 = aa + ai[row+3]; 2407 2408 for (j=0; j<nz-1; j+=2) { 2409 i0 = vi[j]; 2410 i1 = vi[j+1]; 2411 tmp0 = tmps[i0]; 2412 tmp1 = tmps[i1]; 2413 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2414 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2415 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2416 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2417 } 2418 if (j == nz-1) { 2419 tmp0 = tmps[vi[j]]; 2420 sum1 -= v1[j] *tmp0; 2421 sum2 -= v2[j] *tmp0; 2422 sum3 -= v3[j] *tmp0; 2423 sum4 -= v4[j] *tmp0; 2424 } 2425 sum2 -= v2[nz] * sum1; 2426 sum3 -= v3[nz] * sum1; 2427 sum4 -= v4[nz] * sum1; 2428 sum3 -= v3[nz+1] * sum2; 2429 sum4 -= v4[nz+1] * sum2; 2430 sum4 -= v4[nz+2] * sum3; 2431 2432 tmp[row++]=sum1; 2433 tmp[row++]=sum2; 2434 tmp[row++]=sum3; 2435 tmp[row++]=sum4; 2436 break; 2437 case 5: 2438 sum1 = b[r[row]]; 2439 sum2 = b[r[row+1]]; 2440 sum3 = b[r[row+2]]; 2441 sum4 = b[r[row+3]]; 2442 sum5 = b[r[row+4]]; 2443 v2 = aa + ai[row+1]; 2444 v3 = aa + ai[row+2]; 2445 v4 = aa + ai[row+3]; 2446 v5 = aa + ai[row+4]; 2447 2448 for (j=0; j<nz-1; j+=2) { 2449 i0 = vi[j]; 2450 i1 = vi[j+1]; 2451 tmp0 = tmps[i0]; 2452 tmp1 = tmps[i1]; 2453 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2454 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2455 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2456 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2457 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2458 } 2459 if (j == nz-1) { 2460 tmp0 = tmps[vi[j]]; 2461 sum1 -= v1[j] *tmp0; 2462 sum2 -= v2[j] *tmp0; 2463 sum3 -= v3[j] *tmp0; 2464 sum4 -= v4[j] *tmp0; 2465 sum5 -= v5[j] *tmp0; 2466 } 2467 2468 sum2 -= v2[nz] * sum1; 2469 sum3 -= v3[nz] * sum1; 2470 sum4 -= v4[nz] * sum1; 2471 sum5 -= v5[nz] * sum1; 2472 sum3 -= v3[nz+1] * sum2; 2473 sum4 -= v4[nz+1] * sum2; 2474 sum5 -= v5[nz+1] * sum2; 2475 sum4 -= v4[nz+2] * sum3; 2476 sum5 -= v5[nz+2] * sum3; 2477 sum5 -= v5[nz+3] * sum4; 2478 2479 tmp[row++]=sum1; 2480 tmp[row++]=sum2; 2481 tmp[row++]=sum3; 2482 tmp[row++]=sum4; 2483 tmp[row++]=sum5; 2484 break; 2485 default: 2486 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2487 } 2488 } 2489 /* backward solve the upper triangular */ 2490 for (i=node_max -1,row = n-1; i>=0; i--) { 2491 nsz = ns[i]; 2492 aii = ad[row+1] + 1; 2493 v1 = aa + aii; 2494 vi = aj + aii; 2495 nz = ad[row]- ad[row+1] - 1; 2496 2497 if (i > 0) { 2498 /* Prefetch the indices for the next block */ 2499 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2500 /* Prefetch the data for the next block */ 2501 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2502 } 2503 2504 switch (nsz) { /* Each loop in 'case' is unrolled */ 2505 case 1: 2506 sum1 = tmp[row]; 2507 2508 for (j=0; j<nz-1; j+=2) { 2509 i0 = vi[j]; 2510 i1 = vi[j+1]; 2511 tmp0 = tmps[i0]; 2512 tmp1 = tmps[i1]; 2513 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2514 } 2515 if (j == nz-1) { 2516 tmp0 = tmps[vi[j]]; 2517 sum1 -= v1[j]*tmp0; 2518 } 2519 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2520 break; 2521 case 2: 2522 sum1 = tmp[row]; 2523 sum2 = tmp[row-1]; 2524 v2 = aa + ad[row] + 1; 2525 for (j=0; j<nz-1; j+=2) { 2526 i0 = vi[j]; 2527 i1 = vi[j+1]; 2528 tmp0 = tmps[i0]; 2529 tmp1 = tmps[i1]; 2530 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2531 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2532 } 2533 if (j == nz-1) { 2534 tmp0 = tmps[vi[j]]; 2535 sum1 -= v1[j]* tmp0; 2536 sum2 -= v2[j+1]* tmp0; 2537 } 2538 2539 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2540 sum2 -= v2[0] * tmp0; 2541 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2542 break; 2543 case 3: 2544 sum1 = tmp[row]; 2545 sum2 = tmp[row -1]; 2546 sum3 = tmp[row -2]; 2547 v2 = aa + ad[row] + 1; 2548 v3 = aa + ad[row -1] + 1; 2549 for (j=0; j<nz-1; j+=2) { 2550 i0 = vi[j]; 2551 i1 = vi[j+1]; 2552 tmp0 = tmps[i0]; 2553 tmp1 = tmps[i1]; 2554 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2555 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2556 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2557 } 2558 if (j== nz-1) { 2559 tmp0 = tmps[vi[j]]; 2560 sum1 -= v1[j] * tmp0; 2561 sum2 -= v2[j+1] * tmp0; 2562 sum3 -= v3[j+2] * tmp0; 2563 } 2564 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2565 sum2 -= v2[0]* tmp0; 2566 sum3 -= v3[1] * tmp0; 2567 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2568 sum3 -= v3[0]* tmp0; 2569 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2570 2571 break; 2572 case 4: 2573 sum1 = tmp[row]; 2574 sum2 = tmp[row -1]; 2575 sum3 = tmp[row -2]; 2576 sum4 = tmp[row -3]; 2577 v2 = aa + ad[row]+1; 2578 v3 = aa + ad[row -1]+1; 2579 v4 = aa + ad[row -2]+1; 2580 2581 for (j=0; j<nz-1; j+=2) { 2582 i0 = vi[j]; 2583 i1 = vi[j+1]; 2584 tmp0 = tmps[i0]; 2585 tmp1 = tmps[i1]; 2586 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2587 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2588 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2589 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2590 } 2591 if (j== nz-1) { 2592 tmp0 = tmps[vi[j]]; 2593 sum1 -= v1[j] * tmp0; 2594 sum2 -= v2[j+1] * tmp0; 2595 sum3 -= v3[j+2] * tmp0; 2596 sum4 -= v4[j+3] * tmp0; 2597 } 2598 2599 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2600 sum2 -= v2[0] * tmp0; 2601 sum3 -= v3[1] * tmp0; 2602 sum4 -= v4[2] * tmp0; 2603 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2604 sum3 -= v3[0] * tmp0; 2605 sum4 -= v4[1] * tmp0; 2606 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2607 sum4 -= v4[0] * tmp0; 2608 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2609 break; 2610 case 5: 2611 sum1 = tmp[row]; 2612 sum2 = tmp[row -1]; 2613 sum3 = tmp[row -2]; 2614 sum4 = tmp[row -3]; 2615 sum5 = tmp[row -4]; 2616 v2 = aa + ad[row]+1; 2617 v3 = aa + ad[row -1]+1; 2618 v4 = aa + ad[row -2]+1; 2619 v5 = aa + ad[row -3]+1; 2620 for (j=0; j<nz-1; j+=2) { 2621 i0 = vi[j]; 2622 i1 = vi[j+1]; 2623 tmp0 = tmps[i0]; 2624 tmp1 = tmps[i1]; 2625 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2626 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2627 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2628 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2629 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2630 } 2631 if (j==nz-1) { 2632 tmp0 = tmps[vi[j]]; 2633 sum1 -= v1[j] * tmp0; 2634 sum2 -= v2[j+1] * tmp0; 2635 sum3 -= v3[j+2] * tmp0; 2636 sum4 -= v4[j+3] * tmp0; 2637 sum5 -= v5[j+4] * tmp0; 2638 } 2639 2640 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2641 sum2 -= v2[0] * tmp0; 2642 sum3 -= v3[1] * tmp0; 2643 sum4 -= v4[2] * tmp0; 2644 sum5 -= v5[3] * tmp0; 2645 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2646 sum3 -= v3[0] * tmp0; 2647 sum4 -= v4[1] * tmp0; 2648 sum5 -= v5[2] * tmp0; 2649 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2650 sum4 -= v4[0] * tmp0; 2651 sum5 -= v5[1] * tmp0; 2652 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2653 sum5 -= v5[0] * tmp0; 2654 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2655 break; 2656 default: 2657 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2658 } 2659 } 2660 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2661 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2662 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2663 ierr = VecRestoreArrayWrite(xx,&x);CHKERRQ(ierr); 2664 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2665 PetscFunctionReturn(0); 2666 } 2667 2668 2669 /* 2670 Makes a longer coloring[] array and calls the usual code with that 2671 */ 2672 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2673 { 2674 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2675 PetscErrorCode ierr; 2676 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2677 PetscInt *colorused,i; 2678 ISColoringValue *newcolor; 2679 2680 PetscFunctionBegin; 2681 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2682 /* loop over inodes, marking a color for each column*/ 2683 row = 0; 2684 for (i=0; i<m; i++) { 2685 for (j=0; j<ns[i]; j++) { 2686 newcolor[row++] = coloring[i] + j*ncolors; 2687 } 2688 } 2689 2690 /* eliminate unneeded colors */ 2691 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2692 for (i=0; i<n; i++) { 2693 colorused[newcolor[i]] = 1; 2694 } 2695 2696 for (i=1; i<5*ncolors; i++) { 2697 colorused[i] += colorused[i-1]; 2698 } 2699 ncolors = colorused[5*ncolors-1]; 2700 for (i=0; i<n; i++) { 2701 newcolor[i] = colorused[newcolor[i]]-1; 2702 } 2703 ierr = PetscFree(colorused);CHKERRQ(ierr); 2704 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2705 ierr = PetscFree(coloring);CHKERRQ(ierr); 2706 PetscFunctionReturn(0); 2707 } 2708 2709 #include <petsc/private/kernels/blockinvert.h> 2710 2711 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2712 { 2713 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2714 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2715 MatScalar *ibdiag,*bdiag,work[25],*t; 2716 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2717 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2718 const PetscScalar *xb, *b; 2719 PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0; 2720 PetscErrorCode ierr; 2721 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2722 PetscInt sz,k,ipvt[5]; 2723 PetscBool allowzeropivot,zeropivotdetected; 2724 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2725 2726 PetscFunctionBegin; 2727 allowzeropivot = PetscNot(A->erroriffailure); 2728 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2729 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2730 2731 if (!a->inode.ibdiagvalid) { 2732 if (!a->inode.ibdiag) { 2733 /* calculate space needed for diagonal blocks */ 2734 for (i=0; i<m; i++) { 2735 cnt += sizes[i]*sizes[i]; 2736 } 2737 a->inode.bdiagsize = cnt; 2738 2739 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2740 } 2741 2742 /* copy over the diagonal blocks and invert them */ 2743 ibdiag = a->inode.ibdiag; 2744 bdiag = a->inode.bdiag; 2745 cnt = 0; 2746 for (i=0, row = 0; i<m; i++) { 2747 for (j=0; j<sizes[i]; j++) { 2748 for (k=0; k<sizes[i]; k++) { 2749 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2750 } 2751 } 2752 ierr = PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]);CHKERRQ(ierr); 2753 2754 switch (sizes[i]) { 2755 case 1: 2756 /* Create matrix data structure */ 2757 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2758 if (allowzeropivot) { 2759 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2760 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2761 A->factorerror_zeropivot_row = row; 2762 ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr); 2763 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2764 } 2765 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2766 break; 2767 case 2: 2768 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2769 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2770 break; 2771 case 3: 2772 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2773 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2774 break; 2775 case 4: 2776 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2777 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2778 break; 2779 case 5: 2780 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2781 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2782 break; 2783 default: 2784 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2785 } 2786 cnt += sizes[i]*sizes[i]; 2787 row += sizes[i]; 2788 } 2789 a->inode.ibdiagvalid = PETSC_TRUE; 2790 } 2791 ibdiag = a->inode.ibdiag; 2792 bdiag = a->inode.bdiag; 2793 t = a->inode.ssor_work; 2794 2795 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2796 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2797 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2798 if (flag & SOR_ZERO_INITIAL_GUESS) { 2799 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2800 2801 for (i=0, row=0; i<m; i++) { 2802 sz = diag[row] - ii[row]; 2803 v1 = a->a + ii[row]; 2804 idx = a->j + ii[row]; 2805 2806 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2807 switch (sizes[i]) { 2808 case 1: 2809 2810 sum1 = b[row]; 2811 for (n = 0; n<sz-1; n+=2) { 2812 i1 = idx[0]; 2813 i2 = idx[1]; 2814 idx += 2; 2815 tmp0 = x[i1]; 2816 tmp1 = x[i2]; 2817 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2818 } 2819 2820 if (n == sz-1) { 2821 tmp0 = x[*idx]; 2822 sum1 -= *v1 * tmp0; 2823 } 2824 t[row] = sum1; 2825 x[row++] = sum1*(*ibdiag++); 2826 break; 2827 case 2: 2828 v2 = a->a + ii[row+1]; 2829 sum1 = b[row]; 2830 sum2 = b[row+1]; 2831 for (n = 0; n<sz-1; n+=2) { 2832 i1 = idx[0]; 2833 i2 = idx[1]; 2834 idx += 2; 2835 tmp0 = x[i1]; 2836 tmp1 = x[i2]; 2837 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2838 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2839 } 2840 2841 if (n == sz-1) { 2842 tmp0 = x[*idx]; 2843 sum1 -= v1[0] * tmp0; 2844 sum2 -= v2[0] * tmp0; 2845 } 2846 t[row] = sum1; 2847 t[row+1] = sum2; 2848 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2849 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2850 ibdiag += 4; 2851 break; 2852 case 3: 2853 v2 = a->a + ii[row+1]; 2854 v3 = a->a + ii[row+2]; 2855 sum1 = b[row]; 2856 sum2 = b[row+1]; 2857 sum3 = b[row+2]; 2858 for (n = 0; n<sz-1; n+=2) { 2859 i1 = idx[0]; 2860 i2 = idx[1]; 2861 idx += 2; 2862 tmp0 = x[i1]; 2863 tmp1 = x[i2]; 2864 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2865 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2866 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2867 } 2868 2869 if (n == sz-1) { 2870 tmp0 = x[*idx]; 2871 sum1 -= v1[0] * tmp0; 2872 sum2 -= v2[0] * tmp0; 2873 sum3 -= v3[0] * tmp0; 2874 } 2875 t[row] = sum1; 2876 t[row+1] = sum2; 2877 t[row+2] = sum3; 2878 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2879 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2880 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2881 ibdiag += 9; 2882 break; 2883 case 4: 2884 v2 = a->a + ii[row+1]; 2885 v3 = a->a + ii[row+2]; 2886 v4 = a->a + ii[row+3]; 2887 sum1 = b[row]; 2888 sum2 = b[row+1]; 2889 sum3 = b[row+2]; 2890 sum4 = b[row+3]; 2891 for (n = 0; n<sz-1; n+=2) { 2892 i1 = idx[0]; 2893 i2 = idx[1]; 2894 idx += 2; 2895 tmp0 = x[i1]; 2896 tmp1 = x[i2]; 2897 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2898 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2899 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2900 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2901 } 2902 2903 if (n == sz-1) { 2904 tmp0 = x[*idx]; 2905 sum1 -= v1[0] * tmp0; 2906 sum2 -= v2[0] * tmp0; 2907 sum3 -= v3[0] * tmp0; 2908 sum4 -= v4[0] * tmp0; 2909 } 2910 t[row] = sum1; 2911 t[row+1] = sum2; 2912 t[row+2] = sum3; 2913 t[row+3] = sum4; 2914 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2915 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2916 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2917 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2918 ibdiag += 16; 2919 break; 2920 case 5: 2921 v2 = a->a + ii[row+1]; 2922 v3 = a->a + ii[row+2]; 2923 v4 = a->a + ii[row+3]; 2924 v5 = a->a + ii[row+4]; 2925 sum1 = b[row]; 2926 sum2 = b[row+1]; 2927 sum3 = b[row+2]; 2928 sum4 = b[row+3]; 2929 sum5 = b[row+4]; 2930 for (n = 0; n<sz-1; n+=2) { 2931 i1 = idx[0]; 2932 i2 = idx[1]; 2933 idx += 2; 2934 tmp0 = x[i1]; 2935 tmp1 = x[i2]; 2936 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2937 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2938 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2939 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2940 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2941 } 2942 2943 if (n == sz-1) { 2944 tmp0 = x[*idx]; 2945 sum1 -= v1[0] * tmp0; 2946 sum2 -= v2[0] * tmp0; 2947 sum3 -= v3[0] * tmp0; 2948 sum4 -= v4[0] * tmp0; 2949 sum5 -= v5[0] * tmp0; 2950 } 2951 t[row] = sum1; 2952 t[row+1] = sum2; 2953 t[row+2] = sum3; 2954 t[row+3] = sum4; 2955 t[row+4] = sum5; 2956 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2957 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2958 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2959 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2960 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2961 ibdiag += 25; 2962 break; 2963 default: 2964 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2965 } 2966 } 2967 2968 xb = t; 2969 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2970 } else xb = b; 2971 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 2972 2973 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2974 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2975 ibdiag -= sizes[i]*sizes[i]; 2976 sz = ii[row+1] - diag[row] - 1; 2977 v1 = a->a + diag[row] + 1; 2978 idx = a->j + diag[row] + 1; 2979 2980 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2981 switch (sizes[i]) { 2982 case 1: 2983 2984 sum1 = xb[row]; 2985 for (n = 0; n<sz-1; n+=2) { 2986 i1 = idx[0]; 2987 i2 = idx[1]; 2988 idx += 2; 2989 tmp0 = x[i1]; 2990 tmp1 = x[i2]; 2991 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2992 } 2993 2994 if (n == sz-1) { 2995 tmp0 = x[*idx]; 2996 sum1 -= *v1*tmp0; 2997 } 2998 x[row--] = sum1*(*ibdiag); 2999 break; 3000 3001 case 2: 3002 3003 sum1 = xb[row]; 3004 sum2 = xb[row-1]; 3005 /* note that sum1 is associated with the second of the two rows */ 3006 v2 = a->a + diag[row-1] + 2; 3007 for (n = 0; n<sz-1; n+=2) { 3008 i1 = idx[0]; 3009 i2 = idx[1]; 3010 idx += 2; 3011 tmp0 = x[i1]; 3012 tmp1 = x[i2]; 3013 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3014 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3015 } 3016 3017 if (n == sz-1) { 3018 tmp0 = x[*idx]; 3019 sum1 -= *v1*tmp0; 3020 sum2 -= *v2*tmp0; 3021 } 3022 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3023 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3024 break; 3025 case 3: 3026 3027 sum1 = xb[row]; 3028 sum2 = xb[row-1]; 3029 sum3 = xb[row-2]; 3030 v2 = a->a + diag[row-1] + 2; 3031 v3 = a->a + diag[row-2] + 3; 3032 for (n = 0; n<sz-1; n+=2) { 3033 i1 = idx[0]; 3034 i2 = idx[1]; 3035 idx += 2; 3036 tmp0 = x[i1]; 3037 tmp1 = x[i2]; 3038 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3039 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3040 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3041 } 3042 3043 if (n == sz-1) { 3044 tmp0 = x[*idx]; 3045 sum1 -= *v1*tmp0; 3046 sum2 -= *v2*tmp0; 3047 sum3 -= *v3*tmp0; 3048 } 3049 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3050 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3051 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3052 break; 3053 case 4: 3054 3055 sum1 = xb[row]; 3056 sum2 = xb[row-1]; 3057 sum3 = xb[row-2]; 3058 sum4 = xb[row-3]; 3059 v2 = a->a + diag[row-1] + 2; 3060 v3 = a->a + diag[row-2] + 3; 3061 v4 = a->a + diag[row-3] + 4; 3062 for (n = 0; n<sz-1; n+=2) { 3063 i1 = idx[0]; 3064 i2 = idx[1]; 3065 idx += 2; 3066 tmp0 = x[i1]; 3067 tmp1 = x[i2]; 3068 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3069 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3070 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3071 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3072 } 3073 3074 if (n == sz-1) { 3075 tmp0 = x[*idx]; 3076 sum1 -= *v1*tmp0; 3077 sum2 -= *v2*tmp0; 3078 sum3 -= *v3*tmp0; 3079 sum4 -= *v4*tmp0; 3080 } 3081 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3082 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3083 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3084 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3085 break; 3086 case 5: 3087 3088 sum1 = xb[row]; 3089 sum2 = xb[row-1]; 3090 sum3 = xb[row-2]; 3091 sum4 = xb[row-3]; 3092 sum5 = xb[row-4]; 3093 v2 = a->a + diag[row-1] + 2; 3094 v3 = a->a + diag[row-2] + 3; 3095 v4 = a->a + diag[row-3] + 4; 3096 v5 = a->a + diag[row-4] + 5; 3097 for (n = 0; n<sz-1; n+=2) { 3098 i1 = idx[0]; 3099 i2 = idx[1]; 3100 idx += 2; 3101 tmp0 = x[i1]; 3102 tmp1 = x[i2]; 3103 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3104 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3105 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3106 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3107 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3108 } 3109 3110 if (n == sz-1) { 3111 tmp0 = x[*idx]; 3112 sum1 -= *v1*tmp0; 3113 sum2 -= *v2*tmp0; 3114 sum3 -= *v3*tmp0; 3115 sum4 -= *v4*tmp0; 3116 sum5 -= *v5*tmp0; 3117 } 3118 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3119 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3120 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3121 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3122 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3123 break; 3124 default: 3125 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3126 } 3127 } 3128 3129 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3130 } 3131 its--; 3132 } 3133 while (its--) { 3134 3135 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3136 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3137 i<m; 3138 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3139 3140 sz = diag[row] - ii[row]; 3141 v1 = a->a + ii[row]; 3142 idx = a->j + ii[row]; 3143 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3144 switch (sizes[i]) { 3145 case 1: 3146 sum1 = b[row]; 3147 for (n = 0; n<sz-1; n+=2) { 3148 i1 = idx[0]; 3149 i2 = idx[1]; 3150 idx += 2; 3151 tmp0 = x[i1]; 3152 tmp1 = x[i2]; 3153 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3154 } 3155 if (n == sz-1) { 3156 tmp0 = x[*idx++]; 3157 sum1 -= *v1 * tmp0; 3158 v1++; 3159 } 3160 t[row] = sum1; 3161 sz = ii[row+1] - diag[row] - 1; 3162 idx = a->j + diag[row] + 1; 3163 v1 += 1; 3164 for (n = 0; n<sz-1; n+=2) { 3165 i1 = idx[0]; 3166 i2 = idx[1]; 3167 idx += 2; 3168 tmp0 = x[i1]; 3169 tmp1 = x[i2]; 3170 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3171 } 3172 if (n == sz-1) { 3173 tmp0 = x[*idx++]; 3174 sum1 -= *v1 * tmp0; 3175 } 3176 /* in MatSOR_SeqAIJ this line would be 3177 * 3178 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3179 * 3180 * but omega == 1, so this becomes 3181 * 3182 * x[row] = sum1*(*ibdiag++); 3183 * 3184 */ 3185 x[row] = sum1*(*ibdiag); 3186 break; 3187 case 2: 3188 v2 = a->a + ii[row+1]; 3189 sum1 = b[row]; 3190 sum2 = b[row+1]; 3191 for (n = 0; n<sz-1; n+=2) { 3192 i1 = idx[0]; 3193 i2 = idx[1]; 3194 idx += 2; 3195 tmp0 = x[i1]; 3196 tmp1 = x[i2]; 3197 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3198 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3199 } 3200 if (n == sz-1) { 3201 tmp0 = x[*idx++]; 3202 sum1 -= v1[0] * tmp0; 3203 sum2 -= v2[0] * tmp0; 3204 v1++; v2++; 3205 } 3206 t[row] = sum1; 3207 t[row+1] = sum2; 3208 sz = ii[row+1] - diag[row] - 2; 3209 idx = a->j + diag[row] + 2; 3210 v1 += 2; 3211 v2 += 2; 3212 for (n = 0; n<sz-1; n+=2) { 3213 i1 = idx[0]; 3214 i2 = idx[1]; 3215 idx += 2; 3216 tmp0 = x[i1]; 3217 tmp1 = x[i2]; 3218 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3219 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3220 } 3221 if (n == sz-1) { 3222 tmp0 = x[*idx]; 3223 sum1 -= v1[0] * tmp0; 3224 sum2 -= v2[0] * tmp0; 3225 } 3226 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3227 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3228 break; 3229 case 3: 3230 v2 = a->a + ii[row+1]; 3231 v3 = a->a + ii[row+2]; 3232 sum1 = b[row]; 3233 sum2 = b[row+1]; 3234 sum3 = b[row+2]; 3235 for (n = 0; n<sz-1; n+=2) { 3236 i1 = idx[0]; 3237 i2 = idx[1]; 3238 idx += 2; 3239 tmp0 = x[i1]; 3240 tmp1 = x[i2]; 3241 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3242 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3243 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3244 } 3245 if (n == sz-1) { 3246 tmp0 = x[*idx++]; 3247 sum1 -= v1[0] * tmp0; 3248 sum2 -= v2[0] * tmp0; 3249 sum3 -= v3[0] * tmp0; 3250 v1++; v2++; v3++; 3251 } 3252 t[row] = sum1; 3253 t[row+1] = sum2; 3254 t[row+2] = sum3; 3255 sz = ii[row+1] - diag[row] - 3; 3256 idx = a->j + diag[row] + 3; 3257 v1 += 3; 3258 v2 += 3; 3259 v3 += 3; 3260 for (n = 0; n<sz-1; n+=2) { 3261 i1 = idx[0]; 3262 i2 = idx[1]; 3263 idx += 2; 3264 tmp0 = x[i1]; 3265 tmp1 = x[i2]; 3266 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3267 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3268 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3269 } 3270 if (n == sz-1) { 3271 tmp0 = x[*idx]; 3272 sum1 -= v1[0] * tmp0; 3273 sum2 -= v2[0] * tmp0; 3274 sum3 -= v3[0] * tmp0; 3275 } 3276 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3277 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3278 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3279 break; 3280 case 4: 3281 v2 = a->a + ii[row+1]; 3282 v3 = a->a + ii[row+2]; 3283 v4 = a->a + ii[row+3]; 3284 sum1 = b[row]; 3285 sum2 = b[row+1]; 3286 sum3 = b[row+2]; 3287 sum4 = b[row+3]; 3288 for (n = 0; n<sz-1; n+=2) { 3289 i1 = idx[0]; 3290 i2 = idx[1]; 3291 idx += 2; 3292 tmp0 = x[i1]; 3293 tmp1 = x[i2]; 3294 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3295 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3296 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3297 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3298 } 3299 if (n == sz-1) { 3300 tmp0 = x[*idx++]; 3301 sum1 -= v1[0] * tmp0; 3302 sum2 -= v2[0] * tmp0; 3303 sum3 -= v3[0] * tmp0; 3304 sum4 -= v4[0] * tmp0; 3305 v1++; v2++; v3++; v4++; 3306 } 3307 t[row] = sum1; 3308 t[row+1] = sum2; 3309 t[row+2] = sum3; 3310 t[row+3] = sum4; 3311 sz = ii[row+1] - diag[row] - 4; 3312 idx = a->j + diag[row] + 4; 3313 v1 += 4; 3314 v2 += 4; 3315 v3 += 4; 3316 v4 += 4; 3317 for (n = 0; n<sz-1; n+=2) { 3318 i1 = idx[0]; 3319 i2 = idx[1]; 3320 idx += 2; 3321 tmp0 = x[i1]; 3322 tmp1 = x[i2]; 3323 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3324 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3325 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3326 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3327 } 3328 if (n == sz-1) { 3329 tmp0 = x[*idx]; 3330 sum1 -= v1[0] * tmp0; 3331 sum2 -= v2[0] * tmp0; 3332 sum3 -= v3[0] * tmp0; 3333 sum4 -= v4[0] * tmp0; 3334 } 3335 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3336 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3337 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3338 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3339 break; 3340 case 5: 3341 v2 = a->a + ii[row+1]; 3342 v3 = a->a + ii[row+2]; 3343 v4 = a->a + ii[row+3]; 3344 v5 = a->a + ii[row+4]; 3345 sum1 = b[row]; 3346 sum2 = b[row+1]; 3347 sum3 = b[row+2]; 3348 sum4 = b[row+3]; 3349 sum5 = b[row+4]; 3350 for (n = 0; n<sz-1; n+=2) { 3351 i1 = idx[0]; 3352 i2 = idx[1]; 3353 idx += 2; 3354 tmp0 = x[i1]; 3355 tmp1 = x[i2]; 3356 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3357 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3358 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3359 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3360 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3361 } 3362 if (n == sz-1) { 3363 tmp0 = x[*idx++]; 3364 sum1 -= v1[0] * tmp0; 3365 sum2 -= v2[0] * tmp0; 3366 sum3 -= v3[0] * tmp0; 3367 sum4 -= v4[0] * tmp0; 3368 sum5 -= v5[0] * tmp0; 3369 v1++; v2++; v3++; v4++; v5++; 3370 } 3371 t[row] = sum1; 3372 t[row+1] = sum2; 3373 t[row+2] = sum3; 3374 t[row+3] = sum4; 3375 t[row+4] = sum5; 3376 sz = ii[row+1] - diag[row] - 5; 3377 idx = a->j + diag[row] + 5; 3378 v1 += 5; 3379 v2 += 5; 3380 v3 += 5; 3381 v4 += 5; 3382 v5 += 5; 3383 for (n = 0; n<sz-1; n+=2) { 3384 i1 = idx[0]; 3385 i2 = idx[1]; 3386 idx += 2; 3387 tmp0 = x[i1]; 3388 tmp1 = x[i2]; 3389 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3390 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3391 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3392 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3393 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3394 } 3395 if (n == sz-1) { 3396 tmp0 = x[*idx]; 3397 sum1 -= v1[0] * tmp0; 3398 sum2 -= v2[0] * tmp0; 3399 sum3 -= v3[0] * tmp0; 3400 sum4 -= v4[0] * tmp0; 3401 sum5 -= v5[0] * tmp0; 3402 } 3403 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3404 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3405 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3406 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3407 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3408 break; 3409 default: 3410 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3411 } 3412 } 3413 xb = t; 3414 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3415 } else xb = b; 3416 3417 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3418 3419 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3420 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3421 ibdiag -= sizes[i]*sizes[i]; 3422 3423 /* set RHS */ 3424 if (xb == b) { 3425 /* whole (old way) */ 3426 sz = ii[row+1] - ii[row]; 3427 idx = a->j + ii[row]; 3428 switch (sizes[i]) { 3429 case 5: 3430 v5 = a->a + ii[row-4]; 3431 case 4: /* fall through */ 3432 v4 = a->a + ii[row-3]; 3433 case 3: 3434 v3 = a->a + ii[row-2]; 3435 case 2: 3436 v2 = a->a + ii[row-1]; 3437 case 1: 3438 v1 = a->a + ii[row]; 3439 break; 3440 default: 3441 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3442 } 3443 } else { 3444 /* upper, no diag */ 3445 sz = ii[row+1] - diag[row] - 1; 3446 idx = a->j + diag[row] + 1; 3447 switch (sizes[i]) { 3448 case 5: 3449 v5 = a->a + diag[row-4] + 5; 3450 case 4: /* fall through */ 3451 v4 = a->a + diag[row-3] + 4; 3452 case 3: 3453 v3 = a->a + diag[row-2] + 3; 3454 case 2: 3455 v2 = a->a + diag[row-1] + 2; 3456 case 1: 3457 v1 = a->a + diag[row] + 1; 3458 } 3459 } 3460 /* set sum */ 3461 switch (sizes[i]) { 3462 case 5: 3463 sum5 = xb[row-4]; 3464 case 4: /* fall through */ 3465 sum4 = xb[row-3]; 3466 case 3: 3467 sum3 = xb[row-2]; 3468 case 2: 3469 sum2 = xb[row-1]; 3470 case 1: 3471 /* note that sum1 is associated with the last row */ 3472 sum1 = xb[row]; 3473 } 3474 /* do sums */ 3475 for (n = 0; n<sz-1; n+=2) { 3476 i1 = idx[0]; 3477 i2 = idx[1]; 3478 idx += 2; 3479 tmp0 = x[i1]; 3480 tmp1 = x[i2]; 3481 switch (sizes[i]) { 3482 case 5: 3483 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3484 case 4: /* fall through */ 3485 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3486 case 3: 3487 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3488 case 2: 3489 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3490 case 1: 3491 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3492 } 3493 } 3494 /* ragged edge */ 3495 if (n == sz-1) { 3496 tmp0 = x[*idx]; 3497 switch (sizes[i]) { 3498 case 5: 3499 sum5 -= *v5*tmp0; 3500 case 4: /* fall through */ 3501 sum4 -= *v4*tmp0; 3502 case 3: 3503 sum3 -= *v3*tmp0; 3504 case 2: 3505 sum2 -= *v2*tmp0; 3506 case 1: 3507 sum1 -= *v1*tmp0; 3508 } 3509 } 3510 /* update */ 3511 if (xb == b) { 3512 /* whole (old way) w/ diag */ 3513 switch (sizes[i]) { 3514 case 5: 3515 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3516 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3517 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3518 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3519 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3520 break; 3521 case 4: 3522 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3523 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3524 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3525 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3526 break; 3527 case 3: 3528 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3529 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3530 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3531 break; 3532 case 2: 3533 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3534 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3535 break; 3536 case 1: 3537 x[row--] += sum1*(*ibdiag); 3538 break; 3539 } 3540 } else { 3541 /* no diag so set = */ 3542 switch (sizes[i]) { 3543 case 5: 3544 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3545 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3546 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3547 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3548 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3549 break; 3550 case 4: 3551 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3552 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3553 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3554 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3555 break; 3556 case 3: 3557 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3558 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3559 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3560 break; 3561 case 2: 3562 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3563 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3564 break; 3565 case 1: 3566 x[row--] = sum1*(*ibdiag); 3567 break; 3568 } 3569 } 3570 } 3571 if (xb == b) { 3572 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3573 } else { 3574 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3575 } 3576 } 3577 } 3578 if (flag & SOR_EISENSTAT) { 3579 /* 3580 Apply (U + D)^-1 where D is now the block diagonal 3581 */ 3582 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3583 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3584 ibdiag -= sizes[i]*sizes[i]; 3585 sz = ii[row+1] - diag[row] - 1; 3586 v1 = a->a + diag[row] + 1; 3587 idx = a->j + diag[row] + 1; 3588 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3589 switch (sizes[i]) { 3590 case 1: 3591 3592 sum1 = b[row]; 3593 for (n = 0; n<sz-1; n+=2) { 3594 i1 = idx[0]; 3595 i2 = idx[1]; 3596 idx += 2; 3597 tmp0 = x[i1]; 3598 tmp1 = x[i2]; 3599 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3600 } 3601 3602 if (n == sz-1) { 3603 tmp0 = x[*idx]; 3604 sum1 -= *v1*tmp0; 3605 } 3606 x[row] = sum1*(*ibdiag);row--; 3607 break; 3608 3609 case 2: 3610 3611 sum1 = b[row]; 3612 sum2 = b[row-1]; 3613 /* note that sum1 is associated with the second of the two rows */ 3614 v2 = a->a + diag[row-1] + 2; 3615 for (n = 0; n<sz-1; n+=2) { 3616 i1 = idx[0]; 3617 i2 = idx[1]; 3618 idx += 2; 3619 tmp0 = x[i1]; 3620 tmp1 = x[i2]; 3621 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3622 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3623 } 3624 3625 if (n == sz-1) { 3626 tmp0 = x[*idx]; 3627 sum1 -= *v1*tmp0; 3628 sum2 -= *v2*tmp0; 3629 } 3630 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3631 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3632 row -= 2; 3633 break; 3634 case 3: 3635 3636 sum1 = b[row]; 3637 sum2 = b[row-1]; 3638 sum3 = b[row-2]; 3639 v2 = a->a + diag[row-1] + 2; 3640 v3 = a->a + diag[row-2] + 3; 3641 for (n = 0; n<sz-1; n+=2) { 3642 i1 = idx[0]; 3643 i2 = idx[1]; 3644 idx += 2; 3645 tmp0 = x[i1]; 3646 tmp1 = x[i2]; 3647 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3648 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3649 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3650 } 3651 3652 if (n == sz-1) { 3653 tmp0 = x[*idx]; 3654 sum1 -= *v1*tmp0; 3655 sum2 -= *v2*tmp0; 3656 sum3 -= *v3*tmp0; 3657 } 3658 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3659 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3660 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3661 row -= 3; 3662 break; 3663 case 4: 3664 3665 sum1 = b[row]; 3666 sum2 = b[row-1]; 3667 sum3 = b[row-2]; 3668 sum4 = b[row-3]; 3669 v2 = a->a + diag[row-1] + 2; 3670 v3 = a->a + diag[row-2] + 3; 3671 v4 = a->a + diag[row-3] + 4; 3672 for (n = 0; n<sz-1; n+=2) { 3673 i1 = idx[0]; 3674 i2 = idx[1]; 3675 idx += 2; 3676 tmp0 = x[i1]; 3677 tmp1 = x[i2]; 3678 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3679 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3680 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3681 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3682 } 3683 3684 if (n == sz-1) { 3685 tmp0 = x[*idx]; 3686 sum1 -= *v1*tmp0; 3687 sum2 -= *v2*tmp0; 3688 sum3 -= *v3*tmp0; 3689 sum4 -= *v4*tmp0; 3690 } 3691 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3692 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3693 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3694 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3695 row -= 4; 3696 break; 3697 case 5: 3698 3699 sum1 = b[row]; 3700 sum2 = b[row-1]; 3701 sum3 = b[row-2]; 3702 sum4 = b[row-3]; 3703 sum5 = b[row-4]; 3704 v2 = a->a + diag[row-1] + 2; 3705 v3 = a->a + diag[row-2] + 3; 3706 v4 = a->a + diag[row-3] + 4; 3707 v5 = a->a + diag[row-4] + 5; 3708 for (n = 0; n<sz-1; n+=2) { 3709 i1 = idx[0]; 3710 i2 = idx[1]; 3711 idx += 2; 3712 tmp0 = x[i1]; 3713 tmp1 = x[i2]; 3714 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3715 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3716 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3717 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3718 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3719 } 3720 3721 if (n == sz-1) { 3722 tmp0 = x[*idx]; 3723 sum1 -= *v1*tmp0; 3724 sum2 -= *v2*tmp0; 3725 sum3 -= *v3*tmp0; 3726 sum4 -= *v4*tmp0; 3727 sum5 -= *v5*tmp0; 3728 } 3729 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3730 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3731 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3732 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3733 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3734 row -= 5; 3735 break; 3736 default: 3737 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3738 } 3739 } 3740 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3741 3742 /* 3743 t = b - D x where D is the block diagonal 3744 */ 3745 cnt = 0; 3746 for (i=0, row=0; i<m; i++) { 3747 switch (sizes[i]) { 3748 case 1: 3749 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3750 break; 3751 case 2: 3752 x1 = x[row]; x2 = x[row+1]; 3753 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3754 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3755 t[row] = b[row] - tmp1; 3756 t[row+1] = b[row+1] - tmp2; row += 2; 3757 cnt += 4; 3758 break; 3759 case 3: 3760 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3761 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3762 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3763 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3764 t[row] = b[row] - tmp1; 3765 t[row+1] = b[row+1] - tmp2; 3766 t[row+2] = b[row+2] - tmp3; row += 3; 3767 cnt += 9; 3768 break; 3769 case 4: 3770 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3771 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3772 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3773 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3774 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3775 t[row] = b[row] - tmp1; 3776 t[row+1] = b[row+1] - tmp2; 3777 t[row+2] = b[row+2] - tmp3; 3778 t[row+3] = b[row+3] - tmp4; row += 4; 3779 cnt += 16; 3780 break; 3781 case 5: 3782 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3783 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3784 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3785 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3786 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3787 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3788 t[row] = b[row] - tmp1; 3789 t[row+1] = b[row+1] - tmp2; 3790 t[row+2] = b[row+2] - tmp3; 3791 t[row+3] = b[row+3] - tmp4; 3792 t[row+4] = b[row+4] - tmp5;row += 5; 3793 cnt += 25; 3794 break; 3795 default: 3796 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3797 } 3798 } 3799 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3800 3801 3802 3803 /* 3804 Apply (L + D)^-1 where D is the block diagonal 3805 */ 3806 for (i=0, row=0; i<m; i++) { 3807 sz = diag[row] - ii[row]; 3808 v1 = a->a + ii[row]; 3809 idx = a->j + ii[row]; 3810 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3811 switch (sizes[i]) { 3812 case 1: 3813 3814 sum1 = t[row]; 3815 for (n = 0; n<sz-1; n+=2) { 3816 i1 = idx[0]; 3817 i2 = idx[1]; 3818 idx += 2; 3819 tmp0 = t[i1]; 3820 tmp1 = t[i2]; 3821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3822 } 3823 3824 if (n == sz-1) { 3825 tmp0 = t[*idx]; 3826 sum1 -= *v1 * tmp0; 3827 } 3828 x[row] += t[row] = sum1*(*ibdiag++); row++; 3829 break; 3830 case 2: 3831 v2 = a->a + ii[row+1]; 3832 sum1 = t[row]; 3833 sum2 = t[row+1]; 3834 for (n = 0; n<sz-1; n+=2) { 3835 i1 = idx[0]; 3836 i2 = idx[1]; 3837 idx += 2; 3838 tmp0 = t[i1]; 3839 tmp1 = t[i2]; 3840 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3841 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3842 } 3843 3844 if (n == sz-1) { 3845 tmp0 = t[*idx]; 3846 sum1 -= v1[0] * tmp0; 3847 sum2 -= v2[0] * tmp0; 3848 } 3849 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3850 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3851 ibdiag += 4; row += 2; 3852 break; 3853 case 3: 3854 v2 = a->a + ii[row+1]; 3855 v3 = a->a + ii[row+2]; 3856 sum1 = t[row]; 3857 sum2 = t[row+1]; 3858 sum3 = t[row+2]; 3859 for (n = 0; n<sz-1; n+=2) { 3860 i1 = idx[0]; 3861 i2 = idx[1]; 3862 idx += 2; 3863 tmp0 = t[i1]; 3864 tmp1 = t[i2]; 3865 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3866 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3867 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3868 } 3869 3870 if (n == sz-1) { 3871 tmp0 = t[*idx]; 3872 sum1 -= v1[0] * tmp0; 3873 sum2 -= v2[0] * tmp0; 3874 sum3 -= v3[0] * tmp0; 3875 } 3876 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3877 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3878 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3879 ibdiag += 9; row += 3; 3880 break; 3881 case 4: 3882 v2 = a->a + ii[row+1]; 3883 v3 = a->a + ii[row+2]; 3884 v4 = a->a + ii[row+3]; 3885 sum1 = t[row]; 3886 sum2 = t[row+1]; 3887 sum3 = t[row+2]; 3888 sum4 = t[row+3]; 3889 for (n = 0; n<sz-1; n+=2) { 3890 i1 = idx[0]; 3891 i2 = idx[1]; 3892 idx += 2; 3893 tmp0 = t[i1]; 3894 tmp1 = t[i2]; 3895 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3896 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3897 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3898 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3899 } 3900 3901 if (n == sz-1) { 3902 tmp0 = t[*idx]; 3903 sum1 -= v1[0] * tmp0; 3904 sum2 -= v2[0] * tmp0; 3905 sum3 -= v3[0] * tmp0; 3906 sum4 -= v4[0] * tmp0; 3907 } 3908 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3909 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3910 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3911 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3912 ibdiag += 16; row += 4; 3913 break; 3914 case 5: 3915 v2 = a->a + ii[row+1]; 3916 v3 = a->a + ii[row+2]; 3917 v4 = a->a + ii[row+3]; 3918 v5 = a->a + ii[row+4]; 3919 sum1 = t[row]; 3920 sum2 = t[row+1]; 3921 sum3 = t[row+2]; 3922 sum4 = t[row+3]; 3923 sum5 = t[row+4]; 3924 for (n = 0; n<sz-1; n+=2) { 3925 i1 = idx[0]; 3926 i2 = idx[1]; 3927 idx += 2; 3928 tmp0 = t[i1]; 3929 tmp1 = t[i2]; 3930 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3931 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3932 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3933 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3934 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3935 } 3936 3937 if (n == sz-1) { 3938 tmp0 = t[*idx]; 3939 sum1 -= v1[0] * tmp0; 3940 sum2 -= v2[0] * tmp0; 3941 sum3 -= v3[0] * tmp0; 3942 sum4 -= v4[0] * tmp0; 3943 sum5 -= v5[0] * tmp0; 3944 } 3945 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3946 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3947 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3948 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3949 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3950 ibdiag += 25; row += 5; 3951 break; 3952 default: 3953 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3954 } 3955 } 3956 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3957 } 3958 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3959 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3960 PetscFunctionReturn(0); 3961 } 3962 3963 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3964 { 3965 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3966 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3967 const MatScalar *bdiag = a->inode.bdiag; 3968 const PetscScalar *b; 3969 PetscErrorCode ierr; 3970 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3971 const PetscInt *sizes = a->inode.size; 3972 3973 PetscFunctionBegin; 3974 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3975 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3976 cnt = 0; 3977 for (i=0, row=0; i<m; i++) { 3978 switch (sizes[i]) { 3979 case 1: 3980 x[row] = b[row]*bdiag[cnt++];row++; 3981 break; 3982 case 2: 3983 x1 = b[row]; x2 = b[row+1]; 3984 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3985 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3986 x[row++] = tmp1; 3987 x[row++] = tmp2; 3988 cnt += 4; 3989 break; 3990 case 3: 3991 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3992 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3993 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3994 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3995 x[row++] = tmp1; 3996 x[row++] = tmp2; 3997 x[row++] = tmp3; 3998 cnt += 9; 3999 break; 4000 case 4: 4001 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4002 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4003 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4004 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4005 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4006 x[row++] = tmp1; 4007 x[row++] = tmp2; 4008 x[row++] = tmp3; 4009 x[row++] = tmp4; 4010 cnt += 16; 4011 break; 4012 case 5: 4013 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4014 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4015 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4016 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4017 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4018 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4019 x[row++] = tmp1; 4020 x[row++] = tmp2; 4021 x[row++] = tmp3; 4022 x[row++] = tmp4; 4023 x[row++] = tmp5; 4024 cnt += 25; 4025 break; 4026 default: 4027 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4028 } 4029 } 4030 ierr = PetscLogFlops(2.0*cnt);CHKERRQ(ierr); 4031 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4032 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4033 PetscFunctionReturn(0); 4034 } 4035 4036 /* 4037 samestructure indicates that the matrix has not changed its nonzero structure so we 4038 do not need to recompute the inodes 4039 */ 4040 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4041 { 4042 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4043 PetscErrorCode ierr; 4044 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4045 PetscBool flag; 4046 const PetscInt *idx,*idy,*ii; 4047 4048 PetscFunctionBegin; 4049 if (!a->inode.use) PetscFunctionReturn(0); 4050 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4051 4052 m = A->rmap->n; 4053 if (a->inode.size) ns = a->inode.size; 4054 else { 4055 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4056 } 4057 4058 i = 0; 4059 node_count = 0; 4060 idx = a->j; 4061 ii = a->i; 4062 while (i < m) { /* For each row */ 4063 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4064 /* Limits the number of elements in a node to 'a->inode.limit' */ 4065 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4066 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4067 if (nzy != nzx) break; 4068 idy += nzx; /* Same nonzero pattern */ 4069 ierr = PetscArraycmp(idx,idy,nzx,&flag);CHKERRQ(ierr); 4070 if (!flag) break; 4071 } 4072 ns[node_count++] = blk_size; 4073 idx += blk_size*nzx; 4074 i = j; 4075 } 4076 4077 { 4078 PetscBool is_cudatype; 4079 ierr = PetscObjectTypeCompareAny((PetscObject)A,&is_cudatype, MATAIJCUSPARSE, MATSEQAIJCUSPARSE, MATMPIAIJCUSPARSE, MATAIJVIENNACL, MATSEQAIJVIENNACL, MATMPIAIJVIENNACL,"");CHKERRQ(ierr); 4080 if (is_cudatype) { 4081 ierr = PetscInfo(A,"Not using Inode routines on GPU matrix\n");CHKERRQ(ierr); 4082 ierr = PetscFree(ns);CHKERRQ(ierr); 4083 a->inode.node_count = 0; 4084 a->inode.size = NULL; 4085 a->inode.use = PETSC_FALSE; 4086 a->inode.checked = PETSC_TRUE; 4087 a->inode.mat_nonzerostate = A->nonzerostate; 4088 PetscFunctionReturn(0); 4089 } 4090 } 4091 4092 /* If not enough inodes found,, do not use inode version of the routines */ 4093 if (!m || node_count > .8*m) { 4094 ierr = PetscFree(ns);CHKERRQ(ierr); 4095 4096 a->inode.node_count = 0; 4097 a->inode.size = NULL; 4098 a->inode.use = PETSC_FALSE; 4099 A->ops->mult = MatMult_SeqAIJ; 4100 A->ops->sor = MatSOR_SeqAIJ; 4101 A->ops->multadd = MatMultAdd_SeqAIJ; 4102 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4103 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4104 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4105 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4106 A->ops->coloringpatch = 0; 4107 A->ops->multdiagonalblock = 0; 4108 4109 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4110 } else { 4111 if (!A->factortype) { 4112 A->ops->mult = MatMult_SeqAIJ_Inode; 4113 A->ops->sor = MatSOR_SeqAIJ_Inode; 4114 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4115 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4116 if (A->rmap->n == A->cmap->n) { 4117 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4118 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4119 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4120 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4121 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4122 } 4123 } else { 4124 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4125 } 4126 a->inode.node_count = node_count; 4127 a->inode.size = ns; 4128 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4129 } 4130 a->inode.checked = PETSC_TRUE; 4131 a->inode.mat_nonzerostate = A->nonzerostate; 4132 PetscFunctionReturn(0); 4133 } 4134 4135 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4136 { 4137 Mat B =*C; 4138 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4139 PetscErrorCode ierr; 4140 PetscInt m=A->rmap->n; 4141 4142 PetscFunctionBegin; 4143 c->inode.use = a->inode.use; 4144 c->inode.limit = a->inode.limit; 4145 c->inode.max_limit = a->inode.max_limit; 4146 if (a->inode.size) { 4147 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4148 c->inode.node_count = a->inode.node_count; 4149 ierr = PetscArraycpy(c->inode.size,a->inode.size,m+1);CHKERRQ(ierr); 4150 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4151 if (!B->factortype) { 4152 B->ops->mult = MatMult_SeqAIJ_Inode; 4153 B->ops->sor = MatSOR_SeqAIJ_Inode; 4154 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4155 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4156 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4157 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4158 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4159 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4160 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4161 } else { 4162 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4163 } 4164 } else { 4165 c->inode.size = 0; 4166 c->inode.node_count = 0; 4167 } 4168 c->inode.ibdiagvalid = PETSC_FALSE; 4169 c->inode.ibdiag = 0; 4170 c->inode.bdiag = 0; 4171 PetscFunctionReturn(0); 4172 } 4173 4174 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4175 { 4176 PetscInt k; 4177 const PetscInt *vi; 4178 4179 PetscFunctionBegin; 4180 vi = aj + ai[row]; 4181 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4182 vi = aj + adiag[row]; 4183 cols[nzl] = vi[0]; 4184 vi = aj + adiag[row+1]+1; 4185 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4186 PetscFunctionReturn(0); 4187 } 4188 /* 4189 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4190 Modified from MatSeqAIJCheckInode(). 4191 4192 Input Parameters: 4193 . Mat A - ILU or LU matrix factor 4194 4195 */ 4196 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4197 { 4198 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4199 PetscErrorCode ierr; 4200 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4201 PetscInt *cols1,*cols2,*ns; 4202 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4203 PetscBool flag; 4204 4205 PetscFunctionBegin; 4206 if (!a->inode.use) PetscFunctionReturn(0); 4207 if (a->inode.checked) PetscFunctionReturn(0); 4208 4209 m = A->rmap->n; 4210 if (a->inode.size) ns = a->inode.size; 4211 else { 4212 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4213 } 4214 4215 i = 0; 4216 node_count = 0; 4217 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4218 while (i < m) { /* For each row */ 4219 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4220 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4221 nzx = nzl1 + nzu1 + 1; 4222 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4223 4224 /* Limits the number of elements in a node to 'a->inode.limit' */ 4225 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4226 nzl2 = ai[j+1] - ai[j]; 4227 nzu2 = adiag[j] - adiag[j+1] - 1; 4228 nzy = nzl2 + nzu2 + 1; 4229 if (nzy != nzx) break; 4230 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4231 ierr = PetscArraycmp(cols1,cols2,nzx,&flag);CHKERRQ(ierr); 4232 if (!flag) break; 4233 } 4234 ns[node_count++] = blk_size; 4235 i = j; 4236 } 4237 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4238 /* If not enough inodes found,, do not use inode version of the routines */ 4239 if (!m || node_count > .8*m) { 4240 ierr = PetscFree(ns);CHKERRQ(ierr); 4241 4242 a->inode.node_count = 0; 4243 a->inode.size = NULL; 4244 a->inode.use = PETSC_FALSE; 4245 4246 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4247 } else { 4248 A->ops->mult = 0; 4249 A->ops->sor = 0; 4250 A->ops->multadd = 0; 4251 A->ops->getrowij = 0; 4252 A->ops->restorerowij = 0; 4253 A->ops->getcolumnij = 0; 4254 A->ops->restorecolumnij = 0; 4255 A->ops->coloringpatch = 0; 4256 A->ops->multdiagonalblock = 0; 4257 a->inode.node_count = node_count; 4258 a->inode.size = ns; 4259 4260 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4261 } 4262 a->inode.checked = PETSC_TRUE; 4263 PetscFunctionReturn(0); 4264 } 4265 4266 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4267 { 4268 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4269 4270 PetscFunctionBegin; 4271 a->inode.ibdiagvalid = PETSC_FALSE; 4272 PetscFunctionReturn(0); 4273 } 4274 4275 /* 4276 This is really ugly. if inodes are used this replaces the 4277 permutations with ones that correspond to rows/cols of the matrix 4278 rather then inode blocks 4279 */ 4280 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4281 { 4282 PetscErrorCode ierr; 4283 4284 PetscFunctionBegin; 4285 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4286 PetscFunctionReturn(0); 4287 } 4288 4289 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4290 { 4291 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4292 PetscErrorCode ierr; 4293 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4294 const PetscInt *ridx,*cidx; 4295 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4296 PetscInt nslim_col,*ns_col; 4297 IS ris = *rperm,cis = *cperm; 4298 4299 PetscFunctionBegin; 4300 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4301 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4302 4303 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4304 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4305 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4306 4307 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4308 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4309 4310 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4311 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4312 4313 /* Construct the permutations for rows*/ 4314 for (i=0,row = 0; i<nslim_row; ++i) { 4315 indx = ridx[i]; 4316 start_val = tns[indx]; 4317 end_val = tns[indx + 1]; 4318 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4319 } 4320 4321 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4322 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4323 4324 /* Construct permutations for columns */ 4325 for (i=0,col=0; i<nslim_col; ++i) { 4326 indx = cidx[i]; 4327 start_val = tns[indx]; 4328 end_val = tns[indx + 1]; 4329 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4330 } 4331 4332 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4333 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4334 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4335 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4336 4337 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4338 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4339 4340 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4341 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4342 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4343 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4344 ierr = PetscFree(tns);CHKERRQ(ierr); 4345 PetscFunctionReturn(0); 4346 } 4347 4348 /*@C 4349 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4350 4351 Not Collective 4352 4353 Input Parameter: 4354 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4355 4356 Output Parameter: 4357 + node_count - no of inodes present in the matrix. 4358 . sizes - an array of size node_count,with sizes of each inode. 4359 - limit - the max size used to generate the inodes. 4360 4361 Level: advanced 4362 4363 Notes: 4364 This routine returns some internal storage information 4365 of the matrix, it is intended to be used by advanced users. 4366 It should be called after the matrix is assembled. 4367 The contents of the sizes[] array should not be changed. 4368 NULL may be passed for information not requested. 4369 4370 .seealso: MatGetInfo() 4371 @*/ 4372 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4373 { 4374 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4375 4376 PetscFunctionBegin; 4377 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4378 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4379 if (f) { 4380 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4381 } 4382 PetscFunctionReturn(0); 4383 } 4384 4385 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4386 { 4387 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4388 4389 PetscFunctionBegin; 4390 if (node_count) *node_count = a->inode.node_count; 4391 if (sizes) *sizes = a->inode.size; 4392 if (limit) *limit = a->inode.limit; 4393 PetscFunctionReturn(0); 4394 } 4395