1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns) 9 { 10 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 11 PetscErrorCode ierr; 12 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 13 14 PetscFunctionBegin; 15 n = A->cmap->n; 16 m = A->rmap->n; 17 ns_row = a->inode.size; 18 19 min_mn = (m < n) ? m : n; 20 if (!ns) { 21 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 22 for (; count+1 < n; count++,i++) ; 23 if (count < n) { 24 i++; 25 } 26 *size = i; 27 PetscFunctionReturn(0); 28 } 29 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 30 31 /* Use the same row structure wherever feasible. */ 32 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 33 ns_col[i] = ns_row[i]; 34 } 35 36 /* if m < n; pad up the remainder with inode_limit */ 37 for (; count+1 < n; count++,i++) { 38 ns_col[i] = 1; 39 } 40 /* The last node is the odd ball. padd it up with the remaining rows; */ 41 if (count < n) { 42 ns_col[i] = n - count; 43 i++; 44 } else if (count > n) { 45 /* Adjust for the over estimation */ 46 ns_col[i-1] += n - count; 47 } 48 *size = i; 49 *ns = ns_col; 50 PetscFunctionReturn(0); 51 } 52 53 54 /* 55 This builds symmetric version of nonzero structure, 56 */ 57 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 58 { 59 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 60 PetscErrorCode ierr; 61 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 62 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 63 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 64 65 PetscFunctionBegin; 66 nslim_row = a->inode.node_count; 67 m = A->rmap->n; 68 n = A->cmap->n; 69 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 70 71 /* Use the row_inode as column_inode */ 72 nslim_col = nslim_row; 73 ns_col = ns_row; 74 75 /* allocate space for reformated inode structure */ 76 ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr); 77 ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr); 78 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 79 80 for (i1=0,col=0; i1<nslim_col; ++i1) { 81 nsz = ns_col[i1]; 82 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 83 } 84 /* allocate space for row pointers */ 85 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 86 *iia = ia; 87 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 88 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 89 90 /* determine the number of columns in each row */ 91 ia[0] = oshift; 92 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 93 94 j = aj + ai[row] + ishift; 95 jmax = aj + ai[row+1] + ishift; 96 if (j==jmax) continue; /* empty row */ 97 col = *j++ + ishift; 98 i2 = tvc[col]; 99 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 100 ia[i1+1]++; 101 ia[i2+1]++; 102 i2++; /* Start col of next node */ 103 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 104 i2 = tvc[col]; 105 } 106 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 107 } 108 109 /* shift ia[i] to point to next row */ 110 for (i1=1; i1<nslim_row+1; i1++) { 111 row = ia[i1-1]; 112 ia[i1] += row; 113 work[i1-1] = row - oshift; 114 } 115 116 /* allocate space for column pointers */ 117 nz = ia[nslim_row] + (!ishift); 118 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 119 *jja = ja; 120 121 /* loop over lower triangular part putting into ja */ 122 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 123 j = aj + ai[row] + ishift; 124 jmax = aj + ai[row+1] + ishift; 125 if (j==jmax) continue; /* empty row */ 126 col = *j++ + ishift; 127 i2 = tvc[col]; 128 while (i2<i1 && j<jmax) { 129 ja[work[i2]++] = i1 + oshift; 130 ja[work[i1]++] = i2 + oshift; 131 ++i2; 132 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 133 i2 = tvc[col]; 134 } 135 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 136 137 } 138 ierr = PetscFree(work);CHKERRQ(ierr); 139 ierr = PetscFree(tns);CHKERRQ(ierr); 140 ierr = PetscFree(tvc);CHKERRQ(ierr); 141 PetscFunctionReturn(0); 142 } 143 144 /* 145 This builds nonsymmetric version of nonzero structure, 146 */ 147 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 148 { 149 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 150 PetscErrorCode ierr; 151 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 152 PetscInt *tns,*tvc,nsz,i1,i2; 153 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 154 155 PetscFunctionBegin; 156 nslim_row = a->inode.node_count; 157 n = A->cmap->n; 158 159 /* Create The column_inode for this matrix */ 160 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 161 162 /* allocate space for reformated column_inode structure */ 163 ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr); 164 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 165 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 166 167 for (i1=0,col=0; i1<nslim_col; ++i1) { 168 nsz = ns_col[i1]; 169 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 170 } 171 /* allocate space for row pointers */ 172 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 173 *iia = ia; 174 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 175 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 176 177 /* determine the number of columns in each row */ 178 ia[0] = oshift; 179 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 180 j = aj + ai[row] + ishift; 181 nz = ai[row+1] - ai[row]; 182 if (!nz) continue; /* empty row */ 183 col = *j++ + ishift; 184 i2 = tvc[col]; 185 while (nz-- > 0) { /* off-diagonal elemets */ 186 ia[i1+1]++; 187 i2++; /* Start col of next node */ 188 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 189 if (nz > 0) i2 = tvc[col]; 190 } 191 } 192 193 /* shift ia[i] to point to next row */ 194 for (i1=1; i1<nslim_row+1; i1++) { 195 row = ia[i1-1]; 196 ia[i1] += row; 197 work[i1-1] = row - oshift; 198 } 199 200 /* allocate space for column pointers */ 201 nz = ia[nslim_row] + (!ishift); 202 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 203 *jja = ja; 204 205 /* loop over matrix putting into ja */ 206 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 207 j = aj + ai[row] + ishift; 208 nz = ai[row+1] - ai[row]; 209 if (!nz) continue; /* empty row */ 210 col = *j++ + ishift; 211 i2 = tvc[col]; 212 while (nz-- > 0) { 213 ja[work[i1]++] = i2 + oshift; 214 ++i2; 215 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 216 if (nz > 0) i2 = tvc[col]; 217 } 218 } 219 ierr = PetscFree(ns_col);CHKERRQ(ierr); 220 ierr = PetscFree(work);CHKERRQ(ierr); 221 ierr = PetscFree(tns);CHKERRQ(ierr); 222 ierr = PetscFree(tvc);CHKERRQ(ierr); 223 PetscFunctionReturn(0); 224 } 225 226 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 227 { 228 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 229 PetscErrorCode ierr; 230 231 PetscFunctionBegin; 232 *n = a->inode.node_count; 233 if (!ia) PetscFunctionReturn(0); 234 if (!blockcompressed) { 235 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 236 } else if (symmetric) { 237 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 238 } else { 239 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 240 } 241 PetscFunctionReturn(0); 242 } 243 244 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 245 { 246 PetscErrorCode ierr; 247 248 PetscFunctionBegin; 249 if (!ia) PetscFunctionReturn(0); 250 251 if (!blockcompressed) { 252 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 253 } else { 254 ierr = PetscFree(*ia);CHKERRQ(ierr); 255 ierr = PetscFree(*ja);CHKERRQ(ierr); 256 } 257 PetscFunctionReturn(0); 258 } 259 260 /* ----------------------------------------------------------- */ 261 262 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 263 { 264 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 265 PetscErrorCode ierr; 266 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 267 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 268 269 PetscFunctionBegin; 270 nslim_row = a->inode.node_count; 271 n = A->cmap->n; 272 273 /* Create The column_inode for this matrix */ 274 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 275 276 /* allocate space for reformated column_inode structure */ 277 ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr); 278 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 279 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 280 281 for (i1=0,col=0; i1<nslim_col; ++i1) { 282 nsz = ns_col[i1]; 283 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 284 } 285 /* allocate space for column pointers */ 286 ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 287 *iia = ia; 288 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 289 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 290 291 /* determine the number of columns in each row */ 292 ia[0] = oshift; 293 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 294 j = aj + ai[row] + ishift; 295 col = *j++ + ishift; 296 i2 = tvc[col]; 297 nz = ai[row+1] - ai[row]; 298 while (nz-- > 0) { /* off-diagonal elemets */ 299 /* ia[i1+1]++; */ 300 ia[i2+1]++; 301 i2++; 302 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 303 if (nz > 0) i2 = tvc[col]; 304 } 305 } 306 307 /* shift ia[i] to point to next col */ 308 for (i1=1; i1<nslim_col+1; i1++) { 309 col = ia[i1-1]; 310 ia[i1] += col; 311 work[i1-1] = col - oshift; 312 } 313 314 /* allocate space for column pointers */ 315 nz = ia[nslim_col] + (!ishift); 316 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 317 *jja = ja; 318 319 /* loop over matrix putting into ja */ 320 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 321 j = aj + ai[row] + ishift; 322 col = *j++ + ishift; 323 i2 = tvc[col]; 324 nz = ai[row+1] - ai[row]; 325 while (nz-- > 0) { 326 /* ja[work[i1]++] = i2 + oshift; */ 327 ja[work[i2]++] = i1 + oshift; 328 i2++; 329 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 330 if (nz > 0) i2 = tvc[col]; 331 } 332 } 333 ierr = PetscFree(ns_col);CHKERRQ(ierr); 334 ierr = PetscFree(work);CHKERRQ(ierr); 335 ierr = PetscFree(tns);CHKERRQ(ierr); 336 ierr = PetscFree(tvc);CHKERRQ(ierr); 337 PetscFunctionReturn(0); 338 } 339 340 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 341 { 342 PetscErrorCode ierr; 343 344 PetscFunctionBegin; 345 ierr = MatCreateColInode_Private(A,n,NULL);CHKERRQ(ierr); 346 if (!ia) PetscFunctionReturn(0); 347 348 if (!blockcompressed) { 349 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 350 } else if (symmetric) { 351 /* Since the indices are symmetric it does'nt matter */ 352 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 353 } else { 354 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 355 } 356 PetscFunctionReturn(0); 357 } 358 359 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 360 { 361 PetscErrorCode ierr; 362 363 PetscFunctionBegin; 364 if (!ia) PetscFunctionReturn(0); 365 if (!blockcompressed) { 366 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else { 368 ierr = PetscFree(*ia);CHKERRQ(ierr); 369 ierr = PetscFree(*ja);CHKERRQ(ierr); 370 } 371 PetscFunctionReturn(0); 372 } 373 374 /* ----------------------------------------------------------- */ 375 376 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 377 { 378 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 379 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 380 PetscScalar *y; 381 const PetscScalar *x; 382 const MatScalar *v1,*v2,*v3,*v4,*v5; 383 PetscErrorCode ierr; 384 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 385 const PetscInt *idx,*ns,*ii; 386 387 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 388 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 389 #endif 390 391 PetscFunctionBegin; 392 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 393 node_max = a->inode.node_count; 394 ns = a->inode.size; /* Node Size array */ 395 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 396 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 397 idx = a->j; 398 v1 = a->a; 399 ii = a->i; 400 401 for (i = 0,row = 0; i< node_max; ++i) { 402 nsz = ns[i]; 403 n = ii[1] - ii[0]; 404 nonzerorow += (n>0)*nsz; 405 ii += nsz; 406 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 407 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 408 sz = n; /* No of non zeros in this row */ 409 /* Switch on the size of Node */ 410 switch (nsz) { /* Each loop in 'case' is unrolled */ 411 case 1: 412 sum1 = 0.; 413 414 for (n = 0; n< sz-1; n+=2) { 415 i1 = idx[0]; /* The instructions are ordered to */ 416 i2 = idx[1]; /* make the compiler's job easy */ 417 idx += 2; 418 tmp0 = x[i1]; 419 tmp1 = x[i2]; 420 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 421 } 422 423 if (n == sz-1) { /* Take care of the last nonzero */ 424 tmp0 = x[*idx++]; 425 sum1 += *v1++ *tmp0; 426 } 427 y[row++]=sum1; 428 break; 429 case 2: 430 sum1 = 0.; 431 sum2 = 0.; 432 v2 = v1 + n; 433 434 for (n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; 436 i2 = idx[1]; 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 442 } 443 if (n == sz-1) { 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 sum2 += *v2++ * tmp0; 447 } 448 y[row++]=sum1; 449 y[row++]=sum2; 450 v1 =v2; /* Since the next block to be processed starts there*/ 451 idx +=sz; 452 break; 453 case 3: 454 sum1 = 0.; 455 sum2 = 0.; 456 sum3 = 0.; 457 v2 = v1 + n; 458 v3 = v2 + n; 459 460 for (n = 0; n< sz-1; n+=2) { 461 i1 = idx[0]; 462 i2 = idx[1]; 463 idx += 2; 464 tmp0 = x[i1]; 465 tmp1 = x[i2]; 466 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 467 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 468 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 469 } 470 if (n == sz-1) { 471 tmp0 = x[*idx++]; 472 sum1 += *v1++ * tmp0; 473 sum2 += *v2++ * tmp0; 474 sum3 += *v3++ * tmp0; 475 } 476 y[row++]=sum1; 477 y[row++]=sum2; 478 y[row++]=sum3; 479 v1 =v3; /* Since the next block to be processed starts there*/ 480 idx +=2*sz; 481 break; 482 case 4: 483 sum1 = 0.; 484 sum2 = 0.; 485 sum3 = 0.; 486 sum4 = 0.; 487 v2 = v1 + n; 488 v3 = v2 + n; 489 v4 = v3 + n; 490 491 for (n = 0; n< sz-1; n+=2) { 492 i1 = idx[0]; 493 i2 = idx[1]; 494 idx += 2; 495 tmp0 = x[i1]; 496 tmp1 = x[i2]; 497 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 498 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 499 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 500 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 501 } 502 if (n == sz-1) { 503 tmp0 = x[*idx++]; 504 sum1 += *v1++ * tmp0; 505 sum2 += *v2++ * tmp0; 506 sum3 += *v3++ * tmp0; 507 sum4 += *v4++ * tmp0; 508 } 509 y[row++]=sum1; 510 y[row++]=sum2; 511 y[row++]=sum3; 512 y[row++]=sum4; 513 v1 =v4; /* Since the next block to be processed starts there*/ 514 idx +=3*sz; 515 break; 516 case 5: 517 sum1 = 0.; 518 sum2 = 0.; 519 sum3 = 0.; 520 sum4 = 0.; 521 sum5 = 0.; 522 v2 = v1 + n; 523 v3 = v2 + n; 524 v4 = v3 + n; 525 v5 = v4 + n; 526 527 for (n = 0; n<sz-1; n+=2) { 528 i1 = idx[0]; 529 i2 = idx[1]; 530 idx += 2; 531 tmp0 = x[i1]; 532 tmp1 = x[i2]; 533 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 534 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 535 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 536 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 537 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 538 } 539 if (n == sz-1) { 540 tmp0 = x[*idx++]; 541 sum1 += *v1++ * tmp0; 542 sum2 += *v2++ * tmp0; 543 sum3 += *v3++ * tmp0; 544 sum4 += *v4++ * tmp0; 545 sum5 += *v5++ * tmp0; 546 } 547 y[row++]=sum1; 548 y[row++]=sum2; 549 y[row++]=sum3; 550 y[row++]=sum4; 551 y[row++]=sum5; 552 v1 =v5; /* Since the next block to be processed starts there */ 553 idx +=4*sz; 554 break; 555 default: 556 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 557 } 558 } 559 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 560 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 561 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 562 PetscFunctionReturn(0); 563 } 564 /* ----------------------------------------------------------- */ 565 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 566 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 567 { 568 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 569 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 570 const MatScalar *v1,*v2,*v3,*v4,*v5; 571 const PetscScalar *x; 572 PetscScalar *y,*z,*zt; 573 PetscErrorCode ierr; 574 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 575 const PetscInt *idx,*ns,*ii; 576 577 PetscFunctionBegin; 578 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 579 node_max = a->inode.node_count; 580 ns = a->inode.size; /* Node Size array */ 581 582 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 583 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 584 zt = z; 585 586 idx = a->j; 587 v1 = a->a; 588 ii = a->i; 589 590 for (i = 0,row = 0; i< node_max; ++i) { 591 nsz = ns[i]; 592 n = ii[1] - ii[0]; 593 ii += nsz; 594 sz = n; /* No of non zeros in this row */ 595 /* Switch on the size of Node */ 596 switch (nsz) { /* Each loop in 'case' is unrolled */ 597 case 1: 598 sum1 = *zt++; 599 600 for (n = 0; n< sz-1; n+=2) { 601 i1 = idx[0]; /* The instructions are ordered to */ 602 i2 = idx[1]; /* make the compiler's job easy */ 603 idx += 2; 604 tmp0 = x[i1]; 605 tmp1 = x[i2]; 606 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 607 } 608 609 if (n == sz-1) { /* Take care of the last nonzero */ 610 tmp0 = x[*idx++]; 611 sum1 += *v1++ * tmp0; 612 } 613 y[row++]=sum1; 614 break; 615 case 2: 616 sum1 = *zt++; 617 sum2 = *zt++; 618 v2 = v1 + n; 619 620 for (n = 0; n< sz-1; n+=2) { 621 i1 = idx[0]; 622 i2 = idx[1]; 623 idx += 2; 624 tmp0 = x[i1]; 625 tmp1 = x[i2]; 626 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 627 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 628 } 629 if (n == sz-1) { 630 tmp0 = x[*idx++]; 631 sum1 += *v1++ * tmp0; 632 sum2 += *v2++ * tmp0; 633 } 634 y[row++]=sum1; 635 y[row++]=sum2; 636 v1 =v2; /* Since the next block to be processed starts there*/ 637 idx +=sz; 638 break; 639 case 3: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 sum3 = *zt++; 643 v2 = v1 + n; 644 v3 = v2 + n; 645 646 for (n = 0; n< sz-1; n+=2) { 647 i1 = idx[0]; 648 i2 = idx[1]; 649 idx += 2; 650 tmp0 = x[i1]; 651 tmp1 = x[i2]; 652 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 653 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 654 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 655 } 656 if (n == sz-1) { 657 tmp0 = x[*idx++]; 658 sum1 += *v1++ * tmp0; 659 sum2 += *v2++ * tmp0; 660 sum3 += *v3++ * tmp0; 661 } 662 y[row++]=sum1; 663 y[row++]=sum2; 664 y[row++]=sum3; 665 v1 =v3; /* Since the next block to be processed starts there*/ 666 idx +=2*sz; 667 break; 668 case 4: 669 sum1 = *zt++; 670 sum2 = *zt++; 671 sum3 = *zt++; 672 sum4 = *zt++; 673 v2 = v1 + n; 674 v3 = v2 + n; 675 v4 = v3 + n; 676 677 for (n = 0; n< sz-1; n+=2) { 678 i1 = idx[0]; 679 i2 = idx[1]; 680 idx += 2; 681 tmp0 = x[i1]; 682 tmp1 = x[i2]; 683 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 684 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 685 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 686 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 687 } 688 if (n == sz-1) { 689 tmp0 = x[*idx++]; 690 sum1 += *v1++ * tmp0; 691 sum2 += *v2++ * tmp0; 692 sum3 += *v3++ * tmp0; 693 sum4 += *v4++ * tmp0; 694 } 695 y[row++]=sum1; 696 y[row++]=sum2; 697 y[row++]=sum3; 698 y[row++]=sum4; 699 v1 =v4; /* Since the next block to be processed starts there*/ 700 idx +=3*sz; 701 break; 702 case 5: 703 sum1 = *zt++; 704 sum2 = *zt++; 705 sum3 = *zt++; 706 sum4 = *zt++; 707 sum5 = *zt++; 708 v2 = v1 + n; 709 v3 = v2 + n; 710 v4 = v3 + n; 711 v5 = v4 + n; 712 713 for (n = 0; n<sz-1; n+=2) { 714 i1 = idx[0]; 715 i2 = idx[1]; 716 idx += 2; 717 tmp0 = x[i1]; 718 tmp1 = x[i2]; 719 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 720 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 721 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 722 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 723 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 724 } 725 if (n == sz-1) { 726 tmp0 = x[*idx++]; 727 sum1 += *v1++ * tmp0; 728 sum2 += *v2++ * tmp0; 729 sum3 += *v3++ * tmp0; 730 sum4 += *v4++ * tmp0; 731 sum5 += *v5++ * tmp0; 732 } 733 y[row++]=sum1; 734 y[row++]=sum2; 735 y[row++]=sum3; 736 y[row++]=sum4; 737 y[row++]=sum5; 738 v1 =v5; /* Since the next block to be processed starts there */ 739 idx +=4*sz; 740 break; 741 default: 742 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 743 } 744 } 745 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 746 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 747 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 748 PetscFunctionReturn(0); 749 } 750 751 /* ----------------------------------------------------------- */ 752 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 753 { 754 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 755 IS iscol = a->col,isrow = a->row; 756 PetscErrorCode ierr; 757 const PetscInt *r,*c,*rout,*cout; 758 PetscInt i,j,n = A->rmap->n,nz; 759 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 760 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 761 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 762 PetscScalar sum1,sum2,sum3,sum4,sum5; 763 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 764 const PetscScalar *b; 765 766 PetscFunctionBegin; 767 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 768 node_max = a->inode.node_count; 769 ns = a->inode.size; /* Node Size array */ 770 771 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 772 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 773 tmp = a->solve_work; 774 775 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 776 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 777 778 /* forward solve the lower triangular */ 779 tmps = tmp; 780 aa = a_a; 781 aj = a_j; 782 ad = a->diag; 783 784 for (i = 0,row = 0; i< node_max; ++i) { 785 nsz = ns[i]; 786 aii = ai[row]; 787 v1 = aa + aii; 788 vi = aj + aii; 789 nz = ad[row]- aii; 790 if (i < node_max-1) { 791 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 792 * but our indexing to determine it's size could. */ 793 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 794 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 795 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 796 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 797 } 798 799 switch (nsz) { /* Each loop in 'case' is unrolled */ 800 case 1: 801 sum1 = b[*r++]; 802 for (j=0; j<nz-1; j+=2) { 803 i0 = vi[0]; 804 i1 = vi[1]; 805 vi +=2; 806 tmp0 = tmps[i0]; 807 tmp1 = tmps[i1]; 808 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 809 } 810 if (j == nz-1) { 811 tmp0 = tmps[*vi++]; 812 sum1 -= *v1++ *tmp0; 813 } 814 tmp[row++]=sum1; 815 break; 816 case 2: 817 sum1 = b[*r++]; 818 sum2 = b[*r++]; 819 v2 = aa + ai[row+1]; 820 821 for (j=0; j<nz-1; j+=2) { 822 i0 = vi[0]; 823 i1 = vi[1]; 824 vi +=2; 825 tmp0 = tmps[i0]; 826 tmp1 = tmps[i1]; 827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 828 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 829 } 830 if (j == nz-1) { 831 tmp0 = tmps[*vi++]; 832 sum1 -= *v1++ *tmp0; 833 sum2 -= *v2++ *tmp0; 834 } 835 sum2 -= *v2++ *sum1; 836 tmp[row++]=sum1; 837 tmp[row++]=sum2; 838 break; 839 case 3: 840 sum1 = b[*r++]; 841 sum2 = b[*r++]; 842 sum3 = b[*r++]; 843 v2 = aa + ai[row+1]; 844 v3 = aa + ai[row+2]; 845 846 for (j=0; j<nz-1; j+=2) { 847 i0 = vi[0]; 848 i1 = vi[1]; 849 vi +=2; 850 tmp0 = tmps[i0]; 851 tmp1 = tmps[i1]; 852 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 853 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 854 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 855 } 856 if (j == nz-1) { 857 tmp0 = tmps[*vi++]; 858 sum1 -= *v1++ *tmp0; 859 sum2 -= *v2++ *tmp0; 860 sum3 -= *v3++ *tmp0; 861 } 862 sum2 -= *v2++ * sum1; 863 sum3 -= *v3++ * sum1; 864 sum3 -= *v3++ * sum2; 865 866 tmp[row++]=sum1; 867 tmp[row++]=sum2; 868 tmp[row++]=sum3; 869 break; 870 871 case 4: 872 sum1 = b[*r++]; 873 sum2 = b[*r++]; 874 sum3 = b[*r++]; 875 sum4 = b[*r++]; 876 v2 = aa + ai[row+1]; 877 v3 = aa + ai[row+2]; 878 v4 = aa + ai[row+3]; 879 880 for (j=0; j<nz-1; j+=2) { 881 i0 = vi[0]; 882 i1 = vi[1]; 883 vi +=2; 884 tmp0 = tmps[i0]; 885 tmp1 = tmps[i1]; 886 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 887 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 888 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 889 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 890 } 891 if (j == nz-1) { 892 tmp0 = tmps[*vi++]; 893 sum1 -= *v1++ *tmp0; 894 sum2 -= *v2++ *tmp0; 895 sum3 -= *v3++ *tmp0; 896 sum4 -= *v4++ *tmp0; 897 } 898 sum2 -= *v2++ * sum1; 899 sum3 -= *v3++ * sum1; 900 sum4 -= *v4++ * sum1; 901 sum3 -= *v3++ * sum2; 902 sum4 -= *v4++ * sum2; 903 sum4 -= *v4++ * sum3; 904 905 tmp[row++]=sum1; 906 tmp[row++]=sum2; 907 tmp[row++]=sum3; 908 tmp[row++]=sum4; 909 break; 910 case 5: 911 sum1 = b[*r++]; 912 sum2 = b[*r++]; 913 sum3 = b[*r++]; 914 sum4 = b[*r++]; 915 sum5 = b[*r++]; 916 v2 = aa + ai[row+1]; 917 v3 = aa + ai[row+2]; 918 v4 = aa + ai[row+3]; 919 v5 = aa + ai[row+4]; 920 921 for (j=0; j<nz-1; j+=2) { 922 i0 = vi[0]; 923 i1 = vi[1]; 924 vi +=2; 925 tmp0 = tmps[i0]; 926 tmp1 = tmps[i1]; 927 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 928 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 929 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 930 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 931 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 932 } 933 if (j == nz-1) { 934 tmp0 = tmps[*vi++]; 935 sum1 -= *v1++ *tmp0; 936 sum2 -= *v2++ *tmp0; 937 sum3 -= *v3++ *tmp0; 938 sum4 -= *v4++ *tmp0; 939 sum5 -= *v5++ *tmp0; 940 } 941 942 sum2 -= *v2++ * sum1; 943 sum3 -= *v3++ * sum1; 944 sum4 -= *v4++ * sum1; 945 sum5 -= *v5++ * sum1; 946 sum3 -= *v3++ * sum2; 947 sum4 -= *v4++ * sum2; 948 sum5 -= *v5++ * sum2; 949 sum4 -= *v4++ * sum3; 950 sum5 -= *v5++ * sum3; 951 sum5 -= *v5++ * sum4; 952 953 tmp[row++]=sum1; 954 tmp[row++]=sum2; 955 tmp[row++]=sum3; 956 tmp[row++]=sum4; 957 tmp[row++]=sum5; 958 break; 959 default: 960 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 961 } 962 } 963 /* backward solve the upper triangular */ 964 for (i=node_max -1,row = n-1; i>=0; i--) { 965 nsz = ns[i]; 966 aii = ai[row+1] -1; 967 v1 = aa + aii; 968 vi = aj + aii; 969 nz = aii- ad[row]; 970 switch (nsz) { /* Each loop in 'case' is unrolled */ 971 case 1: 972 sum1 = tmp[row]; 973 974 for (j=nz; j>1; j-=2) { 975 vi -=2; 976 i0 = vi[2]; 977 i1 = vi[1]; 978 tmp0 = tmps[i0]; 979 tmp1 = tmps[i1]; 980 v1 -= 2; 981 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 982 } 983 if (j==1) { 984 tmp0 = tmps[*vi--]; 985 sum1 -= *v1-- * tmp0; 986 } 987 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 988 break; 989 case 2: 990 sum1 = tmp[row]; 991 sum2 = tmp[row -1]; 992 v2 = aa + ai[row]-1; 993 for (j=nz; j>1; j-=2) { 994 vi -=2; 995 i0 = vi[2]; 996 i1 = vi[1]; 997 tmp0 = tmps[i0]; 998 tmp1 = tmps[i1]; 999 v1 -= 2; 1000 v2 -= 2; 1001 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1002 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1003 } 1004 if (j==1) { 1005 tmp0 = tmps[*vi--]; 1006 sum1 -= *v1-- * tmp0; 1007 sum2 -= *v2-- * tmp0; 1008 } 1009 1010 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1011 sum2 -= *v2-- * tmp0; 1012 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1013 break; 1014 case 3: 1015 sum1 = tmp[row]; 1016 sum2 = tmp[row -1]; 1017 sum3 = tmp[row -2]; 1018 v2 = aa + ai[row]-1; 1019 v3 = aa + ai[row -1]-1; 1020 for (j=nz; j>1; j-=2) { 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 v3 -= 2; 1029 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1030 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1031 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1032 } 1033 if (j==1) { 1034 tmp0 = tmps[*vi--]; 1035 sum1 -= *v1-- * tmp0; 1036 sum2 -= *v2-- * tmp0; 1037 sum3 -= *v3-- * tmp0; 1038 } 1039 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1040 sum2 -= *v2-- * tmp0; 1041 sum3 -= *v3-- * tmp0; 1042 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1043 sum3 -= *v3-- * tmp0; 1044 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1045 1046 break; 1047 case 4: 1048 sum1 = tmp[row]; 1049 sum2 = tmp[row -1]; 1050 sum3 = tmp[row -2]; 1051 sum4 = tmp[row -3]; 1052 v2 = aa + ai[row]-1; 1053 v3 = aa + ai[row -1]-1; 1054 v4 = aa + ai[row -2]-1; 1055 1056 for (j=nz; j>1; j-=2) { 1057 vi -=2; 1058 i0 = vi[2]; 1059 i1 = vi[1]; 1060 tmp0 = tmps[i0]; 1061 tmp1 = tmps[i1]; 1062 v1 -= 2; 1063 v2 -= 2; 1064 v3 -= 2; 1065 v4 -= 2; 1066 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1067 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1068 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1069 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1070 } 1071 if (j==1) { 1072 tmp0 = tmps[*vi--]; 1073 sum1 -= *v1-- * tmp0; 1074 sum2 -= *v2-- * tmp0; 1075 sum3 -= *v3-- * tmp0; 1076 sum4 -= *v4-- * tmp0; 1077 } 1078 1079 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1080 sum2 -= *v2-- * tmp0; 1081 sum3 -= *v3-- * tmp0; 1082 sum4 -= *v4-- * tmp0; 1083 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1084 sum3 -= *v3-- * tmp0; 1085 sum4 -= *v4-- * tmp0; 1086 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1087 sum4 -= *v4-- * tmp0; 1088 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1089 break; 1090 case 5: 1091 sum1 = tmp[row]; 1092 sum2 = tmp[row -1]; 1093 sum3 = tmp[row -2]; 1094 sum4 = tmp[row -3]; 1095 sum5 = tmp[row -4]; 1096 v2 = aa + ai[row]-1; 1097 v3 = aa + ai[row -1]-1; 1098 v4 = aa + ai[row -2]-1; 1099 v5 = aa + ai[row -3]-1; 1100 for (j=nz; j>1; j-=2) { 1101 vi -= 2; 1102 i0 = vi[2]; 1103 i1 = vi[1]; 1104 tmp0 = tmps[i0]; 1105 tmp1 = tmps[i1]; 1106 v1 -= 2; 1107 v2 -= 2; 1108 v3 -= 2; 1109 v4 -= 2; 1110 v5 -= 2; 1111 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1112 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1113 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1114 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1115 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1116 } 1117 if (j==1) { 1118 tmp0 = tmps[*vi--]; 1119 sum1 -= *v1-- * tmp0; 1120 sum2 -= *v2-- * tmp0; 1121 sum3 -= *v3-- * tmp0; 1122 sum4 -= *v4-- * tmp0; 1123 sum5 -= *v5-- * tmp0; 1124 } 1125 1126 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1127 sum2 -= *v2-- * tmp0; 1128 sum3 -= *v3-- * tmp0; 1129 sum4 -= *v4-- * tmp0; 1130 sum5 -= *v5-- * tmp0; 1131 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1132 sum3 -= *v3-- * tmp0; 1133 sum4 -= *v4-- * tmp0; 1134 sum5 -= *v5-- * tmp0; 1135 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1136 sum4 -= *v4-- * tmp0; 1137 sum5 -= *v5-- * tmp0; 1138 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1139 sum5 -= *v5-- * tmp0; 1140 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1141 break; 1142 default: 1143 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1144 } 1145 } 1146 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1147 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1148 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1149 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1150 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1151 PetscFunctionReturn(0); 1152 } 1153 1154 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1155 { 1156 Mat C =B; 1157 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1158 IS isrow = b->row,isicol = b->icol; 1159 PetscErrorCode ierr; 1160 const PetscInt *r,*ic,*ics; 1161 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1162 PetscInt i,j,k,nz,nzL,row,*pj; 1163 const PetscInt *ajtmp,*bjtmp; 1164 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1165 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1166 FactorShiftCtx sctx; 1167 const PetscInt *ddiag; 1168 PetscReal rs; 1169 MatScalar d; 1170 PetscInt inod,nodesz,node_max,col; 1171 const PetscInt *ns; 1172 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1173 1174 PetscFunctionBegin; 1175 /* MatPivotSetUp(): initialize shift context sctx */ 1176 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1177 1178 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1179 ddiag = a->diag; 1180 sctx.shift_top = info->zeropivot; 1181 for (i=0; i<n; i++) { 1182 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1183 d = (aa)[ddiag[i]]; 1184 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1185 v = aa+ai[i]; 1186 nz = ai[i+1] - ai[i]; 1187 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1188 if (rs>sctx.shift_top) sctx.shift_top = rs; 1189 } 1190 sctx.shift_top *= 1.1; 1191 sctx.nshift_max = 5; 1192 sctx.shift_lo = 0.; 1193 sctx.shift_hi = 1.; 1194 } 1195 1196 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1197 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1198 1199 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1200 ics = ic; 1201 1202 node_max = a->inode.node_count; 1203 ns = a->inode.size; 1204 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1205 1206 /* If max inode size > 4, split it into two inodes.*/ 1207 /* also map the inode sizes according to the ordering */ 1208 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1209 for (i=0,j=0; i<node_max; ++i,++j) { 1210 if (ns[i] > 4) { 1211 tmp_vec1[j] = 4; 1212 ++j; 1213 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1214 } else { 1215 tmp_vec1[j] = ns[i]; 1216 } 1217 } 1218 /* Use the correct node_max */ 1219 node_max = j; 1220 1221 /* Now reorder the inode info based on mat re-ordering info */ 1222 /* First create a row -> inode_size_array_index map */ 1223 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1224 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1225 for (i=0,row=0; i<node_max; i++) { 1226 nodesz = tmp_vec1[i]; 1227 for (j=0; j<nodesz; j++,row++) { 1228 nsmap[row] = i; 1229 } 1230 } 1231 /* Using nsmap, create a reordered ns structure */ 1232 for (i=0,j=0; i< node_max; i++) { 1233 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1234 tmp_vec2[i] = nodesz; 1235 j += nodesz; 1236 } 1237 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1238 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1239 1240 /* Now use the correct ns */ 1241 ns = tmp_vec2; 1242 1243 do { 1244 sctx.newshift = PETSC_FALSE; 1245 /* Now loop over each block-row, and do the factorization */ 1246 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1247 nodesz = ns[inod]; 1248 1249 switch (nodesz) { 1250 case 1: 1251 /*----------*/ 1252 /* zero rtmp1 */ 1253 /* L part */ 1254 nz = bi[i+1] - bi[i]; 1255 bjtmp = bj + bi[i]; 1256 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1257 1258 /* U part */ 1259 nz = bdiag[i]-bdiag[i+1]; 1260 bjtmp = bj + bdiag[i+1]+1; 1261 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1262 1263 /* load in initial (unfactored row) */ 1264 nz = ai[r[i]+1] - ai[r[i]]; 1265 ajtmp = aj + ai[r[i]]; 1266 v = aa + ai[r[i]]; 1267 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1268 1269 /* ZeropivotApply() */ 1270 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1271 1272 /* elimination */ 1273 bjtmp = bj + bi[i]; 1274 row = *bjtmp++; 1275 nzL = bi[i+1] - bi[i]; 1276 for (k=0; k < nzL; k++) { 1277 pc = rtmp1 + row; 1278 if (*pc != 0.0) { 1279 pv = b->a + bdiag[row]; 1280 mul1 = *pc * (*pv); 1281 *pc = mul1; 1282 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1283 pv = b->a + bdiag[row+1]+1; 1284 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1285 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1286 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1287 } 1288 row = *bjtmp++; 1289 } 1290 1291 /* finished row so stick it into b->a */ 1292 rs = 0.0; 1293 /* L part */ 1294 pv = b->a + bi[i]; 1295 pj = b->j + bi[i]; 1296 nz = bi[i+1] - bi[i]; 1297 for (j=0; j<nz; j++) { 1298 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1299 } 1300 1301 /* U part */ 1302 pv = b->a + bdiag[i+1]+1; 1303 pj = b->j + bdiag[i+1]+1; 1304 nz = bdiag[i] - bdiag[i+1]-1; 1305 for (j=0; j<nz; j++) { 1306 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1307 } 1308 1309 /* Check zero pivot */ 1310 sctx.rs = rs; 1311 sctx.pv = rtmp1[i]; 1312 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1313 if (sctx.newshift) break; 1314 1315 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1316 pv = b->a + bdiag[i]; 1317 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1318 break; 1319 1320 case 2: 1321 /*----------*/ 1322 /* zero rtmp1 and rtmp2 */ 1323 /* L part */ 1324 nz = bi[i+1] - bi[i]; 1325 bjtmp = bj + bi[i]; 1326 for (j=0; j<nz; j++) { 1327 col = bjtmp[j]; 1328 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1329 } 1330 1331 /* U part */ 1332 nz = bdiag[i]-bdiag[i+1]; 1333 bjtmp = bj + bdiag[i+1]+1; 1334 for (j=0; j<nz; j++) { 1335 col = bjtmp[j]; 1336 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1337 } 1338 1339 /* load in initial (unfactored row) */ 1340 nz = ai[r[i]+1] - ai[r[i]]; 1341 ajtmp = aj + ai[r[i]]; 1342 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1343 for (j=0; j<nz; j++) { 1344 col = ics[ajtmp[j]]; 1345 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1346 } 1347 /* ZeropivotApply(): shift the diagonal of the matrix */ 1348 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1349 1350 /* elimination */ 1351 bjtmp = bj + bi[i]; 1352 row = *bjtmp++; /* pivot row */ 1353 nzL = bi[i+1] - bi[i]; 1354 for (k=0; k < nzL; k++) { 1355 pc1 = rtmp1 + row; 1356 pc2 = rtmp2 + row; 1357 if (*pc1 != 0.0 || *pc2 != 0.0) { 1358 pv = b->a + bdiag[row]; 1359 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1360 *pc1 = mul1; *pc2 = mul2; 1361 1362 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1363 pv = b->a + bdiag[row+1]+1; 1364 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1365 for (j=0; j<nz; j++) { 1366 col = pj[j]; 1367 rtmp1[col] -= mul1 * pv[j]; 1368 rtmp2[col] -= mul2 * pv[j]; 1369 } 1370 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1371 } 1372 row = *bjtmp++; 1373 } 1374 1375 /* finished row i; check zero pivot, then stick row i into b->a */ 1376 rs = 0.0; 1377 /* L part */ 1378 pc1 = b->a + bi[i]; 1379 pj = b->j + bi[i]; 1380 nz = bi[i+1] - bi[i]; 1381 for (j=0; j<nz; j++) { 1382 col = pj[j]; 1383 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1384 } 1385 /* U part */ 1386 pc1 = b->a + bdiag[i+1]+1; 1387 pj = b->j + bdiag[i+1]+1; 1388 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1389 for (j=0; j<nz; j++) { 1390 col = pj[j]; 1391 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1392 } 1393 1394 sctx.rs = rs; 1395 sctx.pv = rtmp1[i]; 1396 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1397 if (sctx.newshift) break; 1398 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1399 *pc1 = 1.0/sctx.pv; 1400 1401 /* Now take care of diagonal 2x2 block. */ 1402 pc2 = rtmp2 + i; 1403 if (*pc2 != 0.0) { 1404 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1405 *pc2 = mul1; /* insert L entry */ 1406 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1407 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1408 for (j=0; j<nz; j++) { 1409 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1410 } 1411 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1412 } 1413 1414 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1415 rs = 0.0; 1416 /* L part */ 1417 pc2 = b->a + bi[i+1]; 1418 pj = b->j + bi[i+1]; 1419 nz = bi[i+2] - bi[i+1]; 1420 for (j=0; j<nz; j++) { 1421 col = pj[j]; 1422 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1423 } 1424 /* U part */ 1425 pc2 = b->a + bdiag[i+2]+1; 1426 pj = b->j + bdiag[i+2]+1; 1427 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1428 for (j=0; j<nz; j++) { 1429 col = pj[j]; 1430 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1431 } 1432 1433 sctx.rs = rs; 1434 sctx.pv = rtmp2[i+1]; 1435 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1436 if (sctx.newshift) break; 1437 pc2 = b->a + bdiag[i+1]; 1438 *pc2 = 1.0/sctx.pv; 1439 break; 1440 1441 case 3: 1442 /*----------*/ 1443 /* zero rtmp */ 1444 /* L part */ 1445 nz = bi[i+1] - bi[i]; 1446 bjtmp = bj + bi[i]; 1447 for (j=0; j<nz; j++) { 1448 col = bjtmp[j]; 1449 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1450 } 1451 1452 /* U part */ 1453 nz = bdiag[i]-bdiag[i+1]; 1454 bjtmp = bj + bdiag[i+1]+1; 1455 for (j=0; j<nz; j++) { 1456 col = bjtmp[j]; 1457 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1458 } 1459 1460 /* load in initial (unfactored row) */ 1461 nz = ai[r[i]+1] - ai[r[i]]; 1462 ajtmp = aj + ai[r[i]]; 1463 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1464 for (j=0; j<nz; j++) { 1465 col = ics[ajtmp[j]]; 1466 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1467 } 1468 /* ZeropivotApply(): shift the diagonal of the matrix */ 1469 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1470 1471 /* elimination */ 1472 bjtmp = bj + bi[i]; 1473 row = *bjtmp++; /* pivot row */ 1474 nzL = bi[i+1] - bi[i]; 1475 for (k=0; k < nzL; k++) { 1476 pc1 = rtmp1 + row; 1477 pc2 = rtmp2 + row; 1478 pc3 = rtmp3 + row; 1479 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1480 pv = b->a + bdiag[row]; 1481 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1482 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1483 1484 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1485 pv = b->a + bdiag[row+1]+1; 1486 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1487 for (j=0; j<nz; j++) { 1488 col = pj[j]; 1489 rtmp1[col] -= mul1 * pv[j]; 1490 rtmp2[col] -= mul2 * pv[j]; 1491 rtmp3[col] -= mul3 * pv[j]; 1492 } 1493 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1494 } 1495 row = *bjtmp++; 1496 } 1497 1498 /* finished row i; check zero pivot, then stick row i into b->a */ 1499 rs = 0.0; 1500 /* L part */ 1501 pc1 = b->a + bi[i]; 1502 pj = b->j + bi[i]; 1503 nz = bi[i+1] - bi[i]; 1504 for (j=0; j<nz; j++) { 1505 col = pj[j]; 1506 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1507 } 1508 /* U part */ 1509 pc1 = b->a + bdiag[i+1]+1; 1510 pj = b->j + bdiag[i+1]+1; 1511 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1512 for (j=0; j<nz; j++) { 1513 col = pj[j]; 1514 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1515 } 1516 1517 sctx.rs = rs; 1518 sctx.pv = rtmp1[i]; 1519 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1520 if (sctx.newshift) break; 1521 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1522 *pc1 = 1.0/sctx.pv; 1523 1524 /* Now take care of 1st column of diagonal 3x3 block. */ 1525 pc2 = rtmp2 + i; 1526 pc3 = rtmp3 + i; 1527 if (*pc2 != 0.0 || *pc3 != 0.0) { 1528 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1529 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1530 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1531 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1532 for (j=0; j<nz; j++) { 1533 col = pj[j]; 1534 rtmp2[col] -= mul2 * rtmp1[col]; 1535 rtmp3[col] -= mul3 * rtmp1[col]; 1536 } 1537 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1538 } 1539 1540 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1541 rs = 0.0; 1542 /* L part */ 1543 pc2 = b->a + bi[i+1]; 1544 pj = b->j + bi[i+1]; 1545 nz = bi[i+2] - bi[i+1]; 1546 for (j=0; j<nz; j++) { 1547 col = pj[j]; 1548 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1549 } 1550 /* U part */ 1551 pc2 = b->a + bdiag[i+2]+1; 1552 pj = b->j + bdiag[i+2]+1; 1553 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1554 for (j=0; j<nz; j++) { 1555 col = pj[j]; 1556 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1557 } 1558 1559 sctx.rs = rs; 1560 sctx.pv = rtmp2[i+1]; 1561 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1562 if (sctx.newshift) break; 1563 pc2 = b->a + bdiag[i+1]; 1564 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1565 1566 /* Now take care of 2nd column of diagonal 3x3 block. */ 1567 pc3 = rtmp3 + i+1; 1568 if (*pc3 != 0.0) { 1569 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1570 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1571 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1572 for (j=0; j<nz; j++) { 1573 col = pj[j]; 1574 rtmp3[col] -= mul3 * rtmp2[col]; 1575 } 1576 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1577 } 1578 1579 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1580 rs = 0.0; 1581 /* L part */ 1582 pc3 = b->a + bi[i+2]; 1583 pj = b->j + bi[i+2]; 1584 nz = bi[i+3] - bi[i+2]; 1585 for (j=0; j<nz; j++) { 1586 col = pj[j]; 1587 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1588 } 1589 /* U part */ 1590 pc3 = b->a + bdiag[i+3]+1; 1591 pj = b->j + bdiag[i+3]+1; 1592 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1593 for (j=0; j<nz; j++) { 1594 col = pj[j]; 1595 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1596 } 1597 1598 sctx.rs = rs; 1599 sctx.pv = rtmp3[i+2]; 1600 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1601 if (sctx.newshift) break; 1602 pc3 = b->a + bdiag[i+2]; 1603 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1604 break; 1605 case 4: 1606 /*----------*/ 1607 /* zero rtmp */ 1608 /* L part */ 1609 nz = bi[i+1] - bi[i]; 1610 bjtmp = bj + bi[i]; 1611 for (j=0; j<nz; j++) { 1612 col = bjtmp[j]; 1613 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1614 } 1615 1616 /* U part */ 1617 nz = bdiag[i]-bdiag[i+1]; 1618 bjtmp = bj + bdiag[i+1]+1; 1619 for (j=0; j<nz; j++) { 1620 col = bjtmp[j]; 1621 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1622 } 1623 1624 /* load in initial (unfactored row) */ 1625 nz = ai[r[i]+1] - ai[r[i]]; 1626 ajtmp = aj + ai[r[i]]; 1627 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1628 for (j=0; j<nz; j++) { 1629 col = ics[ajtmp[j]]; 1630 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1631 } 1632 /* ZeropivotApply(): shift the diagonal of the matrix */ 1633 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1634 1635 /* elimination */ 1636 bjtmp = bj + bi[i]; 1637 row = *bjtmp++; /* pivot row */ 1638 nzL = bi[i+1] - bi[i]; 1639 for (k=0; k < nzL; k++) { 1640 pc1 = rtmp1 + row; 1641 pc2 = rtmp2 + row; 1642 pc3 = rtmp3 + row; 1643 pc4 = rtmp4 + row; 1644 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1645 pv = b->a + bdiag[row]; 1646 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1647 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1648 1649 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1650 pv = b->a + bdiag[row+1]+1; 1651 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1652 for (j=0; j<nz; j++) { 1653 col = pj[j]; 1654 rtmp1[col] -= mul1 * pv[j]; 1655 rtmp2[col] -= mul2 * pv[j]; 1656 rtmp3[col] -= mul3 * pv[j]; 1657 rtmp4[col] -= mul4 * pv[j]; 1658 } 1659 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1660 } 1661 row = *bjtmp++; 1662 } 1663 1664 /* finished row i; check zero pivot, then stick row i into b->a */ 1665 rs = 0.0; 1666 /* L part */ 1667 pc1 = b->a + bi[i]; 1668 pj = b->j + bi[i]; 1669 nz = bi[i+1] - bi[i]; 1670 for (j=0; j<nz; j++) { 1671 col = pj[j]; 1672 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1673 } 1674 /* U part */ 1675 pc1 = b->a + bdiag[i+1]+1; 1676 pj = b->j + bdiag[i+1]+1; 1677 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1678 for (j=0; j<nz; j++) { 1679 col = pj[j]; 1680 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1681 } 1682 1683 sctx.rs = rs; 1684 sctx.pv = rtmp1[i]; 1685 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1686 if (sctx.newshift) break; 1687 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1688 *pc1 = 1.0/sctx.pv; 1689 1690 /* Now take care of 1st column of diagonal 4x4 block. */ 1691 pc2 = rtmp2 + i; 1692 pc3 = rtmp3 + i; 1693 pc4 = rtmp4 + i; 1694 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1695 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1696 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1697 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1698 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1699 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1700 for (j=0; j<nz; j++) { 1701 col = pj[j]; 1702 rtmp2[col] -= mul2 * rtmp1[col]; 1703 rtmp3[col] -= mul3 * rtmp1[col]; 1704 rtmp4[col] -= mul4 * rtmp1[col]; 1705 } 1706 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1707 } 1708 1709 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1710 rs = 0.0; 1711 /* L part */ 1712 pc2 = b->a + bi[i+1]; 1713 pj = b->j + bi[i+1]; 1714 nz = bi[i+2] - bi[i+1]; 1715 for (j=0; j<nz; j++) { 1716 col = pj[j]; 1717 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1718 } 1719 /* U part */ 1720 pc2 = b->a + bdiag[i+2]+1; 1721 pj = b->j + bdiag[i+2]+1; 1722 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1723 for (j=0; j<nz; j++) { 1724 col = pj[j]; 1725 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1726 } 1727 1728 sctx.rs = rs; 1729 sctx.pv = rtmp2[i+1]; 1730 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1731 if (sctx.newshift) break; 1732 pc2 = b->a + bdiag[i+1]; 1733 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1734 1735 /* Now take care of 2nd column of diagonal 4x4 block. */ 1736 pc3 = rtmp3 + i+1; 1737 pc4 = rtmp4 + i+1; 1738 if (*pc3 != 0.0 || *pc4 != 0.0) { 1739 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1740 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1741 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1742 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1743 for (j=0; j<nz; j++) { 1744 col = pj[j]; 1745 rtmp3[col] -= mul3 * rtmp2[col]; 1746 rtmp4[col] -= mul4 * rtmp2[col]; 1747 } 1748 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1749 } 1750 1751 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1752 rs = 0.0; 1753 /* L part */ 1754 pc3 = b->a + bi[i+2]; 1755 pj = b->j + bi[i+2]; 1756 nz = bi[i+3] - bi[i+2]; 1757 for (j=0; j<nz; j++) { 1758 col = pj[j]; 1759 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1760 } 1761 /* U part */ 1762 pc3 = b->a + bdiag[i+3]+1; 1763 pj = b->j + bdiag[i+3]+1; 1764 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1765 for (j=0; j<nz; j++) { 1766 col = pj[j]; 1767 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1768 } 1769 1770 sctx.rs = rs; 1771 sctx.pv = rtmp3[i+2]; 1772 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1773 if (sctx.newshift) break; 1774 pc3 = b->a + bdiag[i+2]; 1775 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1776 1777 /* Now take care of 3rd column of diagonal 4x4 block. */ 1778 pc4 = rtmp4 + i+2; 1779 if (*pc4 != 0.0) { 1780 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1781 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1782 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1783 for (j=0; j<nz; j++) { 1784 col = pj[j]; 1785 rtmp4[col] -= mul4 * rtmp3[col]; 1786 } 1787 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1788 } 1789 1790 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1791 rs = 0.0; 1792 /* L part */ 1793 pc4 = b->a + bi[i+3]; 1794 pj = b->j + bi[i+3]; 1795 nz = bi[i+4] - bi[i+3]; 1796 for (j=0; j<nz; j++) { 1797 col = pj[j]; 1798 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1799 } 1800 /* U part */ 1801 pc4 = b->a + bdiag[i+4]+1; 1802 pj = b->j + bdiag[i+4]+1; 1803 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1804 for (j=0; j<nz; j++) { 1805 col = pj[j]; 1806 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1807 } 1808 1809 sctx.rs = rs; 1810 sctx.pv = rtmp4[i+3]; 1811 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr); 1812 if (sctx.newshift) break; 1813 pc4 = b->a + bdiag[i+3]; 1814 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1815 break; 1816 1817 default: 1818 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1819 } 1820 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1821 i += nodesz; /* Update the row */ 1822 } 1823 1824 /* MatPivotRefine() */ 1825 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1826 /* 1827 * if no shift in this attempt & shifting & started shifting & can refine, 1828 * then try lower shift 1829 */ 1830 sctx.shift_hi = sctx.shift_fraction; 1831 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1832 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1833 sctx.newshift = PETSC_TRUE; 1834 sctx.nshift++; 1835 } 1836 } while (sctx.newshift); 1837 1838 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1839 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1840 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1841 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1842 1843 if (b->inode.size) { 1844 C->ops->solve = MatSolve_SeqAIJ_Inode; 1845 } else { 1846 C->ops->solve = MatSolve_SeqAIJ; 1847 } 1848 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1849 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1850 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1851 C->ops->matsolve = MatMatSolve_SeqAIJ; 1852 C->assembled = PETSC_TRUE; 1853 C->preallocated = PETSC_TRUE; 1854 1855 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1856 1857 /* MatShiftView(A,info,&sctx) */ 1858 if (sctx.nshift) { 1859 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1860 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1861 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1862 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1863 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1864 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1865 } 1866 } 1867 PetscFunctionReturn(0); 1868 } 1869 1870 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1871 { 1872 Mat C = B; 1873 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1874 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1875 PetscErrorCode ierr; 1876 const PetscInt *r,*ic,*c,*ics; 1877 PetscInt n = A->rmap->n,*bi = b->i; 1878 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1879 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1880 PetscInt *ai = a->i,*aj = a->j; 1881 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1882 PetscScalar mul1,mul2,mul3,tmp; 1883 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1884 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1885 PetscReal rs=0.0; 1886 FactorShiftCtx sctx; 1887 1888 PetscFunctionBegin; 1889 sctx.shift_top = 0; 1890 sctx.nshift_max = 0; 1891 sctx.shift_lo = 0; 1892 sctx.shift_hi = 0; 1893 sctx.shift_fraction = 0; 1894 1895 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1896 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1897 sctx.shift_top = 0; 1898 for (i=0; i<n; i++) { 1899 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1900 rs = 0.0; 1901 ajtmp = aj + ai[i]; 1902 rtmp1 = aa + ai[i]; 1903 nz = ai[i+1] - ai[i]; 1904 for (j=0; j<nz; j++) { 1905 if (*ajtmp != i) { 1906 rs += PetscAbsScalar(*rtmp1++); 1907 } else { 1908 rs -= PetscRealPart(*rtmp1++); 1909 } 1910 ajtmp++; 1911 } 1912 if (rs>sctx.shift_top) sctx.shift_top = rs; 1913 } 1914 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1915 sctx.shift_top *= 1.1; 1916 sctx.nshift_max = 5; 1917 sctx.shift_lo = 0.; 1918 sctx.shift_hi = 1.; 1919 } 1920 sctx.shift_amount = 0; 1921 sctx.nshift = 0; 1922 1923 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1924 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1925 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1926 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1927 ics = ic; 1928 1929 node_max = a->inode.node_count; 1930 ns = a->inode.size; 1931 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1932 1933 /* If max inode size > 3, split it into two inodes.*/ 1934 /* also map the inode sizes according to the ordering */ 1935 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1936 for (i=0,j=0; i<node_max; ++i,++j) { 1937 if (ns[i]>3) { 1938 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1939 ++j; 1940 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1941 } else { 1942 tmp_vec1[j] = ns[i]; 1943 } 1944 } 1945 /* Use the correct node_max */ 1946 node_max = j; 1947 1948 /* Now reorder the inode info based on mat re-ordering info */ 1949 /* First create a row -> inode_size_array_index map */ 1950 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1951 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1952 for (i=0,row=0; i<node_max; i++) { 1953 nodesz = tmp_vec1[i]; 1954 for (j=0; j<nodesz; j++,row++) { 1955 nsmap[row] = i; 1956 } 1957 } 1958 /* Using nsmap, create a reordered ns structure */ 1959 for (i=0,j=0; i< node_max; i++) { 1960 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1961 tmp_vec2[i] = nodesz; 1962 j += nodesz; 1963 } 1964 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1965 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1966 /* Now use the correct ns */ 1967 ns = tmp_vec2; 1968 1969 do { 1970 sctx.newshift = PETSC_FALSE; 1971 /* Now loop over each block-row, and do the factorization */ 1972 for (i=0,row=0; i<node_max; i++) { 1973 nodesz = ns[i]; 1974 nz = bi[row+1] - bi[row]; 1975 bjtmp = bj + bi[row]; 1976 1977 switch (nodesz) { 1978 case 1: 1979 for (j=0; j<nz; j++) { 1980 idx = bjtmp[j]; 1981 rtmp11[idx] = 0.0; 1982 } 1983 1984 /* load in initial (unfactored row) */ 1985 idx = r[row]; 1986 nz_tmp = ai[idx+1] - ai[idx]; 1987 ajtmp = aj + ai[idx]; 1988 v1 = aa + ai[idx]; 1989 1990 for (j=0; j<nz_tmp; j++) { 1991 idx = ics[ajtmp[j]]; 1992 rtmp11[idx] = v1[j]; 1993 } 1994 rtmp11[ics[r[row]]] += sctx.shift_amount; 1995 1996 prow = *bjtmp++; 1997 while (prow < row) { 1998 pc1 = rtmp11 + prow; 1999 if (*pc1 != 0.0) { 2000 pv = ba + bd[prow]; 2001 pj = nbj + bd[prow]; 2002 mul1 = *pc1 * *pv++; 2003 *pc1 = mul1; 2004 nz_tmp = bi[prow+1] - bd[prow] - 1; 2005 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2006 for (j=0; j<nz_tmp; j++) { 2007 tmp = pv[j]; 2008 idx = pj[j]; 2009 rtmp11[idx] -= mul1 * tmp; 2010 } 2011 } 2012 prow = *bjtmp++; 2013 } 2014 pj = bj + bi[row]; 2015 pc1 = ba + bi[row]; 2016 2017 sctx.pv = rtmp11[row]; 2018 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2019 rs = 0.0; 2020 for (j=0; j<nz; j++) { 2021 idx = pj[j]; 2022 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2023 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2024 } 2025 sctx.rs = rs; 2026 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2027 if (sctx.newshift) goto endofwhile; 2028 break; 2029 2030 case 2: 2031 for (j=0; j<nz; j++) { 2032 idx = bjtmp[j]; 2033 rtmp11[idx] = 0.0; 2034 rtmp22[idx] = 0.0; 2035 } 2036 2037 /* load in initial (unfactored row) */ 2038 idx = r[row]; 2039 nz_tmp = ai[idx+1] - ai[idx]; 2040 ajtmp = aj + ai[idx]; 2041 v1 = aa + ai[idx]; 2042 v2 = aa + ai[idx+1]; 2043 for (j=0; j<nz_tmp; j++) { 2044 idx = ics[ajtmp[j]]; 2045 rtmp11[idx] = v1[j]; 2046 rtmp22[idx] = v2[j]; 2047 } 2048 rtmp11[ics[r[row]]] += sctx.shift_amount; 2049 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2050 2051 prow = *bjtmp++; 2052 while (prow < row) { 2053 pc1 = rtmp11 + prow; 2054 pc2 = rtmp22 + prow; 2055 if (*pc1 != 0.0 || *pc2 != 0.0) { 2056 pv = ba + bd[prow]; 2057 pj = nbj + bd[prow]; 2058 mul1 = *pc1 * *pv; 2059 mul2 = *pc2 * *pv; 2060 ++pv; 2061 *pc1 = mul1; 2062 *pc2 = mul2; 2063 2064 nz_tmp = bi[prow+1] - bd[prow] - 1; 2065 for (j=0; j<nz_tmp; j++) { 2066 tmp = pv[j]; 2067 idx = pj[j]; 2068 rtmp11[idx] -= mul1 * tmp; 2069 rtmp22[idx] -= mul2 * tmp; 2070 } 2071 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2072 } 2073 prow = *bjtmp++; 2074 } 2075 2076 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2077 pc1 = rtmp11 + prow; 2078 pc2 = rtmp22 + prow; 2079 2080 sctx.pv = *pc1; 2081 pj = bj + bi[prow]; 2082 rs = 0.0; 2083 for (j=0; j<nz; j++) { 2084 idx = pj[j]; 2085 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2086 } 2087 sctx.rs = rs; 2088 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2089 if (sctx.newshift) goto endofwhile; 2090 2091 if (*pc2 != 0.0) { 2092 pj = nbj + bd[prow]; 2093 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2094 *pc2 = mul2; 2095 nz_tmp = bi[prow+1] - bd[prow] - 1; 2096 for (j=0; j<nz_tmp; j++) { 2097 idx = pj[j]; 2098 tmp = rtmp11[idx]; 2099 rtmp22[idx] -= mul2 * tmp; 2100 } 2101 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2102 } 2103 2104 pj = bj + bi[row]; 2105 pc1 = ba + bi[row]; 2106 pc2 = ba + bi[row+1]; 2107 2108 sctx.pv = rtmp22[row+1]; 2109 rs = 0.0; 2110 rtmp11[row] = 1.0/rtmp11[row]; 2111 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2112 /* copy row entries from dense representation to sparse */ 2113 for (j=0; j<nz; j++) { 2114 idx = pj[j]; 2115 pc1[j] = rtmp11[idx]; 2116 pc2[j] = rtmp22[idx]; 2117 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2118 } 2119 sctx.rs = rs; 2120 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2121 if (sctx.newshift) goto endofwhile; 2122 break; 2123 2124 case 3: 2125 for (j=0; j<nz; j++) { 2126 idx = bjtmp[j]; 2127 rtmp11[idx] = 0.0; 2128 rtmp22[idx] = 0.0; 2129 rtmp33[idx] = 0.0; 2130 } 2131 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2132 idx = r[row]; 2133 nz_tmp = ai[idx+1] - ai[idx]; 2134 ajtmp = aj + ai[idx]; 2135 v1 = aa + ai[idx]; 2136 v2 = aa + ai[idx+1]; 2137 v3 = aa + ai[idx+2]; 2138 for (j=0; j<nz_tmp; j++) { 2139 idx = ics[ajtmp[j]]; 2140 rtmp11[idx] = v1[j]; 2141 rtmp22[idx] = v2[j]; 2142 rtmp33[idx] = v3[j]; 2143 } 2144 rtmp11[ics[r[row]]] += sctx.shift_amount; 2145 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2146 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2147 2148 /* loop over all pivot row blocks above this row block */ 2149 prow = *bjtmp++; 2150 while (prow < row) { 2151 pc1 = rtmp11 + prow; 2152 pc2 = rtmp22 + prow; 2153 pc3 = rtmp33 + prow; 2154 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2155 pv = ba + bd[prow]; 2156 pj = nbj + bd[prow]; 2157 mul1 = *pc1 * *pv; 2158 mul2 = *pc2 * *pv; 2159 mul3 = *pc3 * *pv; 2160 ++pv; 2161 *pc1 = mul1; 2162 *pc2 = mul2; 2163 *pc3 = mul3; 2164 2165 nz_tmp = bi[prow+1] - bd[prow] - 1; 2166 /* update this row based on pivot row */ 2167 for (j=0; j<nz_tmp; j++) { 2168 tmp = pv[j]; 2169 idx = pj[j]; 2170 rtmp11[idx] -= mul1 * tmp; 2171 rtmp22[idx] -= mul2 * tmp; 2172 rtmp33[idx] -= mul3 * tmp; 2173 } 2174 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2175 } 2176 prow = *bjtmp++; 2177 } 2178 2179 /* Now take care of diagonal 3x3 block in this set of rows */ 2180 /* note: prow = row here */ 2181 pc1 = rtmp11 + prow; 2182 pc2 = rtmp22 + prow; 2183 pc3 = rtmp33 + prow; 2184 2185 sctx.pv = *pc1; 2186 pj = bj + bi[prow]; 2187 rs = 0.0; 2188 for (j=0; j<nz; j++) { 2189 idx = pj[j]; 2190 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2191 } 2192 sctx.rs = rs; 2193 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2194 if (sctx.newshift) goto endofwhile; 2195 2196 if (*pc2 != 0.0 || *pc3 != 0.0) { 2197 mul2 = (*pc2)/(*pc1); 2198 mul3 = (*pc3)/(*pc1); 2199 *pc2 = mul2; 2200 *pc3 = mul3; 2201 nz_tmp = bi[prow+1] - bd[prow] - 1; 2202 pj = nbj + bd[prow]; 2203 for (j=0; j<nz_tmp; j++) { 2204 idx = pj[j]; 2205 tmp = rtmp11[idx]; 2206 rtmp22[idx] -= mul2 * tmp; 2207 rtmp33[idx] -= mul3 * tmp; 2208 } 2209 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2210 } 2211 ++prow; 2212 2213 pc2 = rtmp22 + prow; 2214 pc3 = rtmp33 + prow; 2215 sctx.pv = *pc2; 2216 pj = bj + bi[prow]; 2217 rs = 0.0; 2218 for (j=0; j<nz; j++) { 2219 idx = pj[j]; 2220 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2221 } 2222 sctx.rs = rs; 2223 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2224 if (sctx.newshift) goto endofwhile; 2225 2226 if (*pc3 != 0.0) { 2227 mul3 = (*pc3)/(*pc2); 2228 *pc3 = mul3; 2229 pj = nbj + bd[prow]; 2230 nz_tmp = bi[prow+1] - bd[prow] - 1; 2231 for (j=0; j<nz_tmp; j++) { 2232 idx = pj[j]; 2233 tmp = rtmp22[idx]; 2234 rtmp33[idx] -= mul3 * tmp; 2235 } 2236 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2237 } 2238 2239 pj = bj + bi[row]; 2240 pc1 = ba + bi[row]; 2241 pc2 = ba + bi[row+1]; 2242 pc3 = ba + bi[row+2]; 2243 2244 sctx.pv = rtmp33[row+2]; 2245 rs = 0.0; 2246 rtmp11[row] = 1.0/rtmp11[row]; 2247 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2248 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2249 /* copy row entries from dense representation to sparse */ 2250 for (j=0; j<nz; j++) { 2251 idx = pj[j]; 2252 pc1[j] = rtmp11[idx]; 2253 pc2[j] = rtmp22[idx]; 2254 pc3[j] = rtmp33[idx]; 2255 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2256 } 2257 2258 sctx.rs = rs; 2259 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr); 2260 if (sctx.newshift) goto endofwhile; 2261 break; 2262 2263 default: 2264 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2265 } 2266 row += nodesz; /* Update the row */ 2267 } 2268 endofwhile:; 2269 } while (sctx.newshift); 2270 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2271 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2272 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2273 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2274 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2275 2276 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2277 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2278 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2279 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2280 C->assembled = PETSC_TRUE; 2281 C->preallocated = PETSC_TRUE; 2282 if (sctx.nshift) { 2283 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2284 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2285 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2286 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2287 } 2288 } 2289 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2290 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2291 PetscFunctionReturn(0); 2292 } 2293 2294 2295 /* ----------------------------------------------------------- */ 2296 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2297 { 2298 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2299 IS iscol = a->col,isrow = a->row; 2300 PetscErrorCode ierr; 2301 const PetscInt *r,*c,*rout,*cout; 2302 PetscInt i,j,n = A->rmap->n; 2303 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2304 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2305 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2306 PetscScalar sum1,sum2,sum3,sum4,sum5; 2307 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2308 const PetscScalar *b; 2309 2310 PetscFunctionBegin; 2311 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2312 node_max = a->inode.node_count; 2313 ns = a->inode.size; /* Node Size array */ 2314 2315 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2316 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2317 tmp = a->solve_work; 2318 2319 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2320 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2321 2322 /* forward solve the lower triangular */ 2323 tmps = tmp; 2324 aa = a_a; 2325 aj = a_j; 2326 ad = a->diag; 2327 2328 for (i = 0,row = 0; i< node_max; ++i) { 2329 nsz = ns[i]; 2330 aii = ai[row]; 2331 v1 = aa + aii; 2332 vi = aj + aii; 2333 nz = ai[row+1]- ai[row]; 2334 2335 if (i < node_max-1) { 2336 /* Prefetch the indices for the next block */ 2337 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2338 /* Prefetch the data for the next block */ 2339 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2340 } 2341 2342 switch (nsz) { /* Each loop in 'case' is unrolled */ 2343 case 1: 2344 sum1 = b[r[row]]; 2345 for (j=0; j<nz-1; j+=2) { 2346 i0 = vi[j]; 2347 i1 = vi[j+1]; 2348 tmp0 = tmps[i0]; 2349 tmp1 = tmps[i1]; 2350 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2351 } 2352 if (j == nz-1) { 2353 tmp0 = tmps[vi[j]]; 2354 sum1 -= v1[j]*tmp0; 2355 } 2356 tmp[row++]=sum1; 2357 break; 2358 case 2: 2359 sum1 = b[r[row]]; 2360 sum2 = b[r[row+1]]; 2361 v2 = aa + ai[row+1]; 2362 2363 for (j=0; j<nz-1; j+=2) { 2364 i0 = vi[j]; 2365 i1 = vi[j+1]; 2366 tmp0 = tmps[i0]; 2367 tmp1 = tmps[i1]; 2368 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2369 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2370 } 2371 if (j == nz-1) { 2372 tmp0 = tmps[vi[j]]; 2373 sum1 -= v1[j] *tmp0; 2374 sum2 -= v2[j] *tmp0; 2375 } 2376 sum2 -= v2[nz] * sum1; 2377 tmp[row++]=sum1; 2378 tmp[row++]=sum2; 2379 break; 2380 case 3: 2381 sum1 = b[r[row]]; 2382 sum2 = b[r[row+1]]; 2383 sum3 = b[r[row+2]]; 2384 v2 = aa + ai[row+1]; 2385 v3 = aa + ai[row+2]; 2386 2387 for (j=0; j<nz-1; j+=2) { 2388 i0 = vi[j]; 2389 i1 = vi[j+1]; 2390 tmp0 = tmps[i0]; 2391 tmp1 = tmps[i1]; 2392 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2393 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2394 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2395 } 2396 if (j == nz-1) { 2397 tmp0 = tmps[vi[j]]; 2398 sum1 -= v1[j] *tmp0; 2399 sum2 -= v2[j] *tmp0; 2400 sum3 -= v3[j] *tmp0; 2401 } 2402 sum2 -= v2[nz] * sum1; 2403 sum3 -= v3[nz] * sum1; 2404 sum3 -= v3[nz+1] * sum2; 2405 tmp[row++]=sum1; 2406 tmp[row++]=sum2; 2407 tmp[row++]=sum3; 2408 break; 2409 2410 case 4: 2411 sum1 = b[r[row]]; 2412 sum2 = b[r[row+1]]; 2413 sum3 = b[r[row+2]]; 2414 sum4 = b[r[row+3]]; 2415 v2 = aa + ai[row+1]; 2416 v3 = aa + ai[row+2]; 2417 v4 = aa + ai[row+3]; 2418 2419 for (j=0; j<nz-1; j+=2) { 2420 i0 = vi[j]; 2421 i1 = vi[j+1]; 2422 tmp0 = tmps[i0]; 2423 tmp1 = tmps[i1]; 2424 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2425 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2426 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2427 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2428 } 2429 if (j == nz-1) { 2430 tmp0 = tmps[vi[j]]; 2431 sum1 -= v1[j] *tmp0; 2432 sum2 -= v2[j] *tmp0; 2433 sum3 -= v3[j] *tmp0; 2434 sum4 -= v4[j] *tmp0; 2435 } 2436 sum2 -= v2[nz] * sum1; 2437 sum3 -= v3[nz] * sum1; 2438 sum4 -= v4[nz] * sum1; 2439 sum3 -= v3[nz+1] * sum2; 2440 sum4 -= v4[nz+1] * sum2; 2441 sum4 -= v4[nz+2] * sum3; 2442 2443 tmp[row++]=sum1; 2444 tmp[row++]=sum2; 2445 tmp[row++]=sum3; 2446 tmp[row++]=sum4; 2447 break; 2448 case 5: 2449 sum1 = b[r[row]]; 2450 sum2 = b[r[row+1]]; 2451 sum3 = b[r[row+2]]; 2452 sum4 = b[r[row+3]]; 2453 sum5 = b[r[row+4]]; 2454 v2 = aa + ai[row+1]; 2455 v3 = aa + ai[row+2]; 2456 v4 = aa + ai[row+3]; 2457 v5 = aa + ai[row+4]; 2458 2459 for (j=0; j<nz-1; j+=2) { 2460 i0 = vi[j]; 2461 i1 = vi[j+1]; 2462 tmp0 = tmps[i0]; 2463 tmp1 = tmps[i1]; 2464 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2465 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2466 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2467 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2468 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2469 } 2470 if (j == nz-1) { 2471 tmp0 = tmps[vi[j]]; 2472 sum1 -= v1[j] *tmp0; 2473 sum2 -= v2[j] *tmp0; 2474 sum3 -= v3[j] *tmp0; 2475 sum4 -= v4[j] *tmp0; 2476 sum5 -= v5[j] *tmp0; 2477 } 2478 2479 sum2 -= v2[nz] * sum1; 2480 sum3 -= v3[nz] * sum1; 2481 sum4 -= v4[nz] * sum1; 2482 sum5 -= v5[nz] * sum1; 2483 sum3 -= v3[nz+1] * sum2; 2484 sum4 -= v4[nz+1] * sum2; 2485 sum5 -= v5[nz+1] * sum2; 2486 sum4 -= v4[nz+2] * sum3; 2487 sum5 -= v5[nz+2] * sum3; 2488 sum5 -= v5[nz+3] * sum4; 2489 2490 tmp[row++]=sum1; 2491 tmp[row++]=sum2; 2492 tmp[row++]=sum3; 2493 tmp[row++]=sum4; 2494 tmp[row++]=sum5; 2495 break; 2496 default: 2497 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2498 } 2499 } 2500 /* backward solve the upper triangular */ 2501 for (i=node_max -1,row = n-1; i>=0; i--) { 2502 nsz = ns[i]; 2503 aii = ad[row+1] + 1; 2504 v1 = aa + aii; 2505 vi = aj + aii; 2506 nz = ad[row]- ad[row+1] - 1; 2507 2508 if (i > 0) { 2509 /* Prefetch the indices for the next block */ 2510 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2511 /* Prefetch the data for the next block */ 2512 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2513 } 2514 2515 switch (nsz) { /* Each loop in 'case' is unrolled */ 2516 case 1: 2517 sum1 = tmp[row]; 2518 2519 for (j=0; j<nz-1; j+=2) { 2520 i0 = vi[j]; 2521 i1 = vi[j+1]; 2522 tmp0 = tmps[i0]; 2523 tmp1 = tmps[i1]; 2524 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2525 } 2526 if (j == nz-1) { 2527 tmp0 = tmps[vi[j]]; 2528 sum1 -= v1[j]*tmp0; 2529 } 2530 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2531 break; 2532 case 2: 2533 sum1 = tmp[row]; 2534 sum2 = tmp[row-1]; 2535 v2 = aa + ad[row] + 1; 2536 for (j=0; j<nz-1; j+=2) { 2537 i0 = vi[j]; 2538 i1 = vi[j+1]; 2539 tmp0 = tmps[i0]; 2540 tmp1 = tmps[i1]; 2541 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2542 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2543 } 2544 if (j == nz-1) { 2545 tmp0 = tmps[vi[j]]; 2546 sum1 -= v1[j]* tmp0; 2547 sum2 -= v2[j+1]* tmp0; 2548 } 2549 2550 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2551 sum2 -= v2[0] * tmp0; 2552 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2553 break; 2554 case 3: 2555 sum1 = tmp[row]; 2556 sum2 = tmp[row -1]; 2557 sum3 = tmp[row -2]; 2558 v2 = aa + ad[row] + 1; 2559 v3 = aa + ad[row -1] + 1; 2560 for (j=0; j<nz-1; j+=2) { 2561 i0 = vi[j]; 2562 i1 = vi[j+1]; 2563 tmp0 = tmps[i0]; 2564 tmp1 = tmps[i1]; 2565 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2566 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2567 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2568 } 2569 if (j== nz-1) { 2570 tmp0 = tmps[vi[j]]; 2571 sum1 -= v1[j] * tmp0; 2572 sum2 -= v2[j+1] * tmp0; 2573 sum3 -= v3[j+2] * tmp0; 2574 } 2575 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2576 sum2 -= v2[0]* tmp0; 2577 sum3 -= v3[1] * tmp0; 2578 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2579 sum3 -= v3[0]* tmp0; 2580 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2581 2582 break; 2583 case 4: 2584 sum1 = tmp[row]; 2585 sum2 = tmp[row -1]; 2586 sum3 = tmp[row -2]; 2587 sum4 = tmp[row -3]; 2588 v2 = aa + ad[row]+1; 2589 v3 = aa + ad[row -1]+1; 2590 v4 = aa + ad[row -2]+1; 2591 2592 for (j=0; j<nz-1; j+=2) { 2593 i0 = vi[j]; 2594 i1 = vi[j+1]; 2595 tmp0 = tmps[i0]; 2596 tmp1 = tmps[i1]; 2597 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2598 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2599 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2600 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2601 } 2602 if (j== nz-1) { 2603 tmp0 = tmps[vi[j]]; 2604 sum1 -= v1[j] * tmp0; 2605 sum2 -= v2[j+1] * tmp0; 2606 sum3 -= v3[j+2] * tmp0; 2607 sum4 -= v4[j+3] * tmp0; 2608 } 2609 2610 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2611 sum2 -= v2[0] * tmp0; 2612 sum3 -= v3[1] * tmp0; 2613 sum4 -= v4[2] * tmp0; 2614 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2615 sum3 -= v3[0] * tmp0; 2616 sum4 -= v4[1] * tmp0; 2617 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2618 sum4 -= v4[0] * tmp0; 2619 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2620 break; 2621 case 5: 2622 sum1 = tmp[row]; 2623 sum2 = tmp[row -1]; 2624 sum3 = tmp[row -2]; 2625 sum4 = tmp[row -3]; 2626 sum5 = tmp[row -4]; 2627 v2 = aa + ad[row]+1; 2628 v3 = aa + ad[row -1]+1; 2629 v4 = aa + ad[row -2]+1; 2630 v5 = aa + ad[row -3]+1; 2631 for (j=0; j<nz-1; j+=2) { 2632 i0 = vi[j]; 2633 i1 = vi[j+1]; 2634 tmp0 = tmps[i0]; 2635 tmp1 = tmps[i1]; 2636 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2637 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2638 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2639 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2640 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2641 } 2642 if (j==nz-1) { 2643 tmp0 = tmps[vi[j]]; 2644 sum1 -= v1[j] * tmp0; 2645 sum2 -= v2[j+1] * tmp0; 2646 sum3 -= v3[j+2] * tmp0; 2647 sum4 -= v4[j+3] * tmp0; 2648 sum5 -= v5[j+4] * tmp0; 2649 } 2650 2651 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2652 sum2 -= v2[0] * tmp0; 2653 sum3 -= v3[1] * tmp0; 2654 sum4 -= v4[2] * tmp0; 2655 sum5 -= v5[3] * tmp0; 2656 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2657 sum3 -= v3[0] * tmp0; 2658 sum4 -= v4[1] * tmp0; 2659 sum5 -= v5[2] * tmp0; 2660 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2661 sum4 -= v4[0] * tmp0; 2662 sum5 -= v5[1] * tmp0; 2663 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2664 sum5 -= v5[0] * tmp0; 2665 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2666 break; 2667 default: 2668 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2669 } 2670 } 2671 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2672 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2673 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2674 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2675 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2676 PetscFunctionReturn(0); 2677 } 2678 2679 2680 /* 2681 Makes a longer coloring[] array and calls the usual code with that 2682 */ 2683 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2684 { 2685 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2686 PetscErrorCode ierr; 2687 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2688 PetscInt *colorused,i; 2689 ISColoringValue *newcolor; 2690 2691 PetscFunctionBegin; 2692 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2693 /* loop over inodes, marking a color for each column*/ 2694 row = 0; 2695 for (i=0; i<m; i++) { 2696 for (j=0; j<ns[i]; j++) { 2697 newcolor[row++] = coloring[i] + j*ncolors; 2698 } 2699 } 2700 2701 /* eliminate unneeded colors */ 2702 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2703 for (i=0; i<n; i++) { 2704 colorused[newcolor[i]] = 1; 2705 } 2706 2707 for (i=1; i<5*ncolors; i++) { 2708 colorused[i] += colorused[i-1]; 2709 } 2710 ncolors = colorused[5*ncolors-1]; 2711 for (i=0; i<n; i++) { 2712 newcolor[i] = colorused[newcolor[i]]-1; 2713 } 2714 ierr = PetscFree(colorused);CHKERRQ(ierr); 2715 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2716 ierr = PetscFree(coloring);CHKERRQ(ierr); 2717 PetscFunctionReturn(0); 2718 } 2719 2720 #include <petsc/private/kernels/blockinvert.h> 2721 2722 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2723 { 2724 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2725 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2726 MatScalar *ibdiag,*bdiag,work[25],*t; 2727 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2728 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2729 const PetscScalar *xb, *b; 2730 PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0; 2731 PetscErrorCode ierr; 2732 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2733 PetscInt sz,k,ipvt[5]; 2734 PetscBool allowzeropivot,zeropivotdetected; 2735 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2736 2737 PetscFunctionBegin; 2738 allowzeropivot = PetscNot(A->erroriffailure); 2739 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2740 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 2741 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2742 2743 if (!a->inode.ibdiagvalid) { 2744 if (!a->inode.ibdiag) { 2745 /* calculate space needed for diagonal blocks */ 2746 for (i=0; i<m; i++) { 2747 cnt += sizes[i]*sizes[i]; 2748 } 2749 a->inode.bdiagsize = cnt; 2750 2751 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2752 } 2753 2754 /* copy over the diagonal blocks and invert them */ 2755 ibdiag = a->inode.ibdiag; 2756 bdiag = a->inode.bdiag; 2757 cnt = 0; 2758 for (i=0, row = 0; i<m; i++) { 2759 for (j=0; j<sizes[i]; j++) { 2760 for (k=0; k<sizes[i]; k++) { 2761 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2762 } 2763 } 2764 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2765 2766 switch (sizes[i]) { 2767 case 1: 2768 /* Create matrix data structure */ 2769 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2770 if (allowzeropivot) { 2771 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2772 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2773 A->factorerror_zeropivot_row = row; 2774 ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr); 2775 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2776 } 2777 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2778 break; 2779 case 2: 2780 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2781 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2782 break; 2783 case 3: 2784 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2785 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2786 break; 2787 case 4: 2788 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2789 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2790 break; 2791 case 5: 2792 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2793 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2794 break; 2795 default: 2796 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2797 } 2798 cnt += sizes[i]*sizes[i]; 2799 row += sizes[i]; 2800 } 2801 a->inode.ibdiagvalid = PETSC_TRUE; 2802 } 2803 ibdiag = a->inode.ibdiag; 2804 bdiag = a->inode.bdiag; 2805 t = a->inode.ssor_work; 2806 2807 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2808 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2809 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2810 if (flag & SOR_ZERO_INITIAL_GUESS) { 2811 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2812 2813 for (i=0, row=0; i<m; i++) { 2814 sz = diag[row] - ii[row]; 2815 v1 = a->a + ii[row]; 2816 idx = a->j + ii[row]; 2817 2818 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2819 switch (sizes[i]) { 2820 case 1: 2821 2822 sum1 = b[row]; 2823 for (n = 0; n<sz-1; n+=2) { 2824 i1 = idx[0]; 2825 i2 = idx[1]; 2826 idx += 2; 2827 tmp0 = x[i1]; 2828 tmp1 = x[i2]; 2829 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2830 } 2831 2832 if (n == sz-1) { 2833 tmp0 = x[*idx]; 2834 sum1 -= *v1 * tmp0; 2835 } 2836 t[row] = sum1; 2837 x[row++] = sum1*(*ibdiag++); 2838 break; 2839 case 2: 2840 v2 = a->a + ii[row+1]; 2841 sum1 = b[row]; 2842 sum2 = b[row+1]; 2843 for (n = 0; n<sz-1; n+=2) { 2844 i1 = idx[0]; 2845 i2 = idx[1]; 2846 idx += 2; 2847 tmp0 = x[i1]; 2848 tmp1 = x[i2]; 2849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2850 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2851 } 2852 2853 if (n == sz-1) { 2854 tmp0 = x[*idx]; 2855 sum1 -= v1[0] * tmp0; 2856 sum2 -= v2[0] * tmp0; 2857 } 2858 t[row] = sum1; 2859 t[row+1] = sum2; 2860 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2861 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2862 ibdiag += 4; 2863 break; 2864 case 3: 2865 v2 = a->a + ii[row+1]; 2866 v3 = a->a + ii[row+2]; 2867 sum1 = b[row]; 2868 sum2 = b[row+1]; 2869 sum3 = b[row+2]; 2870 for (n = 0; n<sz-1; n+=2) { 2871 i1 = idx[0]; 2872 i2 = idx[1]; 2873 idx += 2; 2874 tmp0 = x[i1]; 2875 tmp1 = x[i2]; 2876 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2877 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2878 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2879 } 2880 2881 if (n == sz-1) { 2882 tmp0 = x[*idx]; 2883 sum1 -= v1[0] * tmp0; 2884 sum2 -= v2[0] * tmp0; 2885 sum3 -= v3[0] * tmp0; 2886 } 2887 t[row] = sum1; 2888 t[row+1] = sum2; 2889 t[row+2] = sum3; 2890 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2891 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2892 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2893 ibdiag += 9; 2894 break; 2895 case 4: 2896 v2 = a->a + ii[row+1]; 2897 v3 = a->a + ii[row+2]; 2898 v4 = a->a + ii[row+3]; 2899 sum1 = b[row]; 2900 sum2 = b[row+1]; 2901 sum3 = b[row+2]; 2902 sum4 = b[row+3]; 2903 for (n = 0; n<sz-1; n+=2) { 2904 i1 = idx[0]; 2905 i2 = idx[1]; 2906 idx += 2; 2907 tmp0 = x[i1]; 2908 tmp1 = x[i2]; 2909 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2910 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2911 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2912 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2913 } 2914 2915 if (n == sz-1) { 2916 tmp0 = x[*idx]; 2917 sum1 -= v1[0] * tmp0; 2918 sum2 -= v2[0] * tmp0; 2919 sum3 -= v3[0] * tmp0; 2920 sum4 -= v4[0] * tmp0; 2921 } 2922 t[row] = sum1; 2923 t[row+1] = sum2; 2924 t[row+2] = sum3; 2925 t[row+3] = sum4; 2926 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2927 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2928 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2929 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2930 ibdiag += 16; 2931 break; 2932 case 5: 2933 v2 = a->a + ii[row+1]; 2934 v3 = a->a + ii[row+2]; 2935 v4 = a->a + ii[row+3]; 2936 v5 = a->a + ii[row+4]; 2937 sum1 = b[row]; 2938 sum2 = b[row+1]; 2939 sum3 = b[row+2]; 2940 sum4 = b[row+3]; 2941 sum5 = b[row+4]; 2942 for (n = 0; n<sz-1; n+=2) { 2943 i1 = idx[0]; 2944 i2 = idx[1]; 2945 idx += 2; 2946 tmp0 = x[i1]; 2947 tmp1 = x[i2]; 2948 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2949 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2950 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2951 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2952 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2953 } 2954 2955 if (n == sz-1) { 2956 tmp0 = x[*idx]; 2957 sum1 -= v1[0] * tmp0; 2958 sum2 -= v2[0] * tmp0; 2959 sum3 -= v3[0] * tmp0; 2960 sum4 -= v4[0] * tmp0; 2961 sum5 -= v5[0] * tmp0; 2962 } 2963 t[row] = sum1; 2964 t[row+1] = sum2; 2965 t[row+2] = sum3; 2966 t[row+3] = sum4; 2967 t[row+4] = sum5; 2968 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2969 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2970 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2971 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2972 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2973 ibdiag += 25; 2974 break; 2975 default: 2976 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2977 } 2978 } 2979 2980 xb = t; 2981 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2982 } else xb = b; 2983 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 2984 2985 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2986 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2987 ibdiag -= sizes[i]*sizes[i]; 2988 sz = ii[row+1] - diag[row] - 1; 2989 v1 = a->a + diag[row] + 1; 2990 idx = a->j + diag[row] + 1; 2991 2992 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2993 switch (sizes[i]) { 2994 case 1: 2995 2996 sum1 = xb[row]; 2997 for (n = 0; n<sz-1; n+=2) { 2998 i1 = idx[0]; 2999 i2 = idx[1]; 3000 idx += 2; 3001 tmp0 = x[i1]; 3002 tmp1 = x[i2]; 3003 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3004 } 3005 3006 if (n == sz-1) { 3007 tmp0 = x[*idx]; 3008 sum1 -= *v1*tmp0; 3009 } 3010 x[row--] = sum1*(*ibdiag); 3011 break; 3012 3013 case 2: 3014 3015 sum1 = xb[row]; 3016 sum2 = xb[row-1]; 3017 /* note that sum1 is associated with the second of the two rows */ 3018 v2 = a->a + diag[row-1] + 2; 3019 for (n = 0; n<sz-1; n+=2) { 3020 i1 = idx[0]; 3021 i2 = idx[1]; 3022 idx += 2; 3023 tmp0 = x[i1]; 3024 tmp1 = x[i2]; 3025 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3026 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3027 } 3028 3029 if (n == sz-1) { 3030 tmp0 = x[*idx]; 3031 sum1 -= *v1*tmp0; 3032 sum2 -= *v2*tmp0; 3033 } 3034 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3035 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3036 break; 3037 case 3: 3038 3039 sum1 = xb[row]; 3040 sum2 = xb[row-1]; 3041 sum3 = xb[row-2]; 3042 v2 = a->a + diag[row-1] + 2; 3043 v3 = a->a + diag[row-2] + 3; 3044 for (n = 0; n<sz-1; n+=2) { 3045 i1 = idx[0]; 3046 i2 = idx[1]; 3047 idx += 2; 3048 tmp0 = x[i1]; 3049 tmp1 = x[i2]; 3050 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3051 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3052 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3053 } 3054 3055 if (n == sz-1) { 3056 tmp0 = x[*idx]; 3057 sum1 -= *v1*tmp0; 3058 sum2 -= *v2*tmp0; 3059 sum3 -= *v3*tmp0; 3060 } 3061 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3062 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3063 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3064 break; 3065 case 4: 3066 3067 sum1 = xb[row]; 3068 sum2 = xb[row-1]; 3069 sum3 = xb[row-2]; 3070 sum4 = xb[row-3]; 3071 v2 = a->a + diag[row-1] + 2; 3072 v3 = a->a + diag[row-2] + 3; 3073 v4 = a->a + diag[row-3] + 4; 3074 for (n = 0; n<sz-1; n+=2) { 3075 i1 = idx[0]; 3076 i2 = idx[1]; 3077 idx += 2; 3078 tmp0 = x[i1]; 3079 tmp1 = x[i2]; 3080 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3081 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3082 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3083 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3084 } 3085 3086 if (n == sz-1) { 3087 tmp0 = x[*idx]; 3088 sum1 -= *v1*tmp0; 3089 sum2 -= *v2*tmp0; 3090 sum3 -= *v3*tmp0; 3091 sum4 -= *v4*tmp0; 3092 } 3093 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3094 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3095 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3096 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3097 break; 3098 case 5: 3099 3100 sum1 = xb[row]; 3101 sum2 = xb[row-1]; 3102 sum3 = xb[row-2]; 3103 sum4 = xb[row-3]; 3104 sum5 = xb[row-4]; 3105 v2 = a->a + diag[row-1] + 2; 3106 v3 = a->a + diag[row-2] + 3; 3107 v4 = a->a + diag[row-3] + 4; 3108 v5 = a->a + diag[row-4] + 5; 3109 for (n = 0; n<sz-1; n+=2) { 3110 i1 = idx[0]; 3111 i2 = idx[1]; 3112 idx += 2; 3113 tmp0 = x[i1]; 3114 tmp1 = x[i2]; 3115 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3116 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3117 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3118 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3119 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3120 } 3121 3122 if (n == sz-1) { 3123 tmp0 = x[*idx]; 3124 sum1 -= *v1*tmp0; 3125 sum2 -= *v2*tmp0; 3126 sum3 -= *v3*tmp0; 3127 sum4 -= *v4*tmp0; 3128 sum5 -= *v5*tmp0; 3129 } 3130 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3131 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3132 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3133 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3134 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3135 break; 3136 default: 3137 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3138 } 3139 } 3140 3141 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3142 } 3143 its--; 3144 } 3145 while (its--) { 3146 3147 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3148 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3149 i<m; 3150 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3151 3152 sz = diag[row] - ii[row]; 3153 v1 = a->a + ii[row]; 3154 idx = a->j + ii[row]; 3155 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3156 switch (sizes[i]) { 3157 case 1: 3158 sum1 = b[row]; 3159 for (n = 0; n<sz-1; n+=2) { 3160 i1 = idx[0]; 3161 i2 = idx[1]; 3162 idx += 2; 3163 tmp0 = x[i1]; 3164 tmp1 = x[i2]; 3165 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3166 } 3167 if (n == sz-1) { 3168 tmp0 = x[*idx++]; 3169 sum1 -= *v1 * tmp0; 3170 v1++; 3171 } 3172 t[row] = sum1; 3173 sz = ii[row+1] - diag[row] - 1; 3174 idx = a->j + diag[row] + 1; 3175 v1 += 1; 3176 for (n = 0; n<sz-1; n+=2) { 3177 i1 = idx[0]; 3178 i2 = idx[1]; 3179 idx += 2; 3180 tmp0 = x[i1]; 3181 tmp1 = x[i2]; 3182 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3183 } 3184 if (n == sz-1) { 3185 tmp0 = x[*idx++]; 3186 sum1 -= *v1 * tmp0; 3187 } 3188 /* in MatSOR_SeqAIJ this line would be 3189 * 3190 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3191 * 3192 * but omega == 1, so this becomes 3193 * 3194 * x[row] = sum1*(*ibdiag++); 3195 * 3196 */ 3197 x[row] = sum1*(*ibdiag); 3198 break; 3199 case 2: 3200 v2 = a->a + ii[row+1]; 3201 sum1 = b[row]; 3202 sum2 = b[row+1]; 3203 for (n = 0; n<sz-1; n+=2) { 3204 i1 = idx[0]; 3205 i2 = idx[1]; 3206 idx += 2; 3207 tmp0 = x[i1]; 3208 tmp1 = x[i2]; 3209 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3210 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3211 } 3212 if (n == sz-1) { 3213 tmp0 = x[*idx++]; 3214 sum1 -= v1[0] * tmp0; 3215 sum2 -= v2[0] * tmp0; 3216 v1++; v2++; 3217 } 3218 t[row] = sum1; 3219 t[row+1] = sum2; 3220 sz = ii[row+1] - diag[row] - 2; 3221 idx = a->j + diag[row] + 2; 3222 v1 += 2; 3223 v2 += 2; 3224 for (n = 0; n<sz-1; n+=2) { 3225 i1 = idx[0]; 3226 i2 = idx[1]; 3227 idx += 2; 3228 tmp0 = x[i1]; 3229 tmp1 = x[i2]; 3230 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3231 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3232 } 3233 if (n == sz-1) { 3234 tmp0 = x[*idx]; 3235 sum1 -= v1[0] * tmp0; 3236 sum2 -= v2[0] * tmp0; 3237 } 3238 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3239 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3240 break; 3241 case 3: 3242 v2 = a->a + ii[row+1]; 3243 v3 = a->a + ii[row+2]; 3244 sum1 = b[row]; 3245 sum2 = b[row+1]; 3246 sum3 = b[row+2]; 3247 for (n = 0; n<sz-1; n+=2) { 3248 i1 = idx[0]; 3249 i2 = idx[1]; 3250 idx += 2; 3251 tmp0 = x[i1]; 3252 tmp1 = x[i2]; 3253 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3254 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3255 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3256 } 3257 if (n == sz-1) { 3258 tmp0 = x[*idx++]; 3259 sum1 -= v1[0] * tmp0; 3260 sum2 -= v2[0] * tmp0; 3261 sum3 -= v3[0] * tmp0; 3262 v1++; v2++; v3++; 3263 } 3264 t[row] = sum1; 3265 t[row+1] = sum2; 3266 t[row+2] = sum3; 3267 sz = ii[row+1] - diag[row] - 3; 3268 idx = a->j + diag[row] + 3; 3269 v1 += 3; 3270 v2 += 3; 3271 v3 += 3; 3272 for (n = 0; n<sz-1; n+=2) { 3273 i1 = idx[0]; 3274 i2 = idx[1]; 3275 idx += 2; 3276 tmp0 = x[i1]; 3277 tmp1 = x[i2]; 3278 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3279 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3280 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3281 } 3282 if (n == sz-1) { 3283 tmp0 = x[*idx]; 3284 sum1 -= v1[0] * tmp0; 3285 sum2 -= v2[0] * tmp0; 3286 sum3 -= v3[0] * tmp0; 3287 } 3288 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3289 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3290 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3291 break; 3292 case 4: 3293 v2 = a->a + ii[row+1]; 3294 v3 = a->a + ii[row+2]; 3295 v4 = a->a + ii[row+3]; 3296 sum1 = b[row]; 3297 sum2 = b[row+1]; 3298 sum3 = b[row+2]; 3299 sum4 = b[row+3]; 3300 for (n = 0; n<sz-1; n+=2) { 3301 i1 = idx[0]; 3302 i2 = idx[1]; 3303 idx += 2; 3304 tmp0 = x[i1]; 3305 tmp1 = x[i2]; 3306 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3307 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3308 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3309 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3310 } 3311 if (n == sz-1) { 3312 tmp0 = x[*idx++]; 3313 sum1 -= v1[0] * tmp0; 3314 sum2 -= v2[0] * tmp0; 3315 sum3 -= v3[0] * tmp0; 3316 sum4 -= v4[0] * tmp0; 3317 v1++; v2++; v3++; v4++; 3318 } 3319 t[row] = sum1; 3320 t[row+1] = sum2; 3321 t[row+2] = sum3; 3322 t[row+3] = sum4; 3323 sz = ii[row+1] - diag[row] - 4; 3324 idx = a->j + diag[row] + 4; 3325 v1 += 4; 3326 v2 += 4; 3327 v3 += 4; 3328 v4 += 4; 3329 for (n = 0; n<sz-1; n+=2) { 3330 i1 = idx[0]; 3331 i2 = idx[1]; 3332 idx += 2; 3333 tmp0 = x[i1]; 3334 tmp1 = x[i2]; 3335 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3336 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3337 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3338 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3339 } 3340 if (n == sz-1) { 3341 tmp0 = x[*idx]; 3342 sum1 -= v1[0] * tmp0; 3343 sum2 -= v2[0] * tmp0; 3344 sum3 -= v3[0] * tmp0; 3345 sum4 -= v4[0] * tmp0; 3346 } 3347 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3348 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3349 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3350 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3351 break; 3352 case 5: 3353 v2 = a->a + ii[row+1]; 3354 v3 = a->a + ii[row+2]; 3355 v4 = a->a + ii[row+3]; 3356 v5 = a->a + ii[row+4]; 3357 sum1 = b[row]; 3358 sum2 = b[row+1]; 3359 sum3 = b[row+2]; 3360 sum4 = b[row+3]; 3361 sum5 = b[row+4]; 3362 for (n = 0; n<sz-1; n+=2) { 3363 i1 = idx[0]; 3364 i2 = idx[1]; 3365 idx += 2; 3366 tmp0 = x[i1]; 3367 tmp1 = x[i2]; 3368 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3369 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3370 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3371 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3372 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3373 } 3374 if (n == sz-1) { 3375 tmp0 = x[*idx++]; 3376 sum1 -= v1[0] * tmp0; 3377 sum2 -= v2[0] * tmp0; 3378 sum3 -= v3[0] * tmp0; 3379 sum4 -= v4[0] * tmp0; 3380 sum5 -= v5[0] * tmp0; 3381 v1++; v2++; v3++; v4++; v5++; 3382 } 3383 t[row] = sum1; 3384 t[row+1] = sum2; 3385 t[row+2] = sum3; 3386 t[row+3] = sum4; 3387 t[row+4] = sum5; 3388 sz = ii[row+1] - diag[row] - 5; 3389 idx = a->j + diag[row] + 5; 3390 v1 += 5; 3391 v2 += 5; 3392 v3 += 5; 3393 v4 += 5; 3394 v5 += 5; 3395 for (n = 0; n<sz-1; n+=2) { 3396 i1 = idx[0]; 3397 i2 = idx[1]; 3398 idx += 2; 3399 tmp0 = x[i1]; 3400 tmp1 = x[i2]; 3401 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3402 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3403 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3404 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3405 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3406 } 3407 if (n == sz-1) { 3408 tmp0 = x[*idx]; 3409 sum1 -= v1[0] * tmp0; 3410 sum2 -= v2[0] * tmp0; 3411 sum3 -= v3[0] * tmp0; 3412 sum4 -= v4[0] * tmp0; 3413 sum5 -= v5[0] * tmp0; 3414 } 3415 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3416 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3417 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3418 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3419 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3420 break; 3421 default: 3422 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3423 } 3424 } 3425 xb = t; 3426 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3427 } else xb = b; 3428 3429 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3430 3431 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3432 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3433 ibdiag -= sizes[i]*sizes[i]; 3434 3435 /* set RHS */ 3436 if (xb == b) { 3437 /* whole (old way) */ 3438 sz = ii[row+1] - ii[row]; 3439 idx = a->j + ii[row]; 3440 switch (sizes[i]) { 3441 case 5: 3442 v5 = a->a + ii[row-4]; 3443 case 4: /* fall through */ 3444 v4 = a->a + ii[row-3]; 3445 case 3: 3446 v3 = a->a + ii[row-2]; 3447 case 2: 3448 v2 = a->a + ii[row-1]; 3449 case 1: 3450 v1 = a->a + ii[row]; 3451 break; 3452 default: 3453 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3454 } 3455 } else { 3456 /* upper, no diag */ 3457 sz = ii[row+1] - diag[row] - 1; 3458 idx = a->j + diag[row] + 1; 3459 switch (sizes[i]) { 3460 case 5: 3461 v5 = a->a + diag[row-4] + 5; 3462 case 4: /* fall through */ 3463 v4 = a->a + diag[row-3] + 4; 3464 case 3: 3465 v3 = a->a + diag[row-2] + 3; 3466 case 2: 3467 v2 = a->a + diag[row-1] + 2; 3468 case 1: 3469 v1 = a->a + diag[row] + 1; 3470 } 3471 } 3472 /* set sum */ 3473 switch (sizes[i]) { 3474 case 5: 3475 sum5 = xb[row-4]; 3476 case 4: /* fall through */ 3477 sum4 = xb[row-3]; 3478 case 3: 3479 sum3 = xb[row-2]; 3480 case 2: 3481 sum2 = xb[row-1]; 3482 case 1: 3483 /* note that sum1 is associated with the last row */ 3484 sum1 = xb[row]; 3485 } 3486 /* do sums */ 3487 for (n = 0; n<sz-1; n+=2) { 3488 i1 = idx[0]; 3489 i2 = idx[1]; 3490 idx += 2; 3491 tmp0 = x[i1]; 3492 tmp1 = x[i2]; 3493 switch (sizes[i]) { 3494 case 5: 3495 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3496 case 4: /* fall through */ 3497 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3498 case 3: 3499 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3500 case 2: 3501 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3502 case 1: 3503 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3504 } 3505 } 3506 /* ragged edge */ 3507 if (n == sz-1) { 3508 tmp0 = x[*idx]; 3509 switch (sizes[i]) { 3510 case 5: 3511 sum5 -= *v5*tmp0; 3512 case 4: /* fall through */ 3513 sum4 -= *v4*tmp0; 3514 case 3: 3515 sum3 -= *v3*tmp0; 3516 case 2: 3517 sum2 -= *v2*tmp0; 3518 case 1: 3519 sum1 -= *v1*tmp0; 3520 } 3521 } 3522 /* update */ 3523 if (xb == b) { 3524 /* whole (old way) w/ diag */ 3525 switch (sizes[i]) { 3526 case 5: 3527 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3528 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3529 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3530 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3531 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3532 break; 3533 case 4: 3534 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3535 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3536 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3537 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3538 break; 3539 case 3: 3540 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3541 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3542 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3543 break; 3544 case 2: 3545 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3546 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3547 break; 3548 case 1: 3549 x[row--] += sum1*(*ibdiag); 3550 break; 3551 } 3552 } else { 3553 /* no diag so set = */ 3554 switch (sizes[i]) { 3555 case 5: 3556 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3557 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3558 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3559 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3560 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3561 break; 3562 case 4: 3563 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3564 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3565 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3566 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3567 break; 3568 case 3: 3569 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3570 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3571 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3572 break; 3573 case 2: 3574 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3575 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3576 break; 3577 case 1: 3578 x[row--] = sum1*(*ibdiag); 3579 break; 3580 } 3581 } 3582 } 3583 if (xb == b) { 3584 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3585 } else { 3586 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3587 } 3588 } 3589 } 3590 if (flag & SOR_EISENSTAT) { 3591 /* 3592 Apply (U + D)^-1 where D is now the block diagonal 3593 */ 3594 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3595 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3596 ibdiag -= sizes[i]*sizes[i]; 3597 sz = ii[row+1] - diag[row] - 1; 3598 v1 = a->a + diag[row] + 1; 3599 idx = a->j + diag[row] + 1; 3600 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3601 switch (sizes[i]) { 3602 case 1: 3603 3604 sum1 = b[row]; 3605 for (n = 0; n<sz-1; n+=2) { 3606 i1 = idx[0]; 3607 i2 = idx[1]; 3608 idx += 2; 3609 tmp0 = x[i1]; 3610 tmp1 = x[i2]; 3611 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3612 } 3613 3614 if (n == sz-1) { 3615 tmp0 = x[*idx]; 3616 sum1 -= *v1*tmp0; 3617 } 3618 x[row] = sum1*(*ibdiag);row--; 3619 break; 3620 3621 case 2: 3622 3623 sum1 = b[row]; 3624 sum2 = b[row-1]; 3625 /* note that sum1 is associated with the second of the two rows */ 3626 v2 = a->a + diag[row-1] + 2; 3627 for (n = 0; n<sz-1; n+=2) { 3628 i1 = idx[0]; 3629 i2 = idx[1]; 3630 idx += 2; 3631 tmp0 = x[i1]; 3632 tmp1 = x[i2]; 3633 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3634 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3635 } 3636 3637 if (n == sz-1) { 3638 tmp0 = x[*idx]; 3639 sum1 -= *v1*tmp0; 3640 sum2 -= *v2*tmp0; 3641 } 3642 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3643 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3644 row -= 2; 3645 break; 3646 case 3: 3647 3648 sum1 = b[row]; 3649 sum2 = b[row-1]; 3650 sum3 = b[row-2]; 3651 v2 = a->a + diag[row-1] + 2; 3652 v3 = a->a + diag[row-2] + 3; 3653 for (n = 0; n<sz-1; n+=2) { 3654 i1 = idx[0]; 3655 i2 = idx[1]; 3656 idx += 2; 3657 tmp0 = x[i1]; 3658 tmp1 = x[i2]; 3659 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3660 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3661 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3662 } 3663 3664 if (n == sz-1) { 3665 tmp0 = x[*idx]; 3666 sum1 -= *v1*tmp0; 3667 sum2 -= *v2*tmp0; 3668 sum3 -= *v3*tmp0; 3669 } 3670 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3671 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3672 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3673 row -= 3; 3674 break; 3675 case 4: 3676 3677 sum1 = b[row]; 3678 sum2 = b[row-1]; 3679 sum3 = b[row-2]; 3680 sum4 = b[row-3]; 3681 v2 = a->a + diag[row-1] + 2; 3682 v3 = a->a + diag[row-2] + 3; 3683 v4 = a->a + diag[row-3] + 4; 3684 for (n = 0; n<sz-1; n+=2) { 3685 i1 = idx[0]; 3686 i2 = idx[1]; 3687 idx += 2; 3688 tmp0 = x[i1]; 3689 tmp1 = x[i2]; 3690 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3691 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3692 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3693 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3694 } 3695 3696 if (n == sz-1) { 3697 tmp0 = x[*idx]; 3698 sum1 -= *v1*tmp0; 3699 sum2 -= *v2*tmp0; 3700 sum3 -= *v3*tmp0; 3701 sum4 -= *v4*tmp0; 3702 } 3703 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3704 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3705 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3706 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3707 row -= 4; 3708 break; 3709 case 5: 3710 3711 sum1 = b[row]; 3712 sum2 = b[row-1]; 3713 sum3 = b[row-2]; 3714 sum4 = b[row-3]; 3715 sum5 = b[row-4]; 3716 v2 = a->a + diag[row-1] + 2; 3717 v3 = a->a + diag[row-2] + 3; 3718 v4 = a->a + diag[row-3] + 4; 3719 v5 = a->a + diag[row-4] + 5; 3720 for (n = 0; n<sz-1; n+=2) { 3721 i1 = idx[0]; 3722 i2 = idx[1]; 3723 idx += 2; 3724 tmp0 = x[i1]; 3725 tmp1 = x[i2]; 3726 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3727 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3728 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3729 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3730 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3731 } 3732 3733 if (n == sz-1) { 3734 tmp0 = x[*idx]; 3735 sum1 -= *v1*tmp0; 3736 sum2 -= *v2*tmp0; 3737 sum3 -= *v3*tmp0; 3738 sum4 -= *v4*tmp0; 3739 sum5 -= *v5*tmp0; 3740 } 3741 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3742 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3743 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3744 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3745 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3746 row -= 5; 3747 break; 3748 default: 3749 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3750 } 3751 } 3752 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3753 3754 /* 3755 t = b - D x where D is the block diagonal 3756 */ 3757 cnt = 0; 3758 for (i=0, row=0; i<m; i++) { 3759 switch (sizes[i]) { 3760 case 1: 3761 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3762 break; 3763 case 2: 3764 x1 = x[row]; x2 = x[row+1]; 3765 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3766 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3767 t[row] = b[row] - tmp1; 3768 t[row+1] = b[row+1] - tmp2; row += 2; 3769 cnt += 4; 3770 break; 3771 case 3: 3772 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3773 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3774 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3775 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3776 t[row] = b[row] - tmp1; 3777 t[row+1] = b[row+1] - tmp2; 3778 t[row+2] = b[row+2] - tmp3; row += 3; 3779 cnt += 9; 3780 break; 3781 case 4: 3782 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3783 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3784 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3785 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3786 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3787 t[row] = b[row] - tmp1; 3788 t[row+1] = b[row+1] - tmp2; 3789 t[row+2] = b[row+2] - tmp3; 3790 t[row+3] = b[row+3] - tmp4; row += 4; 3791 cnt += 16; 3792 break; 3793 case 5: 3794 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3795 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3796 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3797 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3798 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3799 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3800 t[row] = b[row] - tmp1; 3801 t[row+1] = b[row+1] - tmp2; 3802 t[row+2] = b[row+2] - tmp3; 3803 t[row+3] = b[row+3] - tmp4; 3804 t[row+4] = b[row+4] - tmp5;row += 5; 3805 cnt += 25; 3806 break; 3807 default: 3808 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3809 } 3810 } 3811 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3812 3813 3814 3815 /* 3816 Apply (L + D)^-1 where D is the block diagonal 3817 */ 3818 for (i=0, row=0; i<m; i++) { 3819 sz = diag[row] - ii[row]; 3820 v1 = a->a + ii[row]; 3821 idx = a->j + ii[row]; 3822 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3823 switch (sizes[i]) { 3824 case 1: 3825 3826 sum1 = t[row]; 3827 for (n = 0; n<sz-1; n+=2) { 3828 i1 = idx[0]; 3829 i2 = idx[1]; 3830 idx += 2; 3831 tmp0 = t[i1]; 3832 tmp1 = t[i2]; 3833 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3834 } 3835 3836 if (n == sz-1) { 3837 tmp0 = t[*idx]; 3838 sum1 -= *v1 * tmp0; 3839 } 3840 x[row] += t[row] = sum1*(*ibdiag++); row++; 3841 break; 3842 case 2: 3843 v2 = a->a + ii[row+1]; 3844 sum1 = t[row]; 3845 sum2 = t[row+1]; 3846 for (n = 0; n<sz-1; n+=2) { 3847 i1 = idx[0]; 3848 i2 = idx[1]; 3849 idx += 2; 3850 tmp0 = t[i1]; 3851 tmp1 = t[i2]; 3852 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3853 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3854 } 3855 3856 if (n == sz-1) { 3857 tmp0 = t[*idx]; 3858 sum1 -= v1[0] * tmp0; 3859 sum2 -= v2[0] * tmp0; 3860 } 3861 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3862 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3863 ibdiag += 4; row += 2; 3864 break; 3865 case 3: 3866 v2 = a->a + ii[row+1]; 3867 v3 = a->a + ii[row+2]; 3868 sum1 = t[row]; 3869 sum2 = t[row+1]; 3870 sum3 = t[row+2]; 3871 for (n = 0; n<sz-1; n+=2) { 3872 i1 = idx[0]; 3873 i2 = idx[1]; 3874 idx += 2; 3875 tmp0 = t[i1]; 3876 tmp1 = t[i2]; 3877 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3878 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3879 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3880 } 3881 3882 if (n == sz-1) { 3883 tmp0 = t[*idx]; 3884 sum1 -= v1[0] * tmp0; 3885 sum2 -= v2[0] * tmp0; 3886 sum3 -= v3[0] * tmp0; 3887 } 3888 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3889 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3890 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3891 ibdiag += 9; row += 3; 3892 break; 3893 case 4: 3894 v2 = a->a + ii[row+1]; 3895 v3 = a->a + ii[row+2]; 3896 v4 = a->a + ii[row+3]; 3897 sum1 = t[row]; 3898 sum2 = t[row+1]; 3899 sum3 = t[row+2]; 3900 sum4 = t[row+3]; 3901 for (n = 0; n<sz-1; n+=2) { 3902 i1 = idx[0]; 3903 i2 = idx[1]; 3904 idx += 2; 3905 tmp0 = t[i1]; 3906 tmp1 = t[i2]; 3907 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3908 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3909 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3910 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3911 } 3912 3913 if (n == sz-1) { 3914 tmp0 = t[*idx]; 3915 sum1 -= v1[0] * tmp0; 3916 sum2 -= v2[0] * tmp0; 3917 sum3 -= v3[0] * tmp0; 3918 sum4 -= v4[0] * tmp0; 3919 } 3920 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3921 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3922 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3923 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3924 ibdiag += 16; row += 4; 3925 break; 3926 case 5: 3927 v2 = a->a + ii[row+1]; 3928 v3 = a->a + ii[row+2]; 3929 v4 = a->a + ii[row+3]; 3930 v5 = a->a + ii[row+4]; 3931 sum1 = t[row]; 3932 sum2 = t[row+1]; 3933 sum3 = t[row+2]; 3934 sum4 = t[row+3]; 3935 sum5 = t[row+4]; 3936 for (n = 0; n<sz-1; n+=2) { 3937 i1 = idx[0]; 3938 i2 = idx[1]; 3939 idx += 2; 3940 tmp0 = t[i1]; 3941 tmp1 = t[i2]; 3942 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3943 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3944 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3945 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3946 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3947 } 3948 3949 if (n == sz-1) { 3950 tmp0 = t[*idx]; 3951 sum1 -= v1[0] * tmp0; 3952 sum2 -= v2[0] * tmp0; 3953 sum3 -= v3[0] * tmp0; 3954 sum4 -= v4[0] * tmp0; 3955 sum5 -= v5[0] * tmp0; 3956 } 3957 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3958 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3959 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3960 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3961 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3962 ibdiag += 25; row += 5; 3963 break; 3964 default: 3965 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3966 } 3967 } 3968 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3969 } 3970 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3971 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3972 PetscFunctionReturn(0); 3973 } 3974 3975 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3976 { 3977 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3978 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3979 const MatScalar *bdiag = a->inode.bdiag; 3980 const PetscScalar *b; 3981 PetscErrorCode ierr; 3982 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3983 const PetscInt *sizes = a->inode.size; 3984 3985 PetscFunctionBegin; 3986 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3987 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3988 cnt = 0; 3989 for (i=0, row=0; i<m; i++) { 3990 switch (sizes[i]) { 3991 case 1: 3992 x[row] = b[row]*bdiag[cnt++];row++; 3993 break; 3994 case 2: 3995 x1 = b[row]; x2 = b[row+1]; 3996 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3997 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3998 x[row++] = tmp1; 3999 x[row++] = tmp2; 4000 cnt += 4; 4001 break; 4002 case 3: 4003 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4004 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4005 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4006 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4007 x[row++] = tmp1; 4008 x[row++] = tmp2; 4009 x[row++] = tmp3; 4010 cnt += 9; 4011 break; 4012 case 4: 4013 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4014 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4015 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4016 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4017 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4018 x[row++] = tmp1; 4019 x[row++] = tmp2; 4020 x[row++] = tmp3; 4021 x[row++] = tmp4; 4022 cnt += 16; 4023 break; 4024 case 5: 4025 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4026 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4027 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4028 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4029 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4030 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4031 x[row++] = tmp1; 4032 x[row++] = tmp2; 4033 x[row++] = tmp3; 4034 x[row++] = tmp4; 4035 x[row++] = tmp5; 4036 cnt += 25; 4037 break; 4038 default: 4039 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4040 } 4041 } 4042 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4043 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4044 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4045 PetscFunctionReturn(0); 4046 } 4047 4048 /* 4049 samestructure indicates that the matrix has not changed its nonzero structure so we 4050 do not need to recompute the inodes 4051 */ 4052 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4053 { 4054 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4055 PetscErrorCode ierr; 4056 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4057 PetscBool flag; 4058 const PetscInt *idx,*idy,*ii; 4059 4060 PetscFunctionBegin; 4061 if (!a->inode.use) PetscFunctionReturn(0); 4062 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4063 4064 m = A->rmap->n; 4065 if (a->inode.size) ns = a->inode.size; 4066 else { 4067 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4068 } 4069 4070 i = 0; 4071 node_count = 0; 4072 idx = a->j; 4073 ii = a->i; 4074 while (i < m) { /* For each row */ 4075 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4076 /* Limits the number of elements in a node to 'a->inode.limit' */ 4077 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4078 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4079 if (nzy != nzx) break; 4080 idy += nzx; /* Same nonzero pattern */ 4081 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4082 if (!flag) break; 4083 } 4084 ns[node_count++] = blk_size; 4085 idx += blk_size*nzx; 4086 i = j; 4087 } 4088 /* If not enough inodes found,, do not use inode version of the routines */ 4089 if (!m || node_count > .8*m) { 4090 ierr = PetscFree(ns);CHKERRQ(ierr); 4091 4092 a->inode.node_count = 0; 4093 a->inode.size = NULL; 4094 a->inode.use = PETSC_FALSE; 4095 A->ops->mult = MatMult_SeqAIJ; 4096 A->ops->sor = MatSOR_SeqAIJ; 4097 A->ops->multadd = MatMultAdd_SeqAIJ; 4098 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4099 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4100 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4101 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4102 A->ops->coloringpatch = 0; 4103 A->ops->multdiagonalblock = 0; 4104 4105 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4106 } else { 4107 if (!A->factortype) { 4108 A->ops->mult = MatMult_SeqAIJ_Inode; 4109 A->ops->sor = MatSOR_SeqAIJ_Inode; 4110 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4111 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4112 if (A->rmap->n == A->cmap->n) { 4113 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4114 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4115 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4116 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4117 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4118 } 4119 } else { 4120 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4121 } 4122 a->inode.node_count = node_count; 4123 a->inode.size = ns; 4124 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4125 } 4126 a->inode.checked = PETSC_TRUE; 4127 a->inode.mat_nonzerostate = A->nonzerostate; 4128 PetscFunctionReturn(0); 4129 } 4130 4131 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4132 { 4133 Mat B =*C; 4134 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4135 PetscErrorCode ierr; 4136 PetscInt m=A->rmap->n; 4137 4138 PetscFunctionBegin; 4139 c->inode.use = a->inode.use; 4140 c->inode.limit = a->inode.limit; 4141 c->inode.max_limit = a->inode.max_limit; 4142 if (a->inode.size) { 4143 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4144 c->inode.node_count = a->inode.node_count; 4145 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4146 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4147 if (!B->factortype) { 4148 B->ops->mult = MatMult_SeqAIJ_Inode; 4149 B->ops->sor = MatSOR_SeqAIJ_Inode; 4150 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4151 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4152 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4153 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4154 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4155 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4156 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4157 } else { 4158 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4159 } 4160 } else { 4161 c->inode.size = 0; 4162 c->inode.node_count = 0; 4163 } 4164 c->inode.ibdiagvalid = PETSC_FALSE; 4165 c->inode.ibdiag = 0; 4166 c->inode.bdiag = 0; 4167 PetscFunctionReturn(0); 4168 } 4169 4170 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4171 { 4172 PetscInt k; 4173 const PetscInt *vi; 4174 4175 PetscFunctionBegin; 4176 vi = aj + ai[row]; 4177 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4178 vi = aj + adiag[row]; 4179 cols[nzl] = vi[0]; 4180 vi = aj + adiag[row+1]+1; 4181 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4182 PetscFunctionReturn(0); 4183 } 4184 /* 4185 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4186 Modified from MatSeqAIJCheckInode(). 4187 4188 Input Parameters: 4189 . Mat A - ILU or LU matrix factor 4190 4191 */ 4192 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4193 { 4194 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4195 PetscErrorCode ierr; 4196 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4197 PetscInt *cols1,*cols2,*ns; 4198 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4199 PetscBool flag; 4200 4201 PetscFunctionBegin; 4202 if (!a->inode.use) PetscFunctionReturn(0); 4203 if (a->inode.checked) PetscFunctionReturn(0); 4204 4205 m = A->rmap->n; 4206 if (a->inode.size) ns = a->inode.size; 4207 else { 4208 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4209 } 4210 4211 i = 0; 4212 node_count = 0; 4213 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4214 while (i < m) { /* For each row */ 4215 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4216 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4217 nzx = nzl1 + nzu1 + 1; 4218 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4219 4220 /* Limits the number of elements in a node to 'a->inode.limit' */ 4221 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4222 nzl2 = ai[j+1] - ai[j]; 4223 nzu2 = adiag[j] - adiag[j+1] - 1; 4224 nzy = nzl2 + nzu2 + 1; 4225 if (nzy != nzx) break; 4226 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4227 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4228 if (!flag) break; 4229 } 4230 ns[node_count++] = blk_size; 4231 i = j; 4232 } 4233 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4234 /* If not enough inodes found,, do not use inode version of the routines */ 4235 if (!m || node_count > .8*m) { 4236 ierr = PetscFree(ns);CHKERRQ(ierr); 4237 4238 a->inode.node_count = 0; 4239 a->inode.size = NULL; 4240 a->inode.use = PETSC_FALSE; 4241 4242 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4243 } else { 4244 A->ops->mult = 0; 4245 A->ops->sor = 0; 4246 A->ops->multadd = 0; 4247 A->ops->getrowij = 0; 4248 A->ops->restorerowij = 0; 4249 A->ops->getcolumnij = 0; 4250 A->ops->restorecolumnij = 0; 4251 A->ops->coloringpatch = 0; 4252 A->ops->multdiagonalblock = 0; 4253 a->inode.node_count = node_count; 4254 a->inode.size = ns; 4255 4256 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4257 } 4258 a->inode.checked = PETSC_TRUE; 4259 PetscFunctionReturn(0); 4260 } 4261 4262 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4263 { 4264 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4265 4266 PetscFunctionBegin; 4267 a->inode.ibdiagvalid = PETSC_FALSE; 4268 PetscFunctionReturn(0); 4269 } 4270 4271 /* 4272 This is really ugly. if inodes are used this replaces the 4273 permutations with ones that correspond to rows/cols of the matrix 4274 rather then inode blocks 4275 */ 4276 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4277 { 4278 PetscErrorCode ierr; 4279 4280 PetscFunctionBegin; 4281 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4282 PetscFunctionReturn(0); 4283 } 4284 4285 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4286 { 4287 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4288 PetscErrorCode ierr; 4289 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4290 const PetscInt *ridx,*cidx; 4291 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4292 PetscInt nslim_col,*ns_col; 4293 IS ris = *rperm,cis = *cperm; 4294 4295 PetscFunctionBegin; 4296 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4297 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4298 4299 ierr = MatCreateColInode_Private(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4300 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4301 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4302 4303 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4304 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4305 4306 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4307 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4308 4309 /* Construct the permutations for rows*/ 4310 for (i=0,row = 0; i<nslim_row; ++i) { 4311 indx = ridx[i]; 4312 start_val = tns[indx]; 4313 end_val = tns[indx + 1]; 4314 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4315 } 4316 4317 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4318 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4319 4320 /* Construct permutations for columns */ 4321 for (i=0,col=0; i<nslim_col; ++i) { 4322 indx = cidx[i]; 4323 start_val = tns[indx]; 4324 end_val = tns[indx + 1]; 4325 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4326 } 4327 4328 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4329 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4330 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4331 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4332 4333 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4334 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4335 4336 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4337 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4338 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4339 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4340 ierr = PetscFree(tns);CHKERRQ(ierr); 4341 PetscFunctionReturn(0); 4342 } 4343 4344 /*@C 4345 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4346 4347 Not Collective 4348 4349 Input Parameter: 4350 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4351 4352 Output Parameter: 4353 + node_count - no of inodes present in the matrix. 4354 . sizes - an array of size node_count,with sizes of each inode. 4355 - limit - the max size used to generate the inodes. 4356 4357 Level: advanced 4358 4359 Notes: This routine returns some internal storage information 4360 of the matrix, it is intended to be used by advanced users. 4361 It should be called after the matrix is assembled. 4362 The contents of the sizes[] array should not be changed. 4363 NULL may be passed for information not requested. 4364 4365 .keywords: matrix, seqaij, get, inode 4366 4367 .seealso: MatGetInfo() 4368 @*/ 4369 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4370 { 4371 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4372 4373 PetscFunctionBegin; 4374 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4375 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4376 if (f) { 4377 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4378 } 4379 PetscFunctionReturn(0); 4380 } 4381 4382 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4383 { 4384 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4385 4386 PetscFunctionBegin; 4387 if (node_count) *node_count = a->inode.node_count; 4388 if (sizes) *sizes = a->inode.size; 4389 if (limit) *limit = a->inode.limit; 4390 PetscFunctionReturn(0); 4391 } 4392