1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 67 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1) { 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 if (j==jmax) continue; /* empty row */ 101 col = *j++ + ishift; 102 i2 = tvc[col]; 103 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 104 ia[i1+1]++; 105 ia[i2+1]++; 106 i2++; /* Start col of next node */ 107 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 108 i2 = tvc[col]; 109 } 110 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 111 } 112 113 /* shift ia[i] to point to next row */ 114 for (i1=1; i1<nslim_row+1; i1++) { 115 row = ia[i1-1]; 116 ia[i1] += row; 117 work[i1-1] = row - oshift; 118 } 119 120 /* allocate space for column pointers */ 121 nz = ia[nslim_row] + (!ishift); 122 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 123 *jja = ja; 124 125 /* loop over lower triangular part putting into ja */ 126 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 127 j = aj + ai[row] + ishift; 128 jmax = aj + ai[row+1] + ishift; 129 if (j==jmax) continue; /* empty row */ 130 col = *j++ + ishift; 131 i2 = tvc[col]; 132 while (i2<i1 && j<jmax) { 133 ja[work[i2]++] = i1 + oshift; 134 ja[work[i1]++] = i2 + oshift; 135 ++i2; 136 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 137 i2 = tvc[col]; 138 } 139 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 140 141 } 142 ierr = PetscFree(work);CHKERRQ(ierr); 143 ierr = PetscFree(tns);CHKERRQ(ierr); 144 ierr = PetscFree(tvc);CHKERRQ(ierr); 145 PetscFunctionReturn(0); 146 } 147 148 /* 149 This builds nonsymmetric version of nonzero structure, 150 */ 151 #undef __FUNCT__ 152 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 153 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 154 { 155 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 156 PetscErrorCode ierr; 157 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 158 PetscInt *tns,*tvc,nsz,i1,i2; 159 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1) { 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 176 } 177 /* allocate space for row pointers */ 178 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 179 *iia = ia; 180 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 181 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 182 183 /* determine the number of columns in each row */ 184 ia[0] = oshift; 185 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 186 j = aj + ai[row] + ishift; 187 nz = ai[row+1] - ai[row]; 188 if (!nz) continue; /* empty row */ 189 col = *j++ + ishift; 190 i2 = tvc[col]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 nz = ai[row+1] - ai[row]; 215 if (!nz) continue; /* empty row */ 216 col = *j++ + ishift; 217 i2 = tvc[col]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 PetscFunctionReturn(0); 268 } 269 270 /* ----------------------------------------------------------- */ 271 272 #undef __FUNCT__ 273 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 274 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 275 { 276 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 277 PetscErrorCode ierr; 278 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 279 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 280 281 PetscFunctionBegin; 282 nslim_row = a->inode.node_count; 283 n = A->cmap->n; 284 285 /* Create The column_inode for this matrix */ 286 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 287 288 /* allocate space for reformated column_inode structure */ 289 ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr); 290 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 291 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 292 293 for (i1=0,col=0; i1<nslim_col; ++i1) { 294 nsz = ns_col[i1]; 295 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 296 } 297 /* allocate space for column pointers */ 298 ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 299 *iia = ia; 300 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 301 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 302 303 /* determine the number of columns in each row */ 304 ia[0] = oshift; 305 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 306 j = aj + ai[row] + ishift; 307 col = *j++ + ishift; 308 i2 = tvc[col]; 309 nz = ai[row+1] - ai[row]; 310 while (nz-- > 0) { /* off-diagonal elemets */ 311 /* ia[i1+1]++; */ 312 ia[i2+1]++; 313 i2++; 314 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 315 if (nz > 0) i2 = tvc[col]; 316 } 317 } 318 319 /* shift ia[i] to point to next col */ 320 for (i1=1; i1<nslim_col+1; i1++) { 321 col = ia[i1-1]; 322 ia[i1] += col; 323 work[i1-1] = col - oshift; 324 } 325 326 /* allocate space for column pointers */ 327 nz = ia[nslim_col] + (!ishift); 328 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 329 *jja = ja; 330 331 /* loop over matrix putting into ja */ 332 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 333 j = aj + ai[row] + ishift; 334 col = *j++ + ishift; 335 i2 = tvc[col]; 336 nz = ai[row+1] - ai[row]; 337 while (nz-- > 0) { 338 /* ja[work[i1]++] = i2 + oshift; */ 339 ja[work[i2]++] = i1 + oshift; 340 i2++; 341 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 342 if (nz > 0) i2 = tvc[col]; 343 } 344 } 345 ierr = PetscFree(ns_col);CHKERRQ(ierr); 346 ierr = PetscFree(work);CHKERRQ(ierr); 347 ierr = PetscFree(tns);CHKERRQ(ierr); 348 ierr = PetscFree(tvc);CHKERRQ(ierr); 349 PetscFunctionReturn(0); 350 } 351 352 #undef __FUNCT__ 353 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 354 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 355 { 356 PetscErrorCode ierr; 357 358 PetscFunctionBegin; 359 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 360 if (!ia) PetscFunctionReturn(0); 361 362 if (!blockcompressed) { 363 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 364 } else if (symmetric) { 365 /* Since the indices are symmetric it does'nt matter */ 366 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 367 } else { 368 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 369 } 370 PetscFunctionReturn(0); 371 } 372 373 #undef __FUNCT__ 374 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 375 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 376 { 377 PetscErrorCode ierr; 378 379 PetscFunctionBegin; 380 if (!ia) PetscFunctionReturn(0); 381 if (!blockcompressed) { 382 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 383 } else { 384 ierr = PetscFree(*ia);CHKERRQ(ierr); 385 ierr = PetscFree(*ja);CHKERRQ(ierr); 386 } 387 PetscFunctionReturn(0); 388 } 389 390 /* ----------------------------------------------------------- */ 391 392 #undef __FUNCT__ 393 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 394 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 395 { 396 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 397 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 398 PetscScalar *y; 399 const PetscScalar *x; 400 const MatScalar *v1,*v2,*v3,*v4,*v5; 401 PetscErrorCode ierr; 402 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 403 const PetscInt *idx,*ns,*ii; 404 405 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 406 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 407 #endif 408 409 PetscFunctionBegin; 410 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 411 node_max = a->inode.node_count; 412 ns = a->inode.size; /* Node Size array */ 413 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 414 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 415 idx = a->j; 416 v1 = a->a; 417 ii = a->i; 418 419 for (i = 0,row = 0; i< node_max; ++i) { 420 nsz = ns[i]; 421 n = ii[1] - ii[0]; 422 nonzerorow += (n>0)*nsz; 423 ii += nsz; 424 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 425 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 426 sz = n; /* No of non zeros in this row */ 427 /* Switch on the size of Node */ 428 switch (nsz) { /* Each loop in 'case' is unrolled */ 429 case 1: 430 sum1 = 0.; 431 432 for (n = 0; n< sz-1; n+=2) { 433 i1 = idx[0]; /* The instructions are ordered to */ 434 i2 = idx[1]; /* make the compiler's job easy */ 435 idx += 2; 436 tmp0 = x[i1]; 437 tmp1 = x[i2]; 438 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 439 } 440 441 if (n == sz-1) { /* Take care of the last nonzero */ 442 tmp0 = x[*idx++]; 443 sum1 += *v1++ *tmp0; 444 } 445 y[row++]=sum1; 446 break; 447 case 2: 448 sum1 = 0.; 449 sum2 = 0.; 450 v2 = v1 + n; 451 452 for (n = 0; n< sz-1; n+=2) { 453 i1 = idx[0]; 454 i2 = idx[1]; 455 idx += 2; 456 tmp0 = x[i1]; 457 tmp1 = x[i2]; 458 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 459 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 460 } 461 if (n == sz-1) { 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 } 466 y[row++]=sum1; 467 y[row++]=sum2; 468 v1 =v2; /* Since the next block to be processed starts there*/ 469 idx +=sz; 470 break; 471 case 3: 472 sum1 = 0.; 473 sum2 = 0.; 474 sum3 = 0.; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 478 for (n = 0; n< sz-1; n+=2) { 479 i1 = idx[0]; 480 i2 = idx[1]; 481 idx += 2; 482 tmp0 = x[i1]; 483 tmp1 = x[i2]; 484 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 485 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 487 } 488 if (n == sz-1) { 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 } 494 y[row++]=sum1; 495 y[row++]=sum2; 496 y[row++]=sum3; 497 v1 =v3; /* Since the next block to be processed starts there*/ 498 idx +=2*sz; 499 break; 500 case 4: 501 sum1 = 0.; 502 sum2 = 0.; 503 sum3 = 0.; 504 sum4 = 0.; 505 v2 = v1 + n; 506 v3 = v2 + n; 507 v4 = v3 + n; 508 509 for (n = 0; n< sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 } 520 if (n == sz-1) { 521 tmp0 = x[*idx++]; 522 sum1 += *v1++ * tmp0; 523 sum2 += *v2++ * tmp0; 524 sum3 += *v3++ * tmp0; 525 sum4 += *v4++ * tmp0; 526 } 527 y[row++]=sum1; 528 y[row++]=sum2; 529 y[row++]=sum3; 530 y[row++]=sum4; 531 v1 =v4; /* Since the next block to be processed starts there*/ 532 idx +=3*sz; 533 break; 534 case 5: 535 sum1 = 0.; 536 sum2 = 0.; 537 sum3 = 0.; 538 sum4 = 0.; 539 sum5 = 0.; 540 v2 = v1 + n; 541 v3 = v2 + n; 542 v4 = v3 + n; 543 v5 = v4 + n; 544 545 for (n = 0; n<sz-1; n+=2) { 546 i1 = idx[0]; 547 i2 = idx[1]; 548 idx += 2; 549 tmp0 = x[i1]; 550 tmp1 = x[i2]; 551 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 552 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 553 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 554 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 555 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 556 } 557 if (n == sz-1) { 558 tmp0 = x[*idx++]; 559 sum1 += *v1++ * tmp0; 560 sum2 += *v2++ * tmp0; 561 sum3 += *v3++ * tmp0; 562 sum4 += *v4++ * tmp0; 563 sum5 += *v5++ * tmp0; 564 } 565 y[row++]=sum1; 566 y[row++]=sum2; 567 y[row++]=sum3; 568 y[row++]=sum4; 569 y[row++]=sum5; 570 v1 =v5; /* Since the next block to be processed starts there */ 571 idx +=4*sz; 572 break; 573 default: 574 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 575 } 576 } 577 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 578 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 /* ----------------------------------------------------------- */ 583 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 586 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 590 const MatScalar *v1,*v2,*v3,*v4,*v5; 591 const PetscScalar *x; 592 PetscScalar *y,*z,*zt; 593 PetscErrorCode ierr; 594 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 595 const PetscInt *idx,*ns,*ii; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 602 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 603 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 604 zt = z; 605 606 idx = a->j; 607 v1 = a->a; 608 ii = a->i; 609 610 for (i = 0,row = 0; i< node_max; ++i) { 611 nsz = ns[i]; 612 n = ii[1] - ii[0]; 613 ii += nsz; 614 sz = n; /* No of non zeros in this row */ 615 /* Switch on the size of Node */ 616 switch (nsz) { /* Each loop in 'case' is unrolled */ 617 case 1: 618 sum1 = *zt++; 619 620 for (n = 0; n< sz-1; n+=2) { 621 i1 = idx[0]; /* The instructions are ordered to */ 622 i2 = idx[1]; /* make the compiler's job easy */ 623 idx += 2; 624 tmp0 = x[i1]; 625 tmp1 = x[i2]; 626 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 627 } 628 629 if (n == sz-1) { /* Take care of the last nonzero */ 630 tmp0 = x[*idx++]; 631 sum1 += *v1++ * tmp0; 632 } 633 y[row++]=sum1; 634 break; 635 case 2: 636 sum1 = *zt++; 637 sum2 = *zt++; 638 v2 = v1 + n; 639 640 for (n = 0; n< sz-1; n+=2) { 641 i1 = idx[0]; 642 i2 = idx[1]; 643 idx += 2; 644 tmp0 = x[i1]; 645 tmp1 = x[i2]; 646 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 647 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 648 } 649 if (n == sz-1) { 650 tmp0 = x[*idx++]; 651 sum1 += *v1++ * tmp0; 652 sum2 += *v2++ * tmp0; 653 } 654 y[row++]=sum1; 655 y[row++]=sum2; 656 v1 =v2; /* Since the next block to be processed starts there*/ 657 idx +=sz; 658 break; 659 case 3: 660 sum1 = *zt++; 661 sum2 = *zt++; 662 sum3 = *zt++; 663 v2 = v1 + n; 664 v3 = v2 + n; 665 666 for (n = 0; n< sz-1; n+=2) { 667 i1 = idx[0]; 668 i2 = idx[1]; 669 idx += 2; 670 tmp0 = x[i1]; 671 tmp1 = x[i2]; 672 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 673 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 674 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 675 } 676 if (n == sz-1) { 677 tmp0 = x[*idx++]; 678 sum1 += *v1++ * tmp0; 679 sum2 += *v2++ * tmp0; 680 sum3 += *v3++ * tmp0; 681 } 682 y[row++]=sum1; 683 y[row++]=sum2; 684 y[row++]=sum3; 685 v1 =v3; /* Since the next block to be processed starts there*/ 686 idx +=2*sz; 687 break; 688 case 4: 689 sum1 = *zt++; 690 sum2 = *zt++; 691 sum3 = *zt++; 692 sum4 = *zt++; 693 v2 = v1 + n; 694 v3 = v2 + n; 695 v4 = v3 + n; 696 697 for (n = 0; n< sz-1; n+=2) { 698 i1 = idx[0]; 699 i2 = idx[1]; 700 idx += 2; 701 tmp0 = x[i1]; 702 tmp1 = x[i2]; 703 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 704 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 705 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 706 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 707 } 708 if (n == sz-1) { 709 tmp0 = x[*idx++]; 710 sum1 += *v1++ * tmp0; 711 sum2 += *v2++ * tmp0; 712 sum3 += *v3++ * tmp0; 713 sum4 += *v4++ * tmp0; 714 } 715 y[row++]=sum1; 716 y[row++]=sum2; 717 y[row++]=sum3; 718 y[row++]=sum4; 719 v1 =v4; /* Since the next block to be processed starts there*/ 720 idx +=3*sz; 721 break; 722 case 5: 723 sum1 = *zt++; 724 sum2 = *zt++; 725 sum3 = *zt++; 726 sum4 = *zt++; 727 sum5 = *zt++; 728 v2 = v1 + n; 729 v3 = v2 + n; 730 v4 = v3 + n; 731 v5 = v4 + n; 732 733 for (n = 0; n<sz-1; n+=2) { 734 i1 = idx[0]; 735 i2 = idx[1]; 736 idx += 2; 737 tmp0 = x[i1]; 738 tmp1 = x[i2]; 739 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 740 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 741 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 742 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 743 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 744 } 745 if (n == sz-1) { 746 tmp0 = x[*idx++]; 747 sum1 += *v1++ * tmp0; 748 sum2 += *v2++ * tmp0; 749 sum3 += *v3++ * tmp0; 750 sum4 += *v4++ * tmp0; 751 sum5 += *v5++ * tmp0; 752 } 753 y[row++]=sum1; 754 y[row++]=sum2; 755 y[row++]=sum3; 756 y[row++]=sum4; 757 y[row++]=sum5; 758 v1 =v5; /* Since the next block to be processed starts there */ 759 idx +=4*sz; 760 break; 761 default: 762 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 763 } 764 } 765 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 766 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 767 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 768 PetscFunctionReturn(0); 769 } 770 771 /* ----------------------------------------------------------- */ 772 #undef __FUNCT__ 773 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 774 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 775 { 776 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 777 IS iscol = a->col,isrow = a->row; 778 PetscErrorCode ierr; 779 const PetscInt *r,*c,*rout,*cout; 780 PetscInt i,j,n = A->rmap->n,nz; 781 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 782 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 783 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 784 PetscScalar sum1,sum2,sum3,sum4,sum5; 785 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 786 const PetscScalar *b; 787 788 PetscFunctionBegin; 789 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 790 node_max = a->inode.node_count; 791 ns = a->inode.size; /* Node Size array */ 792 793 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 794 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 795 tmp = a->solve_work; 796 797 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 798 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 799 800 /* forward solve the lower triangular */ 801 tmps = tmp; 802 aa = a_a; 803 aj = a_j; 804 ad = a->diag; 805 806 for (i = 0,row = 0; i< node_max; ++i) { 807 nsz = ns[i]; 808 aii = ai[row]; 809 v1 = aa + aii; 810 vi = aj + aii; 811 nz = ad[row]- aii; 812 if (i < node_max-1) { 813 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 814 * but our indexing to determine it's size could. */ 815 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 816 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 817 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 818 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 819 } 820 821 switch (nsz) { /* Each loop in 'case' is unrolled */ 822 case 1: 823 sum1 = b[*r++]; 824 for (j=0; j<nz-1; j+=2) { 825 i0 = vi[0]; 826 i1 = vi[1]; 827 vi +=2; 828 tmp0 = tmps[i0]; 829 tmp1 = tmps[i1]; 830 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 831 } 832 if (j == nz-1) { 833 tmp0 = tmps[*vi++]; 834 sum1 -= *v1++ *tmp0; 835 } 836 tmp[row++]=sum1; 837 break; 838 case 2: 839 sum1 = b[*r++]; 840 sum2 = b[*r++]; 841 v2 = aa + ai[row+1]; 842 843 for (j=0; j<nz-1; j+=2) { 844 i0 = vi[0]; 845 i1 = vi[1]; 846 vi +=2; 847 tmp0 = tmps[i0]; 848 tmp1 = tmps[i1]; 849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 850 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 851 } 852 if (j == nz-1) { 853 tmp0 = tmps[*vi++]; 854 sum1 -= *v1++ *tmp0; 855 sum2 -= *v2++ *tmp0; 856 } 857 sum2 -= *v2++ *sum1; 858 tmp[row++]=sum1; 859 tmp[row++]=sum2; 860 break; 861 case 3: 862 sum1 = b[*r++]; 863 sum2 = b[*r++]; 864 sum3 = b[*r++]; 865 v2 = aa + ai[row+1]; 866 v3 = aa + ai[row+2]; 867 868 for (j=0; j<nz-1; j+=2) { 869 i0 = vi[0]; 870 i1 = vi[1]; 871 vi +=2; 872 tmp0 = tmps[i0]; 873 tmp1 = tmps[i1]; 874 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 875 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 876 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 877 } 878 if (j == nz-1) { 879 tmp0 = tmps[*vi++]; 880 sum1 -= *v1++ *tmp0; 881 sum2 -= *v2++ *tmp0; 882 sum3 -= *v3++ *tmp0; 883 } 884 sum2 -= *v2++ * sum1; 885 sum3 -= *v3++ * sum1; 886 sum3 -= *v3++ * sum2; 887 888 tmp[row++]=sum1; 889 tmp[row++]=sum2; 890 tmp[row++]=sum3; 891 break; 892 893 case 4: 894 sum1 = b[*r++]; 895 sum2 = b[*r++]; 896 sum3 = b[*r++]; 897 sum4 = b[*r++]; 898 v2 = aa + ai[row+1]; 899 v3 = aa + ai[row+2]; 900 v4 = aa + ai[row+3]; 901 902 for (j=0; j<nz-1; j+=2) { 903 i0 = vi[0]; 904 i1 = vi[1]; 905 vi +=2; 906 tmp0 = tmps[i0]; 907 tmp1 = tmps[i1]; 908 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 909 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 910 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 911 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 912 } 913 if (j == nz-1) { 914 tmp0 = tmps[*vi++]; 915 sum1 -= *v1++ *tmp0; 916 sum2 -= *v2++ *tmp0; 917 sum3 -= *v3++ *tmp0; 918 sum4 -= *v4++ *tmp0; 919 } 920 sum2 -= *v2++ * sum1; 921 sum3 -= *v3++ * sum1; 922 sum4 -= *v4++ * sum1; 923 sum3 -= *v3++ * sum2; 924 sum4 -= *v4++ * sum2; 925 sum4 -= *v4++ * sum3; 926 927 tmp[row++]=sum1; 928 tmp[row++]=sum2; 929 tmp[row++]=sum3; 930 tmp[row++]=sum4; 931 break; 932 case 5: 933 sum1 = b[*r++]; 934 sum2 = b[*r++]; 935 sum3 = b[*r++]; 936 sum4 = b[*r++]; 937 sum5 = b[*r++]; 938 v2 = aa + ai[row+1]; 939 v3 = aa + ai[row+2]; 940 v4 = aa + ai[row+3]; 941 v5 = aa + ai[row+4]; 942 943 for (j=0; j<nz-1; j+=2) { 944 i0 = vi[0]; 945 i1 = vi[1]; 946 vi +=2; 947 tmp0 = tmps[i0]; 948 tmp1 = tmps[i1]; 949 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 950 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 951 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 952 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 953 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 954 } 955 if (j == nz-1) { 956 tmp0 = tmps[*vi++]; 957 sum1 -= *v1++ *tmp0; 958 sum2 -= *v2++ *tmp0; 959 sum3 -= *v3++ *tmp0; 960 sum4 -= *v4++ *tmp0; 961 sum5 -= *v5++ *tmp0; 962 } 963 964 sum2 -= *v2++ * sum1; 965 sum3 -= *v3++ * sum1; 966 sum4 -= *v4++ * sum1; 967 sum5 -= *v5++ * sum1; 968 sum3 -= *v3++ * sum2; 969 sum4 -= *v4++ * sum2; 970 sum5 -= *v5++ * sum2; 971 sum4 -= *v4++ * sum3; 972 sum5 -= *v5++ * sum3; 973 sum5 -= *v5++ * sum4; 974 975 tmp[row++]=sum1; 976 tmp[row++]=sum2; 977 tmp[row++]=sum3; 978 tmp[row++]=sum4; 979 tmp[row++]=sum5; 980 break; 981 default: 982 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 983 } 984 } 985 /* backward solve the upper triangular */ 986 for (i=node_max -1,row = n-1; i>=0; i--) { 987 nsz = ns[i]; 988 aii = ai[row+1] -1; 989 v1 = aa + aii; 990 vi = aj + aii; 991 nz = aii- ad[row]; 992 switch (nsz) { /* Each loop in 'case' is unrolled */ 993 case 1: 994 sum1 = tmp[row]; 995 996 for (j=nz; j>1; j-=2) { 997 vi -=2; 998 i0 = vi[2]; 999 i1 = vi[1]; 1000 tmp0 = tmps[i0]; 1001 tmp1 = tmps[i1]; 1002 v1 -= 2; 1003 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1004 } 1005 if (j==1) { 1006 tmp0 = tmps[*vi--]; 1007 sum1 -= *v1-- * tmp0; 1008 } 1009 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1010 break; 1011 case 2: 1012 sum1 = tmp[row]; 1013 sum2 = tmp[row -1]; 1014 v2 = aa + ai[row]-1; 1015 for (j=nz; j>1; j-=2) { 1016 vi -=2; 1017 i0 = vi[2]; 1018 i1 = vi[1]; 1019 tmp0 = tmps[i0]; 1020 tmp1 = tmps[i1]; 1021 v1 -= 2; 1022 v2 -= 2; 1023 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1024 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1025 } 1026 if (j==1) { 1027 tmp0 = tmps[*vi--]; 1028 sum1 -= *v1-- * tmp0; 1029 sum2 -= *v2-- * tmp0; 1030 } 1031 1032 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1033 sum2 -= *v2-- * tmp0; 1034 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1035 break; 1036 case 3: 1037 sum1 = tmp[row]; 1038 sum2 = tmp[row -1]; 1039 sum3 = tmp[row -2]; 1040 v2 = aa + ai[row]-1; 1041 v3 = aa + ai[row -1]-1; 1042 for (j=nz; j>1; j-=2) { 1043 vi -=2; 1044 i0 = vi[2]; 1045 i1 = vi[1]; 1046 tmp0 = tmps[i0]; 1047 tmp1 = tmps[i1]; 1048 v1 -= 2; 1049 v2 -= 2; 1050 v3 -= 2; 1051 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1052 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1053 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1054 } 1055 if (j==1) { 1056 tmp0 = tmps[*vi--]; 1057 sum1 -= *v1-- * tmp0; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 } 1061 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1062 sum2 -= *v2-- * tmp0; 1063 sum3 -= *v3-- * tmp0; 1064 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1065 sum3 -= *v3-- * tmp0; 1066 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1067 1068 break; 1069 case 4: 1070 sum1 = tmp[row]; 1071 sum2 = tmp[row -1]; 1072 sum3 = tmp[row -2]; 1073 sum4 = tmp[row -3]; 1074 v2 = aa + ai[row]-1; 1075 v3 = aa + ai[row -1]-1; 1076 v4 = aa + ai[row -2]-1; 1077 1078 for (j=nz; j>1; j-=2) { 1079 vi -=2; 1080 i0 = vi[2]; 1081 i1 = vi[1]; 1082 tmp0 = tmps[i0]; 1083 tmp1 = tmps[i1]; 1084 v1 -= 2; 1085 v2 -= 2; 1086 v3 -= 2; 1087 v4 -= 2; 1088 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1089 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1090 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1091 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1092 } 1093 if (j==1) { 1094 tmp0 = tmps[*vi--]; 1095 sum1 -= *v1-- * tmp0; 1096 sum2 -= *v2-- * tmp0; 1097 sum3 -= *v3-- * tmp0; 1098 sum4 -= *v4-- * tmp0; 1099 } 1100 1101 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1102 sum2 -= *v2-- * tmp0; 1103 sum3 -= *v3-- * tmp0; 1104 sum4 -= *v4-- * tmp0; 1105 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1106 sum3 -= *v3-- * tmp0; 1107 sum4 -= *v4-- * tmp0; 1108 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1109 sum4 -= *v4-- * tmp0; 1110 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1111 break; 1112 case 5: 1113 sum1 = tmp[row]; 1114 sum2 = tmp[row -1]; 1115 sum3 = tmp[row -2]; 1116 sum4 = tmp[row -3]; 1117 sum5 = tmp[row -4]; 1118 v2 = aa + ai[row]-1; 1119 v3 = aa + ai[row -1]-1; 1120 v4 = aa + ai[row -2]-1; 1121 v5 = aa + ai[row -3]-1; 1122 for (j=nz; j>1; j-=2) { 1123 vi -= 2; 1124 i0 = vi[2]; 1125 i1 = vi[1]; 1126 tmp0 = tmps[i0]; 1127 tmp1 = tmps[i1]; 1128 v1 -= 2; 1129 v2 -= 2; 1130 v3 -= 2; 1131 v4 -= 2; 1132 v5 -= 2; 1133 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1134 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1135 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1136 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1137 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1138 } 1139 if (j==1) { 1140 tmp0 = tmps[*vi--]; 1141 sum1 -= *v1-- * tmp0; 1142 sum2 -= *v2-- * tmp0; 1143 sum3 -= *v3-- * tmp0; 1144 sum4 -= *v4-- * tmp0; 1145 sum5 -= *v5-- * tmp0; 1146 } 1147 1148 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1149 sum2 -= *v2-- * tmp0; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1154 sum3 -= *v3-- * tmp0; 1155 sum4 -= *v4-- * tmp0; 1156 sum5 -= *v5-- * tmp0; 1157 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1158 sum4 -= *v4-- * tmp0; 1159 sum5 -= *v5-- * tmp0; 1160 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1161 sum5 -= *v5-- * tmp0; 1162 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1163 break; 1164 default: 1165 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1166 } 1167 } 1168 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1169 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1170 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 #undef __FUNCT__ 1177 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1178 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1179 { 1180 Mat C =B; 1181 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1182 IS isrow = b->row,isicol = b->icol; 1183 PetscErrorCode ierr; 1184 const PetscInt *r,*ic,*ics; 1185 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1186 PetscInt i,j,k,nz,nzL,row,*pj; 1187 const PetscInt *ajtmp,*bjtmp; 1188 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1189 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1190 FactorShiftCtx sctx; 1191 const PetscInt *ddiag; 1192 PetscReal rs; 1193 MatScalar d; 1194 PetscInt inod,nodesz,node_max,col; 1195 const PetscInt *ns; 1196 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1197 1198 PetscFunctionBegin; 1199 /* MatPivotSetUp(): initialize shift context sctx */ 1200 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1201 1202 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1203 ddiag = a->diag; 1204 sctx.shift_top = info->zeropivot; 1205 for (i=0; i<n; i++) { 1206 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1207 d = (aa)[ddiag[i]]; 1208 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1209 v = aa+ai[i]; 1210 nz = ai[i+1] - ai[i]; 1211 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1212 if (rs>sctx.shift_top) sctx.shift_top = rs; 1213 } 1214 sctx.shift_top *= 1.1; 1215 sctx.nshift_max = 5; 1216 sctx.shift_lo = 0.; 1217 sctx.shift_hi = 1.; 1218 } 1219 1220 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1221 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1222 1223 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1224 ics = ic; 1225 1226 node_max = a->inode.node_count; 1227 ns = a->inode.size; 1228 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1229 1230 /* If max inode size > 4, split it into two inodes.*/ 1231 /* also map the inode sizes according to the ordering */ 1232 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1233 for (i=0,j=0; i<node_max; ++i,++j) { 1234 if (ns[i] > 4) { 1235 tmp_vec1[j] = 4; 1236 ++j; 1237 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1238 } else { 1239 tmp_vec1[j] = ns[i]; 1240 } 1241 } 1242 /* Use the correct node_max */ 1243 node_max = j; 1244 1245 /* Now reorder the inode info based on mat re-ordering info */ 1246 /* First create a row -> inode_size_array_index map */ 1247 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1248 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1249 for (i=0,row=0; i<node_max; i++) { 1250 nodesz = tmp_vec1[i]; 1251 for (j=0; j<nodesz; j++,row++) { 1252 nsmap[row] = i; 1253 } 1254 } 1255 /* Using nsmap, create a reordered ns structure */ 1256 for (i=0,j=0; i< node_max; i++) { 1257 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1258 tmp_vec2[i] = nodesz; 1259 j += nodesz; 1260 } 1261 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1262 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1263 1264 /* Now use the correct ns */ 1265 ns = tmp_vec2; 1266 1267 do { 1268 sctx.newshift = PETSC_FALSE; 1269 /* Now loop over each block-row, and do the factorization */ 1270 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1271 nodesz = ns[inod]; 1272 1273 switch (nodesz) { 1274 case 1: 1275 /*----------*/ 1276 /* zero rtmp1 */ 1277 /* L part */ 1278 nz = bi[i+1] - bi[i]; 1279 bjtmp = bj + bi[i]; 1280 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1281 1282 /* U part */ 1283 nz = bdiag[i]-bdiag[i+1]; 1284 bjtmp = bj + bdiag[i+1]+1; 1285 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1286 1287 /* load in initial (unfactored row) */ 1288 nz = ai[r[i]+1] - ai[r[i]]; 1289 ajtmp = aj + ai[r[i]]; 1290 v = aa + ai[r[i]]; 1291 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1292 1293 /* ZeropivotApply() */ 1294 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1295 1296 /* elimination */ 1297 bjtmp = bj + bi[i]; 1298 row = *bjtmp++; 1299 nzL = bi[i+1] - bi[i]; 1300 for (k=0; k < nzL; k++) { 1301 pc = rtmp1 + row; 1302 if (*pc != 0.0) { 1303 pv = b->a + bdiag[row]; 1304 mul1 = *pc * (*pv); 1305 *pc = mul1; 1306 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1307 pv = b->a + bdiag[row+1]+1; 1308 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1309 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1310 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1311 } 1312 row = *bjtmp++; 1313 } 1314 1315 /* finished row so stick it into b->a */ 1316 rs = 0.0; 1317 /* L part */ 1318 pv = b->a + bi[i]; 1319 pj = b->j + bi[i]; 1320 nz = bi[i+1] - bi[i]; 1321 for (j=0; j<nz; j++) { 1322 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1323 } 1324 1325 /* U part */ 1326 pv = b->a + bdiag[i+1]+1; 1327 pj = b->j + bdiag[i+1]+1; 1328 nz = bdiag[i] - bdiag[i+1]-1; 1329 for (j=0; j<nz; j++) { 1330 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1331 } 1332 1333 /* Check zero pivot */ 1334 sctx.rs = rs; 1335 sctx.pv = rtmp1[i]; 1336 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1337 if (sctx.newshift) break; 1338 1339 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1340 pv = b->a + bdiag[i]; 1341 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1342 break; 1343 1344 case 2: 1345 /*----------*/ 1346 /* zero rtmp1 and rtmp2 */ 1347 /* L part */ 1348 nz = bi[i+1] - bi[i]; 1349 bjtmp = bj + bi[i]; 1350 for (j=0; j<nz; j++) { 1351 col = bjtmp[j]; 1352 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1353 } 1354 1355 /* U part */ 1356 nz = bdiag[i]-bdiag[i+1]; 1357 bjtmp = bj + bdiag[i+1]+1; 1358 for (j=0; j<nz; j++) { 1359 col = bjtmp[j]; 1360 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1361 } 1362 1363 /* load in initial (unfactored row) */ 1364 nz = ai[r[i]+1] - ai[r[i]]; 1365 ajtmp = aj + ai[r[i]]; 1366 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1367 for (j=0; j<nz; j++) { 1368 col = ics[ajtmp[j]]; 1369 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1370 } 1371 /* ZeropivotApply(): shift the diagonal of the matrix */ 1372 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1373 1374 /* elimination */ 1375 bjtmp = bj + bi[i]; 1376 row = *bjtmp++; /* pivot row */ 1377 nzL = bi[i+1] - bi[i]; 1378 for (k=0; k < nzL; k++) { 1379 pc1 = rtmp1 + row; 1380 pc2 = rtmp2 + row; 1381 if (*pc1 != 0.0 || *pc2 != 0.0) { 1382 pv = b->a + bdiag[row]; 1383 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1384 *pc1 = mul1; *pc2 = mul2; 1385 1386 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1387 pv = b->a + bdiag[row+1]+1; 1388 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1389 for (j=0; j<nz; j++) { 1390 col = pj[j]; 1391 rtmp1[col] -= mul1 * pv[j]; 1392 rtmp2[col] -= mul2 * pv[j]; 1393 } 1394 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1395 } 1396 row = *bjtmp++; 1397 } 1398 1399 /* finished row i; check zero pivot, then stick row i into b->a */ 1400 rs = 0.0; 1401 /* L part */ 1402 pc1 = b->a + bi[i]; 1403 pj = b->j + bi[i]; 1404 nz = bi[i+1] - bi[i]; 1405 for (j=0; j<nz; j++) { 1406 col = pj[j]; 1407 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1408 } 1409 /* U part */ 1410 pc1 = b->a + bdiag[i+1]+1; 1411 pj = b->j + bdiag[i+1]+1; 1412 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1413 for (j=0; j<nz; j++) { 1414 col = pj[j]; 1415 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1416 } 1417 1418 sctx.rs = rs; 1419 sctx.pv = rtmp1[i]; 1420 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1421 if (sctx.newshift) break; 1422 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1423 *pc1 = 1.0/sctx.pv; 1424 1425 /* Now take care of diagonal 2x2 block. */ 1426 pc2 = rtmp2 + i; 1427 if (*pc2 != 0.0) { 1428 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1429 *pc2 = mul1; /* insert L entry */ 1430 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1431 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1432 for (j=0; j<nz; j++) { 1433 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1434 } 1435 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1436 } 1437 1438 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1439 rs = 0.0; 1440 /* L part */ 1441 pc2 = b->a + bi[i+1]; 1442 pj = b->j + bi[i+1]; 1443 nz = bi[i+2] - bi[i+1]; 1444 for (j=0; j<nz; j++) { 1445 col = pj[j]; 1446 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1447 } 1448 /* U part */ 1449 pc2 = b->a + bdiag[i+2]+1; 1450 pj = b->j + bdiag[i+2]+1; 1451 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1452 for (j=0; j<nz; j++) { 1453 col = pj[j]; 1454 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1455 } 1456 1457 sctx.rs = rs; 1458 sctx.pv = rtmp2[i+1]; 1459 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1460 if (sctx.newshift) break; 1461 pc2 = b->a + bdiag[i+1]; 1462 *pc2 = 1.0/sctx.pv; 1463 break; 1464 1465 case 3: 1466 /*----------*/ 1467 /* zero rtmp */ 1468 /* L part */ 1469 nz = bi[i+1] - bi[i]; 1470 bjtmp = bj + bi[i]; 1471 for (j=0; j<nz; j++) { 1472 col = bjtmp[j]; 1473 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1474 } 1475 1476 /* U part */ 1477 nz = bdiag[i]-bdiag[i+1]; 1478 bjtmp = bj + bdiag[i+1]+1; 1479 for (j=0; j<nz; j++) { 1480 col = bjtmp[j]; 1481 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1482 } 1483 1484 /* load in initial (unfactored row) */ 1485 nz = ai[r[i]+1] - ai[r[i]]; 1486 ajtmp = aj + ai[r[i]]; 1487 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1488 for (j=0; j<nz; j++) { 1489 col = ics[ajtmp[j]]; 1490 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1491 } 1492 /* ZeropivotApply(): shift the diagonal of the matrix */ 1493 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1494 1495 /* elimination */ 1496 bjtmp = bj + bi[i]; 1497 row = *bjtmp++; /* pivot row */ 1498 nzL = bi[i+1] - bi[i]; 1499 for (k=0; k < nzL; k++) { 1500 pc1 = rtmp1 + row; 1501 pc2 = rtmp2 + row; 1502 pc3 = rtmp3 + row; 1503 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1504 pv = b->a + bdiag[row]; 1505 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1506 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1507 1508 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1509 pv = b->a + bdiag[row+1]+1; 1510 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1511 for (j=0; j<nz; j++) { 1512 col = pj[j]; 1513 rtmp1[col] -= mul1 * pv[j]; 1514 rtmp2[col] -= mul2 * pv[j]; 1515 rtmp3[col] -= mul3 * pv[j]; 1516 } 1517 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1518 } 1519 row = *bjtmp++; 1520 } 1521 1522 /* finished row i; check zero pivot, then stick row i into b->a */ 1523 rs = 0.0; 1524 /* L part */ 1525 pc1 = b->a + bi[i]; 1526 pj = b->j + bi[i]; 1527 nz = bi[i+1] - bi[i]; 1528 for (j=0; j<nz; j++) { 1529 col = pj[j]; 1530 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1531 } 1532 /* U part */ 1533 pc1 = b->a + bdiag[i+1]+1; 1534 pj = b->j + bdiag[i+1]+1; 1535 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1536 for (j=0; j<nz; j++) { 1537 col = pj[j]; 1538 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1539 } 1540 1541 sctx.rs = rs; 1542 sctx.pv = rtmp1[i]; 1543 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1544 if (sctx.newshift) break; 1545 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1546 *pc1 = 1.0/sctx.pv; 1547 1548 /* Now take care of 1st column of diagonal 3x3 block. */ 1549 pc2 = rtmp2 + i; 1550 pc3 = rtmp3 + i; 1551 if (*pc2 != 0.0 || *pc3 != 0.0) { 1552 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1553 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1554 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1555 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1556 for (j=0; j<nz; j++) { 1557 col = pj[j]; 1558 rtmp2[col] -= mul2 * rtmp1[col]; 1559 rtmp3[col] -= mul3 * rtmp1[col]; 1560 } 1561 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1562 } 1563 1564 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1565 rs = 0.0; 1566 /* L part */ 1567 pc2 = b->a + bi[i+1]; 1568 pj = b->j + bi[i+1]; 1569 nz = bi[i+2] - bi[i+1]; 1570 for (j=0; j<nz; j++) { 1571 col = pj[j]; 1572 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1573 } 1574 /* U part */ 1575 pc2 = b->a + bdiag[i+2]+1; 1576 pj = b->j + bdiag[i+2]+1; 1577 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1578 for (j=0; j<nz; j++) { 1579 col = pj[j]; 1580 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1581 } 1582 1583 sctx.rs = rs; 1584 sctx.pv = rtmp2[i+1]; 1585 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1586 if (sctx.newshift) break; 1587 pc2 = b->a + bdiag[i+1]; 1588 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1589 1590 /* Now take care of 2nd column of diagonal 3x3 block. */ 1591 pc3 = rtmp3 + i+1; 1592 if (*pc3 != 0.0) { 1593 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1594 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1595 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1596 for (j=0; j<nz; j++) { 1597 col = pj[j]; 1598 rtmp3[col] -= mul3 * rtmp2[col]; 1599 } 1600 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1601 } 1602 1603 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1604 rs = 0.0; 1605 /* L part */ 1606 pc3 = b->a + bi[i+2]; 1607 pj = b->j + bi[i+2]; 1608 nz = bi[i+3] - bi[i+2]; 1609 for (j=0; j<nz; j++) { 1610 col = pj[j]; 1611 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1612 } 1613 /* U part */ 1614 pc3 = b->a + bdiag[i+3]+1; 1615 pj = b->j + bdiag[i+3]+1; 1616 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1617 for (j=0; j<nz; j++) { 1618 col = pj[j]; 1619 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1620 } 1621 1622 sctx.rs = rs; 1623 sctx.pv = rtmp3[i+2]; 1624 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1625 if (sctx.newshift) break; 1626 pc3 = b->a + bdiag[i+2]; 1627 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1628 break; 1629 case 4: 1630 /*----------*/ 1631 /* zero rtmp */ 1632 /* L part */ 1633 nz = bi[i+1] - bi[i]; 1634 bjtmp = bj + bi[i]; 1635 for (j=0; j<nz; j++) { 1636 col = bjtmp[j]; 1637 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1638 } 1639 1640 /* U part */ 1641 nz = bdiag[i]-bdiag[i+1]; 1642 bjtmp = bj + bdiag[i+1]+1; 1643 for (j=0; j<nz; j++) { 1644 col = bjtmp[j]; 1645 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1646 } 1647 1648 /* load in initial (unfactored row) */ 1649 nz = ai[r[i]+1] - ai[r[i]]; 1650 ajtmp = aj + ai[r[i]]; 1651 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1652 for (j=0; j<nz; j++) { 1653 col = ics[ajtmp[j]]; 1654 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1655 } 1656 /* ZeropivotApply(): shift the diagonal of the matrix */ 1657 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1658 1659 /* elimination */ 1660 bjtmp = bj + bi[i]; 1661 row = *bjtmp++; /* pivot row */ 1662 nzL = bi[i+1] - bi[i]; 1663 for (k=0; k < nzL; k++) { 1664 pc1 = rtmp1 + row; 1665 pc2 = rtmp2 + row; 1666 pc3 = rtmp3 + row; 1667 pc4 = rtmp4 + row; 1668 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1669 pv = b->a + bdiag[row]; 1670 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1671 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1672 1673 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1674 pv = b->a + bdiag[row+1]+1; 1675 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1676 for (j=0; j<nz; j++) { 1677 col = pj[j]; 1678 rtmp1[col] -= mul1 * pv[j]; 1679 rtmp2[col] -= mul2 * pv[j]; 1680 rtmp3[col] -= mul3 * pv[j]; 1681 rtmp4[col] -= mul4 * pv[j]; 1682 } 1683 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1684 } 1685 row = *bjtmp++; 1686 } 1687 1688 /* finished row i; check zero pivot, then stick row i into b->a */ 1689 rs = 0.0; 1690 /* L part */ 1691 pc1 = b->a + bi[i]; 1692 pj = b->j + bi[i]; 1693 nz = bi[i+1] - bi[i]; 1694 for (j=0; j<nz; j++) { 1695 col = pj[j]; 1696 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1697 } 1698 /* U part */ 1699 pc1 = b->a + bdiag[i+1]+1; 1700 pj = b->j + bdiag[i+1]+1; 1701 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1702 for (j=0; j<nz; j++) { 1703 col = pj[j]; 1704 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1705 } 1706 1707 sctx.rs = rs; 1708 sctx.pv = rtmp1[i]; 1709 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1710 if (sctx.newshift) break; 1711 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1712 *pc1 = 1.0/sctx.pv; 1713 1714 /* Now take care of 1st column of diagonal 4x4 block. */ 1715 pc2 = rtmp2 + i; 1716 pc3 = rtmp3 + i; 1717 pc4 = rtmp4 + i; 1718 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1719 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1720 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1721 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1722 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1723 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1724 for (j=0; j<nz; j++) { 1725 col = pj[j]; 1726 rtmp2[col] -= mul2 * rtmp1[col]; 1727 rtmp3[col] -= mul3 * rtmp1[col]; 1728 rtmp4[col] -= mul4 * rtmp1[col]; 1729 } 1730 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1731 } 1732 1733 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1734 rs = 0.0; 1735 /* L part */ 1736 pc2 = b->a + bi[i+1]; 1737 pj = b->j + bi[i+1]; 1738 nz = bi[i+2] - bi[i+1]; 1739 for (j=0; j<nz; j++) { 1740 col = pj[j]; 1741 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1742 } 1743 /* U part */ 1744 pc2 = b->a + bdiag[i+2]+1; 1745 pj = b->j + bdiag[i+2]+1; 1746 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1747 for (j=0; j<nz; j++) { 1748 col = pj[j]; 1749 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1750 } 1751 1752 sctx.rs = rs; 1753 sctx.pv = rtmp2[i+1]; 1754 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1755 if (sctx.newshift) break; 1756 pc2 = b->a + bdiag[i+1]; 1757 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1758 1759 /* Now take care of 2nd column of diagonal 4x4 block. */ 1760 pc3 = rtmp3 + i+1; 1761 pc4 = rtmp4 + i+1; 1762 if (*pc3 != 0.0 || *pc4 != 0.0) { 1763 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1764 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1765 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1766 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1767 for (j=0; j<nz; j++) { 1768 col = pj[j]; 1769 rtmp3[col] -= mul3 * rtmp2[col]; 1770 rtmp4[col] -= mul4 * rtmp2[col]; 1771 } 1772 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1773 } 1774 1775 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1776 rs = 0.0; 1777 /* L part */ 1778 pc3 = b->a + bi[i+2]; 1779 pj = b->j + bi[i+2]; 1780 nz = bi[i+3] - bi[i+2]; 1781 for (j=0; j<nz; j++) { 1782 col = pj[j]; 1783 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1784 } 1785 /* U part */ 1786 pc3 = b->a + bdiag[i+3]+1; 1787 pj = b->j + bdiag[i+3]+1; 1788 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1789 for (j=0; j<nz; j++) { 1790 col = pj[j]; 1791 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1792 } 1793 1794 sctx.rs = rs; 1795 sctx.pv = rtmp3[i+2]; 1796 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1797 if (sctx.newshift) break; 1798 pc3 = b->a + bdiag[i+2]; 1799 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1800 1801 /* Now take care of 3rd column of diagonal 4x4 block. */ 1802 pc4 = rtmp4 + i+2; 1803 if (*pc4 != 0.0) { 1804 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1805 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1806 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1807 for (j=0; j<nz; j++) { 1808 col = pj[j]; 1809 rtmp4[col] -= mul4 * rtmp3[col]; 1810 } 1811 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1812 } 1813 1814 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1815 rs = 0.0; 1816 /* L part */ 1817 pc4 = b->a + bi[i+3]; 1818 pj = b->j + bi[i+3]; 1819 nz = bi[i+4] - bi[i+3]; 1820 for (j=0; j<nz; j++) { 1821 col = pj[j]; 1822 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1823 } 1824 /* U part */ 1825 pc4 = b->a + bdiag[i+4]+1; 1826 pj = b->j + bdiag[i+4]+1; 1827 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1828 for (j=0; j<nz; j++) { 1829 col = pj[j]; 1830 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1831 } 1832 1833 sctx.rs = rs; 1834 sctx.pv = rtmp4[i+3]; 1835 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr); 1836 if (sctx.newshift) break; 1837 pc4 = b->a + bdiag[i+3]; 1838 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1839 break; 1840 1841 default: 1842 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1843 } 1844 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1845 i += nodesz; /* Update the row */ 1846 } 1847 1848 /* MatPivotRefine() */ 1849 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1850 /* 1851 * if no shift in this attempt & shifting & started shifting & can refine, 1852 * then try lower shift 1853 */ 1854 sctx.shift_hi = sctx.shift_fraction; 1855 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1856 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1857 sctx.newshift = PETSC_TRUE; 1858 sctx.nshift++; 1859 } 1860 } while (sctx.newshift); 1861 1862 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1863 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1864 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1865 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1866 1867 if (b->inode.size) { 1868 C->ops->solve = MatSolve_SeqAIJ_Inode; 1869 } else { 1870 C->ops->solve = MatSolve_SeqAIJ; 1871 } 1872 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1873 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1874 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1875 C->ops->matsolve = MatMatSolve_SeqAIJ; 1876 C->assembled = PETSC_TRUE; 1877 C->preallocated = PETSC_TRUE; 1878 1879 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1880 1881 /* MatShiftView(A,info,&sctx) */ 1882 if (sctx.nshift) { 1883 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1884 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1885 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1886 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1887 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1888 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1889 } 1890 } 1891 PetscFunctionReturn(0); 1892 } 1893 1894 #undef __FUNCT__ 1895 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1896 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1897 { 1898 Mat C = B; 1899 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1900 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1901 PetscErrorCode ierr; 1902 const PetscInt *r,*ic,*c,*ics; 1903 PetscInt n = A->rmap->n,*bi = b->i; 1904 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1905 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1906 PetscInt *ai = a->i,*aj = a->j; 1907 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1908 PetscScalar mul1,mul2,mul3,tmp; 1909 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1910 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1911 PetscReal rs=0.0; 1912 FactorShiftCtx sctx; 1913 1914 PetscFunctionBegin; 1915 sctx.shift_top = 0; 1916 sctx.nshift_max = 0; 1917 sctx.shift_lo = 0; 1918 sctx.shift_hi = 0; 1919 sctx.shift_fraction = 0; 1920 1921 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1922 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1923 sctx.shift_top = 0; 1924 for (i=0; i<n; i++) { 1925 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1926 rs = 0.0; 1927 ajtmp = aj + ai[i]; 1928 rtmp1 = aa + ai[i]; 1929 nz = ai[i+1] - ai[i]; 1930 for (j=0; j<nz; j++) { 1931 if (*ajtmp != i) { 1932 rs += PetscAbsScalar(*rtmp1++); 1933 } else { 1934 rs -= PetscRealPart(*rtmp1++); 1935 } 1936 ajtmp++; 1937 } 1938 if (rs>sctx.shift_top) sctx.shift_top = rs; 1939 } 1940 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1941 sctx.shift_top *= 1.1; 1942 sctx.nshift_max = 5; 1943 sctx.shift_lo = 0.; 1944 sctx.shift_hi = 1.; 1945 } 1946 sctx.shift_amount = 0; 1947 sctx.nshift = 0; 1948 1949 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1950 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1951 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1952 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1953 ics = ic; 1954 1955 node_max = a->inode.node_count; 1956 ns = a->inode.size; 1957 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1958 1959 /* If max inode size > 3, split it into two inodes.*/ 1960 /* also map the inode sizes according to the ordering */ 1961 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1962 for (i=0,j=0; i<node_max; ++i,++j) { 1963 if (ns[i]>3) { 1964 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1965 ++j; 1966 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1967 } else { 1968 tmp_vec1[j] = ns[i]; 1969 } 1970 } 1971 /* Use the correct node_max */ 1972 node_max = j; 1973 1974 /* Now reorder the inode info based on mat re-ordering info */ 1975 /* First create a row -> inode_size_array_index map */ 1976 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1977 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1978 for (i=0,row=0; i<node_max; i++) { 1979 nodesz = tmp_vec1[i]; 1980 for (j=0; j<nodesz; j++,row++) { 1981 nsmap[row] = i; 1982 } 1983 } 1984 /* Using nsmap, create a reordered ns structure */ 1985 for (i=0,j=0; i< node_max; i++) { 1986 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1987 tmp_vec2[i] = nodesz; 1988 j += nodesz; 1989 } 1990 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1991 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1992 /* Now use the correct ns */ 1993 ns = tmp_vec2; 1994 1995 do { 1996 sctx.newshift = PETSC_FALSE; 1997 /* Now loop over each block-row, and do the factorization */ 1998 for (i=0,row=0; i<node_max; i++) { 1999 nodesz = ns[i]; 2000 nz = bi[row+1] - bi[row]; 2001 bjtmp = bj + bi[row]; 2002 2003 switch (nodesz) { 2004 case 1: 2005 for (j=0; j<nz; j++) { 2006 idx = bjtmp[j]; 2007 rtmp11[idx] = 0.0; 2008 } 2009 2010 /* load in initial (unfactored row) */ 2011 idx = r[row]; 2012 nz_tmp = ai[idx+1] - ai[idx]; 2013 ajtmp = aj + ai[idx]; 2014 v1 = aa + ai[idx]; 2015 2016 for (j=0; j<nz_tmp; j++) { 2017 idx = ics[ajtmp[j]]; 2018 rtmp11[idx] = v1[j]; 2019 } 2020 rtmp11[ics[r[row]]] += sctx.shift_amount; 2021 2022 prow = *bjtmp++; 2023 while (prow < row) { 2024 pc1 = rtmp11 + prow; 2025 if (*pc1 != 0.0) { 2026 pv = ba + bd[prow]; 2027 pj = nbj + bd[prow]; 2028 mul1 = *pc1 * *pv++; 2029 *pc1 = mul1; 2030 nz_tmp = bi[prow+1] - bd[prow] - 1; 2031 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2032 for (j=0; j<nz_tmp; j++) { 2033 tmp = pv[j]; 2034 idx = pj[j]; 2035 rtmp11[idx] -= mul1 * tmp; 2036 } 2037 } 2038 prow = *bjtmp++; 2039 } 2040 pj = bj + bi[row]; 2041 pc1 = ba + bi[row]; 2042 2043 sctx.pv = rtmp11[row]; 2044 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2045 rs = 0.0; 2046 for (j=0; j<nz; j++) { 2047 idx = pj[j]; 2048 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2049 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2050 } 2051 sctx.rs = rs; 2052 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2053 if (sctx.newshift) goto endofwhile; 2054 break; 2055 2056 case 2: 2057 for (j=0; j<nz; j++) { 2058 idx = bjtmp[j]; 2059 rtmp11[idx] = 0.0; 2060 rtmp22[idx] = 0.0; 2061 } 2062 2063 /* load in initial (unfactored row) */ 2064 idx = r[row]; 2065 nz_tmp = ai[idx+1] - ai[idx]; 2066 ajtmp = aj + ai[idx]; 2067 v1 = aa + ai[idx]; 2068 v2 = aa + ai[idx+1]; 2069 for (j=0; j<nz_tmp; j++) { 2070 idx = ics[ajtmp[j]]; 2071 rtmp11[idx] = v1[j]; 2072 rtmp22[idx] = v2[j]; 2073 } 2074 rtmp11[ics[r[row]]] += sctx.shift_amount; 2075 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2076 2077 prow = *bjtmp++; 2078 while (prow < row) { 2079 pc1 = rtmp11 + prow; 2080 pc2 = rtmp22 + prow; 2081 if (*pc1 != 0.0 || *pc2 != 0.0) { 2082 pv = ba + bd[prow]; 2083 pj = nbj + bd[prow]; 2084 mul1 = *pc1 * *pv; 2085 mul2 = *pc2 * *pv; 2086 ++pv; 2087 *pc1 = mul1; 2088 *pc2 = mul2; 2089 2090 nz_tmp = bi[prow+1] - bd[prow] - 1; 2091 for (j=0; j<nz_tmp; j++) { 2092 tmp = pv[j]; 2093 idx = pj[j]; 2094 rtmp11[idx] -= mul1 * tmp; 2095 rtmp22[idx] -= mul2 * tmp; 2096 } 2097 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2098 } 2099 prow = *bjtmp++; 2100 } 2101 2102 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2103 pc1 = rtmp11 + prow; 2104 pc2 = rtmp22 + prow; 2105 2106 sctx.pv = *pc1; 2107 pj = bj + bi[prow]; 2108 rs = 0.0; 2109 for (j=0; j<nz; j++) { 2110 idx = pj[j]; 2111 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2112 } 2113 sctx.rs = rs; 2114 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2115 if (sctx.newshift) goto endofwhile; 2116 2117 if (*pc2 != 0.0) { 2118 pj = nbj + bd[prow]; 2119 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2120 *pc2 = mul2; 2121 nz_tmp = bi[prow+1] - bd[prow] - 1; 2122 for (j=0; j<nz_tmp; j++) { 2123 idx = pj[j]; 2124 tmp = rtmp11[idx]; 2125 rtmp22[idx] -= mul2 * tmp; 2126 } 2127 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2128 } 2129 2130 pj = bj + bi[row]; 2131 pc1 = ba + bi[row]; 2132 pc2 = ba + bi[row+1]; 2133 2134 sctx.pv = rtmp22[row+1]; 2135 rs = 0.0; 2136 rtmp11[row] = 1.0/rtmp11[row]; 2137 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2138 /* copy row entries from dense representation to sparse */ 2139 for (j=0; j<nz; j++) { 2140 idx = pj[j]; 2141 pc1[j] = rtmp11[idx]; 2142 pc2[j] = rtmp22[idx]; 2143 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2144 } 2145 sctx.rs = rs; 2146 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2147 if (sctx.newshift) goto endofwhile; 2148 break; 2149 2150 case 3: 2151 for (j=0; j<nz; j++) { 2152 idx = bjtmp[j]; 2153 rtmp11[idx] = 0.0; 2154 rtmp22[idx] = 0.0; 2155 rtmp33[idx] = 0.0; 2156 } 2157 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2158 idx = r[row]; 2159 nz_tmp = ai[idx+1] - ai[idx]; 2160 ajtmp = aj + ai[idx]; 2161 v1 = aa + ai[idx]; 2162 v2 = aa + ai[idx+1]; 2163 v3 = aa + ai[idx+2]; 2164 for (j=0; j<nz_tmp; j++) { 2165 idx = ics[ajtmp[j]]; 2166 rtmp11[idx] = v1[j]; 2167 rtmp22[idx] = v2[j]; 2168 rtmp33[idx] = v3[j]; 2169 } 2170 rtmp11[ics[r[row]]] += sctx.shift_amount; 2171 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2172 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2173 2174 /* loop over all pivot row blocks above this row block */ 2175 prow = *bjtmp++; 2176 while (prow < row) { 2177 pc1 = rtmp11 + prow; 2178 pc2 = rtmp22 + prow; 2179 pc3 = rtmp33 + prow; 2180 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2181 pv = ba + bd[prow]; 2182 pj = nbj + bd[prow]; 2183 mul1 = *pc1 * *pv; 2184 mul2 = *pc2 * *pv; 2185 mul3 = *pc3 * *pv; 2186 ++pv; 2187 *pc1 = mul1; 2188 *pc2 = mul2; 2189 *pc3 = mul3; 2190 2191 nz_tmp = bi[prow+1] - bd[prow] - 1; 2192 /* update this row based on pivot row */ 2193 for (j=0; j<nz_tmp; j++) { 2194 tmp = pv[j]; 2195 idx = pj[j]; 2196 rtmp11[idx] -= mul1 * tmp; 2197 rtmp22[idx] -= mul2 * tmp; 2198 rtmp33[idx] -= mul3 * tmp; 2199 } 2200 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2201 } 2202 prow = *bjtmp++; 2203 } 2204 2205 /* Now take care of diagonal 3x3 block in this set of rows */ 2206 /* note: prow = row here */ 2207 pc1 = rtmp11 + prow; 2208 pc2 = rtmp22 + prow; 2209 pc3 = rtmp33 + prow; 2210 2211 sctx.pv = *pc1; 2212 pj = bj + bi[prow]; 2213 rs = 0.0; 2214 for (j=0; j<nz; j++) { 2215 idx = pj[j]; 2216 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2217 } 2218 sctx.rs = rs; 2219 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2220 if (sctx.newshift) goto endofwhile; 2221 2222 if (*pc2 != 0.0 || *pc3 != 0.0) { 2223 mul2 = (*pc2)/(*pc1); 2224 mul3 = (*pc3)/(*pc1); 2225 *pc2 = mul2; 2226 *pc3 = mul3; 2227 nz_tmp = bi[prow+1] - bd[prow] - 1; 2228 pj = nbj + bd[prow]; 2229 for (j=0; j<nz_tmp; j++) { 2230 idx = pj[j]; 2231 tmp = rtmp11[idx]; 2232 rtmp22[idx] -= mul2 * tmp; 2233 rtmp33[idx] -= mul3 * tmp; 2234 } 2235 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2236 } 2237 ++prow; 2238 2239 pc2 = rtmp22 + prow; 2240 pc3 = rtmp33 + prow; 2241 sctx.pv = *pc2; 2242 pj = bj + bi[prow]; 2243 rs = 0.0; 2244 for (j=0; j<nz; j++) { 2245 idx = pj[j]; 2246 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2247 } 2248 sctx.rs = rs; 2249 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2250 if (sctx.newshift) goto endofwhile; 2251 2252 if (*pc3 != 0.0) { 2253 mul3 = (*pc3)/(*pc2); 2254 *pc3 = mul3; 2255 pj = nbj + bd[prow]; 2256 nz_tmp = bi[prow+1] - bd[prow] - 1; 2257 for (j=0; j<nz_tmp; j++) { 2258 idx = pj[j]; 2259 tmp = rtmp22[idx]; 2260 rtmp33[idx] -= mul3 * tmp; 2261 } 2262 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2263 } 2264 2265 pj = bj + bi[row]; 2266 pc1 = ba + bi[row]; 2267 pc2 = ba + bi[row+1]; 2268 pc3 = ba + bi[row+2]; 2269 2270 sctx.pv = rtmp33[row+2]; 2271 rs = 0.0; 2272 rtmp11[row] = 1.0/rtmp11[row]; 2273 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2274 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2275 /* copy row entries from dense representation to sparse */ 2276 for (j=0; j<nz; j++) { 2277 idx = pj[j]; 2278 pc1[j] = rtmp11[idx]; 2279 pc2[j] = rtmp22[idx]; 2280 pc3[j] = rtmp33[idx]; 2281 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2282 } 2283 2284 sctx.rs = rs; 2285 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr); 2286 if (sctx.newshift) goto endofwhile; 2287 break; 2288 2289 default: 2290 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2291 } 2292 row += nodesz; /* Update the row */ 2293 } 2294 endofwhile:; 2295 } while (sctx.newshift); 2296 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2297 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2298 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2299 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2300 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2301 2302 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2303 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2304 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2305 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2306 C->assembled = PETSC_TRUE; 2307 C->preallocated = PETSC_TRUE; 2308 if (sctx.nshift) { 2309 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2310 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2311 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2312 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2313 } 2314 } 2315 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2316 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2317 PetscFunctionReturn(0); 2318 } 2319 2320 2321 /* ----------------------------------------------------------- */ 2322 #undef __FUNCT__ 2323 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2324 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2325 { 2326 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2327 IS iscol = a->col,isrow = a->row; 2328 PetscErrorCode ierr; 2329 const PetscInt *r,*c,*rout,*cout; 2330 PetscInt i,j,n = A->rmap->n; 2331 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2332 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2333 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2334 PetscScalar sum1,sum2,sum3,sum4,sum5; 2335 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2336 const PetscScalar *b; 2337 2338 PetscFunctionBegin; 2339 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2340 node_max = a->inode.node_count; 2341 ns = a->inode.size; /* Node Size array */ 2342 2343 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2344 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2345 tmp = a->solve_work; 2346 2347 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2348 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2349 2350 /* forward solve the lower triangular */ 2351 tmps = tmp; 2352 aa = a_a; 2353 aj = a_j; 2354 ad = a->diag; 2355 2356 for (i = 0,row = 0; i< node_max; ++i) { 2357 nsz = ns[i]; 2358 aii = ai[row]; 2359 v1 = aa + aii; 2360 vi = aj + aii; 2361 nz = ai[row+1]- ai[row]; 2362 2363 if (i < node_max-1) { 2364 /* Prefetch the indices for the next block */ 2365 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2366 /* Prefetch the data for the next block */ 2367 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2368 } 2369 2370 switch (nsz) { /* Each loop in 'case' is unrolled */ 2371 case 1: 2372 sum1 = b[r[row]]; 2373 for (j=0; j<nz-1; j+=2) { 2374 i0 = vi[j]; 2375 i1 = vi[j+1]; 2376 tmp0 = tmps[i0]; 2377 tmp1 = tmps[i1]; 2378 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2379 } 2380 if (j == nz-1) { 2381 tmp0 = tmps[vi[j]]; 2382 sum1 -= v1[j]*tmp0; 2383 } 2384 tmp[row++]=sum1; 2385 break; 2386 case 2: 2387 sum1 = b[r[row]]; 2388 sum2 = b[r[row+1]]; 2389 v2 = aa + ai[row+1]; 2390 2391 for (j=0; j<nz-1; j+=2) { 2392 i0 = vi[j]; 2393 i1 = vi[j+1]; 2394 tmp0 = tmps[i0]; 2395 tmp1 = tmps[i1]; 2396 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2397 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2398 } 2399 if (j == nz-1) { 2400 tmp0 = tmps[vi[j]]; 2401 sum1 -= v1[j] *tmp0; 2402 sum2 -= v2[j] *tmp0; 2403 } 2404 sum2 -= v2[nz] * sum1; 2405 tmp[row++]=sum1; 2406 tmp[row++]=sum2; 2407 break; 2408 case 3: 2409 sum1 = b[r[row]]; 2410 sum2 = b[r[row+1]]; 2411 sum3 = b[r[row+2]]; 2412 v2 = aa + ai[row+1]; 2413 v3 = aa + ai[row+2]; 2414 2415 for (j=0; j<nz-1; j+=2) { 2416 i0 = vi[j]; 2417 i1 = vi[j+1]; 2418 tmp0 = tmps[i0]; 2419 tmp1 = tmps[i1]; 2420 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2421 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2422 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2423 } 2424 if (j == nz-1) { 2425 tmp0 = tmps[vi[j]]; 2426 sum1 -= v1[j] *tmp0; 2427 sum2 -= v2[j] *tmp0; 2428 sum3 -= v3[j] *tmp0; 2429 } 2430 sum2 -= v2[nz] * sum1; 2431 sum3 -= v3[nz] * sum1; 2432 sum3 -= v3[nz+1] * sum2; 2433 tmp[row++]=sum1; 2434 tmp[row++]=sum2; 2435 tmp[row++]=sum3; 2436 break; 2437 2438 case 4: 2439 sum1 = b[r[row]]; 2440 sum2 = b[r[row+1]]; 2441 sum3 = b[r[row+2]]; 2442 sum4 = b[r[row+3]]; 2443 v2 = aa + ai[row+1]; 2444 v3 = aa + ai[row+2]; 2445 v4 = aa + ai[row+3]; 2446 2447 for (j=0; j<nz-1; j+=2) { 2448 i0 = vi[j]; 2449 i1 = vi[j+1]; 2450 tmp0 = tmps[i0]; 2451 tmp1 = tmps[i1]; 2452 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2453 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2454 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2455 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2456 } 2457 if (j == nz-1) { 2458 tmp0 = tmps[vi[j]]; 2459 sum1 -= v1[j] *tmp0; 2460 sum2 -= v2[j] *tmp0; 2461 sum3 -= v3[j] *tmp0; 2462 sum4 -= v4[j] *tmp0; 2463 } 2464 sum2 -= v2[nz] * sum1; 2465 sum3 -= v3[nz] * sum1; 2466 sum4 -= v4[nz] * sum1; 2467 sum3 -= v3[nz+1] * sum2; 2468 sum4 -= v4[nz+1] * sum2; 2469 sum4 -= v4[nz+2] * sum3; 2470 2471 tmp[row++]=sum1; 2472 tmp[row++]=sum2; 2473 tmp[row++]=sum3; 2474 tmp[row++]=sum4; 2475 break; 2476 case 5: 2477 sum1 = b[r[row]]; 2478 sum2 = b[r[row+1]]; 2479 sum3 = b[r[row+2]]; 2480 sum4 = b[r[row+3]]; 2481 sum5 = b[r[row+4]]; 2482 v2 = aa + ai[row+1]; 2483 v3 = aa + ai[row+2]; 2484 v4 = aa + ai[row+3]; 2485 v5 = aa + ai[row+4]; 2486 2487 for (j=0; j<nz-1; j+=2) { 2488 i0 = vi[j]; 2489 i1 = vi[j+1]; 2490 tmp0 = tmps[i0]; 2491 tmp1 = tmps[i1]; 2492 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2493 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2494 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2495 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2496 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2497 } 2498 if (j == nz-1) { 2499 tmp0 = tmps[vi[j]]; 2500 sum1 -= v1[j] *tmp0; 2501 sum2 -= v2[j] *tmp0; 2502 sum3 -= v3[j] *tmp0; 2503 sum4 -= v4[j] *tmp0; 2504 sum5 -= v5[j] *tmp0; 2505 } 2506 2507 sum2 -= v2[nz] * sum1; 2508 sum3 -= v3[nz] * sum1; 2509 sum4 -= v4[nz] * sum1; 2510 sum5 -= v5[nz] * sum1; 2511 sum3 -= v3[nz+1] * sum2; 2512 sum4 -= v4[nz+1] * sum2; 2513 sum5 -= v5[nz+1] * sum2; 2514 sum4 -= v4[nz+2] * sum3; 2515 sum5 -= v5[nz+2] * sum3; 2516 sum5 -= v5[nz+3] * sum4; 2517 2518 tmp[row++]=sum1; 2519 tmp[row++]=sum2; 2520 tmp[row++]=sum3; 2521 tmp[row++]=sum4; 2522 tmp[row++]=sum5; 2523 break; 2524 default: 2525 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2526 } 2527 } 2528 /* backward solve the upper triangular */ 2529 for (i=node_max -1,row = n-1; i>=0; i--) { 2530 nsz = ns[i]; 2531 aii = ad[row+1] + 1; 2532 v1 = aa + aii; 2533 vi = aj + aii; 2534 nz = ad[row]- ad[row+1] - 1; 2535 2536 if (i > 0) { 2537 /* Prefetch the indices for the next block */ 2538 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2539 /* Prefetch the data for the next block */ 2540 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2541 } 2542 2543 switch (nsz) { /* Each loop in 'case' is unrolled */ 2544 case 1: 2545 sum1 = tmp[row]; 2546 2547 for (j=0; j<nz-1; j+=2) { 2548 i0 = vi[j]; 2549 i1 = vi[j+1]; 2550 tmp0 = tmps[i0]; 2551 tmp1 = tmps[i1]; 2552 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2553 } 2554 if (j == nz-1) { 2555 tmp0 = tmps[vi[j]]; 2556 sum1 -= v1[j]*tmp0; 2557 } 2558 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2559 break; 2560 case 2: 2561 sum1 = tmp[row]; 2562 sum2 = tmp[row-1]; 2563 v2 = aa + ad[row] + 1; 2564 for (j=0; j<nz-1; j+=2) { 2565 i0 = vi[j]; 2566 i1 = vi[j+1]; 2567 tmp0 = tmps[i0]; 2568 tmp1 = tmps[i1]; 2569 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2570 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2571 } 2572 if (j == nz-1) { 2573 tmp0 = tmps[vi[j]]; 2574 sum1 -= v1[j]* tmp0; 2575 sum2 -= v2[j+1]* tmp0; 2576 } 2577 2578 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2579 sum2 -= v2[0] * tmp0; 2580 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2581 break; 2582 case 3: 2583 sum1 = tmp[row]; 2584 sum2 = tmp[row -1]; 2585 sum3 = tmp[row -2]; 2586 v2 = aa + ad[row] + 1; 2587 v3 = aa + ad[row -1] + 1; 2588 for (j=0; j<nz-1; j+=2) { 2589 i0 = vi[j]; 2590 i1 = vi[j+1]; 2591 tmp0 = tmps[i0]; 2592 tmp1 = tmps[i1]; 2593 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2594 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2595 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2596 } 2597 if (j== nz-1) { 2598 tmp0 = tmps[vi[j]]; 2599 sum1 -= v1[j] * tmp0; 2600 sum2 -= v2[j+1] * tmp0; 2601 sum3 -= v3[j+2] * tmp0; 2602 } 2603 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2604 sum2 -= v2[0]* tmp0; 2605 sum3 -= v3[1] * tmp0; 2606 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2607 sum3 -= v3[0]* tmp0; 2608 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2609 2610 break; 2611 case 4: 2612 sum1 = tmp[row]; 2613 sum2 = tmp[row -1]; 2614 sum3 = tmp[row -2]; 2615 sum4 = tmp[row -3]; 2616 v2 = aa + ad[row]+1; 2617 v3 = aa + ad[row -1]+1; 2618 v4 = aa + ad[row -2]+1; 2619 2620 for (j=0; j<nz-1; j+=2) { 2621 i0 = vi[j]; 2622 i1 = vi[j+1]; 2623 tmp0 = tmps[i0]; 2624 tmp1 = tmps[i1]; 2625 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2626 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2627 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2628 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2629 } 2630 if (j== nz-1) { 2631 tmp0 = tmps[vi[j]]; 2632 sum1 -= v1[j] * tmp0; 2633 sum2 -= v2[j+1] * tmp0; 2634 sum3 -= v3[j+2] * tmp0; 2635 sum4 -= v4[j+3] * tmp0; 2636 } 2637 2638 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2639 sum2 -= v2[0] * tmp0; 2640 sum3 -= v3[1] * tmp0; 2641 sum4 -= v4[2] * tmp0; 2642 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2643 sum3 -= v3[0] * tmp0; 2644 sum4 -= v4[1] * tmp0; 2645 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2646 sum4 -= v4[0] * tmp0; 2647 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2648 break; 2649 case 5: 2650 sum1 = tmp[row]; 2651 sum2 = tmp[row -1]; 2652 sum3 = tmp[row -2]; 2653 sum4 = tmp[row -3]; 2654 sum5 = tmp[row -4]; 2655 v2 = aa + ad[row]+1; 2656 v3 = aa + ad[row -1]+1; 2657 v4 = aa + ad[row -2]+1; 2658 v5 = aa + ad[row -3]+1; 2659 for (j=0; j<nz-1; j+=2) { 2660 i0 = vi[j]; 2661 i1 = vi[j+1]; 2662 tmp0 = tmps[i0]; 2663 tmp1 = tmps[i1]; 2664 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2665 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2666 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2667 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2668 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2669 } 2670 if (j==nz-1) { 2671 tmp0 = tmps[vi[j]]; 2672 sum1 -= v1[j] * tmp0; 2673 sum2 -= v2[j+1] * tmp0; 2674 sum3 -= v3[j+2] * tmp0; 2675 sum4 -= v4[j+3] * tmp0; 2676 sum5 -= v5[j+4] * tmp0; 2677 } 2678 2679 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2680 sum2 -= v2[0] * tmp0; 2681 sum3 -= v3[1] * tmp0; 2682 sum4 -= v4[2] * tmp0; 2683 sum5 -= v5[3] * tmp0; 2684 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2685 sum3 -= v3[0] * tmp0; 2686 sum4 -= v4[1] * tmp0; 2687 sum5 -= v5[2] * tmp0; 2688 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2689 sum4 -= v4[0] * tmp0; 2690 sum5 -= v5[1] * tmp0; 2691 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2692 sum5 -= v5[0] * tmp0; 2693 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2694 break; 2695 default: 2696 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2697 } 2698 } 2699 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2700 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2701 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2703 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2704 PetscFunctionReturn(0); 2705 } 2706 2707 2708 /* 2709 Makes a longer coloring[] array and calls the usual code with that 2710 */ 2711 #undef __FUNCT__ 2712 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2713 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2714 { 2715 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2716 PetscErrorCode ierr; 2717 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2718 PetscInt *colorused,i; 2719 ISColoringValue *newcolor; 2720 2721 PetscFunctionBegin; 2722 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2723 /* loop over inodes, marking a color for each column*/ 2724 row = 0; 2725 for (i=0; i<m; i++) { 2726 for (j=0; j<ns[i]; j++) { 2727 newcolor[row++] = coloring[i] + j*ncolors; 2728 } 2729 } 2730 2731 /* eliminate unneeded colors */ 2732 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2733 for (i=0; i<n; i++) { 2734 colorused[newcolor[i]] = 1; 2735 } 2736 2737 for (i=1; i<5*ncolors; i++) { 2738 colorused[i] += colorused[i-1]; 2739 } 2740 ncolors = colorused[5*ncolors-1]; 2741 for (i=0; i<n; i++) { 2742 newcolor[i] = colorused[newcolor[i]]-1; 2743 } 2744 ierr = PetscFree(colorused);CHKERRQ(ierr); 2745 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2746 ierr = PetscFree(coloring);CHKERRQ(ierr); 2747 PetscFunctionReturn(0); 2748 } 2749 2750 #include <petsc/private/kernels/blockinvert.h> 2751 2752 #undef __FUNCT__ 2753 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2754 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2755 { 2756 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2757 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2758 MatScalar *ibdiag,*bdiag,work[25],*t; 2759 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2760 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2761 const PetscScalar *xb, *b; 2762 PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0; 2763 PetscErrorCode ierr; 2764 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2765 PetscInt sz,k,ipvt[5]; 2766 PetscBool allowzeropivot,zeropivotdetected; 2767 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2768 2769 PetscFunctionBegin; 2770 allowzeropivot = PetscNot(A->erroriffailure); 2771 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2772 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 2773 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2774 2775 if (!a->inode.ibdiagvalid) { 2776 if (!a->inode.ibdiag) { 2777 /* calculate space needed for diagonal blocks */ 2778 for (i=0; i<m; i++) { 2779 cnt += sizes[i]*sizes[i]; 2780 } 2781 a->inode.bdiagsize = cnt; 2782 2783 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2784 } 2785 2786 /* copy over the diagonal blocks and invert them */ 2787 ibdiag = a->inode.ibdiag; 2788 bdiag = a->inode.bdiag; 2789 cnt = 0; 2790 for (i=0, row = 0; i<m; i++) { 2791 for (j=0; j<sizes[i]; j++) { 2792 for (k=0; k<sizes[i]; k++) { 2793 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2794 } 2795 } 2796 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2797 2798 switch (sizes[i]) { 2799 case 1: 2800 /* Create matrix data structure */ 2801 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2802 if (allowzeropivot) { 2803 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2804 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2805 A->factorerror_zeropivot_row = row; 2806 ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr); 2807 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2808 } 2809 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2810 break; 2811 case 2: 2812 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2813 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2814 break; 2815 case 3: 2816 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2817 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2818 break; 2819 case 4: 2820 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2821 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2822 break; 2823 case 5: 2824 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2825 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2826 break; 2827 default: 2828 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2829 } 2830 cnt += sizes[i]*sizes[i]; 2831 row += sizes[i]; 2832 } 2833 a->inode.ibdiagvalid = PETSC_TRUE; 2834 } 2835 ibdiag = a->inode.ibdiag; 2836 bdiag = a->inode.bdiag; 2837 t = a->inode.ssor_work; 2838 2839 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2840 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2841 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2842 if (flag & SOR_ZERO_INITIAL_GUESS) { 2843 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2844 2845 for (i=0, row=0; i<m; i++) { 2846 sz = diag[row] - ii[row]; 2847 v1 = a->a + ii[row]; 2848 idx = a->j + ii[row]; 2849 2850 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2851 switch (sizes[i]) { 2852 case 1: 2853 2854 sum1 = b[row]; 2855 for (n = 0; n<sz-1; n+=2) { 2856 i1 = idx[0]; 2857 i2 = idx[1]; 2858 idx += 2; 2859 tmp0 = x[i1]; 2860 tmp1 = x[i2]; 2861 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2862 } 2863 2864 if (n == sz-1) { 2865 tmp0 = x[*idx]; 2866 sum1 -= *v1 * tmp0; 2867 } 2868 t[row] = sum1; 2869 x[row++] = sum1*(*ibdiag++); 2870 break; 2871 case 2: 2872 v2 = a->a + ii[row+1]; 2873 sum1 = b[row]; 2874 sum2 = b[row+1]; 2875 for (n = 0; n<sz-1; n+=2) { 2876 i1 = idx[0]; 2877 i2 = idx[1]; 2878 idx += 2; 2879 tmp0 = x[i1]; 2880 tmp1 = x[i2]; 2881 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2882 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2883 } 2884 2885 if (n == sz-1) { 2886 tmp0 = x[*idx]; 2887 sum1 -= v1[0] * tmp0; 2888 sum2 -= v2[0] * tmp0; 2889 } 2890 t[row] = sum1; 2891 t[row+1] = sum2; 2892 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2893 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2894 ibdiag += 4; 2895 break; 2896 case 3: 2897 v2 = a->a + ii[row+1]; 2898 v3 = a->a + ii[row+2]; 2899 sum1 = b[row]; 2900 sum2 = b[row+1]; 2901 sum3 = b[row+2]; 2902 for (n = 0; n<sz-1; n+=2) { 2903 i1 = idx[0]; 2904 i2 = idx[1]; 2905 idx += 2; 2906 tmp0 = x[i1]; 2907 tmp1 = x[i2]; 2908 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2909 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2910 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2911 } 2912 2913 if (n == sz-1) { 2914 tmp0 = x[*idx]; 2915 sum1 -= v1[0] * tmp0; 2916 sum2 -= v2[0] * tmp0; 2917 sum3 -= v3[0] * tmp0; 2918 } 2919 t[row] = sum1; 2920 t[row+1] = sum2; 2921 t[row+2] = sum3; 2922 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2923 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2924 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2925 ibdiag += 9; 2926 break; 2927 case 4: 2928 v2 = a->a + ii[row+1]; 2929 v3 = a->a + ii[row+2]; 2930 v4 = a->a + ii[row+3]; 2931 sum1 = b[row]; 2932 sum2 = b[row+1]; 2933 sum3 = b[row+2]; 2934 sum4 = b[row+3]; 2935 for (n = 0; n<sz-1; n+=2) { 2936 i1 = idx[0]; 2937 i2 = idx[1]; 2938 idx += 2; 2939 tmp0 = x[i1]; 2940 tmp1 = x[i2]; 2941 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2942 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2943 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2944 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2945 } 2946 2947 if (n == sz-1) { 2948 tmp0 = x[*idx]; 2949 sum1 -= v1[0] * tmp0; 2950 sum2 -= v2[0] * tmp0; 2951 sum3 -= v3[0] * tmp0; 2952 sum4 -= v4[0] * tmp0; 2953 } 2954 t[row] = sum1; 2955 t[row+1] = sum2; 2956 t[row+2] = sum3; 2957 t[row+3] = sum4; 2958 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2959 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2960 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2961 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2962 ibdiag += 16; 2963 break; 2964 case 5: 2965 v2 = a->a + ii[row+1]; 2966 v3 = a->a + ii[row+2]; 2967 v4 = a->a + ii[row+3]; 2968 v5 = a->a + ii[row+4]; 2969 sum1 = b[row]; 2970 sum2 = b[row+1]; 2971 sum3 = b[row+2]; 2972 sum4 = b[row+3]; 2973 sum5 = b[row+4]; 2974 for (n = 0; n<sz-1; n+=2) { 2975 i1 = idx[0]; 2976 i2 = idx[1]; 2977 idx += 2; 2978 tmp0 = x[i1]; 2979 tmp1 = x[i2]; 2980 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2981 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2982 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2983 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2984 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2985 } 2986 2987 if (n == sz-1) { 2988 tmp0 = x[*idx]; 2989 sum1 -= v1[0] * tmp0; 2990 sum2 -= v2[0] * tmp0; 2991 sum3 -= v3[0] * tmp0; 2992 sum4 -= v4[0] * tmp0; 2993 sum5 -= v5[0] * tmp0; 2994 } 2995 t[row] = sum1; 2996 t[row+1] = sum2; 2997 t[row+2] = sum3; 2998 t[row+3] = sum4; 2999 t[row+4] = sum5; 3000 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3001 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3002 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3003 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3004 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3005 ibdiag += 25; 3006 break; 3007 default: 3008 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3009 } 3010 } 3011 3012 xb = t; 3013 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3014 } else xb = b; 3015 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3016 3017 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3018 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3019 ibdiag -= sizes[i]*sizes[i]; 3020 sz = ii[row+1] - diag[row] - 1; 3021 v1 = a->a + diag[row] + 1; 3022 idx = a->j + diag[row] + 1; 3023 3024 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3025 switch (sizes[i]) { 3026 case 1: 3027 3028 sum1 = xb[row]; 3029 for (n = 0; n<sz-1; n+=2) { 3030 i1 = idx[0]; 3031 i2 = idx[1]; 3032 idx += 2; 3033 tmp0 = x[i1]; 3034 tmp1 = x[i2]; 3035 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3036 } 3037 3038 if (n == sz-1) { 3039 tmp0 = x[*idx]; 3040 sum1 -= *v1*tmp0; 3041 } 3042 x[row--] = sum1*(*ibdiag); 3043 break; 3044 3045 case 2: 3046 3047 sum1 = xb[row]; 3048 sum2 = xb[row-1]; 3049 /* note that sum1 is associated with the second of the two rows */ 3050 v2 = a->a + diag[row-1] + 2; 3051 for (n = 0; n<sz-1; n+=2) { 3052 i1 = idx[0]; 3053 i2 = idx[1]; 3054 idx += 2; 3055 tmp0 = x[i1]; 3056 tmp1 = x[i2]; 3057 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3058 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3059 } 3060 3061 if (n == sz-1) { 3062 tmp0 = x[*idx]; 3063 sum1 -= *v1*tmp0; 3064 sum2 -= *v2*tmp0; 3065 } 3066 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3067 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3068 break; 3069 case 3: 3070 3071 sum1 = xb[row]; 3072 sum2 = xb[row-1]; 3073 sum3 = xb[row-2]; 3074 v2 = a->a + diag[row-1] + 2; 3075 v3 = a->a + diag[row-2] + 3; 3076 for (n = 0; n<sz-1; n+=2) { 3077 i1 = idx[0]; 3078 i2 = idx[1]; 3079 idx += 2; 3080 tmp0 = x[i1]; 3081 tmp1 = x[i2]; 3082 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3083 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3084 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3085 } 3086 3087 if (n == sz-1) { 3088 tmp0 = x[*idx]; 3089 sum1 -= *v1*tmp0; 3090 sum2 -= *v2*tmp0; 3091 sum3 -= *v3*tmp0; 3092 } 3093 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3094 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3095 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3096 break; 3097 case 4: 3098 3099 sum1 = xb[row]; 3100 sum2 = xb[row-1]; 3101 sum3 = xb[row-2]; 3102 sum4 = xb[row-3]; 3103 v2 = a->a + diag[row-1] + 2; 3104 v3 = a->a + diag[row-2] + 3; 3105 v4 = a->a + diag[row-3] + 4; 3106 for (n = 0; n<sz-1; n+=2) { 3107 i1 = idx[0]; 3108 i2 = idx[1]; 3109 idx += 2; 3110 tmp0 = x[i1]; 3111 tmp1 = x[i2]; 3112 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3113 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3114 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3115 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3116 } 3117 3118 if (n == sz-1) { 3119 tmp0 = x[*idx]; 3120 sum1 -= *v1*tmp0; 3121 sum2 -= *v2*tmp0; 3122 sum3 -= *v3*tmp0; 3123 sum4 -= *v4*tmp0; 3124 } 3125 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3126 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3127 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3128 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3129 break; 3130 case 5: 3131 3132 sum1 = xb[row]; 3133 sum2 = xb[row-1]; 3134 sum3 = xb[row-2]; 3135 sum4 = xb[row-3]; 3136 sum5 = xb[row-4]; 3137 v2 = a->a + diag[row-1] + 2; 3138 v3 = a->a + diag[row-2] + 3; 3139 v4 = a->a + diag[row-3] + 4; 3140 v5 = a->a + diag[row-4] + 5; 3141 for (n = 0; n<sz-1; n+=2) { 3142 i1 = idx[0]; 3143 i2 = idx[1]; 3144 idx += 2; 3145 tmp0 = x[i1]; 3146 tmp1 = x[i2]; 3147 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3148 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3149 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3150 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3151 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3152 } 3153 3154 if (n == sz-1) { 3155 tmp0 = x[*idx]; 3156 sum1 -= *v1*tmp0; 3157 sum2 -= *v2*tmp0; 3158 sum3 -= *v3*tmp0; 3159 sum4 -= *v4*tmp0; 3160 sum5 -= *v5*tmp0; 3161 } 3162 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3163 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3164 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3165 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3166 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3167 break; 3168 default: 3169 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3170 } 3171 } 3172 3173 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3174 } 3175 its--; 3176 } 3177 while (its--) { 3178 3179 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3180 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3181 i<m; 3182 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3183 3184 sz = diag[row] - ii[row]; 3185 v1 = a->a + ii[row]; 3186 idx = a->j + ii[row]; 3187 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3188 switch (sizes[i]) { 3189 case 1: 3190 sum1 = b[row]; 3191 for (n = 0; n<sz-1; n+=2) { 3192 i1 = idx[0]; 3193 i2 = idx[1]; 3194 idx += 2; 3195 tmp0 = x[i1]; 3196 tmp1 = x[i2]; 3197 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3198 } 3199 if (n == sz-1) { 3200 tmp0 = x[*idx++]; 3201 sum1 -= *v1 * tmp0; 3202 v1++; 3203 } 3204 t[row] = sum1; 3205 sz = ii[row+1] - diag[row] - 1; 3206 idx = a->j + diag[row] + 1; 3207 v1 += 1; 3208 for (n = 0; n<sz-1; n+=2) { 3209 i1 = idx[0]; 3210 i2 = idx[1]; 3211 idx += 2; 3212 tmp0 = x[i1]; 3213 tmp1 = x[i2]; 3214 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3215 } 3216 if (n == sz-1) { 3217 tmp0 = x[*idx++]; 3218 sum1 -= *v1 * tmp0; 3219 } 3220 /* in MatSOR_SeqAIJ this line would be 3221 * 3222 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3223 * 3224 * but omega == 1, so this becomes 3225 * 3226 * x[row] = sum1*(*ibdiag++); 3227 * 3228 */ 3229 x[row] = sum1*(*ibdiag); 3230 break; 3231 case 2: 3232 v2 = a->a + ii[row+1]; 3233 sum1 = b[row]; 3234 sum2 = b[row+1]; 3235 for (n = 0; n<sz-1; n+=2) { 3236 i1 = idx[0]; 3237 i2 = idx[1]; 3238 idx += 2; 3239 tmp0 = x[i1]; 3240 tmp1 = x[i2]; 3241 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3242 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3243 } 3244 if (n == sz-1) { 3245 tmp0 = x[*idx++]; 3246 sum1 -= v1[0] * tmp0; 3247 sum2 -= v2[0] * tmp0; 3248 v1++; v2++; 3249 } 3250 t[row] = sum1; 3251 t[row+1] = sum2; 3252 sz = ii[row+1] - diag[row] - 2; 3253 idx = a->j + diag[row] + 2; 3254 v1 += 2; 3255 v2 += 2; 3256 for (n = 0; n<sz-1; n+=2) { 3257 i1 = idx[0]; 3258 i2 = idx[1]; 3259 idx += 2; 3260 tmp0 = x[i1]; 3261 tmp1 = x[i2]; 3262 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3263 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3264 } 3265 if (n == sz-1) { 3266 tmp0 = x[*idx]; 3267 sum1 -= v1[0] * tmp0; 3268 sum2 -= v2[0] * tmp0; 3269 } 3270 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3271 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3272 break; 3273 case 3: 3274 v2 = a->a + ii[row+1]; 3275 v3 = a->a + ii[row+2]; 3276 sum1 = b[row]; 3277 sum2 = b[row+1]; 3278 sum3 = b[row+2]; 3279 for (n = 0; n<sz-1; n+=2) { 3280 i1 = idx[0]; 3281 i2 = idx[1]; 3282 idx += 2; 3283 tmp0 = x[i1]; 3284 tmp1 = x[i2]; 3285 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3286 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3287 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3288 } 3289 if (n == sz-1) { 3290 tmp0 = x[*idx++]; 3291 sum1 -= v1[0] * tmp0; 3292 sum2 -= v2[0] * tmp0; 3293 sum3 -= v3[0] * tmp0; 3294 v1++; v2++; v3++; 3295 } 3296 t[row] = sum1; 3297 t[row+1] = sum2; 3298 t[row+2] = sum3; 3299 sz = ii[row+1] - diag[row] - 3; 3300 idx = a->j + diag[row] + 3; 3301 v1 += 3; 3302 v2 += 3; 3303 v3 += 3; 3304 for (n = 0; n<sz-1; n+=2) { 3305 i1 = idx[0]; 3306 i2 = idx[1]; 3307 idx += 2; 3308 tmp0 = x[i1]; 3309 tmp1 = x[i2]; 3310 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3311 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3312 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3313 } 3314 if (n == sz-1) { 3315 tmp0 = x[*idx]; 3316 sum1 -= v1[0] * tmp0; 3317 sum2 -= v2[0] * tmp0; 3318 sum3 -= v3[0] * tmp0; 3319 } 3320 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3321 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3322 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3323 break; 3324 case 4: 3325 v2 = a->a + ii[row+1]; 3326 v3 = a->a + ii[row+2]; 3327 v4 = a->a + ii[row+3]; 3328 sum1 = b[row]; 3329 sum2 = b[row+1]; 3330 sum3 = b[row+2]; 3331 sum4 = b[row+3]; 3332 for (n = 0; n<sz-1; n+=2) { 3333 i1 = idx[0]; 3334 i2 = idx[1]; 3335 idx += 2; 3336 tmp0 = x[i1]; 3337 tmp1 = x[i2]; 3338 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3339 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3340 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3341 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3342 } 3343 if (n == sz-1) { 3344 tmp0 = x[*idx++]; 3345 sum1 -= v1[0] * tmp0; 3346 sum2 -= v2[0] * tmp0; 3347 sum3 -= v3[0] * tmp0; 3348 sum4 -= v4[0] * tmp0; 3349 v1++; v2++; v3++; v4++; 3350 } 3351 t[row] = sum1; 3352 t[row+1] = sum2; 3353 t[row+2] = sum3; 3354 t[row+3] = sum4; 3355 sz = ii[row+1] - diag[row] - 4; 3356 idx = a->j + diag[row] + 4; 3357 v1 += 4; 3358 v2 += 4; 3359 v3 += 4; 3360 v4 += 4; 3361 for (n = 0; n<sz-1; n+=2) { 3362 i1 = idx[0]; 3363 i2 = idx[1]; 3364 idx += 2; 3365 tmp0 = x[i1]; 3366 tmp1 = x[i2]; 3367 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3368 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3369 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3370 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3371 } 3372 if (n == sz-1) { 3373 tmp0 = x[*idx]; 3374 sum1 -= v1[0] * tmp0; 3375 sum2 -= v2[0] * tmp0; 3376 sum3 -= v3[0] * tmp0; 3377 sum4 -= v4[0] * tmp0; 3378 } 3379 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3380 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3381 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3382 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3383 break; 3384 case 5: 3385 v2 = a->a + ii[row+1]; 3386 v3 = a->a + ii[row+2]; 3387 v4 = a->a + ii[row+3]; 3388 v5 = a->a + ii[row+4]; 3389 sum1 = b[row]; 3390 sum2 = b[row+1]; 3391 sum3 = b[row+2]; 3392 sum4 = b[row+3]; 3393 sum5 = b[row+4]; 3394 for (n = 0; n<sz-1; n+=2) { 3395 i1 = idx[0]; 3396 i2 = idx[1]; 3397 idx += 2; 3398 tmp0 = x[i1]; 3399 tmp1 = x[i2]; 3400 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3401 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3402 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3403 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3404 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3405 } 3406 if (n == sz-1) { 3407 tmp0 = x[*idx++]; 3408 sum1 -= v1[0] * tmp0; 3409 sum2 -= v2[0] * tmp0; 3410 sum3 -= v3[0] * tmp0; 3411 sum4 -= v4[0] * tmp0; 3412 sum5 -= v5[0] * tmp0; 3413 v1++; v2++; v3++; v4++; v5++; 3414 } 3415 t[row] = sum1; 3416 t[row+1] = sum2; 3417 t[row+2] = sum3; 3418 t[row+3] = sum4; 3419 t[row+4] = sum5; 3420 sz = ii[row+1] - diag[row] - 5; 3421 idx = a->j + diag[row] + 5; 3422 v1 += 5; 3423 v2 += 5; 3424 v3 += 5; 3425 v4 += 5; 3426 v5 += 5; 3427 for (n = 0; n<sz-1; n+=2) { 3428 i1 = idx[0]; 3429 i2 = idx[1]; 3430 idx += 2; 3431 tmp0 = x[i1]; 3432 tmp1 = x[i2]; 3433 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3434 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3435 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3436 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3437 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3438 } 3439 if (n == sz-1) { 3440 tmp0 = x[*idx]; 3441 sum1 -= v1[0] * tmp0; 3442 sum2 -= v2[0] * tmp0; 3443 sum3 -= v3[0] * tmp0; 3444 sum4 -= v4[0] * tmp0; 3445 sum5 -= v5[0] * tmp0; 3446 } 3447 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3448 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3449 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3450 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3451 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3452 break; 3453 default: 3454 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3455 } 3456 } 3457 xb = t; 3458 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3459 } else xb = b; 3460 3461 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3462 3463 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3464 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3465 ibdiag -= sizes[i]*sizes[i]; 3466 3467 /* set RHS */ 3468 if (xb == b) { 3469 /* whole (old way) */ 3470 sz = ii[row+1] - ii[row]; 3471 idx = a->j + ii[row]; 3472 switch (sizes[i]) { 3473 case 5: 3474 v5 = a->a + ii[row-4]; 3475 case 4: /* fall through */ 3476 v4 = a->a + ii[row-3]; 3477 case 3: 3478 v3 = a->a + ii[row-2]; 3479 case 2: 3480 v2 = a->a + ii[row-1]; 3481 case 1: 3482 v1 = a->a + ii[row]; 3483 break; 3484 default: 3485 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3486 } 3487 } else { 3488 /* upper, no diag */ 3489 sz = ii[row+1] - diag[row] - 1; 3490 idx = a->j + diag[row] + 1; 3491 switch (sizes[i]) { 3492 case 5: 3493 v5 = a->a + diag[row-4] + 5; 3494 case 4: /* fall through */ 3495 v4 = a->a + diag[row-3] + 4; 3496 case 3: 3497 v3 = a->a + diag[row-2] + 3; 3498 case 2: 3499 v2 = a->a + diag[row-1] + 2; 3500 case 1: 3501 v1 = a->a + diag[row] + 1; 3502 } 3503 } 3504 /* set sum */ 3505 switch (sizes[i]) { 3506 case 5: 3507 sum5 = xb[row-4]; 3508 case 4: /* fall through */ 3509 sum4 = xb[row-3]; 3510 case 3: 3511 sum3 = xb[row-2]; 3512 case 2: 3513 sum2 = xb[row-1]; 3514 case 1: 3515 /* note that sum1 is associated with the last row */ 3516 sum1 = xb[row]; 3517 } 3518 /* do sums */ 3519 for (n = 0; n<sz-1; n+=2) { 3520 i1 = idx[0]; 3521 i2 = idx[1]; 3522 idx += 2; 3523 tmp0 = x[i1]; 3524 tmp1 = x[i2]; 3525 switch (sizes[i]) { 3526 case 5: 3527 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3528 case 4: /* fall through */ 3529 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3530 case 3: 3531 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3532 case 2: 3533 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3534 case 1: 3535 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3536 } 3537 } 3538 /* ragged edge */ 3539 if (n == sz-1) { 3540 tmp0 = x[*idx]; 3541 switch (sizes[i]) { 3542 case 5: 3543 sum5 -= *v5*tmp0; 3544 case 4: /* fall through */ 3545 sum4 -= *v4*tmp0; 3546 case 3: 3547 sum3 -= *v3*tmp0; 3548 case 2: 3549 sum2 -= *v2*tmp0; 3550 case 1: 3551 sum1 -= *v1*tmp0; 3552 } 3553 } 3554 /* update */ 3555 if (xb == b) { 3556 /* whole (old way) w/ diag */ 3557 switch (sizes[i]) { 3558 case 5: 3559 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3560 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3561 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3562 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3563 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3564 break; 3565 case 4: 3566 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3567 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3568 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3569 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3570 break; 3571 case 3: 3572 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3573 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3574 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3575 break; 3576 case 2: 3577 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3578 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3579 break; 3580 case 1: 3581 x[row--] += sum1*(*ibdiag); 3582 break; 3583 } 3584 } else { 3585 /* no diag so set = */ 3586 switch (sizes[i]) { 3587 case 5: 3588 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3589 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3590 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3591 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3592 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3593 break; 3594 case 4: 3595 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3596 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3597 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3598 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3599 break; 3600 case 3: 3601 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3602 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3603 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3604 break; 3605 case 2: 3606 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3607 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3608 break; 3609 case 1: 3610 x[row--] = sum1*(*ibdiag); 3611 break; 3612 } 3613 } 3614 } 3615 if (xb == b) { 3616 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3617 } else { 3618 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3619 } 3620 } 3621 } 3622 if (flag & SOR_EISENSTAT) { 3623 /* 3624 Apply (U + D)^-1 where D is now the block diagonal 3625 */ 3626 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3627 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3628 ibdiag -= sizes[i]*sizes[i]; 3629 sz = ii[row+1] - diag[row] - 1; 3630 v1 = a->a + diag[row] + 1; 3631 idx = a->j + diag[row] + 1; 3632 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3633 switch (sizes[i]) { 3634 case 1: 3635 3636 sum1 = b[row]; 3637 for (n = 0; n<sz-1; n+=2) { 3638 i1 = idx[0]; 3639 i2 = idx[1]; 3640 idx += 2; 3641 tmp0 = x[i1]; 3642 tmp1 = x[i2]; 3643 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3644 } 3645 3646 if (n == sz-1) { 3647 tmp0 = x[*idx]; 3648 sum1 -= *v1*tmp0; 3649 } 3650 x[row] = sum1*(*ibdiag);row--; 3651 break; 3652 3653 case 2: 3654 3655 sum1 = b[row]; 3656 sum2 = b[row-1]; 3657 /* note that sum1 is associated with the second of the two rows */ 3658 v2 = a->a + diag[row-1] + 2; 3659 for (n = 0; n<sz-1; n+=2) { 3660 i1 = idx[0]; 3661 i2 = idx[1]; 3662 idx += 2; 3663 tmp0 = x[i1]; 3664 tmp1 = x[i2]; 3665 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3666 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3667 } 3668 3669 if (n == sz-1) { 3670 tmp0 = x[*idx]; 3671 sum1 -= *v1*tmp0; 3672 sum2 -= *v2*tmp0; 3673 } 3674 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3675 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3676 row -= 2; 3677 break; 3678 case 3: 3679 3680 sum1 = b[row]; 3681 sum2 = b[row-1]; 3682 sum3 = b[row-2]; 3683 v2 = a->a + diag[row-1] + 2; 3684 v3 = a->a + diag[row-2] + 3; 3685 for (n = 0; n<sz-1; n+=2) { 3686 i1 = idx[0]; 3687 i2 = idx[1]; 3688 idx += 2; 3689 tmp0 = x[i1]; 3690 tmp1 = x[i2]; 3691 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3692 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3693 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3694 } 3695 3696 if (n == sz-1) { 3697 tmp0 = x[*idx]; 3698 sum1 -= *v1*tmp0; 3699 sum2 -= *v2*tmp0; 3700 sum3 -= *v3*tmp0; 3701 } 3702 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3703 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3704 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3705 row -= 3; 3706 break; 3707 case 4: 3708 3709 sum1 = b[row]; 3710 sum2 = b[row-1]; 3711 sum3 = b[row-2]; 3712 sum4 = b[row-3]; 3713 v2 = a->a + diag[row-1] + 2; 3714 v3 = a->a + diag[row-2] + 3; 3715 v4 = a->a + diag[row-3] + 4; 3716 for (n = 0; n<sz-1; n+=2) { 3717 i1 = idx[0]; 3718 i2 = idx[1]; 3719 idx += 2; 3720 tmp0 = x[i1]; 3721 tmp1 = x[i2]; 3722 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3723 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3724 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3725 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3726 } 3727 3728 if (n == sz-1) { 3729 tmp0 = x[*idx]; 3730 sum1 -= *v1*tmp0; 3731 sum2 -= *v2*tmp0; 3732 sum3 -= *v3*tmp0; 3733 sum4 -= *v4*tmp0; 3734 } 3735 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3736 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3737 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3738 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3739 row -= 4; 3740 break; 3741 case 5: 3742 3743 sum1 = b[row]; 3744 sum2 = b[row-1]; 3745 sum3 = b[row-2]; 3746 sum4 = b[row-3]; 3747 sum5 = b[row-4]; 3748 v2 = a->a + diag[row-1] + 2; 3749 v3 = a->a + diag[row-2] + 3; 3750 v4 = a->a + diag[row-3] + 4; 3751 v5 = a->a + diag[row-4] + 5; 3752 for (n = 0; n<sz-1; n+=2) { 3753 i1 = idx[0]; 3754 i2 = idx[1]; 3755 idx += 2; 3756 tmp0 = x[i1]; 3757 tmp1 = x[i2]; 3758 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3759 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3760 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3761 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3762 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3763 } 3764 3765 if (n == sz-1) { 3766 tmp0 = x[*idx]; 3767 sum1 -= *v1*tmp0; 3768 sum2 -= *v2*tmp0; 3769 sum3 -= *v3*tmp0; 3770 sum4 -= *v4*tmp0; 3771 sum5 -= *v5*tmp0; 3772 } 3773 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3774 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3775 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3776 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3777 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3778 row -= 5; 3779 break; 3780 default: 3781 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3782 } 3783 } 3784 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3785 3786 /* 3787 t = b - D x where D is the block diagonal 3788 */ 3789 cnt = 0; 3790 for (i=0, row=0; i<m; i++) { 3791 switch (sizes[i]) { 3792 case 1: 3793 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3794 break; 3795 case 2: 3796 x1 = x[row]; x2 = x[row+1]; 3797 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3798 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3799 t[row] = b[row] - tmp1; 3800 t[row+1] = b[row+1] - tmp2; row += 2; 3801 cnt += 4; 3802 break; 3803 case 3: 3804 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3805 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3806 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3807 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3808 t[row] = b[row] - tmp1; 3809 t[row+1] = b[row+1] - tmp2; 3810 t[row+2] = b[row+2] - tmp3; row += 3; 3811 cnt += 9; 3812 break; 3813 case 4: 3814 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3815 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3816 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3817 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3818 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3819 t[row] = b[row] - tmp1; 3820 t[row+1] = b[row+1] - tmp2; 3821 t[row+2] = b[row+2] - tmp3; 3822 t[row+3] = b[row+3] - tmp4; row += 4; 3823 cnt += 16; 3824 break; 3825 case 5: 3826 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3827 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3828 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3829 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3830 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3831 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3832 t[row] = b[row] - tmp1; 3833 t[row+1] = b[row+1] - tmp2; 3834 t[row+2] = b[row+2] - tmp3; 3835 t[row+3] = b[row+3] - tmp4; 3836 t[row+4] = b[row+4] - tmp5;row += 5; 3837 cnt += 25; 3838 break; 3839 default: 3840 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3841 } 3842 } 3843 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3844 3845 3846 3847 /* 3848 Apply (L + D)^-1 where D is the block diagonal 3849 */ 3850 for (i=0, row=0; i<m; i++) { 3851 sz = diag[row] - ii[row]; 3852 v1 = a->a + ii[row]; 3853 idx = a->j + ii[row]; 3854 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3855 switch (sizes[i]) { 3856 case 1: 3857 3858 sum1 = t[row]; 3859 for (n = 0; n<sz-1; n+=2) { 3860 i1 = idx[0]; 3861 i2 = idx[1]; 3862 idx += 2; 3863 tmp0 = t[i1]; 3864 tmp1 = t[i2]; 3865 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3866 } 3867 3868 if (n == sz-1) { 3869 tmp0 = t[*idx]; 3870 sum1 -= *v1 * tmp0; 3871 } 3872 x[row] += t[row] = sum1*(*ibdiag++); row++; 3873 break; 3874 case 2: 3875 v2 = a->a + ii[row+1]; 3876 sum1 = t[row]; 3877 sum2 = t[row+1]; 3878 for (n = 0; n<sz-1; n+=2) { 3879 i1 = idx[0]; 3880 i2 = idx[1]; 3881 idx += 2; 3882 tmp0 = t[i1]; 3883 tmp1 = t[i2]; 3884 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3885 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3886 } 3887 3888 if (n == sz-1) { 3889 tmp0 = t[*idx]; 3890 sum1 -= v1[0] * tmp0; 3891 sum2 -= v2[0] * tmp0; 3892 } 3893 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3894 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3895 ibdiag += 4; row += 2; 3896 break; 3897 case 3: 3898 v2 = a->a + ii[row+1]; 3899 v3 = a->a + ii[row+2]; 3900 sum1 = t[row]; 3901 sum2 = t[row+1]; 3902 sum3 = t[row+2]; 3903 for (n = 0; n<sz-1; n+=2) { 3904 i1 = idx[0]; 3905 i2 = idx[1]; 3906 idx += 2; 3907 tmp0 = t[i1]; 3908 tmp1 = t[i2]; 3909 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3910 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3911 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3912 } 3913 3914 if (n == sz-1) { 3915 tmp0 = t[*idx]; 3916 sum1 -= v1[0] * tmp0; 3917 sum2 -= v2[0] * tmp0; 3918 sum3 -= v3[0] * tmp0; 3919 } 3920 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3921 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3922 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3923 ibdiag += 9; row += 3; 3924 break; 3925 case 4: 3926 v2 = a->a + ii[row+1]; 3927 v3 = a->a + ii[row+2]; 3928 v4 = a->a + ii[row+3]; 3929 sum1 = t[row]; 3930 sum2 = t[row+1]; 3931 sum3 = t[row+2]; 3932 sum4 = t[row+3]; 3933 for (n = 0; n<sz-1; n+=2) { 3934 i1 = idx[0]; 3935 i2 = idx[1]; 3936 idx += 2; 3937 tmp0 = t[i1]; 3938 tmp1 = t[i2]; 3939 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3940 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3941 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3942 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3943 } 3944 3945 if (n == sz-1) { 3946 tmp0 = t[*idx]; 3947 sum1 -= v1[0] * tmp0; 3948 sum2 -= v2[0] * tmp0; 3949 sum3 -= v3[0] * tmp0; 3950 sum4 -= v4[0] * tmp0; 3951 } 3952 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3953 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3954 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3955 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3956 ibdiag += 16; row += 4; 3957 break; 3958 case 5: 3959 v2 = a->a + ii[row+1]; 3960 v3 = a->a + ii[row+2]; 3961 v4 = a->a + ii[row+3]; 3962 v5 = a->a + ii[row+4]; 3963 sum1 = t[row]; 3964 sum2 = t[row+1]; 3965 sum3 = t[row+2]; 3966 sum4 = t[row+3]; 3967 sum5 = t[row+4]; 3968 for (n = 0; n<sz-1; n+=2) { 3969 i1 = idx[0]; 3970 i2 = idx[1]; 3971 idx += 2; 3972 tmp0 = t[i1]; 3973 tmp1 = t[i2]; 3974 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3975 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3976 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3977 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3978 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3979 } 3980 3981 if (n == sz-1) { 3982 tmp0 = t[*idx]; 3983 sum1 -= v1[0] * tmp0; 3984 sum2 -= v2[0] * tmp0; 3985 sum3 -= v3[0] * tmp0; 3986 sum4 -= v4[0] * tmp0; 3987 sum5 -= v5[0] * tmp0; 3988 } 3989 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3990 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3991 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3992 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3993 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3994 ibdiag += 25; row += 5; 3995 break; 3996 default: 3997 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3998 } 3999 } 4000 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 4001 } 4002 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4003 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4004 PetscFunctionReturn(0); 4005 } 4006 4007 #undef __FUNCT__ 4008 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 4009 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 4010 { 4011 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4012 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 4013 const MatScalar *bdiag = a->inode.bdiag; 4014 const PetscScalar *b; 4015 PetscErrorCode ierr; 4016 PetscInt m = a->inode.node_count,cnt = 0,i,row; 4017 const PetscInt *sizes = a->inode.size; 4018 4019 PetscFunctionBegin; 4020 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4021 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4022 cnt = 0; 4023 for (i=0, row=0; i<m; i++) { 4024 switch (sizes[i]) { 4025 case 1: 4026 x[row] = b[row]*bdiag[cnt++];row++; 4027 break; 4028 case 2: 4029 x1 = b[row]; x2 = b[row+1]; 4030 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 4031 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 4032 x[row++] = tmp1; 4033 x[row++] = tmp2; 4034 cnt += 4; 4035 break; 4036 case 3: 4037 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4038 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4039 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4040 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4041 x[row++] = tmp1; 4042 x[row++] = tmp2; 4043 x[row++] = tmp3; 4044 cnt += 9; 4045 break; 4046 case 4: 4047 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4048 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4049 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4050 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4051 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4052 x[row++] = tmp1; 4053 x[row++] = tmp2; 4054 x[row++] = tmp3; 4055 x[row++] = tmp4; 4056 cnt += 16; 4057 break; 4058 case 5: 4059 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4060 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4061 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4062 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4063 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4064 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4065 x[row++] = tmp1; 4066 x[row++] = tmp2; 4067 x[row++] = tmp3; 4068 x[row++] = tmp4; 4069 x[row++] = tmp5; 4070 cnt += 25; 4071 break; 4072 default: 4073 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4074 } 4075 } 4076 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4077 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4078 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4079 PetscFunctionReturn(0); 4080 } 4081 4082 /* 4083 samestructure indicates that the matrix has not changed its nonzero structure so we 4084 do not need to recompute the inodes 4085 */ 4086 #undef __FUNCT__ 4087 #define __FUNCT__ "MatSeqAIJCheckInode" 4088 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4089 { 4090 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4091 PetscErrorCode ierr; 4092 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4093 PetscBool flag; 4094 const PetscInt *idx,*idy,*ii; 4095 4096 PetscFunctionBegin; 4097 if (!a->inode.use) PetscFunctionReturn(0); 4098 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4099 4100 m = A->rmap->n; 4101 if (a->inode.size) ns = a->inode.size; 4102 else { 4103 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4104 } 4105 4106 i = 0; 4107 node_count = 0; 4108 idx = a->j; 4109 ii = a->i; 4110 while (i < m) { /* For each row */ 4111 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4112 /* Limits the number of elements in a node to 'a->inode.limit' */ 4113 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4114 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4115 if (nzy != nzx) break; 4116 idy += nzx; /* Same nonzero pattern */ 4117 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4118 if (!flag) break; 4119 } 4120 ns[node_count++] = blk_size; 4121 idx += blk_size*nzx; 4122 i = j; 4123 } 4124 /* If not enough inodes found,, do not use inode version of the routines */ 4125 if (!m || node_count > .8*m) { 4126 ierr = PetscFree(ns);CHKERRQ(ierr); 4127 4128 a->inode.node_count = 0; 4129 a->inode.size = NULL; 4130 a->inode.use = PETSC_FALSE; 4131 A->ops->mult = MatMult_SeqAIJ; 4132 A->ops->sor = MatSOR_SeqAIJ; 4133 A->ops->multadd = MatMultAdd_SeqAIJ; 4134 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4135 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4136 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4137 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4138 A->ops->coloringpatch = 0; 4139 A->ops->multdiagonalblock = 0; 4140 4141 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4142 } else { 4143 if (!A->factortype) { 4144 A->ops->mult = MatMult_SeqAIJ_Inode; 4145 A->ops->sor = MatSOR_SeqAIJ_Inode; 4146 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4147 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4148 if (A->rmap->n == A->cmap->n) { 4149 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4150 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4151 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4152 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4153 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4154 } 4155 } else { 4156 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4157 } 4158 a->inode.node_count = node_count; 4159 a->inode.size = ns; 4160 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4161 } 4162 a->inode.checked = PETSC_TRUE; 4163 a->inode.mat_nonzerostate = A->nonzerostate; 4164 PetscFunctionReturn(0); 4165 } 4166 4167 #undef __FUNCT__ 4168 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4169 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4170 { 4171 Mat B =*C; 4172 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4173 PetscErrorCode ierr; 4174 PetscInt m=A->rmap->n; 4175 4176 PetscFunctionBegin; 4177 c->inode.use = a->inode.use; 4178 c->inode.limit = a->inode.limit; 4179 c->inode.max_limit = a->inode.max_limit; 4180 if (a->inode.size) { 4181 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4182 c->inode.node_count = a->inode.node_count; 4183 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4184 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4185 if (!B->factortype) { 4186 B->ops->mult = MatMult_SeqAIJ_Inode; 4187 B->ops->sor = MatSOR_SeqAIJ_Inode; 4188 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4189 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4190 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4191 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4192 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4193 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4194 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4195 } else { 4196 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4197 } 4198 } else { 4199 c->inode.size = 0; 4200 c->inode.node_count = 0; 4201 } 4202 c->inode.ibdiagvalid = PETSC_FALSE; 4203 c->inode.ibdiag = 0; 4204 c->inode.bdiag = 0; 4205 PetscFunctionReturn(0); 4206 } 4207 4208 #undef __FUNCT__ 4209 #define __FUNCT__ "MatGetRow_FactoredLU" 4210 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4211 { 4212 PetscInt k; 4213 const PetscInt *vi; 4214 4215 PetscFunctionBegin; 4216 vi = aj + ai[row]; 4217 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4218 vi = aj + adiag[row]; 4219 cols[nzl] = vi[0]; 4220 vi = aj + adiag[row+1]+1; 4221 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4222 PetscFunctionReturn(0); 4223 } 4224 /* 4225 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4226 Modified from MatSeqAIJCheckInode(). 4227 4228 Input Parameters: 4229 . Mat A - ILU or LU matrix factor 4230 4231 */ 4232 #undef __FUNCT__ 4233 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU" 4234 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4235 { 4236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4237 PetscErrorCode ierr; 4238 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4239 PetscInt *cols1,*cols2,*ns; 4240 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4241 PetscBool flag; 4242 4243 PetscFunctionBegin; 4244 if (!a->inode.use) PetscFunctionReturn(0); 4245 if (a->inode.checked) PetscFunctionReturn(0); 4246 4247 m = A->rmap->n; 4248 if (a->inode.size) ns = a->inode.size; 4249 else { 4250 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4251 } 4252 4253 i = 0; 4254 node_count = 0; 4255 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4256 while (i < m) { /* For each row */ 4257 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4258 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4259 nzx = nzl1 + nzu1 + 1; 4260 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4261 4262 /* Limits the number of elements in a node to 'a->inode.limit' */ 4263 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4264 nzl2 = ai[j+1] - ai[j]; 4265 nzu2 = adiag[j] - adiag[j+1] - 1; 4266 nzy = nzl2 + nzu2 + 1; 4267 if (nzy != nzx) break; 4268 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4269 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4270 if (!flag) break; 4271 } 4272 ns[node_count++] = blk_size; 4273 i = j; 4274 } 4275 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4276 /* If not enough inodes found,, do not use inode version of the routines */ 4277 if (!m || node_count > .8*m) { 4278 ierr = PetscFree(ns);CHKERRQ(ierr); 4279 4280 a->inode.node_count = 0; 4281 a->inode.size = NULL; 4282 a->inode.use = PETSC_FALSE; 4283 4284 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4285 } else { 4286 A->ops->mult = 0; 4287 A->ops->sor = 0; 4288 A->ops->multadd = 0; 4289 A->ops->getrowij = 0; 4290 A->ops->restorerowij = 0; 4291 A->ops->getcolumnij = 0; 4292 A->ops->restorecolumnij = 0; 4293 A->ops->coloringpatch = 0; 4294 A->ops->multdiagonalblock = 0; 4295 a->inode.node_count = node_count; 4296 a->inode.size = ns; 4297 4298 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4299 } 4300 a->inode.checked = PETSC_TRUE; 4301 PetscFunctionReturn(0); 4302 } 4303 4304 #undef __FUNCT__ 4305 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4306 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4307 { 4308 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4309 4310 PetscFunctionBegin; 4311 a->inode.ibdiagvalid = PETSC_FALSE; 4312 PetscFunctionReturn(0); 4313 } 4314 4315 /* 4316 This is really ugly. if inodes are used this replaces the 4317 permutations with ones that correspond to rows/cols of the matrix 4318 rather then inode blocks 4319 */ 4320 #undef __FUNCT__ 4321 #define __FUNCT__ "MatInodeAdjustForInodes" 4322 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4323 { 4324 PetscErrorCode ierr; 4325 4326 PetscFunctionBegin; 4327 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4328 PetscFunctionReturn(0); 4329 } 4330 4331 #undef __FUNCT__ 4332 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4333 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4334 { 4335 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4336 PetscErrorCode ierr; 4337 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4338 const PetscInt *ridx,*cidx; 4339 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4340 PetscInt nslim_col,*ns_col; 4341 IS ris = *rperm,cis = *cperm; 4342 4343 PetscFunctionBegin; 4344 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4345 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4346 4347 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4348 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4349 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4350 4351 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4352 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4353 4354 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4355 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4356 4357 /* Construct the permutations for rows*/ 4358 for (i=0,row = 0; i<nslim_row; ++i) { 4359 indx = ridx[i]; 4360 start_val = tns[indx]; 4361 end_val = tns[indx + 1]; 4362 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4363 } 4364 4365 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4366 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4367 4368 /* Construct permutations for columns */ 4369 for (i=0,col=0; i<nslim_col; ++i) { 4370 indx = cidx[i]; 4371 start_val = tns[indx]; 4372 end_val = tns[indx + 1]; 4373 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4374 } 4375 4376 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4377 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4378 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4379 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4380 4381 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4382 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4383 4384 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4385 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4386 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4387 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4388 ierr = PetscFree(tns);CHKERRQ(ierr); 4389 PetscFunctionReturn(0); 4390 } 4391 4392 #undef __FUNCT__ 4393 #define __FUNCT__ "MatInodeGetInodeSizes" 4394 /*@C 4395 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4396 4397 Not Collective 4398 4399 Input Parameter: 4400 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4401 4402 Output Parameter: 4403 + node_count - no of inodes present in the matrix. 4404 . sizes - an array of size node_count,with sizes of each inode. 4405 - limit - the max size used to generate the inodes. 4406 4407 Level: advanced 4408 4409 Notes: This routine returns some internal storage information 4410 of the matrix, it is intended to be used by advanced users. 4411 It should be called after the matrix is assembled. 4412 The contents of the sizes[] array should not be changed. 4413 NULL may be passed for information not requested. 4414 4415 .keywords: matrix, seqaij, get, inode 4416 4417 .seealso: MatGetInfo() 4418 @*/ 4419 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4420 { 4421 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4422 4423 PetscFunctionBegin; 4424 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4425 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4426 if (f) { 4427 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4428 } 4429 PetscFunctionReturn(0); 4430 } 4431 4432 #undef __FUNCT__ 4433 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4434 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4435 { 4436 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4437 4438 PetscFunctionBegin; 4439 if (node_count) *node_count = a->inode.node_count; 4440 if (sizes) *sizes = a->inode.size; 4441 if (limit) *limit = a->inode.limit; 4442 PetscFunctionReturn(0); 4443 } 4444