1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 67 68 PetscFunctionBegin; 69 nslim_row = a->inode.node_count; 70 m = A->rmap->n; 71 n = A->cmap->n; 72 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 73 74 /* Use the row_inode as column_inode */ 75 nslim_col = nslim_row; 76 ns_col = ns_row; 77 78 /* allocate space for reformated inode structure */ 79 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 80 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 81 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 82 83 for (i1=0,col=0; i1<nslim_col; ++i1) { 84 nsz = ns_col[i1]; 85 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 86 } 87 /* allocate space for row pointers */ 88 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 89 *iia = ia; 90 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 91 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 92 93 /* determine the number of columns in each row */ 94 ia[0] = oshift; 95 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 96 97 j = aj + ai[row] + ishift; 98 jmax = aj + ai[row+1] + ishift; 99 col = *j++ + ishift; 100 i2 = tvc[col]; 101 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 102 ia[i1+1]++; 103 ia[i2+1]++; 104 i2++; /* Start col of next node */ 105 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 106 i2 = tvc[col]; 107 } 108 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 109 } 110 111 /* shift ia[i] to point to next row */ 112 for (i1=1; i1<nslim_row+1; i1++) { 113 row = ia[i1-1]; 114 ia[i1] += row; 115 work[i1-1] = row - oshift; 116 } 117 118 /* allocate space for column pointers */ 119 nz = ia[nslim_row] + (!ishift); 120 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 121 *jja = ja; 122 123 /* loop over lower triangular part putting into ja */ 124 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 125 j = aj + ai[row] + ishift; 126 jmax = aj + ai[row+1] + ishift; 127 col = *j++ + ishift; 128 i2 = tvc[col]; 129 while (i2<i1 && j<jmax) { 130 ja[work[i2]++] = i1 + oshift; 131 ja[work[i1]++] = i2 + oshift; 132 ++i2; 133 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 134 i2 = tvc[col]; 135 } 136 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 137 138 } 139 ierr = PetscFree(work);CHKERRQ(ierr); 140 ierr = PetscFree(tns);CHKERRQ(ierr); 141 ierr = PetscFree(tvc);CHKERRQ(ierr); 142 PetscFunctionReturn(0); 143 } 144 145 /* 146 This builds nonsymmetric version of nonzero structure, 147 */ 148 #undef __FUNCT__ 149 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 150 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 151 { 152 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 153 PetscErrorCode ierr; 154 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 155 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 156 157 PetscFunctionBegin; 158 nslim_row = a->inode.node_count; 159 n = A->cmap->n; 160 161 /* Create The column_inode for this matrix */ 162 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 163 164 /* allocate space for reformated column_inode structure */ 165 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 166 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 167 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 168 169 for (i1=0,col=0; i1<nslim_col; ++i1) { 170 nsz = ns_col[i1]; 171 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 172 } 173 /* allocate space for row pointers */ 174 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 175 *iia = ia; 176 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 177 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 178 179 /* determine the number of columns in each row */ 180 ia[0] = oshift; 181 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 182 j = aj + ai[row] + ishift; 183 col = *j++ + ishift; 184 i2 = tvc[col]; 185 nz = ai[row+1] - ai[row]; 186 while (nz-- > 0) { /* off-diagonal elemets */ 187 ia[i1+1]++; 188 i2++; /* Start col of next node */ 189 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 190 if (nz > 0) i2 = tvc[col]; 191 } 192 } 193 194 /* shift ia[i] to point to next row */ 195 for (i1=1; i1<nslim_row+1; i1++) { 196 row = ia[i1-1]; 197 ia[i1] += row; 198 work[i1-1] = row - oshift; 199 } 200 201 /* allocate space for column pointers */ 202 nz = ia[nslim_row] + (!ishift); 203 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 204 *jja = ja; 205 206 /* loop over matrix putting into ja */ 207 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 208 j = aj + ai[row] + ishift; 209 col = *j++ + ishift; 210 i2 = tvc[col]; 211 nz = ai[row+1] - ai[row]; 212 while (nz-- > 0) { 213 ja[work[i1]++] = i2 + oshift; 214 ++i2; 215 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 216 if (nz > 0) i2 = tvc[col]; 217 } 218 } 219 ierr = PetscFree(ns_col);CHKERRQ(ierr); 220 ierr = PetscFree(work);CHKERRQ(ierr); 221 ierr = PetscFree(tns);CHKERRQ(ierr); 222 ierr = PetscFree(tvc);CHKERRQ(ierr); 223 PetscFunctionReturn(0); 224 } 225 226 #undef __FUNCT__ 227 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 228 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 229 { 230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 231 PetscErrorCode ierr; 232 233 PetscFunctionBegin; 234 *n = a->inode.node_count; 235 if (!ia) PetscFunctionReturn(0); 236 if (!blockcompressed) { 237 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 238 } else if (symmetric) { 239 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 240 } else { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } 243 PetscFunctionReturn(0); 244 } 245 246 #undef __FUNCT__ 247 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 248 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 249 { 250 PetscErrorCode ierr; 251 252 PetscFunctionBegin; 253 if (!ia) PetscFunctionReturn(0); 254 255 if (!blockcompressed) { 256 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 257 } else { 258 ierr = PetscFree(*ia);CHKERRQ(ierr); 259 ierr = PetscFree(*ja);CHKERRQ(ierr); 260 } 261 PetscFunctionReturn(0); 262 } 263 264 /* ----------------------------------------------------------- */ 265 266 #undef __FUNCT__ 267 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 268 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 269 { 270 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 271 PetscErrorCode ierr; 272 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 273 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 274 275 PetscFunctionBegin; 276 nslim_row = a->inode.node_count; 277 n = A->cmap->n; 278 279 /* Create The column_inode for this matrix */ 280 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 281 282 /* allocate space for reformated column_inode structure */ 283 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 284 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 285 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 286 287 for (i1=0,col=0; i1<nslim_col; ++i1) { 288 nsz = ns_col[i1]; 289 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 290 } 291 /* allocate space for column pointers */ 292 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 293 *iia = ia; 294 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 295 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 296 297 /* determine the number of columns in each row */ 298 ia[0] = oshift; 299 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 300 j = aj + ai[row] + ishift; 301 col = *j++ + ishift; 302 i2 = tvc[col]; 303 nz = ai[row+1] - ai[row]; 304 while (nz-- > 0) { /* off-diagonal elemets */ 305 /* ia[i1+1]++; */ 306 ia[i2+1]++; 307 i2++; 308 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 309 if (nz > 0) i2 = tvc[col]; 310 } 311 } 312 313 /* shift ia[i] to point to next col */ 314 for (i1=1; i1<nslim_col+1; i1++) { 315 col = ia[i1-1]; 316 ia[i1] += col; 317 work[i1-1] = col - oshift; 318 } 319 320 /* allocate space for column pointers */ 321 nz = ia[nslim_col] + (!ishift); 322 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 323 *jja = ja; 324 325 /* loop over matrix putting into ja */ 326 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 327 j = aj + ai[row] + ishift; 328 col = *j++ + ishift; 329 i2 = tvc[col]; 330 nz = ai[row+1] - ai[row]; 331 while (nz-- > 0) { 332 /* ja[work[i1]++] = i2 + oshift; */ 333 ja[work[i2]++] = i1 + oshift; 334 i2++; 335 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 336 if (nz > 0) i2 = tvc[col]; 337 } 338 } 339 ierr = PetscFree(ns_col);CHKERRQ(ierr); 340 ierr = PetscFree(work);CHKERRQ(ierr); 341 ierr = PetscFree(tns);CHKERRQ(ierr); 342 ierr = PetscFree(tvc);CHKERRQ(ierr); 343 PetscFunctionReturn(0); 344 } 345 346 #undef __FUNCT__ 347 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 348 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 349 { 350 PetscErrorCode ierr; 351 352 PetscFunctionBegin; 353 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 354 if (!ia) PetscFunctionReturn(0); 355 356 if (!blockcompressed) { 357 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 358 } else if (symmetric) { 359 /* Since the indices are symmetric it does'nt matter */ 360 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 361 } else { 362 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } 364 PetscFunctionReturn(0); 365 } 366 367 #undef __FUNCT__ 368 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 369 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 370 { 371 PetscErrorCode ierr; 372 373 PetscFunctionBegin; 374 if (!ia) PetscFunctionReturn(0); 375 if (!blockcompressed) { 376 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 377 } else { 378 ierr = PetscFree(*ia);CHKERRQ(ierr); 379 ierr = PetscFree(*ja);CHKERRQ(ierr); 380 } 381 PetscFunctionReturn(0); 382 } 383 384 /* ----------------------------------------------------------- */ 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 388 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 389 { 390 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 391 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 392 PetscScalar *y; 393 const PetscScalar *x; 394 const MatScalar *v1,*v2,*v3,*v4,*v5; 395 PetscErrorCode ierr; 396 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 397 398 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 399 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 400 #endif 401 402 PetscFunctionBegin; 403 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 404 node_max = a->inode.node_count; 405 ns = a->inode.size; /* Node Size array */ 406 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 407 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 408 idx = a->j; 409 v1 = a->a; 410 ii = a->i; 411 412 for (i = 0,row = 0; i< node_max; ++i) { 413 nsz = ns[i]; 414 n = ii[1] - ii[0]; 415 nonzerorow += (n>0)*nsz; 416 ii += nsz; 417 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 418 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 419 sz = n; /* No of non zeros in this row */ 420 /* Switch on the size of Node */ 421 switch (nsz) { /* Each loop in 'case' is unrolled */ 422 case 1: 423 sum1 = 0.; 424 425 for (n = 0; n< sz-1; n+=2) { 426 i1 = idx[0]; /* The instructions are ordered to */ 427 i2 = idx[1]; /* make the compiler's job easy */ 428 idx += 2; 429 tmp0 = x[i1]; 430 tmp1 = x[i2]; 431 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 432 } 433 434 if (n == sz-1) { /* Take care of the last nonzero */ 435 tmp0 = x[*idx++]; 436 sum1 += *v1++ *tmp0; 437 } 438 y[row++]=sum1; 439 break; 440 case 2: 441 sum1 = 0.; 442 sum2 = 0.; 443 v2 = v1 + n; 444 445 for (n = 0; n< sz-1; n+=2) { 446 i1 = idx[0]; 447 i2 = idx[1]; 448 idx += 2; 449 tmp0 = x[i1]; 450 tmp1 = x[i2]; 451 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 452 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 453 } 454 if (n == sz-1) { 455 tmp0 = x[*idx++]; 456 sum1 += *v1++ * tmp0; 457 sum2 += *v2++ * tmp0; 458 } 459 y[row++]=sum1; 460 y[row++]=sum2; 461 v1 =v2; /* Since the next block to be processed starts there*/ 462 idx +=sz; 463 break; 464 case 3: 465 sum1 = 0.; 466 sum2 = 0.; 467 sum3 = 0.; 468 v2 = v1 + n; 469 v3 = v2 + n; 470 471 for (n = 0; n< sz-1; n+=2) { 472 i1 = idx[0]; 473 i2 = idx[1]; 474 idx += 2; 475 tmp0 = x[i1]; 476 tmp1 = x[i2]; 477 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 478 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 479 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 480 } 481 if (n == sz-1) { 482 tmp0 = x[*idx++]; 483 sum1 += *v1++ * tmp0; 484 sum2 += *v2++ * tmp0; 485 sum3 += *v3++ * tmp0; 486 } 487 y[row++]=sum1; 488 y[row++]=sum2; 489 y[row++]=sum3; 490 v1 =v3; /* Since the next block to be processed starts there*/ 491 idx +=2*sz; 492 break; 493 case 4: 494 sum1 = 0.; 495 sum2 = 0.; 496 sum3 = 0.; 497 sum4 = 0.; 498 v2 = v1 + n; 499 v3 = v2 + n; 500 v4 = v3 + n; 501 502 for (n = 0; n< sz-1; n+=2) { 503 i1 = idx[0]; 504 i2 = idx[1]; 505 idx += 2; 506 tmp0 = x[i1]; 507 tmp1 = x[i2]; 508 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 509 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 510 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 511 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 512 } 513 if (n == sz-1) { 514 tmp0 = x[*idx++]; 515 sum1 += *v1++ * tmp0; 516 sum2 += *v2++ * tmp0; 517 sum3 += *v3++ * tmp0; 518 sum4 += *v4++ * tmp0; 519 } 520 y[row++]=sum1; 521 y[row++]=sum2; 522 y[row++]=sum3; 523 y[row++]=sum4; 524 v1 =v4; /* Since the next block to be processed starts there*/ 525 idx +=3*sz; 526 break; 527 case 5: 528 sum1 = 0.; 529 sum2 = 0.; 530 sum3 = 0.; 531 sum4 = 0.; 532 sum5 = 0.; 533 v2 = v1 + n; 534 v3 = v2 + n; 535 v4 = v3 + n; 536 v5 = v4 + n; 537 538 for (n = 0; n<sz-1; n+=2) { 539 i1 = idx[0]; 540 i2 = idx[1]; 541 idx += 2; 542 tmp0 = x[i1]; 543 tmp1 = x[i2]; 544 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 545 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 546 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 547 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 548 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 549 } 550 if (n == sz-1) { 551 tmp0 = x[*idx++]; 552 sum1 += *v1++ * tmp0; 553 sum2 += *v2++ * tmp0; 554 sum3 += *v3++ * tmp0; 555 sum4 += *v4++ * tmp0; 556 sum5 += *v5++ * tmp0; 557 } 558 y[row++]=sum1; 559 y[row++]=sum2; 560 y[row++]=sum3; 561 y[row++]=sum4; 562 y[row++]=sum5; 563 v1 =v5; /* Since the next block to be processed starts there */ 564 idx +=4*sz; 565 break; 566 default: 567 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 568 } 569 } 570 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 571 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 572 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 /* ----------------------------------------------------------- */ 576 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 577 #undef __FUNCT__ 578 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 579 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 580 { 581 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 582 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 583 MatScalar *v1,*v2,*v3,*v4,*v5; 584 PetscScalar *x,*y,*z,*zt; 585 PetscErrorCode ierr; 586 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 587 588 PetscFunctionBegin; 589 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 590 node_max = a->inode.node_count; 591 ns = a->inode.size; /* Node Size array */ 592 593 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 594 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 595 if (zz != yy) { 596 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 597 } else { 598 z = y; 599 } 600 zt = z; 601 602 idx = a->j; 603 v1 = a->a; 604 ii = a->i; 605 606 for (i = 0,row = 0; i< node_max; ++i) { 607 nsz = ns[i]; 608 n = ii[1] - ii[0]; 609 ii += nsz; 610 sz = n; /* No of non zeros in this row */ 611 /* Switch on the size of Node */ 612 switch (nsz) { /* Each loop in 'case' is unrolled */ 613 case 1: 614 sum1 = *zt++; 615 616 for (n = 0; n< sz-1; n+=2) { 617 i1 = idx[0]; /* The instructions are ordered to */ 618 i2 = idx[1]; /* make the compiler's job easy */ 619 idx += 2; 620 tmp0 = x[i1]; 621 tmp1 = x[i2]; 622 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 623 } 624 625 if (n == sz-1) { /* Take care of the last nonzero */ 626 tmp0 = x[*idx++]; 627 sum1 += *v1++ * tmp0; 628 } 629 y[row++]=sum1; 630 break; 631 case 2: 632 sum1 = *zt++; 633 sum2 = *zt++; 634 v2 = v1 + n; 635 636 for (n = 0; n< sz-1; n+=2) { 637 i1 = idx[0]; 638 i2 = idx[1]; 639 idx += 2; 640 tmp0 = x[i1]; 641 tmp1 = x[i2]; 642 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 643 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 644 } 645 if (n == sz-1) { 646 tmp0 = x[*idx++]; 647 sum1 += *v1++ * tmp0; 648 sum2 += *v2++ * tmp0; 649 } 650 y[row++]=sum1; 651 y[row++]=sum2; 652 v1 =v2; /* Since the next block to be processed starts there*/ 653 idx +=sz; 654 break; 655 case 3: 656 sum1 = *zt++; 657 sum2 = *zt++; 658 sum3 = *zt++; 659 v2 = v1 + n; 660 v3 = v2 + n; 661 662 for (n = 0; n< sz-1; n+=2) { 663 i1 = idx[0]; 664 i2 = idx[1]; 665 idx += 2; 666 tmp0 = x[i1]; 667 tmp1 = x[i2]; 668 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 669 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 670 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 671 } 672 if (n == sz-1) { 673 tmp0 = x[*idx++]; 674 sum1 += *v1++ * tmp0; 675 sum2 += *v2++ * tmp0; 676 sum3 += *v3++ * tmp0; 677 } 678 y[row++]=sum1; 679 y[row++]=sum2; 680 y[row++]=sum3; 681 v1 =v3; /* Since the next block to be processed starts there*/ 682 idx +=2*sz; 683 break; 684 case 4: 685 sum1 = *zt++; 686 sum2 = *zt++; 687 sum3 = *zt++; 688 sum4 = *zt++; 689 v2 = v1 + n; 690 v3 = v2 + n; 691 v4 = v3 + n; 692 693 for (n = 0; n< sz-1; n+=2) { 694 i1 = idx[0]; 695 i2 = idx[1]; 696 idx += 2; 697 tmp0 = x[i1]; 698 tmp1 = x[i2]; 699 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 700 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 701 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 702 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 703 } 704 if (n == sz-1) { 705 tmp0 = x[*idx++]; 706 sum1 += *v1++ * tmp0; 707 sum2 += *v2++ * tmp0; 708 sum3 += *v3++ * tmp0; 709 sum4 += *v4++ * tmp0; 710 } 711 y[row++]=sum1; 712 y[row++]=sum2; 713 y[row++]=sum3; 714 y[row++]=sum4; 715 v1 =v4; /* Since the next block to be processed starts there*/ 716 idx +=3*sz; 717 break; 718 case 5: 719 sum1 = *zt++; 720 sum2 = *zt++; 721 sum3 = *zt++; 722 sum4 = *zt++; 723 sum5 = *zt++; 724 v2 = v1 + n; 725 v3 = v2 + n; 726 v4 = v3 + n; 727 v5 = v4 + n; 728 729 for (n = 0; n<sz-1; n+=2) { 730 i1 = idx[0]; 731 i2 = idx[1]; 732 idx += 2; 733 tmp0 = x[i1]; 734 tmp1 = x[i2]; 735 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 736 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 737 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 738 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 739 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 740 } 741 if (n == sz-1) { 742 tmp0 = x[*idx++]; 743 sum1 += *v1++ * tmp0; 744 sum2 += *v2++ * tmp0; 745 sum3 += *v3++ * tmp0; 746 sum4 += *v4++ * tmp0; 747 sum5 += *v5++ * tmp0; 748 } 749 y[row++]=sum1; 750 y[row++]=sum2; 751 y[row++]=sum3; 752 y[row++]=sum4; 753 y[row++]=sum5; 754 v1 =v5; /* Since the next block to be processed starts there */ 755 idx +=4*sz; 756 break; 757 default: 758 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 759 } 760 } 761 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 762 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 763 if (zz != yy) { 764 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 765 } 766 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 767 PetscFunctionReturn(0); 768 } 769 770 /* ----------------------------------------------------------- */ 771 #undef __FUNCT__ 772 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 773 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 774 { 775 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 776 IS iscol = a->col,isrow = a->row; 777 PetscErrorCode ierr; 778 const PetscInt *r,*c,*rout,*cout; 779 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 780 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 781 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 782 PetscScalar sum1,sum2,sum3,sum4,sum5; 783 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 784 const PetscScalar *b; 785 786 PetscFunctionBegin; 787 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 788 node_max = a->inode.node_count; 789 ns = a->inode.size; /* Node Size array */ 790 791 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 792 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 793 tmp = a->solve_work; 794 795 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 796 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 797 798 /* forward solve the lower triangular */ 799 tmps = tmp; 800 aa = a_a; 801 aj = a_j; 802 ad = a->diag; 803 804 for (i = 0,row = 0; i< node_max; ++i) { 805 nsz = ns[i]; 806 aii = ai[row]; 807 v1 = aa + aii; 808 vi = aj + aii; 809 nz = ad[row]- aii; 810 if (i < node_max-1) { 811 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 812 * but our indexing to determine it's size could. */ 813 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 814 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 815 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 816 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 817 } 818 819 switch (nsz) { /* Each loop in 'case' is unrolled */ 820 case 1: 821 sum1 = b[*r++]; 822 for (j=0; j<nz-1; j+=2) { 823 i0 = vi[0]; 824 i1 = vi[1]; 825 vi +=2; 826 tmp0 = tmps[i0]; 827 tmp1 = tmps[i1]; 828 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 829 } 830 if (j == nz-1) { 831 tmp0 = tmps[*vi++]; 832 sum1 -= *v1++ *tmp0; 833 } 834 tmp[row++]=sum1; 835 break; 836 case 2: 837 sum1 = b[*r++]; 838 sum2 = b[*r++]; 839 v2 = aa + ai[row+1]; 840 841 for (j=0; j<nz-1; j+=2) { 842 i0 = vi[0]; 843 i1 = vi[1]; 844 vi +=2; 845 tmp0 = tmps[i0]; 846 tmp1 = tmps[i1]; 847 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 848 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 849 } 850 if (j == nz-1) { 851 tmp0 = tmps[*vi++]; 852 sum1 -= *v1++ *tmp0; 853 sum2 -= *v2++ *tmp0; 854 } 855 sum2 -= *v2++ *sum1; 856 tmp[row++]=sum1; 857 tmp[row++]=sum2; 858 break; 859 case 3: 860 sum1 = b[*r++]; 861 sum2 = b[*r++]; 862 sum3 = b[*r++]; 863 v2 = aa + ai[row+1]; 864 v3 = aa + ai[row+2]; 865 866 for (j=0; j<nz-1; j+=2) { 867 i0 = vi[0]; 868 i1 = vi[1]; 869 vi +=2; 870 tmp0 = tmps[i0]; 871 tmp1 = tmps[i1]; 872 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 873 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 874 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 875 } 876 if (j == nz-1) { 877 tmp0 = tmps[*vi++]; 878 sum1 -= *v1++ *tmp0; 879 sum2 -= *v2++ *tmp0; 880 sum3 -= *v3++ *tmp0; 881 } 882 sum2 -= *v2++ * sum1; 883 sum3 -= *v3++ * sum1; 884 sum3 -= *v3++ * sum2; 885 886 tmp[row++]=sum1; 887 tmp[row++]=sum2; 888 tmp[row++]=sum3; 889 break; 890 891 case 4: 892 sum1 = b[*r++]; 893 sum2 = b[*r++]; 894 sum3 = b[*r++]; 895 sum4 = b[*r++]; 896 v2 = aa + ai[row+1]; 897 v3 = aa + ai[row+2]; 898 v4 = aa + ai[row+3]; 899 900 for (j=0; j<nz-1; j+=2) { 901 i0 = vi[0]; 902 i1 = vi[1]; 903 vi +=2; 904 tmp0 = tmps[i0]; 905 tmp1 = tmps[i1]; 906 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 908 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 909 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 910 } 911 if (j == nz-1) { 912 tmp0 = tmps[*vi++]; 913 sum1 -= *v1++ *tmp0; 914 sum2 -= *v2++ *tmp0; 915 sum3 -= *v3++ *tmp0; 916 sum4 -= *v4++ *tmp0; 917 } 918 sum2 -= *v2++ * sum1; 919 sum3 -= *v3++ * sum1; 920 sum4 -= *v4++ * sum1; 921 sum3 -= *v3++ * sum2; 922 sum4 -= *v4++ * sum2; 923 sum4 -= *v4++ * sum3; 924 925 tmp[row++]=sum1; 926 tmp[row++]=sum2; 927 tmp[row++]=sum3; 928 tmp[row++]=sum4; 929 break; 930 case 5: 931 sum1 = b[*r++]; 932 sum2 = b[*r++]; 933 sum3 = b[*r++]; 934 sum4 = b[*r++]; 935 sum5 = b[*r++]; 936 v2 = aa + ai[row+1]; 937 v3 = aa + ai[row+2]; 938 v4 = aa + ai[row+3]; 939 v5 = aa + ai[row+4]; 940 941 for (j=0; j<nz-1; j+=2) { 942 i0 = vi[0]; 943 i1 = vi[1]; 944 vi +=2; 945 tmp0 = tmps[i0]; 946 tmp1 = tmps[i1]; 947 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 948 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 949 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 950 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 951 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 952 } 953 if (j == nz-1) { 954 tmp0 = tmps[*vi++]; 955 sum1 -= *v1++ *tmp0; 956 sum2 -= *v2++ *tmp0; 957 sum3 -= *v3++ *tmp0; 958 sum4 -= *v4++ *tmp0; 959 sum5 -= *v5++ *tmp0; 960 } 961 962 sum2 -= *v2++ * sum1; 963 sum3 -= *v3++ * sum1; 964 sum4 -= *v4++ * sum1; 965 sum5 -= *v5++ * sum1; 966 sum3 -= *v3++ * sum2; 967 sum4 -= *v4++ * sum2; 968 sum5 -= *v5++ * sum2; 969 sum4 -= *v4++ * sum3; 970 sum5 -= *v5++ * sum3; 971 sum5 -= *v5++ * sum4; 972 973 tmp[row++]=sum1; 974 tmp[row++]=sum2; 975 tmp[row++]=sum3; 976 tmp[row++]=sum4; 977 tmp[row++]=sum5; 978 break; 979 default: 980 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 981 } 982 } 983 /* backward solve the upper triangular */ 984 for (i=node_max -1,row = n-1; i>=0; i--) { 985 nsz = ns[i]; 986 aii = ai[row+1] -1; 987 v1 = aa + aii; 988 vi = aj + aii; 989 nz = aii- ad[row]; 990 switch (nsz) { /* Each loop in 'case' is unrolled */ 991 case 1: 992 sum1 = tmp[row]; 993 994 for (j=nz; j>1; j-=2) { 995 vi -=2; 996 i0 = vi[2]; 997 i1 = vi[1]; 998 tmp0 = tmps[i0]; 999 tmp1 = tmps[i1]; 1000 v1 -= 2; 1001 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1002 } 1003 if (j==1) { 1004 tmp0 = tmps[*vi--]; 1005 sum1 -= *v1-- * tmp0; 1006 } 1007 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1008 break; 1009 case 2: 1010 sum1 = tmp[row]; 1011 sum2 = tmp[row -1]; 1012 v2 = aa + ai[row]-1; 1013 for (j=nz; j>1; j-=2) { 1014 vi -=2; 1015 i0 = vi[2]; 1016 i1 = vi[1]; 1017 tmp0 = tmps[i0]; 1018 tmp1 = tmps[i1]; 1019 v1 -= 2; 1020 v2 -= 2; 1021 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1022 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1023 } 1024 if (j==1) { 1025 tmp0 = tmps[*vi--]; 1026 sum1 -= *v1-- * tmp0; 1027 sum2 -= *v2-- * tmp0; 1028 } 1029 1030 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1031 sum2 -= *v2-- * tmp0; 1032 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1033 break; 1034 case 3: 1035 sum1 = tmp[row]; 1036 sum2 = tmp[row -1]; 1037 sum3 = tmp[row -2]; 1038 v2 = aa + ai[row]-1; 1039 v3 = aa + ai[row -1]-1; 1040 for (j=nz; j>1; j-=2) { 1041 vi -=2; 1042 i0 = vi[2]; 1043 i1 = vi[1]; 1044 tmp0 = tmps[i0]; 1045 tmp1 = tmps[i1]; 1046 v1 -= 2; 1047 v2 -= 2; 1048 v3 -= 2; 1049 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1050 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1051 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1052 } 1053 if (j==1) { 1054 tmp0 = tmps[*vi--]; 1055 sum1 -= *v1-- * tmp0; 1056 sum2 -= *v2-- * tmp0; 1057 sum3 -= *v3-- * tmp0; 1058 } 1059 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1060 sum2 -= *v2-- * tmp0; 1061 sum3 -= *v3-- * tmp0; 1062 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1063 sum3 -= *v3-- * tmp0; 1064 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1065 1066 break; 1067 case 4: 1068 sum1 = tmp[row]; 1069 sum2 = tmp[row -1]; 1070 sum3 = tmp[row -2]; 1071 sum4 = tmp[row -3]; 1072 v2 = aa + ai[row]-1; 1073 v3 = aa + ai[row -1]-1; 1074 v4 = aa + ai[row -2]-1; 1075 1076 for (j=nz; j>1; j-=2) { 1077 vi -=2; 1078 i0 = vi[2]; 1079 i1 = vi[1]; 1080 tmp0 = tmps[i0]; 1081 tmp1 = tmps[i1]; 1082 v1 -= 2; 1083 v2 -= 2; 1084 v3 -= 2; 1085 v4 -= 2; 1086 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1087 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1088 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1089 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1090 } 1091 if (j==1) { 1092 tmp0 = tmps[*vi--]; 1093 sum1 -= *v1-- * tmp0; 1094 sum2 -= *v2-- * tmp0; 1095 sum3 -= *v3-- * tmp0; 1096 sum4 -= *v4-- * tmp0; 1097 } 1098 1099 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1104 sum3 -= *v3-- * tmp0; 1105 sum4 -= *v4-- * tmp0; 1106 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1107 sum4 -= *v4-- * tmp0; 1108 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1109 break; 1110 case 5: 1111 sum1 = tmp[row]; 1112 sum2 = tmp[row -1]; 1113 sum3 = tmp[row -2]; 1114 sum4 = tmp[row -3]; 1115 sum5 = tmp[row -4]; 1116 v2 = aa + ai[row]-1; 1117 v3 = aa + ai[row -1]-1; 1118 v4 = aa + ai[row -2]-1; 1119 v5 = aa + ai[row -3]-1; 1120 for (j=nz; j>1; j-=2) { 1121 vi -= 2; 1122 i0 = vi[2]; 1123 i1 = vi[1]; 1124 tmp0 = tmps[i0]; 1125 tmp1 = tmps[i1]; 1126 v1 -= 2; 1127 v2 -= 2; 1128 v3 -= 2; 1129 v4 -= 2; 1130 v5 -= 2; 1131 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1132 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1133 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1134 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1135 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1136 } 1137 if (j==1) { 1138 tmp0 = tmps[*vi--]; 1139 sum1 -= *v1-- * tmp0; 1140 sum2 -= *v2-- * tmp0; 1141 sum3 -= *v3-- * tmp0; 1142 sum4 -= *v4-- * tmp0; 1143 sum5 -= *v5-- * tmp0; 1144 } 1145 1146 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1152 sum3 -= *v3-- * tmp0; 1153 sum4 -= *v4-- * tmp0; 1154 sum5 -= *v5-- * tmp0; 1155 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1159 sum5 -= *v5-- * tmp0; 1160 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1161 break; 1162 default: 1163 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1164 } 1165 } 1166 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1167 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1168 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1171 PetscFunctionReturn(0); 1172 } 1173 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1176 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1177 { 1178 Mat C =B; 1179 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1180 IS isrow = b->row,isicol = b->icol; 1181 PetscErrorCode ierr; 1182 const PetscInt *r,*ic,*ics; 1183 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1184 PetscInt i,j,k,nz,nzL,row,*pj; 1185 const PetscInt *ajtmp,*bjtmp; 1186 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1187 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1188 FactorShiftCtx sctx; 1189 const PetscInt *ddiag; 1190 PetscReal rs; 1191 MatScalar d; 1192 PetscInt inod,nodesz,node_max,col; 1193 const PetscInt *ns; 1194 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1195 1196 PetscFunctionBegin; 1197 /* MatPivotSetUp(): initialize shift context sctx */ 1198 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1199 1200 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1201 ddiag = a->diag; 1202 sctx.shift_top = info->zeropivot; 1203 for (i=0; i<n; i++) { 1204 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1205 d = (aa)[ddiag[i]]; 1206 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1207 v = aa+ai[i]; 1208 nz = ai[i+1] - ai[i]; 1209 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1210 if (rs>sctx.shift_top) sctx.shift_top = rs; 1211 } 1212 sctx.shift_top *= 1.1; 1213 sctx.nshift_max = 5; 1214 sctx.shift_lo = 0.; 1215 sctx.shift_hi = 1.; 1216 } 1217 1218 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1219 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1220 1221 ierr = PetscMalloc4(n,PetscScalar,&rtmp1,n,PetscScalar,&rtmp2,n,PetscScalar,&rtmp3,n,PetscScalar,&rtmp4);CHKERRQ(ierr); 1222 ierr = PetscMemzero(rtmp1,n*sizeof(PetscScalar));CHKERRQ(ierr); 1223 ierr = PetscMemzero(rtmp2,n*sizeof(PetscScalar));CHKERRQ(ierr); 1224 ierr = PetscMemzero(rtmp3,n*sizeof(PetscScalar));CHKERRQ(ierr); 1225 ierr = PetscMemzero(rtmp4,n*sizeof(PetscScalar));CHKERRQ(ierr); 1226 ics = ic; 1227 1228 node_max = a->inode.node_count; 1229 ns = a->inode.size; 1230 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1231 1232 /* If max inode size > 4, split it into two inodes.*/ 1233 /* also map the inode sizes according to the ordering */ 1234 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1235 for (i=0,j=0; i<node_max; ++i,++j) { 1236 if (ns[i] > 4) { 1237 tmp_vec1[j] = 4; 1238 ++j; 1239 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1240 } else { 1241 tmp_vec1[j] = ns[i]; 1242 } 1243 } 1244 /* Use the correct node_max */ 1245 node_max = j; 1246 1247 /* Now reorder the inode info based on mat re-ordering info */ 1248 /* First create a row -> inode_size_array_index map */ 1249 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1250 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1251 for (i=0,row=0; i<node_max; i++) { 1252 nodesz = tmp_vec1[i]; 1253 for (j=0; j<nodesz; j++,row++) { 1254 nsmap[row] = i; 1255 } 1256 } 1257 /* Using nsmap, create a reordered ns structure */ 1258 for (i=0,j=0; i< node_max; i++) { 1259 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1260 tmp_vec2[i] = nodesz; 1261 j += nodesz; 1262 } 1263 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1264 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1265 1266 /* Now use the correct ns */ 1267 ns = tmp_vec2; 1268 1269 do { 1270 sctx.newshift = PETSC_FALSE; 1271 /* Now loop over each block-row, and do the factorization */ 1272 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1273 nodesz = ns[inod]; 1274 1275 switch (nodesz) { 1276 case 1: 1277 /*----------*/ 1278 /* zero rtmp1 */ 1279 /* L part */ 1280 nz = bi[i+1] - bi[i]; 1281 bjtmp = bj + bi[i]; 1282 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1283 1284 /* U part */ 1285 nz = bdiag[i]-bdiag[i+1]; 1286 bjtmp = bj + bdiag[i+1]+1; 1287 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1288 1289 /* load in initial (unfactored row) */ 1290 nz = ai[r[i]+1] - ai[r[i]]; 1291 ajtmp = aj + ai[r[i]]; 1292 v = aa + ai[r[i]]; 1293 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1294 1295 /* ZeropivotApply() */ 1296 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1297 1298 /* elimination */ 1299 bjtmp = bj + bi[i]; 1300 row = *bjtmp++; 1301 nzL = bi[i+1] - bi[i]; 1302 for (k=0; k < nzL; k++) { 1303 pc = rtmp1 + row; 1304 if (*pc != 0.0) { 1305 pv = b->a + bdiag[row]; 1306 mul1 = *pc * (*pv); 1307 *pc = mul1; 1308 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1309 pv = b->a + bdiag[row+1]+1; 1310 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1311 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1312 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1313 } 1314 row = *bjtmp++; 1315 } 1316 1317 /* finished row so stick it into b->a */ 1318 rs = 0.0; 1319 /* L part */ 1320 pv = b->a + bi[i]; 1321 pj = b->j + bi[i]; 1322 nz = bi[i+1] - bi[i]; 1323 for (j=0; j<nz; j++) { 1324 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1325 } 1326 1327 /* U part */ 1328 pv = b->a + bdiag[i+1]+1; 1329 pj = b->j + bdiag[i+1]+1; 1330 nz = bdiag[i] - bdiag[i+1]-1; 1331 for (j=0; j<nz; j++) { 1332 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1333 } 1334 1335 /* Check zero pivot */ 1336 sctx.rs = rs; 1337 sctx.pv = rtmp1[i]; 1338 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1339 if (sctx.newshift) break; 1340 1341 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1342 pv = b->a + bdiag[i]; 1343 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1344 break; 1345 1346 case 2: 1347 /*----------*/ 1348 /* zero rtmp1 and rtmp2 */ 1349 /* L part */ 1350 nz = bi[i+1] - bi[i]; 1351 bjtmp = bj + bi[i]; 1352 for (j=0; j<nz; j++) { 1353 col = bjtmp[j]; 1354 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1355 } 1356 1357 /* U part */ 1358 nz = bdiag[i]-bdiag[i+1]; 1359 bjtmp = bj + bdiag[i+1]+1; 1360 for (j=0; j<nz; j++) { 1361 col = bjtmp[j]; 1362 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1363 } 1364 1365 /* load in initial (unfactored row) */ 1366 nz = ai[r[i]+1] - ai[r[i]]; 1367 ajtmp = aj + ai[r[i]]; 1368 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1369 for (j=0; j<nz; j++) { 1370 col = ics[ajtmp[j]]; 1371 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1372 } 1373 /* ZeropivotApply(): shift the diagonal of the matrix */ 1374 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1375 1376 /* elimination */ 1377 bjtmp = bj + bi[i]; 1378 row = *bjtmp++; /* pivot row */ 1379 nzL = bi[i+1] - bi[i]; 1380 for (k=0; k < nzL; k++) { 1381 pc1 = rtmp1 + row; 1382 pc2 = rtmp2 + row; 1383 if (*pc1 != 0.0 || *pc2 != 0.0) { 1384 pv = b->a + bdiag[row]; 1385 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1386 *pc1 = mul1; *pc2 = mul2; 1387 1388 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1389 pv = b->a + bdiag[row+1]+1; 1390 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1391 for (j=0; j<nz; j++) { 1392 col = pj[j]; 1393 rtmp1[col] -= mul1 * pv[j]; 1394 rtmp2[col] -= mul2 * pv[j]; 1395 } 1396 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1397 } 1398 row = *bjtmp++; 1399 } 1400 1401 /* finished row i; check zero pivot, then stick row i into b->a */ 1402 rs = 0.0; 1403 /* L part */ 1404 pc1 = b->a + bi[i]; 1405 pj = b->j + bi[i]; 1406 nz = bi[i+1] - bi[i]; 1407 for (j=0; j<nz; j++) { 1408 col = pj[j]; 1409 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1410 } 1411 /* U part */ 1412 pc1 = b->a + bdiag[i+1]+1; 1413 pj = b->j + bdiag[i+1]+1; 1414 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1415 for (j=0; j<nz; j++) { 1416 col = pj[j]; 1417 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1418 } 1419 1420 sctx.rs = rs; 1421 sctx.pv = rtmp1[i]; 1422 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1423 if (sctx.newshift) break; 1424 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1425 *pc1 = 1.0/sctx.pv; 1426 1427 /* Now take care of diagonal 2x2 block. */ 1428 pc2 = rtmp2 + i; 1429 if (*pc2 != 0.0) { 1430 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1431 *pc2 = mul1; /* insert L entry */ 1432 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1433 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1434 for (j=0; j<nz; j++) { 1435 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1436 } 1437 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1438 } 1439 1440 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1441 rs = 0.0; 1442 /* L part */ 1443 pc2 = b->a + bi[i+1]; 1444 pj = b->j + bi[i+1]; 1445 nz = bi[i+2] - bi[i+1]; 1446 for (j=0; j<nz; j++) { 1447 col = pj[j]; 1448 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1449 } 1450 /* U part */ 1451 pc2 = b->a + bdiag[i+2]+1; 1452 pj = b->j + bdiag[i+2]+1; 1453 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1454 for (j=0; j<nz; j++) { 1455 col = pj[j]; 1456 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1457 } 1458 1459 sctx.rs = rs; 1460 sctx.pv = rtmp2[i+1]; 1461 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1462 if (sctx.newshift) break; 1463 pc2 = b->a + bdiag[i+1]; 1464 *pc2 = 1.0/sctx.pv; 1465 break; 1466 1467 case 3: 1468 /*----------*/ 1469 /* zero rtmp */ 1470 /* L part */ 1471 nz = bi[i+1] - bi[i]; 1472 bjtmp = bj + bi[i]; 1473 for (j=0; j<nz; j++) { 1474 col = bjtmp[j]; 1475 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1476 } 1477 1478 /* U part */ 1479 nz = bdiag[i]-bdiag[i+1]; 1480 bjtmp = bj + bdiag[i+1]+1; 1481 for (j=0; j<nz; j++) { 1482 col = bjtmp[j]; 1483 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1484 } 1485 1486 /* load in initial (unfactored row) */ 1487 nz = ai[r[i]+1] - ai[r[i]]; 1488 ajtmp = aj + ai[r[i]]; 1489 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1490 for (j=0; j<nz; j++) { 1491 col = ics[ajtmp[j]]; 1492 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1493 } 1494 /* ZeropivotApply(): shift the diagonal of the matrix */ 1495 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1496 1497 /* elimination */ 1498 bjtmp = bj + bi[i]; 1499 row = *bjtmp++; /* pivot row */ 1500 nzL = bi[i+1] - bi[i]; 1501 for (k=0; k < nzL; k++) { 1502 pc1 = rtmp1 + row; 1503 pc2 = rtmp2 + row; 1504 pc3 = rtmp3 + row; 1505 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1506 pv = b->a + bdiag[row]; 1507 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1508 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1509 1510 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1511 pv = b->a + bdiag[row+1]+1; 1512 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1513 for (j=0; j<nz; j++) { 1514 col = pj[j]; 1515 rtmp1[col] -= mul1 * pv[j]; 1516 rtmp2[col] -= mul2 * pv[j]; 1517 rtmp3[col] -= mul3 * pv[j]; 1518 } 1519 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1520 } 1521 row = *bjtmp++; 1522 } 1523 1524 /* finished row i; check zero pivot, then stick row i into b->a */ 1525 rs = 0.0; 1526 /* L part */ 1527 pc1 = b->a + bi[i]; 1528 pj = b->j + bi[i]; 1529 nz = bi[i+1] - bi[i]; 1530 for (j=0; j<nz; j++) { 1531 col = pj[j]; 1532 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1533 } 1534 /* U part */ 1535 pc1 = b->a + bdiag[i+1]+1; 1536 pj = b->j + bdiag[i+1]+1; 1537 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1538 for (j=0; j<nz; j++) { 1539 col = pj[j]; 1540 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1541 } 1542 1543 sctx.rs = rs; 1544 sctx.pv = rtmp1[i]; 1545 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1546 if (sctx.newshift) break; 1547 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1548 *pc1 = 1.0/sctx.pv; 1549 1550 /* Now take care of 1st column of diagonal 3x3 block. */ 1551 pc2 = rtmp2 + i; 1552 pc3 = rtmp3 + i; 1553 if (*pc2 != 0.0 || *pc3 != 0.0) { 1554 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1555 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1556 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1557 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1558 for (j=0; j<nz; j++) { 1559 col = pj[j]; 1560 rtmp2[col] -= mul2 * rtmp1[col]; 1561 rtmp3[col] -= mul3 * rtmp1[col]; 1562 } 1563 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1564 } 1565 1566 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1567 rs = 0.0; 1568 /* L part */ 1569 pc2 = b->a + bi[i+1]; 1570 pj = b->j + bi[i+1]; 1571 nz = bi[i+2] - bi[i+1]; 1572 for (j=0; j<nz; j++) { 1573 col = pj[j]; 1574 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1575 } 1576 /* U part */ 1577 pc2 = b->a + bdiag[i+2]+1; 1578 pj = b->j + bdiag[i+2]+1; 1579 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1580 for (j=0; j<nz; j++) { 1581 col = pj[j]; 1582 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1583 } 1584 1585 sctx.rs = rs; 1586 sctx.pv = rtmp2[i+1]; 1587 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1588 if (sctx.newshift) break; 1589 pc2 = b->a + bdiag[i+1]; 1590 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1591 1592 /* Now take care of 2nd column of diagonal 3x3 block. */ 1593 pc3 = rtmp3 + i+1; 1594 if (*pc3 != 0.0) { 1595 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1596 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1597 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1598 for (j=0; j<nz; j++) { 1599 col = pj[j]; 1600 rtmp3[col] -= mul3 * rtmp2[col]; 1601 } 1602 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1603 } 1604 1605 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1606 rs = 0.0; 1607 /* L part */ 1608 pc3 = b->a + bi[i+2]; 1609 pj = b->j + bi[i+2]; 1610 nz = bi[i+3] - bi[i+2]; 1611 for (j=0; j<nz; j++) { 1612 col = pj[j]; 1613 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1614 } 1615 /* U part */ 1616 pc3 = b->a + bdiag[i+3]+1; 1617 pj = b->j + bdiag[i+3]+1; 1618 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1619 for (j=0; j<nz; j++) { 1620 col = pj[j]; 1621 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1622 } 1623 1624 sctx.rs = rs; 1625 sctx.pv = rtmp3[i+2]; 1626 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1627 if (sctx.newshift) break; 1628 pc3 = b->a + bdiag[i+2]; 1629 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1630 break; 1631 case 4: 1632 /*----------*/ 1633 /* zero rtmp */ 1634 /* L part */ 1635 nz = bi[i+1] - bi[i]; 1636 bjtmp = bj + bi[i]; 1637 for (j=0; j<nz; j++) { 1638 col = bjtmp[j]; 1639 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1640 } 1641 1642 /* U part */ 1643 nz = bdiag[i]-bdiag[i+1]; 1644 bjtmp = bj + bdiag[i+1]+1; 1645 for (j=0; j<nz; j++) { 1646 col = bjtmp[j]; 1647 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1648 } 1649 1650 /* load in initial (unfactored row) */ 1651 nz = ai[r[i]+1] - ai[r[i]]; 1652 ajtmp = aj + ai[r[i]]; 1653 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1654 for (j=0; j<nz; j++) { 1655 col = ics[ajtmp[j]]; 1656 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1657 } 1658 /* ZeropivotApply(): shift the diagonal of the matrix */ 1659 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1660 1661 /* elimination */ 1662 bjtmp = bj + bi[i]; 1663 row = *bjtmp++; /* pivot row */ 1664 nzL = bi[i+1] - bi[i]; 1665 for (k=0; k < nzL; k++) { 1666 pc1 = rtmp1 + row; 1667 pc2 = rtmp2 + row; 1668 pc3 = rtmp3 + row; 1669 pc4 = rtmp4 + row; 1670 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1671 pv = b->a + bdiag[row]; 1672 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1673 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1674 1675 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1676 pv = b->a + bdiag[row+1]+1; 1677 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1678 for (j=0; j<nz; j++) { 1679 col = pj[j]; 1680 rtmp1[col] -= mul1 * pv[j]; 1681 rtmp2[col] -= mul2 * pv[j]; 1682 rtmp3[col] -= mul3 * pv[j]; 1683 rtmp4[col] -= mul4 * pv[j]; 1684 } 1685 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1686 } 1687 row = *bjtmp++; 1688 } 1689 1690 /* finished row i; check zero pivot, then stick row i into b->a */ 1691 rs = 0.0; 1692 /* L part */ 1693 pc1 = b->a + bi[i]; 1694 pj = b->j + bi[i]; 1695 nz = bi[i+1] - bi[i]; 1696 for (j=0; j<nz; j++) { 1697 col = pj[j]; 1698 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1699 } 1700 /* U part */ 1701 pc1 = b->a + bdiag[i+1]+1; 1702 pj = b->j + bdiag[i+1]+1; 1703 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1704 for (j=0; j<nz; j++) { 1705 col = pj[j]; 1706 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1707 } 1708 1709 sctx.rs = rs; 1710 sctx.pv = rtmp1[i]; 1711 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1712 if (sctx.newshift) break; 1713 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1714 *pc1 = 1.0/sctx.pv; 1715 1716 /* Now take care of 1st column of diagonal 4x4 block. */ 1717 pc2 = rtmp2 + i; 1718 pc3 = rtmp3 + i; 1719 pc4 = rtmp4 + i; 1720 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1721 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1722 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1723 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1724 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1725 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1726 for (j=0; j<nz; j++) { 1727 col = pj[j]; 1728 rtmp2[col] -= mul2 * rtmp1[col]; 1729 rtmp3[col] -= mul3 * rtmp1[col]; 1730 rtmp4[col] -= mul4 * rtmp1[col]; 1731 } 1732 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1733 } 1734 1735 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1736 rs = 0.0; 1737 /* L part */ 1738 pc2 = b->a + bi[i+1]; 1739 pj = b->j + bi[i+1]; 1740 nz = bi[i+2] - bi[i+1]; 1741 for (j=0; j<nz; j++) { 1742 col = pj[j]; 1743 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1744 } 1745 /* U part */ 1746 pc2 = b->a + bdiag[i+2]+1; 1747 pj = b->j + bdiag[i+2]+1; 1748 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1749 for (j=0; j<nz; j++) { 1750 col = pj[j]; 1751 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1752 } 1753 1754 sctx.rs = rs; 1755 sctx.pv = rtmp2[i+1]; 1756 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1757 if (sctx.newshift) break; 1758 pc2 = b->a + bdiag[i+1]; 1759 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1760 1761 /* Now take care of 2nd column of diagonal 4x4 block. */ 1762 pc3 = rtmp3 + i+1; 1763 pc4 = rtmp4 + i+1; 1764 if (*pc3 != 0.0 || *pc4 != 0.0) { 1765 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1766 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1767 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1768 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1769 for (j=0; j<nz; j++) { 1770 col = pj[j]; 1771 rtmp3[col] -= mul3 * rtmp2[col]; 1772 rtmp4[col] -= mul4 * rtmp2[col]; 1773 } 1774 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1775 } 1776 1777 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1778 rs = 0.0; 1779 /* L part */ 1780 pc3 = b->a + bi[i+2]; 1781 pj = b->j + bi[i+2]; 1782 nz = bi[i+3] - bi[i+2]; 1783 for (j=0; j<nz; j++) { 1784 col = pj[j]; 1785 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1786 } 1787 /* U part */ 1788 pc3 = b->a + bdiag[i+3]+1; 1789 pj = b->j + bdiag[i+3]+1; 1790 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1791 for (j=0; j<nz; j++) { 1792 col = pj[j]; 1793 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1794 } 1795 1796 sctx.rs = rs; 1797 sctx.pv = rtmp3[i+2]; 1798 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1799 if (sctx.newshift) break; 1800 pc3 = b->a + bdiag[i+2]; 1801 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1802 1803 /* Now take care of 3rd column of diagonal 4x4 block. */ 1804 pc4 = rtmp4 + i+2; 1805 if (*pc4 != 0.0) { 1806 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1807 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1808 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1809 for (j=0; j<nz; j++) { 1810 col = pj[j]; 1811 rtmp4[col] -= mul4 * rtmp3[col]; 1812 } 1813 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1814 } 1815 1816 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1817 rs = 0.0; 1818 /* L part */ 1819 pc4 = b->a + bi[i+3]; 1820 pj = b->j + bi[i+3]; 1821 nz = bi[i+4] - bi[i+3]; 1822 for (j=0; j<nz; j++) { 1823 col = pj[j]; 1824 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1825 } 1826 /* U part */ 1827 pc4 = b->a + bdiag[i+4]+1; 1828 pj = b->j + bdiag[i+4]+1; 1829 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1830 for (j=0; j<nz; j++) { 1831 col = pj[j]; 1832 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1833 } 1834 1835 sctx.rs = rs; 1836 sctx.pv = rtmp4[i+3]; 1837 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1838 if (sctx.newshift) break; 1839 pc4 = b->a + bdiag[i+3]; 1840 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1841 break; 1842 1843 default: 1844 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1845 } 1846 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1847 i += nodesz; /* Update the row */ 1848 } 1849 1850 /* MatPivotRefine() */ 1851 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1852 /* 1853 * if no shift in this attempt & shifting & started shifting & can refine, 1854 * then try lower shift 1855 */ 1856 sctx.shift_hi = sctx.shift_fraction; 1857 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1858 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1859 sctx.newshift = PETSC_TRUE; 1860 sctx.nshift++; 1861 } 1862 } while (sctx.newshift); 1863 1864 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1865 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1866 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1867 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1868 1869 C->ops->solve = MatSolve_SeqAIJ; 1870 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1871 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1872 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1873 C->ops->matsolve = MatMatSolve_SeqAIJ; 1874 C->assembled = PETSC_TRUE; 1875 C->preallocated = PETSC_TRUE; 1876 1877 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1878 1879 /* MatShiftView(A,info,&sctx) */ 1880 if (sctx.nshift) { 1881 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1882 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1883 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1884 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1885 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1886 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1887 } 1888 } 1889 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1890 PetscFunctionReturn(0); 1891 } 1892 1893 #undef __FUNCT__ 1894 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1895 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1896 { 1897 Mat C = B; 1898 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1899 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1900 PetscErrorCode ierr; 1901 const PetscInt *r,*ic,*c,*ics; 1902 PetscInt n = A->rmap->n,*bi = b->i; 1903 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1904 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1905 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1906 PetscScalar mul1,mul2,mul3,tmp; 1907 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1908 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1909 PetscReal rs=0.0; 1910 FactorShiftCtx sctx; 1911 1912 PetscFunctionBegin; 1913 sctx.shift_top = 0; 1914 sctx.nshift_max = 0; 1915 sctx.shift_lo = 0; 1916 sctx.shift_hi = 0; 1917 sctx.shift_fraction = 0; 1918 1919 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1920 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1921 sctx.shift_top = 0; 1922 for (i=0; i<n; i++) { 1923 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1924 rs = 0.0; 1925 ajtmp = aj + ai[i]; 1926 rtmp1 = aa + ai[i]; 1927 nz = ai[i+1] - ai[i]; 1928 for (j=0; j<nz; j++) { 1929 if (*ajtmp != i) { 1930 rs += PetscAbsScalar(*rtmp1++); 1931 } else { 1932 rs -= PetscRealPart(*rtmp1++); 1933 } 1934 ajtmp++; 1935 } 1936 if (rs>sctx.shift_top) sctx.shift_top = rs; 1937 } 1938 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1939 sctx.shift_top *= 1.1; 1940 sctx.nshift_max = 5; 1941 sctx.shift_lo = 0.; 1942 sctx.shift_hi = 1.; 1943 } 1944 sctx.shift_amount = 0; 1945 sctx.nshift = 0; 1946 1947 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1948 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1949 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1950 ierr = PetscMalloc3(n,PetscScalar,&rtmp11,n,PetscScalar,&rtmp22,n,PetscScalar,&rtmp33);CHKERRQ(ierr); 1951 ierr = PetscMemzero(rtmp11,n*sizeof(PetscScalar));CHKERRQ(ierr); 1952 ierr = PetscMemzero(rtmp22,n*sizeof(PetscScalar));CHKERRQ(ierr); 1953 ierr = PetscMemzero(rtmp33,n*sizeof(PetscScalar));CHKERRQ(ierr); 1954 ics = ic; 1955 1956 node_max = a->inode.node_count; 1957 ns = a->inode.size; 1958 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1959 1960 /* If max inode size > 3, split it into two inodes.*/ 1961 /* also map the inode sizes according to the ordering */ 1962 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1963 for (i=0,j=0; i<node_max; ++i,++j) { 1964 if (ns[i]>3) { 1965 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1966 ++j; 1967 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1968 } else { 1969 tmp_vec1[j] = ns[i]; 1970 } 1971 } 1972 /* Use the correct node_max */ 1973 node_max = j; 1974 1975 /* Now reorder the inode info based on mat re-ordering info */ 1976 /* First create a row -> inode_size_array_index map */ 1977 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1978 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1979 for (i=0,row=0; i<node_max; i++) { 1980 nodesz = tmp_vec1[i]; 1981 for (j=0; j<nodesz; j++,row++) { 1982 nsmap[row] = i; 1983 } 1984 } 1985 /* Using nsmap, create a reordered ns structure */ 1986 for (i=0,j=0; i< node_max; i++) { 1987 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1988 tmp_vec2[i] = nodesz; 1989 j += nodesz; 1990 } 1991 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1992 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1993 /* Now use the correct ns */ 1994 ns = tmp_vec2; 1995 1996 do { 1997 sctx.newshift = PETSC_FALSE; 1998 /* Now loop over each block-row, and do the factorization */ 1999 for (i=0,row=0; i<node_max; i++) { 2000 nodesz = ns[i]; 2001 nz = bi[row+1] - bi[row]; 2002 bjtmp = bj + bi[row]; 2003 2004 switch (nodesz) { 2005 case 1: 2006 for (j=0; j<nz; j++) { 2007 idx = bjtmp[j]; 2008 rtmp11[idx] = 0.0; 2009 } 2010 2011 /* load in initial (unfactored row) */ 2012 idx = r[row]; 2013 nz_tmp = ai[idx+1] - ai[idx]; 2014 ajtmp = aj + ai[idx]; 2015 v1 = aa + ai[idx]; 2016 2017 for (j=0; j<nz_tmp; j++) { 2018 idx = ics[ajtmp[j]]; 2019 rtmp11[idx] = v1[j]; 2020 } 2021 rtmp11[ics[r[row]]] += sctx.shift_amount; 2022 2023 prow = *bjtmp++; 2024 while (prow < row) { 2025 pc1 = rtmp11 + prow; 2026 if (*pc1 != 0.0) { 2027 pv = ba + bd[prow]; 2028 pj = nbj + bd[prow]; 2029 mul1 = *pc1 * *pv++; 2030 *pc1 = mul1; 2031 nz_tmp = bi[prow+1] - bd[prow] - 1; 2032 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2033 for (j=0; j<nz_tmp; j++) { 2034 tmp = pv[j]; 2035 idx = pj[j]; 2036 rtmp11[idx] -= mul1 * tmp; 2037 } 2038 } 2039 prow = *bjtmp++; 2040 } 2041 pj = bj + bi[row]; 2042 pc1 = ba + bi[row]; 2043 2044 sctx.pv = rtmp11[row]; 2045 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2046 rs = 0.0; 2047 for (j=0; j<nz; j++) { 2048 idx = pj[j]; 2049 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2050 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2051 } 2052 sctx.rs = rs; 2053 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2054 if (sctx.newshift) goto endofwhile; 2055 break; 2056 2057 case 2: 2058 for (j=0; j<nz; j++) { 2059 idx = bjtmp[j]; 2060 rtmp11[idx] = 0.0; 2061 rtmp22[idx] = 0.0; 2062 } 2063 2064 /* load in initial (unfactored row) */ 2065 idx = r[row]; 2066 nz_tmp = ai[idx+1] - ai[idx]; 2067 ajtmp = aj + ai[idx]; 2068 v1 = aa + ai[idx]; 2069 v2 = aa + ai[idx+1]; 2070 for (j=0; j<nz_tmp; j++) { 2071 idx = ics[ajtmp[j]]; 2072 rtmp11[idx] = v1[j]; 2073 rtmp22[idx] = v2[j]; 2074 } 2075 rtmp11[ics[r[row]]] += sctx.shift_amount; 2076 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2077 2078 prow = *bjtmp++; 2079 while (prow < row) { 2080 pc1 = rtmp11 + prow; 2081 pc2 = rtmp22 + prow; 2082 if (*pc1 != 0.0 || *pc2 != 0.0) { 2083 pv = ba + bd[prow]; 2084 pj = nbj + bd[prow]; 2085 mul1 = *pc1 * *pv; 2086 mul2 = *pc2 * *pv; 2087 ++pv; 2088 *pc1 = mul1; 2089 *pc2 = mul2; 2090 2091 nz_tmp = bi[prow+1] - bd[prow] - 1; 2092 for (j=0; j<nz_tmp; j++) { 2093 tmp = pv[j]; 2094 idx = pj[j]; 2095 rtmp11[idx] -= mul1 * tmp; 2096 rtmp22[idx] -= mul2 * tmp; 2097 } 2098 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2099 } 2100 prow = *bjtmp++; 2101 } 2102 2103 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2104 pc1 = rtmp11 + prow; 2105 pc2 = rtmp22 + prow; 2106 2107 sctx.pv = *pc1; 2108 pj = bj + bi[prow]; 2109 rs = 0.0; 2110 for (j=0; j<nz; j++) { 2111 idx = pj[j]; 2112 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2113 } 2114 sctx.rs = rs; 2115 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2116 if (sctx.newshift) goto endofwhile; 2117 2118 if (*pc2 != 0.0) { 2119 pj = nbj + bd[prow]; 2120 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2121 *pc2 = mul2; 2122 nz_tmp = bi[prow+1] - bd[prow] - 1; 2123 for (j=0; j<nz_tmp; j++) { 2124 idx = pj[j]; 2125 tmp = rtmp11[idx]; 2126 rtmp22[idx] -= mul2 * tmp; 2127 } 2128 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2129 } 2130 2131 pj = bj + bi[row]; 2132 pc1 = ba + bi[row]; 2133 pc2 = ba + bi[row+1]; 2134 2135 sctx.pv = rtmp22[row+1]; 2136 rs = 0.0; 2137 rtmp11[row] = 1.0/rtmp11[row]; 2138 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2139 /* copy row entries from dense representation to sparse */ 2140 for (j=0; j<nz; j++) { 2141 idx = pj[j]; 2142 pc1[j] = rtmp11[idx]; 2143 pc2[j] = rtmp22[idx]; 2144 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2145 } 2146 sctx.rs = rs; 2147 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2148 if (sctx.newshift) goto endofwhile; 2149 break; 2150 2151 case 3: 2152 for (j=0; j<nz; j++) { 2153 idx = bjtmp[j]; 2154 rtmp11[idx] = 0.0; 2155 rtmp22[idx] = 0.0; 2156 rtmp33[idx] = 0.0; 2157 } 2158 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2159 idx = r[row]; 2160 nz_tmp = ai[idx+1] - ai[idx]; 2161 ajtmp = aj + ai[idx]; 2162 v1 = aa + ai[idx]; 2163 v2 = aa + ai[idx+1]; 2164 v3 = aa + ai[idx+2]; 2165 for (j=0; j<nz_tmp; j++) { 2166 idx = ics[ajtmp[j]]; 2167 rtmp11[idx] = v1[j]; 2168 rtmp22[idx] = v2[j]; 2169 rtmp33[idx] = v3[j]; 2170 } 2171 rtmp11[ics[r[row]]] += sctx.shift_amount; 2172 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2173 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2174 2175 /* loop over all pivot row blocks above this row block */ 2176 prow = *bjtmp++; 2177 while (prow < row) { 2178 pc1 = rtmp11 + prow; 2179 pc2 = rtmp22 + prow; 2180 pc3 = rtmp33 + prow; 2181 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2182 pv = ba + bd[prow]; 2183 pj = nbj + bd[prow]; 2184 mul1 = *pc1 * *pv; 2185 mul2 = *pc2 * *pv; 2186 mul3 = *pc3 * *pv; 2187 ++pv; 2188 *pc1 = mul1; 2189 *pc2 = mul2; 2190 *pc3 = mul3; 2191 2192 nz_tmp = bi[prow+1] - bd[prow] - 1; 2193 /* update this row based on pivot row */ 2194 for (j=0; j<nz_tmp; j++) { 2195 tmp = pv[j]; 2196 idx = pj[j]; 2197 rtmp11[idx] -= mul1 * tmp; 2198 rtmp22[idx] -= mul2 * tmp; 2199 rtmp33[idx] -= mul3 * tmp; 2200 } 2201 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2202 } 2203 prow = *bjtmp++; 2204 } 2205 2206 /* Now take care of diagonal 3x3 block in this set of rows */ 2207 /* note: prow = row here */ 2208 pc1 = rtmp11 + prow; 2209 pc2 = rtmp22 + prow; 2210 pc3 = rtmp33 + prow; 2211 2212 sctx.pv = *pc1; 2213 pj = bj + bi[prow]; 2214 rs = 0.0; 2215 for (j=0; j<nz; j++) { 2216 idx = pj[j]; 2217 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2218 } 2219 sctx.rs = rs; 2220 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2221 if (sctx.newshift) goto endofwhile; 2222 2223 if (*pc2 != 0.0 || *pc3 != 0.0) { 2224 mul2 = (*pc2)/(*pc1); 2225 mul3 = (*pc3)/(*pc1); 2226 *pc2 = mul2; 2227 *pc3 = mul3; 2228 nz_tmp = bi[prow+1] - bd[prow] - 1; 2229 pj = nbj + bd[prow]; 2230 for (j=0; j<nz_tmp; j++) { 2231 idx = pj[j]; 2232 tmp = rtmp11[idx]; 2233 rtmp22[idx] -= mul2 * tmp; 2234 rtmp33[idx] -= mul3 * tmp; 2235 } 2236 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2237 } 2238 ++prow; 2239 2240 pc2 = rtmp22 + prow; 2241 pc3 = rtmp33 + prow; 2242 sctx.pv = *pc2; 2243 pj = bj + bi[prow]; 2244 rs = 0.0; 2245 for (j=0; j<nz; j++) { 2246 idx = pj[j]; 2247 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2248 } 2249 sctx.rs = rs; 2250 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2251 if (sctx.newshift) goto endofwhile; 2252 2253 if (*pc3 != 0.0) { 2254 mul3 = (*pc3)/(*pc2); 2255 *pc3 = mul3; 2256 pj = nbj + bd[prow]; 2257 nz_tmp = bi[prow+1] - bd[prow] - 1; 2258 for (j=0; j<nz_tmp; j++) { 2259 idx = pj[j]; 2260 tmp = rtmp22[idx]; 2261 rtmp33[idx] -= mul3 * tmp; 2262 } 2263 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2264 } 2265 2266 pj = bj + bi[row]; 2267 pc1 = ba + bi[row]; 2268 pc2 = ba + bi[row+1]; 2269 pc3 = ba + bi[row+2]; 2270 2271 sctx.pv = rtmp33[row+2]; 2272 rs = 0.0; 2273 rtmp11[row] = 1.0/rtmp11[row]; 2274 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2275 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2276 /* copy row entries from dense representation to sparse */ 2277 for (j=0; j<nz; j++) { 2278 idx = pj[j]; 2279 pc1[j] = rtmp11[idx]; 2280 pc2[j] = rtmp22[idx]; 2281 pc3[j] = rtmp33[idx]; 2282 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2283 } 2284 2285 sctx.rs = rs; 2286 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2287 if (sctx.newshift) goto endofwhile; 2288 break; 2289 2290 default: 2291 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2292 } 2293 row += nodesz; /* Update the row */ 2294 } 2295 endofwhile:; 2296 } while (sctx.newshift); 2297 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2298 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2299 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2300 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2301 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2302 2303 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2304 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2305 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2306 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2307 C->assembled = PETSC_TRUE; 2308 C->preallocated = PETSC_TRUE; 2309 if (sctx.nshift) { 2310 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2311 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2312 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2313 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2314 } 2315 } 2316 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2317 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2318 PetscFunctionReturn(0); 2319 } 2320 2321 2322 /* ----------------------------------------------------------- */ 2323 #undef __FUNCT__ 2324 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2325 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2326 { 2327 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2328 IS iscol = a->col,isrow = a->row; 2329 PetscErrorCode ierr; 2330 const PetscInt *r,*c,*rout,*cout; 2331 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2332 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2333 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2334 PetscScalar sum1,sum2,sum3,sum4,sum5; 2335 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2336 const PetscScalar *b; 2337 2338 PetscFunctionBegin; 2339 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2340 node_max = a->inode.node_count; 2341 ns = a->inode.size; /* Node Size array */ 2342 2343 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2344 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2345 tmp = a->solve_work; 2346 2347 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2348 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2349 2350 /* forward solve the lower triangular */ 2351 tmps = tmp; 2352 aa = a_a; 2353 aj = a_j; 2354 ad = a->diag; 2355 2356 for (i = 0,row = 0; i< node_max; ++i) { 2357 nsz = ns[i]; 2358 aii = ai[row]; 2359 v1 = aa + aii; 2360 vi = aj + aii; 2361 nz = ai[row+1]- ai[row]; 2362 2363 if (i < node_max-1) { 2364 /* Prefetch the indices for the next block */ 2365 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2366 /* Prefetch the data for the next block */ 2367 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2368 } 2369 2370 switch (nsz) { /* Each loop in 'case' is unrolled */ 2371 case 1: 2372 sum1 = b[r[row]]; 2373 for (j=0; j<nz-1; j+=2) { 2374 i0 = vi[j]; 2375 i1 = vi[j+1]; 2376 tmp0 = tmps[i0]; 2377 tmp1 = tmps[i1]; 2378 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2379 } 2380 if (j == nz-1) { 2381 tmp0 = tmps[vi[j]]; 2382 sum1 -= v1[j]*tmp0; 2383 } 2384 tmp[row++]=sum1; 2385 break; 2386 case 2: 2387 sum1 = b[r[row]]; 2388 sum2 = b[r[row+1]]; 2389 v2 = aa + ai[row+1]; 2390 2391 for (j=0; j<nz-1; j+=2) { 2392 i0 = vi[j]; 2393 i1 = vi[j+1]; 2394 tmp0 = tmps[i0]; 2395 tmp1 = tmps[i1]; 2396 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2397 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2398 } 2399 if (j == nz-1) { 2400 tmp0 = tmps[vi[j]]; 2401 sum1 -= v1[j] *tmp0; 2402 sum2 -= v2[j] *tmp0; 2403 } 2404 sum2 -= v2[nz] * sum1; 2405 tmp[row++]=sum1; 2406 tmp[row++]=sum2; 2407 break; 2408 case 3: 2409 sum1 = b[r[row]]; 2410 sum2 = b[r[row+1]]; 2411 sum3 = b[r[row+2]]; 2412 v2 = aa + ai[row+1]; 2413 v3 = aa + ai[row+2]; 2414 2415 for (j=0; j<nz-1; j+=2) { 2416 i0 = vi[j]; 2417 i1 = vi[j+1]; 2418 tmp0 = tmps[i0]; 2419 tmp1 = tmps[i1]; 2420 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2421 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2422 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2423 } 2424 if (j == nz-1) { 2425 tmp0 = tmps[vi[j]]; 2426 sum1 -= v1[j] *tmp0; 2427 sum2 -= v2[j] *tmp0; 2428 sum3 -= v3[j] *tmp0; 2429 } 2430 sum2 -= v2[nz] * sum1; 2431 sum3 -= v3[nz] * sum1; 2432 sum3 -= v3[nz+1] * sum2; 2433 tmp[row++]=sum1; 2434 tmp[row++]=sum2; 2435 tmp[row++]=sum3; 2436 break; 2437 2438 case 4: 2439 sum1 = b[r[row]]; 2440 sum2 = b[r[row+1]]; 2441 sum3 = b[r[row+2]]; 2442 sum4 = b[r[row+3]]; 2443 v2 = aa + ai[row+1]; 2444 v3 = aa + ai[row+2]; 2445 v4 = aa + ai[row+3]; 2446 2447 for (j=0; j<nz-1; j+=2) { 2448 i0 = vi[j]; 2449 i1 = vi[j+1]; 2450 tmp0 = tmps[i0]; 2451 tmp1 = tmps[i1]; 2452 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2453 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2454 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2455 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2456 } 2457 if (j == nz-1) { 2458 tmp0 = tmps[vi[j]]; 2459 sum1 -= v1[j] *tmp0; 2460 sum2 -= v2[j] *tmp0; 2461 sum3 -= v3[j] *tmp0; 2462 sum4 -= v4[j] *tmp0; 2463 } 2464 sum2 -= v2[nz] * sum1; 2465 sum3 -= v3[nz] * sum1; 2466 sum4 -= v4[nz] * sum1; 2467 sum3 -= v3[nz+1] * sum2; 2468 sum4 -= v4[nz+1] * sum2; 2469 sum4 -= v4[nz+2] * sum3; 2470 2471 tmp[row++]=sum1; 2472 tmp[row++]=sum2; 2473 tmp[row++]=sum3; 2474 tmp[row++]=sum4; 2475 break; 2476 case 5: 2477 sum1 = b[r[row]]; 2478 sum2 = b[r[row+1]]; 2479 sum3 = b[r[row+2]]; 2480 sum4 = b[r[row+3]]; 2481 sum5 = b[r[row+4]]; 2482 v2 = aa + ai[row+1]; 2483 v3 = aa + ai[row+2]; 2484 v4 = aa + ai[row+3]; 2485 v5 = aa + ai[row+4]; 2486 2487 for (j=0; j<nz-1; j+=2) { 2488 i0 = vi[j]; 2489 i1 = vi[j+1]; 2490 tmp0 = tmps[i0]; 2491 tmp1 = tmps[i1]; 2492 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2493 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2494 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2495 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2496 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2497 } 2498 if (j == nz-1) { 2499 tmp0 = tmps[vi[j]]; 2500 sum1 -= v1[j] *tmp0; 2501 sum2 -= v2[j] *tmp0; 2502 sum3 -= v3[j] *tmp0; 2503 sum4 -= v4[j] *tmp0; 2504 sum5 -= v5[j] *tmp0; 2505 } 2506 2507 sum2 -= v2[nz] * sum1; 2508 sum3 -= v3[nz] * sum1; 2509 sum4 -= v4[nz] * sum1; 2510 sum5 -= v5[nz] * sum1; 2511 sum3 -= v3[nz+1] * sum2; 2512 sum4 -= v4[nz+1] * sum2; 2513 sum5 -= v5[nz+1] * sum2; 2514 sum4 -= v4[nz+2] * sum3; 2515 sum5 -= v5[nz+2] * sum3; 2516 sum5 -= v5[nz+3] * sum4; 2517 2518 tmp[row++]=sum1; 2519 tmp[row++]=sum2; 2520 tmp[row++]=sum3; 2521 tmp[row++]=sum4; 2522 tmp[row++]=sum5; 2523 break; 2524 default: 2525 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2526 } 2527 } 2528 /* backward solve the upper triangular */ 2529 for (i=node_max -1,row = n-1; i>=0; i--) { 2530 nsz = ns[i]; 2531 aii = ad[row+1] + 1; 2532 v1 = aa + aii; 2533 vi = aj + aii; 2534 nz = ad[row]- ad[row+1] - 1; 2535 2536 if (i > 0) { 2537 /* Prefetch the indices for the next block */ 2538 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2539 /* Prefetch the data for the next block */ 2540 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2541 } 2542 2543 switch (nsz) { /* Each loop in 'case' is unrolled */ 2544 case 1: 2545 sum1 = tmp[row]; 2546 2547 for (j=0; j<nz-1; j+=2) { 2548 i0 = vi[j]; 2549 i1 = vi[j+1]; 2550 tmp0 = tmps[i0]; 2551 tmp1 = tmps[i1]; 2552 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2553 } 2554 if (j == nz-1) { 2555 tmp0 = tmps[vi[j]]; 2556 sum1 -= v1[j]*tmp0; 2557 } 2558 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2559 break; 2560 case 2: 2561 sum1 = tmp[row]; 2562 sum2 = tmp[row-1]; 2563 v2 = aa + ad[row] + 1; 2564 for (j=0; j<nz-1; j+=2) { 2565 i0 = vi[j]; 2566 i1 = vi[j+1]; 2567 tmp0 = tmps[i0]; 2568 tmp1 = tmps[i1]; 2569 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2570 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2571 } 2572 if (j == nz-1) { 2573 tmp0 = tmps[vi[j]]; 2574 sum1 -= v1[j]* tmp0; 2575 sum2 -= v2[j+1]* tmp0; 2576 } 2577 2578 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2579 sum2 -= v2[0] * tmp0; 2580 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2581 break; 2582 case 3: 2583 sum1 = tmp[row]; 2584 sum2 = tmp[row -1]; 2585 sum3 = tmp[row -2]; 2586 v2 = aa + ad[row] + 1; 2587 v3 = aa + ad[row -1] + 1; 2588 for (j=0; j<nz-1; j+=2) { 2589 i0 = vi[j]; 2590 i1 = vi[j+1]; 2591 tmp0 = tmps[i0]; 2592 tmp1 = tmps[i1]; 2593 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2594 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2595 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2596 } 2597 if (j== nz-1) { 2598 tmp0 = tmps[vi[j]]; 2599 sum1 -= v1[j] * tmp0; 2600 sum2 -= v2[j+1] * tmp0; 2601 sum3 -= v3[j+2] * tmp0; 2602 } 2603 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2604 sum2 -= v2[0]* tmp0; 2605 sum3 -= v3[1] * tmp0; 2606 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2607 sum3 -= v3[0]* tmp0; 2608 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2609 2610 break; 2611 case 4: 2612 sum1 = tmp[row]; 2613 sum2 = tmp[row -1]; 2614 sum3 = tmp[row -2]; 2615 sum4 = tmp[row -3]; 2616 v2 = aa + ad[row]+1; 2617 v3 = aa + ad[row -1]+1; 2618 v4 = aa + ad[row -2]+1; 2619 2620 for (j=0; j<nz-1; j+=2) { 2621 i0 = vi[j]; 2622 i1 = vi[j+1]; 2623 tmp0 = tmps[i0]; 2624 tmp1 = tmps[i1]; 2625 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2626 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2627 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2628 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2629 } 2630 if (j== nz-1) { 2631 tmp0 = tmps[vi[j]]; 2632 sum1 -= v1[j] * tmp0; 2633 sum2 -= v2[j+1] * tmp0; 2634 sum3 -= v3[j+2] * tmp0; 2635 sum4 -= v4[j+3] * tmp0; 2636 } 2637 2638 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2639 sum2 -= v2[0] * tmp0; 2640 sum3 -= v3[1] * tmp0; 2641 sum4 -= v4[2] * tmp0; 2642 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2643 sum3 -= v3[0] * tmp0; 2644 sum4 -= v4[1] * tmp0; 2645 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2646 sum4 -= v4[0] * tmp0; 2647 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2648 break; 2649 case 5: 2650 sum1 = tmp[row]; 2651 sum2 = tmp[row -1]; 2652 sum3 = tmp[row -2]; 2653 sum4 = tmp[row -3]; 2654 sum5 = tmp[row -4]; 2655 v2 = aa + ad[row]+1; 2656 v3 = aa + ad[row -1]+1; 2657 v4 = aa + ad[row -2]+1; 2658 v5 = aa + ad[row -3]+1; 2659 for (j=0; j<nz-1; j+=2) { 2660 i0 = vi[j]; 2661 i1 = vi[j+1]; 2662 tmp0 = tmps[i0]; 2663 tmp1 = tmps[i1]; 2664 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2665 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2666 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2667 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2668 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2669 } 2670 if (j==nz-1) { 2671 tmp0 = tmps[vi[j]]; 2672 sum1 -= v1[j] * tmp0; 2673 sum2 -= v2[j+1] * tmp0; 2674 sum3 -= v3[j+2] * tmp0; 2675 sum4 -= v4[j+3] * tmp0; 2676 sum5 -= v5[j+4] * tmp0; 2677 } 2678 2679 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2680 sum2 -= v2[0] * tmp0; 2681 sum3 -= v3[1] * tmp0; 2682 sum4 -= v4[2] * tmp0; 2683 sum5 -= v5[3] * tmp0; 2684 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2685 sum3 -= v3[0] * tmp0; 2686 sum4 -= v4[1] * tmp0; 2687 sum5 -= v5[2] * tmp0; 2688 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2689 sum4 -= v4[0] * tmp0; 2690 sum5 -= v5[1] * tmp0; 2691 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2692 sum5 -= v5[0] * tmp0; 2693 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2694 break; 2695 default: 2696 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2697 } 2698 } 2699 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2700 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2701 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2703 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2704 PetscFunctionReturn(0); 2705 } 2706 2707 2708 /* 2709 Makes a longer coloring[] array and calls the usual code with that 2710 */ 2711 #undef __FUNCT__ 2712 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2713 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2714 { 2715 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2716 PetscErrorCode ierr; 2717 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2718 PetscInt *colorused,i; 2719 ISColoringValue *newcolor; 2720 2721 PetscFunctionBegin; 2722 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2723 /* loop over inodes, marking a color for each column*/ 2724 row = 0; 2725 for (i=0; i<m; i++) { 2726 for (j=0; j<ns[i]; j++) { 2727 newcolor[row++] = coloring[i] + j*ncolors; 2728 } 2729 } 2730 2731 /* eliminate unneeded colors */ 2732 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2733 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2734 for (i=0; i<n; i++) { 2735 colorused[newcolor[i]] = 1; 2736 } 2737 2738 for (i=1; i<5*ncolors; i++) { 2739 colorused[i] += colorused[i-1]; 2740 } 2741 ncolors = colorused[5*ncolors-1]; 2742 for (i=0; i<n; i++) { 2743 newcolor[i] = colorused[newcolor[i]]-1; 2744 } 2745 ierr = PetscFree(colorused);CHKERRQ(ierr); 2746 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2747 ierr = PetscFree(coloring);CHKERRQ(ierr); 2748 PetscFunctionReturn(0); 2749 } 2750 2751 #include <petsc-private/kernels/blockinvert.h> 2752 2753 #undef __FUNCT__ 2754 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2755 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2756 { 2757 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2758 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2759 MatScalar *ibdiag,*bdiag,work[25],*t; 2760 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2761 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2762 const PetscScalar *xb, *b; 2763 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2764 PetscErrorCode ierr; 2765 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2766 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2767 2768 PetscFunctionBegin; 2769 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2770 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2771 2772 if (!a->inode.ibdiagvalid) { 2773 if (!a->inode.ibdiag) { 2774 /* calculate space needed for diagonal blocks */ 2775 for (i=0; i<m; i++) { 2776 cnt += sizes[i]*sizes[i]; 2777 } 2778 a->inode.bdiagsize = cnt; 2779 2780 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2781 } 2782 2783 /* copy over the diagonal blocks and invert them */ 2784 ibdiag = a->inode.ibdiag; 2785 bdiag = a->inode.bdiag; 2786 cnt = 0; 2787 for (i=0, row = 0; i<m; i++) { 2788 for (j=0; j<sizes[i]; j++) { 2789 for (k=0; k<sizes[i]; k++) { 2790 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2791 } 2792 } 2793 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2794 2795 switch (sizes[i]) { 2796 case 1: 2797 /* Create matrix data structure */ 2798 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2799 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2800 break; 2801 case 2: 2802 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2803 break; 2804 case 3: 2805 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2806 break; 2807 case 4: 2808 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2809 break; 2810 case 5: 2811 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2812 break; 2813 default: 2814 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2815 } 2816 cnt += sizes[i]*sizes[i]; 2817 row += sizes[i]; 2818 } 2819 a->inode.ibdiagvalid = PETSC_TRUE; 2820 } 2821 ibdiag = a->inode.ibdiag; 2822 bdiag = a->inode.bdiag; 2823 t = a->inode.ssor_work; 2824 2825 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2826 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2827 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2828 if (flag & SOR_ZERO_INITIAL_GUESS) { 2829 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2830 2831 for (i=0, row=0; i<m; i++) { 2832 sz = diag[row] - ii[row]; 2833 v1 = a->a + ii[row]; 2834 idx = a->j + ii[row]; 2835 2836 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2837 switch (sizes[i]) { 2838 case 1: 2839 2840 sum1 = b[row]; 2841 for (n = 0; n<sz-1; n+=2) { 2842 i1 = idx[0]; 2843 i2 = idx[1]; 2844 idx += 2; 2845 tmp0 = x[i1]; 2846 tmp1 = x[i2]; 2847 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2848 } 2849 2850 if (n == sz-1) { 2851 tmp0 = x[*idx]; 2852 sum1 -= *v1 * tmp0; 2853 } 2854 t[row] = sum1; 2855 x[row++] = sum1*(*ibdiag++); 2856 break; 2857 case 2: 2858 v2 = a->a + ii[row+1]; 2859 sum1 = b[row]; 2860 sum2 = b[row+1]; 2861 for (n = 0; n<sz-1; n+=2) { 2862 i1 = idx[0]; 2863 i2 = idx[1]; 2864 idx += 2; 2865 tmp0 = x[i1]; 2866 tmp1 = x[i2]; 2867 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2868 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2869 } 2870 2871 if (n == sz-1) { 2872 tmp0 = x[*idx]; 2873 sum1 -= v1[0] * tmp0; 2874 sum2 -= v2[0] * tmp0; 2875 } 2876 t[row] = sum1; 2877 t[row+1] = sum2; 2878 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2879 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2880 ibdiag += 4; 2881 break; 2882 case 3: 2883 v2 = a->a + ii[row+1]; 2884 v3 = a->a + ii[row+2]; 2885 sum1 = b[row]; 2886 sum2 = b[row+1]; 2887 sum3 = b[row+2]; 2888 for (n = 0; n<sz-1; n+=2) { 2889 i1 = idx[0]; 2890 i2 = idx[1]; 2891 idx += 2; 2892 tmp0 = x[i1]; 2893 tmp1 = x[i2]; 2894 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2895 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2896 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2897 } 2898 2899 if (n == sz-1) { 2900 tmp0 = x[*idx]; 2901 sum1 -= v1[0] * tmp0; 2902 sum2 -= v2[0] * tmp0; 2903 sum3 -= v3[0] * tmp0; 2904 } 2905 t[row] = sum1; 2906 t[row+1] = sum2; 2907 t[row+2] = sum3; 2908 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2909 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2910 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2911 ibdiag += 9; 2912 break; 2913 case 4: 2914 v2 = a->a + ii[row+1]; 2915 v3 = a->a + ii[row+2]; 2916 v4 = a->a + ii[row+3]; 2917 sum1 = b[row]; 2918 sum2 = b[row+1]; 2919 sum3 = b[row+2]; 2920 sum4 = b[row+3]; 2921 for (n = 0; n<sz-1; n+=2) { 2922 i1 = idx[0]; 2923 i2 = idx[1]; 2924 idx += 2; 2925 tmp0 = x[i1]; 2926 tmp1 = x[i2]; 2927 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2928 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2929 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2930 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2931 } 2932 2933 if (n == sz-1) { 2934 tmp0 = x[*idx]; 2935 sum1 -= v1[0] * tmp0; 2936 sum2 -= v2[0] * tmp0; 2937 sum3 -= v3[0] * tmp0; 2938 sum4 -= v4[0] * tmp0; 2939 } 2940 t[row] = sum1; 2941 t[row+1] = sum2; 2942 t[row+2] = sum3; 2943 t[row+3] = sum4; 2944 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2945 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2946 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2947 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2948 ibdiag += 16; 2949 break; 2950 case 5: 2951 v2 = a->a + ii[row+1]; 2952 v3 = a->a + ii[row+2]; 2953 v4 = a->a + ii[row+3]; 2954 v5 = a->a + ii[row+4]; 2955 sum1 = b[row]; 2956 sum2 = b[row+1]; 2957 sum3 = b[row+2]; 2958 sum4 = b[row+3]; 2959 sum5 = b[row+4]; 2960 for (n = 0; n<sz-1; n+=2) { 2961 i1 = idx[0]; 2962 i2 = idx[1]; 2963 idx += 2; 2964 tmp0 = x[i1]; 2965 tmp1 = x[i2]; 2966 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2967 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2968 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2969 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2970 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2971 } 2972 2973 if (n == sz-1) { 2974 tmp0 = x[*idx]; 2975 sum1 -= v1[0] * tmp0; 2976 sum2 -= v2[0] * tmp0; 2977 sum3 -= v3[0] * tmp0; 2978 sum4 -= v4[0] * tmp0; 2979 sum5 -= v5[0] * tmp0; 2980 } 2981 t[row] = sum1; 2982 t[row+1] = sum2; 2983 t[row+2] = sum3; 2984 t[row+3] = sum4; 2985 t[row+4] = sum5; 2986 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2987 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2988 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2989 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2990 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2991 ibdiag += 25; 2992 break; 2993 default: 2994 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2995 } 2996 } 2997 2998 xb = t; 2999 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3000 } else xb = b; 3001 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3002 3003 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3004 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3005 ibdiag -= sizes[i]*sizes[i]; 3006 sz = ii[row+1] - diag[row] - 1; 3007 v1 = a->a + diag[row] + 1; 3008 idx = a->j + diag[row] + 1; 3009 3010 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3011 switch (sizes[i]) { 3012 case 1: 3013 3014 sum1 = xb[row]; 3015 for (n = 0; n<sz-1; n+=2) { 3016 i1 = idx[0]; 3017 i2 = idx[1]; 3018 idx += 2; 3019 tmp0 = x[i1]; 3020 tmp1 = x[i2]; 3021 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3022 } 3023 3024 if (n == sz-1) { 3025 tmp0 = x[*idx]; 3026 sum1 -= *v1*tmp0; 3027 } 3028 x[row--] = sum1*(*ibdiag); 3029 break; 3030 3031 case 2: 3032 3033 sum1 = xb[row]; 3034 sum2 = xb[row-1]; 3035 /* note that sum1 is associated with the second of the two rows */ 3036 v2 = a->a + diag[row-1] + 2; 3037 for (n = 0; n<sz-1; n+=2) { 3038 i1 = idx[0]; 3039 i2 = idx[1]; 3040 idx += 2; 3041 tmp0 = x[i1]; 3042 tmp1 = x[i2]; 3043 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3044 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3045 } 3046 3047 if (n == sz-1) { 3048 tmp0 = x[*idx]; 3049 sum1 -= *v1*tmp0; 3050 sum2 -= *v2*tmp0; 3051 } 3052 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3053 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3054 break; 3055 case 3: 3056 3057 sum1 = xb[row]; 3058 sum2 = xb[row-1]; 3059 sum3 = xb[row-2]; 3060 v2 = a->a + diag[row-1] + 2; 3061 v3 = a->a + diag[row-2] + 3; 3062 for (n = 0; n<sz-1; n+=2) { 3063 i1 = idx[0]; 3064 i2 = idx[1]; 3065 idx += 2; 3066 tmp0 = x[i1]; 3067 tmp1 = x[i2]; 3068 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3069 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3070 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3071 } 3072 3073 if (n == sz-1) { 3074 tmp0 = x[*idx]; 3075 sum1 -= *v1*tmp0; 3076 sum2 -= *v2*tmp0; 3077 sum3 -= *v3*tmp0; 3078 } 3079 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3080 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3081 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3082 break; 3083 case 4: 3084 3085 sum1 = xb[row]; 3086 sum2 = xb[row-1]; 3087 sum3 = xb[row-2]; 3088 sum4 = xb[row-3]; 3089 v2 = a->a + diag[row-1] + 2; 3090 v3 = a->a + diag[row-2] + 3; 3091 v4 = a->a + diag[row-3] + 4; 3092 for (n = 0; n<sz-1; n+=2) { 3093 i1 = idx[0]; 3094 i2 = idx[1]; 3095 idx += 2; 3096 tmp0 = x[i1]; 3097 tmp1 = x[i2]; 3098 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3099 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3100 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3101 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3102 } 3103 3104 if (n == sz-1) { 3105 tmp0 = x[*idx]; 3106 sum1 -= *v1*tmp0; 3107 sum2 -= *v2*tmp0; 3108 sum3 -= *v3*tmp0; 3109 sum4 -= *v4*tmp0; 3110 } 3111 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3112 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3113 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3114 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3115 break; 3116 case 5: 3117 3118 sum1 = xb[row]; 3119 sum2 = xb[row-1]; 3120 sum3 = xb[row-2]; 3121 sum4 = xb[row-3]; 3122 sum5 = xb[row-4]; 3123 v2 = a->a + diag[row-1] + 2; 3124 v3 = a->a + diag[row-2] + 3; 3125 v4 = a->a + diag[row-3] + 4; 3126 v5 = a->a + diag[row-4] + 5; 3127 for (n = 0; n<sz-1; n+=2) { 3128 i1 = idx[0]; 3129 i2 = idx[1]; 3130 idx += 2; 3131 tmp0 = x[i1]; 3132 tmp1 = x[i2]; 3133 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3134 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3135 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3136 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3137 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3138 } 3139 3140 if (n == sz-1) { 3141 tmp0 = x[*idx]; 3142 sum1 -= *v1*tmp0; 3143 sum2 -= *v2*tmp0; 3144 sum3 -= *v3*tmp0; 3145 sum4 -= *v4*tmp0; 3146 sum5 -= *v5*tmp0; 3147 } 3148 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3149 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3150 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3151 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3152 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3153 break; 3154 default: 3155 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3156 } 3157 } 3158 3159 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3160 } 3161 its--; 3162 } 3163 while (its--) { 3164 3165 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3166 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3167 i<m; 3168 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3169 3170 sz = diag[row] - ii[row]; 3171 v1 = a->a + ii[row]; 3172 idx = a->j + ii[row]; 3173 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3174 switch (sizes[i]) { 3175 case 1: 3176 sum1 = b[row]; 3177 for (n = 0; n<sz-1; n+=2) { 3178 i1 = idx[0]; 3179 i2 = idx[1]; 3180 idx += 2; 3181 tmp0 = x[i1]; 3182 tmp1 = x[i2]; 3183 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3184 } 3185 if (n == sz-1) { 3186 tmp0 = x[*idx++]; 3187 sum1 -= *v1 * tmp0; 3188 v1++; 3189 } 3190 t[row] = sum1; 3191 sz = ii[row+1] - diag[row] - 1; 3192 idx = a->j + diag[row] + 1; 3193 v1 += 1; 3194 for (n = 0; n<sz-1; n+=2) { 3195 i1 = idx[0]; 3196 i2 = idx[1]; 3197 idx += 2; 3198 tmp0 = x[i1]; 3199 tmp1 = x[i2]; 3200 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3201 } 3202 if (n == sz-1) { 3203 tmp0 = x[*idx++]; 3204 sum1 -= *v1 * tmp0; 3205 } 3206 /* in MatSOR_SeqAIJ this line would be 3207 * 3208 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3209 * 3210 * but omega == 1, so this becomes 3211 * 3212 * x[row] = sum1*(*ibdiag++); 3213 * 3214 */ 3215 x[row] = sum1*(*ibdiag); 3216 break; 3217 case 2: 3218 v2 = a->a + ii[row+1]; 3219 sum1 = b[row]; 3220 sum2 = b[row+1]; 3221 for (n = 0; n<sz-1; n+=2) { 3222 i1 = idx[0]; 3223 i2 = idx[1]; 3224 idx += 2; 3225 tmp0 = x[i1]; 3226 tmp1 = x[i2]; 3227 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3228 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3229 } 3230 if (n == sz-1) { 3231 tmp0 = x[*idx++]; 3232 sum1 -= v1[0] * tmp0; 3233 sum2 -= v2[0] * tmp0; 3234 v1++; v2++; 3235 } 3236 t[row] = sum1; 3237 t[row+1] = sum2; 3238 sz = ii[row+1] - diag[row] - 2; 3239 idx = a->j + diag[row] + 2; 3240 v1 += 2; 3241 v2 += 2; 3242 for (n = 0; n<sz-1; n+=2) { 3243 i1 = idx[0]; 3244 i2 = idx[1]; 3245 idx += 2; 3246 tmp0 = x[i1]; 3247 tmp1 = x[i2]; 3248 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3249 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3250 } 3251 if (n == sz-1) { 3252 tmp0 = x[*idx]; 3253 sum1 -= v1[0] * tmp0; 3254 sum2 -= v2[0] * tmp0; 3255 } 3256 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3257 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3258 break; 3259 case 3: 3260 v2 = a->a + ii[row+1]; 3261 v3 = a->a + ii[row+2]; 3262 sum1 = b[row]; 3263 sum2 = b[row+1]; 3264 sum3 = b[row+2]; 3265 for (n = 0; n<sz-1; n+=2) { 3266 i1 = idx[0]; 3267 i2 = idx[1]; 3268 idx += 2; 3269 tmp0 = x[i1]; 3270 tmp1 = x[i2]; 3271 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3272 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3273 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3274 } 3275 if (n == sz-1) { 3276 tmp0 = x[*idx++]; 3277 sum1 -= v1[0] * tmp0; 3278 sum2 -= v2[0] * tmp0; 3279 sum3 -= v3[0] * tmp0; 3280 v1++; v2++; v3++; 3281 } 3282 t[row] = sum1; 3283 t[row+1] = sum2; 3284 t[row+2] = sum3; 3285 sz = ii[row+1] - diag[row] - 3; 3286 idx = a->j + diag[row] + 3; 3287 v1 += 3; 3288 v2 += 3; 3289 v3 += 3; 3290 for (n = 0; n<sz-1; n+=2) { 3291 i1 = idx[0]; 3292 i2 = idx[1]; 3293 idx += 2; 3294 tmp0 = x[i1]; 3295 tmp1 = x[i2]; 3296 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3297 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3298 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3299 } 3300 if (n == sz-1) { 3301 tmp0 = x[*idx]; 3302 sum1 -= v1[0] * tmp0; 3303 sum2 -= v2[0] * tmp0; 3304 sum3 -= v3[0] * tmp0; 3305 } 3306 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3307 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3308 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3309 break; 3310 case 4: 3311 v2 = a->a + ii[row+1]; 3312 v3 = a->a + ii[row+2]; 3313 v4 = a->a + ii[row+3]; 3314 sum1 = b[row]; 3315 sum2 = b[row+1]; 3316 sum3 = b[row+2]; 3317 sum4 = b[row+3]; 3318 for (n = 0; n<sz-1; n+=2) { 3319 i1 = idx[0]; 3320 i2 = idx[1]; 3321 idx += 2; 3322 tmp0 = x[i1]; 3323 tmp1 = x[i2]; 3324 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3325 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3326 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3327 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3328 } 3329 if (n == sz-1) { 3330 tmp0 = x[*idx++]; 3331 sum1 -= v1[0] * tmp0; 3332 sum2 -= v2[0] * tmp0; 3333 sum3 -= v3[0] * tmp0; 3334 sum4 -= v4[0] * tmp0; 3335 v1++; v2++; v3++; v4++; 3336 } 3337 t[row] = sum1; 3338 t[row+1] = sum2; 3339 t[row+2] = sum3; 3340 t[row+3] = sum4; 3341 sz = ii[row+1] - diag[row] - 4; 3342 idx = a->j + diag[row] + 4; 3343 v1 += 4; 3344 v2 += 4; 3345 v3 += 4; 3346 v4 += 4; 3347 for (n = 0; n<sz-1; n+=2) { 3348 i1 = idx[0]; 3349 i2 = idx[1]; 3350 idx += 2; 3351 tmp0 = x[i1]; 3352 tmp1 = x[i2]; 3353 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3354 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3355 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3356 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3357 } 3358 if (n == sz-1) { 3359 tmp0 = x[*idx]; 3360 sum1 -= v1[0] * tmp0; 3361 sum2 -= v2[0] * tmp0; 3362 sum3 -= v3[0] * tmp0; 3363 sum4 -= v4[0] * tmp0; 3364 } 3365 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3366 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3367 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3368 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3369 break; 3370 case 5: 3371 v2 = a->a + ii[row+1]; 3372 v3 = a->a + ii[row+2]; 3373 v4 = a->a + ii[row+3]; 3374 v5 = a->a + ii[row+4]; 3375 sum1 = b[row]; 3376 sum2 = b[row+1]; 3377 sum3 = b[row+2]; 3378 sum4 = b[row+3]; 3379 sum5 = b[row+4]; 3380 for (n = 0; n<sz-1; n+=2) { 3381 i1 = idx[0]; 3382 i2 = idx[1]; 3383 idx += 2; 3384 tmp0 = x[i1]; 3385 tmp1 = x[i2]; 3386 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3387 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3388 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3389 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3390 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3391 } 3392 if (n == sz-1) { 3393 tmp0 = x[*idx++]; 3394 sum1 -= v1[0] * tmp0; 3395 sum2 -= v2[0] * tmp0; 3396 sum3 -= v3[0] * tmp0; 3397 sum4 -= v4[0] * tmp0; 3398 sum5 -= v5[0] * tmp0; 3399 v1++; v2++; v3++; v4++; v5++; 3400 } 3401 t[row] = sum1; 3402 t[row+1] = sum2; 3403 t[row+2] = sum3; 3404 t[row+3] = sum4; 3405 t[row+4] = sum5; 3406 sz = ii[row+1] - diag[row] - 5; 3407 idx = a->j + diag[row] + 5; 3408 v1 += 5; 3409 v2 += 5; 3410 v3 += 5; 3411 v4 += 5; 3412 v5 += 5; 3413 for (n = 0; n<sz-1; n+=2) { 3414 i1 = idx[0]; 3415 i2 = idx[1]; 3416 idx += 2; 3417 tmp0 = x[i1]; 3418 tmp1 = x[i2]; 3419 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3420 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3421 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3422 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3423 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3424 } 3425 if (n == sz-1) { 3426 tmp0 = x[*idx]; 3427 sum1 -= v1[0] * tmp0; 3428 sum2 -= v2[0] * tmp0; 3429 sum3 -= v3[0] * tmp0; 3430 sum4 -= v4[0] * tmp0; 3431 sum5 -= v5[0] * tmp0; 3432 } 3433 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3434 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3435 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3436 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3437 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3438 break; 3439 default: 3440 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3441 } 3442 } 3443 xb = t; 3444 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3445 } else xb = b; 3446 3447 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3448 3449 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3450 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3451 ibdiag -= sizes[i]*sizes[i]; 3452 3453 /* set RHS */ 3454 if (xb == b) { 3455 /* whole (old way) */ 3456 sz = ii[row+1] - ii[row]; 3457 idx = a->j + ii[row]; 3458 switch (sizes[i]) { 3459 case 5: 3460 v5 = a->a + ii[row-4]; 3461 case 4: /* fall through */ 3462 v4 = a->a + ii[row-3]; 3463 case 3: 3464 v3 = a->a + ii[row-2]; 3465 case 2: 3466 v2 = a->a + ii[row-1]; 3467 case 1: 3468 v1 = a->a + ii[row]; 3469 break; 3470 default: 3471 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3472 } 3473 } else { 3474 /* upper, no diag */ 3475 sz = ii[row+1] - diag[row] - 1; 3476 idx = a->j + diag[row] + 1; 3477 switch (sizes[i]) { 3478 case 5: 3479 v5 = a->a + diag[row-4] + 5; 3480 case 4: /* fall through */ 3481 v4 = a->a + diag[row-3] + 4; 3482 case 3: 3483 v3 = a->a + diag[row-2] + 3; 3484 case 2: 3485 v2 = a->a + diag[row-1] + 2; 3486 case 1: 3487 v1 = a->a + diag[row] + 1; 3488 } 3489 } 3490 /* set sum */ 3491 switch (sizes[i]) { 3492 case 5: 3493 sum5 = xb[row-4]; 3494 case 4: /* fall through */ 3495 sum4 = xb[row-3]; 3496 case 3: 3497 sum3 = xb[row-2]; 3498 case 2: 3499 sum2 = xb[row-1]; 3500 case 1: 3501 /* note that sum1 is associated with the last row */ 3502 sum1 = xb[row]; 3503 } 3504 /* do sums */ 3505 for (n = 0; n<sz-1; n+=2) { 3506 i1 = idx[0]; 3507 i2 = idx[1]; 3508 idx += 2; 3509 tmp0 = x[i1]; 3510 tmp1 = x[i2]; 3511 switch (sizes[i]) { 3512 case 5: 3513 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3514 case 4: /* fall through */ 3515 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3516 case 3: 3517 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3518 case 2: 3519 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3520 case 1: 3521 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3522 } 3523 } 3524 /* ragged edge */ 3525 if (n == sz-1) { 3526 tmp0 = x[*idx]; 3527 switch (sizes[i]) { 3528 case 5: 3529 sum5 -= *v5*tmp0; 3530 case 4: /* fall through */ 3531 sum4 -= *v4*tmp0; 3532 case 3: 3533 sum3 -= *v3*tmp0; 3534 case 2: 3535 sum2 -= *v2*tmp0; 3536 case 1: 3537 sum1 -= *v1*tmp0; 3538 } 3539 } 3540 /* update */ 3541 if (xb == b) { 3542 /* whole (old way) w/ diag */ 3543 switch (sizes[i]) { 3544 case 5: 3545 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3546 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3547 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3548 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3549 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3550 break; 3551 case 4: 3552 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3553 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3554 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3555 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3556 break; 3557 case 3: 3558 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3559 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3560 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3561 break; 3562 case 2: 3563 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3564 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3565 break; 3566 case 1: 3567 x[row--] += sum1*(*ibdiag); 3568 break; 3569 } 3570 } else { 3571 /* no diag so set = */ 3572 switch (sizes[i]) { 3573 case 5: 3574 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3575 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3576 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3577 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3578 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3579 break; 3580 case 4: 3581 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3582 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3583 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3584 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3585 break; 3586 case 3: 3587 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3588 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3589 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3590 break; 3591 case 2: 3592 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3593 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3594 break; 3595 case 1: 3596 x[row--] = sum1*(*ibdiag); 3597 break; 3598 } 3599 } 3600 } 3601 if (xb == b) { 3602 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3603 } else { 3604 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3605 } 3606 } 3607 } 3608 if (flag & SOR_EISENSTAT) { 3609 /* 3610 Apply (U + D)^-1 where D is now the block diagonal 3611 */ 3612 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3613 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3614 ibdiag -= sizes[i]*sizes[i]; 3615 sz = ii[row+1] - diag[row] - 1; 3616 v1 = a->a + diag[row] + 1; 3617 idx = a->j + diag[row] + 1; 3618 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3619 switch (sizes[i]) { 3620 case 1: 3621 3622 sum1 = b[row]; 3623 for (n = 0; n<sz-1; n+=2) { 3624 i1 = idx[0]; 3625 i2 = idx[1]; 3626 idx += 2; 3627 tmp0 = x[i1]; 3628 tmp1 = x[i2]; 3629 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3630 } 3631 3632 if (n == sz-1) { 3633 tmp0 = x[*idx]; 3634 sum1 -= *v1*tmp0; 3635 } 3636 x[row] = sum1*(*ibdiag);row--; 3637 break; 3638 3639 case 2: 3640 3641 sum1 = b[row]; 3642 sum2 = b[row-1]; 3643 /* note that sum1 is associated with the second of the two rows */ 3644 v2 = a->a + diag[row-1] + 2; 3645 for (n = 0; n<sz-1; n+=2) { 3646 i1 = idx[0]; 3647 i2 = idx[1]; 3648 idx += 2; 3649 tmp0 = x[i1]; 3650 tmp1 = x[i2]; 3651 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3652 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3653 } 3654 3655 if (n == sz-1) { 3656 tmp0 = x[*idx]; 3657 sum1 -= *v1*tmp0; 3658 sum2 -= *v2*tmp0; 3659 } 3660 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3661 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3662 row -= 2; 3663 break; 3664 case 3: 3665 3666 sum1 = b[row]; 3667 sum2 = b[row-1]; 3668 sum3 = b[row-2]; 3669 v2 = a->a + diag[row-1] + 2; 3670 v3 = a->a + diag[row-2] + 3; 3671 for (n = 0; n<sz-1; n+=2) { 3672 i1 = idx[0]; 3673 i2 = idx[1]; 3674 idx += 2; 3675 tmp0 = x[i1]; 3676 tmp1 = x[i2]; 3677 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3678 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3679 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3680 } 3681 3682 if (n == sz-1) { 3683 tmp0 = x[*idx]; 3684 sum1 -= *v1*tmp0; 3685 sum2 -= *v2*tmp0; 3686 sum3 -= *v3*tmp0; 3687 } 3688 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3689 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3690 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3691 row -= 3; 3692 break; 3693 case 4: 3694 3695 sum1 = b[row]; 3696 sum2 = b[row-1]; 3697 sum3 = b[row-2]; 3698 sum4 = b[row-3]; 3699 v2 = a->a + diag[row-1] + 2; 3700 v3 = a->a + diag[row-2] + 3; 3701 v4 = a->a + diag[row-3] + 4; 3702 for (n = 0; n<sz-1; n+=2) { 3703 i1 = idx[0]; 3704 i2 = idx[1]; 3705 idx += 2; 3706 tmp0 = x[i1]; 3707 tmp1 = x[i2]; 3708 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3709 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3710 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3711 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3712 } 3713 3714 if (n == sz-1) { 3715 tmp0 = x[*idx]; 3716 sum1 -= *v1*tmp0; 3717 sum2 -= *v2*tmp0; 3718 sum3 -= *v3*tmp0; 3719 sum4 -= *v4*tmp0; 3720 } 3721 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3722 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3723 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3724 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3725 row -= 4; 3726 break; 3727 case 5: 3728 3729 sum1 = b[row]; 3730 sum2 = b[row-1]; 3731 sum3 = b[row-2]; 3732 sum4 = b[row-3]; 3733 sum5 = b[row-4]; 3734 v2 = a->a + diag[row-1] + 2; 3735 v3 = a->a + diag[row-2] + 3; 3736 v4 = a->a + diag[row-3] + 4; 3737 v5 = a->a + diag[row-4] + 5; 3738 for (n = 0; n<sz-1; n+=2) { 3739 i1 = idx[0]; 3740 i2 = idx[1]; 3741 idx += 2; 3742 tmp0 = x[i1]; 3743 tmp1 = x[i2]; 3744 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3745 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3746 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3747 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3748 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3749 } 3750 3751 if (n == sz-1) { 3752 tmp0 = x[*idx]; 3753 sum1 -= *v1*tmp0; 3754 sum2 -= *v2*tmp0; 3755 sum3 -= *v3*tmp0; 3756 sum4 -= *v4*tmp0; 3757 sum5 -= *v5*tmp0; 3758 } 3759 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3760 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3761 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3762 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3763 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3764 row -= 5; 3765 break; 3766 default: 3767 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3768 } 3769 } 3770 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3771 3772 /* 3773 t = b - D x where D is the block diagonal 3774 */ 3775 cnt = 0; 3776 for (i=0, row=0; i<m; i++) { 3777 switch (sizes[i]) { 3778 case 1: 3779 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3780 break; 3781 case 2: 3782 x1 = x[row]; x2 = x[row+1]; 3783 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3784 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3785 t[row] = b[row] - tmp1; 3786 t[row+1] = b[row+1] - tmp2; row += 2; 3787 cnt += 4; 3788 break; 3789 case 3: 3790 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3791 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3792 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3793 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3794 t[row] = b[row] - tmp1; 3795 t[row+1] = b[row+1] - tmp2; 3796 t[row+2] = b[row+2] - tmp3; row += 3; 3797 cnt += 9; 3798 break; 3799 case 4: 3800 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3801 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3802 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3803 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3804 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3805 t[row] = b[row] - tmp1; 3806 t[row+1] = b[row+1] - tmp2; 3807 t[row+2] = b[row+2] - tmp3; 3808 t[row+3] = b[row+3] - tmp4; row += 4; 3809 cnt += 16; 3810 break; 3811 case 5: 3812 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3813 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3814 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3815 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3816 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3817 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3818 t[row] = b[row] - tmp1; 3819 t[row+1] = b[row+1] - tmp2; 3820 t[row+2] = b[row+2] - tmp3; 3821 t[row+3] = b[row+3] - tmp4; 3822 t[row+4] = b[row+4] - tmp5;row += 5; 3823 cnt += 25; 3824 break; 3825 default: 3826 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3827 } 3828 } 3829 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3830 3831 3832 3833 /* 3834 Apply (L + D)^-1 where D is the block diagonal 3835 */ 3836 for (i=0, row=0; i<m; i++) { 3837 sz = diag[row] - ii[row]; 3838 v1 = a->a + ii[row]; 3839 idx = a->j + ii[row]; 3840 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3841 switch (sizes[i]) { 3842 case 1: 3843 3844 sum1 = t[row]; 3845 for (n = 0; n<sz-1; n+=2) { 3846 i1 = idx[0]; 3847 i2 = idx[1]; 3848 idx += 2; 3849 tmp0 = t[i1]; 3850 tmp1 = t[i2]; 3851 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3852 } 3853 3854 if (n == sz-1) { 3855 tmp0 = t[*idx]; 3856 sum1 -= *v1 * tmp0; 3857 } 3858 x[row] += t[row] = sum1*(*ibdiag++); row++; 3859 break; 3860 case 2: 3861 v2 = a->a + ii[row+1]; 3862 sum1 = t[row]; 3863 sum2 = t[row+1]; 3864 for (n = 0; n<sz-1; n+=2) { 3865 i1 = idx[0]; 3866 i2 = idx[1]; 3867 idx += 2; 3868 tmp0 = t[i1]; 3869 tmp1 = t[i2]; 3870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3872 } 3873 3874 if (n == sz-1) { 3875 tmp0 = t[*idx]; 3876 sum1 -= v1[0] * tmp0; 3877 sum2 -= v2[0] * tmp0; 3878 } 3879 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3880 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3881 ibdiag += 4; row += 2; 3882 break; 3883 case 3: 3884 v2 = a->a + ii[row+1]; 3885 v3 = a->a + ii[row+2]; 3886 sum1 = t[row]; 3887 sum2 = t[row+1]; 3888 sum3 = t[row+2]; 3889 for (n = 0; n<sz-1; n+=2) { 3890 i1 = idx[0]; 3891 i2 = idx[1]; 3892 idx += 2; 3893 tmp0 = t[i1]; 3894 tmp1 = t[i2]; 3895 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3896 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3897 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3898 } 3899 3900 if (n == sz-1) { 3901 tmp0 = t[*idx]; 3902 sum1 -= v1[0] * tmp0; 3903 sum2 -= v2[0] * tmp0; 3904 sum3 -= v3[0] * tmp0; 3905 } 3906 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3907 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3908 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3909 ibdiag += 9; row += 3; 3910 break; 3911 case 4: 3912 v2 = a->a + ii[row+1]; 3913 v3 = a->a + ii[row+2]; 3914 v4 = a->a + ii[row+3]; 3915 sum1 = t[row]; 3916 sum2 = t[row+1]; 3917 sum3 = t[row+2]; 3918 sum4 = t[row+3]; 3919 for (n = 0; n<sz-1; n+=2) { 3920 i1 = idx[0]; 3921 i2 = idx[1]; 3922 idx += 2; 3923 tmp0 = t[i1]; 3924 tmp1 = t[i2]; 3925 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3926 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3927 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3928 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3929 } 3930 3931 if (n == sz-1) { 3932 tmp0 = t[*idx]; 3933 sum1 -= v1[0] * tmp0; 3934 sum2 -= v2[0] * tmp0; 3935 sum3 -= v3[0] * tmp0; 3936 sum4 -= v4[0] * tmp0; 3937 } 3938 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3939 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3940 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3941 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3942 ibdiag += 16; row += 4; 3943 break; 3944 case 5: 3945 v2 = a->a + ii[row+1]; 3946 v3 = a->a + ii[row+2]; 3947 v4 = a->a + ii[row+3]; 3948 v5 = a->a + ii[row+4]; 3949 sum1 = t[row]; 3950 sum2 = t[row+1]; 3951 sum3 = t[row+2]; 3952 sum4 = t[row+3]; 3953 sum5 = t[row+4]; 3954 for (n = 0; n<sz-1; n+=2) { 3955 i1 = idx[0]; 3956 i2 = idx[1]; 3957 idx += 2; 3958 tmp0 = t[i1]; 3959 tmp1 = t[i2]; 3960 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3961 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3962 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3963 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3964 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3965 } 3966 3967 if (n == sz-1) { 3968 tmp0 = t[*idx]; 3969 sum1 -= v1[0] * tmp0; 3970 sum2 -= v2[0] * tmp0; 3971 sum3 -= v3[0] * tmp0; 3972 sum4 -= v4[0] * tmp0; 3973 sum5 -= v5[0] * tmp0; 3974 } 3975 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3976 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3977 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3978 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3979 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3980 ibdiag += 25; row += 5; 3981 break; 3982 default: 3983 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3984 } 3985 } 3986 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3987 } 3988 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3989 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3990 PetscFunctionReturn(0); 3991 } 3992 3993 #undef __FUNCT__ 3994 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3995 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3996 { 3997 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3998 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3999 const MatScalar *bdiag = a->inode.bdiag; 4000 const PetscScalar *b; 4001 PetscErrorCode ierr; 4002 PetscInt m = a->inode.node_count,cnt = 0,i,row; 4003 const PetscInt *sizes = a->inode.size; 4004 4005 PetscFunctionBegin; 4006 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4007 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4008 cnt = 0; 4009 for (i=0, row=0; i<m; i++) { 4010 switch (sizes[i]) { 4011 case 1: 4012 x[row] = b[row]*bdiag[cnt++];row++; 4013 break; 4014 case 2: 4015 x1 = b[row]; x2 = b[row+1]; 4016 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 4017 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 4018 x[row++] = tmp1; 4019 x[row++] = tmp2; 4020 cnt += 4; 4021 break; 4022 case 3: 4023 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4024 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4025 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4026 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4027 x[row++] = tmp1; 4028 x[row++] = tmp2; 4029 x[row++] = tmp3; 4030 cnt += 9; 4031 break; 4032 case 4: 4033 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4034 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4035 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4036 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4037 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4038 x[row++] = tmp1; 4039 x[row++] = tmp2; 4040 x[row++] = tmp3; 4041 x[row++] = tmp4; 4042 cnt += 16; 4043 break; 4044 case 5: 4045 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4046 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4047 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4048 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4049 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4050 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4051 x[row++] = tmp1; 4052 x[row++] = tmp2; 4053 x[row++] = tmp3; 4054 x[row++] = tmp4; 4055 x[row++] = tmp5; 4056 cnt += 25; 4057 break; 4058 default: 4059 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4060 } 4061 } 4062 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4063 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4064 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4065 PetscFunctionReturn(0); 4066 } 4067 4068 /* 4069 samestructure indicates that the matrix has not changed its nonzero structure so we 4070 do not need to recompute the inodes 4071 */ 4072 #undef __FUNCT__ 4073 #define __FUNCT__ "Mat_CheckInode" 4074 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 4075 { 4076 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4077 PetscErrorCode ierr; 4078 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 4079 PetscBool flag; 4080 4081 PetscFunctionBegin; 4082 if (!a->inode.use) PetscFunctionReturn(0); 4083 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4084 4085 4086 m = A->rmap->n; 4087 if (a->inode.size) ns = a->inode.size; 4088 else { 4089 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 4090 } 4091 4092 i = 0; 4093 node_count = 0; 4094 idx = a->j; 4095 ii = a->i; 4096 while (i < m) { /* For each row */ 4097 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4098 /* Limits the number of elements in a node to 'a->inode.limit' */ 4099 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4100 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4101 if (nzy != nzx) break; 4102 idy += nzx; /* Same nonzero pattern */ 4103 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4104 if (!flag) break; 4105 } 4106 ns[node_count++] = blk_size; 4107 idx += blk_size*nzx; 4108 i = j; 4109 } 4110 /* If not enough inodes found,, do not use inode version of the routines */ 4111 if (!m || node_count > .8*m) { 4112 ierr = PetscFree(ns);CHKERRQ(ierr); 4113 4114 a->inode.node_count = 0; 4115 a->inode.size = NULL; 4116 a->inode.use = PETSC_FALSE; 4117 A->ops->mult = MatMult_SeqAIJ; 4118 A->ops->sor = MatSOR_SeqAIJ; 4119 A->ops->multadd = MatMultAdd_SeqAIJ; 4120 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4121 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4122 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4123 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4124 A->ops->coloringpatch = 0; 4125 A->ops->multdiagonalblock = 0; 4126 4127 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4128 } else { 4129 if (!A->factortype) { 4130 A->ops->mult = MatMult_SeqAIJ_Inode; 4131 A->ops->sor = MatSOR_SeqAIJ_Inode; 4132 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4133 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4134 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4135 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4136 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4137 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4138 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4139 } else { 4140 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4141 } 4142 a->inode.node_count = node_count; 4143 a->inode.size = ns; 4144 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4145 } 4146 a->inode.checked = PETSC_TRUE; 4147 PetscFunctionReturn(0); 4148 } 4149 4150 #undef __FUNCT__ 4151 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4152 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4153 { 4154 Mat B =*C; 4155 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4156 PetscErrorCode ierr; 4157 PetscInt m=A->rmap->n; 4158 4159 PetscFunctionBegin; 4160 c->inode.use = a->inode.use; 4161 c->inode.limit = a->inode.limit; 4162 c->inode.max_limit = a->inode.max_limit; 4163 if (a->inode.size) { 4164 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr); 4165 c->inode.node_count = a->inode.node_count; 4166 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4167 /* note the table of functions below should match that in Mat_CheckInode() */ 4168 if (!B->factortype) { 4169 B->ops->mult = MatMult_SeqAIJ_Inode; 4170 B->ops->sor = MatSOR_SeqAIJ_Inode; 4171 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4172 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4173 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4174 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4175 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4176 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4177 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4178 } else { 4179 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4180 } 4181 } else { 4182 c->inode.size = 0; 4183 c->inode.node_count = 0; 4184 } 4185 c->inode.ibdiagvalid = PETSC_FALSE; 4186 c->inode.ibdiag = 0; 4187 c->inode.bdiag = 0; 4188 PetscFunctionReturn(0); 4189 } 4190 4191 #undef __FUNCT__ 4192 #define __FUNCT__ "MatGetRow_FactoredLU" 4193 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row) 4194 { 4195 PetscInt k, *vi; 4196 4197 PetscFunctionBegin; 4198 vi = aj + ai[row]; 4199 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4200 vi = aj + adiag[row]; 4201 cols[nzl] = vi[0]; 4202 vi = aj + adiag[row+1]+1; 4203 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4204 PetscFunctionReturn(0); 4205 } 4206 /* 4207 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4208 Modified from Mat_CheckInode(). 4209 4210 Input Parameters: 4211 + Mat A - ILU or LU matrix factor 4212 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 4213 do not need to recompute the inodes 4214 */ 4215 #undef __FUNCT__ 4216 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4217 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure) 4218 { 4219 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4220 PetscErrorCode ierr; 4221 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4222 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 4223 PetscBool flag; 4224 4225 PetscFunctionBegin; 4226 if (!a->inode.use) PetscFunctionReturn(0); 4227 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4228 4229 m = A->rmap->n; 4230 if (a->inode.size) ns = a->inode.size; 4231 else { 4232 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 4233 } 4234 4235 i = 0; 4236 node_count = 0; 4237 while (i < m) { /* For each row */ 4238 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4239 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4240 nzx = nzl1 + nzu1 + 1; 4241 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 4242 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4243 4244 /* Limits the number of elements in a node to 'a->inode.limit' */ 4245 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4246 nzl2 = ai[j+1] - ai[j]; 4247 nzu2 = adiag[j] - adiag[j+1] - 1; 4248 nzy = nzl2 + nzu2 + 1; 4249 if (nzy != nzx) break; 4250 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 4251 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4252 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4253 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 4254 ierr = PetscFree(cols2);CHKERRQ(ierr); 4255 } 4256 ns[node_count++] = blk_size; 4257 ierr = PetscFree(cols1);CHKERRQ(ierr); 4258 i = j; 4259 } 4260 /* If not enough inodes found,, do not use inode version of the routines */ 4261 if (!m || node_count > .8*m) { 4262 ierr = PetscFree(ns);CHKERRQ(ierr); 4263 4264 a->inode.node_count = 0; 4265 a->inode.size = NULL; 4266 a->inode.use = PETSC_FALSE; 4267 4268 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4269 } else { 4270 A->ops->mult = 0; 4271 A->ops->sor = 0; 4272 A->ops->multadd = 0; 4273 A->ops->getrowij = 0; 4274 A->ops->restorerowij = 0; 4275 A->ops->getcolumnij = 0; 4276 A->ops->restorecolumnij = 0; 4277 A->ops->coloringpatch = 0; 4278 A->ops->multdiagonalblock = 0; 4279 A->ops->solve = MatSolve_SeqAIJ_Inode; 4280 a->inode.node_count = node_count; 4281 a->inode.size = ns; 4282 4283 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4284 } 4285 a->inode.checked = PETSC_TRUE; 4286 PetscFunctionReturn(0); 4287 } 4288 4289 #undef __FUNCT__ 4290 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4291 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4292 { 4293 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4294 4295 PetscFunctionBegin; 4296 a->inode.ibdiagvalid = PETSC_FALSE; 4297 PetscFunctionReturn(0); 4298 } 4299 4300 /* 4301 This is really ugly. if inodes are used this replaces the 4302 permutations with ones that correspond to rows/cols of the matrix 4303 rather then inode blocks 4304 */ 4305 #undef __FUNCT__ 4306 #define __FUNCT__ "MatInodeAdjustForInodes" 4307 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4308 { 4309 PetscErrorCode ierr; 4310 4311 PetscFunctionBegin; 4312 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4313 PetscFunctionReturn(0); 4314 } 4315 4316 #undef __FUNCT__ 4317 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4318 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4319 { 4320 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4321 PetscErrorCode ierr; 4322 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4323 const PetscInt *ridx,*cidx; 4324 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4325 PetscInt nslim_col,*ns_col; 4326 IS ris = *rperm,cis = *cperm; 4327 4328 PetscFunctionBegin; 4329 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4330 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4331 4332 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4333 ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 4334 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 4335 4336 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4337 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4338 4339 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4340 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4341 4342 /* Construct the permutations for rows*/ 4343 for (i=0,row = 0; i<nslim_row; ++i) { 4344 indx = ridx[i]; 4345 start_val = tns[indx]; 4346 end_val = tns[indx + 1]; 4347 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4348 } 4349 4350 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4351 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4352 4353 /* Construct permutations for columns */ 4354 for (i=0,col=0; i<nslim_col; ++i) { 4355 indx = cidx[i]; 4356 start_val = tns[indx]; 4357 end_val = tns[indx + 1]; 4358 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4359 } 4360 4361 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4362 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4363 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4364 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4365 4366 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4367 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4368 4369 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4370 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4371 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4372 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4373 ierr = PetscFree(tns);CHKERRQ(ierr); 4374 PetscFunctionReturn(0); 4375 } 4376 4377 #undef __FUNCT__ 4378 #define __FUNCT__ "MatInodeGetInodeSizes" 4379 /*@C 4380 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4381 4382 Not Collective 4383 4384 Input Parameter: 4385 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4386 4387 Output Parameter: 4388 + node_count - no of inodes present in the matrix. 4389 . sizes - an array of size node_count,with sizes of each inode. 4390 - limit - the max size used to generate the inodes. 4391 4392 Level: advanced 4393 4394 Notes: This routine returns some internal storage information 4395 of the matrix, it is intended to be used by advanced users. 4396 It should be called after the matrix is assembled. 4397 The contents of the sizes[] array should not be changed. 4398 NULL may be passed for information not requested. 4399 4400 .keywords: matrix, seqaij, get, inode 4401 4402 .seealso: MatGetInfo() 4403 @*/ 4404 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4405 { 4406 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4407 4408 PetscFunctionBegin; 4409 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4410 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4411 if (f) { 4412 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4413 } 4414 PetscFunctionReturn(0); 4415 } 4416 4417 #undef __FUNCT__ 4418 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4419 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4420 { 4421 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4422 4423 PetscFunctionBegin; 4424 if (node_count) *node_count = a->inode.node_count; 4425 if (sizes) *sizes = a->inode.size; 4426 if (limit) *limit = a->inode.limit; 4427 PetscFunctionReturn(0); 4428 } 4429