1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc1((n+1),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 67 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc1((nslim_col+1),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc1((n+1),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1) { 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc1((nslim_row+1),&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc1((nslim_row+1),&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,nsz,i1,i2; 157 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 158 159 PetscFunctionBegin; 160 nslim_row = a->inode.node_count; 161 n = A->cmap->n; 162 163 /* Create The column_inode for this matrix */ 164 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 165 166 /* allocate space for reformated column_inode structure */ 167 ierr = PetscMalloc1((nslim_col +1),&tns);CHKERRQ(ierr); 168 ierr = PetscMalloc1((n +1),&tvc);CHKERRQ(ierr); 169 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 170 171 for (i1=0,col=0; i1<nslim_col; ++i1) { 172 nsz = ns_col[i1]; 173 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc1((nslim_row+1),&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc1((nslim_row+1),&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 PetscFunctionReturn(0); 264 } 265 266 /* ----------------------------------------------------------- */ 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscErrorCode ierr; 274 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 275 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 276 277 PetscFunctionBegin; 278 nslim_row = a->inode.node_count; 279 n = A->cmap->n; 280 281 /* Create The column_inode for this matrix */ 282 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 283 284 /* allocate space for reformated column_inode structure */ 285 ierr = PetscMalloc1((nslim_col + 1),&tns);CHKERRQ(ierr); 286 ierr = PetscMalloc1((n + 1),&tvc);CHKERRQ(ierr); 287 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 288 289 for (i1=0,col=0; i1<nslim_col; ++i1) { 290 nsz = ns_col[i1]; 291 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 292 } 293 /* allocate space for column pointers */ 294 ierr = PetscMalloc1((nslim_col+1),&ia);CHKERRQ(ierr); 295 *iia = ia; 296 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 297 ierr = PetscMalloc1((nslim_col+1),&work);CHKERRQ(ierr); 298 299 /* determine the number of columns in each row */ 300 ia[0] = oshift; 301 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 302 j = aj + ai[row] + ishift; 303 col = *j++ + ishift; 304 i2 = tvc[col]; 305 nz = ai[row+1] - ai[row]; 306 while (nz-- > 0) { /* off-diagonal elemets */ 307 /* ia[i1+1]++; */ 308 ia[i2+1]++; 309 i2++; 310 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 311 if (nz > 0) i2 = tvc[col]; 312 } 313 } 314 315 /* shift ia[i] to point to next col */ 316 for (i1=1; i1<nslim_col+1; i1++) { 317 col = ia[i1-1]; 318 ia[i1] += col; 319 work[i1-1] = col - oshift; 320 } 321 322 /* allocate space for column pointers */ 323 nz = ia[nslim_col] + (!ishift); 324 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 325 *jja = ja; 326 327 /* loop over matrix putting into ja */ 328 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 329 j = aj + ai[row] + ishift; 330 col = *j++ + ishift; 331 i2 = tvc[col]; 332 nz = ai[row+1] - ai[row]; 333 while (nz-- > 0) { 334 /* ja[work[i1]++] = i2 + oshift; */ 335 ja[work[i2]++] = i1 + oshift; 336 i2++; 337 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 338 if (nz > 0) i2 = tvc[col]; 339 } 340 } 341 ierr = PetscFree(ns_col);CHKERRQ(ierr); 342 ierr = PetscFree(work);CHKERRQ(ierr); 343 ierr = PetscFree(tns);CHKERRQ(ierr); 344 ierr = PetscFree(tvc);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 356 if (!ia) PetscFunctionReturn(0); 357 358 if (!blockcompressed) { 359 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 360 } else if (symmetric) { 361 /* Since the indices are symmetric it does'nt matter */ 362 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } else { 364 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 365 } 366 PetscFunctionReturn(0); 367 } 368 369 #undef __FUNCT__ 370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 372 { 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 if (!ia) PetscFunctionReturn(0); 377 if (!blockcompressed) { 378 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 379 } else { 380 ierr = PetscFree(*ia);CHKERRQ(ierr); 381 ierr = PetscFree(*ja);CHKERRQ(ierr); 382 } 383 PetscFunctionReturn(0); 384 } 385 386 /* ----------------------------------------------------------- */ 387 388 #undef __FUNCT__ 389 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 391 { 392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 393 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 394 PetscScalar *y; 395 const PetscScalar *x; 396 const MatScalar *v1,*v2,*v3,*v4,*v5; 397 PetscErrorCode ierr; 398 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 399 const PetscInt *idx,*ns,*ii; 400 401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 403 #endif 404 405 PetscFunctionBegin; 406 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 407 node_max = a->inode.node_count; 408 ns = a->inode.size; /* Node Size array */ 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 411 idx = a->j; 412 v1 = a->a; 413 ii = a->i; 414 415 for (i = 0,row = 0; i< node_max; ++i) { 416 nsz = ns[i]; 417 n = ii[1] - ii[0]; 418 nonzerorow += (n>0)*nsz; 419 ii += nsz; 420 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 421 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 422 sz = n; /* No of non zeros in this row */ 423 /* Switch on the size of Node */ 424 switch (nsz) { /* Each loop in 'case' is unrolled */ 425 case 1: 426 sum1 = 0.; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; /* The instructions are ordered to */ 430 i2 = idx[1]; /* make the compiler's job easy */ 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 } 436 437 if (n == sz-1) { /* Take care of the last nonzero */ 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ *tmp0; 440 } 441 y[row++]=sum1; 442 break; 443 case 2: 444 sum1 = 0.; 445 sum2 = 0.; 446 v2 = v1 + n; 447 448 for (n = 0; n< sz-1; n+=2) { 449 i1 = idx[0]; 450 i2 = idx[1]; 451 idx += 2; 452 tmp0 = x[i1]; 453 tmp1 = x[i2]; 454 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 455 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 456 } 457 if (n == sz-1) { 458 tmp0 = x[*idx++]; 459 sum1 += *v1++ * tmp0; 460 sum2 += *v2++ * tmp0; 461 } 462 y[row++]=sum1; 463 y[row++]=sum2; 464 v1 =v2; /* Since the next block to be processed starts there*/ 465 idx +=sz; 466 break; 467 case 3: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 v2 = v1 + n; 472 v3 = v2 + n; 473 474 for (n = 0; n< sz-1; n+=2) { 475 i1 = idx[0]; 476 i2 = idx[1]; 477 idx += 2; 478 tmp0 = x[i1]; 479 tmp1 = x[i2]; 480 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 } 490 y[row++]=sum1; 491 y[row++]=sum2; 492 y[row++]=sum3; 493 v1 =v3; /* Since the next block to be processed starts there*/ 494 idx +=2*sz; 495 break; 496 case 4: 497 sum1 = 0.; 498 sum2 = 0.; 499 sum3 = 0.; 500 sum4 = 0.; 501 v2 = v1 + n; 502 v3 = v2 + n; 503 v4 = v3 + n; 504 505 for (n = 0; n< sz-1; n+=2) { 506 i1 = idx[0]; 507 i2 = idx[1]; 508 idx += 2; 509 tmp0 = x[i1]; 510 tmp1 = x[i2]; 511 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 513 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 514 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 515 } 516 if (n == sz-1) { 517 tmp0 = x[*idx++]; 518 sum1 += *v1++ * tmp0; 519 sum2 += *v2++ * tmp0; 520 sum3 += *v3++ * tmp0; 521 sum4 += *v4++ * tmp0; 522 } 523 y[row++]=sum1; 524 y[row++]=sum2; 525 y[row++]=sum3; 526 y[row++]=sum4; 527 v1 =v4; /* Since the next block to be processed starts there*/ 528 idx +=3*sz; 529 break; 530 case 5: 531 sum1 = 0.; 532 sum2 = 0.; 533 sum3 = 0.; 534 sum4 = 0.; 535 sum5 = 0.; 536 v2 = v1 + n; 537 v3 = v2 + n; 538 v4 = v3 + n; 539 v5 = v4 + n; 540 541 for (n = 0; n<sz-1; n+=2) { 542 i1 = idx[0]; 543 i2 = idx[1]; 544 idx += 2; 545 tmp0 = x[i1]; 546 tmp1 = x[i2]; 547 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 548 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 549 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 550 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 551 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 552 } 553 if (n == sz-1) { 554 tmp0 = x[*idx++]; 555 sum1 += *v1++ * tmp0; 556 sum2 += *v2++ * tmp0; 557 sum3 += *v3++ * tmp0; 558 sum4 += *v4++ * tmp0; 559 sum5 += *v5++ * tmp0; 560 } 561 y[row++]=sum1; 562 y[row++]=sum2; 563 y[row++]=sum3; 564 y[row++]=sum4; 565 y[row++]=sum5; 566 v1 =v5; /* Since the next block to be processed starts there */ 567 idx +=4*sz; 568 break; 569 default: 570 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 571 } 572 } 573 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 575 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 576 PetscFunctionReturn(0); 577 } 578 /* ----------------------------------------------------------- */ 579 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 583 { 584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 585 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 586 const MatScalar *v1,*v2,*v3,*v4,*v5; 587 const PetscScalar *x; 588 PetscScalar *y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 591 const PetscInt *idx,*ns,*ii; 592 593 PetscFunctionBegin; 594 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 595 node_max = a->inode.node_count; 596 ns = a->inode.size; /* Node Size array */ 597 598 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 600 if (zz != yy) { 601 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 602 } else { 603 z = y; 604 } 605 zt = z; 606 607 idx = a->j; 608 v1 = a->a; 609 ii = a->i; 610 611 for (i = 0,row = 0; i< node_max; ++i) { 612 nsz = ns[i]; 613 n = ii[1] - ii[0]; 614 ii += nsz; 615 sz = n; /* No of non zeros in this row */ 616 /* Switch on the size of Node */ 617 switch (nsz) { /* Each loop in 'case' is unrolled */ 618 case 1: 619 sum1 = *zt++; 620 621 for (n = 0; n< sz-1; n+=2) { 622 i1 = idx[0]; /* The instructions are ordered to */ 623 i2 = idx[1]; /* make the compiler's job easy */ 624 idx += 2; 625 tmp0 = x[i1]; 626 tmp1 = x[i2]; 627 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 628 } 629 630 if (n == sz-1) { /* Take care of the last nonzero */ 631 tmp0 = x[*idx++]; 632 sum1 += *v1++ * tmp0; 633 } 634 y[row++]=sum1; 635 break; 636 case 2: 637 sum1 = *zt++; 638 sum2 = *zt++; 639 v2 = v1 + n; 640 641 for (n = 0; n< sz-1; n+=2) { 642 i1 = idx[0]; 643 i2 = idx[1]; 644 idx += 2; 645 tmp0 = x[i1]; 646 tmp1 = x[i2]; 647 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 648 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 649 } 650 if (n == sz-1) { 651 tmp0 = x[*idx++]; 652 sum1 += *v1++ * tmp0; 653 sum2 += *v2++ * tmp0; 654 } 655 y[row++]=sum1; 656 y[row++]=sum2; 657 v1 =v2; /* Since the next block to be processed starts there*/ 658 idx +=sz; 659 break; 660 case 3: 661 sum1 = *zt++; 662 sum2 = *zt++; 663 sum3 = *zt++; 664 v2 = v1 + n; 665 v3 = v2 + n; 666 667 for (n = 0; n< sz-1; n+=2) { 668 i1 = idx[0]; 669 i2 = idx[1]; 670 idx += 2; 671 tmp0 = x[i1]; 672 tmp1 = x[i2]; 673 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 674 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 675 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 676 } 677 if (n == sz-1) { 678 tmp0 = x[*idx++]; 679 sum1 += *v1++ * tmp0; 680 sum2 += *v2++ * tmp0; 681 sum3 += *v3++ * tmp0; 682 } 683 y[row++]=sum1; 684 y[row++]=sum2; 685 y[row++]=sum3; 686 v1 =v3; /* Since the next block to be processed starts there*/ 687 idx +=2*sz; 688 break; 689 case 4: 690 sum1 = *zt++; 691 sum2 = *zt++; 692 sum3 = *zt++; 693 sum4 = *zt++; 694 v2 = v1 + n; 695 v3 = v2 + n; 696 v4 = v3 + n; 697 698 for (n = 0; n< sz-1; n+=2) { 699 i1 = idx[0]; 700 i2 = idx[1]; 701 idx += 2; 702 tmp0 = x[i1]; 703 tmp1 = x[i2]; 704 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 705 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 706 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 707 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 708 } 709 if (n == sz-1) { 710 tmp0 = x[*idx++]; 711 sum1 += *v1++ * tmp0; 712 sum2 += *v2++ * tmp0; 713 sum3 += *v3++ * tmp0; 714 sum4 += *v4++ * tmp0; 715 } 716 y[row++]=sum1; 717 y[row++]=sum2; 718 y[row++]=sum3; 719 y[row++]=sum4; 720 v1 =v4; /* Since the next block to be processed starts there*/ 721 idx +=3*sz; 722 break; 723 case 5: 724 sum1 = *zt++; 725 sum2 = *zt++; 726 sum3 = *zt++; 727 sum4 = *zt++; 728 sum5 = *zt++; 729 v2 = v1 + n; 730 v3 = v2 + n; 731 v4 = v3 + n; 732 v5 = v4 + n; 733 734 for (n = 0; n<sz-1; n+=2) { 735 i1 = idx[0]; 736 i2 = idx[1]; 737 idx += 2; 738 tmp0 = x[i1]; 739 tmp1 = x[i2]; 740 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 741 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 742 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 743 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 744 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 745 } 746 if (n == sz-1) { 747 tmp0 = x[*idx++]; 748 sum1 += *v1++ * tmp0; 749 sum2 += *v2++ * tmp0; 750 sum3 += *v3++ * tmp0; 751 sum4 += *v4++ * tmp0; 752 sum5 += *v5++ * tmp0; 753 } 754 y[row++]=sum1; 755 y[row++]=sum2; 756 y[row++]=sum3; 757 y[row++]=sum4; 758 y[row++]=sum5; 759 v1 =v5; /* Since the next block to be processed starts there */ 760 idx +=4*sz; 761 break; 762 default: 763 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 764 } 765 } 766 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 767 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 768 if (zz != yy) { 769 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 770 } 771 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 772 PetscFunctionReturn(0); 773 } 774 775 /* ----------------------------------------------------------- */ 776 #undef __FUNCT__ 777 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 778 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 779 { 780 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 781 IS iscol = a->col,isrow = a->row; 782 PetscErrorCode ierr; 783 const PetscInt *r,*c,*rout,*cout; 784 PetscInt i,j,n = A->rmap->n,nz; 785 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 786 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp; 806 aa = a_a; 807 aj = a_j; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i) { 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 if (i < node_max-1) { 817 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 818 * but our indexing to determine it's size could. */ 819 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 820 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 821 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 822 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 823 } 824 825 switch (nsz) { /* Each loop in 'case' is unrolled */ 826 case 1: 827 sum1 = b[*r++]; 828 for (j=0; j<nz-1; j+=2) { 829 i0 = vi[0]; 830 i1 = vi[1]; 831 vi +=2; 832 tmp0 = tmps[i0]; 833 tmp1 = tmps[i1]; 834 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 835 } 836 if (j == nz-1) { 837 tmp0 = tmps[*vi++]; 838 sum1 -= *v1++ *tmp0; 839 } 840 tmp[row++]=sum1; 841 break; 842 case 2: 843 sum1 = b[*r++]; 844 sum2 = b[*r++]; 845 v2 = aa + ai[row+1]; 846 847 for (j=0; j<nz-1; j+=2) { 848 i0 = vi[0]; 849 i1 = vi[1]; 850 vi +=2; 851 tmp0 = tmps[i0]; 852 tmp1 = tmps[i1]; 853 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 854 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 855 } 856 if (j == nz-1) { 857 tmp0 = tmps[*vi++]; 858 sum1 -= *v1++ *tmp0; 859 sum2 -= *v2++ *tmp0; 860 } 861 sum2 -= *v2++ *sum1; 862 tmp[row++]=sum1; 863 tmp[row++]=sum2; 864 break; 865 case 3: 866 sum1 = b[*r++]; 867 sum2 = b[*r++]; 868 sum3 = b[*r++]; 869 v2 = aa + ai[row+1]; 870 v3 = aa + ai[row+2]; 871 872 for (j=0; j<nz-1; j+=2) { 873 i0 = vi[0]; 874 i1 = vi[1]; 875 vi +=2; 876 tmp0 = tmps[i0]; 877 tmp1 = tmps[i1]; 878 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 879 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 880 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 881 } 882 if (j == nz-1) { 883 tmp0 = tmps[*vi++]; 884 sum1 -= *v1++ *tmp0; 885 sum2 -= *v2++ *tmp0; 886 sum3 -= *v3++ *tmp0; 887 } 888 sum2 -= *v2++ * sum1; 889 sum3 -= *v3++ * sum1; 890 sum3 -= *v3++ * sum2; 891 892 tmp[row++]=sum1; 893 tmp[row++]=sum2; 894 tmp[row++]=sum3; 895 break; 896 897 case 4: 898 sum1 = b[*r++]; 899 sum2 = b[*r++]; 900 sum3 = b[*r++]; 901 sum4 = b[*r++]; 902 v2 = aa + ai[row+1]; 903 v3 = aa + ai[row+2]; 904 v4 = aa + ai[row+3]; 905 906 for (j=0; j<nz-1; j+=2) { 907 i0 = vi[0]; 908 i1 = vi[1]; 909 vi +=2; 910 tmp0 = tmps[i0]; 911 tmp1 = tmps[i1]; 912 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 913 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 914 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 915 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 916 } 917 if (j == nz-1) { 918 tmp0 = tmps[*vi++]; 919 sum1 -= *v1++ *tmp0; 920 sum2 -= *v2++ *tmp0; 921 sum3 -= *v3++ *tmp0; 922 sum4 -= *v4++ *tmp0; 923 } 924 sum2 -= *v2++ * sum1; 925 sum3 -= *v3++ * sum1; 926 sum4 -= *v4++ * sum1; 927 sum3 -= *v3++ * sum2; 928 sum4 -= *v4++ * sum2; 929 sum4 -= *v4++ * sum3; 930 931 tmp[row++]=sum1; 932 tmp[row++]=sum2; 933 tmp[row++]=sum3; 934 tmp[row++]=sum4; 935 break; 936 case 5: 937 sum1 = b[*r++]; 938 sum2 = b[*r++]; 939 sum3 = b[*r++]; 940 sum4 = b[*r++]; 941 sum5 = b[*r++]; 942 v2 = aa + ai[row+1]; 943 v3 = aa + ai[row+2]; 944 v4 = aa + ai[row+3]; 945 v5 = aa + ai[row+4]; 946 947 for (j=0; j<nz-1; j+=2) { 948 i0 = vi[0]; 949 i1 = vi[1]; 950 vi +=2; 951 tmp0 = tmps[i0]; 952 tmp1 = tmps[i1]; 953 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 954 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 955 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 957 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 958 } 959 if (j == nz-1) { 960 tmp0 = tmps[*vi++]; 961 sum1 -= *v1++ *tmp0; 962 sum2 -= *v2++ *tmp0; 963 sum3 -= *v3++ *tmp0; 964 sum4 -= *v4++ *tmp0; 965 sum5 -= *v5++ *tmp0; 966 } 967 968 sum2 -= *v2++ * sum1; 969 sum3 -= *v3++ * sum1; 970 sum4 -= *v4++ * sum1; 971 sum5 -= *v5++ * sum1; 972 sum3 -= *v3++ * sum2; 973 sum4 -= *v4++ * sum2; 974 sum5 -= *v5++ * sum2; 975 sum4 -= *v4++ * sum3; 976 sum5 -= *v5++ * sum3; 977 sum5 -= *v5++ * sum4; 978 979 tmp[row++]=sum1; 980 tmp[row++]=sum2; 981 tmp[row++]=sum3; 982 tmp[row++]=sum4; 983 tmp[row++]=sum5; 984 break; 985 default: 986 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 987 } 988 } 989 /* backward solve the upper triangular */ 990 for (i=node_max -1,row = n-1; i>=0; i--) { 991 nsz = ns[i]; 992 aii = ai[row+1] -1; 993 v1 = aa + aii; 994 vi = aj + aii; 995 nz = aii- ad[row]; 996 switch (nsz) { /* Each loop in 'case' is unrolled */ 997 case 1: 998 sum1 = tmp[row]; 999 1000 for (j=nz; j>1; j-=2) { 1001 vi -=2; 1002 i0 = vi[2]; 1003 i1 = vi[1]; 1004 tmp0 = tmps[i0]; 1005 tmp1 = tmps[i1]; 1006 v1 -= 2; 1007 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1008 } 1009 if (j==1) { 1010 tmp0 = tmps[*vi--]; 1011 sum1 -= *v1-- * tmp0; 1012 } 1013 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1014 break; 1015 case 2: 1016 sum1 = tmp[row]; 1017 sum2 = tmp[row -1]; 1018 v2 = aa + ai[row]-1; 1019 for (j=nz; j>1; j-=2) { 1020 vi -=2; 1021 i0 = vi[2]; 1022 i1 = vi[1]; 1023 tmp0 = tmps[i0]; 1024 tmp1 = tmps[i1]; 1025 v1 -= 2; 1026 v2 -= 2; 1027 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1028 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1029 } 1030 if (j==1) { 1031 tmp0 = tmps[*vi--]; 1032 sum1 -= *v1-- * tmp0; 1033 sum2 -= *v2-- * tmp0; 1034 } 1035 1036 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1037 sum2 -= *v2-- * tmp0; 1038 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1039 break; 1040 case 3: 1041 sum1 = tmp[row]; 1042 sum2 = tmp[row -1]; 1043 sum3 = tmp[row -2]; 1044 v2 = aa + ai[row]-1; 1045 v3 = aa + ai[row -1]-1; 1046 for (j=nz; j>1; j-=2) { 1047 vi -=2; 1048 i0 = vi[2]; 1049 i1 = vi[1]; 1050 tmp0 = tmps[i0]; 1051 tmp1 = tmps[i1]; 1052 v1 -= 2; 1053 v2 -= 2; 1054 v3 -= 2; 1055 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1056 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1057 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1058 } 1059 if (j==1) { 1060 tmp0 = tmps[*vi--]; 1061 sum1 -= *v1-- * tmp0; 1062 sum2 -= *v2-- * tmp0; 1063 sum3 -= *v3-- * tmp0; 1064 } 1065 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1066 sum2 -= *v2-- * tmp0; 1067 sum3 -= *v3-- * tmp0; 1068 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1069 sum3 -= *v3-- * tmp0; 1070 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1071 1072 break; 1073 case 4: 1074 sum1 = tmp[row]; 1075 sum2 = tmp[row -1]; 1076 sum3 = tmp[row -2]; 1077 sum4 = tmp[row -3]; 1078 v2 = aa + ai[row]-1; 1079 v3 = aa + ai[row -1]-1; 1080 v4 = aa + ai[row -2]-1; 1081 1082 for (j=nz; j>1; j-=2) { 1083 vi -=2; 1084 i0 = vi[2]; 1085 i1 = vi[1]; 1086 tmp0 = tmps[i0]; 1087 tmp1 = tmps[i1]; 1088 v1 -= 2; 1089 v2 -= 2; 1090 v3 -= 2; 1091 v4 -= 2; 1092 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1093 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1094 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1095 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1096 } 1097 if (j==1) { 1098 tmp0 = tmps[*vi--]; 1099 sum1 -= *v1-- * tmp0; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 } 1104 1105 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1106 sum2 -= *v2-- * tmp0; 1107 sum3 -= *v3-- * tmp0; 1108 sum4 -= *v4-- * tmp0; 1109 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1113 sum4 -= *v4-- * tmp0; 1114 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1115 break; 1116 case 5: 1117 sum1 = tmp[row]; 1118 sum2 = tmp[row -1]; 1119 sum3 = tmp[row -2]; 1120 sum4 = tmp[row -3]; 1121 sum5 = tmp[row -4]; 1122 v2 = aa + ai[row]-1; 1123 v3 = aa + ai[row -1]-1; 1124 v4 = aa + ai[row -2]-1; 1125 v5 = aa + ai[row -3]-1; 1126 for (j=nz; j>1; j-=2) { 1127 vi -= 2; 1128 i0 = vi[2]; 1129 i1 = vi[1]; 1130 tmp0 = tmps[i0]; 1131 tmp1 = tmps[i1]; 1132 v1 -= 2; 1133 v2 -= 2; 1134 v3 -= 2; 1135 v4 -= 2; 1136 v5 -= 2; 1137 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1138 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1139 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1140 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1141 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1142 } 1143 if (j==1) { 1144 tmp0 = tmps[*vi--]; 1145 sum1 -= *v1-- * tmp0; 1146 sum2 -= *v2-- * tmp0; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 } 1151 1152 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1153 sum2 -= *v2-- * tmp0; 1154 sum3 -= *v3-- * tmp0; 1155 sum4 -= *v4-- * tmp0; 1156 sum5 -= *v5-- * tmp0; 1157 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1158 sum3 -= *v3-- * tmp0; 1159 sum4 -= *v4-- * tmp0; 1160 sum5 -= *v5-- * tmp0; 1161 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1162 sum4 -= *v4-- * tmp0; 1163 sum5 -= *v5-- * tmp0; 1164 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1165 sum5 -= *v5-- * tmp0; 1166 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1167 break; 1168 default: 1169 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1170 } 1171 } 1172 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1173 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1174 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1176 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1177 PetscFunctionReturn(0); 1178 } 1179 1180 #undef __FUNCT__ 1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1183 { 1184 Mat C =B; 1185 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1186 IS isrow = b->row,isicol = b->icol; 1187 PetscErrorCode ierr; 1188 const PetscInt *r,*ic,*ics; 1189 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1190 PetscInt i,j,k,nz,nzL,row,*pj; 1191 const PetscInt *ajtmp,*bjtmp; 1192 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1193 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1194 FactorShiftCtx sctx; 1195 const PetscInt *ddiag; 1196 PetscReal rs; 1197 MatScalar d; 1198 PetscInt inod,nodesz,node_max,col; 1199 const PetscInt *ns; 1200 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1201 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1216 if (rs>sctx.shift_top) sctx.shift_top = rs; 1217 } 1218 sctx.shift_top *= 1.1; 1219 sctx.nshift_max = 5; 1220 sctx.shift_lo = 0.; 1221 sctx.shift_hi = 1.; 1222 } 1223 1224 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1225 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1226 1227 ierr = PetscMalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1228 ierr = PetscMemzero(rtmp1,n*sizeof(PetscScalar));CHKERRQ(ierr); 1229 ierr = PetscMemzero(rtmp2,n*sizeof(PetscScalar));CHKERRQ(ierr); 1230 ierr = PetscMemzero(rtmp3,n*sizeof(PetscScalar));CHKERRQ(ierr); 1231 ierr = PetscMemzero(rtmp4,n*sizeof(PetscScalar));CHKERRQ(ierr); 1232 ics = ic; 1233 1234 node_max = a->inode.node_count; 1235 ns = a->inode.size; 1236 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1237 1238 /* If max inode size > 4, split it into two inodes.*/ 1239 /* also map the inode sizes according to the ordering */ 1240 ierr = PetscMalloc1((n+1),&tmp_vec1);CHKERRQ(ierr); 1241 for (i=0,j=0; i<node_max; ++i,++j) { 1242 if (ns[i] > 4) { 1243 tmp_vec1[j] = 4; 1244 ++j; 1245 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1246 } else { 1247 tmp_vec1[j] = ns[i]; 1248 } 1249 } 1250 /* Use the correct node_max */ 1251 node_max = j; 1252 1253 /* Now reorder the inode info based on mat re-ordering info */ 1254 /* First create a row -> inode_size_array_index map */ 1255 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1256 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1257 for (i=0,row=0; i<node_max; i++) { 1258 nodesz = tmp_vec1[i]; 1259 for (j=0; j<nodesz; j++,row++) { 1260 nsmap[row] = i; 1261 } 1262 } 1263 /* Using nsmap, create a reordered ns structure */ 1264 for (i=0,j=0; i< node_max; i++) { 1265 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1266 tmp_vec2[i] = nodesz; 1267 j += nodesz; 1268 } 1269 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1270 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1271 1272 /* Now use the correct ns */ 1273 ns = tmp_vec2; 1274 1275 do { 1276 sctx.newshift = PETSC_FALSE; 1277 /* Now loop over each block-row, and do the factorization */ 1278 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1279 nodesz = ns[inod]; 1280 1281 switch (nodesz) { 1282 case 1: 1283 /*----------*/ 1284 /* zero rtmp1 */ 1285 /* L part */ 1286 nz = bi[i+1] - bi[i]; 1287 bjtmp = bj + bi[i]; 1288 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1289 1290 /* U part */ 1291 nz = bdiag[i]-bdiag[i+1]; 1292 bjtmp = bj + bdiag[i+1]+1; 1293 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1294 1295 /* load in initial (unfactored row) */ 1296 nz = ai[r[i]+1] - ai[r[i]]; 1297 ajtmp = aj + ai[r[i]]; 1298 v = aa + ai[r[i]]; 1299 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1300 1301 /* ZeropivotApply() */ 1302 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1303 1304 /* elimination */ 1305 bjtmp = bj + bi[i]; 1306 row = *bjtmp++; 1307 nzL = bi[i+1] - bi[i]; 1308 for (k=0; k < nzL; k++) { 1309 pc = rtmp1 + row; 1310 if (*pc != 0.0) { 1311 pv = b->a + bdiag[row]; 1312 mul1 = *pc * (*pv); 1313 *pc = mul1; 1314 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1315 pv = b->a + bdiag[row+1]+1; 1316 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1317 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1318 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1319 } 1320 row = *bjtmp++; 1321 } 1322 1323 /* finished row so stick it into b->a */ 1324 rs = 0.0; 1325 /* L part */ 1326 pv = b->a + bi[i]; 1327 pj = b->j + bi[i]; 1328 nz = bi[i+1] - bi[i]; 1329 for (j=0; j<nz; j++) { 1330 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1331 } 1332 1333 /* U part */ 1334 pv = b->a + bdiag[i+1]+1; 1335 pj = b->j + bdiag[i+1]+1; 1336 nz = bdiag[i] - bdiag[i+1]-1; 1337 for (j=0; j<nz; j++) { 1338 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1339 } 1340 1341 /* Check zero pivot */ 1342 sctx.rs = rs; 1343 sctx.pv = rtmp1[i]; 1344 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1345 if (sctx.newshift) break; 1346 1347 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1348 pv = b->a + bdiag[i]; 1349 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1350 break; 1351 1352 case 2: 1353 /*----------*/ 1354 /* zero rtmp1 and rtmp2 */ 1355 /* L part */ 1356 nz = bi[i+1] - bi[i]; 1357 bjtmp = bj + bi[i]; 1358 for (j=0; j<nz; j++) { 1359 col = bjtmp[j]; 1360 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1361 } 1362 1363 /* U part */ 1364 nz = bdiag[i]-bdiag[i+1]; 1365 bjtmp = bj + bdiag[i+1]+1; 1366 for (j=0; j<nz; j++) { 1367 col = bjtmp[j]; 1368 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1369 } 1370 1371 /* load in initial (unfactored row) */ 1372 nz = ai[r[i]+1] - ai[r[i]]; 1373 ajtmp = aj + ai[r[i]]; 1374 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1375 for (j=0; j<nz; j++) { 1376 col = ics[ajtmp[j]]; 1377 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1378 } 1379 /* ZeropivotApply(): shift the diagonal of the matrix */ 1380 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1381 1382 /* elimination */ 1383 bjtmp = bj + bi[i]; 1384 row = *bjtmp++; /* pivot row */ 1385 nzL = bi[i+1] - bi[i]; 1386 for (k=0; k < nzL; k++) { 1387 pc1 = rtmp1 + row; 1388 pc2 = rtmp2 + row; 1389 if (*pc1 != 0.0 || *pc2 != 0.0) { 1390 pv = b->a + bdiag[row]; 1391 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1392 *pc1 = mul1; *pc2 = mul2; 1393 1394 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1395 pv = b->a + bdiag[row+1]+1; 1396 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1397 for (j=0; j<nz; j++) { 1398 col = pj[j]; 1399 rtmp1[col] -= mul1 * pv[j]; 1400 rtmp2[col] -= mul2 * pv[j]; 1401 } 1402 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1403 } 1404 row = *bjtmp++; 1405 } 1406 1407 /* finished row i; check zero pivot, then stick row i into b->a */ 1408 rs = 0.0; 1409 /* L part */ 1410 pc1 = b->a + bi[i]; 1411 pj = b->j + bi[i]; 1412 nz = bi[i+1] - bi[i]; 1413 for (j=0; j<nz; j++) { 1414 col = pj[j]; 1415 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1416 } 1417 /* U part */ 1418 pc1 = b->a + bdiag[i+1]+1; 1419 pj = b->j + bdiag[i+1]+1; 1420 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1421 for (j=0; j<nz; j++) { 1422 col = pj[j]; 1423 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1424 } 1425 1426 sctx.rs = rs; 1427 sctx.pv = rtmp1[i]; 1428 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1429 if (sctx.newshift) break; 1430 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1431 *pc1 = 1.0/sctx.pv; 1432 1433 /* Now take care of diagonal 2x2 block. */ 1434 pc2 = rtmp2 + i; 1435 if (*pc2 != 0.0) { 1436 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1437 *pc2 = mul1; /* insert L entry */ 1438 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1439 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1440 for (j=0; j<nz; j++) { 1441 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1442 } 1443 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1444 } 1445 1446 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1447 rs = 0.0; 1448 /* L part */ 1449 pc2 = b->a + bi[i+1]; 1450 pj = b->j + bi[i+1]; 1451 nz = bi[i+2] - bi[i+1]; 1452 for (j=0; j<nz; j++) { 1453 col = pj[j]; 1454 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1455 } 1456 /* U part */ 1457 pc2 = b->a + bdiag[i+2]+1; 1458 pj = b->j + bdiag[i+2]+1; 1459 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1460 for (j=0; j<nz; j++) { 1461 col = pj[j]; 1462 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1463 } 1464 1465 sctx.rs = rs; 1466 sctx.pv = rtmp2[i+1]; 1467 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1468 if (sctx.newshift) break; 1469 pc2 = b->a + bdiag[i+1]; 1470 *pc2 = 1.0/sctx.pv; 1471 break; 1472 1473 case 3: 1474 /*----------*/ 1475 /* zero rtmp */ 1476 /* L part */ 1477 nz = bi[i+1] - bi[i]; 1478 bjtmp = bj + bi[i]; 1479 for (j=0; j<nz; j++) { 1480 col = bjtmp[j]; 1481 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1482 } 1483 1484 /* U part */ 1485 nz = bdiag[i]-bdiag[i+1]; 1486 bjtmp = bj + bdiag[i+1]+1; 1487 for (j=0; j<nz; j++) { 1488 col = bjtmp[j]; 1489 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1490 } 1491 1492 /* load in initial (unfactored row) */ 1493 nz = ai[r[i]+1] - ai[r[i]]; 1494 ajtmp = aj + ai[r[i]]; 1495 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1496 for (j=0; j<nz; j++) { 1497 col = ics[ajtmp[j]]; 1498 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1499 } 1500 /* ZeropivotApply(): shift the diagonal of the matrix */ 1501 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1502 1503 /* elimination */ 1504 bjtmp = bj + bi[i]; 1505 row = *bjtmp++; /* pivot row */ 1506 nzL = bi[i+1] - bi[i]; 1507 for (k=0; k < nzL; k++) { 1508 pc1 = rtmp1 + row; 1509 pc2 = rtmp2 + row; 1510 pc3 = rtmp3 + row; 1511 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1512 pv = b->a + bdiag[row]; 1513 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1514 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1515 1516 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1517 pv = b->a + bdiag[row+1]+1; 1518 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1519 for (j=0; j<nz; j++) { 1520 col = pj[j]; 1521 rtmp1[col] -= mul1 * pv[j]; 1522 rtmp2[col] -= mul2 * pv[j]; 1523 rtmp3[col] -= mul3 * pv[j]; 1524 } 1525 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1526 } 1527 row = *bjtmp++; 1528 } 1529 1530 /* finished row i; check zero pivot, then stick row i into b->a */ 1531 rs = 0.0; 1532 /* L part */ 1533 pc1 = b->a + bi[i]; 1534 pj = b->j + bi[i]; 1535 nz = bi[i+1] - bi[i]; 1536 for (j=0; j<nz; j++) { 1537 col = pj[j]; 1538 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1539 } 1540 /* U part */ 1541 pc1 = b->a + bdiag[i+1]+1; 1542 pj = b->j + bdiag[i+1]+1; 1543 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1544 for (j=0; j<nz; j++) { 1545 col = pj[j]; 1546 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1547 } 1548 1549 sctx.rs = rs; 1550 sctx.pv = rtmp1[i]; 1551 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1552 if (sctx.newshift) break; 1553 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1554 *pc1 = 1.0/sctx.pv; 1555 1556 /* Now take care of 1st column of diagonal 3x3 block. */ 1557 pc2 = rtmp2 + i; 1558 pc3 = rtmp3 + i; 1559 if (*pc2 != 0.0 || *pc3 != 0.0) { 1560 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1561 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1562 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1563 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1564 for (j=0; j<nz; j++) { 1565 col = pj[j]; 1566 rtmp2[col] -= mul2 * rtmp1[col]; 1567 rtmp3[col] -= mul3 * rtmp1[col]; 1568 } 1569 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1570 } 1571 1572 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1573 rs = 0.0; 1574 /* L part */ 1575 pc2 = b->a + bi[i+1]; 1576 pj = b->j + bi[i+1]; 1577 nz = bi[i+2] - bi[i+1]; 1578 for (j=0; j<nz; j++) { 1579 col = pj[j]; 1580 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1581 } 1582 /* U part */ 1583 pc2 = b->a + bdiag[i+2]+1; 1584 pj = b->j + bdiag[i+2]+1; 1585 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1586 for (j=0; j<nz; j++) { 1587 col = pj[j]; 1588 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1589 } 1590 1591 sctx.rs = rs; 1592 sctx.pv = rtmp2[i+1]; 1593 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1594 if (sctx.newshift) break; 1595 pc2 = b->a + bdiag[i+1]; 1596 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1597 1598 /* Now take care of 2nd column of diagonal 3x3 block. */ 1599 pc3 = rtmp3 + i+1; 1600 if (*pc3 != 0.0) { 1601 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1602 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1603 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1604 for (j=0; j<nz; j++) { 1605 col = pj[j]; 1606 rtmp3[col] -= mul3 * rtmp2[col]; 1607 } 1608 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1609 } 1610 1611 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1612 rs = 0.0; 1613 /* L part */ 1614 pc3 = b->a + bi[i+2]; 1615 pj = b->j + bi[i+2]; 1616 nz = bi[i+3] - bi[i+2]; 1617 for (j=0; j<nz; j++) { 1618 col = pj[j]; 1619 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1620 } 1621 /* U part */ 1622 pc3 = b->a + bdiag[i+3]+1; 1623 pj = b->j + bdiag[i+3]+1; 1624 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1625 for (j=0; j<nz; j++) { 1626 col = pj[j]; 1627 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1628 } 1629 1630 sctx.rs = rs; 1631 sctx.pv = rtmp3[i+2]; 1632 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1633 if (sctx.newshift) break; 1634 pc3 = b->a + bdiag[i+2]; 1635 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1636 break; 1637 case 4: 1638 /*----------*/ 1639 /* zero rtmp */ 1640 /* L part */ 1641 nz = bi[i+1] - bi[i]; 1642 bjtmp = bj + bi[i]; 1643 for (j=0; j<nz; j++) { 1644 col = bjtmp[j]; 1645 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1646 } 1647 1648 /* U part */ 1649 nz = bdiag[i]-bdiag[i+1]; 1650 bjtmp = bj + bdiag[i+1]+1; 1651 for (j=0; j<nz; j++) { 1652 col = bjtmp[j]; 1653 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1654 } 1655 1656 /* load in initial (unfactored row) */ 1657 nz = ai[r[i]+1] - ai[r[i]]; 1658 ajtmp = aj + ai[r[i]]; 1659 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1660 for (j=0; j<nz; j++) { 1661 col = ics[ajtmp[j]]; 1662 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1663 } 1664 /* ZeropivotApply(): shift the diagonal of the matrix */ 1665 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1666 1667 /* elimination */ 1668 bjtmp = bj + bi[i]; 1669 row = *bjtmp++; /* pivot row */ 1670 nzL = bi[i+1] - bi[i]; 1671 for (k=0; k < nzL; k++) { 1672 pc1 = rtmp1 + row; 1673 pc2 = rtmp2 + row; 1674 pc3 = rtmp3 + row; 1675 pc4 = rtmp4 + row; 1676 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1677 pv = b->a + bdiag[row]; 1678 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1679 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1680 1681 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1682 pv = b->a + bdiag[row+1]+1; 1683 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1684 for (j=0; j<nz; j++) { 1685 col = pj[j]; 1686 rtmp1[col] -= mul1 * pv[j]; 1687 rtmp2[col] -= mul2 * pv[j]; 1688 rtmp3[col] -= mul3 * pv[j]; 1689 rtmp4[col] -= mul4 * pv[j]; 1690 } 1691 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1692 } 1693 row = *bjtmp++; 1694 } 1695 1696 /* finished row i; check zero pivot, then stick row i into b->a */ 1697 rs = 0.0; 1698 /* L part */ 1699 pc1 = b->a + bi[i]; 1700 pj = b->j + bi[i]; 1701 nz = bi[i+1] - bi[i]; 1702 for (j=0; j<nz; j++) { 1703 col = pj[j]; 1704 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1705 } 1706 /* U part */ 1707 pc1 = b->a + bdiag[i+1]+1; 1708 pj = b->j + bdiag[i+1]+1; 1709 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1710 for (j=0; j<nz; j++) { 1711 col = pj[j]; 1712 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1713 } 1714 1715 sctx.rs = rs; 1716 sctx.pv = rtmp1[i]; 1717 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1718 if (sctx.newshift) break; 1719 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1720 *pc1 = 1.0/sctx.pv; 1721 1722 /* Now take care of 1st column of diagonal 4x4 block. */ 1723 pc2 = rtmp2 + i; 1724 pc3 = rtmp3 + i; 1725 pc4 = rtmp4 + i; 1726 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1727 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1728 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1729 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1730 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1731 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1732 for (j=0; j<nz; j++) { 1733 col = pj[j]; 1734 rtmp2[col] -= mul2 * rtmp1[col]; 1735 rtmp3[col] -= mul3 * rtmp1[col]; 1736 rtmp4[col] -= mul4 * rtmp1[col]; 1737 } 1738 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1739 } 1740 1741 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1742 rs = 0.0; 1743 /* L part */ 1744 pc2 = b->a + bi[i+1]; 1745 pj = b->j + bi[i+1]; 1746 nz = bi[i+2] - bi[i+1]; 1747 for (j=0; j<nz; j++) { 1748 col = pj[j]; 1749 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1750 } 1751 /* U part */ 1752 pc2 = b->a + bdiag[i+2]+1; 1753 pj = b->j + bdiag[i+2]+1; 1754 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1755 for (j=0; j<nz; j++) { 1756 col = pj[j]; 1757 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1758 } 1759 1760 sctx.rs = rs; 1761 sctx.pv = rtmp2[i+1]; 1762 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1763 if (sctx.newshift) break; 1764 pc2 = b->a + bdiag[i+1]; 1765 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1766 1767 /* Now take care of 2nd column of diagonal 4x4 block. */ 1768 pc3 = rtmp3 + i+1; 1769 pc4 = rtmp4 + i+1; 1770 if (*pc3 != 0.0 || *pc4 != 0.0) { 1771 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1772 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1773 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1774 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1775 for (j=0; j<nz; j++) { 1776 col = pj[j]; 1777 rtmp3[col] -= mul3 * rtmp2[col]; 1778 rtmp4[col] -= mul4 * rtmp2[col]; 1779 } 1780 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1781 } 1782 1783 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1784 rs = 0.0; 1785 /* L part */ 1786 pc3 = b->a + bi[i+2]; 1787 pj = b->j + bi[i+2]; 1788 nz = bi[i+3] - bi[i+2]; 1789 for (j=0; j<nz; j++) { 1790 col = pj[j]; 1791 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1792 } 1793 /* U part */ 1794 pc3 = b->a + bdiag[i+3]+1; 1795 pj = b->j + bdiag[i+3]+1; 1796 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1797 for (j=0; j<nz; j++) { 1798 col = pj[j]; 1799 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1800 } 1801 1802 sctx.rs = rs; 1803 sctx.pv = rtmp3[i+2]; 1804 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1805 if (sctx.newshift) break; 1806 pc3 = b->a + bdiag[i+2]; 1807 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1808 1809 /* Now take care of 3rd column of diagonal 4x4 block. */ 1810 pc4 = rtmp4 + i+2; 1811 if (*pc4 != 0.0) { 1812 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1813 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1814 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1815 for (j=0; j<nz; j++) { 1816 col = pj[j]; 1817 rtmp4[col] -= mul4 * rtmp3[col]; 1818 } 1819 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1820 } 1821 1822 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1823 rs = 0.0; 1824 /* L part */ 1825 pc4 = b->a + bi[i+3]; 1826 pj = b->j + bi[i+3]; 1827 nz = bi[i+4] - bi[i+3]; 1828 for (j=0; j<nz; j++) { 1829 col = pj[j]; 1830 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1831 } 1832 /* U part */ 1833 pc4 = b->a + bdiag[i+4]+1; 1834 pj = b->j + bdiag[i+4]+1; 1835 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1836 for (j=0; j<nz; j++) { 1837 col = pj[j]; 1838 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1839 } 1840 1841 sctx.rs = rs; 1842 sctx.pv = rtmp4[i+3]; 1843 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1844 if (sctx.newshift) break; 1845 pc4 = b->a + bdiag[i+3]; 1846 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1847 break; 1848 1849 default: 1850 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1851 } 1852 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1853 i += nodesz; /* Update the row */ 1854 } 1855 1856 /* MatPivotRefine() */ 1857 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1858 /* 1859 * if no shift in this attempt & shifting & started shifting & can refine, 1860 * then try lower shift 1861 */ 1862 sctx.shift_hi = sctx.shift_fraction; 1863 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1864 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1865 sctx.newshift = PETSC_TRUE; 1866 sctx.nshift++; 1867 } 1868 } while (sctx.newshift); 1869 1870 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1871 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1872 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1873 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1874 1875 if (b->inode.size) { 1876 C->ops->solve = MatSolve_SeqAIJ_Inode; 1877 } else { 1878 C->ops->solve = MatSolve_SeqAIJ; 1879 } 1880 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1881 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1882 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1883 C->ops->matsolve = MatMatSolve_SeqAIJ; 1884 C->assembled = PETSC_TRUE; 1885 C->preallocated = PETSC_TRUE; 1886 1887 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1888 1889 /* MatShiftView(A,info,&sctx) */ 1890 if (sctx.nshift) { 1891 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1892 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1893 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1894 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1895 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1896 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1897 } 1898 } 1899 PetscFunctionReturn(0); 1900 } 1901 1902 #undef __FUNCT__ 1903 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1904 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1905 { 1906 Mat C = B; 1907 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1908 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1909 PetscErrorCode ierr; 1910 const PetscInt *r,*ic,*c,*ics; 1911 PetscInt n = A->rmap->n,*bi = b->i; 1912 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1913 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1914 PetscInt *ai = a->i,*aj = a->j; 1915 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1916 PetscScalar mul1,mul2,mul3,tmp; 1917 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1918 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1919 PetscReal rs=0.0; 1920 FactorShiftCtx sctx; 1921 1922 PetscFunctionBegin; 1923 sctx.shift_top = 0; 1924 sctx.nshift_max = 0; 1925 sctx.shift_lo = 0; 1926 sctx.shift_hi = 0; 1927 sctx.shift_fraction = 0; 1928 1929 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1930 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1931 sctx.shift_top = 0; 1932 for (i=0; i<n; i++) { 1933 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1934 rs = 0.0; 1935 ajtmp = aj + ai[i]; 1936 rtmp1 = aa + ai[i]; 1937 nz = ai[i+1] - ai[i]; 1938 for (j=0; j<nz; j++) { 1939 if (*ajtmp != i) { 1940 rs += PetscAbsScalar(*rtmp1++); 1941 } else { 1942 rs -= PetscRealPart(*rtmp1++); 1943 } 1944 ajtmp++; 1945 } 1946 if (rs>sctx.shift_top) sctx.shift_top = rs; 1947 } 1948 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1949 sctx.shift_top *= 1.1; 1950 sctx.nshift_max = 5; 1951 sctx.shift_lo = 0.; 1952 sctx.shift_hi = 1.; 1953 } 1954 sctx.shift_amount = 0; 1955 sctx.nshift = 0; 1956 1957 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1958 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1959 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1960 ierr = PetscMalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1961 ierr = PetscMemzero(rtmp11,n*sizeof(PetscScalar));CHKERRQ(ierr); 1962 ierr = PetscMemzero(rtmp22,n*sizeof(PetscScalar));CHKERRQ(ierr); 1963 ierr = PetscMemzero(rtmp33,n*sizeof(PetscScalar));CHKERRQ(ierr); 1964 ics = ic; 1965 1966 node_max = a->inode.node_count; 1967 ns = a->inode.size; 1968 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1969 1970 /* If max inode size > 3, split it into two inodes.*/ 1971 /* also map the inode sizes according to the ordering */ 1972 ierr = PetscMalloc1((n+1),&tmp_vec1);CHKERRQ(ierr); 1973 for (i=0,j=0; i<node_max; ++i,++j) { 1974 if (ns[i]>3) { 1975 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1976 ++j; 1977 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1978 } else { 1979 tmp_vec1[j] = ns[i]; 1980 } 1981 } 1982 /* Use the correct node_max */ 1983 node_max = j; 1984 1985 /* Now reorder the inode info based on mat re-ordering info */ 1986 /* First create a row -> inode_size_array_index map */ 1987 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1988 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1989 for (i=0,row=0; i<node_max; i++) { 1990 nodesz = tmp_vec1[i]; 1991 for (j=0; j<nodesz; j++,row++) { 1992 nsmap[row] = i; 1993 } 1994 } 1995 /* Using nsmap, create a reordered ns structure */ 1996 for (i=0,j=0; i< node_max; i++) { 1997 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1998 tmp_vec2[i] = nodesz; 1999 j += nodesz; 2000 } 2001 ierr = PetscFree(nsmap);CHKERRQ(ierr); 2002 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 2003 /* Now use the correct ns */ 2004 ns = tmp_vec2; 2005 2006 do { 2007 sctx.newshift = PETSC_FALSE; 2008 /* Now loop over each block-row, and do the factorization */ 2009 for (i=0,row=0; i<node_max; i++) { 2010 nodesz = ns[i]; 2011 nz = bi[row+1] - bi[row]; 2012 bjtmp = bj + bi[row]; 2013 2014 switch (nodesz) { 2015 case 1: 2016 for (j=0; j<nz; j++) { 2017 idx = bjtmp[j]; 2018 rtmp11[idx] = 0.0; 2019 } 2020 2021 /* load in initial (unfactored row) */ 2022 idx = r[row]; 2023 nz_tmp = ai[idx+1] - ai[idx]; 2024 ajtmp = aj + ai[idx]; 2025 v1 = aa + ai[idx]; 2026 2027 for (j=0; j<nz_tmp; j++) { 2028 idx = ics[ajtmp[j]]; 2029 rtmp11[idx] = v1[j]; 2030 } 2031 rtmp11[ics[r[row]]] += sctx.shift_amount; 2032 2033 prow = *bjtmp++; 2034 while (prow < row) { 2035 pc1 = rtmp11 + prow; 2036 if (*pc1 != 0.0) { 2037 pv = ba + bd[prow]; 2038 pj = nbj + bd[prow]; 2039 mul1 = *pc1 * *pv++; 2040 *pc1 = mul1; 2041 nz_tmp = bi[prow+1] - bd[prow] - 1; 2042 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2043 for (j=0; j<nz_tmp; j++) { 2044 tmp = pv[j]; 2045 idx = pj[j]; 2046 rtmp11[idx] -= mul1 * tmp; 2047 } 2048 } 2049 prow = *bjtmp++; 2050 } 2051 pj = bj + bi[row]; 2052 pc1 = ba + bi[row]; 2053 2054 sctx.pv = rtmp11[row]; 2055 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2056 rs = 0.0; 2057 for (j=0; j<nz; j++) { 2058 idx = pj[j]; 2059 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2060 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2061 } 2062 sctx.rs = rs; 2063 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2064 if (sctx.newshift) goto endofwhile; 2065 break; 2066 2067 case 2: 2068 for (j=0; j<nz; j++) { 2069 idx = bjtmp[j]; 2070 rtmp11[idx] = 0.0; 2071 rtmp22[idx] = 0.0; 2072 } 2073 2074 /* load in initial (unfactored row) */ 2075 idx = r[row]; 2076 nz_tmp = ai[idx+1] - ai[idx]; 2077 ajtmp = aj + ai[idx]; 2078 v1 = aa + ai[idx]; 2079 v2 = aa + ai[idx+1]; 2080 for (j=0; j<nz_tmp; j++) { 2081 idx = ics[ajtmp[j]]; 2082 rtmp11[idx] = v1[j]; 2083 rtmp22[idx] = v2[j]; 2084 } 2085 rtmp11[ics[r[row]]] += sctx.shift_amount; 2086 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2087 2088 prow = *bjtmp++; 2089 while (prow < row) { 2090 pc1 = rtmp11 + prow; 2091 pc2 = rtmp22 + prow; 2092 if (*pc1 != 0.0 || *pc2 != 0.0) { 2093 pv = ba + bd[prow]; 2094 pj = nbj + bd[prow]; 2095 mul1 = *pc1 * *pv; 2096 mul2 = *pc2 * *pv; 2097 ++pv; 2098 *pc1 = mul1; 2099 *pc2 = mul2; 2100 2101 nz_tmp = bi[prow+1] - bd[prow] - 1; 2102 for (j=0; j<nz_tmp; j++) { 2103 tmp = pv[j]; 2104 idx = pj[j]; 2105 rtmp11[idx] -= mul1 * tmp; 2106 rtmp22[idx] -= mul2 * tmp; 2107 } 2108 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2109 } 2110 prow = *bjtmp++; 2111 } 2112 2113 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2114 pc1 = rtmp11 + prow; 2115 pc2 = rtmp22 + prow; 2116 2117 sctx.pv = *pc1; 2118 pj = bj + bi[prow]; 2119 rs = 0.0; 2120 for (j=0; j<nz; j++) { 2121 idx = pj[j]; 2122 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2123 } 2124 sctx.rs = rs; 2125 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2126 if (sctx.newshift) goto endofwhile; 2127 2128 if (*pc2 != 0.0) { 2129 pj = nbj + bd[prow]; 2130 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2131 *pc2 = mul2; 2132 nz_tmp = bi[prow+1] - bd[prow] - 1; 2133 for (j=0; j<nz_tmp; j++) { 2134 idx = pj[j]; 2135 tmp = rtmp11[idx]; 2136 rtmp22[idx] -= mul2 * tmp; 2137 } 2138 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2139 } 2140 2141 pj = bj + bi[row]; 2142 pc1 = ba + bi[row]; 2143 pc2 = ba + bi[row+1]; 2144 2145 sctx.pv = rtmp22[row+1]; 2146 rs = 0.0; 2147 rtmp11[row] = 1.0/rtmp11[row]; 2148 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2149 /* copy row entries from dense representation to sparse */ 2150 for (j=0; j<nz; j++) { 2151 idx = pj[j]; 2152 pc1[j] = rtmp11[idx]; 2153 pc2[j] = rtmp22[idx]; 2154 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2155 } 2156 sctx.rs = rs; 2157 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2158 if (sctx.newshift) goto endofwhile; 2159 break; 2160 2161 case 3: 2162 for (j=0; j<nz; j++) { 2163 idx = bjtmp[j]; 2164 rtmp11[idx] = 0.0; 2165 rtmp22[idx] = 0.0; 2166 rtmp33[idx] = 0.0; 2167 } 2168 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2169 idx = r[row]; 2170 nz_tmp = ai[idx+1] - ai[idx]; 2171 ajtmp = aj + ai[idx]; 2172 v1 = aa + ai[idx]; 2173 v2 = aa + ai[idx+1]; 2174 v3 = aa + ai[idx+2]; 2175 for (j=0; j<nz_tmp; j++) { 2176 idx = ics[ajtmp[j]]; 2177 rtmp11[idx] = v1[j]; 2178 rtmp22[idx] = v2[j]; 2179 rtmp33[idx] = v3[j]; 2180 } 2181 rtmp11[ics[r[row]]] += sctx.shift_amount; 2182 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2183 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2184 2185 /* loop over all pivot row blocks above this row block */ 2186 prow = *bjtmp++; 2187 while (prow < row) { 2188 pc1 = rtmp11 + prow; 2189 pc2 = rtmp22 + prow; 2190 pc3 = rtmp33 + prow; 2191 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2192 pv = ba + bd[prow]; 2193 pj = nbj + bd[prow]; 2194 mul1 = *pc1 * *pv; 2195 mul2 = *pc2 * *pv; 2196 mul3 = *pc3 * *pv; 2197 ++pv; 2198 *pc1 = mul1; 2199 *pc2 = mul2; 2200 *pc3 = mul3; 2201 2202 nz_tmp = bi[prow+1] - bd[prow] - 1; 2203 /* update this row based on pivot row */ 2204 for (j=0; j<nz_tmp; j++) { 2205 tmp = pv[j]; 2206 idx = pj[j]; 2207 rtmp11[idx] -= mul1 * tmp; 2208 rtmp22[idx] -= mul2 * tmp; 2209 rtmp33[idx] -= mul3 * tmp; 2210 } 2211 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2212 } 2213 prow = *bjtmp++; 2214 } 2215 2216 /* Now take care of diagonal 3x3 block in this set of rows */ 2217 /* note: prow = row here */ 2218 pc1 = rtmp11 + prow; 2219 pc2 = rtmp22 + prow; 2220 pc3 = rtmp33 + prow; 2221 2222 sctx.pv = *pc1; 2223 pj = bj + bi[prow]; 2224 rs = 0.0; 2225 for (j=0; j<nz; j++) { 2226 idx = pj[j]; 2227 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2228 } 2229 sctx.rs = rs; 2230 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2231 if (sctx.newshift) goto endofwhile; 2232 2233 if (*pc2 != 0.0 || *pc3 != 0.0) { 2234 mul2 = (*pc2)/(*pc1); 2235 mul3 = (*pc3)/(*pc1); 2236 *pc2 = mul2; 2237 *pc3 = mul3; 2238 nz_tmp = bi[prow+1] - bd[prow] - 1; 2239 pj = nbj + bd[prow]; 2240 for (j=0; j<nz_tmp; j++) { 2241 idx = pj[j]; 2242 tmp = rtmp11[idx]; 2243 rtmp22[idx] -= mul2 * tmp; 2244 rtmp33[idx] -= mul3 * tmp; 2245 } 2246 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2247 } 2248 ++prow; 2249 2250 pc2 = rtmp22 + prow; 2251 pc3 = rtmp33 + prow; 2252 sctx.pv = *pc2; 2253 pj = bj + bi[prow]; 2254 rs = 0.0; 2255 for (j=0; j<nz; j++) { 2256 idx = pj[j]; 2257 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2258 } 2259 sctx.rs = rs; 2260 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2261 if (sctx.newshift) goto endofwhile; 2262 2263 if (*pc3 != 0.0) { 2264 mul3 = (*pc3)/(*pc2); 2265 *pc3 = mul3; 2266 pj = nbj + bd[prow]; 2267 nz_tmp = bi[prow+1] - bd[prow] - 1; 2268 for (j=0; j<nz_tmp; j++) { 2269 idx = pj[j]; 2270 tmp = rtmp22[idx]; 2271 rtmp33[idx] -= mul3 * tmp; 2272 } 2273 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2274 } 2275 2276 pj = bj + bi[row]; 2277 pc1 = ba + bi[row]; 2278 pc2 = ba + bi[row+1]; 2279 pc3 = ba + bi[row+2]; 2280 2281 sctx.pv = rtmp33[row+2]; 2282 rs = 0.0; 2283 rtmp11[row] = 1.0/rtmp11[row]; 2284 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2285 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2286 /* copy row entries from dense representation to sparse */ 2287 for (j=0; j<nz; j++) { 2288 idx = pj[j]; 2289 pc1[j] = rtmp11[idx]; 2290 pc2[j] = rtmp22[idx]; 2291 pc3[j] = rtmp33[idx]; 2292 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2293 } 2294 2295 sctx.rs = rs; 2296 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2297 if (sctx.newshift) goto endofwhile; 2298 break; 2299 2300 default: 2301 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2302 } 2303 row += nodesz; /* Update the row */ 2304 } 2305 endofwhile:; 2306 } while (sctx.newshift); 2307 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2308 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2309 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2310 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2311 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2312 2313 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2314 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2315 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2316 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2317 C->assembled = PETSC_TRUE; 2318 C->preallocated = PETSC_TRUE; 2319 if (sctx.nshift) { 2320 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2321 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2322 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2323 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2324 } 2325 } 2326 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2327 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2328 PetscFunctionReturn(0); 2329 } 2330 2331 2332 /* ----------------------------------------------------------- */ 2333 #undef __FUNCT__ 2334 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2335 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2336 { 2337 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2338 IS iscol = a->col,isrow = a->row; 2339 PetscErrorCode ierr; 2340 const PetscInt *r,*c,*rout,*cout; 2341 PetscInt i,j,n = A->rmap->n; 2342 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2343 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2344 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2345 PetscScalar sum1,sum2,sum3,sum4,sum5; 2346 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2347 const PetscScalar *b; 2348 2349 PetscFunctionBegin; 2350 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2351 node_max = a->inode.node_count; 2352 ns = a->inode.size; /* Node Size array */ 2353 2354 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2355 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2356 tmp = a->solve_work; 2357 2358 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2359 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2360 2361 /* forward solve the lower triangular */ 2362 tmps = tmp; 2363 aa = a_a; 2364 aj = a_j; 2365 ad = a->diag; 2366 2367 for (i = 0,row = 0; i< node_max; ++i) { 2368 nsz = ns[i]; 2369 aii = ai[row]; 2370 v1 = aa + aii; 2371 vi = aj + aii; 2372 nz = ai[row+1]- ai[row]; 2373 2374 if (i < node_max-1) { 2375 /* Prefetch the indices for the next block */ 2376 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2377 /* Prefetch the data for the next block */ 2378 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2379 } 2380 2381 switch (nsz) { /* Each loop in 'case' is unrolled */ 2382 case 1: 2383 sum1 = b[r[row]]; 2384 for (j=0; j<nz-1; j+=2) { 2385 i0 = vi[j]; 2386 i1 = vi[j+1]; 2387 tmp0 = tmps[i0]; 2388 tmp1 = tmps[i1]; 2389 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2390 } 2391 if (j == nz-1) { 2392 tmp0 = tmps[vi[j]]; 2393 sum1 -= v1[j]*tmp0; 2394 } 2395 tmp[row++]=sum1; 2396 break; 2397 case 2: 2398 sum1 = b[r[row]]; 2399 sum2 = b[r[row+1]]; 2400 v2 = aa + ai[row+1]; 2401 2402 for (j=0; j<nz-1; j+=2) { 2403 i0 = vi[j]; 2404 i1 = vi[j+1]; 2405 tmp0 = tmps[i0]; 2406 tmp1 = tmps[i1]; 2407 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2408 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2409 } 2410 if (j == nz-1) { 2411 tmp0 = tmps[vi[j]]; 2412 sum1 -= v1[j] *tmp0; 2413 sum2 -= v2[j] *tmp0; 2414 } 2415 sum2 -= v2[nz] * sum1; 2416 tmp[row++]=sum1; 2417 tmp[row++]=sum2; 2418 break; 2419 case 3: 2420 sum1 = b[r[row]]; 2421 sum2 = b[r[row+1]]; 2422 sum3 = b[r[row+2]]; 2423 v2 = aa + ai[row+1]; 2424 v3 = aa + ai[row+2]; 2425 2426 for (j=0; j<nz-1; j+=2) { 2427 i0 = vi[j]; 2428 i1 = vi[j+1]; 2429 tmp0 = tmps[i0]; 2430 tmp1 = tmps[i1]; 2431 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2432 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2433 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2434 } 2435 if (j == nz-1) { 2436 tmp0 = tmps[vi[j]]; 2437 sum1 -= v1[j] *tmp0; 2438 sum2 -= v2[j] *tmp0; 2439 sum3 -= v3[j] *tmp0; 2440 } 2441 sum2 -= v2[nz] * sum1; 2442 sum3 -= v3[nz] * sum1; 2443 sum3 -= v3[nz+1] * sum2; 2444 tmp[row++]=sum1; 2445 tmp[row++]=sum2; 2446 tmp[row++]=sum3; 2447 break; 2448 2449 case 4: 2450 sum1 = b[r[row]]; 2451 sum2 = b[r[row+1]]; 2452 sum3 = b[r[row+2]]; 2453 sum4 = b[r[row+3]]; 2454 v2 = aa + ai[row+1]; 2455 v3 = aa + ai[row+2]; 2456 v4 = aa + ai[row+3]; 2457 2458 for (j=0; j<nz-1; j+=2) { 2459 i0 = vi[j]; 2460 i1 = vi[j+1]; 2461 tmp0 = tmps[i0]; 2462 tmp1 = tmps[i1]; 2463 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2464 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2465 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2466 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2467 } 2468 if (j == nz-1) { 2469 tmp0 = tmps[vi[j]]; 2470 sum1 -= v1[j] *tmp0; 2471 sum2 -= v2[j] *tmp0; 2472 sum3 -= v3[j] *tmp0; 2473 sum4 -= v4[j] *tmp0; 2474 } 2475 sum2 -= v2[nz] * sum1; 2476 sum3 -= v3[nz] * sum1; 2477 sum4 -= v4[nz] * sum1; 2478 sum3 -= v3[nz+1] * sum2; 2479 sum4 -= v4[nz+1] * sum2; 2480 sum4 -= v4[nz+2] * sum3; 2481 2482 tmp[row++]=sum1; 2483 tmp[row++]=sum2; 2484 tmp[row++]=sum3; 2485 tmp[row++]=sum4; 2486 break; 2487 case 5: 2488 sum1 = b[r[row]]; 2489 sum2 = b[r[row+1]]; 2490 sum3 = b[r[row+2]]; 2491 sum4 = b[r[row+3]]; 2492 sum5 = b[r[row+4]]; 2493 v2 = aa + ai[row+1]; 2494 v3 = aa + ai[row+2]; 2495 v4 = aa + ai[row+3]; 2496 v5 = aa + ai[row+4]; 2497 2498 for (j=0; j<nz-1; j+=2) { 2499 i0 = vi[j]; 2500 i1 = vi[j+1]; 2501 tmp0 = tmps[i0]; 2502 tmp1 = tmps[i1]; 2503 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2504 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2505 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2506 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2507 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2508 } 2509 if (j == nz-1) { 2510 tmp0 = tmps[vi[j]]; 2511 sum1 -= v1[j] *tmp0; 2512 sum2 -= v2[j] *tmp0; 2513 sum3 -= v3[j] *tmp0; 2514 sum4 -= v4[j] *tmp0; 2515 sum5 -= v5[j] *tmp0; 2516 } 2517 2518 sum2 -= v2[nz] * sum1; 2519 sum3 -= v3[nz] * sum1; 2520 sum4 -= v4[nz] * sum1; 2521 sum5 -= v5[nz] * sum1; 2522 sum3 -= v3[nz+1] * sum2; 2523 sum4 -= v4[nz+1] * sum2; 2524 sum5 -= v5[nz+1] * sum2; 2525 sum4 -= v4[nz+2] * sum3; 2526 sum5 -= v5[nz+2] * sum3; 2527 sum5 -= v5[nz+3] * sum4; 2528 2529 tmp[row++]=sum1; 2530 tmp[row++]=sum2; 2531 tmp[row++]=sum3; 2532 tmp[row++]=sum4; 2533 tmp[row++]=sum5; 2534 break; 2535 default: 2536 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2537 } 2538 } 2539 /* backward solve the upper triangular */ 2540 for (i=node_max -1,row = n-1; i>=0; i--) { 2541 nsz = ns[i]; 2542 aii = ad[row+1] + 1; 2543 v1 = aa + aii; 2544 vi = aj + aii; 2545 nz = ad[row]- ad[row+1] - 1; 2546 2547 if (i > 0) { 2548 /* Prefetch the indices for the next block */ 2549 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2550 /* Prefetch the data for the next block */ 2551 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2552 } 2553 2554 switch (nsz) { /* Each loop in 'case' is unrolled */ 2555 case 1: 2556 sum1 = tmp[row]; 2557 2558 for (j=0; j<nz-1; j+=2) { 2559 i0 = vi[j]; 2560 i1 = vi[j+1]; 2561 tmp0 = tmps[i0]; 2562 tmp1 = tmps[i1]; 2563 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2564 } 2565 if (j == nz-1) { 2566 tmp0 = tmps[vi[j]]; 2567 sum1 -= v1[j]*tmp0; 2568 } 2569 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2570 break; 2571 case 2: 2572 sum1 = tmp[row]; 2573 sum2 = tmp[row-1]; 2574 v2 = aa + ad[row] + 1; 2575 for (j=0; j<nz-1; j+=2) { 2576 i0 = vi[j]; 2577 i1 = vi[j+1]; 2578 tmp0 = tmps[i0]; 2579 tmp1 = tmps[i1]; 2580 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2581 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2582 } 2583 if (j == nz-1) { 2584 tmp0 = tmps[vi[j]]; 2585 sum1 -= v1[j]* tmp0; 2586 sum2 -= v2[j+1]* tmp0; 2587 } 2588 2589 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2590 sum2 -= v2[0] * tmp0; 2591 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2592 break; 2593 case 3: 2594 sum1 = tmp[row]; 2595 sum2 = tmp[row -1]; 2596 sum3 = tmp[row -2]; 2597 v2 = aa + ad[row] + 1; 2598 v3 = aa + ad[row -1] + 1; 2599 for (j=0; j<nz-1; j+=2) { 2600 i0 = vi[j]; 2601 i1 = vi[j+1]; 2602 tmp0 = tmps[i0]; 2603 tmp1 = tmps[i1]; 2604 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2605 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2606 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2607 } 2608 if (j== nz-1) { 2609 tmp0 = tmps[vi[j]]; 2610 sum1 -= v1[j] * tmp0; 2611 sum2 -= v2[j+1] * tmp0; 2612 sum3 -= v3[j+2] * tmp0; 2613 } 2614 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2615 sum2 -= v2[0]* tmp0; 2616 sum3 -= v3[1] * tmp0; 2617 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2618 sum3 -= v3[0]* tmp0; 2619 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2620 2621 break; 2622 case 4: 2623 sum1 = tmp[row]; 2624 sum2 = tmp[row -1]; 2625 sum3 = tmp[row -2]; 2626 sum4 = tmp[row -3]; 2627 v2 = aa + ad[row]+1; 2628 v3 = aa + ad[row -1]+1; 2629 v4 = aa + ad[row -2]+1; 2630 2631 for (j=0; j<nz-1; j+=2) { 2632 i0 = vi[j]; 2633 i1 = vi[j+1]; 2634 tmp0 = tmps[i0]; 2635 tmp1 = tmps[i1]; 2636 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2637 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2638 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2639 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2640 } 2641 if (j== nz-1) { 2642 tmp0 = tmps[vi[j]]; 2643 sum1 -= v1[j] * tmp0; 2644 sum2 -= v2[j+1] * tmp0; 2645 sum3 -= v3[j+2] * tmp0; 2646 sum4 -= v4[j+3] * tmp0; 2647 } 2648 2649 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2650 sum2 -= v2[0] * tmp0; 2651 sum3 -= v3[1] * tmp0; 2652 sum4 -= v4[2] * tmp0; 2653 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2654 sum3 -= v3[0] * tmp0; 2655 sum4 -= v4[1] * tmp0; 2656 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2657 sum4 -= v4[0] * tmp0; 2658 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2659 break; 2660 case 5: 2661 sum1 = tmp[row]; 2662 sum2 = tmp[row -1]; 2663 sum3 = tmp[row -2]; 2664 sum4 = tmp[row -3]; 2665 sum5 = tmp[row -4]; 2666 v2 = aa + ad[row]+1; 2667 v3 = aa + ad[row -1]+1; 2668 v4 = aa + ad[row -2]+1; 2669 v5 = aa + ad[row -3]+1; 2670 for (j=0; j<nz-1; j+=2) { 2671 i0 = vi[j]; 2672 i1 = vi[j+1]; 2673 tmp0 = tmps[i0]; 2674 tmp1 = tmps[i1]; 2675 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2676 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2677 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2678 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2679 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2680 } 2681 if (j==nz-1) { 2682 tmp0 = tmps[vi[j]]; 2683 sum1 -= v1[j] * tmp0; 2684 sum2 -= v2[j+1] * tmp0; 2685 sum3 -= v3[j+2] * tmp0; 2686 sum4 -= v4[j+3] * tmp0; 2687 sum5 -= v5[j+4] * tmp0; 2688 } 2689 2690 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2691 sum2 -= v2[0] * tmp0; 2692 sum3 -= v3[1] * tmp0; 2693 sum4 -= v4[2] * tmp0; 2694 sum5 -= v5[3] * tmp0; 2695 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2696 sum3 -= v3[0] * tmp0; 2697 sum4 -= v4[1] * tmp0; 2698 sum5 -= v5[2] * tmp0; 2699 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2700 sum4 -= v4[0] * tmp0; 2701 sum5 -= v5[1] * tmp0; 2702 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2703 sum5 -= v5[0] * tmp0; 2704 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2705 break; 2706 default: 2707 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2708 } 2709 } 2710 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2711 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2712 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2713 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2714 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2715 PetscFunctionReturn(0); 2716 } 2717 2718 2719 /* 2720 Makes a longer coloring[] array and calls the usual code with that 2721 */ 2722 #undef __FUNCT__ 2723 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2724 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2725 { 2726 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2727 PetscErrorCode ierr; 2728 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2729 PetscInt *colorused,i; 2730 ISColoringValue *newcolor; 2731 2732 PetscFunctionBegin; 2733 ierr = PetscMalloc1((n+1),&newcolor);CHKERRQ(ierr); 2734 /* loop over inodes, marking a color for each column*/ 2735 row = 0; 2736 for (i=0; i<m; i++) { 2737 for (j=0; j<ns[i]; j++) { 2738 newcolor[row++] = coloring[i] + j*ncolors; 2739 } 2740 } 2741 2742 /* eliminate unneeded colors */ 2743 ierr = PetscMalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2744 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2745 for (i=0; i<n; i++) { 2746 colorused[newcolor[i]] = 1; 2747 } 2748 2749 for (i=1; i<5*ncolors; i++) { 2750 colorused[i] += colorused[i-1]; 2751 } 2752 ncolors = colorused[5*ncolors-1]; 2753 for (i=0; i<n; i++) { 2754 newcolor[i] = colorused[newcolor[i]]-1; 2755 } 2756 ierr = PetscFree(colorused);CHKERRQ(ierr); 2757 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2758 ierr = PetscFree(coloring);CHKERRQ(ierr); 2759 PetscFunctionReturn(0); 2760 } 2761 2762 #include <petsc-private/kernels/blockinvert.h> 2763 2764 #undef __FUNCT__ 2765 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2766 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2767 { 2768 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2769 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2770 MatScalar *ibdiag,*bdiag,work[25],*t; 2771 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2772 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2773 const PetscScalar *xb, *b; 2774 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2775 PetscErrorCode ierr; 2776 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2777 PetscInt sz,k,ipvt[5]; 2778 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2779 2780 PetscFunctionBegin; 2781 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2782 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2783 2784 if (!a->inode.ibdiagvalid) { 2785 if (!a->inode.ibdiag) { 2786 /* calculate space needed for diagonal blocks */ 2787 for (i=0; i<m; i++) { 2788 cnt += sizes[i]*sizes[i]; 2789 } 2790 a->inode.bdiagsize = cnt; 2791 2792 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2793 } 2794 2795 /* copy over the diagonal blocks and invert them */ 2796 ibdiag = a->inode.ibdiag; 2797 bdiag = a->inode.bdiag; 2798 cnt = 0; 2799 for (i=0, row = 0; i<m; i++) { 2800 for (j=0; j<sizes[i]; j++) { 2801 for (k=0; k<sizes[i]; k++) { 2802 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2803 } 2804 } 2805 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2806 2807 switch (sizes[i]) { 2808 case 1: 2809 /* Create matrix data structure */ 2810 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2811 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2812 break; 2813 case 2: 2814 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2815 break; 2816 case 3: 2817 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2818 break; 2819 case 4: 2820 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2821 break; 2822 case 5: 2823 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2824 break; 2825 default: 2826 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2827 } 2828 cnt += sizes[i]*sizes[i]; 2829 row += sizes[i]; 2830 } 2831 a->inode.ibdiagvalid = PETSC_TRUE; 2832 } 2833 ibdiag = a->inode.ibdiag; 2834 bdiag = a->inode.bdiag; 2835 t = a->inode.ssor_work; 2836 2837 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2838 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2839 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2840 if (flag & SOR_ZERO_INITIAL_GUESS) { 2841 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2842 2843 for (i=0, row=0; i<m; i++) { 2844 sz = diag[row] - ii[row]; 2845 v1 = a->a + ii[row]; 2846 idx = a->j + ii[row]; 2847 2848 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2849 switch (sizes[i]) { 2850 case 1: 2851 2852 sum1 = b[row]; 2853 for (n = 0; n<sz-1; n+=2) { 2854 i1 = idx[0]; 2855 i2 = idx[1]; 2856 idx += 2; 2857 tmp0 = x[i1]; 2858 tmp1 = x[i2]; 2859 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2860 } 2861 2862 if (n == sz-1) { 2863 tmp0 = x[*idx]; 2864 sum1 -= *v1 * tmp0; 2865 } 2866 t[row] = sum1; 2867 x[row++] = sum1*(*ibdiag++); 2868 break; 2869 case 2: 2870 v2 = a->a + ii[row+1]; 2871 sum1 = b[row]; 2872 sum2 = b[row+1]; 2873 for (n = 0; n<sz-1; n+=2) { 2874 i1 = idx[0]; 2875 i2 = idx[1]; 2876 idx += 2; 2877 tmp0 = x[i1]; 2878 tmp1 = x[i2]; 2879 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2880 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2881 } 2882 2883 if (n == sz-1) { 2884 tmp0 = x[*idx]; 2885 sum1 -= v1[0] * tmp0; 2886 sum2 -= v2[0] * tmp0; 2887 } 2888 t[row] = sum1; 2889 t[row+1] = sum2; 2890 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2891 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2892 ibdiag += 4; 2893 break; 2894 case 3: 2895 v2 = a->a + ii[row+1]; 2896 v3 = a->a + ii[row+2]; 2897 sum1 = b[row]; 2898 sum2 = b[row+1]; 2899 sum3 = b[row+2]; 2900 for (n = 0; n<sz-1; n+=2) { 2901 i1 = idx[0]; 2902 i2 = idx[1]; 2903 idx += 2; 2904 tmp0 = x[i1]; 2905 tmp1 = x[i2]; 2906 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2908 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2909 } 2910 2911 if (n == sz-1) { 2912 tmp0 = x[*idx]; 2913 sum1 -= v1[0] * tmp0; 2914 sum2 -= v2[0] * tmp0; 2915 sum3 -= v3[0] * tmp0; 2916 } 2917 t[row] = sum1; 2918 t[row+1] = sum2; 2919 t[row+2] = sum3; 2920 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2921 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2922 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2923 ibdiag += 9; 2924 break; 2925 case 4: 2926 v2 = a->a + ii[row+1]; 2927 v3 = a->a + ii[row+2]; 2928 v4 = a->a + ii[row+3]; 2929 sum1 = b[row]; 2930 sum2 = b[row+1]; 2931 sum3 = b[row+2]; 2932 sum4 = b[row+3]; 2933 for (n = 0; n<sz-1; n+=2) { 2934 i1 = idx[0]; 2935 i2 = idx[1]; 2936 idx += 2; 2937 tmp0 = x[i1]; 2938 tmp1 = x[i2]; 2939 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2940 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2941 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2942 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2943 } 2944 2945 if (n == sz-1) { 2946 tmp0 = x[*idx]; 2947 sum1 -= v1[0] * tmp0; 2948 sum2 -= v2[0] * tmp0; 2949 sum3 -= v3[0] * tmp0; 2950 sum4 -= v4[0] * tmp0; 2951 } 2952 t[row] = sum1; 2953 t[row+1] = sum2; 2954 t[row+2] = sum3; 2955 t[row+3] = sum4; 2956 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2957 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2958 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2959 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2960 ibdiag += 16; 2961 break; 2962 case 5: 2963 v2 = a->a + ii[row+1]; 2964 v3 = a->a + ii[row+2]; 2965 v4 = a->a + ii[row+3]; 2966 v5 = a->a + ii[row+4]; 2967 sum1 = b[row]; 2968 sum2 = b[row+1]; 2969 sum3 = b[row+2]; 2970 sum4 = b[row+3]; 2971 sum5 = b[row+4]; 2972 for (n = 0; n<sz-1; n+=2) { 2973 i1 = idx[0]; 2974 i2 = idx[1]; 2975 idx += 2; 2976 tmp0 = x[i1]; 2977 tmp1 = x[i2]; 2978 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2979 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2980 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2981 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2982 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2983 } 2984 2985 if (n == sz-1) { 2986 tmp0 = x[*idx]; 2987 sum1 -= v1[0] * tmp0; 2988 sum2 -= v2[0] * tmp0; 2989 sum3 -= v3[0] * tmp0; 2990 sum4 -= v4[0] * tmp0; 2991 sum5 -= v5[0] * tmp0; 2992 } 2993 t[row] = sum1; 2994 t[row+1] = sum2; 2995 t[row+2] = sum3; 2996 t[row+3] = sum4; 2997 t[row+4] = sum5; 2998 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2999 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3000 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3001 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3002 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3003 ibdiag += 25; 3004 break; 3005 default: 3006 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3007 } 3008 } 3009 3010 xb = t; 3011 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3012 } else xb = b; 3013 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3014 3015 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3016 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3017 ibdiag -= sizes[i]*sizes[i]; 3018 sz = ii[row+1] - diag[row] - 1; 3019 v1 = a->a + diag[row] + 1; 3020 idx = a->j + diag[row] + 1; 3021 3022 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3023 switch (sizes[i]) { 3024 case 1: 3025 3026 sum1 = xb[row]; 3027 for (n = 0; n<sz-1; n+=2) { 3028 i1 = idx[0]; 3029 i2 = idx[1]; 3030 idx += 2; 3031 tmp0 = x[i1]; 3032 tmp1 = x[i2]; 3033 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3034 } 3035 3036 if (n == sz-1) { 3037 tmp0 = x[*idx]; 3038 sum1 -= *v1*tmp0; 3039 } 3040 x[row--] = sum1*(*ibdiag); 3041 break; 3042 3043 case 2: 3044 3045 sum1 = xb[row]; 3046 sum2 = xb[row-1]; 3047 /* note that sum1 is associated with the second of the two rows */ 3048 v2 = a->a + diag[row-1] + 2; 3049 for (n = 0; n<sz-1; n+=2) { 3050 i1 = idx[0]; 3051 i2 = idx[1]; 3052 idx += 2; 3053 tmp0 = x[i1]; 3054 tmp1 = x[i2]; 3055 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3056 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3057 } 3058 3059 if (n == sz-1) { 3060 tmp0 = x[*idx]; 3061 sum1 -= *v1*tmp0; 3062 sum2 -= *v2*tmp0; 3063 } 3064 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3065 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3066 break; 3067 case 3: 3068 3069 sum1 = xb[row]; 3070 sum2 = xb[row-1]; 3071 sum3 = xb[row-2]; 3072 v2 = a->a + diag[row-1] + 2; 3073 v3 = a->a + diag[row-2] + 3; 3074 for (n = 0; n<sz-1; n+=2) { 3075 i1 = idx[0]; 3076 i2 = idx[1]; 3077 idx += 2; 3078 tmp0 = x[i1]; 3079 tmp1 = x[i2]; 3080 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3081 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3082 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3083 } 3084 3085 if (n == sz-1) { 3086 tmp0 = x[*idx]; 3087 sum1 -= *v1*tmp0; 3088 sum2 -= *v2*tmp0; 3089 sum3 -= *v3*tmp0; 3090 } 3091 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3092 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3093 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3094 break; 3095 case 4: 3096 3097 sum1 = xb[row]; 3098 sum2 = xb[row-1]; 3099 sum3 = xb[row-2]; 3100 sum4 = xb[row-3]; 3101 v2 = a->a + diag[row-1] + 2; 3102 v3 = a->a + diag[row-2] + 3; 3103 v4 = a->a + diag[row-3] + 4; 3104 for (n = 0; n<sz-1; n+=2) { 3105 i1 = idx[0]; 3106 i2 = idx[1]; 3107 idx += 2; 3108 tmp0 = x[i1]; 3109 tmp1 = x[i2]; 3110 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3111 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3112 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3113 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3114 } 3115 3116 if (n == sz-1) { 3117 tmp0 = x[*idx]; 3118 sum1 -= *v1*tmp0; 3119 sum2 -= *v2*tmp0; 3120 sum3 -= *v3*tmp0; 3121 sum4 -= *v4*tmp0; 3122 } 3123 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3124 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3125 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3126 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3127 break; 3128 case 5: 3129 3130 sum1 = xb[row]; 3131 sum2 = xb[row-1]; 3132 sum3 = xb[row-2]; 3133 sum4 = xb[row-3]; 3134 sum5 = xb[row-4]; 3135 v2 = a->a + diag[row-1] + 2; 3136 v3 = a->a + diag[row-2] + 3; 3137 v4 = a->a + diag[row-3] + 4; 3138 v5 = a->a + diag[row-4] + 5; 3139 for (n = 0; n<sz-1; n+=2) { 3140 i1 = idx[0]; 3141 i2 = idx[1]; 3142 idx += 2; 3143 tmp0 = x[i1]; 3144 tmp1 = x[i2]; 3145 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3146 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3147 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3148 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3149 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3150 } 3151 3152 if (n == sz-1) { 3153 tmp0 = x[*idx]; 3154 sum1 -= *v1*tmp0; 3155 sum2 -= *v2*tmp0; 3156 sum3 -= *v3*tmp0; 3157 sum4 -= *v4*tmp0; 3158 sum5 -= *v5*tmp0; 3159 } 3160 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3161 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3162 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3163 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3164 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3165 break; 3166 default: 3167 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3168 } 3169 } 3170 3171 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3172 } 3173 its--; 3174 } 3175 while (its--) { 3176 3177 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3178 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3179 i<m; 3180 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3181 3182 sz = diag[row] - ii[row]; 3183 v1 = a->a + ii[row]; 3184 idx = a->j + ii[row]; 3185 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3186 switch (sizes[i]) { 3187 case 1: 3188 sum1 = b[row]; 3189 for (n = 0; n<sz-1; n+=2) { 3190 i1 = idx[0]; 3191 i2 = idx[1]; 3192 idx += 2; 3193 tmp0 = x[i1]; 3194 tmp1 = x[i2]; 3195 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3196 } 3197 if (n == sz-1) { 3198 tmp0 = x[*idx++]; 3199 sum1 -= *v1 * tmp0; 3200 v1++; 3201 } 3202 t[row] = sum1; 3203 sz = ii[row+1] - diag[row] - 1; 3204 idx = a->j + diag[row] + 1; 3205 v1 += 1; 3206 for (n = 0; n<sz-1; n+=2) { 3207 i1 = idx[0]; 3208 i2 = idx[1]; 3209 idx += 2; 3210 tmp0 = x[i1]; 3211 tmp1 = x[i2]; 3212 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3213 } 3214 if (n == sz-1) { 3215 tmp0 = x[*idx++]; 3216 sum1 -= *v1 * tmp0; 3217 } 3218 /* in MatSOR_SeqAIJ this line would be 3219 * 3220 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3221 * 3222 * but omega == 1, so this becomes 3223 * 3224 * x[row] = sum1*(*ibdiag++); 3225 * 3226 */ 3227 x[row] = sum1*(*ibdiag); 3228 break; 3229 case 2: 3230 v2 = a->a + ii[row+1]; 3231 sum1 = b[row]; 3232 sum2 = b[row+1]; 3233 for (n = 0; n<sz-1; n+=2) { 3234 i1 = idx[0]; 3235 i2 = idx[1]; 3236 idx += 2; 3237 tmp0 = x[i1]; 3238 tmp1 = x[i2]; 3239 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3240 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3241 } 3242 if (n == sz-1) { 3243 tmp0 = x[*idx++]; 3244 sum1 -= v1[0] * tmp0; 3245 sum2 -= v2[0] * tmp0; 3246 v1++; v2++; 3247 } 3248 t[row] = sum1; 3249 t[row+1] = sum2; 3250 sz = ii[row+1] - diag[row] - 2; 3251 idx = a->j + diag[row] + 2; 3252 v1 += 2; 3253 v2 += 2; 3254 for (n = 0; n<sz-1; n+=2) { 3255 i1 = idx[0]; 3256 i2 = idx[1]; 3257 idx += 2; 3258 tmp0 = x[i1]; 3259 tmp1 = x[i2]; 3260 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3261 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3262 } 3263 if (n == sz-1) { 3264 tmp0 = x[*idx]; 3265 sum1 -= v1[0] * tmp0; 3266 sum2 -= v2[0] * tmp0; 3267 } 3268 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3269 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3270 break; 3271 case 3: 3272 v2 = a->a + ii[row+1]; 3273 v3 = a->a + ii[row+2]; 3274 sum1 = b[row]; 3275 sum2 = b[row+1]; 3276 sum3 = b[row+2]; 3277 for (n = 0; n<sz-1; n+=2) { 3278 i1 = idx[0]; 3279 i2 = idx[1]; 3280 idx += 2; 3281 tmp0 = x[i1]; 3282 tmp1 = x[i2]; 3283 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3284 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3285 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3286 } 3287 if (n == sz-1) { 3288 tmp0 = x[*idx++]; 3289 sum1 -= v1[0] * tmp0; 3290 sum2 -= v2[0] * tmp0; 3291 sum3 -= v3[0] * tmp0; 3292 v1++; v2++; v3++; 3293 } 3294 t[row] = sum1; 3295 t[row+1] = sum2; 3296 t[row+2] = sum3; 3297 sz = ii[row+1] - diag[row] - 3; 3298 idx = a->j + diag[row] + 3; 3299 v1 += 3; 3300 v2 += 3; 3301 v3 += 3; 3302 for (n = 0; n<sz-1; n+=2) { 3303 i1 = idx[0]; 3304 i2 = idx[1]; 3305 idx += 2; 3306 tmp0 = x[i1]; 3307 tmp1 = x[i2]; 3308 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3309 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3310 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3311 } 3312 if (n == sz-1) { 3313 tmp0 = x[*idx]; 3314 sum1 -= v1[0] * tmp0; 3315 sum2 -= v2[0] * tmp0; 3316 sum3 -= v3[0] * tmp0; 3317 } 3318 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3319 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3320 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3321 break; 3322 case 4: 3323 v2 = a->a + ii[row+1]; 3324 v3 = a->a + ii[row+2]; 3325 v4 = a->a + ii[row+3]; 3326 sum1 = b[row]; 3327 sum2 = b[row+1]; 3328 sum3 = b[row+2]; 3329 sum4 = b[row+3]; 3330 for (n = 0; n<sz-1; n+=2) { 3331 i1 = idx[0]; 3332 i2 = idx[1]; 3333 idx += 2; 3334 tmp0 = x[i1]; 3335 tmp1 = x[i2]; 3336 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3337 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3338 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3339 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3340 } 3341 if (n == sz-1) { 3342 tmp0 = x[*idx++]; 3343 sum1 -= v1[0] * tmp0; 3344 sum2 -= v2[0] * tmp0; 3345 sum3 -= v3[0] * tmp0; 3346 sum4 -= v4[0] * tmp0; 3347 v1++; v2++; v3++; v4++; 3348 } 3349 t[row] = sum1; 3350 t[row+1] = sum2; 3351 t[row+2] = sum3; 3352 t[row+3] = sum4; 3353 sz = ii[row+1] - diag[row] - 4; 3354 idx = a->j + diag[row] + 4; 3355 v1 += 4; 3356 v2 += 4; 3357 v3 += 4; 3358 v4 += 4; 3359 for (n = 0; n<sz-1; n+=2) { 3360 i1 = idx[0]; 3361 i2 = idx[1]; 3362 idx += 2; 3363 tmp0 = x[i1]; 3364 tmp1 = x[i2]; 3365 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3366 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3367 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3368 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3369 } 3370 if (n == sz-1) { 3371 tmp0 = x[*idx]; 3372 sum1 -= v1[0] * tmp0; 3373 sum2 -= v2[0] * tmp0; 3374 sum3 -= v3[0] * tmp0; 3375 sum4 -= v4[0] * tmp0; 3376 } 3377 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3378 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3379 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3380 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3381 break; 3382 case 5: 3383 v2 = a->a + ii[row+1]; 3384 v3 = a->a + ii[row+2]; 3385 v4 = a->a + ii[row+3]; 3386 v5 = a->a + ii[row+4]; 3387 sum1 = b[row]; 3388 sum2 = b[row+1]; 3389 sum3 = b[row+2]; 3390 sum4 = b[row+3]; 3391 sum5 = b[row+4]; 3392 for (n = 0; n<sz-1; n+=2) { 3393 i1 = idx[0]; 3394 i2 = idx[1]; 3395 idx += 2; 3396 tmp0 = x[i1]; 3397 tmp1 = x[i2]; 3398 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3399 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3400 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3401 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3402 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3403 } 3404 if (n == sz-1) { 3405 tmp0 = x[*idx++]; 3406 sum1 -= v1[0] * tmp0; 3407 sum2 -= v2[0] * tmp0; 3408 sum3 -= v3[0] * tmp0; 3409 sum4 -= v4[0] * tmp0; 3410 sum5 -= v5[0] * tmp0; 3411 v1++; v2++; v3++; v4++; v5++; 3412 } 3413 t[row] = sum1; 3414 t[row+1] = sum2; 3415 t[row+2] = sum3; 3416 t[row+3] = sum4; 3417 t[row+4] = sum5; 3418 sz = ii[row+1] - diag[row] - 5; 3419 idx = a->j + diag[row] + 5; 3420 v1 += 5; 3421 v2 += 5; 3422 v3 += 5; 3423 v4 += 5; 3424 v5 += 5; 3425 for (n = 0; n<sz-1; n+=2) { 3426 i1 = idx[0]; 3427 i2 = idx[1]; 3428 idx += 2; 3429 tmp0 = x[i1]; 3430 tmp1 = x[i2]; 3431 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3432 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3433 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3434 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3435 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3436 } 3437 if (n == sz-1) { 3438 tmp0 = x[*idx]; 3439 sum1 -= v1[0] * tmp0; 3440 sum2 -= v2[0] * tmp0; 3441 sum3 -= v3[0] * tmp0; 3442 sum4 -= v4[0] * tmp0; 3443 sum5 -= v5[0] * tmp0; 3444 } 3445 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3446 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3447 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3448 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3449 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3450 break; 3451 default: 3452 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3453 } 3454 } 3455 xb = t; 3456 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3457 } else xb = b; 3458 3459 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3460 3461 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3462 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3463 ibdiag -= sizes[i]*sizes[i]; 3464 3465 /* set RHS */ 3466 if (xb == b) { 3467 /* whole (old way) */ 3468 sz = ii[row+1] - ii[row]; 3469 idx = a->j + ii[row]; 3470 switch (sizes[i]) { 3471 case 5: 3472 v5 = a->a + ii[row-4]; 3473 case 4: /* fall through */ 3474 v4 = a->a + ii[row-3]; 3475 case 3: 3476 v3 = a->a + ii[row-2]; 3477 case 2: 3478 v2 = a->a + ii[row-1]; 3479 case 1: 3480 v1 = a->a + ii[row]; 3481 break; 3482 default: 3483 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3484 } 3485 } else { 3486 /* upper, no diag */ 3487 sz = ii[row+1] - diag[row] - 1; 3488 idx = a->j + diag[row] + 1; 3489 switch (sizes[i]) { 3490 case 5: 3491 v5 = a->a + diag[row-4] + 5; 3492 case 4: /* fall through */ 3493 v4 = a->a + diag[row-3] + 4; 3494 case 3: 3495 v3 = a->a + diag[row-2] + 3; 3496 case 2: 3497 v2 = a->a + diag[row-1] + 2; 3498 case 1: 3499 v1 = a->a + diag[row] + 1; 3500 } 3501 } 3502 /* set sum */ 3503 switch (sizes[i]) { 3504 case 5: 3505 sum5 = xb[row-4]; 3506 case 4: /* fall through */ 3507 sum4 = xb[row-3]; 3508 case 3: 3509 sum3 = xb[row-2]; 3510 case 2: 3511 sum2 = xb[row-1]; 3512 case 1: 3513 /* note that sum1 is associated with the last row */ 3514 sum1 = xb[row]; 3515 } 3516 /* do sums */ 3517 for (n = 0; n<sz-1; n+=2) { 3518 i1 = idx[0]; 3519 i2 = idx[1]; 3520 idx += 2; 3521 tmp0 = x[i1]; 3522 tmp1 = x[i2]; 3523 switch (sizes[i]) { 3524 case 5: 3525 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3526 case 4: /* fall through */ 3527 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3528 case 3: 3529 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3530 case 2: 3531 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3532 case 1: 3533 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3534 } 3535 } 3536 /* ragged edge */ 3537 if (n == sz-1) { 3538 tmp0 = x[*idx]; 3539 switch (sizes[i]) { 3540 case 5: 3541 sum5 -= *v5*tmp0; 3542 case 4: /* fall through */ 3543 sum4 -= *v4*tmp0; 3544 case 3: 3545 sum3 -= *v3*tmp0; 3546 case 2: 3547 sum2 -= *v2*tmp0; 3548 case 1: 3549 sum1 -= *v1*tmp0; 3550 } 3551 } 3552 /* update */ 3553 if (xb == b) { 3554 /* whole (old way) w/ diag */ 3555 switch (sizes[i]) { 3556 case 5: 3557 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3558 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3559 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3560 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3561 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3562 break; 3563 case 4: 3564 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3565 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3566 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3567 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3568 break; 3569 case 3: 3570 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3571 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3572 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3573 break; 3574 case 2: 3575 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3576 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3577 break; 3578 case 1: 3579 x[row--] += sum1*(*ibdiag); 3580 break; 3581 } 3582 } else { 3583 /* no diag so set = */ 3584 switch (sizes[i]) { 3585 case 5: 3586 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3587 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3588 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3589 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3590 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3591 break; 3592 case 4: 3593 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3594 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3595 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3596 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3597 break; 3598 case 3: 3599 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3600 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3601 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3602 break; 3603 case 2: 3604 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3605 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3606 break; 3607 case 1: 3608 x[row--] = sum1*(*ibdiag); 3609 break; 3610 } 3611 } 3612 } 3613 if (xb == b) { 3614 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3615 } else { 3616 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3617 } 3618 } 3619 } 3620 if (flag & SOR_EISENSTAT) { 3621 /* 3622 Apply (U + D)^-1 where D is now the block diagonal 3623 */ 3624 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3625 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3626 ibdiag -= sizes[i]*sizes[i]; 3627 sz = ii[row+1] - diag[row] - 1; 3628 v1 = a->a + diag[row] + 1; 3629 idx = a->j + diag[row] + 1; 3630 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3631 switch (sizes[i]) { 3632 case 1: 3633 3634 sum1 = b[row]; 3635 for (n = 0; n<sz-1; n+=2) { 3636 i1 = idx[0]; 3637 i2 = idx[1]; 3638 idx += 2; 3639 tmp0 = x[i1]; 3640 tmp1 = x[i2]; 3641 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3642 } 3643 3644 if (n == sz-1) { 3645 tmp0 = x[*idx]; 3646 sum1 -= *v1*tmp0; 3647 } 3648 x[row] = sum1*(*ibdiag);row--; 3649 break; 3650 3651 case 2: 3652 3653 sum1 = b[row]; 3654 sum2 = b[row-1]; 3655 /* note that sum1 is associated with the second of the two rows */ 3656 v2 = a->a + diag[row-1] + 2; 3657 for (n = 0; n<sz-1; n+=2) { 3658 i1 = idx[0]; 3659 i2 = idx[1]; 3660 idx += 2; 3661 tmp0 = x[i1]; 3662 tmp1 = x[i2]; 3663 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3664 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3665 } 3666 3667 if (n == sz-1) { 3668 tmp0 = x[*idx]; 3669 sum1 -= *v1*tmp0; 3670 sum2 -= *v2*tmp0; 3671 } 3672 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3673 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3674 row -= 2; 3675 break; 3676 case 3: 3677 3678 sum1 = b[row]; 3679 sum2 = b[row-1]; 3680 sum3 = b[row-2]; 3681 v2 = a->a + diag[row-1] + 2; 3682 v3 = a->a + diag[row-2] + 3; 3683 for (n = 0; n<sz-1; n+=2) { 3684 i1 = idx[0]; 3685 i2 = idx[1]; 3686 idx += 2; 3687 tmp0 = x[i1]; 3688 tmp1 = x[i2]; 3689 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3690 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3691 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3692 } 3693 3694 if (n == sz-1) { 3695 tmp0 = x[*idx]; 3696 sum1 -= *v1*tmp0; 3697 sum2 -= *v2*tmp0; 3698 sum3 -= *v3*tmp0; 3699 } 3700 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3701 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3702 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3703 row -= 3; 3704 break; 3705 case 4: 3706 3707 sum1 = b[row]; 3708 sum2 = b[row-1]; 3709 sum3 = b[row-2]; 3710 sum4 = b[row-3]; 3711 v2 = a->a + diag[row-1] + 2; 3712 v3 = a->a + diag[row-2] + 3; 3713 v4 = a->a + diag[row-3] + 4; 3714 for (n = 0; n<sz-1; n+=2) { 3715 i1 = idx[0]; 3716 i2 = idx[1]; 3717 idx += 2; 3718 tmp0 = x[i1]; 3719 tmp1 = x[i2]; 3720 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3721 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3722 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3723 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3724 } 3725 3726 if (n == sz-1) { 3727 tmp0 = x[*idx]; 3728 sum1 -= *v1*tmp0; 3729 sum2 -= *v2*tmp0; 3730 sum3 -= *v3*tmp0; 3731 sum4 -= *v4*tmp0; 3732 } 3733 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3734 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3735 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3736 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3737 row -= 4; 3738 break; 3739 case 5: 3740 3741 sum1 = b[row]; 3742 sum2 = b[row-1]; 3743 sum3 = b[row-2]; 3744 sum4 = b[row-3]; 3745 sum5 = b[row-4]; 3746 v2 = a->a + diag[row-1] + 2; 3747 v3 = a->a + diag[row-2] + 3; 3748 v4 = a->a + diag[row-3] + 4; 3749 v5 = a->a + diag[row-4] + 5; 3750 for (n = 0; n<sz-1; n+=2) { 3751 i1 = idx[0]; 3752 i2 = idx[1]; 3753 idx += 2; 3754 tmp0 = x[i1]; 3755 tmp1 = x[i2]; 3756 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3757 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3758 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3759 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3760 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3761 } 3762 3763 if (n == sz-1) { 3764 tmp0 = x[*idx]; 3765 sum1 -= *v1*tmp0; 3766 sum2 -= *v2*tmp0; 3767 sum3 -= *v3*tmp0; 3768 sum4 -= *v4*tmp0; 3769 sum5 -= *v5*tmp0; 3770 } 3771 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3772 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3773 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3774 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3775 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3776 row -= 5; 3777 break; 3778 default: 3779 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3780 } 3781 } 3782 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3783 3784 /* 3785 t = b - D x where D is the block diagonal 3786 */ 3787 cnt = 0; 3788 for (i=0, row=0; i<m; i++) { 3789 switch (sizes[i]) { 3790 case 1: 3791 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3792 break; 3793 case 2: 3794 x1 = x[row]; x2 = x[row+1]; 3795 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3796 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3797 t[row] = b[row] - tmp1; 3798 t[row+1] = b[row+1] - tmp2; row += 2; 3799 cnt += 4; 3800 break; 3801 case 3: 3802 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3803 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3804 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3805 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3806 t[row] = b[row] - tmp1; 3807 t[row+1] = b[row+1] - tmp2; 3808 t[row+2] = b[row+2] - tmp3; row += 3; 3809 cnt += 9; 3810 break; 3811 case 4: 3812 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3813 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3814 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3815 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3816 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3817 t[row] = b[row] - tmp1; 3818 t[row+1] = b[row+1] - tmp2; 3819 t[row+2] = b[row+2] - tmp3; 3820 t[row+3] = b[row+3] - tmp4; row += 4; 3821 cnt += 16; 3822 break; 3823 case 5: 3824 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3825 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3826 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3827 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3828 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3829 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3830 t[row] = b[row] - tmp1; 3831 t[row+1] = b[row+1] - tmp2; 3832 t[row+2] = b[row+2] - tmp3; 3833 t[row+3] = b[row+3] - tmp4; 3834 t[row+4] = b[row+4] - tmp5;row += 5; 3835 cnt += 25; 3836 break; 3837 default: 3838 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3839 } 3840 } 3841 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3842 3843 3844 3845 /* 3846 Apply (L + D)^-1 where D is the block diagonal 3847 */ 3848 for (i=0, row=0; i<m; i++) { 3849 sz = diag[row] - ii[row]; 3850 v1 = a->a + ii[row]; 3851 idx = a->j + ii[row]; 3852 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3853 switch (sizes[i]) { 3854 case 1: 3855 3856 sum1 = t[row]; 3857 for (n = 0; n<sz-1; n+=2) { 3858 i1 = idx[0]; 3859 i2 = idx[1]; 3860 idx += 2; 3861 tmp0 = t[i1]; 3862 tmp1 = t[i2]; 3863 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3864 } 3865 3866 if (n == sz-1) { 3867 tmp0 = t[*idx]; 3868 sum1 -= *v1 * tmp0; 3869 } 3870 x[row] += t[row] = sum1*(*ibdiag++); row++; 3871 break; 3872 case 2: 3873 v2 = a->a + ii[row+1]; 3874 sum1 = t[row]; 3875 sum2 = t[row+1]; 3876 for (n = 0; n<sz-1; n+=2) { 3877 i1 = idx[0]; 3878 i2 = idx[1]; 3879 idx += 2; 3880 tmp0 = t[i1]; 3881 tmp1 = t[i2]; 3882 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3883 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3884 } 3885 3886 if (n == sz-1) { 3887 tmp0 = t[*idx]; 3888 sum1 -= v1[0] * tmp0; 3889 sum2 -= v2[0] * tmp0; 3890 } 3891 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3892 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3893 ibdiag += 4; row += 2; 3894 break; 3895 case 3: 3896 v2 = a->a + ii[row+1]; 3897 v3 = a->a + ii[row+2]; 3898 sum1 = t[row]; 3899 sum2 = t[row+1]; 3900 sum3 = t[row+2]; 3901 for (n = 0; n<sz-1; n+=2) { 3902 i1 = idx[0]; 3903 i2 = idx[1]; 3904 idx += 2; 3905 tmp0 = t[i1]; 3906 tmp1 = t[i2]; 3907 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3908 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3909 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3910 } 3911 3912 if (n == sz-1) { 3913 tmp0 = t[*idx]; 3914 sum1 -= v1[0] * tmp0; 3915 sum2 -= v2[0] * tmp0; 3916 sum3 -= v3[0] * tmp0; 3917 } 3918 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3919 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3920 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3921 ibdiag += 9; row += 3; 3922 break; 3923 case 4: 3924 v2 = a->a + ii[row+1]; 3925 v3 = a->a + ii[row+2]; 3926 v4 = a->a + ii[row+3]; 3927 sum1 = t[row]; 3928 sum2 = t[row+1]; 3929 sum3 = t[row+2]; 3930 sum4 = t[row+3]; 3931 for (n = 0; n<sz-1; n+=2) { 3932 i1 = idx[0]; 3933 i2 = idx[1]; 3934 idx += 2; 3935 tmp0 = t[i1]; 3936 tmp1 = t[i2]; 3937 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3938 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3939 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3940 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3941 } 3942 3943 if (n == sz-1) { 3944 tmp0 = t[*idx]; 3945 sum1 -= v1[0] * tmp0; 3946 sum2 -= v2[0] * tmp0; 3947 sum3 -= v3[0] * tmp0; 3948 sum4 -= v4[0] * tmp0; 3949 } 3950 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3951 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3952 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3953 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3954 ibdiag += 16; row += 4; 3955 break; 3956 case 5: 3957 v2 = a->a + ii[row+1]; 3958 v3 = a->a + ii[row+2]; 3959 v4 = a->a + ii[row+3]; 3960 v5 = a->a + ii[row+4]; 3961 sum1 = t[row]; 3962 sum2 = t[row+1]; 3963 sum3 = t[row+2]; 3964 sum4 = t[row+3]; 3965 sum5 = t[row+4]; 3966 for (n = 0; n<sz-1; n+=2) { 3967 i1 = idx[0]; 3968 i2 = idx[1]; 3969 idx += 2; 3970 tmp0 = t[i1]; 3971 tmp1 = t[i2]; 3972 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3973 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3974 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3975 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3976 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3977 } 3978 3979 if (n == sz-1) { 3980 tmp0 = t[*idx]; 3981 sum1 -= v1[0] * tmp0; 3982 sum2 -= v2[0] * tmp0; 3983 sum3 -= v3[0] * tmp0; 3984 sum4 -= v4[0] * tmp0; 3985 sum5 -= v5[0] * tmp0; 3986 } 3987 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3988 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3989 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3990 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3991 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3992 ibdiag += 25; row += 5; 3993 break; 3994 default: 3995 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3996 } 3997 } 3998 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3999 } 4000 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4001 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4002 PetscFunctionReturn(0); 4003 } 4004 4005 #undef __FUNCT__ 4006 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 4007 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 4008 { 4009 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4010 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 4011 const MatScalar *bdiag = a->inode.bdiag; 4012 const PetscScalar *b; 4013 PetscErrorCode ierr; 4014 PetscInt m = a->inode.node_count,cnt = 0,i,row; 4015 const PetscInt *sizes = a->inode.size; 4016 4017 PetscFunctionBegin; 4018 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4019 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4020 cnt = 0; 4021 for (i=0, row=0; i<m; i++) { 4022 switch (sizes[i]) { 4023 case 1: 4024 x[row] = b[row]*bdiag[cnt++];row++; 4025 break; 4026 case 2: 4027 x1 = b[row]; x2 = b[row+1]; 4028 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 4029 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 4030 x[row++] = tmp1; 4031 x[row++] = tmp2; 4032 cnt += 4; 4033 break; 4034 case 3: 4035 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4036 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4037 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4038 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4039 x[row++] = tmp1; 4040 x[row++] = tmp2; 4041 x[row++] = tmp3; 4042 cnt += 9; 4043 break; 4044 case 4: 4045 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4046 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4047 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4048 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4049 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4050 x[row++] = tmp1; 4051 x[row++] = tmp2; 4052 x[row++] = tmp3; 4053 x[row++] = tmp4; 4054 cnt += 16; 4055 break; 4056 case 5: 4057 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4058 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4059 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4060 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4061 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4062 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4063 x[row++] = tmp1; 4064 x[row++] = tmp2; 4065 x[row++] = tmp3; 4066 x[row++] = tmp4; 4067 x[row++] = tmp5; 4068 cnt += 25; 4069 break; 4070 default: 4071 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4072 } 4073 } 4074 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4075 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4076 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4077 PetscFunctionReturn(0); 4078 } 4079 4080 /* 4081 samestructure indicates that the matrix has not changed its nonzero structure so we 4082 do not need to recompute the inodes 4083 */ 4084 #undef __FUNCT__ 4085 #define __FUNCT__ "Mat_CheckInode" 4086 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 4087 { 4088 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4089 PetscErrorCode ierr; 4090 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4091 PetscBool flag; 4092 const PetscInt *idx,*idy,*ii; 4093 4094 PetscFunctionBegin; 4095 if (!a->inode.use) PetscFunctionReturn(0); 4096 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4097 4098 4099 m = A->rmap->n; 4100 if (a->inode.size) ns = a->inode.size; 4101 else { 4102 ierr = PetscMalloc1((m+1),&ns);CHKERRQ(ierr); 4103 } 4104 4105 i = 0; 4106 node_count = 0; 4107 idx = a->j; 4108 ii = a->i; 4109 while (i < m) { /* For each row */ 4110 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4111 /* Limits the number of elements in a node to 'a->inode.limit' */ 4112 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4113 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4114 if (nzy != nzx) break; 4115 idy += nzx; /* Same nonzero pattern */ 4116 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4117 if (!flag) break; 4118 } 4119 ns[node_count++] = blk_size; 4120 idx += blk_size*nzx; 4121 i = j; 4122 } 4123 /* If not enough inodes found,, do not use inode version of the routines */ 4124 if (!m || node_count > .8*m) { 4125 ierr = PetscFree(ns);CHKERRQ(ierr); 4126 4127 a->inode.node_count = 0; 4128 a->inode.size = NULL; 4129 a->inode.use = PETSC_FALSE; 4130 A->ops->mult = MatMult_SeqAIJ; 4131 A->ops->sor = MatSOR_SeqAIJ; 4132 A->ops->multadd = MatMultAdd_SeqAIJ; 4133 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4134 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4135 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4136 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4137 A->ops->coloringpatch = 0; 4138 A->ops->multdiagonalblock = 0; 4139 4140 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4141 } else { 4142 if (!A->factortype) { 4143 A->ops->mult = MatMult_SeqAIJ_Inode; 4144 A->ops->sor = MatSOR_SeqAIJ_Inode; 4145 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4146 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4147 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4148 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4149 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4150 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4151 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4152 } else { 4153 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4154 } 4155 a->inode.node_count = node_count; 4156 a->inode.size = ns; 4157 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4158 } 4159 a->inode.checked = PETSC_TRUE; 4160 PetscFunctionReturn(0); 4161 } 4162 4163 #undef __FUNCT__ 4164 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4165 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4166 { 4167 Mat B =*C; 4168 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4169 PetscErrorCode ierr; 4170 PetscInt m=A->rmap->n; 4171 4172 PetscFunctionBegin; 4173 c->inode.use = a->inode.use; 4174 c->inode.limit = a->inode.limit; 4175 c->inode.max_limit = a->inode.max_limit; 4176 if (a->inode.size) { 4177 ierr = PetscMalloc1((m+1),&c->inode.size);CHKERRQ(ierr); 4178 c->inode.node_count = a->inode.node_count; 4179 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4180 /* note the table of functions below should match that in Mat_CheckInode() */ 4181 if (!B->factortype) { 4182 B->ops->mult = MatMult_SeqAIJ_Inode; 4183 B->ops->sor = MatSOR_SeqAIJ_Inode; 4184 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4185 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4186 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4187 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4188 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4189 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4190 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4191 } else { 4192 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4193 } 4194 } else { 4195 c->inode.size = 0; 4196 c->inode.node_count = 0; 4197 } 4198 c->inode.ibdiagvalid = PETSC_FALSE; 4199 c->inode.ibdiag = 0; 4200 c->inode.bdiag = 0; 4201 PetscFunctionReturn(0); 4202 } 4203 4204 #undef __FUNCT__ 4205 #define __FUNCT__ "MatGetRow_FactoredLU" 4206 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4207 { 4208 PetscInt k; 4209 const PetscInt *vi; 4210 4211 PetscFunctionBegin; 4212 vi = aj + ai[row]; 4213 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4214 vi = aj + adiag[row]; 4215 cols[nzl] = vi[0]; 4216 vi = aj + adiag[row+1]+1; 4217 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4218 PetscFunctionReturn(0); 4219 } 4220 /* 4221 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4222 Modified from Mat_CheckInode(). 4223 4224 Input Parameters: 4225 . Mat A - ILU or LU matrix factor 4226 4227 */ 4228 #undef __FUNCT__ 4229 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4230 PetscErrorCode Mat_CheckInode_FactorLU(Mat A) 4231 { 4232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4233 PetscErrorCode ierr; 4234 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4235 PetscInt *cols1,*cols2,*ns; 4236 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4237 PetscBool flag; 4238 4239 PetscFunctionBegin; 4240 if (!a->inode.use) PetscFunctionReturn(0); 4241 if (a->inode.checked) PetscFunctionReturn(0); 4242 4243 m = A->rmap->n; 4244 if (a->inode.size) ns = a->inode.size; 4245 else { 4246 ierr = PetscMalloc1((m+1),&ns);CHKERRQ(ierr); 4247 } 4248 4249 i = 0; 4250 node_count = 0; 4251 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4252 while (i < m) { /* For each row */ 4253 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4254 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4255 nzx = nzl1 + nzu1 + 1; 4256 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4257 4258 /* Limits the number of elements in a node to 'a->inode.limit' */ 4259 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4260 nzl2 = ai[j+1] - ai[j]; 4261 nzu2 = adiag[j] - adiag[j+1] - 1; 4262 nzy = nzl2 + nzu2 + 1; 4263 if (nzy != nzx) break; 4264 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4265 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4266 if (!flag) break; 4267 } 4268 ns[node_count++] = blk_size; 4269 i = j; 4270 } 4271 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4272 /* If not enough inodes found,, do not use inode version of the routines */ 4273 if (!m || node_count > .8*m) { 4274 ierr = PetscFree(ns);CHKERRQ(ierr); 4275 4276 a->inode.node_count = 0; 4277 a->inode.size = NULL; 4278 a->inode.use = PETSC_FALSE; 4279 4280 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4281 } else { 4282 A->ops->mult = 0; 4283 A->ops->sor = 0; 4284 A->ops->multadd = 0; 4285 A->ops->getrowij = 0; 4286 A->ops->restorerowij = 0; 4287 A->ops->getcolumnij = 0; 4288 A->ops->restorecolumnij = 0; 4289 A->ops->coloringpatch = 0; 4290 A->ops->multdiagonalblock = 0; 4291 a->inode.node_count = node_count; 4292 a->inode.size = ns; 4293 4294 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4295 } 4296 a->inode.checked = PETSC_TRUE; 4297 PetscFunctionReturn(0); 4298 } 4299 4300 #undef __FUNCT__ 4301 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4302 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4303 { 4304 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4305 4306 PetscFunctionBegin; 4307 a->inode.ibdiagvalid = PETSC_FALSE; 4308 PetscFunctionReturn(0); 4309 } 4310 4311 /* 4312 This is really ugly. if inodes are used this replaces the 4313 permutations with ones that correspond to rows/cols of the matrix 4314 rather then inode blocks 4315 */ 4316 #undef __FUNCT__ 4317 #define __FUNCT__ "MatInodeAdjustForInodes" 4318 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4319 { 4320 PetscErrorCode ierr; 4321 4322 PetscFunctionBegin; 4323 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4324 PetscFunctionReturn(0); 4325 } 4326 4327 #undef __FUNCT__ 4328 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4329 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4330 { 4331 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4332 PetscErrorCode ierr; 4333 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4334 const PetscInt *ridx,*cidx; 4335 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4336 PetscInt nslim_col,*ns_col; 4337 IS ris = *rperm,cis = *cperm; 4338 4339 PetscFunctionBegin; 4340 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4341 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4342 4343 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4344 ierr = PetscMalloc1((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1),&tns);CHKERRQ(ierr); 4345 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4346 4347 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4348 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4349 4350 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4351 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4352 4353 /* Construct the permutations for rows*/ 4354 for (i=0,row = 0; i<nslim_row; ++i) { 4355 indx = ridx[i]; 4356 start_val = tns[indx]; 4357 end_val = tns[indx + 1]; 4358 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4359 } 4360 4361 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4362 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4363 4364 /* Construct permutations for columns */ 4365 for (i=0,col=0; i<nslim_col; ++i) { 4366 indx = cidx[i]; 4367 start_val = tns[indx]; 4368 end_val = tns[indx + 1]; 4369 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4370 } 4371 4372 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4373 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4374 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4375 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4376 4377 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4378 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4379 4380 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4381 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4382 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4383 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4384 ierr = PetscFree(tns);CHKERRQ(ierr); 4385 PetscFunctionReturn(0); 4386 } 4387 4388 #undef __FUNCT__ 4389 #define __FUNCT__ "MatInodeGetInodeSizes" 4390 /*@C 4391 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4392 4393 Not Collective 4394 4395 Input Parameter: 4396 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4397 4398 Output Parameter: 4399 + node_count - no of inodes present in the matrix. 4400 . sizes - an array of size node_count,with sizes of each inode. 4401 - limit - the max size used to generate the inodes. 4402 4403 Level: advanced 4404 4405 Notes: This routine returns some internal storage information 4406 of the matrix, it is intended to be used by advanced users. 4407 It should be called after the matrix is assembled. 4408 The contents of the sizes[] array should not be changed. 4409 NULL may be passed for information not requested. 4410 4411 .keywords: matrix, seqaij, get, inode 4412 4413 .seealso: MatGetInfo() 4414 @*/ 4415 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4416 { 4417 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4418 4419 PetscFunctionBegin; 4420 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4421 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4422 if (f) { 4423 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4424 } 4425 PetscFunctionReturn(0); 4426 } 4427 4428 #undef __FUNCT__ 4429 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4430 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4431 { 4432 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4433 4434 PetscFunctionBegin; 4435 if (node_count) *node_count = a->inode.node_count; 4436 if (sizes) *sizes = a->inode.size; 4437 if (limit) *limit = a->inode.limit; 4438 PetscFunctionReturn(0); 4439 } 4440