1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 67 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1) { 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,nsz,i1,i2; 157 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 158 159 PetscFunctionBegin; 160 nslim_row = a->inode.node_count; 161 n = A->cmap->n; 162 163 /* Create The column_inode for this matrix */ 164 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 165 166 /* allocate space for reformated column_inode structure */ 167 ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr); 168 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 169 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 170 171 for (i1=0,col=0; i1<nslim_col; ++i1) { 172 nsz = ns_col[i1]; 173 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 PetscFunctionReturn(0); 264 } 265 266 /* ----------------------------------------------------------- */ 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscErrorCode ierr; 274 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 275 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 276 277 PetscFunctionBegin; 278 nslim_row = a->inode.node_count; 279 n = A->cmap->n; 280 281 /* Create The column_inode for this matrix */ 282 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 283 284 /* allocate space for reformated column_inode structure */ 285 ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr); 286 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 287 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 288 289 for (i1=0,col=0; i1<nslim_col; ++i1) { 290 nsz = ns_col[i1]; 291 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 292 } 293 /* allocate space for column pointers */ 294 ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 295 *iia = ia; 296 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 297 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 298 299 /* determine the number of columns in each row */ 300 ia[0] = oshift; 301 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 302 j = aj + ai[row] + ishift; 303 col = *j++ + ishift; 304 i2 = tvc[col]; 305 nz = ai[row+1] - ai[row]; 306 while (nz-- > 0) { /* off-diagonal elemets */ 307 /* ia[i1+1]++; */ 308 ia[i2+1]++; 309 i2++; 310 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 311 if (nz > 0) i2 = tvc[col]; 312 } 313 } 314 315 /* shift ia[i] to point to next col */ 316 for (i1=1; i1<nslim_col+1; i1++) { 317 col = ia[i1-1]; 318 ia[i1] += col; 319 work[i1-1] = col - oshift; 320 } 321 322 /* allocate space for column pointers */ 323 nz = ia[nslim_col] + (!ishift); 324 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 325 *jja = ja; 326 327 /* loop over matrix putting into ja */ 328 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 329 j = aj + ai[row] + ishift; 330 col = *j++ + ishift; 331 i2 = tvc[col]; 332 nz = ai[row+1] - ai[row]; 333 while (nz-- > 0) { 334 /* ja[work[i1]++] = i2 + oshift; */ 335 ja[work[i2]++] = i1 + oshift; 336 i2++; 337 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 338 if (nz > 0) i2 = tvc[col]; 339 } 340 } 341 ierr = PetscFree(ns_col);CHKERRQ(ierr); 342 ierr = PetscFree(work);CHKERRQ(ierr); 343 ierr = PetscFree(tns);CHKERRQ(ierr); 344 ierr = PetscFree(tvc);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 356 if (!ia) PetscFunctionReturn(0); 357 358 if (!blockcompressed) { 359 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 360 } else if (symmetric) { 361 /* Since the indices are symmetric it does'nt matter */ 362 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } else { 364 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 365 } 366 PetscFunctionReturn(0); 367 } 368 369 #undef __FUNCT__ 370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 372 { 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 if (!ia) PetscFunctionReturn(0); 377 if (!blockcompressed) { 378 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 379 } else { 380 ierr = PetscFree(*ia);CHKERRQ(ierr); 381 ierr = PetscFree(*ja);CHKERRQ(ierr); 382 } 383 PetscFunctionReturn(0); 384 } 385 386 /* ----------------------------------------------------------- */ 387 388 #undef __FUNCT__ 389 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 391 { 392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 393 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 394 PetscScalar *y; 395 const PetscScalar *x; 396 const MatScalar *v1,*v2,*v3,*v4,*v5; 397 PetscErrorCode ierr; 398 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 399 const PetscInt *idx,*ns,*ii; 400 401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 403 #endif 404 405 PetscFunctionBegin; 406 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 407 node_max = a->inode.node_count; 408 ns = a->inode.size; /* Node Size array */ 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 411 idx = a->j; 412 v1 = a->a; 413 ii = a->i; 414 415 for (i = 0,row = 0; i< node_max; ++i) { 416 nsz = ns[i]; 417 n = ii[1] - ii[0]; 418 nonzerorow += (n>0)*nsz; 419 ii += nsz; 420 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 421 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 422 sz = n; /* No of non zeros in this row */ 423 /* Switch on the size of Node */ 424 switch (nsz) { /* Each loop in 'case' is unrolled */ 425 case 1: 426 sum1 = 0.; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; /* The instructions are ordered to */ 430 i2 = idx[1]; /* make the compiler's job easy */ 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 } 436 437 if (n == sz-1) { /* Take care of the last nonzero */ 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ *tmp0; 440 } 441 y[row++]=sum1; 442 break; 443 case 2: 444 sum1 = 0.; 445 sum2 = 0.; 446 v2 = v1 + n; 447 448 for (n = 0; n< sz-1; n+=2) { 449 i1 = idx[0]; 450 i2 = idx[1]; 451 idx += 2; 452 tmp0 = x[i1]; 453 tmp1 = x[i2]; 454 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 455 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 456 } 457 if (n == sz-1) { 458 tmp0 = x[*idx++]; 459 sum1 += *v1++ * tmp0; 460 sum2 += *v2++ * tmp0; 461 } 462 y[row++]=sum1; 463 y[row++]=sum2; 464 v1 =v2; /* Since the next block to be processed starts there*/ 465 idx +=sz; 466 break; 467 case 3: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 v2 = v1 + n; 472 v3 = v2 + n; 473 474 for (n = 0; n< sz-1; n+=2) { 475 i1 = idx[0]; 476 i2 = idx[1]; 477 idx += 2; 478 tmp0 = x[i1]; 479 tmp1 = x[i2]; 480 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 } 490 y[row++]=sum1; 491 y[row++]=sum2; 492 y[row++]=sum3; 493 v1 =v3; /* Since the next block to be processed starts there*/ 494 idx +=2*sz; 495 break; 496 case 4: 497 sum1 = 0.; 498 sum2 = 0.; 499 sum3 = 0.; 500 sum4 = 0.; 501 v2 = v1 + n; 502 v3 = v2 + n; 503 v4 = v3 + n; 504 505 for (n = 0; n< sz-1; n+=2) { 506 i1 = idx[0]; 507 i2 = idx[1]; 508 idx += 2; 509 tmp0 = x[i1]; 510 tmp1 = x[i2]; 511 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 513 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 514 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 515 } 516 if (n == sz-1) { 517 tmp0 = x[*idx++]; 518 sum1 += *v1++ * tmp0; 519 sum2 += *v2++ * tmp0; 520 sum3 += *v3++ * tmp0; 521 sum4 += *v4++ * tmp0; 522 } 523 y[row++]=sum1; 524 y[row++]=sum2; 525 y[row++]=sum3; 526 y[row++]=sum4; 527 v1 =v4; /* Since the next block to be processed starts there*/ 528 idx +=3*sz; 529 break; 530 case 5: 531 sum1 = 0.; 532 sum2 = 0.; 533 sum3 = 0.; 534 sum4 = 0.; 535 sum5 = 0.; 536 v2 = v1 + n; 537 v3 = v2 + n; 538 v4 = v3 + n; 539 v5 = v4 + n; 540 541 for (n = 0; n<sz-1; n+=2) { 542 i1 = idx[0]; 543 i2 = idx[1]; 544 idx += 2; 545 tmp0 = x[i1]; 546 tmp1 = x[i2]; 547 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 548 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 549 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 550 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 551 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 552 } 553 if (n == sz-1) { 554 tmp0 = x[*idx++]; 555 sum1 += *v1++ * tmp0; 556 sum2 += *v2++ * tmp0; 557 sum3 += *v3++ * tmp0; 558 sum4 += *v4++ * tmp0; 559 sum5 += *v5++ * tmp0; 560 } 561 y[row++]=sum1; 562 y[row++]=sum2; 563 y[row++]=sum3; 564 y[row++]=sum4; 565 y[row++]=sum5; 566 v1 =v5; /* Since the next block to be processed starts there */ 567 idx +=4*sz; 568 break; 569 default: 570 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 571 } 572 } 573 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 575 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 576 PetscFunctionReturn(0); 577 } 578 /* ----------------------------------------------------------- */ 579 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 583 { 584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 585 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 586 const MatScalar *v1,*v2,*v3,*v4,*v5; 587 const PetscScalar *x; 588 PetscScalar *y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 591 const PetscInt *idx,*ns,*ii; 592 593 PetscFunctionBegin; 594 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 595 node_max = a->inode.node_count; 596 ns = a->inode.size; /* Node Size array */ 597 598 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 600 if (zz != yy) { 601 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 602 } else { 603 z = y; 604 } 605 zt = z; 606 607 idx = a->j; 608 v1 = a->a; 609 ii = a->i; 610 611 for (i = 0,row = 0; i< node_max; ++i) { 612 nsz = ns[i]; 613 n = ii[1] - ii[0]; 614 ii += nsz; 615 sz = n; /* No of non zeros in this row */ 616 /* Switch on the size of Node */ 617 switch (nsz) { /* Each loop in 'case' is unrolled */ 618 case 1: 619 sum1 = *zt++; 620 621 for (n = 0; n< sz-1; n+=2) { 622 i1 = idx[0]; /* The instructions are ordered to */ 623 i2 = idx[1]; /* make the compiler's job easy */ 624 idx += 2; 625 tmp0 = x[i1]; 626 tmp1 = x[i2]; 627 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 628 } 629 630 if (n == sz-1) { /* Take care of the last nonzero */ 631 tmp0 = x[*idx++]; 632 sum1 += *v1++ * tmp0; 633 } 634 y[row++]=sum1; 635 break; 636 case 2: 637 sum1 = *zt++; 638 sum2 = *zt++; 639 v2 = v1 + n; 640 641 for (n = 0; n< sz-1; n+=2) { 642 i1 = idx[0]; 643 i2 = idx[1]; 644 idx += 2; 645 tmp0 = x[i1]; 646 tmp1 = x[i2]; 647 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 648 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 649 } 650 if (n == sz-1) { 651 tmp0 = x[*idx++]; 652 sum1 += *v1++ * tmp0; 653 sum2 += *v2++ * tmp0; 654 } 655 y[row++]=sum1; 656 y[row++]=sum2; 657 v1 =v2; /* Since the next block to be processed starts there*/ 658 idx +=sz; 659 break; 660 case 3: 661 sum1 = *zt++; 662 sum2 = *zt++; 663 sum3 = *zt++; 664 v2 = v1 + n; 665 v3 = v2 + n; 666 667 for (n = 0; n< sz-1; n+=2) { 668 i1 = idx[0]; 669 i2 = idx[1]; 670 idx += 2; 671 tmp0 = x[i1]; 672 tmp1 = x[i2]; 673 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 674 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 675 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 676 } 677 if (n == sz-1) { 678 tmp0 = x[*idx++]; 679 sum1 += *v1++ * tmp0; 680 sum2 += *v2++ * tmp0; 681 sum3 += *v3++ * tmp0; 682 } 683 y[row++]=sum1; 684 y[row++]=sum2; 685 y[row++]=sum3; 686 v1 =v3; /* Since the next block to be processed starts there*/ 687 idx +=2*sz; 688 break; 689 case 4: 690 sum1 = *zt++; 691 sum2 = *zt++; 692 sum3 = *zt++; 693 sum4 = *zt++; 694 v2 = v1 + n; 695 v3 = v2 + n; 696 v4 = v3 + n; 697 698 for (n = 0; n< sz-1; n+=2) { 699 i1 = idx[0]; 700 i2 = idx[1]; 701 idx += 2; 702 tmp0 = x[i1]; 703 tmp1 = x[i2]; 704 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 705 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 706 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 707 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 708 } 709 if (n == sz-1) { 710 tmp0 = x[*idx++]; 711 sum1 += *v1++ * tmp0; 712 sum2 += *v2++ * tmp0; 713 sum3 += *v3++ * tmp0; 714 sum4 += *v4++ * tmp0; 715 } 716 y[row++]=sum1; 717 y[row++]=sum2; 718 y[row++]=sum3; 719 y[row++]=sum4; 720 v1 =v4; /* Since the next block to be processed starts there*/ 721 idx +=3*sz; 722 break; 723 case 5: 724 sum1 = *zt++; 725 sum2 = *zt++; 726 sum3 = *zt++; 727 sum4 = *zt++; 728 sum5 = *zt++; 729 v2 = v1 + n; 730 v3 = v2 + n; 731 v4 = v3 + n; 732 v5 = v4 + n; 733 734 for (n = 0; n<sz-1; n+=2) { 735 i1 = idx[0]; 736 i2 = idx[1]; 737 idx += 2; 738 tmp0 = x[i1]; 739 tmp1 = x[i2]; 740 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 741 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 742 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 743 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 744 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 745 } 746 if (n == sz-1) { 747 tmp0 = x[*idx++]; 748 sum1 += *v1++ * tmp0; 749 sum2 += *v2++ * tmp0; 750 sum3 += *v3++ * tmp0; 751 sum4 += *v4++ * tmp0; 752 sum5 += *v5++ * tmp0; 753 } 754 y[row++]=sum1; 755 y[row++]=sum2; 756 y[row++]=sum3; 757 y[row++]=sum4; 758 y[row++]=sum5; 759 v1 =v5; /* Since the next block to be processed starts there */ 760 idx +=4*sz; 761 break; 762 default: 763 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 764 } 765 } 766 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 767 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 768 if (zz != yy) { 769 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 770 } 771 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 772 PetscFunctionReturn(0); 773 } 774 775 /* ----------------------------------------------------------- */ 776 #undef __FUNCT__ 777 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 778 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 779 { 780 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 781 IS iscol = a->col,isrow = a->row; 782 PetscErrorCode ierr; 783 const PetscInt *r,*c,*rout,*cout; 784 PetscInt i,j,n = A->rmap->n,nz; 785 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 786 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp; 806 aa = a_a; 807 aj = a_j; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i) { 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 if (i < node_max-1) { 817 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 818 * but our indexing to determine it's size could. */ 819 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 820 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 821 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 822 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 823 } 824 825 switch (nsz) { /* Each loop in 'case' is unrolled */ 826 case 1: 827 sum1 = b[*r++]; 828 for (j=0; j<nz-1; j+=2) { 829 i0 = vi[0]; 830 i1 = vi[1]; 831 vi +=2; 832 tmp0 = tmps[i0]; 833 tmp1 = tmps[i1]; 834 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 835 } 836 if (j == nz-1) { 837 tmp0 = tmps[*vi++]; 838 sum1 -= *v1++ *tmp0; 839 } 840 tmp[row++]=sum1; 841 break; 842 case 2: 843 sum1 = b[*r++]; 844 sum2 = b[*r++]; 845 v2 = aa + ai[row+1]; 846 847 for (j=0; j<nz-1; j+=2) { 848 i0 = vi[0]; 849 i1 = vi[1]; 850 vi +=2; 851 tmp0 = tmps[i0]; 852 tmp1 = tmps[i1]; 853 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 854 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 855 } 856 if (j == nz-1) { 857 tmp0 = tmps[*vi++]; 858 sum1 -= *v1++ *tmp0; 859 sum2 -= *v2++ *tmp0; 860 } 861 sum2 -= *v2++ *sum1; 862 tmp[row++]=sum1; 863 tmp[row++]=sum2; 864 break; 865 case 3: 866 sum1 = b[*r++]; 867 sum2 = b[*r++]; 868 sum3 = b[*r++]; 869 v2 = aa + ai[row+1]; 870 v3 = aa + ai[row+2]; 871 872 for (j=0; j<nz-1; j+=2) { 873 i0 = vi[0]; 874 i1 = vi[1]; 875 vi +=2; 876 tmp0 = tmps[i0]; 877 tmp1 = tmps[i1]; 878 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 879 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 880 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 881 } 882 if (j == nz-1) { 883 tmp0 = tmps[*vi++]; 884 sum1 -= *v1++ *tmp0; 885 sum2 -= *v2++ *tmp0; 886 sum3 -= *v3++ *tmp0; 887 } 888 sum2 -= *v2++ * sum1; 889 sum3 -= *v3++ * sum1; 890 sum3 -= *v3++ * sum2; 891 892 tmp[row++]=sum1; 893 tmp[row++]=sum2; 894 tmp[row++]=sum3; 895 break; 896 897 case 4: 898 sum1 = b[*r++]; 899 sum2 = b[*r++]; 900 sum3 = b[*r++]; 901 sum4 = b[*r++]; 902 v2 = aa + ai[row+1]; 903 v3 = aa + ai[row+2]; 904 v4 = aa + ai[row+3]; 905 906 for (j=0; j<nz-1; j+=2) { 907 i0 = vi[0]; 908 i1 = vi[1]; 909 vi +=2; 910 tmp0 = tmps[i0]; 911 tmp1 = tmps[i1]; 912 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 913 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 914 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 915 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 916 } 917 if (j == nz-1) { 918 tmp0 = tmps[*vi++]; 919 sum1 -= *v1++ *tmp0; 920 sum2 -= *v2++ *tmp0; 921 sum3 -= *v3++ *tmp0; 922 sum4 -= *v4++ *tmp0; 923 } 924 sum2 -= *v2++ * sum1; 925 sum3 -= *v3++ * sum1; 926 sum4 -= *v4++ * sum1; 927 sum3 -= *v3++ * sum2; 928 sum4 -= *v4++ * sum2; 929 sum4 -= *v4++ * sum3; 930 931 tmp[row++]=sum1; 932 tmp[row++]=sum2; 933 tmp[row++]=sum3; 934 tmp[row++]=sum4; 935 break; 936 case 5: 937 sum1 = b[*r++]; 938 sum2 = b[*r++]; 939 sum3 = b[*r++]; 940 sum4 = b[*r++]; 941 sum5 = b[*r++]; 942 v2 = aa + ai[row+1]; 943 v3 = aa + ai[row+2]; 944 v4 = aa + ai[row+3]; 945 v5 = aa + ai[row+4]; 946 947 for (j=0; j<nz-1; j+=2) { 948 i0 = vi[0]; 949 i1 = vi[1]; 950 vi +=2; 951 tmp0 = tmps[i0]; 952 tmp1 = tmps[i1]; 953 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 954 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 955 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 957 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 958 } 959 if (j == nz-1) { 960 tmp0 = tmps[*vi++]; 961 sum1 -= *v1++ *tmp0; 962 sum2 -= *v2++ *tmp0; 963 sum3 -= *v3++ *tmp0; 964 sum4 -= *v4++ *tmp0; 965 sum5 -= *v5++ *tmp0; 966 } 967 968 sum2 -= *v2++ * sum1; 969 sum3 -= *v3++ * sum1; 970 sum4 -= *v4++ * sum1; 971 sum5 -= *v5++ * sum1; 972 sum3 -= *v3++ * sum2; 973 sum4 -= *v4++ * sum2; 974 sum5 -= *v5++ * sum2; 975 sum4 -= *v4++ * sum3; 976 sum5 -= *v5++ * sum3; 977 sum5 -= *v5++ * sum4; 978 979 tmp[row++]=sum1; 980 tmp[row++]=sum2; 981 tmp[row++]=sum3; 982 tmp[row++]=sum4; 983 tmp[row++]=sum5; 984 break; 985 default: 986 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 987 } 988 } 989 /* backward solve the upper triangular */ 990 for (i=node_max -1,row = n-1; i>=0; i--) { 991 nsz = ns[i]; 992 aii = ai[row+1] -1; 993 v1 = aa + aii; 994 vi = aj + aii; 995 nz = aii- ad[row]; 996 switch (nsz) { /* Each loop in 'case' is unrolled */ 997 case 1: 998 sum1 = tmp[row]; 999 1000 for (j=nz; j>1; j-=2) { 1001 vi -=2; 1002 i0 = vi[2]; 1003 i1 = vi[1]; 1004 tmp0 = tmps[i0]; 1005 tmp1 = tmps[i1]; 1006 v1 -= 2; 1007 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1008 } 1009 if (j==1) { 1010 tmp0 = tmps[*vi--]; 1011 sum1 -= *v1-- * tmp0; 1012 } 1013 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1014 break; 1015 case 2: 1016 sum1 = tmp[row]; 1017 sum2 = tmp[row -1]; 1018 v2 = aa + ai[row]-1; 1019 for (j=nz; j>1; j-=2) { 1020 vi -=2; 1021 i0 = vi[2]; 1022 i1 = vi[1]; 1023 tmp0 = tmps[i0]; 1024 tmp1 = tmps[i1]; 1025 v1 -= 2; 1026 v2 -= 2; 1027 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1028 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1029 } 1030 if (j==1) { 1031 tmp0 = tmps[*vi--]; 1032 sum1 -= *v1-- * tmp0; 1033 sum2 -= *v2-- * tmp0; 1034 } 1035 1036 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1037 sum2 -= *v2-- * tmp0; 1038 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1039 break; 1040 case 3: 1041 sum1 = tmp[row]; 1042 sum2 = tmp[row -1]; 1043 sum3 = tmp[row -2]; 1044 v2 = aa + ai[row]-1; 1045 v3 = aa + ai[row -1]-1; 1046 for (j=nz; j>1; j-=2) { 1047 vi -=2; 1048 i0 = vi[2]; 1049 i1 = vi[1]; 1050 tmp0 = tmps[i0]; 1051 tmp1 = tmps[i1]; 1052 v1 -= 2; 1053 v2 -= 2; 1054 v3 -= 2; 1055 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1056 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1057 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1058 } 1059 if (j==1) { 1060 tmp0 = tmps[*vi--]; 1061 sum1 -= *v1-- * tmp0; 1062 sum2 -= *v2-- * tmp0; 1063 sum3 -= *v3-- * tmp0; 1064 } 1065 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1066 sum2 -= *v2-- * tmp0; 1067 sum3 -= *v3-- * tmp0; 1068 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1069 sum3 -= *v3-- * tmp0; 1070 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1071 1072 break; 1073 case 4: 1074 sum1 = tmp[row]; 1075 sum2 = tmp[row -1]; 1076 sum3 = tmp[row -2]; 1077 sum4 = tmp[row -3]; 1078 v2 = aa + ai[row]-1; 1079 v3 = aa + ai[row -1]-1; 1080 v4 = aa + ai[row -2]-1; 1081 1082 for (j=nz; j>1; j-=2) { 1083 vi -=2; 1084 i0 = vi[2]; 1085 i1 = vi[1]; 1086 tmp0 = tmps[i0]; 1087 tmp1 = tmps[i1]; 1088 v1 -= 2; 1089 v2 -= 2; 1090 v3 -= 2; 1091 v4 -= 2; 1092 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1093 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1094 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1095 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1096 } 1097 if (j==1) { 1098 tmp0 = tmps[*vi--]; 1099 sum1 -= *v1-- * tmp0; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 } 1104 1105 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1106 sum2 -= *v2-- * tmp0; 1107 sum3 -= *v3-- * tmp0; 1108 sum4 -= *v4-- * tmp0; 1109 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1113 sum4 -= *v4-- * tmp0; 1114 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1115 break; 1116 case 5: 1117 sum1 = tmp[row]; 1118 sum2 = tmp[row -1]; 1119 sum3 = tmp[row -2]; 1120 sum4 = tmp[row -3]; 1121 sum5 = tmp[row -4]; 1122 v2 = aa + ai[row]-1; 1123 v3 = aa + ai[row -1]-1; 1124 v4 = aa + ai[row -2]-1; 1125 v5 = aa + ai[row -3]-1; 1126 for (j=nz; j>1; j-=2) { 1127 vi -= 2; 1128 i0 = vi[2]; 1129 i1 = vi[1]; 1130 tmp0 = tmps[i0]; 1131 tmp1 = tmps[i1]; 1132 v1 -= 2; 1133 v2 -= 2; 1134 v3 -= 2; 1135 v4 -= 2; 1136 v5 -= 2; 1137 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1138 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1139 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1140 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1141 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1142 } 1143 if (j==1) { 1144 tmp0 = tmps[*vi--]; 1145 sum1 -= *v1-- * tmp0; 1146 sum2 -= *v2-- * tmp0; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 } 1151 1152 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1153 sum2 -= *v2-- * tmp0; 1154 sum3 -= *v3-- * tmp0; 1155 sum4 -= *v4-- * tmp0; 1156 sum5 -= *v5-- * tmp0; 1157 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1158 sum3 -= *v3-- * tmp0; 1159 sum4 -= *v4-- * tmp0; 1160 sum5 -= *v5-- * tmp0; 1161 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1162 sum4 -= *v4-- * tmp0; 1163 sum5 -= *v5-- * tmp0; 1164 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1165 sum5 -= *v5-- * tmp0; 1166 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1167 break; 1168 default: 1169 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1170 } 1171 } 1172 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1173 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1174 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1176 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1177 PetscFunctionReturn(0); 1178 } 1179 1180 #undef __FUNCT__ 1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1183 { 1184 Mat C =B; 1185 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1186 IS isrow = b->row,isicol = b->icol; 1187 PetscErrorCode ierr; 1188 const PetscInt *r,*ic,*ics; 1189 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1190 PetscInt i,j,k,nz,nzL,row,*pj; 1191 const PetscInt *ajtmp,*bjtmp; 1192 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1193 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1194 FactorShiftCtx sctx; 1195 const PetscInt *ddiag; 1196 PetscReal rs; 1197 MatScalar d; 1198 PetscInt inod,nodesz,node_max,col; 1199 const PetscInt *ns; 1200 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1201 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1216 if (rs>sctx.shift_top) sctx.shift_top = rs; 1217 } 1218 sctx.shift_top *= 1.1; 1219 sctx.nshift_max = 5; 1220 sctx.shift_lo = 0.; 1221 sctx.shift_hi = 1.; 1222 } 1223 1224 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1225 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1226 1227 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1228 ics = ic; 1229 1230 node_max = a->inode.node_count; 1231 ns = a->inode.size; 1232 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1233 1234 /* If max inode size > 4, split it into two inodes.*/ 1235 /* also map the inode sizes according to the ordering */ 1236 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1237 for (i=0,j=0; i<node_max; ++i,++j) { 1238 if (ns[i] > 4) { 1239 tmp_vec1[j] = 4; 1240 ++j; 1241 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1242 } else { 1243 tmp_vec1[j] = ns[i]; 1244 } 1245 } 1246 /* Use the correct node_max */ 1247 node_max = j; 1248 1249 /* Now reorder the inode info based on mat re-ordering info */ 1250 /* First create a row -> inode_size_array_index map */ 1251 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1252 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1253 for (i=0,row=0; i<node_max; i++) { 1254 nodesz = tmp_vec1[i]; 1255 for (j=0; j<nodesz; j++,row++) { 1256 nsmap[row] = i; 1257 } 1258 } 1259 /* Using nsmap, create a reordered ns structure */ 1260 for (i=0,j=0; i< node_max; i++) { 1261 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1262 tmp_vec2[i] = nodesz; 1263 j += nodesz; 1264 } 1265 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1266 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1267 1268 /* Now use the correct ns */ 1269 ns = tmp_vec2; 1270 1271 do { 1272 sctx.newshift = PETSC_FALSE; 1273 /* Now loop over each block-row, and do the factorization */ 1274 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1275 nodesz = ns[inod]; 1276 1277 switch (nodesz) { 1278 case 1: 1279 /*----------*/ 1280 /* zero rtmp1 */ 1281 /* L part */ 1282 nz = bi[i+1] - bi[i]; 1283 bjtmp = bj + bi[i]; 1284 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1285 1286 /* U part */ 1287 nz = bdiag[i]-bdiag[i+1]; 1288 bjtmp = bj + bdiag[i+1]+1; 1289 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1290 1291 /* load in initial (unfactored row) */ 1292 nz = ai[r[i]+1] - ai[r[i]]; 1293 ajtmp = aj + ai[r[i]]; 1294 v = aa + ai[r[i]]; 1295 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1296 1297 /* ZeropivotApply() */ 1298 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1299 1300 /* elimination */ 1301 bjtmp = bj + bi[i]; 1302 row = *bjtmp++; 1303 nzL = bi[i+1] - bi[i]; 1304 for (k=0; k < nzL; k++) { 1305 pc = rtmp1 + row; 1306 if (*pc != 0.0) { 1307 pv = b->a + bdiag[row]; 1308 mul1 = *pc * (*pv); 1309 *pc = mul1; 1310 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1311 pv = b->a + bdiag[row+1]+1; 1312 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1313 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1314 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1315 } 1316 row = *bjtmp++; 1317 } 1318 1319 /* finished row so stick it into b->a */ 1320 rs = 0.0; 1321 /* L part */ 1322 pv = b->a + bi[i]; 1323 pj = b->j + bi[i]; 1324 nz = bi[i+1] - bi[i]; 1325 for (j=0; j<nz; j++) { 1326 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1327 } 1328 1329 /* U part */ 1330 pv = b->a + bdiag[i+1]+1; 1331 pj = b->j + bdiag[i+1]+1; 1332 nz = bdiag[i] - bdiag[i+1]-1; 1333 for (j=0; j<nz; j++) { 1334 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1335 } 1336 1337 /* Check zero pivot */ 1338 sctx.rs = rs; 1339 sctx.pv = rtmp1[i]; 1340 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1341 if (sctx.newshift) break; 1342 1343 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1344 pv = b->a + bdiag[i]; 1345 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1346 break; 1347 1348 case 2: 1349 /*----------*/ 1350 /* zero rtmp1 and rtmp2 */ 1351 /* L part */ 1352 nz = bi[i+1] - bi[i]; 1353 bjtmp = bj + bi[i]; 1354 for (j=0; j<nz; j++) { 1355 col = bjtmp[j]; 1356 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1357 } 1358 1359 /* U part */ 1360 nz = bdiag[i]-bdiag[i+1]; 1361 bjtmp = bj + bdiag[i+1]+1; 1362 for (j=0; j<nz; j++) { 1363 col = bjtmp[j]; 1364 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1365 } 1366 1367 /* load in initial (unfactored row) */ 1368 nz = ai[r[i]+1] - ai[r[i]]; 1369 ajtmp = aj + ai[r[i]]; 1370 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1371 for (j=0; j<nz; j++) { 1372 col = ics[ajtmp[j]]; 1373 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1374 } 1375 /* ZeropivotApply(): shift the diagonal of the matrix */ 1376 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1377 1378 /* elimination */ 1379 bjtmp = bj + bi[i]; 1380 row = *bjtmp++; /* pivot row */ 1381 nzL = bi[i+1] - bi[i]; 1382 for (k=0; k < nzL; k++) { 1383 pc1 = rtmp1 + row; 1384 pc2 = rtmp2 + row; 1385 if (*pc1 != 0.0 || *pc2 != 0.0) { 1386 pv = b->a + bdiag[row]; 1387 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1388 *pc1 = mul1; *pc2 = mul2; 1389 1390 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1391 pv = b->a + bdiag[row+1]+1; 1392 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1393 for (j=0; j<nz; j++) { 1394 col = pj[j]; 1395 rtmp1[col] -= mul1 * pv[j]; 1396 rtmp2[col] -= mul2 * pv[j]; 1397 } 1398 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1399 } 1400 row = *bjtmp++; 1401 } 1402 1403 /* finished row i; check zero pivot, then stick row i into b->a */ 1404 rs = 0.0; 1405 /* L part */ 1406 pc1 = b->a + bi[i]; 1407 pj = b->j + bi[i]; 1408 nz = bi[i+1] - bi[i]; 1409 for (j=0; j<nz; j++) { 1410 col = pj[j]; 1411 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1412 } 1413 /* U part */ 1414 pc1 = b->a + bdiag[i+1]+1; 1415 pj = b->j + bdiag[i+1]+1; 1416 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1417 for (j=0; j<nz; j++) { 1418 col = pj[j]; 1419 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1420 } 1421 1422 sctx.rs = rs; 1423 sctx.pv = rtmp1[i]; 1424 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1425 if (sctx.newshift) break; 1426 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1427 *pc1 = 1.0/sctx.pv; 1428 1429 /* Now take care of diagonal 2x2 block. */ 1430 pc2 = rtmp2 + i; 1431 if (*pc2 != 0.0) { 1432 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1433 *pc2 = mul1; /* insert L entry */ 1434 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1435 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1436 for (j=0; j<nz; j++) { 1437 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1438 } 1439 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1440 } 1441 1442 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1443 rs = 0.0; 1444 /* L part */ 1445 pc2 = b->a + bi[i+1]; 1446 pj = b->j + bi[i+1]; 1447 nz = bi[i+2] - bi[i+1]; 1448 for (j=0; j<nz; j++) { 1449 col = pj[j]; 1450 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1451 } 1452 /* U part */ 1453 pc2 = b->a + bdiag[i+2]+1; 1454 pj = b->j + bdiag[i+2]+1; 1455 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1456 for (j=0; j<nz; j++) { 1457 col = pj[j]; 1458 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1459 } 1460 1461 sctx.rs = rs; 1462 sctx.pv = rtmp2[i+1]; 1463 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1464 if (sctx.newshift) break; 1465 pc2 = b->a + bdiag[i+1]; 1466 *pc2 = 1.0/sctx.pv; 1467 break; 1468 1469 case 3: 1470 /*----------*/ 1471 /* zero rtmp */ 1472 /* L part */ 1473 nz = bi[i+1] - bi[i]; 1474 bjtmp = bj + bi[i]; 1475 for (j=0; j<nz; j++) { 1476 col = bjtmp[j]; 1477 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1478 } 1479 1480 /* U part */ 1481 nz = bdiag[i]-bdiag[i+1]; 1482 bjtmp = bj + bdiag[i+1]+1; 1483 for (j=0; j<nz; j++) { 1484 col = bjtmp[j]; 1485 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1486 } 1487 1488 /* load in initial (unfactored row) */ 1489 nz = ai[r[i]+1] - ai[r[i]]; 1490 ajtmp = aj + ai[r[i]]; 1491 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1492 for (j=0; j<nz; j++) { 1493 col = ics[ajtmp[j]]; 1494 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1495 } 1496 /* ZeropivotApply(): shift the diagonal of the matrix */ 1497 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1498 1499 /* elimination */ 1500 bjtmp = bj + bi[i]; 1501 row = *bjtmp++; /* pivot row */ 1502 nzL = bi[i+1] - bi[i]; 1503 for (k=0; k < nzL; k++) { 1504 pc1 = rtmp1 + row; 1505 pc2 = rtmp2 + row; 1506 pc3 = rtmp3 + row; 1507 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1508 pv = b->a + bdiag[row]; 1509 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1510 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1511 1512 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1513 pv = b->a + bdiag[row+1]+1; 1514 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1515 for (j=0; j<nz; j++) { 1516 col = pj[j]; 1517 rtmp1[col] -= mul1 * pv[j]; 1518 rtmp2[col] -= mul2 * pv[j]; 1519 rtmp3[col] -= mul3 * pv[j]; 1520 } 1521 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1522 } 1523 row = *bjtmp++; 1524 } 1525 1526 /* finished row i; check zero pivot, then stick row i into b->a */ 1527 rs = 0.0; 1528 /* L part */ 1529 pc1 = b->a + bi[i]; 1530 pj = b->j + bi[i]; 1531 nz = bi[i+1] - bi[i]; 1532 for (j=0; j<nz; j++) { 1533 col = pj[j]; 1534 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1535 } 1536 /* U part */ 1537 pc1 = b->a + bdiag[i+1]+1; 1538 pj = b->j + bdiag[i+1]+1; 1539 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1540 for (j=0; j<nz; j++) { 1541 col = pj[j]; 1542 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1543 } 1544 1545 sctx.rs = rs; 1546 sctx.pv = rtmp1[i]; 1547 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1548 if (sctx.newshift) break; 1549 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1550 *pc1 = 1.0/sctx.pv; 1551 1552 /* Now take care of 1st column of diagonal 3x3 block. */ 1553 pc2 = rtmp2 + i; 1554 pc3 = rtmp3 + i; 1555 if (*pc2 != 0.0 || *pc3 != 0.0) { 1556 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1557 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1558 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1559 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1560 for (j=0; j<nz; j++) { 1561 col = pj[j]; 1562 rtmp2[col] -= mul2 * rtmp1[col]; 1563 rtmp3[col] -= mul3 * rtmp1[col]; 1564 } 1565 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1566 } 1567 1568 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1569 rs = 0.0; 1570 /* L part */ 1571 pc2 = b->a + bi[i+1]; 1572 pj = b->j + bi[i+1]; 1573 nz = bi[i+2] - bi[i+1]; 1574 for (j=0; j<nz; j++) { 1575 col = pj[j]; 1576 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1577 } 1578 /* U part */ 1579 pc2 = b->a + bdiag[i+2]+1; 1580 pj = b->j + bdiag[i+2]+1; 1581 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1582 for (j=0; j<nz; j++) { 1583 col = pj[j]; 1584 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1585 } 1586 1587 sctx.rs = rs; 1588 sctx.pv = rtmp2[i+1]; 1589 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1590 if (sctx.newshift) break; 1591 pc2 = b->a + bdiag[i+1]; 1592 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1593 1594 /* Now take care of 2nd column of diagonal 3x3 block. */ 1595 pc3 = rtmp3 + i+1; 1596 if (*pc3 != 0.0) { 1597 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1598 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1599 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1600 for (j=0; j<nz; j++) { 1601 col = pj[j]; 1602 rtmp3[col] -= mul3 * rtmp2[col]; 1603 } 1604 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1605 } 1606 1607 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1608 rs = 0.0; 1609 /* L part */ 1610 pc3 = b->a + bi[i+2]; 1611 pj = b->j + bi[i+2]; 1612 nz = bi[i+3] - bi[i+2]; 1613 for (j=0; j<nz; j++) { 1614 col = pj[j]; 1615 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1616 } 1617 /* U part */ 1618 pc3 = b->a + bdiag[i+3]+1; 1619 pj = b->j + bdiag[i+3]+1; 1620 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1621 for (j=0; j<nz; j++) { 1622 col = pj[j]; 1623 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1624 } 1625 1626 sctx.rs = rs; 1627 sctx.pv = rtmp3[i+2]; 1628 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1629 if (sctx.newshift) break; 1630 pc3 = b->a + bdiag[i+2]; 1631 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1632 break; 1633 case 4: 1634 /*----------*/ 1635 /* zero rtmp */ 1636 /* L part */ 1637 nz = bi[i+1] - bi[i]; 1638 bjtmp = bj + bi[i]; 1639 for (j=0; j<nz; j++) { 1640 col = bjtmp[j]; 1641 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1642 } 1643 1644 /* U part */ 1645 nz = bdiag[i]-bdiag[i+1]; 1646 bjtmp = bj + bdiag[i+1]+1; 1647 for (j=0; j<nz; j++) { 1648 col = bjtmp[j]; 1649 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1650 } 1651 1652 /* load in initial (unfactored row) */ 1653 nz = ai[r[i]+1] - ai[r[i]]; 1654 ajtmp = aj + ai[r[i]]; 1655 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1656 for (j=0; j<nz; j++) { 1657 col = ics[ajtmp[j]]; 1658 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1659 } 1660 /* ZeropivotApply(): shift the diagonal of the matrix */ 1661 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1662 1663 /* elimination */ 1664 bjtmp = bj + bi[i]; 1665 row = *bjtmp++; /* pivot row */ 1666 nzL = bi[i+1] - bi[i]; 1667 for (k=0; k < nzL; k++) { 1668 pc1 = rtmp1 + row; 1669 pc2 = rtmp2 + row; 1670 pc3 = rtmp3 + row; 1671 pc4 = rtmp4 + row; 1672 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1673 pv = b->a + bdiag[row]; 1674 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1675 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1676 1677 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1678 pv = b->a + bdiag[row+1]+1; 1679 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1680 for (j=0; j<nz; j++) { 1681 col = pj[j]; 1682 rtmp1[col] -= mul1 * pv[j]; 1683 rtmp2[col] -= mul2 * pv[j]; 1684 rtmp3[col] -= mul3 * pv[j]; 1685 rtmp4[col] -= mul4 * pv[j]; 1686 } 1687 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1688 } 1689 row = *bjtmp++; 1690 } 1691 1692 /* finished row i; check zero pivot, then stick row i into b->a */ 1693 rs = 0.0; 1694 /* L part */ 1695 pc1 = b->a + bi[i]; 1696 pj = b->j + bi[i]; 1697 nz = bi[i+1] - bi[i]; 1698 for (j=0; j<nz; j++) { 1699 col = pj[j]; 1700 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1701 } 1702 /* U part */ 1703 pc1 = b->a + bdiag[i+1]+1; 1704 pj = b->j + bdiag[i+1]+1; 1705 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1706 for (j=0; j<nz; j++) { 1707 col = pj[j]; 1708 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1709 } 1710 1711 sctx.rs = rs; 1712 sctx.pv = rtmp1[i]; 1713 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1714 if (sctx.newshift) break; 1715 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1716 *pc1 = 1.0/sctx.pv; 1717 1718 /* Now take care of 1st column of diagonal 4x4 block. */ 1719 pc2 = rtmp2 + i; 1720 pc3 = rtmp3 + i; 1721 pc4 = rtmp4 + i; 1722 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1723 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1724 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1725 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1726 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1727 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1728 for (j=0; j<nz; j++) { 1729 col = pj[j]; 1730 rtmp2[col] -= mul2 * rtmp1[col]; 1731 rtmp3[col] -= mul3 * rtmp1[col]; 1732 rtmp4[col] -= mul4 * rtmp1[col]; 1733 } 1734 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1735 } 1736 1737 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1738 rs = 0.0; 1739 /* L part */ 1740 pc2 = b->a + bi[i+1]; 1741 pj = b->j + bi[i+1]; 1742 nz = bi[i+2] - bi[i+1]; 1743 for (j=0; j<nz; j++) { 1744 col = pj[j]; 1745 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1746 } 1747 /* U part */ 1748 pc2 = b->a + bdiag[i+2]+1; 1749 pj = b->j + bdiag[i+2]+1; 1750 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1751 for (j=0; j<nz; j++) { 1752 col = pj[j]; 1753 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1754 } 1755 1756 sctx.rs = rs; 1757 sctx.pv = rtmp2[i+1]; 1758 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1759 if (sctx.newshift) break; 1760 pc2 = b->a + bdiag[i+1]; 1761 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1762 1763 /* Now take care of 2nd column of diagonal 4x4 block. */ 1764 pc3 = rtmp3 + i+1; 1765 pc4 = rtmp4 + i+1; 1766 if (*pc3 != 0.0 || *pc4 != 0.0) { 1767 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1768 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1769 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1770 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1771 for (j=0; j<nz; j++) { 1772 col = pj[j]; 1773 rtmp3[col] -= mul3 * rtmp2[col]; 1774 rtmp4[col] -= mul4 * rtmp2[col]; 1775 } 1776 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1777 } 1778 1779 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1780 rs = 0.0; 1781 /* L part */ 1782 pc3 = b->a + bi[i+2]; 1783 pj = b->j + bi[i+2]; 1784 nz = bi[i+3] - bi[i+2]; 1785 for (j=0; j<nz; j++) { 1786 col = pj[j]; 1787 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1788 } 1789 /* U part */ 1790 pc3 = b->a + bdiag[i+3]+1; 1791 pj = b->j + bdiag[i+3]+1; 1792 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1793 for (j=0; j<nz; j++) { 1794 col = pj[j]; 1795 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1796 } 1797 1798 sctx.rs = rs; 1799 sctx.pv = rtmp3[i+2]; 1800 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1801 if (sctx.newshift) break; 1802 pc3 = b->a + bdiag[i+2]; 1803 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1804 1805 /* Now take care of 3rd column of diagonal 4x4 block. */ 1806 pc4 = rtmp4 + i+2; 1807 if (*pc4 != 0.0) { 1808 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1809 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1810 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1811 for (j=0; j<nz; j++) { 1812 col = pj[j]; 1813 rtmp4[col] -= mul4 * rtmp3[col]; 1814 } 1815 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1816 } 1817 1818 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1819 rs = 0.0; 1820 /* L part */ 1821 pc4 = b->a + bi[i+3]; 1822 pj = b->j + bi[i+3]; 1823 nz = bi[i+4] - bi[i+3]; 1824 for (j=0; j<nz; j++) { 1825 col = pj[j]; 1826 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1827 } 1828 /* U part */ 1829 pc4 = b->a + bdiag[i+4]+1; 1830 pj = b->j + bdiag[i+4]+1; 1831 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1832 for (j=0; j<nz; j++) { 1833 col = pj[j]; 1834 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1835 } 1836 1837 sctx.rs = rs; 1838 sctx.pv = rtmp4[i+3]; 1839 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1840 if (sctx.newshift) break; 1841 pc4 = b->a + bdiag[i+3]; 1842 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1843 break; 1844 1845 default: 1846 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1847 } 1848 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1849 i += nodesz; /* Update the row */ 1850 } 1851 1852 /* MatPivotRefine() */ 1853 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1854 /* 1855 * if no shift in this attempt & shifting & started shifting & can refine, 1856 * then try lower shift 1857 */ 1858 sctx.shift_hi = sctx.shift_fraction; 1859 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1860 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1861 sctx.newshift = PETSC_TRUE; 1862 sctx.nshift++; 1863 } 1864 } while (sctx.newshift); 1865 1866 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1867 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1868 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1869 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1870 1871 if (b->inode.size) { 1872 C->ops->solve = MatSolve_SeqAIJ_Inode; 1873 } else { 1874 C->ops->solve = MatSolve_SeqAIJ; 1875 } 1876 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1877 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1878 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1879 C->ops->matsolve = MatMatSolve_SeqAIJ; 1880 C->assembled = PETSC_TRUE; 1881 C->preallocated = PETSC_TRUE; 1882 1883 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1884 1885 /* MatShiftView(A,info,&sctx) */ 1886 if (sctx.nshift) { 1887 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1888 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1889 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1890 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1891 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1892 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1893 } 1894 } 1895 PetscFunctionReturn(0); 1896 } 1897 1898 #undef __FUNCT__ 1899 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1900 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1901 { 1902 Mat C = B; 1903 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1904 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1905 PetscErrorCode ierr; 1906 const PetscInt *r,*ic,*c,*ics; 1907 PetscInt n = A->rmap->n,*bi = b->i; 1908 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1909 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1910 PetscInt *ai = a->i,*aj = a->j; 1911 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1912 PetscScalar mul1,mul2,mul3,tmp; 1913 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1914 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1915 PetscReal rs=0.0; 1916 FactorShiftCtx sctx; 1917 1918 PetscFunctionBegin; 1919 sctx.shift_top = 0; 1920 sctx.nshift_max = 0; 1921 sctx.shift_lo = 0; 1922 sctx.shift_hi = 0; 1923 sctx.shift_fraction = 0; 1924 1925 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1926 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1927 sctx.shift_top = 0; 1928 for (i=0; i<n; i++) { 1929 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1930 rs = 0.0; 1931 ajtmp = aj + ai[i]; 1932 rtmp1 = aa + ai[i]; 1933 nz = ai[i+1] - ai[i]; 1934 for (j=0; j<nz; j++) { 1935 if (*ajtmp != i) { 1936 rs += PetscAbsScalar(*rtmp1++); 1937 } else { 1938 rs -= PetscRealPart(*rtmp1++); 1939 } 1940 ajtmp++; 1941 } 1942 if (rs>sctx.shift_top) sctx.shift_top = rs; 1943 } 1944 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1945 sctx.shift_top *= 1.1; 1946 sctx.nshift_max = 5; 1947 sctx.shift_lo = 0.; 1948 sctx.shift_hi = 1.; 1949 } 1950 sctx.shift_amount = 0; 1951 sctx.nshift = 0; 1952 1953 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1954 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1955 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1956 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1957 ics = ic; 1958 1959 node_max = a->inode.node_count; 1960 ns = a->inode.size; 1961 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1962 1963 /* If max inode size > 3, split it into two inodes.*/ 1964 /* also map the inode sizes according to the ordering */ 1965 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1966 for (i=0,j=0; i<node_max; ++i,++j) { 1967 if (ns[i]>3) { 1968 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1969 ++j; 1970 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1971 } else { 1972 tmp_vec1[j] = ns[i]; 1973 } 1974 } 1975 /* Use the correct node_max */ 1976 node_max = j; 1977 1978 /* Now reorder the inode info based on mat re-ordering info */ 1979 /* First create a row -> inode_size_array_index map */ 1980 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1981 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1982 for (i=0,row=0; i<node_max; i++) { 1983 nodesz = tmp_vec1[i]; 1984 for (j=0; j<nodesz; j++,row++) { 1985 nsmap[row] = i; 1986 } 1987 } 1988 /* Using nsmap, create a reordered ns structure */ 1989 for (i=0,j=0; i< node_max; i++) { 1990 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1991 tmp_vec2[i] = nodesz; 1992 j += nodesz; 1993 } 1994 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1995 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1996 /* Now use the correct ns */ 1997 ns = tmp_vec2; 1998 1999 do { 2000 sctx.newshift = PETSC_FALSE; 2001 /* Now loop over each block-row, and do the factorization */ 2002 for (i=0,row=0; i<node_max; i++) { 2003 nodesz = ns[i]; 2004 nz = bi[row+1] - bi[row]; 2005 bjtmp = bj + bi[row]; 2006 2007 switch (nodesz) { 2008 case 1: 2009 for (j=0; j<nz; j++) { 2010 idx = bjtmp[j]; 2011 rtmp11[idx] = 0.0; 2012 } 2013 2014 /* load in initial (unfactored row) */ 2015 idx = r[row]; 2016 nz_tmp = ai[idx+1] - ai[idx]; 2017 ajtmp = aj + ai[idx]; 2018 v1 = aa + ai[idx]; 2019 2020 for (j=0; j<nz_tmp; j++) { 2021 idx = ics[ajtmp[j]]; 2022 rtmp11[idx] = v1[j]; 2023 } 2024 rtmp11[ics[r[row]]] += sctx.shift_amount; 2025 2026 prow = *bjtmp++; 2027 while (prow < row) { 2028 pc1 = rtmp11 + prow; 2029 if (*pc1 != 0.0) { 2030 pv = ba + bd[prow]; 2031 pj = nbj + bd[prow]; 2032 mul1 = *pc1 * *pv++; 2033 *pc1 = mul1; 2034 nz_tmp = bi[prow+1] - bd[prow] - 1; 2035 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2036 for (j=0; j<nz_tmp; j++) { 2037 tmp = pv[j]; 2038 idx = pj[j]; 2039 rtmp11[idx] -= mul1 * tmp; 2040 } 2041 } 2042 prow = *bjtmp++; 2043 } 2044 pj = bj + bi[row]; 2045 pc1 = ba + bi[row]; 2046 2047 sctx.pv = rtmp11[row]; 2048 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2049 rs = 0.0; 2050 for (j=0; j<nz; j++) { 2051 idx = pj[j]; 2052 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2053 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2054 } 2055 sctx.rs = rs; 2056 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2057 if (sctx.newshift) goto endofwhile; 2058 break; 2059 2060 case 2: 2061 for (j=0; j<nz; j++) { 2062 idx = bjtmp[j]; 2063 rtmp11[idx] = 0.0; 2064 rtmp22[idx] = 0.0; 2065 } 2066 2067 /* load in initial (unfactored row) */ 2068 idx = r[row]; 2069 nz_tmp = ai[idx+1] - ai[idx]; 2070 ajtmp = aj + ai[idx]; 2071 v1 = aa + ai[idx]; 2072 v2 = aa + ai[idx+1]; 2073 for (j=0; j<nz_tmp; j++) { 2074 idx = ics[ajtmp[j]]; 2075 rtmp11[idx] = v1[j]; 2076 rtmp22[idx] = v2[j]; 2077 } 2078 rtmp11[ics[r[row]]] += sctx.shift_amount; 2079 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2080 2081 prow = *bjtmp++; 2082 while (prow < row) { 2083 pc1 = rtmp11 + prow; 2084 pc2 = rtmp22 + prow; 2085 if (*pc1 != 0.0 || *pc2 != 0.0) { 2086 pv = ba + bd[prow]; 2087 pj = nbj + bd[prow]; 2088 mul1 = *pc1 * *pv; 2089 mul2 = *pc2 * *pv; 2090 ++pv; 2091 *pc1 = mul1; 2092 *pc2 = mul2; 2093 2094 nz_tmp = bi[prow+1] - bd[prow] - 1; 2095 for (j=0; j<nz_tmp; j++) { 2096 tmp = pv[j]; 2097 idx = pj[j]; 2098 rtmp11[idx] -= mul1 * tmp; 2099 rtmp22[idx] -= mul2 * tmp; 2100 } 2101 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2102 } 2103 prow = *bjtmp++; 2104 } 2105 2106 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2107 pc1 = rtmp11 + prow; 2108 pc2 = rtmp22 + prow; 2109 2110 sctx.pv = *pc1; 2111 pj = bj + bi[prow]; 2112 rs = 0.0; 2113 for (j=0; j<nz; j++) { 2114 idx = pj[j]; 2115 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2116 } 2117 sctx.rs = rs; 2118 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2119 if (sctx.newshift) goto endofwhile; 2120 2121 if (*pc2 != 0.0) { 2122 pj = nbj + bd[prow]; 2123 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2124 *pc2 = mul2; 2125 nz_tmp = bi[prow+1] - bd[prow] - 1; 2126 for (j=0; j<nz_tmp; j++) { 2127 idx = pj[j]; 2128 tmp = rtmp11[idx]; 2129 rtmp22[idx] -= mul2 * tmp; 2130 } 2131 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2132 } 2133 2134 pj = bj + bi[row]; 2135 pc1 = ba + bi[row]; 2136 pc2 = ba + bi[row+1]; 2137 2138 sctx.pv = rtmp22[row+1]; 2139 rs = 0.0; 2140 rtmp11[row] = 1.0/rtmp11[row]; 2141 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2142 /* copy row entries from dense representation to sparse */ 2143 for (j=0; j<nz; j++) { 2144 idx = pj[j]; 2145 pc1[j] = rtmp11[idx]; 2146 pc2[j] = rtmp22[idx]; 2147 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2148 } 2149 sctx.rs = rs; 2150 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2151 if (sctx.newshift) goto endofwhile; 2152 break; 2153 2154 case 3: 2155 for (j=0; j<nz; j++) { 2156 idx = bjtmp[j]; 2157 rtmp11[idx] = 0.0; 2158 rtmp22[idx] = 0.0; 2159 rtmp33[idx] = 0.0; 2160 } 2161 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2162 idx = r[row]; 2163 nz_tmp = ai[idx+1] - ai[idx]; 2164 ajtmp = aj + ai[idx]; 2165 v1 = aa + ai[idx]; 2166 v2 = aa + ai[idx+1]; 2167 v3 = aa + ai[idx+2]; 2168 for (j=0; j<nz_tmp; j++) { 2169 idx = ics[ajtmp[j]]; 2170 rtmp11[idx] = v1[j]; 2171 rtmp22[idx] = v2[j]; 2172 rtmp33[idx] = v3[j]; 2173 } 2174 rtmp11[ics[r[row]]] += sctx.shift_amount; 2175 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2176 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2177 2178 /* loop over all pivot row blocks above this row block */ 2179 prow = *bjtmp++; 2180 while (prow < row) { 2181 pc1 = rtmp11 + prow; 2182 pc2 = rtmp22 + prow; 2183 pc3 = rtmp33 + prow; 2184 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2185 pv = ba + bd[prow]; 2186 pj = nbj + bd[prow]; 2187 mul1 = *pc1 * *pv; 2188 mul2 = *pc2 * *pv; 2189 mul3 = *pc3 * *pv; 2190 ++pv; 2191 *pc1 = mul1; 2192 *pc2 = mul2; 2193 *pc3 = mul3; 2194 2195 nz_tmp = bi[prow+1] - bd[prow] - 1; 2196 /* update this row based on pivot row */ 2197 for (j=0; j<nz_tmp; j++) { 2198 tmp = pv[j]; 2199 idx = pj[j]; 2200 rtmp11[idx] -= mul1 * tmp; 2201 rtmp22[idx] -= mul2 * tmp; 2202 rtmp33[idx] -= mul3 * tmp; 2203 } 2204 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2205 } 2206 prow = *bjtmp++; 2207 } 2208 2209 /* Now take care of diagonal 3x3 block in this set of rows */ 2210 /* note: prow = row here */ 2211 pc1 = rtmp11 + prow; 2212 pc2 = rtmp22 + prow; 2213 pc3 = rtmp33 + prow; 2214 2215 sctx.pv = *pc1; 2216 pj = bj + bi[prow]; 2217 rs = 0.0; 2218 for (j=0; j<nz; j++) { 2219 idx = pj[j]; 2220 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2221 } 2222 sctx.rs = rs; 2223 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2224 if (sctx.newshift) goto endofwhile; 2225 2226 if (*pc2 != 0.0 || *pc3 != 0.0) { 2227 mul2 = (*pc2)/(*pc1); 2228 mul3 = (*pc3)/(*pc1); 2229 *pc2 = mul2; 2230 *pc3 = mul3; 2231 nz_tmp = bi[prow+1] - bd[prow] - 1; 2232 pj = nbj + bd[prow]; 2233 for (j=0; j<nz_tmp; j++) { 2234 idx = pj[j]; 2235 tmp = rtmp11[idx]; 2236 rtmp22[idx] -= mul2 * tmp; 2237 rtmp33[idx] -= mul3 * tmp; 2238 } 2239 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2240 } 2241 ++prow; 2242 2243 pc2 = rtmp22 + prow; 2244 pc3 = rtmp33 + prow; 2245 sctx.pv = *pc2; 2246 pj = bj + bi[prow]; 2247 rs = 0.0; 2248 for (j=0; j<nz; j++) { 2249 idx = pj[j]; 2250 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2251 } 2252 sctx.rs = rs; 2253 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2254 if (sctx.newshift) goto endofwhile; 2255 2256 if (*pc3 != 0.0) { 2257 mul3 = (*pc3)/(*pc2); 2258 *pc3 = mul3; 2259 pj = nbj + bd[prow]; 2260 nz_tmp = bi[prow+1] - bd[prow] - 1; 2261 for (j=0; j<nz_tmp; j++) { 2262 idx = pj[j]; 2263 tmp = rtmp22[idx]; 2264 rtmp33[idx] -= mul3 * tmp; 2265 } 2266 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2267 } 2268 2269 pj = bj + bi[row]; 2270 pc1 = ba + bi[row]; 2271 pc2 = ba + bi[row+1]; 2272 pc3 = ba + bi[row+2]; 2273 2274 sctx.pv = rtmp33[row+2]; 2275 rs = 0.0; 2276 rtmp11[row] = 1.0/rtmp11[row]; 2277 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2278 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2279 /* copy row entries from dense representation to sparse */ 2280 for (j=0; j<nz; j++) { 2281 idx = pj[j]; 2282 pc1[j] = rtmp11[idx]; 2283 pc2[j] = rtmp22[idx]; 2284 pc3[j] = rtmp33[idx]; 2285 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2286 } 2287 2288 sctx.rs = rs; 2289 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2290 if (sctx.newshift) goto endofwhile; 2291 break; 2292 2293 default: 2294 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2295 } 2296 row += nodesz; /* Update the row */ 2297 } 2298 endofwhile:; 2299 } while (sctx.newshift); 2300 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2301 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2302 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2303 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2304 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2305 2306 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2307 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2308 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2309 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2310 C->assembled = PETSC_TRUE; 2311 C->preallocated = PETSC_TRUE; 2312 if (sctx.nshift) { 2313 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2314 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2315 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2316 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2317 } 2318 } 2319 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2320 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2321 PetscFunctionReturn(0); 2322 } 2323 2324 2325 /* ----------------------------------------------------------- */ 2326 #undef __FUNCT__ 2327 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2328 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2329 { 2330 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2331 IS iscol = a->col,isrow = a->row; 2332 PetscErrorCode ierr; 2333 const PetscInt *r,*c,*rout,*cout; 2334 PetscInt i,j,n = A->rmap->n; 2335 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2336 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2337 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2338 PetscScalar sum1,sum2,sum3,sum4,sum5; 2339 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2340 const PetscScalar *b; 2341 2342 PetscFunctionBegin; 2343 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2344 node_max = a->inode.node_count; 2345 ns = a->inode.size; /* Node Size array */ 2346 2347 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2348 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2349 tmp = a->solve_work; 2350 2351 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2352 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2353 2354 /* forward solve the lower triangular */ 2355 tmps = tmp; 2356 aa = a_a; 2357 aj = a_j; 2358 ad = a->diag; 2359 2360 for (i = 0,row = 0; i< node_max; ++i) { 2361 nsz = ns[i]; 2362 aii = ai[row]; 2363 v1 = aa + aii; 2364 vi = aj + aii; 2365 nz = ai[row+1]- ai[row]; 2366 2367 if (i < node_max-1) { 2368 /* Prefetch the indices for the next block */ 2369 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2370 /* Prefetch the data for the next block */ 2371 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2372 } 2373 2374 switch (nsz) { /* Each loop in 'case' is unrolled */ 2375 case 1: 2376 sum1 = b[r[row]]; 2377 for (j=0; j<nz-1; j+=2) { 2378 i0 = vi[j]; 2379 i1 = vi[j+1]; 2380 tmp0 = tmps[i0]; 2381 tmp1 = tmps[i1]; 2382 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2383 } 2384 if (j == nz-1) { 2385 tmp0 = tmps[vi[j]]; 2386 sum1 -= v1[j]*tmp0; 2387 } 2388 tmp[row++]=sum1; 2389 break; 2390 case 2: 2391 sum1 = b[r[row]]; 2392 sum2 = b[r[row+1]]; 2393 v2 = aa + ai[row+1]; 2394 2395 for (j=0; j<nz-1; j+=2) { 2396 i0 = vi[j]; 2397 i1 = vi[j+1]; 2398 tmp0 = tmps[i0]; 2399 tmp1 = tmps[i1]; 2400 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2401 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2402 } 2403 if (j == nz-1) { 2404 tmp0 = tmps[vi[j]]; 2405 sum1 -= v1[j] *tmp0; 2406 sum2 -= v2[j] *tmp0; 2407 } 2408 sum2 -= v2[nz] * sum1; 2409 tmp[row++]=sum1; 2410 tmp[row++]=sum2; 2411 break; 2412 case 3: 2413 sum1 = b[r[row]]; 2414 sum2 = b[r[row+1]]; 2415 sum3 = b[r[row+2]]; 2416 v2 = aa + ai[row+1]; 2417 v3 = aa + ai[row+2]; 2418 2419 for (j=0; j<nz-1; j+=2) { 2420 i0 = vi[j]; 2421 i1 = vi[j+1]; 2422 tmp0 = tmps[i0]; 2423 tmp1 = tmps[i1]; 2424 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2425 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2426 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2427 } 2428 if (j == nz-1) { 2429 tmp0 = tmps[vi[j]]; 2430 sum1 -= v1[j] *tmp0; 2431 sum2 -= v2[j] *tmp0; 2432 sum3 -= v3[j] *tmp0; 2433 } 2434 sum2 -= v2[nz] * sum1; 2435 sum3 -= v3[nz] * sum1; 2436 sum3 -= v3[nz+1] * sum2; 2437 tmp[row++]=sum1; 2438 tmp[row++]=sum2; 2439 tmp[row++]=sum3; 2440 break; 2441 2442 case 4: 2443 sum1 = b[r[row]]; 2444 sum2 = b[r[row+1]]; 2445 sum3 = b[r[row+2]]; 2446 sum4 = b[r[row+3]]; 2447 v2 = aa + ai[row+1]; 2448 v3 = aa + ai[row+2]; 2449 v4 = aa + ai[row+3]; 2450 2451 for (j=0; j<nz-1; j+=2) { 2452 i0 = vi[j]; 2453 i1 = vi[j+1]; 2454 tmp0 = tmps[i0]; 2455 tmp1 = tmps[i1]; 2456 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2457 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2458 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2459 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2460 } 2461 if (j == nz-1) { 2462 tmp0 = tmps[vi[j]]; 2463 sum1 -= v1[j] *tmp0; 2464 sum2 -= v2[j] *tmp0; 2465 sum3 -= v3[j] *tmp0; 2466 sum4 -= v4[j] *tmp0; 2467 } 2468 sum2 -= v2[nz] * sum1; 2469 sum3 -= v3[nz] * sum1; 2470 sum4 -= v4[nz] * sum1; 2471 sum3 -= v3[nz+1] * sum2; 2472 sum4 -= v4[nz+1] * sum2; 2473 sum4 -= v4[nz+2] * sum3; 2474 2475 tmp[row++]=sum1; 2476 tmp[row++]=sum2; 2477 tmp[row++]=sum3; 2478 tmp[row++]=sum4; 2479 break; 2480 case 5: 2481 sum1 = b[r[row]]; 2482 sum2 = b[r[row+1]]; 2483 sum3 = b[r[row+2]]; 2484 sum4 = b[r[row+3]]; 2485 sum5 = b[r[row+4]]; 2486 v2 = aa + ai[row+1]; 2487 v3 = aa + ai[row+2]; 2488 v4 = aa + ai[row+3]; 2489 v5 = aa + ai[row+4]; 2490 2491 for (j=0; j<nz-1; j+=2) { 2492 i0 = vi[j]; 2493 i1 = vi[j+1]; 2494 tmp0 = tmps[i0]; 2495 tmp1 = tmps[i1]; 2496 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2497 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2498 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2499 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2500 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2501 } 2502 if (j == nz-1) { 2503 tmp0 = tmps[vi[j]]; 2504 sum1 -= v1[j] *tmp0; 2505 sum2 -= v2[j] *tmp0; 2506 sum3 -= v3[j] *tmp0; 2507 sum4 -= v4[j] *tmp0; 2508 sum5 -= v5[j] *tmp0; 2509 } 2510 2511 sum2 -= v2[nz] * sum1; 2512 sum3 -= v3[nz] * sum1; 2513 sum4 -= v4[nz] * sum1; 2514 sum5 -= v5[nz] * sum1; 2515 sum3 -= v3[nz+1] * sum2; 2516 sum4 -= v4[nz+1] * sum2; 2517 sum5 -= v5[nz+1] * sum2; 2518 sum4 -= v4[nz+2] * sum3; 2519 sum5 -= v5[nz+2] * sum3; 2520 sum5 -= v5[nz+3] * sum4; 2521 2522 tmp[row++]=sum1; 2523 tmp[row++]=sum2; 2524 tmp[row++]=sum3; 2525 tmp[row++]=sum4; 2526 tmp[row++]=sum5; 2527 break; 2528 default: 2529 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2530 } 2531 } 2532 /* backward solve the upper triangular */ 2533 for (i=node_max -1,row = n-1; i>=0; i--) { 2534 nsz = ns[i]; 2535 aii = ad[row+1] + 1; 2536 v1 = aa + aii; 2537 vi = aj + aii; 2538 nz = ad[row]- ad[row+1] - 1; 2539 2540 if (i > 0) { 2541 /* Prefetch the indices for the next block */ 2542 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2543 /* Prefetch the data for the next block */ 2544 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2545 } 2546 2547 switch (nsz) { /* Each loop in 'case' is unrolled */ 2548 case 1: 2549 sum1 = tmp[row]; 2550 2551 for (j=0; j<nz-1; j+=2) { 2552 i0 = vi[j]; 2553 i1 = vi[j+1]; 2554 tmp0 = tmps[i0]; 2555 tmp1 = tmps[i1]; 2556 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2557 } 2558 if (j == nz-1) { 2559 tmp0 = tmps[vi[j]]; 2560 sum1 -= v1[j]*tmp0; 2561 } 2562 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2563 break; 2564 case 2: 2565 sum1 = tmp[row]; 2566 sum2 = tmp[row-1]; 2567 v2 = aa + ad[row] + 1; 2568 for (j=0; j<nz-1; j+=2) { 2569 i0 = vi[j]; 2570 i1 = vi[j+1]; 2571 tmp0 = tmps[i0]; 2572 tmp1 = tmps[i1]; 2573 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2574 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2575 } 2576 if (j == nz-1) { 2577 tmp0 = tmps[vi[j]]; 2578 sum1 -= v1[j]* tmp0; 2579 sum2 -= v2[j+1]* tmp0; 2580 } 2581 2582 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2583 sum2 -= v2[0] * tmp0; 2584 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2585 break; 2586 case 3: 2587 sum1 = tmp[row]; 2588 sum2 = tmp[row -1]; 2589 sum3 = tmp[row -2]; 2590 v2 = aa + ad[row] + 1; 2591 v3 = aa + ad[row -1] + 1; 2592 for (j=0; j<nz-1; j+=2) { 2593 i0 = vi[j]; 2594 i1 = vi[j+1]; 2595 tmp0 = tmps[i0]; 2596 tmp1 = tmps[i1]; 2597 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2598 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2599 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2600 } 2601 if (j== nz-1) { 2602 tmp0 = tmps[vi[j]]; 2603 sum1 -= v1[j] * tmp0; 2604 sum2 -= v2[j+1] * tmp0; 2605 sum3 -= v3[j+2] * tmp0; 2606 } 2607 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2608 sum2 -= v2[0]* tmp0; 2609 sum3 -= v3[1] * tmp0; 2610 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2611 sum3 -= v3[0]* tmp0; 2612 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2613 2614 break; 2615 case 4: 2616 sum1 = tmp[row]; 2617 sum2 = tmp[row -1]; 2618 sum3 = tmp[row -2]; 2619 sum4 = tmp[row -3]; 2620 v2 = aa + ad[row]+1; 2621 v3 = aa + ad[row -1]+1; 2622 v4 = aa + ad[row -2]+1; 2623 2624 for (j=0; j<nz-1; j+=2) { 2625 i0 = vi[j]; 2626 i1 = vi[j+1]; 2627 tmp0 = tmps[i0]; 2628 tmp1 = tmps[i1]; 2629 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2630 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2631 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2632 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2633 } 2634 if (j== nz-1) { 2635 tmp0 = tmps[vi[j]]; 2636 sum1 -= v1[j] * tmp0; 2637 sum2 -= v2[j+1] * tmp0; 2638 sum3 -= v3[j+2] * tmp0; 2639 sum4 -= v4[j+3] * tmp0; 2640 } 2641 2642 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2643 sum2 -= v2[0] * tmp0; 2644 sum3 -= v3[1] * tmp0; 2645 sum4 -= v4[2] * tmp0; 2646 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2647 sum3 -= v3[0] * tmp0; 2648 sum4 -= v4[1] * tmp0; 2649 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2650 sum4 -= v4[0] * tmp0; 2651 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2652 break; 2653 case 5: 2654 sum1 = tmp[row]; 2655 sum2 = tmp[row -1]; 2656 sum3 = tmp[row -2]; 2657 sum4 = tmp[row -3]; 2658 sum5 = tmp[row -4]; 2659 v2 = aa + ad[row]+1; 2660 v3 = aa + ad[row -1]+1; 2661 v4 = aa + ad[row -2]+1; 2662 v5 = aa + ad[row -3]+1; 2663 for (j=0; j<nz-1; j+=2) { 2664 i0 = vi[j]; 2665 i1 = vi[j+1]; 2666 tmp0 = tmps[i0]; 2667 tmp1 = tmps[i1]; 2668 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2669 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2670 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2671 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2672 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2673 } 2674 if (j==nz-1) { 2675 tmp0 = tmps[vi[j]]; 2676 sum1 -= v1[j] * tmp0; 2677 sum2 -= v2[j+1] * tmp0; 2678 sum3 -= v3[j+2] * tmp0; 2679 sum4 -= v4[j+3] * tmp0; 2680 sum5 -= v5[j+4] * tmp0; 2681 } 2682 2683 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2684 sum2 -= v2[0] * tmp0; 2685 sum3 -= v3[1] * tmp0; 2686 sum4 -= v4[2] * tmp0; 2687 sum5 -= v5[3] * tmp0; 2688 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2689 sum3 -= v3[0] * tmp0; 2690 sum4 -= v4[1] * tmp0; 2691 sum5 -= v5[2] * tmp0; 2692 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2693 sum4 -= v4[0] * tmp0; 2694 sum5 -= v5[1] * tmp0; 2695 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2696 sum5 -= v5[0] * tmp0; 2697 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2698 break; 2699 default: 2700 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2701 } 2702 } 2703 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2704 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2705 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2706 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2707 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2708 PetscFunctionReturn(0); 2709 } 2710 2711 2712 /* 2713 Makes a longer coloring[] array and calls the usual code with that 2714 */ 2715 #undef __FUNCT__ 2716 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2717 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2718 { 2719 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2720 PetscErrorCode ierr; 2721 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2722 PetscInt *colorused,i; 2723 ISColoringValue *newcolor; 2724 2725 PetscFunctionBegin; 2726 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2727 /* loop over inodes, marking a color for each column*/ 2728 row = 0; 2729 for (i=0; i<m; i++) { 2730 for (j=0; j<ns[i]; j++) { 2731 newcolor[row++] = coloring[i] + j*ncolors; 2732 } 2733 } 2734 2735 /* eliminate unneeded colors */ 2736 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2737 for (i=0; i<n; i++) { 2738 colorused[newcolor[i]] = 1; 2739 } 2740 2741 for (i=1; i<5*ncolors; i++) { 2742 colorused[i] += colorused[i-1]; 2743 } 2744 ncolors = colorused[5*ncolors-1]; 2745 for (i=0; i<n; i++) { 2746 newcolor[i] = colorused[newcolor[i]]-1; 2747 } 2748 ierr = PetscFree(colorused);CHKERRQ(ierr); 2749 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2750 ierr = PetscFree(coloring);CHKERRQ(ierr); 2751 PetscFunctionReturn(0); 2752 } 2753 2754 #include <petsc-private/kernels/blockinvert.h> 2755 2756 #undef __FUNCT__ 2757 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2758 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2759 { 2760 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2761 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2762 MatScalar *ibdiag,*bdiag,work[25],*t; 2763 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2764 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2765 const PetscScalar *xb, *b; 2766 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2767 PetscErrorCode ierr; 2768 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2769 PetscInt sz,k,ipvt[5]; 2770 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2771 2772 PetscFunctionBegin; 2773 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2774 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2775 2776 if (!a->inode.ibdiagvalid) { 2777 if (!a->inode.ibdiag) { 2778 /* calculate space needed for diagonal blocks */ 2779 for (i=0; i<m; i++) { 2780 cnt += sizes[i]*sizes[i]; 2781 } 2782 a->inode.bdiagsize = cnt; 2783 2784 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2785 } 2786 2787 /* copy over the diagonal blocks and invert them */ 2788 ibdiag = a->inode.ibdiag; 2789 bdiag = a->inode.bdiag; 2790 cnt = 0; 2791 for (i=0, row = 0; i<m; i++) { 2792 for (j=0; j<sizes[i]; j++) { 2793 for (k=0; k<sizes[i]; k++) { 2794 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2795 } 2796 } 2797 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2798 2799 switch (sizes[i]) { 2800 case 1: 2801 /* Create matrix data structure */ 2802 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2803 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2804 break; 2805 case 2: 2806 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2807 break; 2808 case 3: 2809 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2810 break; 2811 case 4: 2812 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2813 break; 2814 case 5: 2815 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2816 break; 2817 default: 2818 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2819 } 2820 cnt += sizes[i]*sizes[i]; 2821 row += sizes[i]; 2822 } 2823 a->inode.ibdiagvalid = PETSC_TRUE; 2824 } 2825 ibdiag = a->inode.ibdiag; 2826 bdiag = a->inode.bdiag; 2827 t = a->inode.ssor_work; 2828 2829 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2830 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2831 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2832 if (flag & SOR_ZERO_INITIAL_GUESS) { 2833 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2834 2835 for (i=0, row=0; i<m; i++) { 2836 sz = diag[row] - ii[row]; 2837 v1 = a->a + ii[row]; 2838 idx = a->j + ii[row]; 2839 2840 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2841 switch (sizes[i]) { 2842 case 1: 2843 2844 sum1 = b[row]; 2845 for (n = 0; n<sz-1; n+=2) { 2846 i1 = idx[0]; 2847 i2 = idx[1]; 2848 idx += 2; 2849 tmp0 = x[i1]; 2850 tmp1 = x[i2]; 2851 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2852 } 2853 2854 if (n == sz-1) { 2855 tmp0 = x[*idx]; 2856 sum1 -= *v1 * tmp0; 2857 } 2858 t[row] = sum1; 2859 x[row++] = sum1*(*ibdiag++); 2860 break; 2861 case 2: 2862 v2 = a->a + ii[row+1]; 2863 sum1 = b[row]; 2864 sum2 = b[row+1]; 2865 for (n = 0; n<sz-1; n+=2) { 2866 i1 = idx[0]; 2867 i2 = idx[1]; 2868 idx += 2; 2869 tmp0 = x[i1]; 2870 tmp1 = x[i2]; 2871 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2872 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2873 } 2874 2875 if (n == sz-1) { 2876 tmp0 = x[*idx]; 2877 sum1 -= v1[0] * tmp0; 2878 sum2 -= v2[0] * tmp0; 2879 } 2880 t[row] = sum1; 2881 t[row+1] = sum2; 2882 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2883 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2884 ibdiag += 4; 2885 break; 2886 case 3: 2887 v2 = a->a + ii[row+1]; 2888 v3 = a->a + ii[row+2]; 2889 sum1 = b[row]; 2890 sum2 = b[row+1]; 2891 sum3 = b[row+2]; 2892 for (n = 0; n<sz-1; n+=2) { 2893 i1 = idx[0]; 2894 i2 = idx[1]; 2895 idx += 2; 2896 tmp0 = x[i1]; 2897 tmp1 = x[i2]; 2898 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2899 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2900 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2901 } 2902 2903 if (n == sz-1) { 2904 tmp0 = x[*idx]; 2905 sum1 -= v1[0] * tmp0; 2906 sum2 -= v2[0] * tmp0; 2907 sum3 -= v3[0] * tmp0; 2908 } 2909 t[row] = sum1; 2910 t[row+1] = sum2; 2911 t[row+2] = sum3; 2912 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2913 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2914 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2915 ibdiag += 9; 2916 break; 2917 case 4: 2918 v2 = a->a + ii[row+1]; 2919 v3 = a->a + ii[row+2]; 2920 v4 = a->a + ii[row+3]; 2921 sum1 = b[row]; 2922 sum2 = b[row+1]; 2923 sum3 = b[row+2]; 2924 sum4 = b[row+3]; 2925 for (n = 0; n<sz-1; n+=2) { 2926 i1 = idx[0]; 2927 i2 = idx[1]; 2928 idx += 2; 2929 tmp0 = x[i1]; 2930 tmp1 = x[i2]; 2931 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2932 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2933 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2934 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2935 } 2936 2937 if (n == sz-1) { 2938 tmp0 = x[*idx]; 2939 sum1 -= v1[0] * tmp0; 2940 sum2 -= v2[0] * tmp0; 2941 sum3 -= v3[0] * tmp0; 2942 sum4 -= v4[0] * tmp0; 2943 } 2944 t[row] = sum1; 2945 t[row+1] = sum2; 2946 t[row+2] = sum3; 2947 t[row+3] = sum4; 2948 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2949 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2950 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2951 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2952 ibdiag += 16; 2953 break; 2954 case 5: 2955 v2 = a->a + ii[row+1]; 2956 v3 = a->a + ii[row+2]; 2957 v4 = a->a + ii[row+3]; 2958 v5 = a->a + ii[row+4]; 2959 sum1 = b[row]; 2960 sum2 = b[row+1]; 2961 sum3 = b[row+2]; 2962 sum4 = b[row+3]; 2963 sum5 = b[row+4]; 2964 for (n = 0; n<sz-1; n+=2) { 2965 i1 = idx[0]; 2966 i2 = idx[1]; 2967 idx += 2; 2968 tmp0 = x[i1]; 2969 tmp1 = x[i2]; 2970 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2971 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2972 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2973 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2974 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2975 } 2976 2977 if (n == sz-1) { 2978 tmp0 = x[*idx]; 2979 sum1 -= v1[0] * tmp0; 2980 sum2 -= v2[0] * tmp0; 2981 sum3 -= v3[0] * tmp0; 2982 sum4 -= v4[0] * tmp0; 2983 sum5 -= v5[0] * tmp0; 2984 } 2985 t[row] = sum1; 2986 t[row+1] = sum2; 2987 t[row+2] = sum3; 2988 t[row+3] = sum4; 2989 t[row+4] = sum5; 2990 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2991 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2992 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2993 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2994 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2995 ibdiag += 25; 2996 break; 2997 default: 2998 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2999 } 3000 } 3001 3002 xb = t; 3003 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3004 } else xb = b; 3005 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3006 3007 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3008 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3009 ibdiag -= sizes[i]*sizes[i]; 3010 sz = ii[row+1] - diag[row] - 1; 3011 v1 = a->a + diag[row] + 1; 3012 idx = a->j + diag[row] + 1; 3013 3014 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3015 switch (sizes[i]) { 3016 case 1: 3017 3018 sum1 = xb[row]; 3019 for (n = 0; n<sz-1; n+=2) { 3020 i1 = idx[0]; 3021 i2 = idx[1]; 3022 idx += 2; 3023 tmp0 = x[i1]; 3024 tmp1 = x[i2]; 3025 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3026 } 3027 3028 if (n == sz-1) { 3029 tmp0 = x[*idx]; 3030 sum1 -= *v1*tmp0; 3031 } 3032 x[row--] = sum1*(*ibdiag); 3033 break; 3034 3035 case 2: 3036 3037 sum1 = xb[row]; 3038 sum2 = xb[row-1]; 3039 /* note that sum1 is associated with the second of the two rows */ 3040 v2 = a->a + diag[row-1] + 2; 3041 for (n = 0; n<sz-1; n+=2) { 3042 i1 = idx[0]; 3043 i2 = idx[1]; 3044 idx += 2; 3045 tmp0 = x[i1]; 3046 tmp1 = x[i2]; 3047 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3048 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3049 } 3050 3051 if (n == sz-1) { 3052 tmp0 = x[*idx]; 3053 sum1 -= *v1*tmp0; 3054 sum2 -= *v2*tmp0; 3055 } 3056 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3057 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3058 break; 3059 case 3: 3060 3061 sum1 = xb[row]; 3062 sum2 = xb[row-1]; 3063 sum3 = xb[row-2]; 3064 v2 = a->a + diag[row-1] + 2; 3065 v3 = a->a + diag[row-2] + 3; 3066 for (n = 0; n<sz-1; n+=2) { 3067 i1 = idx[0]; 3068 i2 = idx[1]; 3069 idx += 2; 3070 tmp0 = x[i1]; 3071 tmp1 = x[i2]; 3072 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3073 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3074 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3075 } 3076 3077 if (n == sz-1) { 3078 tmp0 = x[*idx]; 3079 sum1 -= *v1*tmp0; 3080 sum2 -= *v2*tmp0; 3081 sum3 -= *v3*tmp0; 3082 } 3083 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3084 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3085 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3086 break; 3087 case 4: 3088 3089 sum1 = xb[row]; 3090 sum2 = xb[row-1]; 3091 sum3 = xb[row-2]; 3092 sum4 = xb[row-3]; 3093 v2 = a->a + diag[row-1] + 2; 3094 v3 = a->a + diag[row-2] + 3; 3095 v4 = a->a + diag[row-3] + 4; 3096 for (n = 0; n<sz-1; n+=2) { 3097 i1 = idx[0]; 3098 i2 = idx[1]; 3099 idx += 2; 3100 tmp0 = x[i1]; 3101 tmp1 = x[i2]; 3102 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3103 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3104 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3105 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3106 } 3107 3108 if (n == sz-1) { 3109 tmp0 = x[*idx]; 3110 sum1 -= *v1*tmp0; 3111 sum2 -= *v2*tmp0; 3112 sum3 -= *v3*tmp0; 3113 sum4 -= *v4*tmp0; 3114 } 3115 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3116 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3117 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3118 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3119 break; 3120 case 5: 3121 3122 sum1 = xb[row]; 3123 sum2 = xb[row-1]; 3124 sum3 = xb[row-2]; 3125 sum4 = xb[row-3]; 3126 sum5 = xb[row-4]; 3127 v2 = a->a + diag[row-1] + 2; 3128 v3 = a->a + diag[row-2] + 3; 3129 v4 = a->a + diag[row-3] + 4; 3130 v5 = a->a + diag[row-4] + 5; 3131 for (n = 0; n<sz-1; n+=2) { 3132 i1 = idx[0]; 3133 i2 = idx[1]; 3134 idx += 2; 3135 tmp0 = x[i1]; 3136 tmp1 = x[i2]; 3137 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3138 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3139 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3140 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3141 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3142 } 3143 3144 if (n == sz-1) { 3145 tmp0 = x[*idx]; 3146 sum1 -= *v1*tmp0; 3147 sum2 -= *v2*tmp0; 3148 sum3 -= *v3*tmp0; 3149 sum4 -= *v4*tmp0; 3150 sum5 -= *v5*tmp0; 3151 } 3152 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3153 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3154 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3155 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3156 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3157 break; 3158 default: 3159 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3160 } 3161 } 3162 3163 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3164 } 3165 its--; 3166 } 3167 while (its--) { 3168 3169 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3170 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3171 i<m; 3172 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3173 3174 sz = diag[row] - ii[row]; 3175 v1 = a->a + ii[row]; 3176 idx = a->j + ii[row]; 3177 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3178 switch (sizes[i]) { 3179 case 1: 3180 sum1 = b[row]; 3181 for (n = 0; n<sz-1; n+=2) { 3182 i1 = idx[0]; 3183 i2 = idx[1]; 3184 idx += 2; 3185 tmp0 = x[i1]; 3186 tmp1 = x[i2]; 3187 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3188 } 3189 if (n == sz-1) { 3190 tmp0 = x[*idx++]; 3191 sum1 -= *v1 * tmp0; 3192 v1++; 3193 } 3194 t[row] = sum1; 3195 sz = ii[row+1] - diag[row] - 1; 3196 idx = a->j + diag[row] + 1; 3197 v1 += 1; 3198 for (n = 0; n<sz-1; n+=2) { 3199 i1 = idx[0]; 3200 i2 = idx[1]; 3201 idx += 2; 3202 tmp0 = x[i1]; 3203 tmp1 = x[i2]; 3204 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3205 } 3206 if (n == sz-1) { 3207 tmp0 = x[*idx++]; 3208 sum1 -= *v1 * tmp0; 3209 } 3210 /* in MatSOR_SeqAIJ this line would be 3211 * 3212 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3213 * 3214 * but omega == 1, so this becomes 3215 * 3216 * x[row] = sum1*(*ibdiag++); 3217 * 3218 */ 3219 x[row] = sum1*(*ibdiag); 3220 break; 3221 case 2: 3222 v2 = a->a + ii[row+1]; 3223 sum1 = b[row]; 3224 sum2 = b[row+1]; 3225 for (n = 0; n<sz-1; n+=2) { 3226 i1 = idx[0]; 3227 i2 = idx[1]; 3228 idx += 2; 3229 tmp0 = x[i1]; 3230 tmp1 = x[i2]; 3231 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3232 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3233 } 3234 if (n == sz-1) { 3235 tmp0 = x[*idx++]; 3236 sum1 -= v1[0] * tmp0; 3237 sum2 -= v2[0] * tmp0; 3238 v1++; v2++; 3239 } 3240 t[row] = sum1; 3241 t[row+1] = sum2; 3242 sz = ii[row+1] - diag[row] - 2; 3243 idx = a->j + diag[row] + 2; 3244 v1 += 2; 3245 v2 += 2; 3246 for (n = 0; n<sz-1; n+=2) { 3247 i1 = idx[0]; 3248 i2 = idx[1]; 3249 idx += 2; 3250 tmp0 = x[i1]; 3251 tmp1 = x[i2]; 3252 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3253 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3254 } 3255 if (n == sz-1) { 3256 tmp0 = x[*idx]; 3257 sum1 -= v1[0] * tmp0; 3258 sum2 -= v2[0] * tmp0; 3259 } 3260 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3261 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3262 break; 3263 case 3: 3264 v2 = a->a + ii[row+1]; 3265 v3 = a->a + ii[row+2]; 3266 sum1 = b[row]; 3267 sum2 = b[row+1]; 3268 sum3 = b[row+2]; 3269 for (n = 0; n<sz-1; n+=2) { 3270 i1 = idx[0]; 3271 i2 = idx[1]; 3272 idx += 2; 3273 tmp0 = x[i1]; 3274 tmp1 = x[i2]; 3275 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3276 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3277 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3278 } 3279 if (n == sz-1) { 3280 tmp0 = x[*idx++]; 3281 sum1 -= v1[0] * tmp0; 3282 sum2 -= v2[0] * tmp0; 3283 sum3 -= v3[0] * tmp0; 3284 v1++; v2++; v3++; 3285 } 3286 t[row] = sum1; 3287 t[row+1] = sum2; 3288 t[row+2] = sum3; 3289 sz = ii[row+1] - diag[row] - 3; 3290 idx = a->j + diag[row] + 3; 3291 v1 += 3; 3292 v2 += 3; 3293 v3 += 3; 3294 for (n = 0; n<sz-1; n+=2) { 3295 i1 = idx[0]; 3296 i2 = idx[1]; 3297 idx += 2; 3298 tmp0 = x[i1]; 3299 tmp1 = x[i2]; 3300 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3301 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3302 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3303 } 3304 if (n == sz-1) { 3305 tmp0 = x[*idx]; 3306 sum1 -= v1[0] * tmp0; 3307 sum2 -= v2[0] * tmp0; 3308 sum3 -= v3[0] * tmp0; 3309 } 3310 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3311 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3312 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3313 break; 3314 case 4: 3315 v2 = a->a + ii[row+1]; 3316 v3 = a->a + ii[row+2]; 3317 v4 = a->a + ii[row+3]; 3318 sum1 = b[row]; 3319 sum2 = b[row+1]; 3320 sum3 = b[row+2]; 3321 sum4 = b[row+3]; 3322 for (n = 0; n<sz-1; n+=2) { 3323 i1 = idx[0]; 3324 i2 = idx[1]; 3325 idx += 2; 3326 tmp0 = x[i1]; 3327 tmp1 = x[i2]; 3328 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3329 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3330 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3331 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3332 } 3333 if (n == sz-1) { 3334 tmp0 = x[*idx++]; 3335 sum1 -= v1[0] * tmp0; 3336 sum2 -= v2[0] * tmp0; 3337 sum3 -= v3[0] * tmp0; 3338 sum4 -= v4[0] * tmp0; 3339 v1++; v2++; v3++; v4++; 3340 } 3341 t[row] = sum1; 3342 t[row+1] = sum2; 3343 t[row+2] = sum3; 3344 t[row+3] = sum4; 3345 sz = ii[row+1] - diag[row] - 4; 3346 idx = a->j + diag[row] + 4; 3347 v1 += 4; 3348 v2 += 4; 3349 v3 += 4; 3350 v4 += 4; 3351 for (n = 0; n<sz-1; n+=2) { 3352 i1 = idx[0]; 3353 i2 = idx[1]; 3354 idx += 2; 3355 tmp0 = x[i1]; 3356 tmp1 = x[i2]; 3357 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3358 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3359 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3360 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3361 } 3362 if (n == sz-1) { 3363 tmp0 = x[*idx]; 3364 sum1 -= v1[0] * tmp0; 3365 sum2 -= v2[0] * tmp0; 3366 sum3 -= v3[0] * tmp0; 3367 sum4 -= v4[0] * tmp0; 3368 } 3369 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3370 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3371 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3372 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3373 break; 3374 case 5: 3375 v2 = a->a + ii[row+1]; 3376 v3 = a->a + ii[row+2]; 3377 v4 = a->a + ii[row+3]; 3378 v5 = a->a + ii[row+4]; 3379 sum1 = b[row]; 3380 sum2 = b[row+1]; 3381 sum3 = b[row+2]; 3382 sum4 = b[row+3]; 3383 sum5 = b[row+4]; 3384 for (n = 0; n<sz-1; n+=2) { 3385 i1 = idx[0]; 3386 i2 = idx[1]; 3387 idx += 2; 3388 tmp0 = x[i1]; 3389 tmp1 = x[i2]; 3390 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3391 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3392 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3393 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3394 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3395 } 3396 if (n == sz-1) { 3397 tmp0 = x[*idx++]; 3398 sum1 -= v1[0] * tmp0; 3399 sum2 -= v2[0] * tmp0; 3400 sum3 -= v3[0] * tmp0; 3401 sum4 -= v4[0] * tmp0; 3402 sum5 -= v5[0] * tmp0; 3403 v1++; v2++; v3++; v4++; v5++; 3404 } 3405 t[row] = sum1; 3406 t[row+1] = sum2; 3407 t[row+2] = sum3; 3408 t[row+3] = sum4; 3409 t[row+4] = sum5; 3410 sz = ii[row+1] - diag[row] - 5; 3411 idx = a->j + diag[row] + 5; 3412 v1 += 5; 3413 v2 += 5; 3414 v3 += 5; 3415 v4 += 5; 3416 v5 += 5; 3417 for (n = 0; n<sz-1; n+=2) { 3418 i1 = idx[0]; 3419 i2 = idx[1]; 3420 idx += 2; 3421 tmp0 = x[i1]; 3422 tmp1 = x[i2]; 3423 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3424 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3425 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3426 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3427 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3428 } 3429 if (n == sz-1) { 3430 tmp0 = x[*idx]; 3431 sum1 -= v1[0] * tmp0; 3432 sum2 -= v2[0] * tmp0; 3433 sum3 -= v3[0] * tmp0; 3434 sum4 -= v4[0] * tmp0; 3435 sum5 -= v5[0] * tmp0; 3436 } 3437 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3438 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3439 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3440 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3441 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3442 break; 3443 default: 3444 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3445 } 3446 } 3447 xb = t; 3448 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3449 } else xb = b; 3450 3451 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3452 3453 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3454 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3455 ibdiag -= sizes[i]*sizes[i]; 3456 3457 /* set RHS */ 3458 if (xb == b) { 3459 /* whole (old way) */ 3460 sz = ii[row+1] - ii[row]; 3461 idx = a->j + ii[row]; 3462 switch (sizes[i]) { 3463 case 5: 3464 v5 = a->a + ii[row-4]; 3465 case 4: /* fall through */ 3466 v4 = a->a + ii[row-3]; 3467 case 3: 3468 v3 = a->a + ii[row-2]; 3469 case 2: 3470 v2 = a->a + ii[row-1]; 3471 case 1: 3472 v1 = a->a + ii[row]; 3473 break; 3474 default: 3475 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3476 } 3477 } else { 3478 /* upper, no diag */ 3479 sz = ii[row+1] - diag[row] - 1; 3480 idx = a->j + diag[row] + 1; 3481 switch (sizes[i]) { 3482 case 5: 3483 v5 = a->a + diag[row-4] + 5; 3484 case 4: /* fall through */ 3485 v4 = a->a + diag[row-3] + 4; 3486 case 3: 3487 v3 = a->a + diag[row-2] + 3; 3488 case 2: 3489 v2 = a->a + diag[row-1] + 2; 3490 case 1: 3491 v1 = a->a + diag[row] + 1; 3492 } 3493 } 3494 /* set sum */ 3495 switch (sizes[i]) { 3496 case 5: 3497 sum5 = xb[row-4]; 3498 case 4: /* fall through */ 3499 sum4 = xb[row-3]; 3500 case 3: 3501 sum3 = xb[row-2]; 3502 case 2: 3503 sum2 = xb[row-1]; 3504 case 1: 3505 /* note that sum1 is associated with the last row */ 3506 sum1 = xb[row]; 3507 } 3508 /* do sums */ 3509 for (n = 0; n<sz-1; n+=2) { 3510 i1 = idx[0]; 3511 i2 = idx[1]; 3512 idx += 2; 3513 tmp0 = x[i1]; 3514 tmp1 = x[i2]; 3515 switch (sizes[i]) { 3516 case 5: 3517 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3518 case 4: /* fall through */ 3519 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3520 case 3: 3521 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3522 case 2: 3523 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3524 case 1: 3525 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3526 } 3527 } 3528 /* ragged edge */ 3529 if (n == sz-1) { 3530 tmp0 = x[*idx]; 3531 switch (sizes[i]) { 3532 case 5: 3533 sum5 -= *v5*tmp0; 3534 case 4: /* fall through */ 3535 sum4 -= *v4*tmp0; 3536 case 3: 3537 sum3 -= *v3*tmp0; 3538 case 2: 3539 sum2 -= *v2*tmp0; 3540 case 1: 3541 sum1 -= *v1*tmp0; 3542 } 3543 } 3544 /* update */ 3545 if (xb == b) { 3546 /* whole (old way) w/ diag */ 3547 switch (sizes[i]) { 3548 case 5: 3549 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3550 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3551 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3552 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3553 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3554 break; 3555 case 4: 3556 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3557 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3558 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3559 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3560 break; 3561 case 3: 3562 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3563 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3564 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3565 break; 3566 case 2: 3567 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3568 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3569 break; 3570 case 1: 3571 x[row--] += sum1*(*ibdiag); 3572 break; 3573 } 3574 } else { 3575 /* no diag so set = */ 3576 switch (sizes[i]) { 3577 case 5: 3578 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3579 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3580 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3581 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3582 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3583 break; 3584 case 4: 3585 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3586 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3587 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3588 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3589 break; 3590 case 3: 3591 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3592 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3593 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3594 break; 3595 case 2: 3596 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3597 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3598 break; 3599 case 1: 3600 x[row--] = sum1*(*ibdiag); 3601 break; 3602 } 3603 } 3604 } 3605 if (xb == b) { 3606 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3607 } else { 3608 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3609 } 3610 } 3611 } 3612 if (flag & SOR_EISENSTAT) { 3613 /* 3614 Apply (U + D)^-1 where D is now the block diagonal 3615 */ 3616 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3617 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3618 ibdiag -= sizes[i]*sizes[i]; 3619 sz = ii[row+1] - diag[row] - 1; 3620 v1 = a->a + diag[row] + 1; 3621 idx = a->j + diag[row] + 1; 3622 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3623 switch (sizes[i]) { 3624 case 1: 3625 3626 sum1 = b[row]; 3627 for (n = 0; n<sz-1; n+=2) { 3628 i1 = idx[0]; 3629 i2 = idx[1]; 3630 idx += 2; 3631 tmp0 = x[i1]; 3632 tmp1 = x[i2]; 3633 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3634 } 3635 3636 if (n == sz-1) { 3637 tmp0 = x[*idx]; 3638 sum1 -= *v1*tmp0; 3639 } 3640 x[row] = sum1*(*ibdiag);row--; 3641 break; 3642 3643 case 2: 3644 3645 sum1 = b[row]; 3646 sum2 = b[row-1]; 3647 /* note that sum1 is associated with the second of the two rows */ 3648 v2 = a->a + diag[row-1] + 2; 3649 for (n = 0; n<sz-1; n+=2) { 3650 i1 = idx[0]; 3651 i2 = idx[1]; 3652 idx += 2; 3653 tmp0 = x[i1]; 3654 tmp1 = x[i2]; 3655 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3656 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3657 } 3658 3659 if (n == sz-1) { 3660 tmp0 = x[*idx]; 3661 sum1 -= *v1*tmp0; 3662 sum2 -= *v2*tmp0; 3663 } 3664 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3665 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3666 row -= 2; 3667 break; 3668 case 3: 3669 3670 sum1 = b[row]; 3671 sum2 = b[row-1]; 3672 sum3 = b[row-2]; 3673 v2 = a->a + diag[row-1] + 2; 3674 v3 = a->a + diag[row-2] + 3; 3675 for (n = 0; n<sz-1; n+=2) { 3676 i1 = idx[0]; 3677 i2 = idx[1]; 3678 idx += 2; 3679 tmp0 = x[i1]; 3680 tmp1 = x[i2]; 3681 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3682 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3683 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3684 } 3685 3686 if (n == sz-1) { 3687 tmp0 = x[*idx]; 3688 sum1 -= *v1*tmp0; 3689 sum2 -= *v2*tmp0; 3690 sum3 -= *v3*tmp0; 3691 } 3692 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3693 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3694 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3695 row -= 3; 3696 break; 3697 case 4: 3698 3699 sum1 = b[row]; 3700 sum2 = b[row-1]; 3701 sum3 = b[row-2]; 3702 sum4 = b[row-3]; 3703 v2 = a->a + diag[row-1] + 2; 3704 v3 = a->a + diag[row-2] + 3; 3705 v4 = a->a + diag[row-3] + 4; 3706 for (n = 0; n<sz-1; n+=2) { 3707 i1 = idx[0]; 3708 i2 = idx[1]; 3709 idx += 2; 3710 tmp0 = x[i1]; 3711 tmp1 = x[i2]; 3712 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3713 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3714 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3715 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3716 } 3717 3718 if (n == sz-1) { 3719 tmp0 = x[*idx]; 3720 sum1 -= *v1*tmp0; 3721 sum2 -= *v2*tmp0; 3722 sum3 -= *v3*tmp0; 3723 sum4 -= *v4*tmp0; 3724 } 3725 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3726 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3727 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3728 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3729 row -= 4; 3730 break; 3731 case 5: 3732 3733 sum1 = b[row]; 3734 sum2 = b[row-1]; 3735 sum3 = b[row-2]; 3736 sum4 = b[row-3]; 3737 sum5 = b[row-4]; 3738 v2 = a->a + diag[row-1] + 2; 3739 v3 = a->a + diag[row-2] + 3; 3740 v4 = a->a + diag[row-3] + 4; 3741 v5 = a->a + diag[row-4] + 5; 3742 for (n = 0; n<sz-1; n+=2) { 3743 i1 = idx[0]; 3744 i2 = idx[1]; 3745 idx += 2; 3746 tmp0 = x[i1]; 3747 tmp1 = x[i2]; 3748 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3749 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3750 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3751 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3752 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3753 } 3754 3755 if (n == sz-1) { 3756 tmp0 = x[*idx]; 3757 sum1 -= *v1*tmp0; 3758 sum2 -= *v2*tmp0; 3759 sum3 -= *v3*tmp0; 3760 sum4 -= *v4*tmp0; 3761 sum5 -= *v5*tmp0; 3762 } 3763 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3764 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3765 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3766 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3767 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3768 row -= 5; 3769 break; 3770 default: 3771 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3772 } 3773 } 3774 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3775 3776 /* 3777 t = b - D x where D is the block diagonal 3778 */ 3779 cnt = 0; 3780 for (i=0, row=0; i<m; i++) { 3781 switch (sizes[i]) { 3782 case 1: 3783 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3784 break; 3785 case 2: 3786 x1 = x[row]; x2 = x[row+1]; 3787 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3788 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3789 t[row] = b[row] - tmp1; 3790 t[row+1] = b[row+1] - tmp2; row += 2; 3791 cnt += 4; 3792 break; 3793 case 3: 3794 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3795 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3796 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3797 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3798 t[row] = b[row] - tmp1; 3799 t[row+1] = b[row+1] - tmp2; 3800 t[row+2] = b[row+2] - tmp3; row += 3; 3801 cnt += 9; 3802 break; 3803 case 4: 3804 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3805 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3806 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3807 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3808 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3809 t[row] = b[row] - tmp1; 3810 t[row+1] = b[row+1] - tmp2; 3811 t[row+2] = b[row+2] - tmp3; 3812 t[row+3] = b[row+3] - tmp4; row += 4; 3813 cnt += 16; 3814 break; 3815 case 5: 3816 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3817 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3818 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3819 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3820 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3821 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3822 t[row] = b[row] - tmp1; 3823 t[row+1] = b[row+1] - tmp2; 3824 t[row+2] = b[row+2] - tmp3; 3825 t[row+3] = b[row+3] - tmp4; 3826 t[row+4] = b[row+4] - tmp5;row += 5; 3827 cnt += 25; 3828 break; 3829 default: 3830 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3831 } 3832 } 3833 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3834 3835 3836 3837 /* 3838 Apply (L + D)^-1 where D is the block diagonal 3839 */ 3840 for (i=0, row=0; i<m; i++) { 3841 sz = diag[row] - ii[row]; 3842 v1 = a->a + ii[row]; 3843 idx = a->j + ii[row]; 3844 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3845 switch (sizes[i]) { 3846 case 1: 3847 3848 sum1 = t[row]; 3849 for (n = 0; n<sz-1; n+=2) { 3850 i1 = idx[0]; 3851 i2 = idx[1]; 3852 idx += 2; 3853 tmp0 = t[i1]; 3854 tmp1 = t[i2]; 3855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3856 } 3857 3858 if (n == sz-1) { 3859 tmp0 = t[*idx]; 3860 sum1 -= *v1 * tmp0; 3861 } 3862 x[row] += t[row] = sum1*(*ibdiag++); row++; 3863 break; 3864 case 2: 3865 v2 = a->a + ii[row+1]; 3866 sum1 = t[row]; 3867 sum2 = t[row+1]; 3868 for (n = 0; n<sz-1; n+=2) { 3869 i1 = idx[0]; 3870 i2 = idx[1]; 3871 idx += 2; 3872 tmp0 = t[i1]; 3873 tmp1 = t[i2]; 3874 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3875 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3876 } 3877 3878 if (n == sz-1) { 3879 tmp0 = t[*idx]; 3880 sum1 -= v1[0] * tmp0; 3881 sum2 -= v2[0] * tmp0; 3882 } 3883 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3884 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3885 ibdiag += 4; row += 2; 3886 break; 3887 case 3: 3888 v2 = a->a + ii[row+1]; 3889 v3 = a->a + ii[row+2]; 3890 sum1 = t[row]; 3891 sum2 = t[row+1]; 3892 sum3 = t[row+2]; 3893 for (n = 0; n<sz-1; n+=2) { 3894 i1 = idx[0]; 3895 i2 = idx[1]; 3896 idx += 2; 3897 tmp0 = t[i1]; 3898 tmp1 = t[i2]; 3899 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3900 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3901 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3902 } 3903 3904 if (n == sz-1) { 3905 tmp0 = t[*idx]; 3906 sum1 -= v1[0] * tmp0; 3907 sum2 -= v2[0] * tmp0; 3908 sum3 -= v3[0] * tmp0; 3909 } 3910 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3911 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3912 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3913 ibdiag += 9; row += 3; 3914 break; 3915 case 4: 3916 v2 = a->a + ii[row+1]; 3917 v3 = a->a + ii[row+2]; 3918 v4 = a->a + ii[row+3]; 3919 sum1 = t[row]; 3920 sum2 = t[row+1]; 3921 sum3 = t[row+2]; 3922 sum4 = t[row+3]; 3923 for (n = 0; n<sz-1; n+=2) { 3924 i1 = idx[0]; 3925 i2 = idx[1]; 3926 idx += 2; 3927 tmp0 = t[i1]; 3928 tmp1 = t[i2]; 3929 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3930 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3931 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3932 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3933 } 3934 3935 if (n == sz-1) { 3936 tmp0 = t[*idx]; 3937 sum1 -= v1[0] * tmp0; 3938 sum2 -= v2[0] * tmp0; 3939 sum3 -= v3[0] * tmp0; 3940 sum4 -= v4[0] * tmp0; 3941 } 3942 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3943 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3944 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3945 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3946 ibdiag += 16; row += 4; 3947 break; 3948 case 5: 3949 v2 = a->a + ii[row+1]; 3950 v3 = a->a + ii[row+2]; 3951 v4 = a->a + ii[row+3]; 3952 v5 = a->a + ii[row+4]; 3953 sum1 = t[row]; 3954 sum2 = t[row+1]; 3955 sum3 = t[row+2]; 3956 sum4 = t[row+3]; 3957 sum5 = t[row+4]; 3958 for (n = 0; n<sz-1; n+=2) { 3959 i1 = idx[0]; 3960 i2 = idx[1]; 3961 idx += 2; 3962 tmp0 = t[i1]; 3963 tmp1 = t[i2]; 3964 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3965 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3966 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3967 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3968 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3969 } 3970 3971 if (n == sz-1) { 3972 tmp0 = t[*idx]; 3973 sum1 -= v1[0] * tmp0; 3974 sum2 -= v2[0] * tmp0; 3975 sum3 -= v3[0] * tmp0; 3976 sum4 -= v4[0] * tmp0; 3977 sum5 -= v5[0] * tmp0; 3978 } 3979 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3980 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3981 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3982 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3983 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3984 ibdiag += 25; row += 5; 3985 break; 3986 default: 3987 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3988 } 3989 } 3990 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3991 } 3992 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3993 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3994 PetscFunctionReturn(0); 3995 } 3996 3997 #undef __FUNCT__ 3998 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3999 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 4000 { 4001 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4002 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 4003 const MatScalar *bdiag = a->inode.bdiag; 4004 const PetscScalar *b; 4005 PetscErrorCode ierr; 4006 PetscInt m = a->inode.node_count,cnt = 0,i,row; 4007 const PetscInt *sizes = a->inode.size; 4008 4009 PetscFunctionBegin; 4010 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4011 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4012 cnt = 0; 4013 for (i=0, row=0; i<m; i++) { 4014 switch (sizes[i]) { 4015 case 1: 4016 x[row] = b[row]*bdiag[cnt++];row++; 4017 break; 4018 case 2: 4019 x1 = b[row]; x2 = b[row+1]; 4020 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 4021 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 4022 x[row++] = tmp1; 4023 x[row++] = tmp2; 4024 cnt += 4; 4025 break; 4026 case 3: 4027 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4028 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4029 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4030 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4031 x[row++] = tmp1; 4032 x[row++] = tmp2; 4033 x[row++] = tmp3; 4034 cnt += 9; 4035 break; 4036 case 4: 4037 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4038 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4039 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4040 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4041 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4042 x[row++] = tmp1; 4043 x[row++] = tmp2; 4044 x[row++] = tmp3; 4045 x[row++] = tmp4; 4046 cnt += 16; 4047 break; 4048 case 5: 4049 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4050 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4051 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4052 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4053 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4054 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4055 x[row++] = tmp1; 4056 x[row++] = tmp2; 4057 x[row++] = tmp3; 4058 x[row++] = tmp4; 4059 x[row++] = tmp5; 4060 cnt += 25; 4061 break; 4062 default: 4063 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4064 } 4065 } 4066 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4067 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4068 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4069 PetscFunctionReturn(0); 4070 } 4071 4072 /* 4073 samestructure indicates that the matrix has not changed its nonzero structure so we 4074 do not need to recompute the inodes 4075 */ 4076 #undef __FUNCT__ 4077 #define __FUNCT__ "Mat_CheckInode" 4078 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 4079 { 4080 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4081 PetscErrorCode ierr; 4082 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4083 PetscBool flag; 4084 const PetscInt *idx,*idy,*ii; 4085 4086 PetscFunctionBegin; 4087 if (!a->inode.use) PetscFunctionReturn(0); 4088 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4089 4090 4091 m = A->rmap->n; 4092 if (a->inode.size) ns = a->inode.size; 4093 else { 4094 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4095 } 4096 4097 i = 0; 4098 node_count = 0; 4099 idx = a->j; 4100 ii = a->i; 4101 while (i < m) { /* For each row */ 4102 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4103 /* Limits the number of elements in a node to 'a->inode.limit' */ 4104 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4105 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4106 if (nzy != nzx) break; 4107 idy += nzx; /* Same nonzero pattern */ 4108 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4109 if (!flag) break; 4110 } 4111 ns[node_count++] = blk_size; 4112 idx += blk_size*nzx; 4113 i = j; 4114 } 4115 /* If not enough inodes found,, do not use inode version of the routines */ 4116 if (!m || node_count > .8*m) { 4117 ierr = PetscFree(ns);CHKERRQ(ierr); 4118 4119 a->inode.node_count = 0; 4120 a->inode.size = NULL; 4121 a->inode.use = PETSC_FALSE; 4122 A->ops->mult = MatMult_SeqAIJ; 4123 A->ops->sor = MatSOR_SeqAIJ; 4124 A->ops->multadd = MatMultAdd_SeqAIJ; 4125 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4126 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4127 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4128 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4129 A->ops->coloringpatch = 0; 4130 A->ops->multdiagonalblock = 0; 4131 4132 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4133 } else { 4134 if (!A->factortype) { 4135 A->ops->mult = MatMult_SeqAIJ_Inode; 4136 A->ops->sor = MatSOR_SeqAIJ_Inode; 4137 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4138 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4139 if (A->rmap->n == A->cmap->n) { 4140 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4141 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4142 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4143 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4144 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4145 } 4146 } else { 4147 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4148 } 4149 a->inode.node_count = node_count; 4150 a->inode.size = ns; 4151 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4152 } 4153 a->inode.checked = PETSC_TRUE; 4154 PetscFunctionReturn(0); 4155 } 4156 4157 #undef __FUNCT__ 4158 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4159 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4160 { 4161 Mat B =*C; 4162 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4163 PetscErrorCode ierr; 4164 PetscInt m=A->rmap->n; 4165 4166 PetscFunctionBegin; 4167 c->inode.use = a->inode.use; 4168 c->inode.limit = a->inode.limit; 4169 c->inode.max_limit = a->inode.max_limit; 4170 if (a->inode.size) { 4171 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4172 c->inode.node_count = a->inode.node_count; 4173 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4174 /* note the table of functions below should match that in Mat_CheckInode() */ 4175 if (!B->factortype) { 4176 B->ops->mult = MatMult_SeqAIJ_Inode; 4177 B->ops->sor = MatSOR_SeqAIJ_Inode; 4178 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4179 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4180 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4181 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4182 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4183 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4184 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4185 } else { 4186 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4187 } 4188 } else { 4189 c->inode.size = 0; 4190 c->inode.node_count = 0; 4191 } 4192 c->inode.ibdiagvalid = PETSC_FALSE; 4193 c->inode.ibdiag = 0; 4194 c->inode.bdiag = 0; 4195 PetscFunctionReturn(0); 4196 } 4197 4198 #undef __FUNCT__ 4199 #define __FUNCT__ "MatGetRow_FactoredLU" 4200 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4201 { 4202 PetscInt k; 4203 const PetscInt *vi; 4204 4205 PetscFunctionBegin; 4206 vi = aj + ai[row]; 4207 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4208 vi = aj + adiag[row]; 4209 cols[nzl] = vi[0]; 4210 vi = aj + adiag[row+1]+1; 4211 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4212 PetscFunctionReturn(0); 4213 } 4214 /* 4215 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4216 Modified from Mat_CheckInode(). 4217 4218 Input Parameters: 4219 . Mat A - ILU or LU matrix factor 4220 4221 */ 4222 #undef __FUNCT__ 4223 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4224 PetscErrorCode Mat_CheckInode_FactorLU(Mat A) 4225 { 4226 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4227 PetscErrorCode ierr; 4228 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4229 PetscInt *cols1,*cols2,*ns; 4230 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4231 PetscBool flag; 4232 4233 PetscFunctionBegin; 4234 if (!a->inode.use) PetscFunctionReturn(0); 4235 if (a->inode.checked) PetscFunctionReturn(0); 4236 4237 m = A->rmap->n; 4238 if (a->inode.size) ns = a->inode.size; 4239 else { 4240 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4241 } 4242 4243 i = 0; 4244 node_count = 0; 4245 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4246 while (i < m) { /* For each row */ 4247 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4248 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4249 nzx = nzl1 + nzu1 + 1; 4250 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4251 4252 /* Limits the number of elements in a node to 'a->inode.limit' */ 4253 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4254 nzl2 = ai[j+1] - ai[j]; 4255 nzu2 = adiag[j] - adiag[j+1] - 1; 4256 nzy = nzl2 + nzu2 + 1; 4257 if (nzy != nzx) break; 4258 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4259 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4260 if (!flag) break; 4261 } 4262 ns[node_count++] = blk_size; 4263 i = j; 4264 } 4265 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4266 /* If not enough inodes found,, do not use inode version of the routines */ 4267 if (!m || node_count > .8*m) { 4268 ierr = PetscFree(ns);CHKERRQ(ierr); 4269 4270 a->inode.node_count = 0; 4271 a->inode.size = NULL; 4272 a->inode.use = PETSC_FALSE; 4273 4274 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4275 } else { 4276 A->ops->mult = 0; 4277 A->ops->sor = 0; 4278 A->ops->multadd = 0; 4279 A->ops->getrowij = 0; 4280 A->ops->restorerowij = 0; 4281 A->ops->getcolumnij = 0; 4282 A->ops->restorecolumnij = 0; 4283 A->ops->coloringpatch = 0; 4284 A->ops->multdiagonalblock = 0; 4285 a->inode.node_count = node_count; 4286 a->inode.size = ns; 4287 4288 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4289 } 4290 a->inode.checked = PETSC_TRUE; 4291 PetscFunctionReturn(0); 4292 } 4293 4294 #undef __FUNCT__ 4295 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4296 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4297 { 4298 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4299 4300 PetscFunctionBegin; 4301 a->inode.ibdiagvalid = PETSC_FALSE; 4302 PetscFunctionReturn(0); 4303 } 4304 4305 /* 4306 This is really ugly. if inodes are used this replaces the 4307 permutations with ones that correspond to rows/cols of the matrix 4308 rather then inode blocks 4309 */ 4310 #undef __FUNCT__ 4311 #define __FUNCT__ "MatInodeAdjustForInodes" 4312 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4313 { 4314 PetscErrorCode ierr; 4315 4316 PetscFunctionBegin; 4317 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4318 PetscFunctionReturn(0); 4319 } 4320 4321 #undef __FUNCT__ 4322 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4323 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4324 { 4325 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4326 PetscErrorCode ierr; 4327 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4328 const PetscInt *ridx,*cidx; 4329 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4330 PetscInt nslim_col,*ns_col; 4331 IS ris = *rperm,cis = *cperm; 4332 4333 PetscFunctionBegin; 4334 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4335 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4336 4337 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4338 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4339 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4340 4341 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4342 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4343 4344 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4345 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4346 4347 /* Construct the permutations for rows*/ 4348 for (i=0,row = 0; i<nslim_row; ++i) { 4349 indx = ridx[i]; 4350 start_val = tns[indx]; 4351 end_val = tns[indx + 1]; 4352 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4353 } 4354 4355 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4356 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4357 4358 /* Construct permutations for columns */ 4359 for (i=0,col=0; i<nslim_col; ++i) { 4360 indx = cidx[i]; 4361 start_val = tns[indx]; 4362 end_val = tns[indx + 1]; 4363 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4364 } 4365 4366 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4367 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4368 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4369 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4370 4371 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4372 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4373 4374 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4375 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4376 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4377 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4378 ierr = PetscFree(tns);CHKERRQ(ierr); 4379 PetscFunctionReturn(0); 4380 } 4381 4382 #undef __FUNCT__ 4383 #define __FUNCT__ "MatInodeGetInodeSizes" 4384 /*@C 4385 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4386 4387 Not Collective 4388 4389 Input Parameter: 4390 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4391 4392 Output Parameter: 4393 + node_count - no of inodes present in the matrix. 4394 . sizes - an array of size node_count,with sizes of each inode. 4395 - limit - the max size used to generate the inodes. 4396 4397 Level: advanced 4398 4399 Notes: This routine returns some internal storage information 4400 of the matrix, it is intended to be used by advanced users. 4401 It should be called after the matrix is assembled. 4402 The contents of the sizes[] array should not be changed. 4403 NULL may be passed for information not requested. 4404 4405 .keywords: matrix, seqaij, get, inode 4406 4407 .seealso: MatGetInfo() 4408 @*/ 4409 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4410 { 4411 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4412 4413 PetscFunctionBegin; 4414 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4415 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4416 if (f) { 4417 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4418 } 4419 PetscFunctionReturn(0); 4420 } 4421 4422 #undef __FUNCT__ 4423 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4424 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4425 { 4426 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4427 4428 PetscFunctionBegin; 4429 if (node_count) *node_count = a->inode.node_count; 4430 if (sizes) *sizes = a->inode.size; 4431 if (limit) *limit = a->inode.limit; 4432 PetscFunctionReturn(0); 4433 } 4434