1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 67 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1) { 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,nsz,i1,i2; 157 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 158 159 PetscFunctionBegin; 160 nslim_row = a->inode.node_count; 161 n = A->cmap->n; 162 163 /* Create The column_inode for this matrix */ 164 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 165 166 /* allocate space for reformated column_inode structure */ 167 ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr); 168 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 169 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 170 171 for (i1=0,col=0; i1<nslim_col; ++i1) { 172 nsz = ns_col[i1]; 173 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 PetscFunctionReturn(0); 264 } 265 266 /* ----------------------------------------------------------- */ 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscErrorCode ierr; 274 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 275 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 276 277 PetscFunctionBegin; 278 nslim_row = a->inode.node_count; 279 n = A->cmap->n; 280 281 /* Create The column_inode for this matrix */ 282 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 283 284 /* allocate space for reformated column_inode structure */ 285 ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr); 286 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 287 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 288 289 for (i1=0,col=0; i1<nslim_col; ++i1) { 290 nsz = ns_col[i1]; 291 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 292 } 293 /* allocate space for column pointers */ 294 ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 295 *iia = ia; 296 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 297 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 298 299 /* determine the number of columns in each row */ 300 ia[0] = oshift; 301 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 302 j = aj + ai[row] + ishift; 303 col = *j++ + ishift; 304 i2 = tvc[col]; 305 nz = ai[row+1] - ai[row]; 306 while (nz-- > 0) { /* off-diagonal elemets */ 307 /* ia[i1+1]++; */ 308 ia[i2+1]++; 309 i2++; 310 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 311 if (nz > 0) i2 = tvc[col]; 312 } 313 } 314 315 /* shift ia[i] to point to next col */ 316 for (i1=1; i1<nslim_col+1; i1++) { 317 col = ia[i1-1]; 318 ia[i1] += col; 319 work[i1-1] = col - oshift; 320 } 321 322 /* allocate space for column pointers */ 323 nz = ia[nslim_col] + (!ishift); 324 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 325 *jja = ja; 326 327 /* loop over matrix putting into ja */ 328 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 329 j = aj + ai[row] + ishift; 330 col = *j++ + ishift; 331 i2 = tvc[col]; 332 nz = ai[row+1] - ai[row]; 333 while (nz-- > 0) { 334 /* ja[work[i1]++] = i2 + oshift; */ 335 ja[work[i2]++] = i1 + oshift; 336 i2++; 337 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 338 if (nz > 0) i2 = tvc[col]; 339 } 340 } 341 ierr = PetscFree(ns_col);CHKERRQ(ierr); 342 ierr = PetscFree(work);CHKERRQ(ierr); 343 ierr = PetscFree(tns);CHKERRQ(ierr); 344 ierr = PetscFree(tvc);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 356 if (!ia) PetscFunctionReturn(0); 357 358 if (!blockcompressed) { 359 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 360 } else if (symmetric) { 361 /* Since the indices are symmetric it does'nt matter */ 362 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } else { 364 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 365 } 366 PetscFunctionReturn(0); 367 } 368 369 #undef __FUNCT__ 370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 372 { 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 if (!ia) PetscFunctionReturn(0); 377 if (!blockcompressed) { 378 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 379 } else { 380 ierr = PetscFree(*ia);CHKERRQ(ierr); 381 ierr = PetscFree(*ja);CHKERRQ(ierr); 382 } 383 PetscFunctionReturn(0); 384 } 385 386 /* ----------------------------------------------------------- */ 387 388 #undef __FUNCT__ 389 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 391 { 392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 393 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 394 PetscScalar *y; 395 const PetscScalar *x; 396 const MatScalar *v1,*v2,*v3,*v4,*v5; 397 PetscErrorCode ierr; 398 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 399 const PetscInt *idx,*ns,*ii; 400 401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 403 #endif 404 405 PetscFunctionBegin; 406 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 407 node_max = a->inode.node_count; 408 ns = a->inode.size; /* Node Size array */ 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 411 idx = a->j; 412 v1 = a->a; 413 ii = a->i; 414 415 for (i = 0,row = 0; i< node_max; ++i) { 416 nsz = ns[i]; 417 n = ii[1] - ii[0]; 418 nonzerorow += (n>0)*nsz; 419 ii += nsz; 420 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 421 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 422 sz = n; /* No of non zeros in this row */ 423 /* Switch on the size of Node */ 424 switch (nsz) { /* Each loop in 'case' is unrolled */ 425 case 1: 426 sum1 = 0.; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; /* The instructions are ordered to */ 430 i2 = idx[1]; /* make the compiler's job easy */ 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 } 436 437 if (n == sz-1) { /* Take care of the last nonzero */ 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ *tmp0; 440 } 441 y[row++]=sum1; 442 break; 443 case 2: 444 sum1 = 0.; 445 sum2 = 0.; 446 v2 = v1 + n; 447 448 for (n = 0; n< sz-1; n+=2) { 449 i1 = idx[0]; 450 i2 = idx[1]; 451 idx += 2; 452 tmp0 = x[i1]; 453 tmp1 = x[i2]; 454 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 455 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 456 } 457 if (n == sz-1) { 458 tmp0 = x[*idx++]; 459 sum1 += *v1++ * tmp0; 460 sum2 += *v2++ * tmp0; 461 } 462 y[row++]=sum1; 463 y[row++]=sum2; 464 v1 =v2; /* Since the next block to be processed starts there*/ 465 idx +=sz; 466 break; 467 case 3: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 v2 = v1 + n; 472 v3 = v2 + n; 473 474 for (n = 0; n< sz-1; n+=2) { 475 i1 = idx[0]; 476 i2 = idx[1]; 477 idx += 2; 478 tmp0 = x[i1]; 479 tmp1 = x[i2]; 480 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 } 490 y[row++]=sum1; 491 y[row++]=sum2; 492 y[row++]=sum3; 493 v1 =v3; /* Since the next block to be processed starts there*/ 494 idx +=2*sz; 495 break; 496 case 4: 497 sum1 = 0.; 498 sum2 = 0.; 499 sum3 = 0.; 500 sum4 = 0.; 501 v2 = v1 + n; 502 v3 = v2 + n; 503 v4 = v3 + n; 504 505 for (n = 0; n< sz-1; n+=2) { 506 i1 = idx[0]; 507 i2 = idx[1]; 508 idx += 2; 509 tmp0 = x[i1]; 510 tmp1 = x[i2]; 511 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 513 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 514 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 515 } 516 if (n == sz-1) { 517 tmp0 = x[*idx++]; 518 sum1 += *v1++ * tmp0; 519 sum2 += *v2++ * tmp0; 520 sum3 += *v3++ * tmp0; 521 sum4 += *v4++ * tmp0; 522 } 523 y[row++]=sum1; 524 y[row++]=sum2; 525 y[row++]=sum3; 526 y[row++]=sum4; 527 v1 =v4; /* Since the next block to be processed starts there*/ 528 idx +=3*sz; 529 break; 530 case 5: 531 sum1 = 0.; 532 sum2 = 0.; 533 sum3 = 0.; 534 sum4 = 0.; 535 sum5 = 0.; 536 v2 = v1 + n; 537 v3 = v2 + n; 538 v4 = v3 + n; 539 v5 = v4 + n; 540 541 for (n = 0; n<sz-1; n+=2) { 542 i1 = idx[0]; 543 i2 = idx[1]; 544 idx += 2; 545 tmp0 = x[i1]; 546 tmp1 = x[i2]; 547 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 548 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 549 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 550 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 551 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 552 } 553 if (n == sz-1) { 554 tmp0 = x[*idx++]; 555 sum1 += *v1++ * tmp0; 556 sum2 += *v2++ * tmp0; 557 sum3 += *v3++ * tmp0; 558 sum4 += *v4++ * tmp0; 559 sum5 += *v5++ * tmp0; 560 } 561 y[row++]=sum1; 562 y[row++]=sum2; 563 y[row++]=sum3; 564 y[row++]=sum4; 565 y[row++]=sum5; 566 v1 =v5; /* Since the next block to be processed starts there */ 567 idx +=4*sz; 568 break; 569 default: 570 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 571 } 572 } 573 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 575 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 576 PetscFunctionReturn(0); 577 } 578 /* ----------------------------------------------------------- */ 579 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 583 { 584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 585 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 586 const MatScalar *v1,*v2,*v3,*v4,*v5; 587 const PetscScalar *x; 588 PetscScalar *y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 591 const PetscInt *idx,*ns,*ii; 592 593 PetscFunctionBegin; 594 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 595 node_max = a->inode.node_count; 596 ns = a->inode.size; /* Node Size array */ 597 598 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 600 zt = z; 601 602 idx = a->j; 603 v1 = a->a; 604 ii = a->i; 605 606 for (i = 0,row = 0; i< node_max; ++i) { 607 nsz = ns[i]; 608 n = ii[1] - ii[0]; 609 ii += nsz; 610 sz = n; /* No of non zeros in this row */ 611 /* Switch on the size of Node */ 612 switch (nsz) { /* Each loop in 'case' is unrolled */ 613 case 1: 614 sum1 = *zt++; 615 616 for (n = 0; n< sz-1; n+=2) { 617 i1 = idx[0]; /* The instructions are ordered to */ 618 i2 = idx[1]; /* make the compiler's job easy */ 619 idx += 2; 620 tmp0 = x[i1]; 621 tmp1 = x[i2]; 622 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 623 } 624 625 if (n == sz-1) { /* Take care of the last nonzero */ 626 tmp0 = x[*idx++]; 627 sum1 += *v1++ * tmp0; 628 } 629 y[row++]=sum1; 630 break; 631 case 2: 632 sum1 = *zt++; 633 sum2 = *zt++; 634 v2 = v1 + n; 635 636 for (n = 0; n< sz-1; n+=2) { 637 i1 = idx[0]; 638 i2 = idx[1]; 639 idx += 2; 640 tmp0 = x[i1]; 641 tmp1 = x[i2]; 642 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 643 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 644 } 645 if (n == sz-1) { 646 tmp0 = x[*idx++]; 647 sum1 += *v1++ * tmp0; 648 sum2 += *v2++ * tmp0; 649 } 650 y[row++]=sum1; 651 y[row++]=sum2; 652 v1 =v2; /* Since the next block to be processed starts there*/ 653 idx +=sz; 654 break; 655 case 3: 656 sum1 = *zt++; 657 sum2 = *zt++; 658 sum3 = *zt++; 659 v2 = v1 + n; 660 v3 = v2 + n; 661 662 for (n = 0; n< sz-1; n+=2) { 663 i1 = idx[0]; 664 i2 = idx[1]; 665 idx += 2; 666 tmp0 = x[i1]; 667 tmp1 = x[i2]; 668 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 669 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 670 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 671 } 672 if (n == sz-1) { 673 tmp0 = x[*idx++]; 674 sum1 += *v1++ * tmp0; 675 sum2 += *v2++ * tmp0; 676 sum3 += *v3++ * tmp0; 677 } 678 y[row++]=sum1; 679 y[row++]=sum2; 680 y[row++]=sum3; 681 v1 =v3; /* Since the next block to be processed starts there*/ 682 idx +=2*sz; 683 break; 684 case 4: 685 sum1 = *zt++; 686 sum2 = *zt++; 687 sum3 = *zt++; 688 sum4 = *zt++; 689 v2 = v1 + n; 690 v3 = v2 + n; 691 v4 = v3 + n; 692 693 for (n = 0; n< sz-1; n+=2) { 694 i1 = idx[0]; 695 i2 = idx[1]; 696 idx += 2; 697 tmp0 = x[i1]; 698 tmp1 = x[i2]; 699 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 700 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 701 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 702 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 703 } 704 if (n == sz-1) { 705 tmp0 = x[*idx++]; 706 sum1 += *v1++ * tmp0; 707 sum2 += *v2++ * tmp0; 708 sum3 += *v3++ * tmp0; 709 sum4 += *v4++ * tmp0; 710 } 711 y[row++]=sum1; 712 y[row++]=sum2; 713 y[row++]=sum3; 714 y[row++]=sum4; 715 v1 =v4; /* Since the next block to be processed starts there*/ 716 idx +=3*sz; 717 break; 718 case 5: 719 sum1 = *zt++; 720 sum2 = *zt++; 721 sum3 = *zt++; 722 sum4 = *zt++; 723 sum5 = *zt++; 724 v2 = v1 + n; 725 v3 = v2 + n; 726 v4 = v3 + n; 727 v5 = v4 + n; 728 729 for (n = 0; n<sz-1; n+=2) { 730 i1 = idx[0]; 731 i2 = idx[1]; 732 idx += 2; 733 tmp0 = x[i1]; 734 tmp1 = x[i2]; 735 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 736 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 737 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 738 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 739 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 740 } 741 if (n == sz-1) { 742 tmp0 = x[*idx++]; 743 sum1 += *v1++ * tmp0; 744 sum2 += *v2++ * tmp0; 745 sum3 += *v3++ * tmp0; 746 sum4 += *v4++ * tmp0; 747 sum5 += *v5++ * tmp0; 748 } 749 y[row++]=sum1; 750 y[row++]=sum2; 751 y[row++]=sum3; 752 y[row++]=sum4; 753 y[row++]=sum5; 754 v1 =v5; /* Since the next block to be processed starts there */ 755 idx +=4*sz; 756 break; 757 default: 758 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 759 } 760 } 761 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 762 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 763 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 764 PetscFunctionReturn(0); 765 } 766 767 /* ----------------------------------------------------------- */ 768 #undef __FUNCT__ 769 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 770 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 771 { 772 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 773 IS iscol = a->col,isrow = a->row; 774 PetscErrorCode ierr; 775 const PetscInt *r,*c,*rout,*cout; 776 PetscInt i,j,n = A->rmap->n,nz; 777 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 778 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 779 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 780 PetscScalar sum1,sum2,sum3,sum4,sum5; 781 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 782 const PetscScalar *b; 783 784 PetscFunctionBegin; 785 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 786 node_max = a->inode.node_count; 787 ns = a->inode.size; /* Node Size array */ 788 789 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 790 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 791 tmp = a->solve_work; 792 793 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 794 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 795 796 /* forward solve the lower triangular */ 797 tmps = tmp; 798 aa = a_a; 799 aj = a_j; 800 ad = a->diag; 801 802 for (i = 0,row = 0; i< node_max; ++i) { 803 nsz = ns[i]; 804 aii = ai[row]; 805 v1 = aa + aii; 806 vi = aj + aii; 807 nz = ad[row]- aii; 808 if (i < node_max-1) { 809 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 810 * but our indexing to determine it's size could. */ 811 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 812 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 813 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 814 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 815 } 816 817 switch (nsz) { /* Each loop in 'case' is unrolled */ 818 case 1: 819 sum1 = b[*r++]; 820 for (j=0; j<nz-1; j+=2) { 821 i0 = vi[0]; 822 i1 = vi[1]; 823 vi +=2; 824 tmp0 = tmps[i0]; 825 tmp1 = tmps[i1]; 826 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 827 } 828 if (j == nz-1) { 829 tmp0 = tmps[*vi++]; 830 sum1 -= *v1++ *tmp0; 831 } 832 tmp[row++]=sum1; 833 break; 834 case 2: 835 sum1 = b[*r++]; 836 sum2 = b[*r++]; 837 v2 = aa + ai[row+1]; 838 839 for (j=0; j<nz-1; j+=2) { 840 i0 = vi[0]; 841 i1 = vi[1]; 842 vi +=2; 843 tmp0 = tmps[i0]; 844 tmp1 = tmps[i1]; 845 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 846 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 847 } 848 if (j == nz-1) { 849 tmp0 = tmps[*vi++]; 850 sum1 -= *v1++ *tmp0; 851 sum2 -= *v2++ *tmp0; 852 } 853 sum2 -= *v2++ *sum1; 854 tmp[row++]=sum1; 855 tmp[row++]=sum2; 856 break; 857 case 3: 858 sum1 = b[*r++]; 859 sum2 = b[*r++]; 860 sum3 = b[*r++]; 861 v2 = aa + ai[row+1]; 862 v3 = aa + ai[row+2]; 863 864 for (j=0; j<nz-1; j+=2) { 865 i0 = vi[0]; 866 i1 = vi[1]; 867 vi +=2; 868 tmp0 = tmps[i0]; 869 tmp1 = tmps[i1]; 870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 873 } 874 if (j == nz-1) { 875 tmp0 = tmps[*vi++]; 876 sum1 -= *v1++ *tmp0; 877 sum2 -= *v2++ *tmp0; 878 sum3 -= *v3++ *tmp0; 879 } 880 sum2 -= *v2++ * sum1; 881 sum3 -= *v3++ * sum1; 882 sum3 -= *v3++ * sum2; 883 884 tmp[row++]=sum1; 885 tmp[row++]=sum2; 886 tmp[row++]=sum3; 887 break; 888 889 case 4: 890 sum1 = b[*r++]; 891 sum2 = b[*r++]; 892 sum3 = b[*r++]; 893 sum4 = b[*r++]; 894 v2 = aa + ai[row+1]; 895 v3 = aa + ai[row+2]; 896 v4 = aa + ai[row+3]; 897 898 for (j=0; j<nz-1; j+=2) { 899 i0 = vi[0]; 900 i1 = vi[1]; 901 vi +=2; 902 tmp0 = tmps[i0]; 903 tmp1 = tmps[i1]; 904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 908 } 909 if (j == nz-1) { 910 tmp0 = tmps[*vi++]; 911 sum1 -= *v1++ *tmp0; 912 sum2 -= *v2++ *tmp0; 913 sum3 -= *v3++ *tmp0; 914 sum4 -= *v4++ *tmp0; 915 } 916 sum2 -= *v2++ * sum1; 917 sum3 -= *v3++ * sum1; 918 sum4 -= *v4++ * sum1; 919 sum3 -= *v3++ * sum2; 920 sum4 -= *v4++ * sum2; 921 sum4 -= *v4++ * sum3; 922 923 tmp[row++]=sum1; 924 tmp[row++]=sum2; 925 tmp[row++]=sum3; 926 tmp[row++]=sum4; 927 break; 928 case 5: 929 sum1 = b[*r++]; 930 sum2 = b[*r++]; 931 sum3 = b[*r++]; 932 sum4 = b[*r++]; 933 sum5 = b[*r++]; 934 v2 = aa + ai[row+1]; 935 v3 = aa + ai[row+2]; 936 v4 = aa + ai[row+3]; 937 v5 = aa + ai[row+4]; 938 939 for (j=0; j<nz-1; j+=2) { 940 i0 = vi[0]; 941 i1 = vi[1]; 942 vi +=2; 943 tmp0 = tmps[i0]; 944 tmp1 = tmps[i1]; 945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 950 } 951 if (j == nz-1) { 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ *tmp0; 954 sum2 -= *v2++ *tmp0; 955 sum3 -= *v3++ *tmp0; 956 sum4 -= *v4++ *tmp0; 957 sum5 -= *v5++ *tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row++]=sum1; 972 tmp[row++]=sum2; 973 tmp[row++]=sum3; 974 tmp[row++]=sum4; 975 tmp[row++]=sum5; 976 break; 977 default: 978 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i=node_max -1,row = n-1; i>=0; i--) { 983 nsz = ns[i]; 984 aii = ai[row+1] -1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii- ad[row]; 988 switch (nsz) { /* Each loop in 'case' is unrolled */ 989 case 1: 990 sum1 = tmp[row]; 991 992 for (j=nz; j>1; j-=2) { 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j==1) { 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1006 break; 1007 case 2: 1008 sum1 = tmp[row]; 1009 sum2 = tmp[row -1]; 1010 v2 = aa + ai[row]-1; 1011 for (j=nz; j>1; j-=2) { 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1020 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1021 } 1022 if (j==1) { 1023 tmp0 = tmps[*vi--]; 1024 sum1 -= *v1-- * tmp0; 1025 sum2 -= *v2-- * tmp0; 1026 } 1027 1028 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1031 break; 1032 case 3: 1033 sum1 = tmp[row]; 1034 sum2 = tmp[row -1]; 1035 sum3 = tmp[row -2]; 1036 v2 = aa + ai[row]-1; 1037 v3 = aa + ai[row -1]-1; 1038 for (j=nz; j>1; j-=2) { 1039 vi -=2; 1040 i0 = vi[2]; 1041 i1 = vi[1]; 1042 tmp0 = tmps[i0]; 1043 tmp1 = tmps[i1]; 1044 v1 -= 2; 1045 v2 -= 2; 1046 v3 -= 2; 1047 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1048 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1049 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1050 } 1051 if (j==1) { 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 } 1057 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1061 sum3 -= *v3-- * tmp0; 1062 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1063 1064 break; 1065 case 4: 1066 sum1 = tmp[row]; 1067 sum2 = tmp[row -1]; 1068 sum3 = tmp[row -2]; 1069 sum4 = tmp[row -3]; 1070 v2 = aa + ai[row]-1; 1071 v3 = aa + ai[row -1]-1; 1072 v4 = aa + ai[row -2]-1; 1073 1074 for (j=nz; j>1; j-=2) { 1075 vi -=2; 1076 i0 = vi[2]; 1077 i1 = vi[1]; 1078 tmp0 = tmps[i0]; 1079 tmp1 = tmps[i1]; 1080 v1 -= 2; 1081 v2 -= 2; 1082 v3 -= 2; 1083 v4 -= 2; 1084 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1085 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1086 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1087 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1088 } 1089 if (j==1) { 1090 tmp0 = tmps[*vi--]; 1091 sum1 -= *v1-- * tmp0; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 } 1096 1097 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1105 sum4 -= *v4-- * tmp0; 1106 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1107 break; 1108 case 5: 1109 sum1 = tmp[row]; 1110 sum2 = tmp[row -1]; 1111 sum3 = tmp[row -2]; 1112 sum4 = tmp[row -3]; 1113 sum5 = tmp[row -4]; 1114 v2 = aa + ai[row]-1; 1115 v3 = aa + ai[row -1]-1; 1116 v4 = aa + ai[row -2]-1; 1117 v5 = aa + ai[row -3]-1; 1118 for (j=nz; j>1; j-=2) { 1119 vi -= 2; 1120 i0 = vi[2]; 1121 i1 = vi[1]; 1122 tmp0 = tmps[i0]; 1123 tmp1 = tmps[i1]; 1124 v1 -= 2; 1125 v2 -= 2; 1126 v3 -= 2; 1127 v4 -= 2; 1128 v5 -= 2; 1129 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1130 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1131 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1132 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1133 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1134 } 1135 if (j==1) { 1136 tmp0 = tmps[*vi--]; 1137 sum1 -= *v1-- * tmp0; 1138 sum2 -= *v2-- * tmp0; 1139 sum3 -= *v3-- * tmp0; 1140 sum4 -= *v4-- * tmp0; 1141 sum5 -= *v5-- * tmp0; 1142 } 1143 1144 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1157 sum5 -= *v5-- * tmp0; 1158 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1159 break; 1160 default: 1161 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1162 } 1163 } 1164 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1165 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1166 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1174 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1175 { 1176 Mat C =B; 1177 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1178 IS isrow = b->row,isicol = b->icol; 1179 PetscErrorCode ierr; 1180 const PetscInt *r,*ic,*ics; 1181 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1182 PetscInt i,j,k,nz,nzL,row,*pj; 1183 const PetscInt *ajtmp,*bjtmp; 1184 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1185 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1186 FactorShiftCtx sctx; 1187 const PetscInt *ddiag; 1188 PetscReal rs; 1189 MatScalar d; 1190 PetscInt inod,nodesz,node_max,col; 1191 const PetscInt *ns; 1192 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1193 1194 PetscFunctionBegin; 1195 /* MatPivotSetUp(): initialize shift context sctx */ 1196 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1197 1198 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1199 ddiag = a->diag; 1200 sctx.shift_top = info->zeropivot; 1201 for (i=0; i<n; i++) { 1202 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1203 d = (aa)[ddiag[i]]; 1204 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1205 v = aa+ai[i]; 1206 nz = ai[i+1] - ai[i]; 1207 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1208 if (rs>sctx.shift_top) sctx.shift_top = rs; 1209 } 1210 sctx.shift_top *= 1.1; 1211 sctx.nshift_max = 5; 1212 sctx.shift_lo = 0.; 1213 sctx.shift_hi = 1.; 1214 } 1215 1216 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1217 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1218 1219 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1220 ics = ic; 1221 1222 node_max = a->inode.node_count; 1223 ns = a->inode.size; 1224 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1225 1226 /* If max inode size > 4, split it into two inodes.*/ 1227 /* also map the inode sizes according to the ordering */ 1228 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1229 for (i=0,j=0; i<node_max; ++i,++j) { 1230 if (ns[i] > 4) { 1231 tmp_vec1[j] = 4; 1232 ++j; 1233 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1234 } else { 1235 tmp_vec1[j] = ns[i]; 1236 } 1237 } 1238 /* Use the correct node_max */ 1239 node_max = j; 1240 1241 /* Now reorder the inode info based on mat re-ordering info */ 1242 /* First create a row -> inode_size_array_index map */ 1243 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1244 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1245 for (i=0,row=0; i<node_max; i++) { 1246 nodesz = tmp_vec1[i]; 1247 for (j=0; j<nodesz; j++,row++) { 1248 nsmap[row] = i; 1249 } 1250 } 1251 /* Using nsmap, create a reordered ns structure */ 1252 for (i=0,j=0; i< node_max; i++) { 1253 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1254 tmp_vec2[i] = nodesz; 1255 j += nodesz; 1256 } 1257 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1258 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1259 1260 /* Now use the correct ns */ 1261 ns = tmp_vec2; 1262 1263 do { 1264 sctx.newshift = PETSC_FALSE; 1265 /* Now loop over each block-row, and do the factorization */ 1266 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1267 nodesz = ns[inod]; 1268 1269 switch (nodesz) { 1270 case 1: 1271 /*----------*/ 1272 /* zero rtmp1 */ 1273 /* L part */ 1274 nz = bi[i+1] - bi[i]; 1275 bjtmp = bj + bi[i]; 1276 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1277 1278 /* U part */ 1279 nz = bdiag[i]-bdiag[i+1]; 1280 bjtmp = bj + bdiag[i+1]+1; 1281 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1282 1283 /* load in initial (unfactored row) */ 1284 nz = ai[r[i]+1] - ai[r[i]]; 1285 ajtmp = aj + ai[r[i]]; 1286 v = aa + ai[r[i]]; 1287 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1288 1289 /* ZeropivotApply() */ 1290 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1291 1292 /* elimination */ 1293 bjtmp = bj + bi[i]; 1294 row = *bjtmp++; 1295 nzL = bi[i+1] - bi[i]; 1296 for (k=0; k < nzL; k++) { 1297 pc = rtmp1 + row; 1298 if (*pc != 0.0) { 1299 pv = b->a + bdiag[row]; 1300 mul1 = *pc * (*pv); 1301 *pc = mul1; 1302 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1303 pv = b->a + bdiag[row+1]+1; 1304 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1305 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1306 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1307 } 1308 row = *bjtmp++; 1309 } 1310 1311 /* finished row so stick it into b->a */ 1312 rs = 0.0; 1313 /* L part */ 1314 pv = b->a + bi[i]; 1315 pj = b->j + bi[i]; 1316 nz = bi[i+1] - bi[i]; 1317 for (j=0; j<nz; j++) { 1318 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1319 } 1320 1321 /* U part */ 1322 pv = b->a + bdiag[i+1]+1; 1323 pj = b->j + bdiag[i+1]+1; 1324 nz = bdiag[i] - bdiag[i+1]-1; 1325 for (j=0; j<nz; j++) { 1326 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1327 } 1328 1329 /* Check zero pivot */ 1330 sctx.rs = rs; 1331 sctx.pv = rtmp1[i]; 1332 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1333 if (sctx.newshift) break; 1334 1335 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1336 pv = b->a + bdiag[i]; 1337 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1338 break; 1339 1340 case 2: 1341 /*----------*/ 1342 /* zero rtmp1 and rtmp2 */ 1343 /* L part */ 1344 nz = bi[i+1] - bi[i]; 1345 bjtmp = bj + bi[i]; 1346 for (j=0; j<nz; j++) { 1347 col = bjtmp[j]; 1348 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1349 } 1350 1351 /* U part */ 1352 nz = bdiag[i]-bdiag[i+1]; 1353 bjtmp = bj + bdiag[i+1]+1; 1354 for (j=0; j<nz; j++) { 1355 col = bjtmp[j]; 1356 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1357 } 1358 1359 /* load in initial (unfactored row) */ 1360 nz = ai[r[i]+1] - ai[r[i]]; 1361 ajtmp = aj + ai[r[i]]; 1362 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1363 for (j=0; j<nz; j++) { 1364 col = ics[ajtmp[j]]; 1365 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1366 } 1367 /* ZeropivotApply(): shift the diagonal of the matrix */ 1368 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1369 1370 /* elimination */ 1371 bjtmp = bj + bi[i]; 1372 row = *bjtmp++; /* pivot row */ 1373 nzL = bi[i+1] - bi[i]; 1374 for (k=0; k < nzL; k++) { 1375 pc1 = rtmp1 + row; 1376 pc2 = rtmp2 + row; 1377 if (*pc1 != 0.0 || *pc2 != 0.0) { 1378 pv = b->a + bdiag[row]; 1379 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1380 *pc1 = mul1; *pc2 = mul2; 1381 1382 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1383 pv = b->a + bdiag[row+1]+1; 1384 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1385 for (j=0; j<nz; j++) { 1386 col = pj[j]; 1387 rtmp1[col] -= mul1 * pv[j]; 1388 rtmp2[col] -= mul2 * pv[j]; 1389 } 1390 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1391 } 1392 row = *bjtmp++; 1393 } 1394 1395 /* finished row i; check zero pivot, then stick row i into b->a */ 1396 rs = 0.0; 1397 /* L part */ 1398 pc1 = b->a + bi[i]; 1399 pj = b->j + bi[i]; 1400 nz = bi[i+1] - bi[i]; 1401 for (j=0; j<nz; j++) { 1402 col = pj[j]; 1403 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1404 } 1405 /* U part */ 1406 pc1 = b->a + bdiag[i+1]+1; 1407 pj = b->j + bdiag[i+1]+1; 1408 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1409 for (j=0; j<nz; j++) { 1410 col = pj[j]; 1411 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1412 } 1413 1414 sctx.rs = rs; 1415 sctx.pv = rtmp1[i]; 1416 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1417 if (sctx.newshift) break; 1418 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1419 *pc1 = 1.0/sctx.pv; 1420 1421 /* Now take care of diagonal 2x2 block. */ 1422 pc2 = rtmp2 + i; 1423 if (*pc2 != 0.0) { 1424 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1425 *pc2 = mul1; /* insert L entry */ 1426 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1427 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1428 for (j=0; j<nz; j++) { 1429 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1430 } 1431 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1432 } 1433 1434 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1435 rs = 0.0; 1436 /* L part */ 1437 pc2 = b->a + bi[i+1]; 1438 pj = b->j + bi[i+1]; 1439 nz = bi[i+2] - bi[i+1]; 1440 for (j=0; j<nz; j++) { 1441 col = pj[j]; 1442 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1443 } 1444 /* U part */ 1445 pc2 = b->a + bdiag[i+2]+1; 1446 pj = b->j + bdiag[i+2]+1; 1447 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1448 for (j=0; j<nz; j++) { 1449 col = pj[j]; 1450 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1451 } 1452 1453 sctx.rs = rs; 1454 sctx.pv = rtmp2[i+1]; 1455 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1456 if (sctx.newshift) break; 1457 pc2 = b->a + bdiag[i+1]; 1458 *pc2 = 1.0/sctx.pv; 1459 break; 1460 1461 case 3: 1462 /*----------*/ 1463 /* zero rtmp */ 1464 /* L part */ 1465 nz = bi[i+1] - bi[i]; 1466 bjtmp = bj + bi[i]; 1467 for (j=0; j<nz; j++) { 1468 col = bjtmp[j]; 1469 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1470 } 1471 1472 /* U part */ 1473 nz = bdiag[i]-bdiag[i+1]; 1474 bjtmp = bj + bdiag[i+1]+1; 1475 for (j=0; j<nz; j++) { 1476 col = bjtmp[j]; 1477 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1478 } 1479 1480 /* load in initial (unfactored row) */ 1481 nz = ai[r[i]+1] - ai[r[i]]; 1482 ajtmp = aj + ai[r[i]]; 1483 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1484 for (j=0; j<nz; j++) { 1485 col = ics[ajtmp[j]]; 1486 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1487 } 1488 /* ZeropivotApply(): shift the diagonal of the matrix */ 1489 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1490 1491 /* elimination */ 1492 bjtmp = bj + bi[i]; 1493 row = *bjtmp++; /* pivot row */ 1494 nzL = bi[i+1] - bi[i]; 1495 for (k=0; k < nzL; k++) { 1496 pc1 = rtmp1 + row; 1497 pc2 = rtmp2 + row; 1498 pc3 = rtmp3 + row; 1499 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1500 pv = b->a + bdiag[row]; 1501 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1502 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1503 1504 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1505 pv = b->a + bdiag[row+1]+1; 1506 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1507 for (j=0; j<nz; j++) { 1508 col = pj[j]; 1509 rtmp1[col] -= mul1 * pv[j]; 1510 rtmp2[col] -= mul2 * pv[j]; 1511 rtmp3[col] -= mul3 * pv[j]; 1512 } 1513 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1514 } 1515 row = *bjtmp++; 1516 } 1517 1518 /* finished row i; check zero pivot, then stick row i into b->a */ 1519 rs = 0.0; 1520 /* L part */ 1521 pc1 = b->a + bi[i]; 1522 pj = b->j + bi[i]; 1523 nz = bi[i+1] - bi[i]; 1524 for (j=0; j<nz; j++) { 1525 col = pj[j]; 1526 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1527 } 1528 /* U part */ 1529 pc1 = b->a + bdiag[i+1]+1; 1530 pj = b->j + bdiag[i+1]+1; 1531 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1532 for (j=0; j<nz; j++) { 1533 col = pj[j]; 1534 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1535 } 1536 1537 sctx.rs = rs; 1538 sctx.pv = rtmp1[i]; 1539 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1540 if (sctx.newshift) break; 1541 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1542 *pc1 = 1.0/sctx.pv; 1543 1544 /* Now take care of 1st column of diagonal 3x3 block. */ 1545 pc2 = rtmp2 + i; 1546 pc3 = rtmp3 + i; 1547 if (*pc2 != 0.0 || *pc3 != 0.0) { 1548 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1549 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1550 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1551 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1552 for (j=0; j<nz; j++) { 1553 col = pj[j]; 1554 rtmp2[col] -= mul2 * rtmp1[col]; 1555 rtmp3[col] -= mul3 * rtmp1[col]; 1556 } 1557 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1558 } 1559 1560 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1561 rs = 0.0; 1562 /* L part */ 1563 pc2 = b->a + bi[i+1]; 1564 pj = b->j + bi[i+1]; 1565 nz = bi[i+2] - bi[i+1]; 1566 for (j=0; j<nz; j++) { 1567 col = pj[j]; 1568 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1569 } 1570 /* U part */ 1571 pc2 = b->a + bdiag[i+2]+1; 1572 pj = b->j + bdiag[i+2]+1; 1573 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1574 for (j=0; j<nz; j++) { 1575 col = pj[j]; 1576 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1577 } 1578 1579 sctx.rs = rs; 1580 sctx.pv = rtmp2[i+1]; 1581 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1582 if (sctx.newshift) break; 1583 pc2 = b->a + bdiag[i+1]; 1584 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1585 1586 /* Now take care of 2nd column of diagonal 3x3 block. */ 1587 pc3 = rtmp3 + i+1; 1588 if (*pc3 != 0.0) { 1589 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1590 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1591 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1592 for (j=0; j<nz; j++) { 1593 col = pj[j]; 1594 rtmp3[col] -= mul3 * rtmp2[col]; 1595 } 1596 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1597 } 1598 1599 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1600 rs = 0.0; 1601 /* L part */ 1602 pc3 = b->a + bi[i+2]; 1603 pj = b->j + bi[i+2]; 1604 nz = bi[i+3] - bi[i+2]; 1605 for (j=0; j<nz; j++) { 1606 col = pj[j]; 1607 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1608 } 1609 /* U part */ 1610 pc3 = b->a + bdiag[i+3]+1; 1611 pj = b->j + bdiag[i+3]+1; 1612 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1613 for (j=0; j<nz; j++) { 1614 col = pj[j]; 1615 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1616 } 1617 1618 sctx.rs = rs; 1619 sctx.pv = rtmp3[i+2]; 1620 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1621 if (sctx.newshift) break; 1622 pc3 = b->a + bdiag[i+2]; 1623 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1624 break; 1625 case 4: 1626 /*----------*/ 1627 /* zero rtmp */ 1628 /* L part */ 1629 nz = bi[i+1] - bi[i]; 1630 bjtmp = bj + bi[i]; 1631 for (j=0; j<nz; j++) { 1632 col = bjtmp[j]; 1633 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1634 } 1635 1636 /* U part */ 1637 nz = bdiag[i]-bdiag[i+1]; 1638 bjtmp = bj + bdiag[i+1]+1; 1639 for (j=0; j<nz; j++) { 1640 col = bjtmp[j]; 1641 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1642 } 1643 1644 /* load in initial (unfactored row) */ 1645 nz = ai[r[i]+1] - ai[r[i]]; 1646 ajtmp = aj + ai[r[i]]; 1647 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1648 for (j=0; j<nz; j++) { 1649 col = ics[ajtmp[j]]; 1650 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1651 } 1652 /* ZeropivotApply(): shift the diagonal of the matrix */ 1653 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1654 1655 /* elimination */ 1656 bjtmp = bj + bi[i]; 1657 row = *bjtmp++; /* pivot row */ 1658 nzL = bi[i+1] - bi[i]; 1659 for (k=0; k < nzL; k++) { 1660 pc1 = rtmp1 + row; 1661 pc2 = rtmp2 + row; 1662 pc3 = rtmp3 + row; 1663 pc4 = rtmp4 + row; 1664 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1665 pv = b->a + bdiag[row]; 1666 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1667 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1668 1669 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1670 pv = b->a + bdiag[row+1]+1; 1671 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1672 for (j=0; j<nz; j++) { 1673 col = pj[j]; 1674 rtmp1[col] -= mul1 * pv[j]; 1675 rtmp2[col] -= mul2 * pv[j]; 1676 rtmp3[col] -= mul3 * pv[j]; 1677 rtmp4[col] -= mul4 * pv[j]; 1678 } 1679 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1680 } 1681 row = *bjtmp++; 1682 } 1683 1684 /* finished row i; check zero pivot, then stick row i into b->a */ 1685 rs = 0.0; 1686 /* L part */ 1687 pc1 = b->a + bi[i]; 1688 pj = b->j + bi[i]; 1689 nz = bi[i+1] - bi[i]; 1690 for (j=0; j<nz; j++) { 1691 col = pj[j]; 1692 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1693 } 1694 /* U part */ 1695 pc1 = b->a + bdiag[i+1]+1; 1696 pj = b->j + bdiag[i+1]+1; 1697 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1698 for (j=0; j<nz; j++) { 1699 col = pj[j]; 1700 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1701 } 1702 1703 sctx.rs = rs; 1704 sctx.pv = rtmp1[i]; 1705 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1706 if (sctx.newshift) break; 1707 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1708 *pc1 = 1.0/sctx.pv; 1709 1710 /* Now take care of 1st column of diagonal 4x4 block. */ 1711 pc2 = rtmp2 + i; 1712 pc3 = rtmp3 + i; 1713 pc4 = rtmp4 + i; 1714 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1715 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1716 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1717 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1718 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1719 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1720 for (j=0; j<nz; j++) { 1721 col = pj[j]; 1722 rtmp2[col] -= mul2 * rtmp1[col]; 1723 rtmp3[col] -= mul3 * rtmp1[col]; 1724 rtmp4[col] -= mul4 * rtmp1[col]; 1725 } 1726 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1727 } 1728 1729 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1730 rs = 0.0; 1731 /* L part */ 1732 pc2 = b->a + bi[i+1]; 1733 pj = b->j + bi[i+1]; 1734 nz = bi[i+2] - bi[i+1]; 1735 for (j=0; j<nz; j++) { 1736 col = pj[j]; 1737 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1738 } 1739 /* U part */ 1740 pc2 = b->a + bdiag[i+2]+1; 1741 pj = b->j + bdiag[i+2]+1; 1742 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1743 for (j=0; j<nz; j++) { 1744 col = pj[j]; 1745 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1746 } 1747 1748 sctx.rs = rs; 1749 sctx.pv = rtmp2[i+1]; 1750 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1751 if (sctx.newshift) break; 1752 pc2 = b->a + bdiag[i+1]; 1753 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1754 1755 /* Now take care of 2nd column of diagonal 4x4 block. */ 1756 pc3 = rtmp3 + i+1; 1757 pc4 = rtmp4 + i+1; 1758 if (*pc3 != 0.0 || *pc4 != 0.0) { 1759 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1760 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1761 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1762 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1763 for (j=0; j<nz; j++) { 1764 col = pj[j]; 1765 rtmp3[col] -= mul3 * rtmp2[col]; 1766 rtmp4[col] -= mul4 * rtmp2[col]; 1767 } 1768 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1769 } 1770 1771 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1772 rs = 0.0; 1773 /* L part */ 1774 pc3 = b->a + bi[i+2]; 1775 pj = b->j + bi[i+2]; 1776 nz = bi[i+3] - bi[i+2]; 1777 for (j=0; j<nz; j++) { 1778 col = pj[j]; 1779 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1780 } 1781 /* U part */ 1782 pc3 = b->a + bdiag[i+3]+1; 1783 pj = b->j + bdiag[i+3]+1; 1784 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1785 for (j=0; j<nz; j++) { 1786 col = pj[j]; 1787 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1788 } 1789 1790 sctx.rs = rs; 1791 sctx.pv = rtmp3[i+2]; 1792 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1793 if (sctx.newshift) break; 1794 pc3 = b->a + bdiag[i+2]; 1795 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1796 1797 /* Now take care of 3rd column of diagonal 4x4 block. */ 1798 pc4 = rtmp4 + i+2; 1799 if (*pc4 != 0.0) { 1800 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1801 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1802 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1803 for (j=0; j<nz; j++) { 1804 col = pj[j]; 1805 rtmp4[col] -= mul4 * rtmp3[col]; 1806 } 1807 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1808 } 1809 1810 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1811 rs = 0.0; 1812 /* L part */ 1813 pc4 = b->a + bi[i+3]; 1814 pj = b->j + bi[i+3]; 1815 nz = bi[i+4] - bi[i+3]; 1816 for (j=0; j<nz; j++) { 1817 col = pj[j]; 1818 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1819 } 1820 /* U part */ 1821 pc4 = b->a + bdiag[i+4]+1; 1822 pj = b->j + bdiag[i+4]+1; 1823 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1824 for (j=0; j<nz; j++) { 1825 col = pj[j]; 1826 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1827 } 1828 1829 sctx.rs = rs; 1830 sctx.pv = rtmp4[i+3]; 1831 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr); 1832 if (sctx.newshift) break; 1833 pc4 = b->a + bdiag[i+3]; 1834 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1835 break; 1836 1837 default: 1838 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1839 } 1840 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1841 i += nodesz; /* Update the row */ 1842 } 1843 1844 /* MatPivotRefine() */ 1845 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1846 /* 1847 * if no shift in this attempt & shifting & started shifting & can refine, 1848 * then try lower shift 1849 */ 1850 sctx.shift_hi = sctx.shift_fraction; 1851 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1852 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1853 sctx.newshift = PETSC_TRUE; 1854 sctx.nshift++; 1855 } 1856 } while (sctx.newshift); 1857 1858 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1859 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1860 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1861 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1862 1863 if (b->inode.size) { 1864 C->ops->solve = MatSolve_SeqAIJ_Inode; 1865 } else { 1866 C->ops->solve = MatSolve_SeqAIJ; 1867 } 1868 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1869 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1870 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1871 C->ops->matsolve = MatMatSolve_SeqAIJ; 1872 C->assembled = PETSC_TRUE; 1873 C->preallocated = PETSC_TRUE; 1874 1875 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1876 1877 /* MatShiftView(A,info,&sctx) */ 1878 if (sctx.nshift) { 1879 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1880 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1881 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1882 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1883 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1884 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1885 } 1886 } 1887 PetscFunctionReturn(0); 1888 } 1889 1890 #undef __FUNCT__ 1891 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1892 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1893 { 1894 Mat C = B; 1895 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1896 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1897 PetscErrorCode ierr; 1898 const PetscInt *r,*ic,*c,*ics; 1899 PetscInt n = A->rmap->n,*bi = b->i; 1900 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1901 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1902 PetscInt *ai = a->i,*aj = a->j; 1903 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1904 PetscScalar mul1,mul2,mul3,tmp; 1905 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1906 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1907 PetscReal rs=0.0; 1908 FactorShiftCtx sctx; 1909 1910 PetscFunctionBegin; 1911 sctx.shift_top = 0; 1912 sctx.nshift_max = 0; 1913 sctx.shift_lo = 0; 1914 sctx.shift_hi = 0; 1915 sctx.shift_fraction = 0; 1916 1917 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1918 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1919 sctx.shift_top = 0; 1920 for (i=0; i<n; i++) { 1921 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1922 rs = 0.0; 1923 ajtmp = aj + ai[i]; 1924 rtmp1 = aa + ai[i]; 1925 nz = ai[i+1] - ai[i]; 1926 for (j=0; j<nz; j++) { 1927 if (*ajtmp != i) { 1928 rs += PetscAbsScalar(*rtmp1++); 1929 } else { 1930 rs -= PetscRealPart(*rtmp1++); 1931 } 1932 ajtmp++; 1933 } 1934 if (rs>sctx.shift_top) sctx.shift_top = rs; 1935 } 1936 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1937 sctx.shift_top *= 1.1; 1938 sctx.nshift_max = 5; 1939 sctx.shift_lo = 0.; 1940 sctx.shift_hi = 1.; 1941 } 1942 sctx.shift_amount = 0; 1943 sctx.nshift = 0; 1944 1945 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1946 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1947 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1948 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1949 ics = ic; 1950 1951 node_max = a->inode.node_count; 1952 ns = a->inode.size; 1953 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1954 1955 /* If max inode size > 3, split it into two inodes.*/ 1956 /* also map the inode sizes according to the ordering */ 1957 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1958 for (i=0,j=0; i<node_max; ++i,++j) { 1959 if (ns[i]>3) { 1960 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1961 ++j; 1962 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1963 } else { 1964 tmp_vec1[j] = ns[i]; 1965 } 1966 } 1967 /* Use the correct node_max */ 1968 node_max = j; 1969 1970 /* Now reorder the inode info based on mat re-ordering info */ 1971 /* First create a row -> inode_size_array_index map */ 1972 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1973 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1974 for (i=0,row=0; i<node_max; i++) { 1975 nodesz = tmp_vec1[i]; 1976 for (j=0; j<nodesz; j++,row++) { 1977 nsmap[row] = i; 1978 } 1979 } 1980 /* Using nsmap, create a reordered ns structure */ 1981 for (i=0,j=0; i< node_max; i++) { 1982 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1983 tmp_vec2[i] = nodesz; 1984 j += nodesz; 1985 } 1986 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1987 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1988 /* Now use the correct ns */ 1989 ns = tmp_vec2; 1990 1991 do { 1992 sctx.newshift = PETSC_FALSE; 1993 /* Now loop over each block-row, and do the factorization */ 1994 for (i=0,row=0; i<node_max; i++) { 1995 nodesz = ns[i]; 1996 nz = bi[row+1] - bi[row]; 1997 bjtmp = bj + bi[row]; 1998 1999 switch (nodesz) { 2000 case 1: 2001 for (j=0; j<nz; j++) { 2002 idx = bjtmp[j]; 2003 rtmp11[idx] = 0.0; 2004 } 2005 2006 /* load in initial (unfactored row) */ 2007 idx = r[row]; 2008 nz_tmp = ai[idx+1] - ai[idx]; 2009 ajtmp = aj + ai[idx]; 2010 v1 = aa + ai[idx]; 2011 2012 for (j=0; j<nz_tmp; j++) { 2013 idx = ics[ajtmp[j]]; 2014 rtmp11[idx] = v1[j]; 2015 } 2016 rtmp11[ics[r[row]]] += sctx.shift_amount; 2017 2018 prow = *bjtmp++; 2019 while (prow < row) { 2020 pc1 = rtmp11 + prow; 2021 if (*pc1 != 0.0) { 2022 pv = ba + bd[prow]; 2023 pj = nbj + bd[prow]; 2024 mul1 = *pc1 * *pv++; 2025 *pc1 = mul1; 2026 nz_tmp = bi[prow+1] - bd[prow] - 1; 2027 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2028 for (j=0; j<nz_tmp; j++) { 2029 tmp = pv[j]; 2030 idx = pj[j]; 2031 rtmp11[idx] -= mul1 * tmp; 2032 } 2033 } 2034 prow = *bjtmp++; 2035 } 2036 pj = bj + bi[row]; 2037 pc1 = ba + bi[row]; 2038 2039 sctx.pv = rtmp11[row]; 2040 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2041 rs = 0.0; 2042 for (j=0; j<nz; j++) { 2043 idx = pj[j]; 2044 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2045 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2046 } 2047 sctx.rs = rs; 2048 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2049 if (sctx.newshift) goto endofwhile; 2050 break; 2051 2052 case 2: 2053 for (j=0; j<nz; j++) { 2054 idx = bjtmp[j]; 2055 rtmp11[idx] = 0.0; 2056 rtmp22[idx] = 0.0; 2057 } 2058 2059 /* load in initial (unfactored row) */ 2060 idx = r[row]; 2061 nz_tmp = ai[idx+1] - ai[idx]; 2062 ajtmp = aj + ai[idx]; 2063 v1 = aa + ai[idx]; 2064 v2 = aa + ai[idx+1]; 2065 for (j=0; j<nz_tmp; j++) { 2066 idx = ics[ajtmp[j]]; 2067 rtmp11[idx] = v1[j]; 2068 rtmp22[idx] = v2[j]; 2069 } 2070 rtmp11[ics[r[row]]] += sctx.shift_amount; 2071 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2072 2073 prow = *bjtmp++; 2074 while (prow < row) { 2075 pc1 = rtmp11 + prow; 2076 pc2 = rtmp22 + prow; 2077 if (*pc1 != 0.0 || *pc2 != 0.0) { 2078 pv = ba + bd[prow]; 2079 pj = nbj + bd[prow]; 2080 mul1 = *pc1 * *pv; 2081 mul2 = *pc2 * *pv; 2082 ++pv; 2083 *pc1 = mul1; 2084 *pc2 = mul2; 2085 2086 nz_tmp = bi[prow+1] - bd[prow] - 1; 2087 for (j=0; j<nz_tmp; j++) { 2088 tmp = pv[j]; 2089 idx = pj[j]; 2090 rtmp11[idx] -= mul1 * tmp; 2091 rtmp22[idx] -= mul2 * tmp; 2092 } 2093 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2094 } 2095 prow = *bjtmp++; 2096 } 2097 2098 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2099 pc1 = rtmp11 + prow; 2100 pc2 = rtmp22 + prow; 2101 2102 sctx.pv = *pc1; 2103 pj = bj + bi[prow]; 2104 rs = 0.0; 2105 for (j=0; j<nz; j++) { 2106 idx = pj[j]; 2107 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2108 } 2109 sctx.rs = rs; 2110 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2111 if (sctx.newshift) goto endofwhile; 2112 2113 if (*pc2 != 0.0) { 2114 pj = nbj + bd[prow]; 2115 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2116 *pc2 = mul2; 2117 nz_tmp = bi[prow+1] - bd[prow] - 1; 2118 for (j=0; j<nz_tmp; j++) { 2119 idx = pj[j]; 2120 tmp = rtmp11[idx]; 2121 rtmp22[idx] -= mul2 * tmp; 2122 } 2123 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2124 } 2125 2126 pj = bj + bi[row]; 2127 pc1 = ba + bi[row]; 2128 pc2 = ba + bi[row+1]; 2129 2130 sctx.pv = rtmp22[row+1]; 2131 rs = 0.0; 2132 rtmp11[row] = 1.0/rtmp11[row]; 2133 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2134 /* copy row entries from dense representation to sparse */ 2135 for (j=0; j<nz; j++) { 2136 idx = pj[j]; 2137 pc1[j] = rtmp11[idx]; 2138 pc2[j] = rtmp22[idx]; 2139 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2140 } 2141 sctx.rs = rs; 2142 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2143 if (sctx.newshift) goto endofwhile; 2144 break; 2145 2146 case 3: 2147 for (j=0; j<nz; j++) { 2148 idx = bjtmp[j]; 2149 rtmp11[idx] = 0.0; 2150 rtmp22[idx] = 0.0; 2151 rtmp33[idx] = 0.0; 2152 } 2153 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2154 idx = r[row]; 2155 nz_tmp = ai[idx+1] - ai[idx]; 2156 ajtmp = aj + ai[idx]; 2157 v1 = aa + ai[idx]; 2158 v2 = aa + ai[idx+1]; 2159 v3 = aa + ai[idx+2]; 2160 for (j=0; j<nz_tmp; j++) { 2161 idx = ics[ajtmp[j]]; 2162 rtmp11[idx] = v1[j]; 2163 rtmp22[idx] = v2[j]; 2164 rtmp33[idx] = v3[j]; 2165 } 2166 rtmp11[ics[r[row]]] += sctx.shift_amount; 2167 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2168 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2169 2170 /* loop over all pivot row blocks above this row block */ 2171 prow = *bjtmp++; 2172 while (prow < row) { 2173 pc1 = rtmp11 + prow; 2174 pc2 = rtmp22 + prow; 2175 pc3 = rtmp33 + prow; 2176 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2177 pv = ba + bd[prow]; 2178 pj = nbj + bd[prow]; 2179 mul1 = *pc1 * *pv; 2180 mul2 = *pc2 * *pv; 2181 mul3 = *pc3 * *pv; 2182 ++pv; 2183 *pc1 = mul1; 2184 *pc2 = mul2; 2185 *pc3 = mul3; 2186 2187 nz_tmp = bi[prow+1] - bd[prow] - 1; 2188 /* update this row based on pivot row */ 2189 for (j=0; j<nz_tmp; j++) { 2190 tmp = pv[j]; 2191 idx = pj[j]; 2192 rtmp11[idx] -= mul1 * tmp; 2193 rtmp22[idx] -= mul2 * tmp; 2194 rtmp33[idx] -= mul3 * tmp; 2195 } 2196 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2197 } 2198 prow = *bjtmp++; 2199 } 2200 2201 /* Now take care of diagonal 3x3 block in this set of rows */ 2202 /* note: prow = row here */ 2203 pc1 = rtmp11 + prow; 2204 pc2 = rtmp22 + prow; 2205 pc3 = rtmp33 + prow; 2206 2207 sctx.pv = *pc1; 2208 pj = bj + bi[prow]; 2209 rs = 0.0; 2210 for (j=0; j<nz; j++) { 2211 idx = pj[j]; 2212 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2213 } 2214 sctx.rs = rs; 2215 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2216 if (sctx.newshift) goto endofwhile; 2217 2218 if (*pc2 != 0.0 || *pc3 != 0.0) { 2219 mul2 = (*pc2)/(*pc1); 2220 mul3 = (*pc3)/(*pc1); 2221 *pc2 = mul2; 2222 *pc3 = mul3; 2223 nz_tmp = bi[prow+1] - bd[prow] - 1; 2224 pj = nbj + bd[prow]; 2225 for (j=0; j<nz_tmp; j++) { 2226 idx = pj[j]; 2227 tmp = rtmp11[idx]; 2228 rtmp22[idx] -= mul2 * tmp; 2229 rtmp33[idx] -= mul3 * tmp; 2230 } 2231 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2232 } 2233 ++prow; 2234 2235 pc2 = rtmp22 + prow; 2236 pc3 = rtmp33 + prow; 2237 sctx.pv = *pc2; 2238 pj = bj + bi[prow]; 2239 rs = 0.0; 2240 for (j=0; j<nz; j++) { 2241 idx = pj[j]; 2242 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2243 } 2244 sctx.rs = rs; 2245 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2246 if (sctx.newshift) goto endofwhile; 2247 2248 if (*pc3 != 0.0) { 2249 mul3 = (*pc3)/(*pc2); 2250 *pc3 = mul3; 2251 pj = nbj + bd[prow]; 2252 nz_tmp = bi[prow+1] - bd[prow] - 1; 2253 for (j=0; j<nz_tmp; j++) { 2254 idx = pj[j]; 2255 tmp = rtmp22[idx]; 2256 rtmp33[idx] -= mul3 * tmp; 2257 } 2258 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2259 } 2260 2261 pj = bj + bi[row]; 2262 pc1 = ba + bi[row]; 2263 pc2 = ba + bi[row+1]; 2264 pc3 = ba + bi[row+2]; 2265 2266 sctx.pv = rtmp33[row+2]; 2267 rs = 0.0; 2268 rtmp11[row] = 1.0/rtmp11[row]; 2269 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2270 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2271 /* copy row entries from dense representation to sparse */ 2272 for (j=0; j<nz; j++) { 2273 idx = pj[j]; 2274 pc1[j] = rtmp11[idx]; 2275 pc2[j] = rtmp22[idx]; 2276 pc3[j] = rtmp33[idx]; 2277 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2278 } 2279 2280 sctx.rs = rs; 2281 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr); 2282 if (sctx.newshift) goto endofwhile; 2283 break; 2284 2285 default: 2286 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2287 } 2288 row += nodesz; /* Update the row */ 2289 } 2290 endofwhile:; 2291 } while (sctx.newshift); 2292 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2293 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2294 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2295 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2296 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2297 2298 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2299 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2300 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2301 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2302 C->assembled = PETSC_TRUE; 2303 C->preallocated = PETSC_TRUE; 2304 if (sctx.nshift) { 2305 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2306 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2307 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2308 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2309 } 2310 } 2311 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2312 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2313 PetscFunctionReturn(0); 2314 } 2315 2316 2317 /* ----------------------------------------------------------- */ 2318 #undef __FUNCT__ 2319 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2320 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2321 { 2322 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2323 IS iscol = a->col,isrow = a->row; 2324 PetscErrorCode ierr; 2325 const PetscInt *r,*c,*rout,*cout; 2326 PetscInt i,j,n = A->rmap->n; 2327 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2328 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2329 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2330 PetscScalar sum1,sum2,sum3,sum4,sum5; 2331 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2332 const PetscScalar *b; 2333 2334 PetscFunctionBegin; 2335 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2336 node_max = a->inode.node_count; 2337 ns = a->inode.size; /* Node Size array */ 2338 2339 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2340 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2341 tmp = a->solve_work; 2342 2343 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2344 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2345 2346 /* forward solve the lower triangular */ 2347 tmps = tmp; 2348 aa = a_a; 2349 aj = a_j; 2350 ad = a->diag; 2351 2352 for (i = 0,row = 0; i< node_max; ++i) { 2353 nsz = ns[i]; 2354 aii = ai[row]; 2355 v1 = aa + aii; 2356 vi = aj + aii; 2357 nz = ai[row+1]- ai[row]; 2358 2359 if (i < node_max-1) { 2360 /* Prefetch the indices for the next block */ 2361 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2362 /* Prefetch the data for the next block */ 2363 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2364 } 2365 2366 switch (nsz) { /* Each loop in 'case' is unrolled */ 2367 case 1: 2368 sum1 = b[r[row]]; 2369 for (j=0; j<nz-1; j+=2) { 2370 i0 = vi[j]; 2371 i1 = vi[j+1]; 2372 tmp0 = tmps[i0]; 2373 tmp1 = tmps[i1]; 2374 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2375 } 2376 if (j == nz-1) { 2377 tmp0 = tmps[vi[j]]; 2378 sum1 -= v1[j]*tmp0; 2379 } 2380 tmp[row++]=sum1; 2381 break; 2382 case 2: 2383 sum1 = b[r[row]]; 2384 sum2 = b[r[row+1]]; 2385 v2 = aa + ai[row+1]; 2386 2387 for (j=0; j<nz-1; j+=2) { 2388 i0 = vi[j]; 2389 i1 = vi[j+1]; 2390 tmp0 = tmps[i0]; 2391 tmp1 = tmps[i1]; 2392 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2393 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2394 } 2395 if (j == nz-1) { 2396 tmp0 = tmps[vi[j]]; 2397 sum1 -= v1[j] *tmp0; 2398 sum2 -= v2[j] *tmp0; 2399 } 2400 sum2 -= v2[nz] * sum1; 2401 tmp[row++]=sum1; 2402 tmp[row++]=sum2; 2403 break; 2404 case 3: 2405 sum1 = b[r[row]]; 2406 sum2 = b[r[row+1]]; 2407 sum3 = b[r[row+2]]; 2408 v2 = aa + ai[row+1]; 2409 v3 = aa + ai[row+2]; 2410 2411 for (j=0; j<nz-1; j+=2) { 2412 i0 = vi[j]; 2413 i1 = vi[j+1]; 2414 tmp0 = tmps[i0]; 2415 tmp1 = tmps[i1]; 2416 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2417 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2418 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2419 } 2420 if (j == nz-1) { 2421 tmp0 = tmps[vi[j]]; 2422 sum1 -= v1[j] *tmp0; 2423 sum2 -= v2[j] *tmp0; 2424 sum3 -= v3[j] *tmp0; 2425 } 2426 sum2 -= v2[nz] * sum1; 2427 sum3 -= v3[nz] * sum1; 2428 sum3 -= v3[nz+1] * sum2; 2429 tmp[row++]=sum1; 2430 tmp[row++]=sum2; 2431 tmp[row++]=sum3; 2432 break; 2433 2434 case 4: 2435 sum1 = b[r[row]]; 2436 sum2 = b[r[row+1]]; 2437 sum3 = b[r[row+2]]; 2438 sum4 = b[r[row+3]]; 2439 v2 = aa + ai[row+1]; 2440 v3 = aa + ai[row+2]; 2441 v4 = aa + ai[row+3]; 2442 2443 for (j=0; j<nz-1; j+=2) { 2444 i0 = vi[j]; 2445 i1 = vi[j+1]; 2446 tmp0 = tmps[i0]; 2447 tmp1 = tmps[i1]; 2448 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2449 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2450 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2451 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2452 } 2453 if (j == nz-1) { 2454 tmp0 = tmps[vi[j]]; 2455 sum1 -= v1[j] *tmp0; 2456 sum2 -= v2[j] *tmp0; 2457 sum3 -= v3[j] *tmp0; 2458 sum4 -= v4[j] *tmp0; 2459 } 2460 sum2 -= v2[nz] * sum1; 2461 sum3 -= v3[nz] * sum1; 2462 sum4 -= v4[nz] * sum1; 2463 sum3 -= v3[nz+1] * sum2; 2464 sum4 -= v4[nz+1] * sum2; 2465 sum4 -= v4[nz+2] * sum3; 2466 2467 tmp[row++]=sum1; 2468 tmp[row++]=sum2; 2469 tmp[row++]=sum3; 2470 tmp[row++]=sum4; 2471 break; 2472 case 5: 2473 sum1 = b[r[row]]; 2474 sum2 = b[r[row+1]]; 2475 sum3 = b[r[row+2]]; 2476 sum4 = b[r[row+3]]; 2477 sum5 = b[r[row+4]]; 2478 v2 = aa + ai[row+1]; 2479 v3 = aa + ai[row+2]; 2480 v4 = aa + ai[row+3]; 2481 v5 = aa + ai[row+4]; 2482 2483 for (j=0; j<nz-1; j+=2) { 2484 i0 = vi[j]; 2485 i1 = vi[j+1]; 2486 tmp0 = tmps[i0]; 2487 tmp1 = tmps[i1]; 2488 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2489 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2490 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2491 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2492 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2493 } 2494 if (j == nz-1) { 2495 tmp0 = tmps[vi[j]]; 2496 sum1 -= v1[j] *tmp0; 2497 sum2 -= v2[j] *tmp0; 2498 sum3 -= v3[j] *tmp0; 2499 sum4 -= v4[j] *tmp0; 2500 sum5 -= v5[j] *tmp0; 2501 } 2502 2503 sum2 -= v2[nz] * sum1; 2504 sum3 -= v3[nz] * sum1; 2505 sum4 -= v4[nz] * sum1; 2506 sum5 -= v5[nz] * sum1; 2507 sum3 -= v3[nz+1] * sum2; 2508 sum4 -= v4[nz+1] * sum2; 2509 sum5 -= v5[nz+1] * sum2; 2510 sum4 -= v4[nz+2] * sum3; 2511 sum5 -= v5[nz+2] * sum3; 2512 sum5 -= v5[nz+3] * sum4; 2513 2514 tmp[row++]=sum1; 2515 tmp[row++]=sum2; 2516 tmp[row++]=sum3; 2517 tmp[row++]=sum4; 2518 tmp[row++]=sum5; 2519 break; 2520 default: 2521 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2522 } 2523 } 2524 /* backward solve the upper triangular */ 2525 for (i=node_max -1,row = n-1; i>=0; i--) { 2526 nsz = ns[i]; 2527 aii = ad[row+1] + 1; 2528 v1 = aa + aii; 2529 vi = aj + aii; 2530 nz = ad[row]- ad[row+1] - 1; 2531 2532 if (i > 0) { 2533 /* Prefetch the indices for the next block */ 2534 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2535 /* Prefetch the data for the next block */ 2536 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2537 } 2538 2539 switch (nsz) { /* Each loop in 'case' is unrolled */ 2540 case 1: 2541 sum1 = tmp[row]; 2542 2543 for (j=0; j<nz-1; j+=2) { 2544 i0 = vi[j]; 2545 i1 = vi[j+1]; 2546 tmp0 = tmps[i0]; 2547 tmp1 = tmps[i1]; 2548 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2549 } 2550 if (j == nz-1) { 2551 tmp0 = tmps[vi[j]]; 2552 sum1 -= v1[j]*tmp0; 2553 } 2554 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2555 break; 2556 case 2: 2557 sum1 = tmp[row]; 2558 sum2 = tmp[row-1]; 2559 v2 = aa + ad[row] + 1; 2560 for (j=0; j<nz-1; j+=2) { 2561 i0 = vi[j]; 2562 i1 = vi[j+1]; 2563 tmp0 = tmps[i0]; 2564 tmp1 = tmps[i1]; 2565 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2566 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2567 } 2568 if (j == nz-1) { 2569 tmp0 = tmps[vi[j]]; 2570 sum1 -= v1[j]* tmp0; 2571 sum2 -= v2[j+1]* tmp0; 2572 } 2573 2574 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2575 sum2 -= v2[0] * tmp0; 2576 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2577 break; 2578 case 3: 2579 sum1 = tmp[row]; 2580 sum2 = tmp[row -1]; 2581 sum3 = tmp[row -2]; 2582 v2 = aa + ad[row] + 1; 2583 v3 = aa + ad[row -1] + 1; 2584 for (j=0; j<nz-1; j+=2) { 2585 i0 = vi[j]; 2586 i1 = vi[j+1]; 2587 tmp0 = tmps[i0]; 2588 tmp1 = tmps[i1]; 2589 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2590 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2591 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2592 } 2593 if (j== nz-1) { 2594 tmp0 = tmps[vi[j]]; 2595 sum1 -= v1[j] * tmp0; 2596 sum2 -= v2[j+1] * tmp0; 2597 sum3 -= v3[j+2] * tmp0; 2598 } 2599 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2600 sum2 -= v2[0]* tmp0; 2601 sum3 -= v3[1] * tmp0; 2602 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2603 sum3 -= v3[0]* tmp0; 2604 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2605 2606 break; 2607 case 4: 2608 sum1 = tmp[row]; 2609 sum2 = tmp[row -1]; 2610 sum3 = tmp[row -2]; 2611 sum4 = tmp[row -3]; 2612 v2 = aa + ad[row]+1; 2613 v3 = aa + ad[row -1]+1; 2614 v4 = aa + ad[row -2]+1; 2615 2616 for (j=0; j<nz-1; j+=2) { 2617 i0 = vi[j]; 2618 i1 = vi[j+1]; 2619 tmp0 = tmps[i0]; 2620 tmp1 = tmps[i1]; 2621 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2622 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2623 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2624 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2625 } 2626 if (j== nz-1) { 2627 tmp0 = tmps[vi[j]]; 2628 sum1 -= v1[j] * tmp0; 2629 sum2 -= v2[j+1] * tmp0; 2630 sum3 -= v3[j+2] * tmp0; 2631 sum4 -= v4[j+3] * tmp0; 2632 } 2633 2634 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2635 sum2 -= v2[0] * tmp0; 2636 sum3 -= v3[1] * tmp0; 2637 sum4 -= v4[2] * tmp0; 2638 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2639 sum3 -= v3[0] * tmp0; 2640 sum4 -= v4[1] * tmp0; 2641 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2642 sum4 -= v4[0] * tmp0; 2643 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2644 break; 2645 case 5: 2646 sum1 = tmp[row]; 2647 sum2 = tmp[row -1]; 2648 sum3 = tmp[row -2]; 2649 sum4 = tmp[row -3]; 2650 sum5 = tmp[row -4]; 2651 v2 = aa + ad[row]+1; 2652 v3 = aa + ad[row -1]+1; 2653 v4 = aa + ad[row -2]+1; 2654 v5 = aa + ad[row -3]+1; 2655 for (j=0; j<nz-1; j+=2) { 2656 i0 = vi[j]; 2657 i1 = vi[j+1]; 2658 tmp0 = tmps[i0]; 2659 tmp1 = tmps[i1]; 2660 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2661 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2662 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2663 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2664 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2665 } 2666 if (j==nz-1) { 2667 tmp0 = tmps[vi[j]]; 2668 sum1 -= v1[j] * tmp0; 2669 sum2 -= v2[j+1] * tmp0; 2670 sum3 -= v3[j+2] * tmp0; 2671 sum4 -= v4[j+3] * tmp0; 2672 sum5 -= v5[j+4] * tmp0; 2673 } 2674 2675 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2676 sum2 -= v2[0] * tmp0; 2677 sum3 -= v3[1] * tmp0; 2678 sum4 -= v4[2] * tmp0; 2679 sum5 -= v5[3] * tmp0; 2680 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2681 sum3 -= v3[0] * tmp0; 2682 sum4 -= v4[1] * tmp0; 2683 sum5 -= v5[2] * tmp0; 2684 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2685 sum4 -= v4[0] * tmp0; 2686 sum5 -= v5[1] * tmp0; 2687 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2688 sum5 -= v5[0] * tmp0; 2689 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2690 break; 2691 default: 2692 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2693 } 2694 } 2695 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2696 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2697 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2698 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2699 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2700 PetscFunctionReturn(0); 2701 } 2702 2703 2704 /* 2705 Makes a longer coloring[] array and calls the usual code with that 2706 */ 2707 #undef __FUNCT__ 2708 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2709 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2710 { 2711 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2712 PetscErrorCode ierr; 2713 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2714 PetscInt *colorused,i; 2715 ISColoringValue *newcolor; 2716 2717 PetscFunctionBegin; 2718 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2719 /* loop over inodes, marking a color for each column*/ 2720 row = 0; 2721 for (i=0; i<m; i++) { 2722 for (j=0; j<ns[i]; j++) { 2723 newcolor[row++] = coloring[i] + j*ncolors; 2724 } 2725 } 2726 2727 /* eliminate unneeded colors */ 2728 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2729 for (i=0; i<n; i++) { 2730 colorused[newcolor[i]] = 1; 2731 } 2732 2733 for (i=1; i<5*ncolors; i++) { 2734 colorused[i] += colorused[i-1]; 2735 } 2736 ncolors = colorused[5*ncolors-1]; 2737 for (i=0; i<n; i++) { 2738 newcolor[i] = colorused[newcolor[i]]-1; 2739 } 2740 ierr = PetscFree(colorused);CHKERRQ(ierr); 2741 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2742 ierr = PetscFree(coloring);CHKERRQ(ierr); 2743 PetscFunctionReturn(0); 2744 } 2745 2746 #include <petsc/private/kernels/blockinvert.h> 2747 2748 #undef __FUNCT__ 2749 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2750 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2751 { 2752 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2753 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2754 MatScalar *ibdiag,*bdiag,work[25],*t; 2755 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2756 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2757 const PetscScalar *xb, *b; 2758 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2759 PetscErrorCode ierr; 2760 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2761 PetscInt sz,k,ipvt[5]; 2762 PetscBool zeropivotdetected=PETSC_FALSE; 2763 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2764 2765 PetscFunctionBegin; 2766 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2767 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 2768 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2769 2770 if (!a->inode.ibdiagvalid) { 2771 if (!a->inode.ibdiag) { 2772 /* calculate space needed for diagonal blocks */ 2773 for (i=0; i<m; i++) { 2774 cnt += sizes[i]*sizes[i]; 2775 } 2776 a->inode.bdiagsize = cnt; 2777 2778 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2779 } 2780 2781 /* copy over the diagonal blocks and invert them */ 2782 ibdiag = a->inode.ibdiag; 2783 bdiag = a->inode.bdiag; 2784 cnt = 0; 2785 for (i=0, row = 0; i<m; i++) { 2786 for (j=0; j<sizes[i]; j++) { 2787 for (k=0; k<sizes[i]; k++) { 2788 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2789 } 2790 } 2791 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2792 2793 switch (sizes[i]) { 2794 case 1: 2795 /* Create matrix data structure */ 2796 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2797 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2798 break; 2799 case 2: 2800 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2801 break; 2802 case 3: 2803 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,!A->erroriffailure,&zeropivotdetected);CHKERRQ(ierr); 2804 break; 2805 case 4: 2806 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2807 break; 2808 case 5: 2809 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2810 break; 2811 default: 2812 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2813 } 2814 if (zeropivotdetected) { 2815 A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2816 } 2817 2818 cnt += sizes[i]*sizes[i]; 2819 row += sizes[i]; 2820 } 2821 a->inode.ibdiagvalid = PETSC_TRUE; 2822 } 2823 ibdiag = a->inode.ibdiag; 2824 bdiag = a->inode.bdiag; 2825 t = a->inode.ssor_work; 2826 2827 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2828 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2829 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2830 if (flag & SOR_ZERO_INITIAL_GUESS) { 2831 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2832 2833 for (i=0, row=0; i<m; i++) { 2834 sz = diag[row] - ii[row]; 2835 v1 = a->a + ii[row]; 2836 idx = a->j + ii[row]; 2837 2838 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2839 switch (sizes[i]) { 2840 case 1: 2841 2842 sum1 = b[row]; 2843 for (n = 0; n<sz-1; n+=2) { 2844 i1 = idx[0]; 2845 i2 = idx[1]; 2846 idx += 2; 2847 tmp0 = x[i1]; 2848 tmp1 = x[i2]; 2849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2850 } 2851 2852 if (n == sz-1) { 2853 tmp0 = x[*idx]; 2854 sum1 -= *v1 * tmp0; 2855 } 2856 t[row] = sum1; 2857 x[row++] = sum1*(*ibdiag++); 2858 break; 2859 case 2: 2860 v2 = a->a + ii[row+1]; 2861 sum1 = b[row]; 2862 sum2 = b[row+1]; 2863 for (n = 0; n<sz-1; n+=2) { 2864 i1 = idx[0]; 2865 i2 = idx[1]; 2866 idx += 2; 2867 tmp0 = x[i1]; 2868 tmp1 = x[i2]; 2869 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2870 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2871 } 2872 2873 if (n == sz-1) { 2874 tmp0 = x[*idx]; 2875 sum1 -= v1[0] * tmp0; 2876 sum2 -= v2[0] * tmp0; 2877 } 2878 t[row] = sum1; 2879 t[row+1] = sum2; 2880 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2881 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2882 ibdiag += 4; 2883 break; 2884 case 3: 2885 v2 = a->a + ii[row+1]; 2886 v3 = a->a + ii[row+2]; 2887 sum1 = b[row]; 2888 sum2 = b[row+1]; 2889 sum3 = b[row+2]; 2890 for (n = 0; n<sz-1; n+=2) { 2891 i1 = idx[0]; 2892 i2 = idx[1]; 2893 idx += 2; 2894 tmp0 = x[i1]; 2895 tmp1 = x[i2]; 2896 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2897 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2898 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2899 } 2900 2901 if (n == sz-1) { 2902 tmp0 = x[*idx]; 2903 sum1 -= v1[0] * tmp0; 2904 sum2 -= v2[0] * tmp0; 2905 sum3 -= v3[0] * tmp0; 2906 } 2907 t[row] = sum1; 2908 t[row+1] = sum2; 2909 t[row+2] = sum3; 2910 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2911 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2912 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2913 ibdiag += 9; 2914 break; 2915 case 4: 2916 v2 = a->a + ii[row+1]; 2917 v3 = a->a + ii[row+2]; 2918 v4 = a->a + ii[row+3]; 2919 sum1 = b[row]; 2920 sum2 = b[row+1]; 2921 sum3 = b[row+2]; 2922 sum4 = b[row+3]; 2923 for (n = 0; n<sz-1; n+=2) { 2924 i1 = idx[0]; 2925 i2 = idx[1]; 2926 idx += 2; 2927 tmp0 = x[i1]; 2928 tmp1 = x[i2]; 2929 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2930 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2931 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2932 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2933 } 2934 2935 if (n == sz-1) { 2936 tmp0 = x[*idx]; 2937 sum1 -= v1[0] * tmp0; 2938 sum2 -= v2[0] * tmp0; 2939 sum3 -= v3[0] * tmp0; 2940 sum4 -= v4[0] * tmp0; 2941 } 2942 t[row] = sum1; 2943 t[row+1] = sum2; 2944 t[row+2] = sum3; 2945 t[row+3] = sum4; 2946 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2947 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2948 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2949 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2950 ibdiag += 16; 2951 break; 2952 case 5: 2953 v2 = a->a + ii[row+1]; 2954 v3 = a->a + ii[row+2]; 2955 v4 = a->a + ii[row+3]; 2956 v5 = a->a + ii[row+4]; 2957 sum1 = b[row]; 2958 sum2 = b[row+1]; 2959 sum3 = b[row+2]; 2960 sum4 = b[row+3]; 2961 sum5 = b[row+4]; 2962 for (n = 0; n<sz-1; n+=2) { 2963 i1 = idx[0]; 2964 i2 = idx[1]; 2965 idx += 2; 2966 tmp0 = x[i1]; 2967 tmp1 = x[i2]; 2968 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2969 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2970 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2971 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2972 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2973 } 2974 2975 if (n == sz-1) { 2976 tmp0 = x[*idx]; 2977 sum1 -= v1[0] * tmp0; 2978 sum2 -= v2[0] * tmp0; 2979 sum3 -= v3[0] * tmp0; 2980 sum4 -= v4[0] * tmp0; 2981 sum5 -= v5[0] * tmp0; 2982 } 2983 t[row] = sum1; 2984 t[row+1] = sum2; 2985 t[row+2] = sum3; 2986 t[row+3] = sum4; 2987 t[row+4] = sum5; 2988 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2989 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2990 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2991 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2992 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2993 ibdiag += 25; 2994 break; 2995 default: 2996 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2997 } 2998 } 2999 3000 xb = t; 3001 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3002 } else xb = b; 3003 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3004 3005 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3006 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3007 ibdiag -= sizes[i]*sizes[i]; 3008 sz = ii[row+1] - diag[row] - 1; 3009 v1 = a->a + diag[row] + 1; 3010 idx = a->j + diag[row] + 1; 3011 3012 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3013 switch (sizes[i]) { 3014 case 1: 3015 3016 sum1 = xb[row]; 3017 for (n = 0; n<sz-1; n+=2) { 3018 i1 = idx[0]; 3019 i2 = idx[1]; 3020 idx += 2; 3021 tmp0 = x[i1]; 3022 tmp1 = x[i2]; 3023 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3024 } 3025 3026 if (n == sz-1) { 3027 tmp0 = x[*idx]; 3028 sum1 -= *v1*tmp0; 3029 } 3030 x[row--] = sum1*(*ibdiag); 3031 break; 3032 3033 case 2: 3034 3035 sum1 = xb[row]; 3036 sum2 = xb[row-1]; 3037 /* note that sum1 is associated with the second of the two rows */ 3038 v2 = a->a + diag[row-1] + 2; 3039 for (n = 0; n<sz-1; n+=2) { 3040 i1 = idx[0]; 3041 i2 = idx[1]; 3042 idx += 2; 3043 tmp0 = x[i1]; 3044 tmp1 = x[i2]; 3045 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3046 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3047 } 3048 3049 if (n == sz-1) { 3050 tmp0 = x[*idx]; 3051 sum1 -= *v1*tmp0; 3052 sum2 -= *v2*tmp0; 3053 } 3054 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3055 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3056 break; 3057 case 3: 3058 3059 sum1 = xb[row]; 3060 sum2 = xb[row-1]; 3061 sum3 = xb[row-2]; 3062 v2 = a->a + diag[row-1] + 2; 3063 v3 = a->a + diag[row-2] + 3; 3064 for (n = 0; n<sz-1; n+=2) { 3065 i1 = idx[0]; 3066 i2 = idx[1]; 3067 idx += 2; 3068 tmp0 = x[i1]; 3069 tmp1 = x[i2]; 3070 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3071 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3072 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3073 } 3074 3075 if (n == sz-1) { 3076 tmp0 = x[*idx]; 3077 sum1 -= *v1*tmp0; 3078 sum2 -= *v2*tmp0; 3079 sum3 -= *v3*tmp0; 3080 } 3081 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3082 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3083 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3084 break; 3085 case 4: 3086 3087 sum1 = xb[row]; 3088 sum2 = xb[row-1]; 3089 sum3 = xb[row-2]; 3090 sum4 = xb[row-3]; 3091 v2 = a->a + diag[row-1] + 2; 3092 v3 = a->a + diag[row-2] + 3; 3093 v4 = a->a + diag[row-3] + 4; 3094 for (n = 0; n<sz-1; n+=2) { 3095 i1 = idx[0]; 3096 i2 = idx[1]; 3097 idx += 2; 3098 tmp0 = x[i1]; 3099 tmp1 = x[i2]; 3100 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3101 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3102 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3103 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3104 } 3105 3106 if (n == sz-1) { 3107 tmp0 = x[*idx]; 3108 sum1 -= *v1*tmp0; 3109 sum2 -= *v2*tmp0; 3110 sum3 -= *v3*tmp0; 3111 sum4 -= *v4*tmp0; 3112 } 3113 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3114 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3115 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3116 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3117 break; 3118 case 5: 3119 3120 sum1 = xb[row]; 3121 sum2 = xb[row-1]; 3122 sum3 = xb[row-2]; 3123 sum4 = xb[row-3]; 3124 sum5 = xb[row-4]; 3125 v2 = a->a + diag[row-1] + 2; 3126 v3 = a->a + diag[row-2] + 3; 3127 v4 = a->a + diag[row-3] + 4; 3128 v5 = a->a + diag[row-4] + 5; 3129 for (n = 0; n<sz-1; n+=2) { 3130 i1 = idx[0]; 3131 i2 = idx[1]; 3132 idx += 2; 3133 tmp0 = x[i1]; 3134 tmp1 = x[i2]; 3135 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3136 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3137 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3138 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3139 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3140 } 3141 3142 if (n == sz-1) { 3143 tmp0 = x[*idx]; 3144 sum1 -= *v1*tmp0; 3145 sum2 -= *v2*tmp0; 3146 sum3 -= *v3*tmp0; 3147 sum4 -= *v4*tmp0; 3148 sum5 -= *v5*tmp0; 3149 } 3150 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3151 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3152 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3153 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3154 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3155 break; 3156 default: 3157 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3158 } 3159 } 3160 3161 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3162 } 3163 its--; 3164 } 3165 while (its--) { 3166 3167 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3168 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3169 i<m; 3170 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3171 3172 sz = diag[row] - ii[row]; 3173 v1 = a->a + ii[row]; 3174 idx = a->j + ii[row]; 3175 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3176 switch (sizes[i]) { 3177 case 1: 3178 sum1 = b[row]; 3179 for (n = 0; n<sz-1; n+=2) { 3180 i1 = idx[0]; 3181 i2 = idx[1]; 3182 idx += 2; 3183 tmp0 = x[i1]; 3184 tmp1 = x[i2]; 3185 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3186 } 3187 if (n == sz-1) { 3188 tmp0 = x[*idx++]; 3189 sum1 -= *v1 * tmp0; 3190 v1++; 3191 } 3192 t[row] = sum1; 3193 sz = ii[row+1] - diag[row] - 1; 3194 idx = a->j + diag[row] + 1; 3195 v1 += 1; 3196 for (n = 0; n<sz-1; n+=2) { 3197 i1 = idx[0]; 3198 i2 = idx[1]; 3199 idx += 2; 3200 tmp0 = x[i1]; 3201 tmp1 = x[i2]; 3202 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3203 } 3204 if (n == sz-1) { 3205 tmp0 = x[*idx++]; 3206 sum1 -= *v1 * tmp0; 3207 } 3208 /* in MatSOR_SeqAIJ this line would be 3209 * 3210 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3211 * 3212 * but omega == 1, so this becomes 3213 * 3214 * x[row] = sum1*(*ibdiag++); 3215 * 3216 */ 3217 x[row] = sum1*(*ibdiag); 3218 break; 3219 case 2: 3220 v2 = a->a + ii[row+1]; 3221 sum1 = b[row]; 3222 sum2 = b[row+1]; 3223 for (n = 0; n<sz-1; n+=2) { 3224 i1 = idx[0]; 3225 i2 = idx[1]; 3226 idx += 2; 3227 tmp0 = x[i1]; 3228 tmp1 = x[i2]; 3229 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3230 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3231 } 3232 if (n == sz-1) { 3233 tmp0 = x[*idx++]; 3234 sum1 -= v1[0] * tmp0; 3235 sum2 -= v2[0] * tmp0; 3236 v1++; v2++; 3237 } 3238 t[row] = sum1; 3239 t[row+1] = sum2; 3240 sz = ii[row+1] - diag[row] - 2; 3241 idx = a->j + diag[row] + 2; 3242 v1 += 2; 3243 v2 += 2; 3244 for (n = 0; n<sz-1; n+=2) { 3245 i1 = idx[0]; 3246 i2 = idx[1]; 3247 idx += 2; 3248 tmp0 = x[i1]; 3249 tmp1 = x[i2]; 3250 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3251 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3252 } 3253 if (n == sz-1) { 3254 tmp0 = x[*idx]; 3255 sum1 -= v1[0] * tmp0; 3256 sum2 -= v2[0] * tmp0; 3257 } 3258 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3259 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3260 break; 3261 case 3: 3262 v2 = a->a + ii[row+1]; 3263 v3 = a->a + ii[row+2]; 3264 sum1 = b[row]; 3265 sum2 = b[row+1]; 3266 sum3 = b[row+2]; 3267 for (n = 0; n<sz-1; n+=2) { 3268 i1 = idx[0]; 3269 i2 = idx[1]; 3270 idx += 2; 3271 tmp0 = x[i1]; 3272 tmp1 = x[i2]; 3273 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3274 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3275 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3276 } 3277 if (n == sz-1) { 3278 tmp0 = x[*idx++]; 3279 sum1 -= v1[0] * tmp0; 3280 sum2 -= v2[0] * tmp0; 3281 sum3 -= v3[0] * tmp0; 3282 v1++; v2++; v3++; 3283 } 3284 t[row] = sum1; 3285 t[row+1] = sum2; 3286 t[row+2] = sum3; 3287 sz = ii[row+1] - diag[row] - 3; 3288 idx = a->j + diag[row] + 3; 3289 v1 += 3; 3290 v2 += 3; 3291 v3 += 3; 3292 for (n = 0; n<sz-1; n+=2) { 3293 i1 = idx[0]; 3294 i2 = idx[1]; 3295 idx += 2; 3296 tmp0 = x[i1]; 3297 tmp1 = x[i2]; 3298 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3299 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3300 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3301 } 3302 if (n == sz-1) { 3303 tmp0 = x[*idx]; 3304 sum1 -= v1[0] * tmp0; 3305 sum2 -= v2[0] * tmp0; 3306 sum3 -= v3[0] * tmp0; 3307 } 3308 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3309 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3310 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3311 break; 3312 case 4: 3313 v2 = a->a + ii[row+1]; 3314 v3 = a->a + ii[row+2]; 3315 v4 = a->a + ii[row+3]; 3316 sum1 = b[row]; 3317 sum2 = b[row+1]; 3318 sum3 = b[row+2]; 3319 sum4 = b[row+3]; 3320 for (n = 0; n<sz-1; n+=2) { 3321 i1 = idx[0]; 3322 i2 = idx[1]; 3323 idx += 2; 3324 tmp0 = x[i1]; 3325 tmp1 = x[i2]; 3326 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3327 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3328 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3329 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3330 } 3331 if (n == sz-1) { 3332 tmp0 = x[*idx++]; 3333 sum1 -= v1[0] * tmp0; 3334 sum2 -= v2[0] * tmp0; 3335 sum3 -= v3[0] * tmp0; 3336 sum4 -= v4[0] * tmp0; 3337 v1++; v2++; v3++; v4++; 3338 } 3339 t[row] = sum1; 3340 t[row+1] = sum2; 3341 t[row+2] = sum3; 3342 t[row+3] = sum4; 3343 sz = ii[row+1] - diag[row] - 4; 3344 idx = a->j + diag[row] + 4; 3345 v1 += 4; 3346 v2 += 4; 3347 v3 += 4; 3348 v4 += 4; 3349 for (n = 0; n<sz-1; n+=2) { 3350 i1 = idx[0]; 3351 i2 = idx[1]; 3352 idx += 2; 3353 tmp0 = x[i1]; 3354 tmp1 = x[i2]; 3355 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3356 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3357 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3358 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3359 } 3360 if (n == sz-1) { 3361 tmp0 = x[*idx]; 3362 sum1 -= v1[0] * tmp0; 3363 sum2 -= v2[0] * tmp0; 3364 sum3 -= v3[0] * tmp0; 3365 sum4 -= v4[0] * tmp0; 3366 } 3367 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3368 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3369 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3370 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3371 break; 3372 case 5: 3373 v2 = a->a + ii[row+1]; 3374 v3 = a->a + ii[row+2]; 3375 v4 = a->a + ii[row+3]; 3376 v5 = a->a + ii[row+4]; 3377 sum1 = b[row]; 3378 sum2 = b[row+1]; 3379 sum3 = b[row+2]; 3380 sum4 = b[row+3]; 3381 sum5 = b[row+4]; 3382 for (n = 0; n<sz-1; n+=2) { 3383 i1 = idx[0]; 3384 i2 = idx[1]; 3385 idx += 2; 3386 tmp0 = x[i1]; 3387 tmp1 = x[i2]; 3388 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3389 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3390 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3391 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3392 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3393 } 3394 if (n == sz-1) { 3395 tmp0 = x[*idx++]; 3396 sum1 -= v1[0] * tmp0; 3397 sum2 -= v2[0] * tmp0; 3398 sum3 -= v3[0] * tmp0; 3399 sum4 -= v4[0] * tmp0; 3400 sum5 -= v5[0] * tmp0; 3401 v1++; v2++; v3++; v4++; v5++; 3402 } 3403 t[row] = sum1; 3404 t[row+1] = sum2; 3405 t[row+2] = sum3; 3406 t[row+3] = sum4; 3407 t[row+4] = sum5; 3408 sz = ii[row+1] - diag[row] - 5; 3409 idx = a->j + diag[row] + 5; 3410 v1 += 5; 3411 v2 += 5; 3412 v3 += 5; 3413 v4 += 5; 3414 v5 += 5; 3415 for (n = 0; n<sz-1; n+=2) { 3416 i1 = idx[0]; 3417 i2 = idx[1]; 3418 idx += 2; 3419 tmp0 = x[i1]; 3420 tmp1 = x[i2]; 3421 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3422 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3423 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3424 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3425 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3426 } 3427 if (n == sz-1) { 3428 tmp0 = x[*idx]; 3429 sum1 -= v1[0] * tmp0; 3430 sum2 -= v2[0] * tmp0; 3431 sum3 -= v3[0] * tmp0; 3432 sum4 -= v4[0] * tmp0; 3433 sum5 -= v5[0] * tmp0; 3434 } 3435 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3436 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3437 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3438 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3439 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3440 break; 3441 default: 3442 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3443 } 3444 } 3445 xb = t; 3446 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3447 } else xb = b; 3448 3449 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3450 3451 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3452 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3453 ibdiag -= sizes[i]*sizes[i]; 3454 3455 /* set RHS */ 3456 if (xb == b) { 3457 /* whole (old way) */ 3458 sz = ii[row+1] - ii[row]; 3459 idx = a->j + ii[row]; 3460 switch (sizes[i]) { 3461 case 5: 3462 v5 = a->a + ii[row-4]; 3463 case 4: /* fall through */ 3464 v4 = a->a + ii[row-3]; 3465 case 3: 3466 v3 = a->a + ii[row-2]; 3467 case 2: 3468 v2 = a->a + ii[row-1]; 3469 case 1: 3470 v1 = a->a + ii[row]; 3471 break; 3472 default: 3473 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3474 } 3475 } else { 3476 /* upper, no diag */ 3477 sz = ii[row+1] - diag[row] - 1; 3478 idx = a->j + diag[row] + 1; 3479 switch (sizes[i]) { 3480 case 5: 3481 v5 = a->a + diag[row-4] + 5; 3482 case 4: /* fall through */ 3483 v4 = a->a + diag[row-3] + 4; 3484 case 3: 3485 v3 = a->a + diag[row-2] + 3; 3486 case 2: 3487 v2 = a->a + diag[row-1] + 2; 3488 case 1: 3489 v1 = a->a + diag[row] + 1; 3490 } 3491 } 3492 /* set sum */ 3493 switch (sizes[i]) { 3494 case 5: 3495 sum5 = xb[row-4]; 3496 case 4: /* fall through */ 3497 sum4 = xb[row-3]; 3498 case 3: 3499 sum3 = xb[row-2]; 3500 case 2: 3501 sum2 = xb[row-1]; 3502 case 1: 3503 /* note that sum1 is associated with the last row */ 3504 sum1 = xb[row]; 3505 } 3506 /* do sums */ 3507 for (n = 0; n<sz-1; n+=2) { 3508 i1 = idx[0]; 3509 i2 = idx[1]; 3510 idx += 2; 3511 tmp0 = x[i1]; 3512 tmp1 = x[i2]; 3513 switch (sizes[i]) { 3514 case 5: 3515 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3516 case 4: /* fall through */ 3517 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3518 case 3: 3519 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3520 case 2: 3521 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3522 case 1: 3523 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3524 } 3525 } 3526 /* ragged edge */ 3527 if (n == sz-1) { 3528 tmp0 = x[*idx]; 3529 switch (sizes[i]) { 3530 case 5: 3531 sum5 -= *v5*tmp0; 3532 case 4: /* fall through */ 3533 sum4 -= *v4*tmp0; 3534 case 3: 3535 sum3 -= *v3*tmp0; 3536 case 2: 3537 sum2 -= *v2*tmp0; 3538 case 1: 3539 sum1 -= *v1*tmp0; 3540 } 3541 } 3542 /* update */ 3543 if (xb == b) { 3544 /* whole (old way) w/ diag */ 3545 switch (sizes[i]) { 3546 case 5: 3547 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3548 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3549 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3550 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3551 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3552 break; 3553 case 4: 3554 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3555 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3556 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3557 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3558 break; 3559 case 3: 3560 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3561 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3562 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3563 break; 3564 case 2: 3565 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3566 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3567 break; 3568 case 1: 3569 x[row--] += sum1*(*ibdiag); 3570 break; 3571 } 3572 } else { 3573 /* no diag so set = */ 3574 switch (sizes[i]) { 3575 case 5: 3576 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3577 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3578 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3579 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3580 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3581 break; 3582 case 4: 3583 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3584 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3585 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3586 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3587 break; 3588 case 3: 3589 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3590 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3591 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3592 break; 3593 case 2: 3594 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3595 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3596 break; 3597 case 1: 3598 x[row--] = sum1*(*ibdiag); 3599 break; 3600 } 3601 } 3602 } 3603 if (xb == b) { 3604 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3605 } else { 3606 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3607 } 3608 } 3609 } 3610 if (flag & SOR_EISENSTAT) { 3611 /* 3612 Apply (U + D)^-1 where D is now the block diagonal 3613 */ 3614 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3615 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3616 ibdiag -= sizes[i]*sizes[i]; 3617 sz = ii[row+1] - diag[row] - 1; 3618 v1 = a->a + diag[row] + 1; 3619 idx = a->j + diag[row] + 1; 3620 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3621 switch (sizes[i]) { 3622 case 1: 3623 3624 sum1 = b[row]; 3625 for (n = 0; n<sz-1; n+=2) { 3626 i1 = idx[0]; 3627 i2 = idx[1]; 3628 idx += 2; 3629 tmp0 = x[i1]; 3630 tmp1 = x[i2]; 3631 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3632 } 3633 3634 if (n == sz-1) { 3635 tmp0 = x[*idx]; 3636 sum1 -= *v1*tmp0; 3637 } 3638 x[row] = sum1*(*ibdiag);row--; 3639 break; 3640 3641 case 2: 3642 3643 sum1 = b[row]; 3644 sum2 = b[row-1]; 3645 /* note that sum1 is associated with the second of the two rows */ 3646 v2 = a->a + diag[row-1] + 2; 3647 for (n = 0; n<sz-1; n+=2) { 3648 i1 = idx[0]; 3649 i2 = idx[1]; 3650 idx += 2; 3651 tmp0 = x[i1]; 3652 tmp1 = x[i2]; 3653 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3654 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3655 } 3656 3657 if (n == sz-1) { 3658 tmp0 = x[*idx]; 3659 sum1 -= *v1*tmp0; 3660 sum2 -= *v2*tmp0; 3661 } 3662 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3663 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3664 row -= 2; 3665 break; 3666 case 3: 3667 3668 sum1 = b[row]; 3669 sum2 = b[row-1]; 3670 sum3 = b[row-2]; 3671 v2 = a->a + diag[row-1] + 2; 3672 v3 = a->a + diag[row-2] + 3; 3673 for (n = 0; n<sz-1; n+=2) { 3674 i1 = idx[0]; 3675 i2 = idx[1]; 3676 idx += 2; 3677 tmp0 = x[i1]; 3678 tmp1 = x[i2]; 3679 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3680 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3681 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3682 } 3683 3684 if (n == sz-1) { 3685 tmp0 = x[*idx]; 3686 sum1 -= *v1*tmp0; 3687 sum2 -= *v2*tmp0; 3688 sum3 -= *v3*tmp0; 3689 } 3690 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3691 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3692 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3693 row -= 3; 3694 break; 3695 case 4: 3696 3697 sum1 = b[row]; 3698 sum2 = b[row-1]; 3699 sum3 = b[row-2]; 3700 sum4 = b[row-3]; 3701 v2 = a->a + diag[row-1] + 2; 3702 v3 = a->a + diag[row-2] + 3; 3703 v4 = a->a + diag[row-3] + 4; 3704 for (n = 0; n<sz-1; n+=2) { 3705 i1 = idx[0]; 3706 i2 = idx[1]; 3707 idx += 2; 3708 tmp0 = x[i1]; 3709 tmp1 = x[i2]; 3710 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3711 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3712 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3713 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3714 } 3715 3716 if (n == sz-1) { 3717 tmp0 = x[*idx]; 3718 sum1 -= *v1*tmp0; 3719 sum2 -= *v2*tmp0; 3720 sum3 -= *v3*tmp0; 3721 sum4 -= *v4*tmp0; 3722 } 3723 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3724 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3725 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3726 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3727 row -= 4; 3728 break; 3729 case 5: 3730 3731 sum1 = b[row]; 3732 sum2 = b[row-1]; 3733 sum3 = b[row-2]; 3734 sum4 = b[row-3]; 3735 sum5 = b[row-4]; 3736 v2 = a->a + diag[row-1] + 2; 3737 v3 = a->a + diag[row-2] + 3; 3738 v4 = a->a + diag[row-3] + 4; 3739 v5 = a->a + diag[row-4] + 5; 3740 for (n = 0; n<sz-1; n+=2) { 3741 i1 = idx[0]; 3742 i2 = idx[1]; 3743 idx += 2; 3744 tmp0 = x[i1]; 3745 tmp1 = x[i2]; 3746 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3747 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3748 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3749 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3750 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3751 } 3752 3753 if (n == sz-1) { 3754 tmp0 = x[*idx]; 3755 sum1 -= *v1*tmp0; 3756 sum2 -= *v2*tmp0; 3757 sum3 -= *v3*tmp0; 3758 sum4 -= *v4*tmp0; 3759 sum5 -= *v5*tmp0; 3760 } 3761 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3762 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3763 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3764 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3765 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3766 row -= 5; 3767 break; 3768 default: 3769 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3770 } 3771 } 3772 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3773 3774 /* 3775 t = b - D x where D is the block diagonal 3776 */ 3777 cnt = 0; 3778 for (i=0, row=0; i<m; i++) { 3779 switch (sizes[i]) { 3780 case 1: 3781 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3782 break; 3783 case 2: 3784 x1 = x[row]; x2 = x[row+1]; 3785 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3786 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3787 t[row] = b[row] - tmp1; 3788 t[row+1] = b[row+1] - tmp2; row += 2; 3789 cnt += 4; 3790 break; 3791 case 3: 3792 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3793 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3794 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3795 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3796 t[row] = b[row] - tmp1; 3797 t[row+1] = b[row+1] - tmp2; 3798 t[row+2] = b[row+2] - tmp3; row += 3; 3799 cnt += 9; 3800 break; 3801 case 4: 3802 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3803 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3804 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3805 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3806 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3807 t[row] = b[row] - tmp1; 3808 t[row+1] = b[row+1] - tmp2; 3809 t[row+2] = b[row+2] - tmp3; 3810 t[row+3] = b[row+3] - tmp4; row += 4; 3811 cnt += 16; 3812 break; 3813 case 5: 3814 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3815 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3816 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3817 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3818 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3819 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3820 t[row] = b[row] - tmp1; 3821 t[row+1] = b[row+1] - tmp2; 3822 t[row+2] = b[row+2] - tmp3; 3823 t[row+3] = b[row+3] - tmp4; 3824 t[row+4] = b[row+4] - tmp5;row += 5; 3825 cnt += 25; 3826 break; 3827 default: 3828 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3829 } 3830 } 3831 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3832 3833 3834 3835 /* 3836 Apply (L + D)^-1 where D is the block diagonal 3837 */ 3838 for (i=0, row=0; i<m; i++) { 3839 sz = diag[row] - ii[row]; 3840 v1 = a->a + ii[row]; 3841 idx = a->j + ii[row]; 3842 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3843 switch (sizes[i]) { 3844 case 1: 3845 3846 sum1 = t[row]; 3847 for (n = 0; n<sz-1; n+=2) { 3848 i1 = idx[0]; 3849 i2 = idx[1]; 3850 idx += 2; 3851 tmp0 = t[i1]; 3852 tmp1 = t[i2]; 3853 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3854 } 3855 3856 if (n == sz-1) { 3857 tmp0 = t[*idx]; 3858 sum1 -= *v1 * tmp0; 3859 } 3860 x[row] += t[row] = sum1*(*ibdiag++); row++; 3861 break; 3862 case 2: 3863 v2 = a->a + ii[row+1]; 3864 sum1 = t[row]; 3865 sum2 = t[row+1]; 3866 for (n = 0; n<sz-1; n+=2) { 3867 i1 = idx[0]; 3868 i2 = idx[1]; 3869 idx += 2; 3870 tmp0 = t[i1]; 3871 tmp1 = t[i2]; 3872 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3873 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3874 } 3875 3876 if (n == sz-1) { 3877 tmp0 = t[*idx]; 3878 sum1 -= v1[0] * tmp0; 3879 sum2 -= v2[0] * tmp0; 3880 } 3881 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3882 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3883 ibdiag += 4; row += 2; 3884 break; 3885 case 3: 3886 v2 = a->a + ii[row+1]; 3887 v3 = a->a + ii[row+2]; 3888 sum1 = t[row]; 3889 sum2 = t[row+1]; 3890 sum3 = t[row+2]; 3891 for (n = 0; n<sz-1; n+=2) { 3892 i1 = idx[0]; 3893 i2 = idx[1]; 3894 idx += 2; 3895 tmp0 = t[i1]; 3896 tmp1 = t[i2]; 3897 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3898 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3899 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3900 } 3901 3902 if (n == sz-1) { 3903 tmp0 = t[*idx]; 3904 sum1 -= v1[0] * tmp0; 3905 sum2 -= v2[0] * tmp0; 3906 sum3 -= v3[0] * tmp0; 3907 } 3908 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3909 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3910 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3911 ibdiag += 9; row += 3; 3912 break; 3913 case 4: 3914 v2 = a->a + ii[row+1]; 3915 v3 = a->a + ii[row+2]; 3916 v4 = a->a + ii[row+3]; 3917 sum1 = t[row]; 3918 sum2 = t[row+1]; 3919 sum3 = t[row+2]; 3920 sum4 = t[row+3]; 3921 for (n = 0; n<sz-1; n+=2) { 3922 i1 = idx[0]; 3923 i2 = idx[1]; 3924 idx += 2; 3925 tmp0 = t[i1]; 3926 tmp1 = t[i2]; 3927 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3928 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3929 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3930 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3931 } 3932 3933 if (n == sz-1) { 3934 tmp0 = t[*idx]; 3935 sum1 -= v1[0] * tmp0; 3936 sum2 -= v2[0] * tmp0; 3937 sum3 -= v3[0] * tmp0; 3938 sum4 -= v4[0] * tmp0; 3939 } 3940 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3941 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3942 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3943 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3944 ibdiag += 16; row += 4; 3945 break; 3946 case 5: 3947 v2 = a->a + ii[row+1]; 3948 v3 = a->a + ii[row+2]; 3949 v4 = a->a + ii[row+3]; 3950 v5 = a->a + ii[row+4]; 3951 sum1 = t[row]; 3952 sum2 = t[row+1]; 3953 sum3 = t[row+2]; 3954 sum4 = t[row+3]; 3955 sum5 = t[row+4]; 3956 for (n = 0; n<sz-1; n+=2) { 3957 i1 = idx[0]; 3958 i2 = idx[1]; 3959 idx += 2; 3960 tmp0 = t[i1]; 3961 tmp1 = t[i2]; 3962 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3963 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3964 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3965 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3966 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3967 } 3968 3969 if (n == sz-1) { 3970 tmp0 = t[*idx]; 3971 sum1 -= v1[0] * tmp0; 3972 sum2 -= v2[0] * tmp0; 3973 sum3 -= v3[0] * tmp0; 3974 sum4 -= v4[0] * tmp0; 3975 sum5 -= v5[0] * tmp0; 3976 } 3977 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3978 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3979 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3980 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3981 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3982 ibdiag += 25; row += 5; 3983 break; 3984 default: 3985 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3986 } 3987 } 3988 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3989 } 3990 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3991 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3992 PetscFunctionReturn(0); 3993 } 3994 3995 #undef __FUNCT__ 3996 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3997 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3998 { 3999 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4000 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 4001 const MatScalar *bdiag = a->inode.bdiag; 4002 const PetscScalar *b; 4003 PetscErrorCode ierr; 4004 PetscInt m = a->inode.node_count,cnt = 0,i,row; 4005 const PetscInt *sizes = a->inode.size; 4006 4007 PetscFunctionBegin; 4008 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4009 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4010 cnt = 0; 4011 for (i=0, row=0; i<m; i++) { 4012 switch (sizes[i]) { 4013 case 1: 4014 x[row] = b[row]*bdiag[cnt++];row++; 4015 break; 4016 case 2: 4017 x1 = b[row]; x2 = b[row+1]; 4018 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 4019 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 4020 x[row++] = tmp1; 4021 x[row++] = tmp2; 4022 cnt += 4; 4023 break; 4024 case 3: 4025 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4026 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4027 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4028 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4029 x[row++] = tmp1; 4030 x[row++] = tmp2; 4031 x[row++] = tmp3; 4032 cnt += 9; 4033 break; 4034 case 4: 4035 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4036 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4037 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4038 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4039 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4040 x[row++] = tmp1; 4041 x[row++] = tmp2; 4042 x[row++] = tmp3; 4043 x[row++] = tmp4; 4044 cnt += 16; 4045 break; 4046 case 5: 4047 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4048 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4049 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4050 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4051 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4052 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4053 x[row++] = tmp1; 4054 x[row++] = tmp2; 4055 x[row++] = tmp3; 4056 x[row++] = tmp4; 4057 x[row++] = tmp5; 4058 cnt += 25; 4059 break; 4060 default: 4061 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4062 } 4063 } 4064 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4065 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4066 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4067 PetscFunctionReturn(0); 4068 } 4069 4070 /* 4071 samestructure indicates that the matrix has not changed its nonzero structure so we 4072 do not need to recompute the inodes 4073 */ 4074 #undef __FUNCT__ 4075 #define __FUNCT__ "MatSeqAIJCheckInode" 4076 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4077 { 4078 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4079 PetscErrorCode ierr; 4080 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4081 PetscBool flag; 4082 const PetscInt *idx,*idy,*ii; 4083 4084 PetscFunctionBegin; 4085 if (!a->inode.use) PetscFunctionReturn(0); 4086 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4087 4088 m = A->rmap->n; 4089 if (a->inode.size) ns = a->inode.size; 4090 else { 4091 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4092 } 4093 4094 i = 0; 4095 node_count = 0; 4096 idx = a->j; 4097 ii = a->i; 4098 while (i < m) { /* For each row */ 4099 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4100 /* Limits the number of elements in a node to 'a->inode.limit' */ 4101 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4102 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4103 if (nzy != nzx) break; 4104 idy += nzx; /* Same nonzero pattern */ 4105 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4106 if (!flag) break; 4107 } 4108 ns[node_count++] = blk_size; 4109 idx += blk_size*nzx; 4110 i = j; 4111 } 4112 /* If not enough inodes found,, do not use inode version of the routines */ 4113 if (!m || node_count > .8*m) { 4114 ierr = PetscFree(ns);CHKERRQ(ierr); 4115 4116 a->inode.node_count = 0; 4117 a->inode.size = NULL; 4118 a->inode.use = PETSC_FALSE; 4119 A->ops->mult = MatMult_SeqAIJ; 4120 A->ops->sor = MatSOR_SeqAIJ; 4121 A->ops->multadd = MatMultAdd_SeqAIJ; 4122 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4123 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4124 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4125 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4126 A->ops->coloringpatch = 0; 4127 A->ops->multdiagonalblock = 0; 4128 4129 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4130 } else { 4131 if (!A->factortype) { 4132 A->ops->mult = MatMult_SeqAIJ_Inode; 4133 A->ops->sor = MatSOR_SeqAIJ_Inode; 4134 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4135 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4136 if (A->rmap->n == A->cmap->n) { 4137 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4138 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4139 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4140 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4141 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4142 } 4143 } else { 4144 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4145 } 4146 a->inode.node_count = node_count; 4147 a->inode.size = ns; 4148 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4149 } 4150 a->inode.checked = PETSC_TRUE; 4151 a->inode.mat_nonzerostate = A->nonzerostate; 4152 PetscFunctionReturn(0); 4153 } 4154 4155 #undef __FUNCT__ 4156 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4157 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4158 { 4159 Mat B =*C; 4160 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4161 PetscErrorCode ierr; 4162 PetscInt m=A->rmap->n; 4163 4164 PetscFunctionBegin; 4165 c->inode.use = a->inode.use; 4166 c->inode.limit = a->inode.limit; 4167 c->inode.max_limit = a->inode.max_limit; 4168 if (a->inode.size) { 4169 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4170 c->inode.node_count = a->inode.node_count; 4171 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4172 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4173 if (!B->factortype) { 4174 B->ops->mult = MatMult_SeqAIJ_Inode; 4175 B->ops->sor = MatSOR_SeqAIJ_Inode; 4176 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4177 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4178 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4179 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4180 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4181 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4182 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4183 } else { 4184 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4185 } 4186 } else { 4187 c->inode.size = 0; 4188 c->inode.node_count = 0; 4189 } 4190 c->inode.ibdiagvalid = PETSC_FALSE; 4191 c->inode.ibdiag = 0; 4192 c->inode.bdiag = 0; 4193 PetscFunctionReturn(0); 4194 } 4195 4196 #undef __FUNCT__ 4197 #define __FUNCT__ "MatGetRow_FactoredLU" 4198 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4199 { 4200 PetscInt k; 4201 const PetscInt *vi; 4202 4203 PetscFunctionBegin; 4204 vi = aj + ai[row]; 4205 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4206 vi = aj + adiag[row]; 4207 cols[nzl] = vi[0]; 4208 vi = aj + adiag[row+1]+1; 4209 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4210 PetscFunctionReturn(0); 4211 } 4212 /* 4213 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4214 Modified from MatSeqAIJCheckInode(). 4215 4216 Input Parameters: 4217 . Mat A - ILU or LU matrix factor 4218 4219 */ 4220 #undef __FUNCT__ 4221 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU" 4222 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4223 { 4224 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4225 PetscErrorCode ierr; 4226 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4227 PetscInt *cols1,*cols2,*ns; 4228 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4229 PetscBool flag; 4230 4231 PetscFunctionBegin; 4232 if (!a->inode.use) PetscFunctionReturn(0); 4233 if (a->inode.checked) PetscFunctionReturn(0); 4234 4235 m = A->rmap->n; 4236 if (a->inode.size) ns = a->inode.size; 4237 else { 4238 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4239 } 4240 4241 i = 0; 4242 node_count = 0; 4243 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4244 while (i < m) { /* For each row */ 4245 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4246 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4247 nzx = nzl1 + nzu1 + 1; 4248 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4249 4250 /* Limits the number of elements in a node to 'a->inode.limit' */ 4251 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4252 nzl2 = ai[j+1] - ai[j]; 4253 nzu2 = adiag[j] - adiag[j+1] - 1; 4254 nzy = nzl2 + nzu2 + 1; 4255 if (nzy != nzx) break; 4256 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4257 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4258 if (!flag) break; 4259 } 4260 ns[node_count++] = blk_size; 4261 i = j; 4262 } 4263 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4264 /* If not enough inodes found,, do not use inode version of the routines */ 4265 if (!m || node_count > .8*m) { 4266 ierr = PetscFree(ns);CHKERRQ(ierr); 4267 4268 a->inode.node_count = 0; 4269 a->inode.size = NULL; 4270 a->inode.use = PETSC_FALSE; 4271 4272 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4273 } else { 4274 A->ops->mult = 0; 4275 A->ops->sor = 0; 4276 A->ops->multadd = 0; 4277 A->ops->getrowij = 0; 4278 A->ops->restorerowij = 0; 4279 A->ops->getcolumnij = 0; 4280 A->ops->restorecolumnij = 0; 4281 A->ops->coloringpatch = 0; 4282 A->ops->multdiagonalblock = 0; 4283 a->inode.node_count = node_count; 4284 a->inode.size = ns; 4285 4286 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4287 } 4288 a->inode.checked = PETSC_TRUE; 4289 PetscFunctionReturn(0); 4290 } 4291 4292 #undef __FUNCT__ 4293 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4294 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4295 { 4296 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4297 4298 PetscFunctionBegin; 4299 a->inode.ibdiagvalid = PETSC_FALSE; 4300 PetscFunctionReturn(0); 4301 } 4302 4303 /* 4304 This is really ugly. if inodes are used this replaces the 4305 permutations with ones that correspond to rows/cols of the matrix 4306 rather then inode blocks 4307 */ 4308 #undef __FUNCT__ 4309 #define __FUNCT__ "MatInodeAdjustForInodes" 4310 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4311 { 4312 PetscErrorCode ierr; 4313 4314 PetscFunctionBegin; 4315 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4316 PetscFunctionReturn(0); 4317 } 4318 4319 #undef __FUNCT__ 4320 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4321 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4322 { 4323 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4324 PetscErrorCode ierr; 4325 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4326 const PetscInt *ridx,*cidx; 4327 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4328 PetscInt nslim_col,*ns_col; 4329 IS ris = *rperm,cis = *cperm; 4330 4331 PetscFunctionBegin; 4332 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4333 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4334 4335 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4336 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4337 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4338 4339 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4340 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4341 4342 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4343 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4344 4345 /* Construct the permutations for rows*/ 4346 for (i=0,row = 0; i<nslim_row; ++i) { 4347 indx = ridx[i]; 4348 start_val = tns[indx]; 4349 end_val = tns[indx + 1]; 4350 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4351 } 4352 4353 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4354 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4355 4356 /* Construct permutations for columns */ 4357 for (i=0,col=0; i<nslim_col; ++i) { 4358 indx = cidx[i]; 4359 start_val = tns[indx]; 4360 end_val = tns[indx + 1]; 4361 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4362 } 4363 4364 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4365 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4366 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4367 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4368 4369 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4370 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4371 4372 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4373 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4374 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4375 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4376 ierr = PetscFree(tns);CHKERRQ(ierr); 4377 PetscFunctionReturn(0); 4378 } 4379 4380 #undef __FUNCT__ 4381 #define __FUNCT__ "MatInodeGetInodeSizes" 4382 /*@C 4383 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4384 4385 Not Collective 4386 4387 Input Parameter: 4388 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4389 4390 Output Parameter: 4391 + node_count - no of inodes present in the matrix. 4392 . sizes - an array of size node_count,with sizes of each inode. 4393 - limit - the max size used to generate the inodes. 4394 4395 Level: advanced 4396 4397 Notes: This routine returns some internal storage information 4398 of the matrix, it is intended to be used by advanced users. 4399 It should be called after the matrix is assembled. 4400 The contents of the sizes[] array should not be changed. 4401 NULL may be passed for information not requested. 4402 4403 .keywords: matrix, seqaij, get, inode 4404 4405 .seealso: MatGetInfo() 4406 @*/ 4407 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4408 { 4409 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4410 4411 PetscFunctionBegin; 4412 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4413 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4414 if (f) { 4415 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4416 } 4417 PetscFunctionReturn(0); 4418 } 4419 4420 #undef __FUNCT__ 4421 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4422 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4423 { 4424 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4425 4426 PetscFunctionBegin; 4427 if (node_count) *node_count = a->inode.node_count; 4428 if (sizes) *sizes = a->inode.size; 4429 if (limit) *limit = a->inode.limit; 4430 PetscFunctionReturn(0); 4431 } 4432