1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 67 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1) { 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,nsz,i1,i2; 157 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 158 159 PetscFunctionBegin; 160 nslim_row = a->inode.node_count; 161 n = A->cmap->n; 162 163 /* Create The column_inode for this matrix */ 164 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 165 166 /* allocate space for reformated column_inode structure */ 167 ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr); 168 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 169 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 170 171 for (i1=0,col=0; i1<nslim_col; ++i1) { 172 nsz = ns_col[i1]; 173 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 PetscFunctionReturn(0); 264 } 265 266 /* ----------------------------------------------------------- */ 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscErrorCode ierr; 274 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 275 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 276 277 PetscFunctionBegin; 278 nslim_row = a->inode.node_count; 279 n = A->cmap->n; 280 281 /* Create The column_inode for this matrix */ 282 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 283 284 /* allocate space for reformated column_inode structure */ 285 ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr); 286 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 287 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 288 289 for (i1=0,col=0; i1<nslim_col; ++i1) { 290 nsz = ns_col[i1]; 291 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 292 } 293 /* allocate space for column pointers */ 294 ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 295 *iia = ia; 296 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 297 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 298 299 /* determine the number of columns in each row */ 300 ia[0] = oshift; 301 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 302 j = aj + ai[row] + ishift; 303 col = *j++ + ishift; 304 i2 = tvc[col]; 305 nz = ai[row+1] - ai[row]; 306 while (nz-- > 0) { /* off-diagonal elemets */ 307 /* ia[i1+1]++; */ 308 ia[i2+1]++; 309 i2++; 310 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 311 if (nz > 0) i2 = tvc[col]; 312 } 313 } 314 315 /* shift ia[i] to point to next col */ 316 for (i1=1; i1<nslim_col+1; i1++) { 317 col = ia[i1-1]; 318 ia[i1] += col; 319 work[i1-1] = col - oshift; 320 } 321 322 /* allocate space for column pointers */ 323 nz = ia[nslim_col] + (!ishift); 324 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 325 *jja = ja; 326 327 /* loop over matrix putting into ja */ 328 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 329 j = aj + ai[row] + ishift; 330 col = *j++ + ishift; 331 i2 = tvc[col]; 332 nz = ai[row+1] - ai[row]; 333 while (nz-- > 0) { 334 /* ja[work[i1]++] = i2 + oshift; */ 335 ja[work[i2]++] = i1 + oshift; 336 i2++; 337 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 338 if (nz > 0) i2 = tvc[col]; 339 } 340 } 341 ierr = PetscFree(ns_col);CHKERRQ(ierr); 342 ierr = PetscFree(work);CHKERRQ(ierr); 343 ierr = PetscFree(tns);CHKERRQ(ierr); 344 ierr = PetscFree(tvc);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 356 if (!ia) PetscFunctionReturn(0); 357 358 if (!blockcompressed) { 359 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 360 } else if (symmetric) { 361 /* Since the indices are symmetric it does'nt matter */ 362 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } else { 364 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 365 } 366 PetscFunctionReturn(0); 367 } 368 369 #undef __FUNCT__ 370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 372 { 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 if (!ia) PetscFunctionReturn(0); 377 if (!blockcompressed) { 378 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 379 } else { 380 ierr = PetscFree(*ia);CHKERRQ(ierr); 381 ierr = PetscFree(*ja);CHKERRQ(ierr); 382 } 383 PetscFunctionReturn(0); 384 } 385 386 /* ----------------------------------------------------------- */ 387 388 #undef __FUNCT__ 389 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 391 { 392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 393 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 394 PetscScalar *y; 395 const PetscScalar *x; 396 const MatScalar *v1,*v2,*v3,*v4,*v5; 397 PetscErrorCode ierr; 398 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 399 const PetscInt *idx,*ns,*ii; 400 401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 403 #endif 404 405 PetscFunctionBegin; 406 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 407 node_max = a->inode.node_count; 408 ns = a->inode.size; /* Node Size array */ 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 411 idx = a->j; 412 v1 = a->a; 413 ii = a->i; 414 415 for (i = 0,row = 0; i< node_max; ++i) { 416 nsz = ns[i]; 417 n = ii[1] - ii[0]; 418 nonzerorow += (n>0)*nsz; 419 ii += nsz; 420 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 421 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 422 sz = n; /* No of non zeros in this row */ 423 /* Switch on the size of Node */ 424 switch (nsz) { /* Each loop in 'case' is unrolled */ 425 case 1: 426 sum1 = 0.; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; /* The instructions are ordered to */ 430 i2 = idx[1]; /* make the compiler's job easy */ 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 } 436 437 if (n == sz-1) { /* Take care of the last nonzero */ 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ *tmp0; 440 } 441 y[row++]=sum1; 442 break; 443 case 2: 444 sum1 = 0.; 445 sum2 = 0.; 446 v2 = v1 + n; 447 448 for (n = 0; n< sz-1; n+=2) { 449 i1 = idx[0]; 450 i2 = idx[1]; 451 idx += 2; 452 tmp0 = x[i1]; 453 tmp1 = x[i2]; 454 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 455 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 456 } 457 if (n == sz-1) { 458 tmp0 = x[*idx++]; 459 sum1 += *v1++ * tmp0; 460 sum2 += *v2++ * tmp0; 461 } 462 y[row++]=sum1; 463 y[row++]=sum2; 464 v1 =v2; /* Since the next block to be processed starts there*/ 465 idx +=sz; 466 break; 467 case 3: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 v2 = v1 + n; 472 v3 = v2 + n; 473 474 for (n = 0; n< sz-1; n+=2) { 475 i1 = idx[0]; 476 i2 = idx[1]; 477 idx += 2; 478 tmp0 = x[i1]; 479 tmp1 = x[i2]; 480 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 } 490 y[row++]=sum1; 491 y[row++]=sum2; 492 y[row++]=sum3; 493 v1 =v3; /* Since the next block to be processed starts there*/ 494 idx +=2*sz; 495 break; 496 case 4: 497 sum1 = 0.; 498 sum2 = 0.; 499 sum3 = 0.; 500 sum4 = 0.; 501 v2 = v1 + n; 502 v3 = v2 + n; 503 v4 = v3 + n; 504 505 for (n = 0; n< sz-1; n+=2) { 506 i1 = idx[0]; 507 i2 = idx[1]; 508 idx += 2; 509 tmp0 = x[i1]; 510 tmp1 = x[i2]; 511 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 513 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 514 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 515 } 516 if (n == sz-1) { 517 tmp0 = x[*idx++]; 518 sum1 += *v1++ * tmp0; 519 sum2 += *v2++ * tmp0; 520 sum3 += *v3++ * tmp0; 521 sum4 += *v4++ * tmp0; 522 } 523 y[row++]=sum1; 524 y[row++]=sum2; 525 y[row++]=sum3; 526 y[row++]=sum4; 527 v1 =v4; /* Since the next block to be processed starts there*/ 528 idx +=3*sz; 529 break; 530 case 5: 531 sum1 = 0.; 532 sum2 = 0.; 533 sum3 = 0.; 534 sum4 = 0.; 535 sum5 = 0.; 536 v2 = v1 + n; 537 v3 = v2 + n; 538 v4 = v3 + n; 539 v5 = v4 + n; 540 541 for (n = 0; n<sz-1; n+=2) { 542 i1 = idx[0]; 543 i2 = idx[1]; 544 idx += 2; 545 tmp0 = x[i1]; 546 tmp1 = x[i2]; 547 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 548 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 549 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 550 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 551 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 552 } 553 if (n == sz-1) { 554 tmp0 = x[*idx++]; 555 sum1 += *v1++ * tmp0; 556 sum2 += *v2++ * tmp0; 557 sum3 += *v3++ * tmp0; 558 sum4 += *v4++ * tmp0; 559 sum5 += *v5++ * tmp0; 560 } 561 y[row++]=sum1; 562 y[row++]=sum2; 563 y[row++]=sum3; 564 y[row++]=sum4; 565 y[row++]=sum5; 566 v1 =v5; /* Since the next block to be processed starts there */ 567 idx +=4*sz; 568 break; 569 default: 570 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 571 } 572 } 573 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 575 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 576 PetscFunctionReturn(0); 577 } 578 /* ----------------------------------------------------------- */ 579 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 583 { 584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 585 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 586 const MatScalar *v1,*v2,*v3,*v4,*v5; 587 const PetscScalar *x; 588 PetscScalar *y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 591 const PetscInt *idx,*ns,*ii; 592 593 PetscFunctionBegin; 594 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 595 node_max = a->inode.node_count; 596 ns = a->inode.size; /* Node Size array */ 597 598 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 600 zt = z; 601 602 idx = a->j; 603 v1 = a->a; 604 ii = a->i; 605 606 for (i = 0,row = 0; i< node_max; ++i) { 607 nsz = ns[i]; 608 n = ii[1] - ii[0]; 609 ii += nsz; 610 sz = n; /* No of non zeros in this row */ 611 /* Switch on the size of Node */ 612 switch (nsz) { /* Each loop in 'case' is unrolled */ 613 case 1: 614 sum1 = *zt++; 615 616 for (n = 0; n< sz-1; n+=2) { 617 i1 = idx[0]; /* The instructions are ordered to */ 618 i2 = idx[1]; /* make the compiler's job easy */ 619 idx += 2; 620 tmp0 = x[i1]; 621 tmp1 = x[i2]; 622 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 623 } 624 625 if (n == sz-1) { /* Take care of the last nonzero */ 626 tmp0 = x[*idx++]; 627 sum1 += *v1++ * tmp0; 628 } 629 y[row++]=sum1; 630 break; 631 case 2: 632 sum1 = *zt++; 633 sum2 = *zt++; 634 v2 = v1 + n; 635 636 for (n = 0; n< sz-1; n+=2) { 637 i1 = idx[0]; 638 i2 = idx[1]; 639 idx += 2; 640 tmp0 = x[i1]; 641 tmp1 = x[i2]; 642 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 643 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 644 } 645 if (n == sz-1) { 646 tmp0 = x[*idx++]; 647 sum1 += *v1++ * tmp0; 648 sum2 += *v2++ * tmp0; 649 } 650 y[row++]=sum1; 651 y[row++]=sum2; 652 v1 =v2; /* Since the next block to be processed starts there*/ 653 idx +=sz; 654 break; 655 case 3: 656 sum1 = *zt++; 657 sum2 = *zt++; 658 sum3 = *zt++; 659 v2 = v1 + n; 660 v3 = v2 + n; 661 662 for (n = 0; n< sz-1; n+=2) { 663 i1 = idx[0]; 664 i2 = idx[1]; 665 idx += 2; 666 tmp0 = x[i1]; 667 tmp1 = x[i2]; 668 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 669 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 670 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 671 } 672 if (n == sz-1) { 673 tmp0 = x[*idx++]; 674 sum1 += *v1++ * tmp0; 675 sum2 += *v2++ * tmp0; 676 sum3 += *v3++ * tmp0; 677 } 678 y[row++]=sum1; 679 y[row++]=sum2; 680 y[row++]=sum3; 681 v1 =v3; /* Since the next block to be processed starts there*/ 682 idx +=2*sz; 683 break; 684 case 4: 685 sum1 = *zt++; 686 sum2 = *zt++; 687 sum3 = *zt++; 688 sum4 = *zt++; 689 v2 = v1 + n; 690 v3 = v2 + n; 691 v4 = v3 + n; 692 693 for (n = 0; n< sz-1; n+=2) { 694 i1 = idx[0]; 695 i2 = idx[1]; 696 idx += 2; 697 tmp0 = x[i1]; 698 tmp1 = x[i2]; 699 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 700 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 701 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 702 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 703 } 704 if (n == sz-1) { 705 tmp0 = x[*idx++]; 706 sum1 += *v1++ * tmp0; 707 sum2 += *v2++ * tmp0; 708 sum3 += *v3++ * tmp0; 709 sum4 += *v4++ * tmp0; 710 } 711 y[row++]=sum1; 712 y[row++]=sum2; 713 y[row++]=sum3; 714 y[row++]=sum4; 715 v1 =v4; /* Since the next block to be processed starts there*/ 716 idx +=3*sz; 717 break; 718 case 5: 719 sum1 = *zt++; 720 sum2 = *zt++; 721 sum3 = *zt++; 722 sum4 = *zt++; 723 sum5 = *zt++; 724 v2 = v1 + n; 725 v3 = v2 + n; 726 v4 = v3 + n; 727 v5 = v4 + n; 728 729 for (n = 0; n<sz-1; n+=2) { 730 i1 = idx[0]; 731 i2 = idx[1]; 732 idx += 2; 733 tmp0 = x[i1]; 734 tmp1 = x[i2]; 735 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 736 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 737 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 738 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 739 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 740 } 741 if (n == sz-1) { 742 tmp0 = x[*idx++]; 743 sum1 += *v1++ * tmp0; 744 sum2 += *v2++ * tmp0; 745 sum3 += *v3++ * tmp0; 746 sum4 += *v4++ * tmp0; 747 sum5 += *v5++ * tmp0; 748 } 749 y[row++]=sum1; 750 y[row++]=sum2; 751 y[row++]=sum3; 752 y[row++]=sum4; 753 y[row++]=sum5; 754 v1 =v5; /* Since the next block to be processed starts there */ 755 idx +=4*sz; 756 break; 757 default: 758 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 759 } 760 } 761 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 762 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 763 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 764 PetscFunctionReturn(0); 765 } 766 767 /* ----------------------------------------------------------- */ 768 #undef __FUNCT__ 769 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 770 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 771 { 772 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 773 IS iscol = a->col,isrow = a->row; 774 PetscErrorCode ierr; 775 const PetscInt *r,*c,*rout,*cout; 776 PetscInt i,j,n = A->rmap->n,nz; 777 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 778 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 779 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 780 PetscScalar sum1,sum2,sum3,sum4,sum5; 781 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 782 const PetscScalar *b; 783 784 PetscFunctionBegin; 785 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 786 node_max = a->inode.node_count; 787 ns = a->inode.size; /* Node Size array */ 788 789 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 790 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 791 tmp = a->solve_work; 792 793 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 794 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 795 796 /* forward solve the lower triangular */ 797 tmps = tmp; 798 aa = a_a; 799 aj = a_j; 800 ad = a->diag; 801 802 for (i = 0,row = 0; i< node_max; ++i) { 803 nsz = ns[i]; 804 aii = ai[row]; 805 v1 = aa + aii; 806 vi = aj + aii; 807 nz = ad[row]- aii; 808 if (i < node_max-1) { 809 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 810 * but our indexing to determine it's size could. */ 811 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 812 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 813 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 814 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 815 } 816 817 switch (nsz) { /* Each loop in 'case' is unrolled */ 818 case 1: 819 sum1 = b[*r++]; 820 for (j=0; j<nz-1; j+=2) { 821 i0 = vi[0]; 822 i1 = vi[1]; 823 vi +=2; 824 tmp0 = tmps[i0]; 825 tmp1 = tmps[i1]; 826 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 827 } 828 if (j == nz-1) { 829 tmp0 = tmps[*vi++]; 830 sum1 -= *v1++ *tmp0; 831 } 832 tmp[row++]=sum1; 833 break; 834 case 2: 835 sum1 = b[*r++]; 836 sum2 = b[*r++]; 837 v2 = aa + ai[row+1]; 838 839 for (j=0; j<nz-1; j+=2) { 840 i0 = vi[0]; 841 i1 = vi[1]; 842 vi +=2; 843 tmp0 = tmps[i0]; 844 tmp1 = tmps[i1]; 845 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 846 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 847 } 848 if (j == nz-1) { 849 tmp0 = tmps[*vi++]; 850 sum1 -= *v1++ *tmp0; 851 sum2 -= *v2++ *tmp0; 852 } 853 sum2 -= *v2++ *sum1; 854 tmp[row++]=sum1; 855 tmp[row++]=sum2; 856 break; 857 case 3: 858 sum1 = b[*r++]; 859 sum2 = b[*r++]; 860 sum3 = b[*r++]; 861 v2 = aa + ai[row+1]; 862 v3 = aa + ai[row+2]; 863 864 for (j=0; j<nz-1; j+=2) { 865 i0 = vi[0]; 866 i1 = vi[1]; 867 vi +=2; 868 tmp0 = tmps[i0]; 869 tmp1 = tmps[i1]; 870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 873 } 874 if (j == nz-1) { 875 tmp0 = tmps[*vi++]; 876 sum1 -= *v1++ *tmp0; 877 sum2 -= *v2++ *tmp0; 878 sum3 -= *v3++ *tmp0; 879 } 880 sum2 -= *v2++ * sum1; 881 sum3 -= *v3++ * sum1; 882 sum3 -= *v3++ * sum2; 883 884 tmp[row++]=sum1; 885 tmp[row++]=sum2; 886 tmp[row++]=sum3; 887 break; 888 889 case 4: 890 sum1 = b[*r++]; 891 sum2 = b[*r++]; 892 sum3 = b[*r++]; 893 sum4 = b[*r++]; 894 v2 = aa + ai[row+1]; 895 v3 = aa + ai[row+2]; 896 v4 = aa + ai[row+3]; 897 898 for (j=0; j<nz-1; j+=2) { 899 i0 = vi[0]; 900 i1 = vi[1]; 901 vi +=2; 902 tmp0 = tmps[i0]; 903 tmp1 = tmps[i1]; 904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 908 } 909 if (j == nz-1) { 910 tmp0 = tmps[*vi++]; 911 sum1 -= *v1++ *tmp0; 912 sum2 -= *v2++ *tmp0; 913 sum3 -= *v3++ *tmp0; 914 sum4 -= *v4++ *tmp0; 915 } 916 sum2 -= *v2++ * sum1; 917 sum3 -= *v3++ * sum1; 918 sum4 -= *v4++ * sum1; 919 sum3 -= *v3++ * sum2; 920 sum4 -= *v4++ * sum2; 921 sum4 -= *v4++ * sum3; 922 923 tmp[row++]=sum1; 924 tmp[row++]=sum2; 925 tmp[row++]=sum3; 926 tmp[row++]=sum4; 927 break; 928 case 5: 929 sum1 = b[*r++]; 930 sum2 = b[*r++]; 931 sum3 = b[*r++]; 932 sum4 = b[*r++]; 933 sum5 = b[*r++]; 934 v2 = aa + ai[row+1]; 935 v3 = aa + ai[row+2]; 936 v4 = aa + ai[row+3]; 937 v5 = aa + ai[row+4]; 938 939 for (j=0; j<nz-1; j+=2) { 940 i0 = vi[0]; 941 i1 = vi[1]; 942 vi +=2; 943 tmp0 = tmps[i0]; 944 tmp1 = tmps[i1]; 945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 950 } 951 if (j == nz-1) { 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ *tmp0; 954 sum2 -= *v2++ *tmp0; 955 sum3 -= *v3++ *tmp0; 956 sum4 -= *v4++ *tmp0; 957 sum5 -= *v5++ *tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row++]=sum1; 972 tmp[row++]=sum2; 973 tmp[row++]=sum3; 974 tmp[row++]=sum4; 975 tmp[row++]=sum5; 976 break; 977 default: 978 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i=node_max -1,row = n-1; i>=0; i--) { 983 nsz = ns[i]; 984 aii = ai[row+1] -1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii- ad[row]; 988 switch (nsz) { /* Each loop in 'case' is unrolled */ 989 case 1: 990 sum1 = tmp[row]; 991 992 for (j=nz; j>1; j-=2) { 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j==1) { 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1006 break; 1007 case 2: 1008 sum1 = tmp[row]; 1009 sum2 = tmp[row -1]; 1010 v2 = aa + ai[row]-1; 1011 for (j=nz; j>1; j-=2) { 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1020 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1021 } 1022 if (j==1) { 1023 tmp0 = tmps[*vi--]; 1024 sum1 -= *v1-- * tmp0; 1025 sum2 -= *v2-- * tmp0; 1026 } 1027 1028 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1031 break; 1032 case 3: 1033 sum1 = tmp[row]; 1034 sum2 = tmp[row -1]; 1035 sum3 = tmp[row -2]; 1036 v2 = aa + ai[row]-1; 1037 v3 = aa + ai[row -1]-1; 1038 for (j=nz; j>1; j-=2) { 1039 vi -=2; 1040 i0 = vi[2]; 1041 i1 = vi[1]; 1042 tmp0 = tmps[i0]; 1043 tmp1 = tmps[i1]; 1044 v1 -= 2; 1045 v2 -= 2; 1046 v3 -= 2; 1047 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1048 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1049 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1050 } 1051 if (j==1) { 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 } 1057 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1061 sum3 -= *v3-- * tmp0; 1062 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1063 1064 break; 1065 case 4: 1066 sum1 = tmp[row]; 1067 sum2 = tmp[row -1]; 1068 sum3 = tmp[row -2]; 1069 sum4 = tmp[row -3]; 1070 v2 = aa + ai[row]-1; 1071 v3 = aa + ai[row -1]-1; 1072 v4 = aa + ai[row -2]-1; 1073 1074 for (j=nz; j>1; j-=2) { 1075 vi -=2; 1076 i0 = vi[2]; 1077 i1 = vi[1]; 1078 tmp0 = tmps[i0]; 1079 tmp1 = tmps[i1]; 1080 v1 -= 2; 1081 v2 -= 2; 1082 v3 -= 2; 1083 v4 -= 2; 1084 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1085 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1086 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1087 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1088 } 1089 if (j==1) { 1090 tmp0 = tmps[*vi--]; 1091 sum1 -= *v1-- * tmp0; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 } 1096 1097 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1105 sum4 -= *v4-- * tmp0; 1106 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1107 break; 1108 case 5: 1109 sum1 = tmp[row]; 1110 sum2 = tmp[row -1]; 1111 sum3 = tmp[row -2]; 1112 sum4 = tmp[row -3]; 1113 sum5 = tmp[row -4]; 1114 v2 = aa + ai[row]-1; 1115 v3 = aa + ai[row -1]-1; 1116 v4 = aa + ai[row -2]-1; 1117 v5 = aa + ai[row -3]-1; 1118 for (j=nz; j>1; j-=2) { 1119 vi -= 2; 1120 i0 = vi[2]; 1121 i1 = vi[1]; 1122 tmp0 = tmps[i0]; 1123 tmp1 = tmps[i1]; 1124 v1 -= 2; 1125 v2 -= 2; 1126 v3 -= 2; 1127 v4 -= 2; 1128 v5 -= 2; 1129 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1130 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1131 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1132 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1133 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1134 } 1135 if (j==1) { 1136 tmp0 = tmps[*vi--]; 1137 sum1 -= *v1-- * tmp0; 1138 sum2 -= *v2-- * tmp0; 1139 sum3 -= *v3-- * tmp0; 1140 sum4 -= *v4-- * tmp0; 1141 sum5 -= *v5-- * tmp0; 1142 } 1143 1144 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1157 sum5 -= *v5-- * tmp0; 1158 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1159 break; 1160 default: 1161 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1162 } 1163 } 1164 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1165 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1166 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1174 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1175 { 1176 Mat C =B; 1177 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1178 IS isrow = b->row,isicol = b->icol; 1179 PetscErrorCode ierr; 1180 const PetscInt *r,*ic,*ics; 1181 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1182 PetscInt i,j,k,nz,nzL,row,*pj; 1183 const PetscInt *ajtmp,*bjtmp; 1184 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1185 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1186 FactorShiftCtx sctx; 1187 const PetscInt *ddiag; 1188 PetscReal rs; 1189 MatScalar d; 1190 PetscInt inod,nodesz,node_max,col; 1191 const PetscInt *ns; 1192 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1193 1194 PetscFunctionBegin; 1195 /* MatPivotSetUp(): initialize shift context sctx */ 1196 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1197 1198 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1199 ddiag = a->diag; 1200 sctx.shift_top = info->zeropivot; 1201 for (i=0; i<n; i++) { 1202 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1203 d = (aa)[ddiag[i]]; 1204 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1205 v = aa+ai[i]; 1206 nz = ai[i+1] - ai[i]; 1207 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1208 if (rs>sctx.shift_top) sctx.shift_top = rs; 1209 } 1210 sctx.shift_top *= 1.1; 1211 sctx.nshift_max = 5; 1212 sctx.shift_lo = 0.; 1213 sctx.shift_hi = 1.; 1214 } 1215 1216 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1217 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1218 1219 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1220 ics = ic; 1221 1222 node_max = a->inode.node_count; 1223 ns = a->inode.size; 1224 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1225 1226 /* If max inode size > 4, split it into two inodes.*/ 1227 /* also map the inode sizes according to the ordering */ 1228 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1229 for (i=0,j=0; i<node_max; ++i,++j) { 1230 if (ns[i] > 4) { 1231 tmp_vec1[j] = 4; 1232 ++j; 1233 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1234 } else { 1235 tmp_vec1[j] = ns[i]; 1236 } 1237 } 1238 /* Use the correct node_max */ 1239 node_max = j; 1240 1241 /* Now reorder the inode info based on mat re-ordering info */ 1242 /* First create a row -> inode_size_array_index map */ 1243 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1244 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1245 for (i=0,row=0; i<node_max; i++) { 1246 nodesz = tmp_vec1[i]; 1247 for (j=0; j<nodesz; j++,row++) { 1248 nsmap[row] = i; 1249 } 1250 } 1251 /* Using nsmap, create a reordered ns structure */ 1252 for (i=0,j=0; i< node_max; i++) { 1253 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1254 tmp_vec2[i] = nodesz; 1255 j += nodesz; 1256 } 1257 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1258 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1259 1260 /* Now use the correct ns */ 1261 ns = tmp_vec2; 1262 1263 do { 1264 sctx.newshift = PETSC_FALSE; 1265 /* Now loop over each block-row, and do the factorization */ 1266 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1267 nodesz = ns[inod]; 1268 1269 switch (nodesz) { 1270 case 1: 1271 /*----------*/ 1272 /* zero rtmp1 */ 1273 /* L part */ 1274 nz = bi[i+1] - bi[i]; 1275 bjtmp = bj + bi[i]; 1276 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1277 1278 /* U part */ 1279 nz = bdiag[i]-bdiag[i+1]; 1280 bjtmp = bj + bdiag[i+1]+1; 1281 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1282 1283 /* load in initial (unfactored row) */ 1284 nz = ai[r[i]+1] - ai[r[i]]; 1285 ajtmp = aj + ai[r[i]]; 1286 v = aa + ai[r[i]]; 1287 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1288 1289 /* ZeropivotApply() */ 1290 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1291 1292 /* elimination */ 1293 bjtmp = bj + bi[i]; 1294 row = *bjtmp++; 1295 nzL = bi[i+1] - bi[i]; 1296 for (k=0; k < nzL; k++) { 1297 pc = rtmp1 + row; 1298 if (*pc != 0.0) { 1299 pv = b->a + bdiag[row]; 1300 mul1 = *pc * (*pv); 1301 *pc = mul1; 1302 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1303 pv = b->a + bdiag[row+1]+1; 1304 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1305 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1306 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1307 } 1308 row = *bjtmp++; 1309 } 1310 1311 /* finished row so stick it into b->a */ 1312 rs = 0.0; 1313 /* L part */ 1314 pv = b->a + bi[i]; 1315 pj = b->j + bi[i]; 1316 nz = bi[i+1] - bi[i]; 1317 for (j=0; j<nz; j++) { 1318 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1319 } 1320 1321 /* U part */ 1322 pv = b->a + bdiag[i+1]+1; 1323 pj = b->j + bdiag[i+1]+1; 1324 nz = bdiag[i] - bdiag[i+1]-1; 1325 for (j=0; j<nz; j++) { 1326 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1327 } 1328 1329 /* Check zero pivot */ 1330 sctx.rs = rs; 1331 sctx.pv = rtmp1[i]; 1332 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1333 if (sctx.newshift) break; 1334 1335 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1336 pv = b->a + bdiag[i]; 1337 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1338 break; 1339 1340 case 2: 1341 /*----------*/ 1342 /* zero rtmp1 and rtmp2 */ 1343 /* L part */ 1344 nz = bi[i+1] - bi[i]; 1345 bjtmp = bj + bi[i]; 1346 for (j=0; j<nz; j++) { 1347 col = bjtmp[j]; 1348 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1349 } 1350 1351 /* U part */ 1352 nz = bdiag[i]-bdiag[i+1]; 1353 bjtmp = bj + bdiag[i+1]+1; 1354 for (j=0; j<nz; j++) { 1355 col = bjtmp[j]; 1356 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1357 } 1358 1359 /* load in initial (unfactored row) */ 1360 nz = ai[r[i]+1] - ai[r[i]]; 1361 ajtmp = aj + ai[r[i]]; 1362 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1363 for (j=0; j<nz; j++) { 1364 col = ics[ajtmp[j]]; 1365 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1366 } 1367 /* ZeropivotApply(): shift the diagonal of the matrix */ 1368 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1369 1370 /* elimination */ 1371 bjtmp = bj + bi[i]; 1372 row = *bjtmp++; /* pivot row */ 1373 nzL = bi[i+1] - bi[i]; 1374 for (k=0; k < nzL; k++) { 1375 pc1 = rtmp1 + row; 1376 pc2 = rtmp2 + row; 1377 if (*pc1 != 0.0 || *pc2 != 0.0) { 1378 pv = b->a + bdiag[row]; 1379 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1380 *pc1 = mul1; *pc2 = mul2; 1381 1382 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1383 pv = b->a + bdiag[row+1]+1; 1384 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1385 for (j=0; j<nz; j++) { 1386 col = pj[j]; 1387 rtmp1[col] -= mul1 * pv[j]; 1388 rtmp2[col] -= mul2 * pv[j]; 1389 } 1390 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1391 } 1392 row = *bjtmp++; 1393 } 1394 1395 /* finished row i; check zero pivot, then stick row i into b->a */ 1396 rs = 0.0; 1397 /* L part */ 1398 pc1 = b->a + bi[i]; 1399 pj = b->j + bi[i]; 1400 nz = bi[i+1] - bi[i]; 1401 for (j=0; j<nz; j++) { 1402 col = pj[j]; 1403 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1404 } 1405 /* U part */ 1406 pc1 = b->a + bdiag[i+1]+1; 1407 pj = b->j + bdiag[i+1]+1; 1408 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1409 for (j=0; j<nz; j++) { 1410 col = pj[j]; 1411 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1412 } 1413 1414 sctx.rs = rs; 1415 sctx.pv = rtmp1[i]; 1416 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1417 if (sctx.newshift) break; 1418 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1419 *pc1 = 1.0/sctx.pv; 1420 1421 /* Now take care of diagonal 2x2 block. */ 1422 pc2 = rtmp2 + i; 1423 if (*pc2 != 0.0) { 1424 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1425 *pc2 = mul1; /* insert L entry */ 1426 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1427 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1428 for (j=0; j<nz; j++) { 1429 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1430 } 1431 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1432 } 1433 1434 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1435 rs = 0.0; 1436 /* L part */ 1437 pc2 = b->a + bi[i+1]; 1438 pj = b->j + bi[i+1]; 1439 nz = bi[i+2] - bi[i+1]; 1440 for (j=0; j<nz; j++) { 1441 col = pj[j]; 1442 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1443 } 1444 /* U part */ 1445 pc2 = b->a + bdiag[i+2]+1; 1446 pj = b->j + bdiag[i+2]+1; 1447 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1448 for (j=0; j<nz; j++) { 1449 col = pj[j]; 1450 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1451 } 1452 1453 sctx.rs = rs; 1454 sctx.pv = rtmp2[i+1]; 1455 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1456 if (sctx.newshift) break; 1457 pc2 = b->a + bdiag[i+1]; 1458 *pc2 = 1.0/sctx.pv; 1459 break; 1460 1461 case 3: 1462 /*----------*/ 1463 /* zero rtmp */ 1464 /* L part */ 1465 nz = bi[i+1] - bi[i]; 1466 bjtmp = bj + bi[i]; 1467 for (j=0; j<nz; j++) { 1468 col = bjtmp[j]; 1469 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1470 } 1471 1472 /* U part */ 1473 nz = bdiag[i]-bdiag[i+1]; 1474 bjtmp = bj + bdiag[i+1]+1; 1475 for (j=0; j<nz; j++) { 1476 col = bjtmp[j]; 1477 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1478 } 1479 1480 /* load in initial (unfactored row) */ 1481 nz = ai[r[i]+1] - ai[r[i]]; 1482 ajtmp = aj + ai[r[i]]; 1483 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1484 for (j=0; j<nz; j++) { 1485 col = ics[ajtmp[j]]; 1486 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1487 } 1488 /* ZeropivotApply(): shift the diagonal of the matrix */ 1489 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1490 1491 /* elimination */ 1492 bjtmp = bj + bi[i]; 1493 row = *bjtmp++; /* pivot row */ 1494 nzL = bi[i+1] - bi[i]; 1495 for (k=0; k < nzL; k++) { 1496 pc1 = rtmp1 + row; 1497 pc2 = rtmp2 + row; 1498 pc3 = rtmp3 + row; 1499 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1500 pv = b->a + bdiag[row]; 1501 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1502 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1503 1504 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1505 pv = b->a + bdiag[row+1]+1; 1506 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1507 for (j=0; j<nz; j++) { 1508 col = pj[j]; 1509 rtmp1[col] -= mul1 * pv[j]; 1510 rtmp2[col] -= mul2 * pv[j]; 1511 rtmp3[col] -= mul3 * pv[j]; 1512 } 1513 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1514 } 1515 row = *bjtmp++; 1516 } 1517 1518 /* finished row i; check zero pivot, then stick row i into b->a */ 1519 rs = 0.0; 1520 /* L part */ 1521 pc1 = b->a + bi[i]; 1522 pj = b->j + bi[i]; 1523 nz = bi[i+1] - bi[i]; 1524 for (j=0; j<nz; j++) { 1525 col = pj[j]; 1526 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1527 } 1528 /* U part */ 1529 pc1 = b->a + bdiag[i+1]+1; 1530 pj = b->j + bdiag[i+1]+1; 1531 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1532 for (j=0; j<nz; j++) { 1533 col = pj[j]; 1534 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1535 } 1536 1537 sctx.rs = rs; 1538 sctx.pv = rtmp1[i]; 1539 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1540 if (sctx.newshift) break; 1541 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1542 *pc1 = 1.0/sctx.pv; 1543 1544 /* Now take care of 1st column of diagonal 3x3 block. */ 1545 pc2 = rtmp2 + i; 1546 pc3 = rtmp3 + i; 1547 if (*pc2 != 0.0 || *pc3 != 0.0) { 1548 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1549 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1550 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1551 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1552 for (j=0; j<nz; j++) { 1553 col = pj[j]; 1554 rtmp2[col] -= mul2 * rtmp1[col]; 1555 rtmp3[col] -= mul3 * rtmp1[col]; 1556 } 1557 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1558 } 1559 1560 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1561 rs = 0.0; 1562 /* L part */ 1563 pc2 = b->a + bi[i+1]; 1564 pj = b->j + bi[i+1]; 1565 nz = bi[i+2] - bi[i+1]; 1566 for (j=0; j<nz; j++) { 1567 col = pj[j]; 1568 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1569 } 1570 /* U part */ 1571 pc2 = b->a + bdiag[i+2]+1; 1572 pj = b->j + bdiag[i+2]+1; 1573 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1574 for (j=0; j<nz; j++) { 1575 col = pj[j]; 1576 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1577 } 1578 1579 sctx.rs = rs; 1580 sctx.pv = rtmp2[i+1]; 1581 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1582 if (sctx.newshift) break; 1583 pc2 = b->a + bdiag[i+1]; 1584 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1585 1586 /* Now take care of 2nd column of diagonal 3x3 block. */ 1587 pc3 = rtmp3 + i+1; 1588 if (*pc3 != 0.0) { 1589 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1590 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1591 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1592 for (j=0; j<nz; j++) { 1593 col = pj[j]; 1594 rtmp3[col] -= mul3 * rtmp2[col]; 1595 } 1596 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1597 } 1598 1599 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1600 rs = 0.0; 1601 /* L part */ 1602 pc3 = b->a + bi[i+2]; 1603 pj = b->j + bi[i+2]; 1604 nz = bi[i+3] - bi[i+2]; 1605 for (j=0; j<nz; j++) { 1606 col = pj[j]; 1607 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1608 } 1609 /* U part */ 1610 pc3 = b->a + bdiag[i+3]+1; 1611 pj = b->j + bdiag[i+3]+1; 1612 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1613 for (j=0; j<nz; j++) { 1614 col = pj[j]; 1615 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1616 } 1617 1618 sctx.rs = rs; 1619 sctx.pv = rtmp3[i+2]; 1620 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1621 if (sctx.newshift) break; 1622 pc3 = b->a + bdiag[i+2]; 1623 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1624 break; 1625 case 4: 1626 /*----------*/ 1627 /* zero rtmp */ 1628 /* L part */ 1629 nz = bi[i+1] - bi[i]; 1630 bjtmp = bj + bi[i]; 1631 for (j=0; j<nz; j++) { 1632 col = bjtmp[j]; 1633 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1634 } 1635 1636 /* U part */ 1637 nz = bdiag[i]-bdiag[i+1]; 1638 bjtmp = bj + bdiag[i+1]+1; 1639 for (j=0; j<nz; j++) { 1640 col = bjtmp[j]; 1641 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1642 } 1643 1644 /* load in initial (unfactored row) */ 1645 nz = ai[r[i]+1] - ai[r[i]]; 1646 ajtmp = aj + ai[r[i]]; 1647 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1648 for (j=0; j<nz; j++) { 1649 col = ics[ajtmp[j]]; 1650 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1651 } 1652 /* ZeropivotApply(): shift the diagonal of the matrix */ 1653 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1654 1655 /* elimination */ 1656 bjtmp = bj + bi[i]; 1657 row = *bjtmp++; /* pivot row */ 1658 nzL = bi[i+1] - bi[i]; 1659 for (k=0; k < nzL; k++) { 1660 pc1 = rtmp1 + row; 1661 pc2 = rtmp2 + row; 1662 pc3 = rtmp3 + row; 1663 pc4 = rtmp4 + row; 1664 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1665 pv = b->a + bdiag[row]; 1666 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1667 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1668 1669 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1670 pv = b->a + bdiag[row+1]+1; 1671 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1672 for (j=0; j<nz; j++) { 1673 col = pj[j]; 1674 rtmp1[col] -= mul1 * pv[j]; 1675 rtmp2[col] -= mul2 * pv[j]; 1676 rtmp3[col] -= mul3 * pv[j]; 1677 rtmp4[col] -= mul4 * pv[j]; 1678 } 1679 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1680 } 1681 row = *bjtmp++; 1682 } 1683 1684 /* finished row i; check zero pivot, then stick row i into b->a */ 1685 rs = 0.0; 1686 /* L part */ 1687 pc1 = b->a + bi[i]; 1688 pj = b->j + bi[i]; 1689 nz = bi[i+1] - bi[i]; 1690 for (j=0; j<nz; j++) { 1691 col = pj[j]; 1692 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1693 } 1694 /* U part */ 1695 pc1 = b->a + bdiag[i+1]+1; 1696 pj = b->j + bdiag[i+1]+1; 1697 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1698 for (j=0; j<nz; j++) { 1699 col = pj[j]; 1700 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1701 } 1702 1703 sctx.rs = rs; 1704 sctx.pv = rtmp1[i]; 1705 ierr = MatPivotCheck(B,A,info,&sctx,i);CHKERRQ(ierr); 1706 if (sctx.newshift) break; 1707 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1708 *pc1 = 1.0/sctx.pv; 1709 1710 /* Now take care of 1st column of diagonal 4x4 block. */ 1711 pc2 = rtmp2 + i; 1712 pc3 = rtmp3 + i; 1713 pc4 = rtmp4 + i; 1714 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1715 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1716 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1717 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1718 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1719 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1720 for (j=0; j<nz; j++) { 1721 col = pj[j]; 1722 rtmp2[col] -= mul2 * rtmp1[col]; 1723 rtmp3[col] -= mul3 * rtmp1[col]; 1724 rtmp4[col] -= mul4 * rtmp1[col]; 1725 } 1726 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1727 } 1728 1729 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1730 rs = 0.0; 1731 /* L part */ 1732 pc2 = b->a + bi[i+1]; 1733 pj = b->j + bi[i+1]; 1734 nz = bi[i+2] - bi[i+1]; 1735 for (j=0; j<nz; j++) { 1736 col = pj[j]; 1737 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1738 } 1739 /* U part */ 1740 pc2 = b->a + bdiag[i+2]+1; 1741 pj = b->j + bdiag[i+2]+1; 1742 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1743 for (j=0; j<nz; j++) { 1744 col = pj[j]; 1745 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1746 } 1747 1748 sctx.rs = rs; 1749 sctx.pv = rtmp2[i+1]; 1750 ierr = MatPivotCheck(B,A,info,&sctx,i+1);CHKERRQ(ierr); 1751 if (sctx.newshift) break; 1752 pc2 = b->a + bdiag[i+1]; 1753 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1754 1755 /* Now take care of 2nd column of diagonal 4x4 block. */ 1756 pc3 = rtmp3 + i+1; 1757 pc4 = rtmp4 + i+1; 1758 if (*pc3 != 0.0 || *pc4 != 0.0) { 1759 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1760 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1761 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1762 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1763 for (j=0; j<nz; j++) { 1764 col = pj[j]; 1765 rtmp3[col] -= mul3 * rtmp2[col]; 1766 rtmp4[col] -= mul4 * rtmp2[col]; 1767 } 1768 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1769 } 1770 1771 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1772 rs = 0.0; 1773 /* L part */ 1774 pc3 = b->a + bi[i+2]; 1775 pj = b->j + bi[i+2]; 1776 nz = bi[i+3] - bi[i+2]; 1777 for (j=0; j<nz; j++) { 1778 col = pj[j]; 1779 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1780 } 1781 /* U part */ 1782 pc3 = b->a + bdiag[i+3]+1; 1783 pj = b->j + bdiag[i+3]+1; 1784 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1785 for (j=0; j<nz; j++) { 1786 col = pj[j]; 1787 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1788 } 1789 1790 sctx.rs = rs; 1791 sctx.pv = rtmp3[i+2]; 1792 ierr = MatPivotCheck(B,A,info,&sctx,i+2);CHKERRQ(ierr); 1793 if (sctx.newshift) break; 1794 pc3 = b->a + bdiag[i+2]; 1795 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1796 1797 /* Now take care of 3rd column of diagonal 4x4 block. */ 1798 pc4 = rtmp4 + i+2; 1799 if (*pc4 != 0.0) { 1800 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1801 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1802 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1803 for (j=0; j<nz; j++) { 1804 col = pj[j]; 1805 rtmp4[col] -= mul4 * rtmp3[col]; 1806 } 1807 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1808 } 1809 1810 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1811 rs = 0.0; 1812 /* L part */ 1813 pc4 = b->a + bi[i+3]; 1814 pj = b->j + bi[i+3]; 1815 nz = bi[i+4] - bi[i+3]; 1816 for (j=0; j<nz; j++) { 1817 col = pj[j]; 1818 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1819 } 1820 /* U part */ 1821 pc4 = b->a + bdiag[i+4]+1; 1822 pj = b->j + bdiag[i+4]+1; 1823 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1824 for (j=0; j<nz; j++) { 1825 col = pj[j]; 1826 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1827 } 1828 1829 sctx.rs = rs; 1830 sctx.pv = rtmp4[i+3]; 1831 ierr = MatPivotCheck(B,A,info,&sctx,i+3);CHKERRQ(ierr); 1832 if (sctx.newshift) break; 1833 pc4 = b->a + bdiag[i+3]; 1834 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1835 break; 1836 1837 default: 1838 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1839 } 1840 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1841 i += nodesz; /* Update the row */ 1842 } 1843 1844 /* MatPivotRefine() */ 1845 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1846 /* 1847 * if no shift in this attempt & shifting & started shifting & can refine, 1848 * then try lower shift 1849 */ 1850 sctx.shift_hi = sctx.shift_fraction; 1851 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1852 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1853 sctx.newshift = PETSC_TRUE; 1854 sctx.nshift++; 1855 } 1856 } while (sctx.newshift); 1857 1858 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1859 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1860 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1861 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1862 1863 if (b->inode.size) { 1864 C->ops->solve = MatSolve_SeqAIJ_Inode; 1865 } else { 1866 C->ops->solve = MatSolve_SeqAIJ; 1867 } 1868 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1869 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1870 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1871 C->ops->matsolve = MatMatSolve_SeqAIJ; 1872 C->assembled = PETSC_TRUE; 1873 C->preallocated = PETSC_TRUE; 1874 1875 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1876 1877 /* MatShiftView(A,info,&sctx) */ 1878 if (sctx.nshift) { 1879 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1880 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1881 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1882 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1883 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1884 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1885 } 1886 } 1887 PetscFunctionReturn(0); 1888 } 1889 1890 #undef __FUNCT__ 1891 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1892 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1893 { 1894 Mat C = B; 1895 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1896 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1897 PetscErrorCode ierr; 1898 const PetscInt *r,*ic,*c,*ics; 1899 PetscInt n = A->rmap->n,*bi = b->i; 1900 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1901 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1902 PetscInt *ai = a->i,*aj = a->j; 1903 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1904 PetscScalar mul1,mul2,mul3,tmp; 1905 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1906 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1907 PetscReal rs=0.0; 1908 FactorShiftCtx sctx; 1909 1910 PetscFunctionBegin; 1911 sctx.shift_top = 0; 1912 sctx.nshift_max = 0; 1913 sctx.shift_lo = 0; 1914 sctx.shift_hi = 0; 1915 sctx.shift_fraction = 0; 1916 1917 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1918 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1919 sctx.shift_top = 0; 1920 for (i=0; i<n; i++) { 1921 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1922 rs = 0.0; 1923 ajtmp = aj + ai[i]; 1924 rtmp1 = aa + ai[i]; 1925 nz = ai[i+1] - ai[i]; 1926 for (j=0; j<nz; j++) { 1927 if (*ajtmp != i) { 1928 rs += PetscAbsScalar(*rtmp1++); 1929 } else { 1930 rs -= PetscRealPart(*rtmp1++); 1931 } 1932 ajtmp++; 1933 } 1934 if (rs>sctx.shift_top) sctx.shift_top = rs; 1935 } 1936 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1937 sctx.shift_top *= 1.1; 1938 sctx.nshift_max = 5; 1939 sctx.shift_lo = 0.; 1940 sctx.shift_hi = 1.; 1941 } 1942 sctx.shift_amount = 0; 1943 sctx.nshift = 0; 1944 1945 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1946 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1947 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1948 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1949 ics = ic; 1950 1951 node_max = a->inode.node_count; 1952 ns = a->inode.size; 1953 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1954 1955 /* If max inode size > 3, split it into two inodes.*/ 1956 /* also map the inode sizes according to the ordering */ 1957 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1958 for (i=0,j=0; i<node_max; ++i,++j) { 1959 if (ns[i]>3) { 1960 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1961 ++j; 1962 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1963 } else { 1964 tmp_vec1[j] = ns[i]; 1965 } 1966 } 1967 /* Use the correct node_max */ 1968 node_max = j; 1969 1970 /* Now reorder the inode info based on mat re-ordering info */ 1971 /* First create a row -> inode_size_array_index map */ 1972 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1973 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1974 for (i=0,row=0; i<node_max; i++) { 1975 nodesz = tmp_vec1[i]; 1976 for (j=0; j<nodesz; j++,row++) { 1977 nsmap[row] = i; 1978 } 1979 } 1980 /* Using nsmap, create a reordered ns structure */ 1981 for (i=0,j=0; i< node_max; i++) { 1982 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1983 tmp_vec2[i] = nodesz; 1984 j += nodesz; 1985 } 1986 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1987 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1988 /* Now use the correct ns */ 1989 ns = tmp_vec2; 1990 1991 do { 1992 sctx.newshift = PETSC_FALSE; 1993 /* Now loop over each block-row, and do the factorization */ 1994 for (i=0,row=0; i<node_max; i++) { 1995 nodesz = ns[i]; 1996 nz = bi[row+1] - bi[row]; 1997 bjtmp = bj + bi[row]; 1998 1999 switch (nodesz) { 2000 case 1: 2001 for (j=0; j<nz; j++) { 2002 idx = bjtmp[j]; 2003 rtmp11[idx] = 0.0; 2004 } 2005 2006 /* load in initial (unfactored row) */ 2007 idx = r[row]; 2008 nz_tmp = ai[idx+1] - ai[idx]; 2009 ajtmp = aj + ai[idx]; 2010 v1 = aa + ai[idx]; 2011 2012 for (j=0; j<nz_tmp; j++) { 2013 idx = ics[ajtmp[j]]; 2014 rtmp11[idx] = v1[j]; 2015 } 2016 rtmp11[ics[r[row]]] += sctx.shift_amount; 2017 2018 prow = *bjtmp++; 2019 while (prow < row) { 2020 pc1 = rtmp11 + prow; 2021 if (*pc1 != 0.0) { 2022 pv = ba + bd[prow]; 2023 pj = nbj + bd[prow]; 2024 mul1 = *pc1 * *pv++; 2025 *pc1 = mul1; 2026 nz_tmp = bi[prow+1] - bd[prow] - 1; 2027 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2028 for (j=0; j<nz_tmp; j++) { 2029 tmp = pv[j]; 2030 idx = pj[j]; 2031 rtmp11[idx] -= mul1 * tmp; 2032 } 2033 } 2034 prow = *bjtmp++; 2035 } 2036 pj = bj + bi[row]; 2037 pc1 = ba + bi[row]; 2038 2039 sctx.pv = rtmp11[row]; 2040 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2041 rs = 0.0; 2042 for (j=0; j<nz; j++) { 2043 idx = pj[j]; 2044 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2045 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2046 } 2047 sctx.rs = rs; 2048 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2049 if (sctx.newshift) goto endofwhile; 2050 break; 2051 2052 case 2: 2053 for (j=0; j<nz; j++) { 2054 idx = bjtmp[j]; 2055 rtmp11[idx] = 0.0; 2056 rtmp22[idx] = 0.0; 2057 } 2058 2059 /* load in initial (unfactored row) */ 2060 idx = r[row]; 2061 nz_tmp = ai[idx+1] - ai[idx]; 2062 ajtmp = aj + ai[idx]; 2063 v1 = aa + ai[idx]; 2064 v2 = aa + ai[idx+1]; 2065 for (j=0; j<nz_tmp; j++) { 2066 idx = ics[ajtmp[j]]; 2067 rtmp11[idx] = v1[j]; 2068 rtmp22[idx] = v2[j]; 2069 } 2070 rtmp11[ics[r[row]]] += sctx.shift_amount; 2071 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2072 2073 prow = *bjtmp++; 2074 while (prow < row) { 2075 pc1 = rtmp11 + prow; 2076 pc2 = rtmp22 + prow; 2077 if (*pc1 != 0.0 || *pc2 != 0.0) { 2078 pv = ba + bd[prow]; 2079 pj = nbj + bd[prow]; 2080 mul1 = *pc1 * *pv; 2081 mul2 = *pc2 * *pv; 2082 ++pv; 2083 *pc1 = mul1; 2084 *pc2 = mul2; 2085 2086 nz_tmp = bi[prow+1] - bd[prow] - 1; 2087 for (j=0; j<nz_tmp; j++) { 2088 tmp = pv[j]; 2089 idx = pj[j]; 2090 rtmp11[idx] -= mul1 * tmp; 2091 rtmp22[idx] -= mul2 * tmp; 2092 } 2093 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2094 } 2095 prow = *bjtmp++; 2096 } 2097 2098 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2099 pc1 = rtmp11 + prow; 2100 pc2 = rtmp22 + prow; 2101 2102 sctx.pv = *pc1; 2103 pj = bj + bi[prow]; 2104 rs = 0.0; 2105 for (j=0; j<nz; j++) { 2106 idx = pj[j]; 2107 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2108 } 2109 sctx.rs = rs; 2110 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2111 if (sctx.newshift) goto endofwhile; 2112 2113 if (*pc2 != 0.0) { 2114 pj = nbj + bd[prow]; 2115 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2116 *pc2 = mul2; 2117 nz_tmp = bi[prow+1] - bd[prow] - 1; 2118 for (j=0; j<nz_tmp; j++) { 2119 idx = pj[j]; 2120 tmp = rtmp11[idx]; 2121 rtmp22[idx] -= mul2 * tmp; 2122 } 2123 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2124 } 2125 2126 pj = bj + bi[row]; 2127 pc1 = ba + bi[row]; 2128 pc2 = ba + bi[row+1]; 2129 2130 sctx.pv = rtmp22[row+1]; 2131 rs = 0.0; 2132 rtmp11[row] = 1.0/rtmp11[row]; 2133 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2134 /* copy row entries from dense representation to sparse */ 2135 for (j=0; j<nz; j++) { 2136 idx = pj[j]; 2137 pc1[j] = rtmp11[idx]; 2138 pc2[j] = rtmp22[idx]; 2139 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2140 } 2141 sctx.rs = rs; 2142 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2143 if (sctx.newshift) goto endofwhile; 2144 break; 2145 2146 case 3: 2147 for (j=0; j<nz; j++) { 2148 idx = bjtmp[j]; 2149 rtmp11[idx] = 0.0; 2150 rtmp22[idx] = 0.0; 2151 rtmp33[idx] = 0.0; 2152 } 2153 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2154 idx = r[row]; 2155 nz_tmp = ai[idx+1] - ai[idx]; 2156 ajtmp = aj + ai[idx]; 2157 v1 = aa + ai[idx]; 2158 v2 = aa + ai[idx+1]; 2159 v3 = aa + ai[idx+2]; 2160 for (j=0; j<nz_tmp; j++) { 2161 idx = ics[ajtmp[j]]; 2162 rtmp11[idx] = v1[j]; 2163 rtmp22[idx] = v2[j]; 2164 rtmp33[idx] = v3[j]; 2165 } 2166 rtmp11[ics[r[row]]] += sctx.shift_amount; 2167 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2168 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2169 2170 /* loop over all pivot row blocks above this row block */ 2171 prow = *bjtmp++; 2172 while (prow < row) { 2173 pc1 = rtmp11 + prow; 2174 pc2 = rtmp22 + prow; 2175 pc3 = rtmp33 + prow; 2176 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2177 pv = ba + bd[prow]; 2178 pj = nbj + bd[prow]; 2179 mul1 = *pc1 * *pv; 2180 mul2 = *pc2 * *pv; 2181 mul3 = *pc3 * *pv; 2182 ++pv; 2183 *pc1 = mul1; 2184 *pc2 = mul2; 2185 *pc3 = mul3; 2186 2187 nz_tmp = bi[prow+1] - bd[prow] - 1; 2188 /* update this row based on pivot row */ 2189 for (j=0; j<nz_tmp; j++) { 2190 tmp = pv[j]; 2191 idx = pj[j]; 2192 rtmp11[idx] -= mul1 * tmp; 2193 rtmp22[idx] -= mul2 * tmp; 2194 rtmp33[idx] -= mul3 * tmp; 2195 } 2196 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2197 } 2198 prow = *bjtmp++; 2199 } 2200 2201 /* Now take care of diagonal 3x3 block in this set of rows */ 2202 /* note: prow = row here */ 2203 pc1 = rtmp11 + prow; 2204 pc2 = rtmp22 + prow; 2205 pc3 = rtmp33 + prow; 2206 2207 sctx.pv = *pc1; 2208 pj = bj + bi[prow]; 2209 rs = 0.0; 2210 for (j=0; j<nz; j++) { 2211 idx = pj[j]; 2212 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2213 } 2214 sctx.rs = rs; 2215 ierr = MatPivotCheck(B,A,info,&sctx,row);CHKERRQ(ierr); 2216 if (sctx.newshift) goto endofwhile; 2217 2218 if (*pc2 != 0.0 || *pc3 != 0.0) { 2219 mul2 = (*pc2)/(*pc1); 2220 mul3 = (*pc3)/(*pc1); 2221 *pc2 = mul2; 2222 *pc3 = mul3; 2223 nz_tmp = bi[prow+1] - bd[prow] - 1; 2224 pj = nbj + bd[prow]; 2225 for (j=0; j<nz_tmp; j++) { 2226 idx = pj[j]; 2227 tmp = rtmp11[idx]; 2228 rtmp22[idx] -= mul2 * tmp; 2229 rtmp33[idx] -= mul3 * tmp; 2230 } 2231 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2232 } 2233 ++prow; 2234 2235 pc2 = rtmp22 + prow; 2236 pc3 = rtmp33 + prow; 2237 sctx.pv = *pc2; 2238 pj = bj + bi[prow]; 2239 rs = 0.0; 2240 for (j=0; j<nz; j++) { 2241 idx = pj[j]; 2242 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2243 } 2244 sctx.rs = rs; 2245 ierr = MatPivotCheck(B,A,info,&sctx,row+1);CHKERRQ(ierr); 2246 if (sctx.newshift) goto endofwhile; 2247 2248 if (*pc3 != 0.0) { 2249 mul3 = (*pc3)/(*pc2); 2250 *pc3 = mul3; 2251 pj = nbj + bd[prow]; 2252 nz_tmp = bi[prow+1] - bd[prow] - 1; 2253 for (j=0; j<nz_tmp; j++) { 2254 idx = pj[j]; 2255 tmp = rtmp22[idx]; 2256 rtmp33[idx] -= mul3 * tmp; 2257 } 2258 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2259 } 2260 2261 pj = bj + bi[row]; 2262 pc1 = ba + bi[row]; 2263 pc2 = ba + bi[row+1]; 2264 pc3 = ba + bi[row+2]; 2265 2266 sctx.pv = rtmp33[row+2]; 2267 rs = 0.0; 2268 rtmp11[row] = 1.0/rtmp11[row]; 2269 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2270 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2271 /* copy row entries from dense representation to sparse */ 2272 for (j=0; j<nz; j++) { 2273 idx = pj[j]; 2274 pc1[j] = rtmp11[idx]; 2275 pc2[j] = rtmp22[idx]; 2276 pc3[j] = rtmp33[idx]; 2277 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2278 } 2279 2280 sctx.rs = rs; 2281 ierr = MatPivotCheck(B,A,info,&sctx,row+2);CHKERRQ(ierr); 2282 if (sctx.newshift) goto endofwhile; 2283 break; 2284 2285 default: 2286 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2287 } 2288 row += nodesz; /* Update the row */ 2289 } 2290 endofwhile:; 2291 } while (sctx.newshift); 2292 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2293 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2294 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2295 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2296 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2297 2298 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2299 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2300 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2301 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2302 C->assembled = PETSC_TRUE; 2303 C->preallocated = PETSC_TRUE; 2304 if (sctx.nshift) { 2305 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2306 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2307 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2308 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2309 } 2310 } 2311 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2312 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2313 PetscFunctionReturn(0); 2314 } 2315 2316 2317 /* ----------------------------------------------------------- */ 2318 #undef __FUNCT__ 2319 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2320 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2321 { 2322 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2323 IS iscol = a->col,isrow = a->row; 2324 PetscErrorCode ierr; 2325 const PetscInt *r,*c,*rout,*cout; 2326 PetscInt i,j,n = A->rmap->n; 2327 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2328 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2329 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2330 PetscScalar sum1,sum2,sum3,sum4,sum5; 2331 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2332 const PetscScalar *b; 2333 2334 PetscFunctionBegin; 2335 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2336 node_max = a->inode.node_count; 2337 ns = a->inode.size; /* Node Size array */ 2338 2339 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2340 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2341 tmp = a->solve_work; 2342 2343 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2344 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2345 2346 /* forward solve the lower triangular */ 2347 tmps = tmp; 2348 aa = a_a; 2349 aj = a_j; 2350 ad = a->diag; 2351 2352 for (i = 0,row = 0; i< node_max; ++i) { 2353 nsz = ns[i]; 2354 aii = ai[row]; 2355 v1 = aa + aii; 2356 vi = aj + aii; 2357 nz = ai[row+1]- ai[row]; 2358 2359 if (i < node_max-1) { 2360 /* Prefetch the indices for the next block */ 2361 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2362 /* Prefetch the data for the next block */ 2363 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2364 } 2365 2366 switch (nsz) { /* Each loop in 'case' is unrolled */ 2367 case 1: 2368 sum1 = b[r[row]]; 2369 for (j=0; j<nz-1; j+=2) { 2370 i0 = vi[j]; 2371 i1 = vi[j+1]; 2372 tmp0 = tmps[i0]; 2373 tmp1 = tmps[i1]; 2374 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2375 } 2376 if (j == nz-1) { 2377 tmp0 = tmps[vi[j]]; 2378 sum1 -= v1[j]*tmp0; 2379 } 2380 tmp[row++]=sum1; 2381 break; 2382 case 2: 2383 sum1 = b[r[row]]; 2384 sum2 = b[r[row+1]]; 2385 v2 = aa + ai[row+1]; 2386 2387 for (j=0; j<nz-1; j+=2) { 2388 i0 = vi[j]; 2389 i1 = vi[j+1]; 2390 tmp0 = tmps[i0]; 2391 tmp1 = tmps[i1]; 2392 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2393 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2394 } 2395 if (j == nz-1) { 2396 tmp0 = tmps[vi[j]]; 2397 sum1 -= v1[j] *tmp0; 2398 sum2 -= v2[j] *tmp0; 2399 } 2400 sum2 -= v2[nz] * sum1; 2401 tmp[row++]=sum1; 2402 tmp[row++]=sum2; 2403 break; 2404 case 3: 2405 sum1 = b[r[row]]; 2406 sum2 = b[r[row+1]]; 2407 sum3 = b[r[row+2]]; 2408 v2 = aa + ai[row+1]; 2409 v3 = aa + ai[row+2]; 2410 2411 for (j=0; j<nz-1; j+=2) { 2412 i0 = vi[j]; 2413 i1 = vi[j+1]; 2414 tmp0 = tmps[i0]; 2415 tmp1 = tmps[i1]; 2416 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2417 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2418 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2419 } 2420 if (j == nz-1) { 2421 tmp0 = tmps[vi[j]]; 2422 sum1 -= v1[j] *tmp0; 2423 sum2 -= v2[j] *tmp0; 2424 sum3 -= v3[j] *tmp0; 2425 } 2426 sum2 -= v2[nz] * sum1; 2427 sum3 -= v3[nz] * sum1; 2428 sum3 -= v3[nz+1] * sum2; 2429 tmp[row++]=sum1; 2430 tmp[row++]=sum2; 2431 tmp[row++]=sum3; 2432 break; 2433 2434 case 4: 2435 sum1 = b[r[row]]; 2436 sum2 = b[r[row+1]]; 2437 sum3 = b[r[row+2]]; 2438 sum4 = b[r[row+3]]; 2439 v2 = aa + ai[row+1]; 2440 v3 = aa + ai[row+2]; 2441 v4 = aa + ai[row+3]; 2442 2443 for (j=0; j<nz-1; j+=2) { 2444 i0 = vi[j]; 2445 i1 = vi[j+1]; 2446 tmp0 = tmps[i0]; 2447 tmp1 = tmps[i1]; 2448 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2449 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2450 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2451 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2452 } 2453 if (j == nz-1) { 2454 tmp0 = tmps[vi[j]]; 2455 sum1 -= v1[j] *tmp0; 2456 sum2 -= v2[j] *tmp0; 2457 sum3 -= v3[j] *tmp0; 2458 sum4 -= v4[j] *tmp0; 2459 } 2460 sum2 -= v2[nz] * sum1; 2461 sum3 -= v3[nz] * sum1; 2462 sum4 -= v4[nz] * sum1; 2463 sum3 -= v3[nz+1] * sum2; 2464 sum4 -= v4[nz+1] * sum2; 2465 sum4 -= v4[nz+2] * sum3; 2466 2467 tmp[row++]=sum1; 2468 tmp[row++]=sum2; 2469 tmp[row++]=sum3; 2470 tmp[row++]=sum4; 2471 break; 2472 case 5: 2473 sum1 = b[r[row]]; 2474 sum2 = b[r[row+1]]; 2475 sum3 = b[r[row+2]]; 2476 sum4 = b[r[row+3]]; 2477 sum5 = b[r[row+4]]; 2478 v2 = aa + ai[row+1]; 2479 v3 = aa + ai[row+2]; 2480 v4 = aa + ai[row+3]; 2481 v5 = aa + ai[row+4]; 2482 2483 for (j=0; j<nz-1; j+=2) { 2484 i0 = vi[j]; 2485 i1 = vi[j+1]; 2486 tmp0 = tmps[i0]; 2487 tmp1 = tmps[i1]; 2488 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2489 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2490 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2491 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2492 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2493 } 2494 if (j == nz-1) { 2495 tmp0 = tmps[vi[j]]; 2496 sum1 -= v1[j] *tmp0; 2497 sum2 -= v2[j] *tmp0; 2498 sum3 -= v3[j] *tmp0; 2499 sum4 -= v4[j] *tmp0; 2500 sum5 -= v5[j] *tmp0; 2501 } 2502 2503 sum2 -= v2[nz] * sum1; 2504 sum3 -= v3[nz] * sum1; 2505 sum4 -= v4[nz] * sum1; 2506 sum5 -= v5[nz] * sum1; 2507 sum3 -= v3[nz+1] * sum2; 2508 sum4 -= v4[nz+1] * sum2; 2509 sum5 -= v5[nz+1] * sum2; 2510 sum4 -= v4[nz+2] * sum3; 2511 sum5 -= v5[nz+2] * sum3; 2512 sum5 -= v5[nz+3] * sum4; 2513 2514 tmp[row++]=sum1; 2515 tmp[row++]=sum2; 2516 tmp[row++]=sum3; 2517 tmp[row++]=sum4; 2518 tmp[row++]=sum5; 2519 break; 2520 default: 2521 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2522 } 2523 } 2524 /* backward solve the upper triangular */ 2525 for (i=node_max -1,row = n-1; i>=0; i--) { 2526 nsz = ns[i]; 2527 aii = ad[row+1] + 1; 2528 v1 = aa + aii; 2529 vi = aj + aii; 2530 nz = ad[row]- ad[row+1] - 1; 2531 2532 if (i > 0) { 2533 /* Prefetch the indices for the next block */ 2534 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2535 /* Prefetch the data for the next block */ 2536 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2537 } 2538 2539 switch (nsz) { /* Each loop in 'case' is unrolled */ 2540 case 1: 2541 sum1 = tmp[row]; 2542 2543 for (j=0; j<nz-1; j+=2) { 2544 i0 = vi[j]; 2545 i1 = vi[j+1]; 2546 tmp0 = tmps[i0]; 2547 tmp1 = tmps[i1]; 2548 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2549 } 2550 if (j == nz-1) { 2551 tmp0 = tmps[vi[j]]; 2552 sum1 -= v1[j]*tmp0; 2553 } 2554 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2555 break; 2556 case 2: 2557 sum1 = tmp[row]; 2558 sum2 = tmp[row-1]; 2559 v2 = aa + ad[row] + 1; 2560 for (j=0; j<nz-1; j+=2) { 2561 i0 = vi[j]; 2562 i1 = vi[j+1]; 2563 tmp0 = tmps[i0]; 2564 tmp1 = tmps[i1]; 2565 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2566 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2567 } 2568 if (j == nz-1) { 2569 tmp0 = tmps[vi[j]]; 2570 sum1 -= v1[j]* tmp0; 2571 sum2 -= v2[j+1]* tmp0; 2572 } 2573 2574 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2575 sum2 -= v2[0] * tmp0; 2576 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2577 break; 2578 case 3: 2579 sum1 = tmp[row]; 2580 sum2 = tmp[row -1]; 2581 sum3 = tmp[row -2]; 2582 v2 = aa + ad[row] + 1; 2583 v3 = aa + ad[row -1] + 1; 2584 for (j=0; j<nz-1; j+=2) { 2585 i0 = vi[j]; 2586 i1 = vi[j+1]; 2587 tmp0 = tmps[i0]; 2588 tmp1 = tmps[i1]; 2589 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2590 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2591 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2592 } 2593 if (j== nz-1) { 2594 tmp0 = tmps[vi[j]]; 2595 sum1 -= v1[j] * tmp0; 2596 sum2 -= v2[j+1] * tmp0; 2597 sum3 -= v3[j+2] * tmp0; 2598 } 2599 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2600 sum2 -= v2[0]* tmp0; 2601 sum3 -= v3[1] * tmp0; 2602 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2603 sum3 -= v3[0]* tmp0; 2604 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2605 2606 break; 2607 case 4: 2608 sum1 = tmp[row]; 2609 sum2 = tmp[row -1]; 2610 sum3 = tmp[row -2]; 2611 sum4 = tmp[row -3]; 2612 v2 = aa + ad[row]+1; 2613 v3 = aa + ad[row -1]+1; 2614 v4 = aa + ad[row -2]+1; 2615 2616 for (j=0; j<nz-1; j+=2) { 2617 i0 = vi[j]; 2618 i1 = vi[j+1]; 2619 tmp0 = tmps[i0]; 2620 tmp1 = tmps[i1]; 2621 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2622 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2623 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2624 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2625 } 2626 if (j== nz-1) { 2627 tmp0 = tmps[vi[j]]; 2628 sum1 -= v1[j] * tmp0; 2629 sum2 -= v2[j+1] * tmp0; 2630 sum3 -= v3[j+2] * tmp0; 2631 sum4 -= v4[j+3] * tmp0; 2632 } 2633 2634 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2635 sum2 -= v2[0] * tmp0; 2636 sum3 -= v3[1] * tmp0; 2637 sum4 -= v4[2] * tmp0; 2638 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2639 sum3 -= v3[0] * tmp0; 2640 sum4 -= v4[1] * tmp0; 2641 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2642 sum4 -= v4[0] * tmp0; 2643 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2644 break; 2645 case 5: 2646 sum1 = tmp[row]; 2647 sum2 = tmp[row -1]; 2648 sum3 = tmp[row -2]; 2649 sum4 = tmp[row -3]; 2650 sum5 = tmp[row -4]; 2651 v2 = aa + ad[row]+1; 2652 v3 = aa + ad[row -1]+1; 2653 v4 = aa + ad[row -2]+1; 2654 v5 = aa + ad[row -3]+1; 2655 for (j=0; j<nz-1; j+=2) { 2656 i0 = vi[j]; 2657 i1 = vi[j+1]; 2658 tmp0 = tmps[i0]; 2659 tmp1 = tmps[i1]; 2660 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2661 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2662 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2663 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2664 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2665 } 2666 if (j==nz-1) { 2667 tmp0 = tmps[vi[j]]; 2668 sum1 -= v1[j] * tmp0; 2669 sum2 -= v2[j+1] * tmp0; 2670 sum3 -= v3[j+2] * tmp0; 2671 sum4 -= v4[j+3] * tmp0; 2672 sum5 -= v5[j+4] * tmp0; 2673 } 2674 2675 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2676 sum2 -= v2[0] * tmp0; 2677 sum3 -= v3[1] * tmp0; 2678 sum4 -= v4[2] * tmp0; 2679 sum5 -= v5[3] * tmp0; 2680 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2681 sum3 -= v3[0] * tmp0; 2682 sum4 -= v4[1] * tmp0; 2683 sum5 -= v5[2] * tmp0; 2684 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2685 sum4 -= v4[0] * tmp0; 2686 sum5 -= v5[1] * tmp0; 2687 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2688 sum5 -= v5[0] * tmp0; 2689 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2690 break; 2691 default: 2692 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2693 } 2694 } 2695 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2696 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2697 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2698 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2699 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2700 PetscFunctionReturn(0); 2701 } 2702 2703 2704 /* 2705 Makes a longer coloring[] array and calls the usual code with that 2706 */ 2707 #undef __FUNCT__ 2708 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2709 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2710 { 2711 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2712 PetscErrorCode ierr; 2713 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2714 PetscInt *colorused,i; 2715 ISColoringValue *newcolor; 2716 2717 PetscFunctionBegin; 2718 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2719 /* loop over inodes, marking a color for each column*/ 2720 row = 0; 2721 for (i=0; i<m; i++) { 2722 for (j=0; j<ns[i]; j++) { 2723 newcolor[row++] = coloring[i] + j*ncolors; 2724 } 2725 } 2726 2727 /* eliminate unneeded colors */ 2728 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2729 for (i=0; i<n; i++) { 2730 colorused[newcolor[i]] = 1; 2731 } 2732 2733 for (i=1; i<5*ncolors; i++) { 2734 colorused[i] += colorused[i-1]; 2735 } 2736 ncolors = colorused[5*ncolors-1]; 2737 for (i=0; i<n; i++) { 2738 newcolor[i] = colorused[newcolor[i]]-1; 2739 } 2740 ierr = PetscFree(colorused);CHKERRQ(ierr); 2741 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2742 ierr = PetscFree(coloring);CHKERRQ(ierr); 2743 PetscFunctionReturn(0); 2744 } 2745 2746 #include <petsc/private/kernels/blockinvert.h> 2747 2748 #undef __FUNCT__ 2749 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2750 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2751 { 2752 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2753 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2754 MatScalar *ibdiag,*bdiag,work[25],*t; 2755 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2756 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2757 const PetscScalar *xb, *b; 2758 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2759 PetscErrorCode ierr; 2760 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2761 PetscInt sz,k,ipvt[5]; 2762 PetscBool allowzeropivot,zeropivotdetected=PETSC_FALSE; 2763 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2764 2765 PetscFunctionBegin; 2766 allowzeropivot = PetscNot(A->erroriffailure); 2767 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2768 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 2769 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2770 2771 if (!a->inode.ibdiagvalid) { 2772 if (!a->inode.ibdiag) { 2773 /* calculate space needed for diagonal blocks */ 2774 for (i=0; i<m; i++) { 2775 cnt += sizes[i]*sizes[i]; 2776 } 2777 a->inode.bdiagsize = cnt; 2778 2779 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2780 } 2781 2782 /* copy over the diagonal blocks and invert them */ 2783 ibdiag = a->inode.ibdiag; 2784 bdiag = a->inode.bdiag; 2785 cnt = 0; 2786 for (i=0, row = 0; i<m; i++) { 2787 for (j=0; j<sizes[i]; j++) { 2788 for (k=0; k<sizes[i]; k++) { 2789 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2790 } 2791 } 2792 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2793 2794 switch (sizes[i]) { 2795 case 1: 2796 /* Create matrix data structure */ 2797 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2798 if (allowzeropivot) { 2799 zeropivotdetected = PETSC_TRUE; 2800 ierr = PetscInfo1(A,"Zero pivot, row %D\n",row);CHKERRQ(ierr); 2801 } else SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2802 } 2803 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2804 break; 2805 case 2: 2806 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2807 break; 2808 case 3: 2809 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2810 break; 2811 case 4: 2812 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2813 break; 2814 case 5: 2815 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected);CHKERRQ(ierr); 2816 break; 2817 default: 2818 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2819 } 2820 if (zeropivotdetected) { 2821 A->errortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2822 } 2823 2824 cnt += sizes[i]*sizes[i]; 2825 row += sizes[i]; 2826 } 2827 a->inode.ibdiagvalid = PETSC_TRUE; 2828 } 2829 ibdiag = a->inode.ibdiag; 2830 bdiag = a->inode.bdiag; 2831 t = a->inode.ssor_work; 2832 2833 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2834 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2835 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2836 if (flag & SOR_ZERO_INITIAL_GUESS) { 2837 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2838 2839 for (i=0, row=0; i<m; i++) { 2840 sz = diag[row] - ii[row]; 2841 v1 = a->a + ii[row]; 2842 idx = a->j + ii[row]; 2843 2844 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2845 switch (sizes[i]) { 2846 case 1: 2847 2848 sum1 = b[row]; 2849 for (n = 0; n<sz-1; n+=2) { 2850 i1 = idx[0]; 2851 i2 = idx[1]; 2852 idx += 2; 2853 tmp0 = x[i1]; 2854 tmp1 = x[i2]; 2855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2856 } 2857 2858 if (n == sz-1) { 2859 tmp0 = x[*idx]; 2860 sum1 -= *v1 * tmp0; 2861 } 2862 t[row] = sum1; 2863 x[row++] = sum1*(*ibdiag++); 2864 break; 2865 case 2: 2866 v2 = a->a + ii[row+1]; 2867 sum1 = b[row]; 2868 sum2 = b[row+1]; 2869 for (n = 0; n<sz-1; n+=2) { 2870 i1 = idx[0]; 2871 i2 = idx[1]; 2872 idx += 2; 2873 tmp0 = x[i1]; 2874 tmp1 = x[i2]; 2875 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2876 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2877 } 2878 2879 if (n == sz-1) { 2880 tmp0 = x[*idx]; 2881 sum1 -= v1[0] * tmp0; 2882 sum2 -= v2[0] * tmp0; 2883 } 2884 t[row] = sum1; 2885 t[row+1] = sum2; 2886 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2887 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2888 ibdiag += 4; 2889 break; 2890 case 3: 2891 v2 = a->a + ii[row+1]; 2892 v3 = a->a + ii[row+2]; 2893 sum1 = b[row]; 2894 sum2 = b[row+1]; 2895 sum3 = b[row+2]; 2896 for (n = 0; n<sz-1; n+=2) { 2897 i1 = idx[0]; 2898 i2 = idx[1]; 2899 idx += 2; 2900 tmp0 = x[i1]; 2901 tmp1 = x[i2]; 2902 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2903 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2904 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2905 } 2906 2907 if (n == sz-1) { 2908 tmp0 = x[*idx]; 2909 sum1 -= v1[0] * tmp0; 2910 sum2 -= v2[0] * tmp0; 2911 sum3 -= v3[0] * tmp0; 2912 } 2913 t[row] = sum1; 2914 t[row+1] = sum2; 2915 t[row+2] = sum3; 2916 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2917 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2918 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2919 ibdiag += 9; 2920 break; 2921 case 4: 2922 v2 = a->a + ii[row+1]; 2923 v3 = a->a + ii[row+2]; 2924 v4 = a->a + ii[row+3]; 2925 sum1 = b[row]; 2926 sum2 = b[row+1]; 2927 sum3 = b[row+2]; 2928 sum4 = b[row+3]; 2929 for (n = 0; n<sz-1; n+=2) { 2930 i1 = idx[0]; 2931 i2 = idx[1]; 2932 idx += 2; 2933 tmp0 = x[i1]; 2934 tmp1 = x[i2]; 2935 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2936 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2937 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2938 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2939 } 2940 2941 if (n == sz-1) { 2942 tmp0 = x[*idx]; 2943 sum1 -= v1[0] * tmp0; 2944 sum2 -= v2[0] * tmp0; 2945 sum3 -= v3[0] * tmp0; 2946 sum4 -= v4[0] * tmp0; 2947 } 2948 t[row] = sum1; 2949 t[row+1] = sum2; 2950 t[row+2] = sum3; 2951 t[row+3] = sum4; 2952 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2953 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2954 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2955 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2956 ibdiag += 16; 2957 break; 2958 case 5: 2959 v2 = a->a + ii[row+1]; 2960 v3 = a->a + ii[row+2]; 2961 v4 = a->a + ii[row+3]; 2962 v5 = a->a + ii[row+4]; 2963 sum1 = b[row]; 2964 sum2 = b[row+1]; 2965 sum3 = b[row+2]; 2966 sum4 = b[row+3]; 2967 sum5 = b[row+4]; 2968 for (n = 0; n<sz-1; n+=2) { 2969 i1 = idx[0]; 2970 i2 = idx[1]; 2971 idx += 2; 2972 tmp0 = x[i1]; 2973 tmp1 = x[i2]; 2974 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2975 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2976 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2977 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2978 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2979 } 2980 2981 if (n == sz-1) { 2982 tmp0 = x[*idx]; 2983 sum1 -= v1[0] * tmp0; 2984 sum2 -= v2[0] * tmp0; 2985 sum3 -= v3[0] * tmp0; 2986 sum4 -= v4[0] * tmp0; 2987 sum5 -= v5[0] * tmp0; 2988 } 2989 t[row] = sum1; 2990 t[row+1] = sum2; 2991 t[row+2] = sum3; 2992 t[row+3] = sum4; 2993 t[row+4] = sum5; 2994 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2995 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2996 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2997 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2998 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2999 ibdiag += 25; 3000 break; 3001 default: 3002 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3003 } 3004 } 3005 3006 xb = t; 3007 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3008 } else xb = b; 3009 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3010 3011 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3012 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3013 ibdiag -= sizes[i]*sizes[i]; 3014 sz = ii[row+1] - diag[row] - 1; 3015 v1 = a->a + diag[row] + 1; 3016 idx = a->j + diag[row] + 1; 3017 3018 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3019 switch (sizes[i]) { 3020 case 1: 3021 3022 sum1 = xb[row]; 3023 for (n = 0; n<sz-1; n+=2) { 3024 i1 = idx[0]; 3025 i2 = idx[1]; 3026 idx += 2; 3027 tmp0 = x[i1]; 3028 tmp1 = x[i2]; 3029 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3030 } 3031 3032 if (n == sz-1) { 3033 tmp0 = x[*idx]; 3034 sum1 -= *v1*tmp0; 3035 } 3036 x[row--] = sum1*(*ibdiag); 3037 break; 3038 3039 case 2: 3040 3041 sum1 = xb[row]; 3042 sum2 = xb[row-1]; 3043 /* note that sum1 is associated with the second of the two rows */ 3044 v2 = a->a + diag[row-1] + 2; 3045 for (n = 0; n<sz-1; n+=2) { 3046 i1 = idx[0]; 3047 i2 = idx[1]; 3048 idx += 2; 3049 tmp0 = x[i1]; 3050 tmp1 = x[i2]; 3051 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3052 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3053 } 3054 3055 if (n == sz-1) { 3056 tmp0 = x[*idx]; 3057 sum1 -= *v1*tmp0; 3058 sum2 -= *v2*tmp0; 3059 } 3060 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3061 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3062 break; 3063 case 3: 3064 3065 sum1 = xb[row]; 3066 sum2 = xb[row-1]; 3067 sum3 = xb[row-2]; 3068 v2 = a->a + diag[row-1] + 2; 3069 v3 = a->a + diag[row-2] + 3; 3070 for (n = 0; n<sz-1; n+=2) { 3071 i1 = idx[0]; 3072 i2 = idx[1]; 3073 idx += 2; 3074 tmp0 = x[i1]; 3075 tmp1 = x[i2]; 3076 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3077 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3078 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3079 } 3080 3081 if (n == sz-1) { 3082 tmp0 = x[*idx]; 3083 sum1 -= *v1*tmp0; 3084 sum2 -= *v2*tmp0; 3085 sum3 -= *v3*tmp0; 3086 } 3087 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3088 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3089 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3090 break; 3091 case 4: 3092 3093 sum1 = xb[row]; 3094 sum2 = xb[row-1]; 3095 sum3 = xb[row-2]; 3096 sum4 = xb[row-3]; 3097 v2 = a->a + diag[row-1] + 2; 3098 v3 = a->a + diag[row-2] + 3; 3099 v4 = a->a + diag[row-3] + 4; 3100 for (n = 0; n<sz-1; n+=2) { 3101 i1 = idx[0]; 3102 i2 = idx[1]; 3103 idx += 2; 3104 tmp0 = x[i1]; 3105 tmp1 = x[i2]; 3106 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3107 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3108 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3109 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3110 } 3111 3112 if (n == sz-1) { 3113 tmp0 = x[*idx]; 3114 sum1 -= *v1*tmp0; 3115 sum2 -= *v2*tmp0; 3116 sum3 -= *v3*tmp0; 3117 sum4 -= *v4*tmp0; 3118 } 3119 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3120 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3121 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3122 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3123 break; 3124 case 5: 3125 3126 sum1 = xb[row]; 3127 sum2 = xb[row-1]; 3128 sum3 = xb[row-2]; 3129 sum4 = xb[row-3]; 3130 sum5 = xb[row-4]; 3131 v2 = a->a + diag[row-1] + 2; 3132 v3 = a->a + diag[row-2] + 3; 3133 v4 = a->a + diag[row-3] + 4; 3134 v5 = a->a + diag[row-4] + 5; 3135 for (n = 0; n<sz-1; n+=2) { 3136 i1 = idx[0]; 3137 i2 = idx[1]; 3138 idx += 2; 3139 tmp0 = x[i1]; 3140 tmp1 = x[i2]; 3141 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3142 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3143 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3144 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3145 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3146 } 3147 3148 if (n == sz-1) { 3149 tmp0 = x[*idx]; 3150 sum1 -= *v1*tmp0; 3151 sum2 -= *v2*tmp0; 3152 sum3 -= *v3*tmp0; 3153 sum4 -= *v4*tmp0; 3154 sum5 -= *v5*tmp0; 3155 } 3156 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3157 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3158 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3159 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3160 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3161 break; 3162 default: 3163 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3164 } 3165 } 3166 3167 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3168 } 3169 its--; 3170 } 3171 while (its--) { 3172 3173 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3174 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3175 i<m; 3176 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3177 3178 sz = diag[row] - ii[row]; 3179 v1 = a->a + ii[row]; 3180 idx = a->j + ii[row]; 3181 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3182 switch (sizes[i]) { 3183 case 1: 3184 sum1 = b[row]; 3185 for (n = 0; n<sz-1; n+=2) { 3186 i1 = idx[0]; 3187 i2 = idx[1]; 3188 idx += 2; 3189 tmp0 = x[i1]; 3190 tmp1 = x[i2]; 3191 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3192 } 3193 if (n == sz-1) { 3194 tmp0 = x[*idx++]; 3195 sum1 -= *v1 * tmp0; 3196 v1++; 3197 } 3198 t[row] = sum1; 3199 sz = ii[row+1] - diag[row] - 1; 3200 idx = a->j + diag[row] + 1; 3201 v1 += 1; 3202 for (n = 0; n<sz-1; n+=2) { 3203 i1 = idx[0]; 3204 i2 = idx[1]; 3205 idx += 2; 3206 tmp0 = x[i1]; 3207 tmp1 = x[i2]; 3208 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3209 } 3210 if (n == sz-1) { 3211 tmp0 = x[*idx++]; 3212 sum1 -= *v1 * tmp0; 3213 } 3214 /* in MatSOR_SeqAIJ this line would be 3215 * 3216 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3217 * 3218 * but omega == 1, so this becomes 3219 * 3220 * x[row] = sum1*(*ibdiag++); 3221 * 3222 */ 3223 x[row] = sum1*(*ibdiag); 3224 break; 3225 case 2: 3226 v2 = a->a + ii[row+1]; 3227 sum1 = b[row]; 3228 sum2 = b[row+1]; 3229 for (n = 0; n<sz-1; n+=2) { 3230 i1 = idx[0]; 3231 i2 = idx[1]; 3232 idx += 2; 3233 tmp0 = x[i1]; 3234 tmp1 = x[i2]; 3235 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3236 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3237 } 3238 if (n == sz-1) { 3239 tmp0 = x[*idx++]; 3240 sum1 -= v1[0] * tmp0; 3241 sum2 -= v2[0] * tmp0; 3242 v1++; v2++; 3243 } 3244 t[row] = sum1; 3245 t[row+1] = sum2; 3246 sz = ii[row+1] - diag[row] - 2; 3247 idx = a->j + diag[row] + 2; 3248 v1 += 2; 3249 v2 += 2; 3250 for (n = 0; n<sz-1; n+=2) { 3251 i1 = idx[0]; 3252 i2 = idx[1]; 3253 idx += 2; 3254 tmp0 = x[i1]; 3255 tmp1 = x[i2]; 3256 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3257 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3258 } 3259 if (n == sz-1) { 3260 tmp0 = x[*idx]; 3261 sum1 -= v1[0] * tmp0; 3262 sum2 -= v2[0] * tmp0; 3263 } 3264 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3265 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3266 break; 3267 case 3: 3268 v2 = a->a + ii[row+1]; 3269 v3 = a->a + ii[row+2]; 3270 sum1 = b[row]; 3271 sum2 = b[row+1]; 3272 sum3 = b[row+2]; 3273 for (n = 0; n<sz-1; n+=2) { 3274 i1 = idx[0]; 3275 i2 = idx[1]; 3276 idx += 2; 3277 tmp0 = x[i1]; 3278 tmp1 = x[i2]; 3279 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3280 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3281 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3282 } 3283 if (n == sz-1) { 3284 tmp0 = x[*idx++]; 3285 sum1 -= v1[0] * tmp0; 3286 sum2 -= v2[0] * tmp0; 3287 sum3 -= v3[0] * tmp0; 3288 v1++; v2++; v3++; 3289 } 3290 t[row] = sum1; 3291 t[row+1] = sum2; 3292 t[row+2] = sum3; 3293 sz = ii[row+1] - diag[row] - 3; 3294 idx = a->j + diag[row] + 3; 3295 v1 += 3; 3296 v2 += 3; 3297 v3 += 3; 3298 for (n = 0; n<sz-1; n+=2) { 3299 i1 = idx[0]; 3300 i2 = idx[1]; 3301 idx += 2; 3302 tmp0 = x[i1]; 3303 tmp1 = x[i2]; 3304 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3305 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3306 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3307 } 3308 if (n == sz-1) { 3309 tmp0 = x[*idx]; 3310 sum1 -= v1[0] * tmp0; 3311 sum2 -= v2[0] * tmp0; 3312 sum3 -= v3[0] * tmp0; 3313 } 3314 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3315 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3316 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3317 break; 3318 case 4: 3319 v2 = a->a + ii[row+1]; 3320 v3 = a->a + ii[row+2]; 3321 v4 = a->a + ii[row+3]; 3322 sum1 = b[row]; 3323 sum2 = b[row+1]; 3324 sum3 = b[row+2]; 3325 sum4 = b[row+3]; 3326 for (n = 0; n<sz-1; n+=2) { 3327 i1 = idx[0]; 3328 i2 = idx[1]; 3329 idx += 2; 3330 tmp0 = x[i1]; 3331 tmp1 = x[i2]; 3332 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3333 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3334 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3335 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3336 } 3337 if (n == sz-1) { 3338 tmp0 = x[*idx++]; 3339 sum1 -= v1[0] * tmp0; 3340 sum2 -= v2[0] * tmp0; 3341 sum3 -= v3[0] * tmp0; 3342 sum4 -= v4[0] * tmp0; 3343 v1++; v2++; v3++; v4++; 3344 } 3345 t[row] = sum1; 3346 t[row+1] = sum2; 3347 t[row+2] = sum3; 3348 t[row+3] = sum4; 3349 sz = ii[row+1] - diag[row] - 4; 3350 idx = a->j + diag[row] + 4; 3351 v1 += 4; 3352 v2 += 4; 3353 v3 += 4; 3354 v4 += 4; 3355 for (n = 0; n<sz-1; n+=2) { 3356 i1 = idx[0]; 3357 i2 = idx[1]; 3358 idx += 2; 3359 tmp0 = x[i1]; 3360 tmp1 = x[i2]; 3361 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3362 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3363 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3364 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3365 } 3366 if (n == sz-1) { 3367 tmp0 = x[*idx]; 3368 sum1 -= v1[0] * tmp0; 3369 sum2 -= v2[0] * tmp0; 3370 sum3 -= v3[0] * tmp0; 3371 sum4 -= v4[0] * tmp0; 3372 } 3373 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3374 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3375 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3376 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3377 break; 3378 case 5: 3379 v2 = a->a + ii[row+1]; 3380 v3 = a->a + ii[row+2]; 3381 v4 = a->a + ii[row+3]; 3382 v5 = a->a + ii[row+4]; 3383 sum1 = b[row]; 3384 sum2 = b[row+1]; 3385 sum3 = b[row+2]; 3386 sum4 = b[row+3]; 3387 sum5 = b[row+4]; 3388 for (n = 0; n<sz-1; n+=2) { 3389 i1 = idx[0]; 3390 i2 = idx[1]; 3391 idx += 2; 3392 tmp0 = x[i1]; 3393 tmp1 = x[i2]; 3394 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3395 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3396 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3397 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3398 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3399 } 3400 if (n == sz-1) { 3401 tmp0 = x[*idx++]; 3402 sum1 -= v1[0] * tmp0; 3403 sum2 -= v2[0] * tmp0; 3404 sum3 -= v3[0] * tmp0; 3405 sum4 -= v4[0] * tmp0; 3406 sum5 -= v5[0] * tmp0; 3407 v1++; v2++; v3++; v4++; v5++; 3408 } 3409 t[row] = sum1; 3410 t[row+1] = sum2; 3411 t[row+2] = sum3; 3412 t[row+3] = sum4; 3413 t[row+4] = sum5; 3414 sz = ii[row+1] - diag[row] - 5; 3415 idx = a->j + diag[row] + 5; 3416 v1 += 5; 3417 v2 += 5; 3418 v3 += 5; 3419 v4 += 5; 3420 v5 += 5; 3421 for (n = 0; n<sz-1; n+=2) { 3422 i1 = idx[0]; 3423 i2 = idx[1]; 3424 idx += 2; 3425 tmp0 = x[i1]; 3426 tmp1 = x[i2]; 3427 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3428 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3429 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3430 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3431 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3432 } 3433 if (n == sz-1) { 3434 tmp0 = x[*idx]; 3435 sum1 -= v1[0] * tmp0; 3436 sum2 -= v2[0] * tmp0; 3437 sum3 -= v3[0] * tmp0; 3438 sum4 -= v4[0] * tmp0; 3439 sum5 -= v5[0] * tmp0; 3440 } 3441 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3442 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3443 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3444 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3445 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3446 break; 3447 default: 3448 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3449 } 3450 } 3451 xb = t; 3452 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3453 } else xb = b; 3454 3455 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3456 3457 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3458 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3459 ibdiag -= sizes[i]*sizes[i]; 3460 3461 /* set RHS */ 3462 if (xb == b) { 3463 /* whole (old way) */ 3464 sz = ii[row+1] - ii[row]; 3465 idx = a->j + ii[row]; 3466 switch (sizes[i]) { 3467 case 5: 3468 v5 = a->a + ii[row-4]; 3469 case 4: /* fall through */ 3470 v4 = a->a + ii[row-3]; 3471 case 3: 3472 v3 = a->a + ii[row-2]; 3473 case 2: 3474 v2 = a->a + ii[row-1]; 3475 case 1: 3476 v1 = a->a + ii[row]; 3477 break; 3478 default: 3479 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3480 } 3481 } else { 3482 /* upper, no diag */ 3483 sz = ii[row+1] - diag[row] - 1; 3484 idx = a->j + diag[row] + 1; 3485 switch (sizes[i]) { 3486 case 5: 3487 v5 = a->a + diag[row-4] + 5; 3488 case 4: /* fall through */ 3489 v4 = a->a + diag[row-3] + 4; 3490 case 3: 3491 v3 = a->a + diag[row-2] + 3; 3492 case 2: 3493 v2 = a->a + diag[row-1] + 2; 3494 case 1: 3495 v1 = a->a + diag[row] + 1; 3496 } 3497 } 3498 /* set sum */ 3499 switch (sizes[i]) { 3500 case 5: 3501 sum5 = xb[row-4]; 3502 case 4: /* fall through */ 3503 sum4 = xb[row-3]; 3504 case 3: 3505 sum3 = xb[row-2]; 3506 case 2: 3507 sum2 = xb[row-1]; 3508 case 1: 3509 /* note that sum1 is associated with the last row */ 3510 sum1 = xb[row]; 3511 } 3512 /* do sums */ 3513 for (n = 0; n<sz-1; n+=2) { 3514 i1 = idx[0]; 3515 i2 = idx[1]; 3516 idx += 2; 3517 tmp0 = x[i1]; 3518 tmp1 = x[i2]; 3519 switch (sizes[i]) { 3520 case 5: 3521 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3522 case 4: /* fall through */ 3523 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3524 case 3: 3525 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3526 case 2: 3527 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3528 case 1: 3529 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3530 } 3531 } 3532 /* ragged edge */ 3533 if (n == sz-1) { 3534 tmp0 = x[*idx]; 3535 switch (sizes[i]) { 3536 case 5: 3537 sum5 -= *v5*tmp0; 3538 case 4: /* fall through */ 3539 sum4 -= *v4*tmp0; 3540 case 3: 3541 sum3 -= *v3*tmp0; 3542 case 2: 3543 sum2 -= *v2*tmp0; 3544 case 1: 3545 sum1 -= *v1*tmp0; 3546 } 3547 } 3548 /* update */ 3549 if (xb == b) { 3550 /* whole (old way) w/ diag */ 3551 switch (sizes[i]) { 3552 case 5: 3553 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3554 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3555 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3556 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3557 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3558 break; 3559 case 4: 3560 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3561 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3562 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3563 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3564 break; 3565 case 3: 3566 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3567 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3568 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3569 break; 3570 case 2: 3571 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3572 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3573 break; 3574 case 1: 3575 x[row--] += sum1*(*ibdiag); 3576 break; 3577 } 3578 } else { 3579 /* no diag so set = */ 3580 switch (sizes[i]) { 3581 case 5: 3582 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3583 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3584 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3585 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3586 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3587 break; 3588 case 4: 3589 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3590 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3591 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3592 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3593 break; 3594 case 3: 3595 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3596 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3597 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3598 break; 3599 case 2: 3600 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3601 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3602 break; 3603 case 1: 3604 x[row--] = sum1*(*ibdiag); 3605 break; 3606 } 3607 } 3608 } 3609 if (xb == b) { 3610 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3611 } else { 3612 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3613 } 3614 } 3615 } 3616 if (flag & SOR_EISENSTAT) { 3617 /* 3618 Apply (U + D)^-1 where D is now the block diagonal 3619 */ 3620 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3621 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3622 ibdiag -= sizes[i]*sizes[i]; 3623 sz = ii[row+1] - diag[row] - 1; 3624 v1 = a->a + diag[row] + 1; 3625 idx = a->j + diag[row] + 1; 3626 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3627 switch (sizes[i]) { 3628 case 1: 3629 3630 sum1 = b[row]; 3631 for (n = 0; n<sz-1; n+=2) { 3632 i1 = idx[0]; 3633 i2 = idx[1]; 3634 idx += 2; 3635 tmp0 = x[i1]; 3636 tmp1 = x[i2]; 3637 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3638 } 3639 3640 if (n == sz-1) { 3641 tmp0 = x[*idx]; 3642 sum1 -= *v1*tmp0; 3643 } 3644 x[row] = sum1*(*ibdiag);row--; 3645 break; 3646 3647 case 2: 3648 3649 sum1 = b[row]; 3650 sum2 = b[row-1]; 3651 /* note that sum1 is associated with the second of the two rows */ 3652 v2 = a->a + diag[row-1] + 2; 3653 for (n = 0; n<sz-1; n+=2) { 3654 i1 = idx[0]; 3655 i2 = idx[1]; 3656 idx += 2; 3657 tmp0 = x[i1]; 3658 tmp1 = x[i2]; 3659 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3660 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3661 } 3662 3663 if (n == sz-1) { 3664 tmp0 = x[*idx]; 3665 sum1 -= *v1*tmp0; 3666 sum2 -= *v2*tmp0; 3667 } 3668 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3669 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3670 row -= 2; 3671 break; 3672 case 3: 3673 3674 sum1 = b[row]; 3675 sum2 = b[row-1]; 3676 sum3 = b[row-2]; 3677 v2 = a->a + diag[row-1] + 2; 3678 v3 = a->a + diag[row-2] + 3; 3679 for (n = 0; n<sz-1; n+=2) { 3680 i1 = idx[0]; 3681 i2 = idx[1]; 3682 idx += 2; 3683 tmp0 = x[i1]; 3684 tmp1 = x[i2]; 3685 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3686 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3687 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3688 } 3689 3690 if (n == sz-1) { 3691 tmp0 = x[*idx]; 3692 sum1 -= *v1*tmp0; 3693 sum2 -= *v2*tmp0; 3694 sum3 -= *v3*tmp0; 3695 } 3696 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3697 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3698 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3699 row -= 3; 3700 break; 3701 case 4: 3702 3703 sum1 = b[row]; 3704 sum2 = b[row-1]; 3705 sum3 = b[row-2]; 3706 sum4 = b[row-3]; 3707 v2 = a->a + diag[row-1] + 2; 3708 v3 = a->a + diag[row-2] + 3; 3709 v4 = a->a + diag[row-3] + 4; 3710 for (n = 0; n<sz-1; n+=2) { 3711 i1 = idx[0]; 3712 i2 = idx[1]; 3713 idx += 2; 3714 tmp0 = x[i1]; 3715 tmp1 = x[i2]; 3716 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3717 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3718 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3719 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3720 } 3721 3722 if (n == sz-1) { 3723 tmp0 = x[*idx]; 3724 sum1 -= *v1*tmp0; 3725 sum2 -= *v2*tmp0; 3726 sum3 -= *v3*tmp0; 3727 sum4 -= *v4*tmp0; 3728 } 3729 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3730 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3731 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3732 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3733 row -= 4; 3734 break; 3735 case 5: 3736 3737 sum1 = b[row]; 3738 sum2 = b[row-1]; 3739 sum3 = b[row-2]; 3740 sum4 = b[row-3]; 3741 sum5 = b[row-4]; 3742 v2 = a->a + diag[row-1] + 2; 3743 v3 = a->a + diag[row-2] + 3; 3744 v4 = a->a + diag[row-3] + 4; 3745 v5 = a->a + diag[row-4] + 5; 3746 for (n = 0; n<sz-1; n+=2) { 3747 i1 = idx[0]; 3748 i2 = idx[1]; 3749 idx += 2; 3750 tmp0 = x[i1]; 3751 tmp1 = x[i2]; 3752 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3753 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3754 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3755 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3756 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3757 } 3758 3759 if (n == sz-1) { 3760 tmp0 = x[*idx]; 3761 sum1 -= *v1*tmp0; 3762 sum2 -= *v2*tmp0; 3763 sum3 -= *v3*tmp0; 3764 sum4 -= *v4*tmp0; 3765 sum5 -= *v5*tmp0; 3766 } 3767 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3768 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3769 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3770 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3771 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3772 row -= 5; 3773 break; 3774 default: 3775 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3776 } 3777 } 3778 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3779 3780 /* 3781 t = b - D x where D is the block diagonal 3782 */ 3783 cnt = 0; 3784 for (i=0, row=0; i<m; i++) { 3785 switch (sizes[i]) { 3786 case 1: 3787 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3788 break; 3789 case 2: 3790 x1 = x[row]; x2 = x[row+1]; 3791 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3792 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3793 t[row] = b[row] - tmp1; 3794 t[row+1] = b[row+1] - tmp2; row += 2; 3795 cnt += 4; 3796 break; 3797 case 3: 3798 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3799 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3800 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3801 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3802 t[row] = b[row] - tmp1; 3803 t[row+1] = b[row+1] - tmp2; 3804 t[row+2] = b[row+2] - tmp3; row += 3; 3805 cnt += 9; 3806 break; 3807 case 4: 3808 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3809 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3810 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3811 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3812 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3813 t[row] = b[row] - tmp1; 3814 t[row+1] = b[row+1] - tmp2; 3815 t[row+2] = b[row+2] - tmp3; 3816 t[row+3] = b[row+3] - tmp4; row += 4; 3817 cnt += 16; 3818 break; 3819 case 5: 3820 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3821 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3822 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3823 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3824 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3825 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3826 t[row] = b[row] - tmp1; 3827 t[row+1] = b[row+1] - tmp2; 3828 t[row+2] = b[row+2] - tmp3; 3829 t[row+3] = b[row+3] - tmp4; 3830 t[row+4] = b[row+4] - tmp5;row += 5; 3831 cnt += 25; 3832 break; 3833 default: 3834 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3835 } 3836 } 3837 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3838 3839 3840 3841 /* 3842 Apply (L + D)^-1 where D is the block diagonal 3843 */ 3844 for (i=0, row=0; i<m; i++) { 3845 sz = diag[row] - ii[row]; 3846 v1 = a->a + ii[row]; 3847 idx = a->j + ii[row]; 3848 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3849 switch (sizes[i]) { 3850 case 1: 3851 3852 sum1 = t[row]; 3853 for (n = 0; n<sz-1; n+=2) { 3854 i1 = idx[0]; 3855 i2 = idx[1]; 3856 idx += 2; 3857 tmp0 = t[i1]; 3858 tmp1 = t[i2]; 3859 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3860 } 3861 3862 if (n == sz-1) { 3863 tmp0 = t[*idx]; 3864 sum1 -= *v1 * tmp0; 3865 } 3866 x[row] += t[row] = sum1*(*ibdiag++); row++; 3867 break; 3868 case 2: 3869 v2 = a->a + ii[row+1]; 3870 sum1 = t[row]; 3871 sum2 = t[row+1]; 3872 for (n = 0; n<sz-1; n+=2) { 3873 i1 = idx[0]; 3874 i2 = idx[1]; 3875 idx += 2; 3876 tmp0 = t[i1]; 3877 tmp1 = t[i2]; 3878 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3879 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3880 } 3881 3882 if (n == sz-1) { 3883 tmp0 = t[*idx]; 3884 sum1 -= v1[0] * tmp0; 3885 sum2 -= v2[0] * tmp0; 3886 } 3887 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3888 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3889 ibdiag += 4; row += 2; 3890 break; 3891 case 3: 3892 v2 = a->a + ii[row+1]; 3893 v3 = a->a + ii[row+2]; 3894 sum1 = t[row]; 3895 sum2 = t[row+1]; 3896 sum3 = t[row+2]; 3897 for (n = 0; n<sz-1; n+=2) { 3898 i1 = idx[0]; 3899 i2 = idx[1]; 3900 idx += 2; 3901 tmp0 = t[i1]; 3902 tmp1 = t[i2]; 3903 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3905 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3906 } 3907 3908 if (n == sz-1) { 3909 tmp0 = t[*idx]; 3910 sum1 -= v1[0] * tmp0; 3911 sum2 -= v2[0] * tmp0; 3912 sum3 -= v3[0] * tmp0; 3913 } 3914 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3915 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3916 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3917 ibdiag += 9; row += 3; 3918 break; 3919 case 4: 3920 v2 = a->a + ii[row+1]; 3921 v3 = a->a + ii[row+2]; 3922 v4 = a->a + ii[row+3]; 3923 sum1 = t[row]; 3924 sum2 = t[row+1]; 3925 sum3 = t[row+2]; 3926 sum4 = t[row+3]; 3927 for (n = 0; n<sz-1; n+=2) { 3928 i1 = idx[0]; 3929 i2 = idx[1]; 3930 idx += 2; 3931 tmp0 = t[i1]; 3932 tmp1 = t[i2]; 3933 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3934 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3935 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3936 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3937 } 3938 3939 if (n == sz-1) { 3940 tmp0 = t[*idx]; 3941 sum1 -= v1[0] * tmp0; 3942 sum2 -= v2[0] * tmp0; 3943 sum3 -= v3[0] * tmp0; 3944 sum4 -= v4[0] * tmp0; 3945 } 3946 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3947 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3948 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3949 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3950 ibdiag += 16; row += 4; 3951 break; 3952 case 5: 3953 v2 = a->a + ii[row+1]; 3954 v3 = a->a + ii[row+2]; 3955 v4 = a->a + ii[row+3]; 3956 v5 = a->a + ii[row+4]; 3957 sum1 = t[row]; 3958 sum2 = t[row+1]; 3959 sum3 = t[row+2]; 3960 sum4 = t[row+3]; 3961 sum5 = t[row+4]; 3962 for (n = 0; n<sz-1; n+=2) { 3963 i1 = idx[0]; 3964 i2 = idx[1]; 3965 idx += 2; 3966 tmp0 = t[i1]; 3967 tmp1 = t[i2]; 3968 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3969 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3970 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3971 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3972 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3973 } 3974 3975 if (n == sz-1) { 3976 tmp0 = t[*idx]; 3977 sum1 -= v1[0] * tmp0; 3978 sum2 -= v2[0] * tmp0; 3979 sum3 -= v3[0] * tmp0; 3980 sum4 -= v4[0] * tmp0; 3981 sum5 -= v5[0] * tmp0; 3982 } 3983 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3984 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3985 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3986 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3987 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3988 ibdiag += 25; row += 5; 3989 break; 3990 default: 3991 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3992 } 3993 } 3994 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3995 } 3996 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3997 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3998 PetscFunctionReturn(0); 3999 } 4000 4001 #undef __FUNCT__ 4002 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 4003 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 4004 { 4005 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4006 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 4007 const MatScalar *bdiag = a->inode.bdiag; 4008 const PetscScalar *b; 4009 PetscErrorCode ierr; 4010 PetscInt m = a->inode.node_count,cnt = 0,i,row; 4011 const PetscInt *sizes = a->inode.size; 4012 4013 PetscFunctionBegin; 4014 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4015 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4016 cnt = 0; 4017 for (i=0, row=0; i<m; i++) { 4018 switch (sizes[i]) { 4019 case 1: 4020 x[row] = b[row]*bdiag[cnt++];row++; 4021 break; 4022 case 2: 4023 x1 = b[row]; x2 = b[row+1]; 4024 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 4025 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 4026 x[row++] = tmp1; 4027 x[row++] = tmp2; 4028 cnt += 4; 4029 break; 4030 case 3: 4031 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4032 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4033 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4034 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4035 x[row++] = tmp1; 4036 x[row++] = tmp2; 4037 x[row++] = tmp3; 4038 cnt += 9; 4039 break; 4040 case 4: 4041 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4042 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4043 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4044 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4045 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4046 x[row++] = tmp1; 4047 x[row++] = tmp2; 4048 x[row++] = tmp3; 4049 x[row++] = tmp4; 4050 cnt += 16; 4051 break; 4052 case 5: 4053 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4054 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4055 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4056 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4057 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4058 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4059 x[row++] = tmp1; 4060 x[row++] = tmp2; 4061 x[row++] = tmp3; 4062 x[row++] = tmp4; 4063 x[row++] = tmp5; 4064 cnt += 25; 4065 break; 4066 default: 4067 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4068 } 4069 } 4070 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4071 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4072 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4073 PetscFunctionReturn(0); 4074 } 4075 4076 /* 4077 samestructure indicates that the matrix has not changed its nonzero structure so we 4078 do not need to recompute the inodes 4079 */ 4080 #undef __FUNCT__ 4081 #define __FUNCT__ "MatSeqAIJCheckInode" 4082 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4083 { 4084 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4085 PetscErrorCode ierr; 4086 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4087 PetscBool flag; 4088 const PetscInt *idx,*idy,*ii; 4089 4090 PetscFunctionBegin; 4091 if (!a->inode.use) PetscFunctionReturn(0); 4092 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4093 4094 m = A->rmap->n; 4095 if (a->inode.size) ns = a->inode.size; 4096 else { 4097 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4098 } 4099 4100 i = 0; 4101 node_count = 0; 4102 idx = a->j; 4103 ii = a->i; 4104 while (i < m) { /* For each row */ 4105 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4106 /* Limits the number of elements in a node to 'a->inode.limit' */ 4107 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4108 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4109 if (nzy != nzx) break; 4110 idy += nzx; /* Same nonzero pattern */ 4111 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4112 if (!flag) break; 4113 } 4114 ns[node_count++] = blk_size; 4115 idx += blk_size*nzx; 4116 i = j; 4117 } 4118 /* If not enough inodes found,, do not use inode version of the routines */ 4119 if (!m || node_count > .8*m) { 4120 ierr = PetscFree(ns);CHKERRQ(ierr); 4121 4122 a->inode.node_count = 0; 4123 a->inode.size = NULL; 4124 a->inode.use = PETSC_FALSE; 4125 A->ops->mult = MatMult_SeqAIJ; 4126 A->ops->sor = MatSOR_SeqAIJ; 4127 A->ops->multadd = MatMultAdd_SeqAIJ; 4128 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4129 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4130 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4131 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4132 A->ops->coloringpatch = 0; 4133 A->ops->multdiagonalblock = 0; 4134 4135 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4136 } else { 4137 if (!A->factortype) { 4138 A->ops->mult = MatMult_SeqAIJ_Inode; 4139 A->ops->sor = MatSOR_SeqAIJ_Inode; 4140 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4141 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4142 if (A->rmap->n == A->cmap->n) { 4143 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4144 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4145 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4146 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4147 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4148 } 4149 } else { 4150 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4151 } 4152 a->inode.node_count = node_count; 4153 a->inode.size = ns; 4154 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4155 } 4156 a->inode.checked = PETSC_TRUE; 4157 a->inode.mat_nonzerostate = A->nonzerostate; 4158 PetscFunctionReturn(0); 4159 } 4160 4161 #undef __FUNCT__ 4162 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4163 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4164 { 4165 Mat B =*C; 4166 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4167 PetscErrorCode ierr; 4168 PetscInt m=A->rmap->n; 4169 4170 PetscFunctionBegin; 4171 c->inode.use = a->inode.use; 4172 c->inode.limit = a->inode.limit; 4173 c->inode.max_limit = a->inode.max_limit; 4174 if (a->inode.size) { 4175 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4176 c->inode.node_count = a->inode.node_count; 4177 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4178 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4179 if (!B->factortype) { 4180 B->ops->mult = MatMult_SeqAIJ_Inode; 4181 B->ops->sor = MatSOR_SeqAIJ_Inode; 4182 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4183 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4184 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4185 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4186 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4187 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4188 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4189 } else { 4190 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4191 } 4192 } else { 4193 c->inode.size = 0; 4194 c->inode.node_count = 0; 4195 } 4196 c->inode.ibdiagvalid = PETSC_FALSE; 4197 c->inode.ibdiag = 0; 4198 c->inode.bdiag = 0; 4199 PetscFunctionReturn(0); 4200 } 4201 4202 #undef __FUNCT__ 4203 #define __FUNCT__ "MatGetRow_FactoredLU" 4204 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4205 { 4206 PetscInt k; 4207 const PetscInt *vi; 4208 4209 PetscFunctionBegin; 4210 vi = aj + ai[row]; 4211 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4212 vi = aj + adiag[row]; 4213 cols[nzl] = vi[0]; 4214 vi = aj + adiag[row+1]+1; 4215 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4216 PetscFunctionReturn(0); 4217 } 4218 /* 4219 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4220 Modified from MatSeqAIJCheckInode(). 4221 4222 Input Parameters: 4223 . Mat A - ILU or LU matrix factor 4224 4225 */ 4226 #undef __FUNCT__ 4227 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU" 4228 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4229 { 4230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4231 PetscErrorCode ierr; 4232 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4233 PetscInt *cols1,*cols2,*ns; 4234 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4235 PetscBool flag; 4236 4237 PetscFunctionBegin; 4238 if (!a->inode.use) PetscFunctionReturn(0); 4239 if (a->inode.checked) PetscFunctionReturn(0); 4240 4241 m = A->rmap->n; 4242 if (a->inode.size) ns = a->inode.size; 4243 else { 4244 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4245 } 4246 4247 i = 0; 4248 node_count = 0; 4249 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4250 while (i < m) { /* For each row */ 4251 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4252 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4253 nzx = nzl1 + nzu1 + 1; 4254 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4255 4256 /* Limits the number of elements in a node to 'a->inode.limit' */ 4257 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4258 nzl2 = ai[j+1] - ai[j]; 4259 nzu2 = adiag[j] - adiag[j+1] - 1; 4260 nzy = nzl2 + nzu2 + 1; 4261 if (nzy != nzx) break; 4262 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4263 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4264 if (!flag) break; 4265 } 4266 ns[node_count++] = blk_size; 4267 i = j; 4268 } 4269 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4270 /* If not enough inodes found,, do not use inode version of the routines */ 4271 if (!m || node_count > .8*m) { 4272 ierr = PetscFree(ns);CHKERRQ(ierr); 4273 4274 a->inode.node_count = 0; 4275 a->inode.size = NULL; 4276 a->inode.use = PETSC_FALSE; 4277 4278 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4279 } else { 4280 A->ops->mult = 0; 4281 A->ops->sor = 0; 4282 A->ops->multadd = 0; 4283 A->ops->getrowij = 0; 4284 A->ops->restorerowij = 0; 4285 A->ops->getcolumnij = 0; 4286 A->ops->restorecolumnij = 0; 4287 A->ops->coloringpatch = 0; 4288 A->ops->multdiagonalblock = 0; 4289 a->inode.node_count = node_count; 4290 a->inode.size = ns; 4291 4292 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4293 } 4294 a->inode.checked = PETSC_TRUE; 4295 PetscFunctionReturn(0); 4296 } 4297 4298 #undef __FUNCT__ 4299 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4300 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4301 { 4302 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4303 4304 PetscFunctionBegin; 4305 a->inode.ibdiagvalid = PETSC_FALSE; 4306 PetscFunctionReturn(0); 4307 } 4308 4309 /* 4310 This is really ugly. if inodes are used this replaces the 4311 permutations with ones that correspond to rows/cols of the matrix 4312 rather then inode blocks 4313 */ 4314 #undef __FUNCT__ 4315 #define __FUNCT__ "MatInodeAdjustForInodes" 4316 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4317 { 4318 PetscErrorCode ierr; 4319 4320 PetscFunctionBegin; 4321 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4322 PetscFunctionReturn(0); 4323 } 4324 4325 #undef __FUNCT__ 4326 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4327 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4328 { 4329 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4330 PetscErrorCode ierr; 4331 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4332 const PetscInt *ridx,*cidx; 4333 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4334 PetscInt nslim_col,*ns_col; 4335 IS ris = *rperm,cis = *cperm; 4336 4337 PetscFunctionBegin; 4338 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4339 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4340 4341 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4342 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4343 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4344 4345 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4346 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4347 4348 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4349 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4350 4351 /* Construct the permutations for rows*/ 4352 for (i=0,row = 0; i<nslim_row; ++i) { 4353 indx = ridx[i]; 4354 start_val = tns[indx]; 4355 end_val = tns[indx + 1]; 4356 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4357 } 4358 4359 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4360 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4361 4362 /* Construct permutations for columns */ 4363 for (i=0,col=0; i<nslim_col; ++i) { 4364 indx = cidx[i]; 4365 start_val = tns[indx]; 4366 end_val = tns[indx + 1]; 4367 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4368 } 4369 4370 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4371 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4372 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4373 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4374 4375 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4376 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4377 4378 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4379 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4380 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4381 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4382 ierr = PetscFree(tns);CHKERRQ(ierr); 4383 PetscFunctionReturn(0); 4384 } 4385 4386 #undef __FUNCT__ 4387 #define __FUNCT__ "MatInodeGetInodeSizes" 4388 /*@C 4389 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4390 4391 Not Collective 4392 4393 Input Parameter: 4394 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4395 4396 Output Parameter: 4397 + node_count - no of inodes present in the matrix. 4398 . sizes - an array of size node_count,with sizes of each inode. 4399 - limit - the max size used to generate the inodes. 4400 4401 Level: advanced 4402 4403 Notes: This routine returns some internal storage information 4404 of the matrix, it is intended to be used by advanced users. 4405 It should be called after the matrix is assembled. 4406 The contents of the sizes[] array should not be changed. 4407 NULL may be passed for information not requested. 4408 4409 .keywords: matrix, seqaij, get, inode 4410 4411 .seealso: MatGetInfo() 4412 @*/ 4413 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4414 { 4415 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4416 4417 PetscFunctionBegin; 4418 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4419 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4420 if (f) { 4421 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4422 } 4423 PetscFunctionReturn(0); 4424 } 4425 4426 #undef __FUNCT__ 4427 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4428 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4429 { 4430 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4431 4432 PetscFunctionBegin; 4433 if (node_count) *node_count = a->inode.node_count; 4434 if (sizes) *sizes = a->inode.size; 4435 if (limit) *limit = a->inode.limit; 4436 PetscFunctionReturn(0); 4437 } 4438