1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc1(n+1,&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 67 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc1(nslim_col+1,&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc1(n+1,&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1) { 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,nsz,i1,i2; 157 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 158 159 PetscFunctionBegin; 160 nslim_row = a->inode.node_count; 161 n = A->cmap->n; 162 163 /* Create The column_inode for this matrix */ 164 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 165 166 /* allocate space for reformated column_inode structure */ 167 ierr = PetscMalloc1(nslim_col +1,&tns);CHKERRQ(ierr); 168 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 169 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 170 171 for (i1=0,col=0; i1<nslim_col; ++i1) { 172 nsz = ns_col[i1]; 173 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc1(nslim_row+1,&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc1(nslim_row+1,&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 PetscFunctionReturn(0); 264 } 265 266 /* ----------------------------------------------------------- */ 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscErrorCode ierr; 274 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 275 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 276 277 PetscFunctionBegin; 278 nslim_row = a->inode.node_count; 279 n = A->cmap->n; 280 281 /* Create The column_inode for this matrix */ 282 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 283 284 /* allocate space for reformated column_inode structure */ 285 ierr = PetscMalloc1(nslim_col + 1,&tns);CHKERRQ(ierr); 286 ierr = PetscMalloc1(n + 1,&tvc);CHKERRQ(ierr); 287 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 288 289 for (i1=0,col=0; i1<nslim_col; ++i1) { 290 nsz = ns_col[i1]; 291 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 292 } 293 /* allocate space for column pointers */ 294 ierr = PetscMalloc1(nslim_col+1,&ia);CHKERRQ(ierr); 295 *iia = ia; 296 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 297 ierr = PetscMalloc1(nslim_col+1,&work);CHKERRQ(ierr); 298 299 /* determine the number of columns in each row */ 300 ia[0] = oshift; 301 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 302 j = aj + ai[row] + ishift; 303 col = *j++ + ishift; 304 i2 = tvc[col]; 305 nz = ai[row+1] - ai[row]; 306 while (nz-- > 0) { /* off-diagonal elemets */ 307 /* ia[i1+1]++; */ 308 ia[i2+1]++; 309 i2++; 310 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 311 if (nz > 0) i2 = tvc[col]; 312 } 313 } 314 315 /* shift ia[i] to point to next col */ 316 for (i1=1; i1<nslim_col+1; i1++) { 317 col = ia[i1-1]; 318 ia[i1] += col; 319 work[i1-1] = col - oshift; 320 } 321 322 /* allocate space for column pointers */ 323 nz = ia[nslim_col] + (!ishift); 324 ierr = PetscMalloc1(nz,&ja);CHKERRQ(ierr); 325 *jja = ja; 326 327 /* loop over matrix putting into ja */ 328 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 329 j = aj + ai[row] + ishift; 330 col = *j++ + ishift; 331 i2 = tvc[col]; 332 nz = ai[row+1] - ai[row]; 333 while (nz-- > 0) { 334 /* ja[work[i1]++] = i2 + oshift; */ 335 ja[work[i2]++] = i1 + oshift; 336 i2++; 337 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 338 if (nz > 0) i2 = tvc[col]; 339 } 340 } 341 ierr = PetscFree(ns_col);CHKERRQ(ierr); 342 ierr = PetscFree(work);CHKERRQ(ierr); 343 ierr = PetscFree(tns);CHKERRQ(ierr); 344 ierr = PetscFree(tvc);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 350 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 356 if (!ia) PetscFunctionReturn(0); 357 358 if (!blockcompressed) { 359 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 360 } else if (symmetric) { 361 /* Since the indices are symmetric it does'nt matter */ 362 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } else { 364 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 365 } 366 PetscFunctionReturn(0); 367 } 368 369 #undef __FUNCT__ 370 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 371 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 372 { 373 PetscErrorCode ierr; 374 375 PetscFunctionBegin; 376 if (!ia) PetscFunctionReturn(0); 377 if (!blockcompressed) { 378 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 379 } else { 380 ierr = PetscFree(*ia);CHKERRQ(ierr); 381 ierr = PetscFree(*ja);CHKERRQ(ierr); 382 } 383 PetscFunctionReturn(0); 384 } 385 386 /* ----------------------------------------------------------- */ 387 388 #undef __FUNCT__ 389 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 390 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 391 { 392 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 393 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 394 PetscScalar *y; 395 const PetscScalar *x; 396 const MatScalar *v1,*v2,*v3,*v4,*v5; 397 PetscErrorCode ierr; 398 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 399 const PetscInt *idx,*ns,*ii; 400 401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 403 #endif 404 405 PetscFunctionBegin; 406 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 407 node_max = a->inode.node_count; 408 ns = a->inode.size; /* Node Size array */ 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 411 idx = a->j; 412 v1 = a->a; 413 ii = a->i; 414 415 for (i = 0,row = 0; i< node_max; ++i) { 416 nsz = ns[i]; 417 n = ii[1] - ii[0]; 418 nonzerorow += (n>0)*nsz; 419 ii += nsz; 420 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 421 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 422 sz = n; /* No of non zeros in this row */ 423 /* Switch on the size of Node */ 424 switch (nsz) { /* Each loop in 'case' is unrolled */ 425 case 1: 426 sum1 = 0.; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; /* The instructions are ordered to */ 430 i2 = idx[1]; /* make the compiler's job easy */ 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 } 436 437 if (n == sz-1) { /* Take care of the last nonzero */ 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ *tmp0; 440 } 441 y[row++]=sum1; 442 break; 443 case 2: 444 sum1 = 0.; 445 sum2 = 0.; 446 v2 = v1 + n; 447 448 for (n = 0; n< sz-1; n+=2) { 449 i1 = idx[0]; 450 i2 = idx[1]; 451 idx += 2; 452 tmp0 = x[i1]; 453 tmp1 = x[i2]; 454 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 455 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 456 } 457 if (n == sz-1) { 458 tmp0 = x[*idx++]; 459 sum1 += *v1++ * tmp0; 460 sum2 += *v2++ * tmp0; 461 } 462 y[row++]=sum1; 463 y[row++]=sum2; 464 v1 =v2; /* Since the next block to be processed starts there*/ 465 idx +=sz; 466 break; 467 case 3: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 v2 = v1 + n; 472 v3 = v2 + n; 473 474 for (n = 0; n< sz-1; n+=2) { 475 i1 = idx[0]; 476 i2 = idx[1]; 477 idx += 2; 478 tmp0 = x[i1]; 479 tmp1 = x[i2]; 480 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 } 490 y[row++]=sum1; 491 y[row++]=sum2; 492 y[row++]=sum3; 493 v1 =v3; /* Since the next block to be processed starts there*/ 494 idx +=2*sz; 495 break; 496 case 4: 497 sum1 = 0.; 498 sum2 = 0.; 499 sum3 = 0.; 500 sum4 = 0.; 501 v2 = v1 + n; 502 v3 = v2 + n; 503 v4 = v3 + n; 504 505 for (n = 0; n< sz-1; n+=2) { 506 i1 = idx[0]; 507 i2 = idx[1]; 508 idx += 2; 509 tmp0 = x[i1]; 510 tmp1 = x[i2]; 511 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 513 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 514 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 515 } 516 if (n == sz-1) { 517 tmp0 = x[*idx++]; 518 sum1 += *v1++ * tmp0; 519 sum2 += *v2++ * tmp0; 520 sum3 += *v3++ * tmp0; 521 sum4 += *v4++ * tmp0; 522 } 523 y[row++]=sum1; 524 y[row++]=sum2; 525 y[row++]=sum3; 526 y[row++]=sum4; 527 v1 =v4; /* Since the next block to be processed starts there*/ 528 idx +=3*sz; 529 break; 530 case 5: 531 sum1 = 0.; 532 sum2 = 0.; 533 sum3 = 0.; 534 sum4 = 0.; 535 sum5 = 0.; 536 v2 = v1 + n; 537 v3 = v2 + n; 538 v4 = v3 + n; 539 v5 = v4 + n; 540 541 for (n = 0; n<sz-1; n+=2) { 542 i1 = idx[0]; 543 i2 = idx[1]; 544 idx += 2; 545 tmp0 = x[i1]; 546 tmp1 = x[i2]; 547 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 548 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 549 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 550 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 551 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 552 } 553 if (n == sz-1) { 554 tmp0 = x[*idx++]; 555 sum1 += *v1++ * tmp0; 556 sum2 += *v2++ * tmp0; 557 sum3 += *v3++ * tmp0; 558 sum4 += *v4++ * tmp0; 559 sum5 += *v5++ * tmp0; 560 } 561 y[row++]=sum1; 562 y[row++]=sum2; 563 y[row++]=sum3; 564 y[row++]=sum4; 565 y[row++]=sum5; 566 v1 =v5; /* Since the next block to be processed starts there */ 567 idx +=4*sz; 568 break; 569 default: 570 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 571 } 572 } 573 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 575 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 576 PetscFunctionReturn(0); 577 } 578 /* ----------------------------------------------------------- */ 579 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 583 { 584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 585 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 586 const MatScalar *v1,*v2,*v3,*v4,*v5; 587 const PetscScalar *x; 588 PetscScalar *y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 591 const PetscInt *idx,*ns,*ii; 592 593 PetscFunctionBegin; 594 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 595 node_max = a->inode.node_count; 596 ns = a->inode.size; /* Node Size array */ 597 598 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 599 ierr = VecGetArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 600 zt = z; 601 602 idx = a->j; 603 v1 = a->a; 604 ii = a->i; 605 606 for (i = 0,row = 0; i< node_max; ++i) { 607 nsz = ns[i]; 608 n = ii[1] - ii[0]; 609 ii += nsz; 610 sz = n; /* No of non zeros in this row */ 611 /* Switch on the size of Node */ 612 switch (nsz) { /* Each loop in 'case' is unrolled */ 613 case 1: 614 sum1 = *zt++; 615 616 for (n = 0; n< sz-1; n+=2) { 617 i1 = idx[0]; /* The instructions are ordered to */ 618 i2 = idx[1]; /* make the compiler's job easy */ 619 idx += 2; 620 tmp0 = x[i1]; 621 tmp1 = x[i2]; 622 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 623 } 624 625 if (n == sz-1) { /* Take care of the last nonzero */ 626 tmp0 = x[*idx++]; 627 sum1 += *v1++ * tmp0; 628 } 629 y[row++]=sum1; 630 break; 631 case 2: 632 sum1 = *zt++; 633 sum2 = *zt++; 634 v2 = v1 + n; 635 636 for (n = 0; n< sz-1; n+=2) { 637 i1 = idx[0]; 638 i2 = idx[1]; 639 idx += 2; 640 tmp0 = x[i1]; 641 tmp1 = x[i2]; 642 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 643 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 644 } 645 if (n == sz-1) { 646 tmp0 = x[*idx++]; 647 sum1 += *v1++ * tmp0; 648 sum2 += *v2++ * tmp0; 649 } 650 y[row++]=sum1; 651 y[row++]=sum2; 652 v1 =v2; /* Since the next block to be processed starts there*/ 653 idx +=sz; 654 break; 655 case 3: 656 sum1 = *zt++; 657 sum2 = *zt++; 658 sum3 = *zt++; 659 v2 = v1 + n; 660 v3 = v2 + n; 661 662 for (n = 0; n< sz-1; n+=2) { 663 i1 = idx[0]; 664 i2 = idx[1]; 665 idx += 2; 666 tmp0 = x[i1]; 667 tmp1 = x[i2]; 668 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 669 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 670 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 671 } 672 if (n == sz-1) { 673 tmp0 = x[*idx++]; 674 sum1 += *v1++ * tmp0; 675 sum2 += *v2++ * tmp0; 676 sum3 += *v3++ * tmp0; 677 } 678 y[row++]=sum1; 679 y[row++]=sum2; 680 y[row++]=sum3; 681 v1 =v3; /* Since the next block to be processed starts there*/ 682 idx +=2*sz; 683 break; 684 case 4: 685 sum1 = *zt++; 686 sum2 = *zt++; 687 sum3 = *zt++; 688 sum4 = *zt++; 689 v2 = v1 + n; 690 v3 = v2 + n; 691 v4 = v3 + n; 692 693 for (n = 0; n< sz-1; n+=2) { 694 i1 = idx[0]; 695 i2 = idx[1]; 696 idx += 2; 697 tmp0 = x[i1]; 698 tmp1 = x[i2]; 699 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 700 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 701 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 702 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 703 } 704 if (n == sz-1) { 705 tmp0 = x[*idx++]; 706 sum1 += *v1++ * tmp0; 707 sum2 += *v2++ * tmp0; 708 sum3 += *v3++ * tmp0; 709 sum4 += *v4++ * tmp0; 710 } 711 y[row++]=sum1; 712 y[row++]=sum2; 713 y[row++]=sum3; 714 y[row++]=sum4; 715 v1 =v4; /* Since the next block to be processed starts there*/ 716 idx +=3*sz; 717 break; 718 case 5: 719 sum1 = *zt++; 720 sum2 = *zt++; 721 sum3 = *zt++; 722 sum4 = *zt++; 723 sum5 = *zt++; 724 v2 = v1 + n; 725 v3 = v2 + n; 726 v4 = v3 + n; 727 v5 = v4 + n; 728 729 for (n = 0; n<sz-1; n+=2) { 730 i1 = idx[0]; 731 i2 = idx[1]; 732 idx += 2; 733 tmp0 = x[i1]; 734 tmp1 = x[i2]; 735 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 736 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 737 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 738 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 739 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 740 } 741 if (n == sz-1) { 742 tmp0 = x[*idx++]; 743 sum1 += *v1++ * tmp0; 744 sum2 += *v2++ * tmp0; 745 sum3 += *v3++ * tmp0; 746 sum4 += *v4++ * tmp0; 747 sum5 += *v5++ * tmp0; 748 } 749 y[row++]=sum1; 750 y[row++]=sum2; 751 y[row++]=sum3; 752 y[row++]=sum4; 753 y[row++]=sum5; 754 v1 =v5; /* Since the next block to be processed starts there */ 755 idx +=4*sz; 756 break; 757 default: 758 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 759 } 760 } 761 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 762 ierr = VecRestoreArrayPair(zz,yy,&z,&y);CHKERRQ(ierr); 763 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 764 PetscFunctionReturn(0); 765 } 766 767 /* ----------------------------------------------------------- */ 768 #undef __FUNCT__ 769 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 770 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 771 { 772 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 773 IS iscol = a->col,isrow = a->row; 774 PetscErrorCode ierr; 775 const PetscInt *r,*c,*rout,*cout; 776 PetscInt i,j,n = A->rmap->n,nz; 777 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 778 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 779 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 780 PetscScalar sum1,sum2,sum3,sum4,sum5; 781 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 782 const PetscScalar *b; 783 784 PetscFunctionBegin; 785 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 786 node_max = a->inode.node_count; 787 ns = a->inode.size; /* Node Size array */ 788 789 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 790 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 791 tmp = a->solve_work; 792 793 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 794 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 795 796 /* forward solve the lower triangular */ 797 tmps = tmp; 798 aa = a_a; 799 aj = a_j; 800 ad = a->diag; 801 802 for (i = 0,row = 0; i< node_max; ++i) { 803 nsz = ns[i]; 804 aii = ai[row]; 805 v1 = aa + aii; 806 vi = aj + aii; 807 nz = ad[row]- aii; 808 if (i < node_max-1) { 809 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 810 * but our indexing to determine it's size could. */ 811 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 812 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 813 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 814 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 815 } 816 817 switch (nsz) { /* Each loop in 'case' is unrolled */ 818 case 1: 819 sum1 = b[*r++]; 820 for (j=0; j<nz-1; j+=2) { 821 i0 = vi[0]; 822 i1 = vi[1]; 823 vi +=2; 824 tmp0 = tmps[i0]; 825 tmp1 = tmps[i1]; 826 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 827 } 828 if (j == nz-1) { 829 tmp0 = tmps[*vi++]; 830 sum1 -= *v1++ *tmp0; 831 } 832 tmp[row++]=sum1; 833 break; 834 case 2: 835 sum1 = b[*r++]; 836 sum2 = b[*r++]; 837 v2 = aa + ai[row+1]; 838 839 for (j=0; j<nz-1; j+=2) { 840 i0 = vi[0]; 841 i1 = vi[1]; 842 vi +=2; 843 tmp0 = tmps[i0]; 844 tmp1 = tmps[i1]; 845 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 846 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 847 } 848 if (j == nz-1) { 849 tmp0 = tmps[*vi++]; 850 sum1 -= *v1++ *tmp0; 851 sum2 -= *v2++ *tmp0; 852 } 853 sum2 -= *v2++ *sum1; 854 tmp[row++]=sum1; 855 tmp[row++]=sum2; 856 break; 857 case 3: 858 sum1 = b[*r++]; 859 sum2 = b[*r++]; 860 sum3 = b[*r++]; 861 v2 = aa + ai[row+1]; 862 v3 = aa + ai[row+2]; 863 864 for (j=0; j<nz-1; j+=2) { 865 i0 = vi[0]; 866 i1 = vi[1]; 867 vi +=2; 868 tmp0 = tmps[i0]; 869 tmp1 = tmps[i1]; 870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 873 } 874 if (j == nz-1) { 875 tmp0 = tmps[*vi++]; 876 sum1 -= *v1++ *tmp0; 877 sum2 -= *v2++ *tmp0; 878 sum3 -= *v3++ *tmp0; 879 } 880 sum2 -= *v2++ * sum1; 881 sum3 -= *v3++ * sum1; 882 sum3 -= *v3++ * sum2; 883 884 tmp[row++]=sum1; 885 tmp[row++]=sum2; 886 tmp[row++]=sum3; 887 break; 888 889 case 4: 890 sum1 = b[*r++]; 891 sum2 = b[*r++]; 892 sum3 = b[*r++]; 893 sum4 = b[*r++]; 894 v2 = aa + ai[row+1]; 895 v3 = aa + ai[row+2]; 896 v4 = aa + ai[row+3]; 897 898 for (j=0; j<nz-1; j+=2) { 899 i0 = vi[0]; 900 i1 = vi[1]; 901 vi +=2; 902 tmp0 = tmps[i0]; 903 tmp1 = tmps[i1]; 904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 908 } 909 if (j == nz-1) { 910 tmp0 = tmps[*vi++]; 911 sum1 -= *v1++ *tmp0; 912 sum2 -= *v2++ *tmp0; 913 sum3 -= *v3++ *tmp0; 914 sum4 -= *v4++ *tmp0; 915 } 916 sum2 -= *v2++ * sum1; 917 sum3 -= *v3++ * sum1; 918 sum4 -= *v4++ * sum1; 919 sum3 -= *v3++ * sum2; 920 sum4 -= *v4++ * sum2; 921 sum4 -= *v4++ * sum3; 922 923 tmp[row++]=sum1; 924 tmp[row++]=sum2; 925 tmp[row++]=sum3; 926 tmp[row++]=sum4; 927 break; 928 case 5: 929 sum1 = b[*r++]; 930 sum2 = b[*r++]; 931 sum3 = b[*r++]; 932 sum4 = b[*r++]; 933 sum5 = b[*r++]; 934 v2 = aa + ai[row+1]; 935 v3 = aa + ai[row+2]; 936 v4 = aa + ai[row+3]; 937 v5 = aa + ai[row+4]; 938 939 for (j=0; j<nz-1; j+=2) { 940 i0 = vi[0]; 941 i1 = vi[1]; 942 vi +=2; 943 tmp0 = tmps[i0]; 944 tmp1 = tmps[i1]; 945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 950 } 951 if (j == nz-1) { 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ *tmp0; 954 sum2 -= *v2++ *tmp0; 955 sum3 -= *v3++ *tmp0; 956 sum4 -= *v4++ *tmp0; 957 sum5 -= *v5++ *tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row++]=sum1; 972 tmp[row++]=sum2; 973 tmp[row++]=sum3; 974 tmp[row++]=sum4; 975 tmp[row++]=sum5; 976 break; 977 default: 978 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i=node_max -1,row = n-1; i>=0; i--) { 983 nsz = ns[i]; 984 aii = ai[row+1] -1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii- ad[row]; 988 switch (nsz) { /* Each loop in 'case' is unrolled */ 989 case 1: 990 sum1 = tmp[row]; 991 992 for (j=nz; j>1; j-=2) { 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j==1) { 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1006 break; 1007 case 2: 1008 sum1 = tmp[row]; 1009 sum2 = tmp[row -1]; 1010 v2 = aa + ai[row]-1; 1011 for (j=nz; j>1; j-=2) { 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1020 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1021 } 1022 if (j==1) { 1023 tmp0 = tmps[*vi--]; 1024 sum1 -= *v1-- * tmp0; 1025 sum2 -= *v2-- * tmp0; 1026 } 1027 1028 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1031 break; 1032 case 3: 1033 sum1 = tmp[row]; 1034 sum2 = tmp[row -1]; 1035 sum3 = tmp[row -2]; 1036 v2 = aa + ai[row]-1; 1037 v3 = aa + ai[row -1]-1; 1038 for (j=nz; j>1; j-=2) { 1039 vi -=2; 1040 i0 = vi[2]; 1041 i1 = vi[1]; 1042 tmp0 = tmps[i0]; 1043 tmp1 = tmps[i1]; 1044 v1 -= 2; 1045 v2 -= 2; 1046 v3 -= 2; 1047 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1048 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1049 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1050 } 1051 if (j==1) { 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 } 1057 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1061 sum3 -= *v3-- * tmp0; 1062 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1063 1064 break; 1065 case 4: 1066 sum1 = tmp[row]; 1067 sum2 = tmp[row -1]; 1068 sum3 = tmp[row -2]; 1069 sum4 = tmp[row -3]; 1070 v2 = aa + ai[row]-1; 1071 v3 = aa + ai[row -1]-1; 1072 v4 = aa + ai[row -2]-1; 1073 1074 for (j=nz; j>1; j-=2) { 1075 vi -=2; 1076 i0 = vi[2]; 1077 i1 = vi[1]; 1078 tmp0 = tmps[i0]; 1079 tmp1 = tmps[i1]; 1080 v1 -= 2; 1081 v2 -= 2; 1082 v3 -= 2; 1083 v4 -= 2; 1084 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1085 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1086 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1087 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1088 } 1089 if (j==1) { 1090 tmp0 = tmps[*vi--]; 1091 sum1 -= *v1-- * tmp0; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 } 1096 1097 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1105 sum4 -= *v4-- * tmp0; 1106 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1107 break; 1108 case 5: 1109 sum1 = tmp[row]; 1110 sum2 = tmp[row -1]; 1111 sum3 = tmp[row -2]; 1112 sum4 = tmp[row -3]; 1113 sum5 = tmp[row -4]; 1114 v2 = aa + ai[row]-1; 1115 v3 = aa + ai[row -1]-1; 1116 v4 = aa + ai[row -2]-1; 1117 v5 = aa + ai[row -3]-1; 1118 for (j=nz; j>1; j-=2) { 1119 vi -= 2; 1120 i0 = vi[2]; 1121 i1 = vi[1]; 1122 tmp0 = tmps[i0]; 1123 tmp1 = tmps[i1]; 1124 v1 -= 2; 1125 v2 -= 2; 1126 v3 -= 2; 1127 v4 -= 2; 1128 v5 -= 2; 1129 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1130 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1131 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1132 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1133 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1134 } 1135 if (j==1) { 1136 tmp0 = tmps[*vi--]; 1137 sum1 -= *v1-- * tmp0; 1138 sum2 -= *v2-- * tmp0; 1139 sum3 -= *v3-- * tmp0; 1140 sum4 -= *v4-- * tmp0; 1141 sum5 -= *v5-- * tmp0; 1142 } 1143 1144 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1157 sum5 -= *v5-- * tmp0; 1158 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1159 break; 1160 default: 1161 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1162 } 1163 } 1164 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1165 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1166 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1174 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1175 { 1176 Mat C =B; 1177 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1178 IS isrow = b->row,isicol = b->icol; 1179 PetscErrorCode ierr; 1180 const PetscInt *r,*ic,*ics; 1181 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1182 PetscInt i,j,k,nz,nzL,row,*pj; 1183 const PetscInt *ajtmp,*bjtmp; 1184 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1185 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1186 FactorShiftCtx sctx; 1187 const PetscInt *ddiag; 1188 PetscReal rs; 1189 MatScalar d; 1190 PetscInt inod,nodesz,node_max,col; 1191 const PetscInt *ns; 1192 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1193 1194 PetscFunctionBegin; 1195 /* MatPivotSetUp(): initialize shift context sctx */ 1196 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1197 1198 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1199 ddiag = a->diag; 1200 sctx.shift_top = info->zeropivot; 1201 for (i=0; i<n; i++) { 1202 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1203 d = (aa)[ddiag[i]]; 1204 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1205 v = aa+ai[i]; 1206 nz = ai[i+1] - ai[i]; 1207 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1208 if (rs>sctx.shift_top) sctx.shift_top = rs; 1209 } 1210 sctx.shift_top *= 1.1; 1211 sctx.nshift_max = 5; 1212 sctx.shift_lo = 0.; 1213 sctx.shift_hi = 1.; 1214 } 1215 1216 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1217 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1218 1219 ierr = PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4);CHKERRQ(ierr); 1220 ics = ic; 1221 1222 node_max = a->inode.node_count; 1223 ns = a->inode.size; 1224 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1225 1226 /* If max inode size > 4, split it into two inodes.*/ 1227 /* also map the inode sizes according to the ordering */ 1228 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1229 for (i=0,j=0; i<node_max; ++i,++j) { 1230 if (ns[i] > 4) { 1231 tmp_vec1[j] = 4; 1232 ++j; 1233 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1234 } else { 1235 tmp_vec1[j] = ns[i]; 1236 } 1237 } 1238 /* Use the correct node_max */ 1239 node_max = j; 1240 1241 /* Now reorder the inode info based on mat re-ordering info */ 1242 /* First create a row -> inode_size_array_index map */ 1243 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1244 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1245 for (i=0,row=0; i<node_max; i++) { 1246 nodesz = tmp_vec1[i]; 1247 for (j=0; j<nodesz; j++,row++) { 1248 nsmap[row] = i; 1249 } 1250 } 1251 /* Using nsmap, create a reordered ns structure */ 1252 for (i=0,j=0; i< node_max; i++) { 1253 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1254 tmp_vec2[i] = nodesz; 1255 j += nodesz; 1256 } 1257 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1258 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1259 1260 /* Now use the correct ns */ 1261 ns = tmp_vec2; 1262 1263 do { 1264 sctx.newshift = PETSC_FALSE; 1265 /* Now loop over each block-row, and do the factorization */ 1266 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1267 nodesz = ns[inod]; 1268 1269 switch (nodesz) { 1270 case 1: 1271 /*----------*/ 1272 /* zero rtmp1 */ 1273 /* L part */ 1274 nz = bi[i+1] - bi[i]; 1275 bjtmp = bj + bi[i]; 1276 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1277 1278 /* U part */ 1279 nz = bdiag[i]-bdiag[i+1]; 1280 bjtmp = bj + bdiag[i+1]+1; 1281 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1282 1283 /* load in initial (unfactored row) */ 1284 nz = ai[r[i]+1] - ai[r[i]]; 1285 ajtmp = aj + ai[r[i]]; 1286 v = aa + ai[r[i]]; 1287 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1288 1289 /* ZeropivotApply() */ 1290 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1291 1292 /* elimination */ 1293 bjtmp = bj + bi[i]; 1294 row = *bjtmp++; 1295 nzL = bi[i+1] - bi[i]; 1296 for (k=0; k < nzL; k++) { 1297 pc = rtmp1 + row; 1298 if (*pc != 0.0) { 1299 pv = b->a + bdiag[row]; 1300 mul1 = *pc * (*pv); 1301 *pc = mul1; 1302 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1303 pv = b->a + bdiag[row+1]+1; 1304 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1305 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1306 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1307 } 1308 row = *bjtmp++; 1309 } 1310 1311 /* finished row so stick it into b->a */ 1312 rs = 0.0; 1313 /* L part */ 1314 pv = b->a + bi[i]; 1315 pj = b->j + bi[i]; 1316 nz = bi[i+1] - bi[i]; 1317 for (j=0; j<nz; j++) { 1318 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1319 } 1320 1321 /* U part */ 1322 pv = b->a + bdiag[i+1]+1; 1323 pj = b->j + bdiag[i+1]+1; 1324 nz = bdiag[i] - bdiag[i+1]-1; 1325 for (j=0; j<nz; j++) { 1326 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1327 } 1328 1329 /* Check zero pivot */ 1330 sctx.rs = rs; 1331 sctx.pv = rtmp1[i]; 1332 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1333 if (sctx.newshift) break; 1334 1335 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1336 pv = b->a + bdiag[i]; 1337 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1338 break; 1339 1340 case 2: 1341 /*----------*/ 1342 /* zero rtmp1 and rtmp2 */ 1343 /* L part */ 1344 nz = bi[i+1] - bi[i]; 1345 bjtmp = bj + bi[i]; 1346 for (j=0; j<nz; j++) { 1347 col = bjtmp[j]; 1348 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1349 } 1350 1351 /* U part */ 1352 nz = bdiag[i]-bdiag[i+1]; 1353 bjtmp = bj + bdiag[i+1]+1; 1354 for (j=0; j<nz; j++) { 1355 col = bjtmp[j]; 1356 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1357 } 1358 1359 /* load in initial (unfactored row) */ 1360 nz = ai[r[i]+1] - ai[r[i]]; 1361 ajtmp = aj + ai[r[i]]; 1362 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1363 for (j=0; j<nz; j++) { 1364 col = ics[ajtmp[j]]; 1365 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1366 } 1367 /* ZeropivotApply(): shift the diagonal of the matrix */ 1368 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1369 1370 /* elimination */ 1371 bjtmp = bj + bi[i]; 1372 row = *bjtmp++; /* pivot row */ 1373 nzL = bi[i+1] - bi[i]; 1374 for (k=0; k < nzL; k++) { 1375 pc1 = rtmp1 + row; 1376 pc2 = rtmp2 + row; 1377 if (*pc1 != 0.0 || *pc2 != 0.0) { 1378 pv = b->a + bdiag[row]; 1379 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1380 *pc1 = mul1; *pc2 = mul2; 1381 1382 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1383 pv = b->a + bdiag[row+1]+1; 1384 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1385 for (j=0; j<nz; j++) { 1386 col = pj[j]; 1387 rtmp1[col] -= mul1 * pv[j]; 1388 rtmp2[col] -= mul2 * pv[j]; 1389 } 1390 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1391 } 1392 row = *bjtmp++; 1393 } 1394 1395 /* finished row i; check zero pivot, then stick row i into b->a */ 1396 rs = 0.0; 1397 /* L part */ 1398 pc1 = b->a + bi[i]; 1399 pj = b->j + bi[i]; 1400 nz = bi[i+1] - bi[i]; 1401 for (j=0; j<nz; j++) { 1402 col = pj[j]; 1403 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1404 } 1405 /* U part */ 1406 pc1 = b->a + bdiag[i+1]+1; 1407 pj = b->j + bdiag[i+1]+1; 1408 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1409 for (j=0; j<nz; j++) { 1410 col = pj[j]; 1411 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1412 } 1413 1414 sctx.rs = rs; 1415 sctx.pv = rtmp1[i]; 1416 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1417 if (sctx.newshift) break; 1418 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1419 *pc1 = 1.0/sctx.pv; 1420 1421 /* Now take care of diagonal 2x2 block. */ 1422 pc2 = rtmp2 + i; 1423 if (*pc2 != 0.0) { 1424 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1425 *pc2 = mul1; /* insert L entry */ 1426 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1427 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1428 for (j=0; j<nz; j++) { 1429 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1430 } 1431 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1432 } 1433 1434 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1435 rs = 0.0; 1436 /* L part */ 1437 pc2 = b->a + bi[i+1]; 1438 pj = b->j + bi[i+1]; 1439 nz = bi[i+2] - bi[i+1]; 1440 for (j=0; j<nz; j++) { 1441 col = pj[j]; 1442 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1443 } 1444 /* U part */ 1445 pc2 = b->a + bdiag[i+2]+1; 1446 pj = b->j + bdiag[i+2]+1; 1447 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1448 for (j=0; j<nz; j++) { 1449 col = pj[j]; 1450 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1451 } 1452 1453 sctx.rs = rs; 1454 sctx.pv = rtmp2[i+1]; 1455 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1456 if (sctx.newshift) break; 1457 pc2 = b->a + bdiag[i+1]; 1458 *pc2 = 1.0/sctx.pv; 1459 break; 1460 1461 case 3: 1462 /*----------*/ 1463 /* zero rtmp */ 1464 /* L part */ 1465 nz = bi[i+1] - bi[i]; 1466 bjtmp = bj + bi[i]; 1467 for (j=0; j<nz; j++) { 1468 col = bjtmp[j]; 1469 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1470 } 1471 1472 /* U part */ 1473 nz = bdiag[i]-bdiag[i+1]; 1474 bjtmp = bj + bdiag[i+1]+1; 1475 for (j=0; j<nz; j++) { 1476 col = bjtmp[j]; 1477 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1478 } 1479 1480 /* load in initial (unfactored row) */ 1481 nz = ai[r[i]+1] - ai[r[i]]; 1482 ajtmp = aj + ai[r[i]]; 1483 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1484 for (j=0; j<nz; j++) { 1485 col = ics[ajtmp[j]]; 1486 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1487 } 1488 /* ZeropivotApply(): shift the diagonal of the matrix */ 1489 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1490 1491 /* elimination */ 1492 bjtmp = bj + bi[i]; 1493 row = *bjtmp++; /* pivot row */ 1494 nzL = bi[i+1] - bi[i]; 1495 for (k=0; k < nzL; k++) { 1496 pc1 = rtmp1 + row; 1497 pc2 = rtmp2 + row; 1498 pc3 = rtmp3 + row; 1499 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1500 pv = b->a + bdiag[row]; 1501 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1502 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1503 1504 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1505 pv = b->a + bdiag[row+1]+1; 1506 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1507 for (j=0; j<nz; j++) { 1508 col = pj[j]; 1509 rtmp1[col] -= mul1 * pv[j]; 1510 rtmp2[col] -= mul2 * pv[j]; 1511 rtmp3[col] -= mul3 * pv[j]; 1512 } 1513 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1514 } 1515 row = *bjtmp++; 1516 } 1517 1518 /* finished row i; check zero pivot, then stick row i into b->a */ 1519 rs = 0.0; 1520 /* L part */ 1521 pc1 = b->a + bi[i]; 1522 pj = b->j + bi[i]; 1523 nz = bi[i+1] - bi[i]; 1524 for (j=0; j<nz; j++) { 1525 col = pj[j]; 1526 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1527 } 1528 /* U part */ 1529 pc1 = b->a + bdiag[i+1]+1; 1530 pj = b->j + bdiag[i+1]+1; 1531 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1532 for (j=0; j<nz; j++) { 1533 col = pj[j]; 1534 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1535 } 1536 1537 sctx.rs = rs; 1538 sctx.pv = rtmp1[i]; 1539 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1540 if (sctx.newshift) break; 1541 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1542 *pc1 = 1.0/sctx.pv; 1543 1544 /* Now take care of 1st column of diagonal 3x3 block. */ 1545 pc2 = rtmp2 + i; 1546 pc3 = rtmp3 + i; 1547 if (*pc2 != 0.0 || *pc3 != 0.0) { 1548 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1549 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1550 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1551 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1552 for (j=0; j<nz; j++) { 1553 col = pj[j]; 1554 rtmp2[col] -= mul2 * rtmp1[col]; 1555 rtmp3[col] -= mul3 * rtmp1[col]; 1556 } 1557 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1558 } 1559 1560 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1561 rs = 0.0; 1562 /* L part */ 1563 pc2 = b->a + bi[i+1]; 1564 pj = b->j + bi[i+1]; 1565 nz = bi[i+2] - bi[i+1]; 1566 for (j=0; j<nz; j++) { 1567 col = pj[j]; 1568 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1569 } 1570 /* U part */ 1571 pc2 = b->a + bdiag[i+2]+1; 1572 pj = b->j + bdiag[i+2]+1; 1573 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1574 for (j=0; j<nz; j++) { 1575 col = pj[j]; 1576 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1577 } 1578 1579 sctx.rs = rs; 1580 sctx.pv = rtmp2[i+1]; 1581 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1582 if (sctx.newshift) break; 1583 pc2 = b->a + bdiag[i+1]; 1584 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1585 1586 /* Now take care of 2nd column of diagonal 3x3 block. */ 1587 pc3 = rtmp3 + i+1; 1588 if (*pc3 != 0.0) { 1589 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1590 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1591 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1592 for (j=0; j<nz; j++) { 1593 col = pj[j]; 1594 rtmp3[col] -= mul3 * rtmp2[col]; 1595 } 1596 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1597 } 1598 1599 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1600 rs = 0.0; 1601 /* L part */ 1602 pc3 = b->a + bi[i+2]; 1603 pj = b->j + bi[i+2]; 1604 nz = bi[i+3] - bi[i+2]; 1605 for (j=0; j<nz; j++) { 1606 col = pj[j]; 1607 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1608 } 1609 /* U part */ 1610 pc3 = b->a + bdiag[i+3]+1; 1611 pj = b->j + bdiag[i+3]+1; 1612 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1613 for (j=0; j<nz; j++) { 1614 col = pj[j]; 1615 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1616 } 1617 1618 sctx.rs = rs; 1619 sctx.pv = rtmp3[i+2]; 1620 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1621 if (sctx.newshift) break; 1622 pc3 = b->a + bdiag[i+2]; 1623 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1624 break; 1625 case 4: 1626 /*----------*/ 1627 /* zero rtmp */ 1628 /* L part */ 1629 nz = bi[i+1] - bi[i]; 1630 bjtmp = bj + bi[i]; 1631 for (j=0; j<nz; j++) { 1632 col = bjtmp[j]; 1633 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1634 } 1635 1636 /* U part */ 1637 nz = bdiag[i]-bdiag[i+1]; 1638 bjtmp = bj + bdiag[i+1]+1; 1639 for (j=0; j<nz; j++) { 1640 col = bjtmp[j]; 1641 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1642 } 1643 1644 /* load in initial (unfactored row) */ 1645 nz = ai[r[i]+1] - ai[r[i]]; 1646 ajtmp = aj + ai[r[i]]; 1647 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1648 for (j=0; j<nz; j++) { 1649 col = ics[ajtmp[j]]; 1650 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1651 } 1652 /* ZeropivotApply(): shift the diagonal of the matrix */ 1653 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1654 1655 /* elimination */ 1656 bjtmp = bj + bi[i]; 1657 row = *bjtmp++; /* pivot row */ 1658 nzL = bi[i+1] - bi[i]; 1659 for (k=0; k < nzL; k++) { 1660 pc1 = rtmp1 + row; 1661 pc2 = rtmp2 + row; 1662 pc3 = rtmp3 + row; 1663 pc4 = rtmp4 + row; 1664 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1665 pv = b->a + bdiag[row]; 1666 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1667 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1668 1669 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1670 pv = b->a + bdiag[row+1]+1; 1671 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1672 for (j=0; j<nz; j++) { 1673 col = pj[j]; 1674 rtmp1[col] -= mul1 * pv[j]; 1675 rtmp2[col] -= mul2 * pv[j]; 1676 rtmp3[col] -= mul3 * pv[j]; 1677 rtmp4[col] -= mul4 * pv[j]; 1678 } 1679 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1680 } 1681 row = *bjtmp++; 1682 } 1683 1684 /* finished row i; check zero pivot, then stick row i into b->a */ 1685 rs = 0.0; 1686 /* L part */ 1687 pc1 = b->a + bi[i]; 1688 pj = b->j + bi[i]; 1689 nz = bi[i+1] - bi[i]; 1690 for (j=0; j<nz; j++) { 1691 col = pj[j]; 1692 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1693 } 1694 /* U part */ 1695 pc1 = b->a + bdiag[i+1]+1; 1696 pj = b->j + bdiag[i+1]+1; 1697 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1698 for (j=0; j<nz; j++) { 1699 col = pj[j]; 1700 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1701 } 1702 1703 sctx.rs = rs; 1704 sctx.pv = rtmp1[i]; 1705 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1706 if (sctx.newshift) break; 1707 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1708 *pc1 = 1.0/sctx.pv; 1709 1710 /* Now take care of 1st column of diagonal 4x4 block. */ 1711 pc2 = rtmp2 + i; 1712 pc3 = rtmp3 + i; 1713 pc4 = rtmp4 + i; 1714 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1715 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1716 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1717 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1718 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1719 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1720 for (j=0; j<nz; j++) { 1721 col = pj[j]; 1722 rtmp2[col] -= mul2 * rtmp1[col]; 1723 rtmp3[col] -= mul3 * rtmp1[col]; 1724 rtmp4[col] -= mul4 * rtmp1[col]; 1725 } 1726 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1727 } 1728 1729 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1730 rs = 0.0; 1731 /* L part */ 1732 pc2 = b->a + bi[i+1]; 1733 pj = b->j + bi[i+1]; 1734 nz = bi[i+2] - bi[i+1]; 1735 for (j=0; j<nz; j++) { 1736 col = pj[j]; 1737 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1738 } 1739 /* U part */ 1740 pc2 = b->a + bdiag[i+2]+1; 1741 pj = b->j + bdiag[i+2]+1; 1742 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1743 for (j=0; j<nz; j++) { 1744 col = pj[j]; 1745 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1746 } 1747 1748 sctx.rs = rs; 1749 sctx.pv = rtmp2[i+1]; 1750 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1751 if (sctx.newshift) break; 1752 pc2 = b->a + bdiag[i+1]; 1753 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1754 1755 /* Now take care of 2nd column of diagonal 4x4 block. */ 1756 pc3 = rtmp3 + i+1; 1757 pc4 = rtmp4 + i+1; 1758 if (*pc3 != 0.0 || *pc4 != 0.0) { 1759 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1760 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1761 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1762 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1763 for (j=0; j<nz; j++) { 1764 col = pj[j]; 1765 rtmp3[col] -= mul3 * rtmp2[col]; 1766 rtmp4[col] -= mul4 * rtmp2[col]; 1767 } 1768 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1769 } 1770 1771 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1772 rs = 0.0; 1773 /* L part */ 1774 pc3 = b->a + bi[i+2]; 1775 pj = b->j + bi[i+2]; 1776 nz = bi[i+3] - bi[i+2]; 1777 for (j=0; j<nz; j++) { 1778 col = pj[j]; 1779 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1780 } 1781 /* U part */ 1782 pc3 = b->a + bdiag[i+3]+1; 1783 pj = b->j + bdiag[i+3]+1; 1784 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1785 for (j=0; j<nz; j++) { 1786 col = pj[j]; 1787 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1788 } 1789 1790 sctx.rs = rs; 1791 sctx.pv = rtmp3[i+2]; 1792 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1793 if (sctx.newshift) break; 1794 pc3 = b->a + bdiag[i+2]; 1795 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1796 1797 /* Now take care of 3rd column of diagonal 4x4 block. */ 1798 pc4 = rtmp4 + i+2; 1799 if (*pc4 != 0.0) { 1800 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1801 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1802 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1803 for (j=0; j<nz; j++) { 1804 col = pj[j]; 1805 rtmp4[col] -= mul4 * rtmp3[col]; 1806 } 1807 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1808 } 1809 1810 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1811 rs = 0.0; 1812 /* L part */ 1813 pc4 = b->a + bi[i+3]; 1814 pj = b->j + bi[i+3]; 1815 nz = bi[i+4] - bi[i+3]; 1816 for (j=0; j<nz; j++) { 1817 col = pj[j]; 1818 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1819 } 1820 /* U part */ 1821 pc4 = b->a + bdiag[i+4]+1; 1822 pj = b->j + bdiag[i+4]+1; 1823 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1824 for (j=0; j<nz; j++) { 1825 col = pj[j]; 1826 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1827 } 1828 1829 sctx.rs = rs; 1830 sctx.pv = rtmp4[i+3]; 1831 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1832 if (sctx.newshift) break; 1833 pc4 = b->a + bdiag[i+3]; 1834 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1835 break; 1836 1837 default: 1838 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1839 } 1840 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1841 i += nodesz; /* Update the row */ 1842 } 1843 1844 /* MatPivotRefine() */ 1845 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1846 /* 1847 * if no shift in this attempt & shifting & started shifting & can refine, 1848 * then try lower shift 1849 */ 1850 sctx.shift_hi = sctx.shift_fraction; 1851 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1852 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1853 sctx.newshift = PETSC_TRUE; 1854 sctx.nshift++; 1855 } 1856 } while (sctx.newshift); 1857 1858 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1859 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1860 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1861 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1862 1863 if (b->inode.size) { 1864 C->ops->solve = MatSolve_SeqAIJ_Inode; 1865 } else { 1866 C->ops->solve = MatSolve_SeqAIJ; 1867 } 1868 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1869 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1870 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1871 C->ops->matsolve = MatMatSolve_SeqAIJ; 1872 C->assembled = PETSC_TRUE; 1873 C->preallocated = PETSC_TRUE; 1874 1875 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1876 1877 /* MatShiftView(A,info,&sctx) */ 1878 if (sctx.nshift) { 1879 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1880 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 1881 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1882 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 1883 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1884 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %g\n",sctx.nshift,(double)info->shiftamount);CHKERRQ(ierr); 1885 } 1886 } 1887 PetscFunctionReturn(0); 1888 } 1889 1890 #undef __FUNCT__ 1891 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1892 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1893 { 1894 Mat C = B; 1895 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1896 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1897 PetscErrorCode ierr; 1898 const PetscInt *r,*ic,*c,*ics; 1899 PetscInt n = A->rmap->n,*bi = b->i; 1900 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1901 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1902 PetscInt *ai = a->i,*aj = a->j; 1903 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1904 PetscScalar mul1,mul2,mul3,tmp; 1905 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1906 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1907 PetscReal rs=0.0; 1908 FactorShiftCtx sctx; 1909 1910 PetscFunctionBegin; 1911 sctx.shift_top = 0; 1912 sctx.nshift_max = 0; 1913 sctx.shift_lo = 0; 1914 sctx.shift_hi = 0; 1915 sctx.shift_fraction = 0; 1916 1917 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1918 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1919 sctx.shift_top = 0; 1920 for (i=0; i<n; i++) { 1921 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1922 rs = 0.0; 1923 ajtmp = aj + ai[i]; 1924 rtmp1 = aa + ai[i]; 1925 nz = ai[i+1] - ai[i]; 1926 for (j=0; j<nz; j++) { 1927 if (*ajtmp != i) { 1928 rs += PetscAbsScalar(*rtmp1++); 1929 } else { 1930 rs -= PetscRealPart(*rtmp1++); 1931 } 1932 ajtmp++; 1933 } 1934 if (rs>sctx.shift_top) sctx.shift_top = rs; 1935 } 1936 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1937 sctx.shift_top *= 1.1; 1938 sctx.nshift_max = 5; 1939 sctx.shift_lo = 0.; 1940 sctx.shift_hi = 1.; 1941 } 1942 sctx.shift_amount = 0; 1943 sctx.nshift = 0; 1944 1945 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1946 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1947 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1948 ierr = PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33);CHKERRQ(ierr); 1949 ics = ic; 1950 1951 node_max = a->inode.node_count; 1952 ns = a->inode.size; 1953 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1954 1955 /* If max inode size > 3, split it into two inodes.*/ 1956 /* also map the inode sizes according to the ordering */ 1957 ierr = PetscMalloc1(n+1,&tmp_vec1);CHKERRQ(ierr); 1958 for (i=0,j=0; i<node_max; ++i,++j) { 1959 if (ns[i]>3) { 1960 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1961 ++j; 1962 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1963 } else { 1964 tmp_vec1[j] = ns[i]; 1965 } 1966 } 1967 /* Use the correct node_max */ 1968 node_max = j; 1969 1970 /* Now reorder the inode info based on mat re-ordering info */ 1971 /* First create a row -> inode_size_array_index map */ 1972 ierr = PetscMalloc1(n+1,&nsmap);CHKERRQ(ierr); 1973 ierr = PetscMalloc1(node_max+1,&tmp_vec2);CHKERRQ(ierr); 1974 for (i=0,row=0; i<node_max; i++) { 1975 nodesz = tmp_vec1[i]; 1976 for (j=0; j<nodesz; j++,row++) { 1977 nsmap[row] = i; 1978 } 1979 } 1980 /* Using nsmap, create a reordered ns structure */ 1981 for (i=0,j=0; i< node_max; i++) { 1982 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1983 tmp_vec2[i] = nodesz; 1984 j += nodesz; 1985 } 1986 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1987 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1988 /* Now use the correct ns */ 1989 ns = tmp_vec2; 1990 1991 do { 1992 sctx.newshift = PETSC_FALSE; 1993 /* Now loop over each block-row, and do the factorization */ 1994 for (i=0,row=0; i<node_max; i++) { 1995 nodesz = ns[i]; 1996 nz = bi[row+1] - bi[row]; 1997 bjtmp = bj + bi[row]; 1998 1999 switch (nodesz) { 2000 case 1: 2001 for (j=0; j<nz; j++) { 2002 idx = bjtmp[j]; 2003 rtmp11[idx] = 0.0; 2004 } 2005 2006 /* load in initial (unfactored row) */ 2007 idx = r[row]; 2008 nz_tmp = ai[idx+1] - ai[idx]; 2009 ajtmp = aj + ai[idx]; 2010 v1 = aa + ai[idx]; 2011 2012 for (j=0; j<nz_tmp; j++) { 2013 idx = ics[ajtmp[j]]; 2014 rtmp11[idx] = v1[j]; 2015 } 2016 rtmp11[ics[r[row]]] += sctx.shift_amount; 2017 2018 prow = *bjtmp++; 2019 while (prow < row) { 2020 pc1 = rtmp11 + prow; 2021 if (*pc1 != 0.0) { 2022 pv = ba + bd[prow]; 2023 pj = nbj + bd[prow]; 2024 mul1 = *pc1 * *pv++; 2025 *pc1 = mul1; 2026 nz_tmp = bi[prow+1] - bd[prow] - 1; 2027 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2028 for (j=0; j<nz_tmp; j++) { 2029 tmp = pv[j]; 2030 idx = pj[j]; 2031 rtmp11[idx] -= mul1 * tmp; 2032 } 2033 } 2034 prow = *bjtmp++; 2035 } 2036 pj = bj + bi[row]; 2037 pc1 = ba + bi[row]; 2038 2039 sctx.pv = rtmp11[row]; 2040 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2041 rs = 0.0; 2042 for (j=0; j<nz; j++) { 2043 idx = pj[j]; 2044 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2045 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2046 } 2047 sctx.rs = rs; 2048 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2049 if (sctx.newshift) goto endofwhile; 2050 break; 2051 2052 case 2: 2053 for (j=0; j<nz; j++) { 2054 idx = bjtmp[j]; 2055 rtmp11[idx] = 0.0; 2056 rtmp22[idx] = 0.0; 2057 } 2058 2059 /* load in initial (unfactored row) */ 2060 idx = r[row]; 2061 nz_tmp = ai[idx+1] - ai[idx]; 2062 ajtmp = aj + ai[idx]; 2063 v1 = aa + ai[idx]; 2064 v2 = aa + ai[idx+1]; 2065 for (j=0; j<nz_tmp; j++) { 2066 idx = ics[ajtmp[j]]; 2067 rtmp11[idx] = v1[j]; 2068 rtmp22[idx] = v2[j]; 2069 } 2070 rtmp11[ics[r[row]]] += sctx.shift_amount; 2071 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2072 2073 prow = *bjtmp++; 2074 while (prow < row) { 2075 pc1 = rtmp11 + prow; 2076 pc2 = rtmp22 + prow; 2077 if (*pc1 != 0.0 || *pc2 != 0.0) { 2078 pv = ba + bd[prow]; 2079 pj = nbj + bd[prow]; 2080 mul1 = *pc1 * *pv; 2081 mul2 = *pc2 * *pv; 2082 ++pv; 2083 *pc1 = mul1; 2084 *pc2 = mul2; 2085 2086 nz_tmp = bi[prow+1] - bd[prow] - 1; 2087 for (j=0; j<nz_tmp; j++) { 2088 tmp = pv[j]; 2089 idx = pj[j]; 2090 rtmp11[idx] -= mul1 * tmp; 2091 rtmp22[idx] -= mul2 * tmp; 2092 } 2093 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2094 } 2095 prow = *bjtmp++; 2096 } 2097 2098 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2099 pc1 = rtmp11 + prow; 2100 pc2 = rtmp22 + prow; 2101 2102 sctx.pv = *pc1; 2103 pj = bj + bi[prow]; 2104 rs = 0.0; 2105 for (j=0; j<nz; j++) { 2106 idx = pj[j]; 2107 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2108 } 2109 sctx.rs = rs; 2110 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2111 if (sctx.newshift) goto endofwhile; 2112 2113 if (*pc2 != 0.0) { 2114 pj = nbj + bd[prow]; 2115 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2116 *pc2 = mul2; 2117 nz_tmp = bi[prow+1] - bd[prow] - 1; 2118 for (j=0; j<nz_tmp; j++) { 2119 idx = pj[j]; 2120 tmp = rtmp11[idx]; 2121 rtmp22[idx] -= mul2 * tmp; 2122 } 2123 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2124 } 2125 2126 pj = bj + bi[row]; 2127 pc1 = ba + bi[row]; 2128 pc2 = ba + bi[row+1]; 2129 2130 sctx.pv = rtmp22[row+1]; 2131 rs = 0.0; 2132 rtmp11[row] = 1.0/rtmp11[row]; 2133 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2134 /* copy row entries from dense representation to sparse */ 2135 for (j=0; j<nz; j++) { 2136 idx = pj[j]; 2137 pc1[j] = rtmp11[idx]; 2138 pc2[j] = rtmp22[idx]; 2139 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2140 } 2141 sctx.rs = rs; 2142 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2143 if (sctx.newshift) goto endofwhile; 2144 break; 2145 2146 case 3: 2147 for (j=0; j<nz; j++) { 2148 idx = bjtmp[j]; 2149 rtmp11[idx] = 0.0; 2150 rtmp22[idx] = 0.0; 2151 rtmp33[idx] = 0.0; 2152 } 2153 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2154 idx = r[row]; 2155 nz_tmp = ai[idx+1] - ai[idx]; 2156 ajtmp = aj + ai[idx]; 2157 v1 = aa + ai[idx]; 2158 v2 = aa + ai[idx+1]; 2159 v3 = aa + ai[idx+2]; 2160 for (j=0; j<nz_tmp; j++) { 2161 idx = ics[ajtmp[j]]; 2162 rtmp11[idx] = v1[j]; 2163 rtmp22[idx] = v2[j]; 2164 rtmp33[idx] = v3[j]; 2165 } 2166 rtmp11[ics[r[row]]] += sctx.shift_amount; 2167 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2168 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2169 2170 /* loop over all pivot row blocks above this row block */ 2171 prow = *bjtmp++; 2172 while (prow < row) { 2173 pc1 = rtmp11 + prow; 2174 pc2 = rtmp22 + prow; 2175 pc3 = rtmp33 + prow; 2176 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2177 pv = ba + bd[prow]; 2178 pj = nbj + bd[prow]; 2179 mul1 = *pc1 * *pv; 2180 mul2 = *pc2 * *pv; 2181 mul3 = *pc3 * *pv; 2182 ++pv; 2183 *pc1 = mul1; 2184 *pc2 = mul2; 2185 *pc3 = mul3; 2186 2187 nz_tmp = bi[prow+1] - bd[prow] - 1; 2188 /* update this row based on pivot row */ 2189 for (j=0; j<nz_tmp; j++) { 2190 tmp = pv[j]; 2191 idx = pj[j]; 2192 rtmp11[idx] -= mul1 * tmp; 2193 rtmp22[idx] -= mul2 * tmp; 2194 rtmp33[idx] -= mul3 * tmp; 2195 } 2196 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2197 } 2198 prow = *bjtmp++; 2199 } 2200 2201 /* Now take care of diagonal 3x3 block in this set of rows */ 2202 /* note: prow = row here */ 2203 pc1 = rtmp11 + prow; 2204 pc2 = rtmp22 + prow; 2205 pc3 = rtmp33 + prow; 2206 2207 sctx.pv = *pc1; 2208 pj = bj + bi[prow]; 2209 rs = 0.0; 2210 for (j=0; j<nz; j++) { 2211 idx = pj[j]; 2212 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2213 } 2214 sctx.rs = rs; 2215 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2216 if (sctx.newshift) goto endofwhile; 2217 2218 if (*pc2 != 0.0 || *pc3 != 0.0) { 2219 mul2 = (*pc2)/(*pc1); 2220 mul3 = (*pc3)/(*pc1); 2221 *pc2 = mul2; 2222 *pc3 = mul3; 2223 nz_tmp = bi[prow+1] - bd[prow] - 1; 2224 pj = nbj + bd[prow]; 2225 for (j=0; j<nz_tmp; j++) { 2226 idx = pj[j]; 2227 tmp = rtmp11[idx]; 2228 rtmp22[idx] -= mul2 * tmp; 2229 rtmp33[idx] -= mul3 * tmp; 2230 } 2231 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2232 } 2233 ++prow; 2234 2235 pc2 = rtmp22 + prow; 2236 pc3 = rtmp33 + prow; 2237 sctx.pv = *pc2; 2238 pj = bj + bi[prow]; 2239 rs = 0.0; 2240 for (j=0; j<nz; j++) { 2241 idx = pj[j]; 2242 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2243 } 2244 sctx.rs = rs; 2245 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2246 if (sctx.newshift) goto endofwhile; 2247 2248 if (*pc3 != 0.0) { 2249 mul3 = (*pc3)/(*pc2); 2250 *pc3 = mul3; 2251 pj = nbj + bd[prow]; 2252 nz_tmp = bi[prow+1] - bd[prow] - 1; 2253 for (j=0; j<nz_tmp; j++) { 2254 idx = pj[j]; 2255 tmp = rtmp22[idx]; 2256 rtmp33[idx] -= mul3 * tmp; 2257 } 2258 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2259 } 2260 2261 pj = bj + bi[row]; 2262 pc1 = ba + bi[row]; 2263 pc2 = ba + bi[row+1]; 2264 pc3 = ba + bi[row+2]; 2265 2266 sctx.pv = rtmp33[row+2]; 2267 rs = 0.0; 2268 rtmp11[row] = 1.0/rtmp11[row]; 2269 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2270 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2271 /* copy row entries from dense representation to sparse */ 2272 for (j=0; j<nz; j++) { 2273 idx = pj[j]; 2274 pc1[j] = rtmp11[idx]; 2275 pc2[j] = rtmp22[idx]; 2276 pc3[j] = rtmp33[idx]; 2277 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2278 } 2279 2280 sctx.rs = rs; 2281 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2282 if (sctx.newshift) goto endofwhile; 2283 break; 2284 2285 default: 2286 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2287 } 2288 row += nodesz; /* Update the row */ 2289 } 2290 endofwhile:; 2291 } while (sctx.newshift); 2292 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2293 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2294 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2295 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2296 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2297 2298 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2299 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2300 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2301 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2302 C->assembled = PETSC_TRUE; 2303 C->preallocated = PETSC_TRUE; 2304 if (sctx.nshift) { 2305 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2306 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top);CHKERRQ(ierr); 2307 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2308 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount);CHKERRQ(ierr); 2309 } 2310 } 2311 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2312 ierr = MatSeqAIJCheckInode(C);CHKERRQ(ierr); 2313 PetscFunctionReturn(0); 2314 } 2315 2316 2317 /* ----------------------------------------------------------- */ 2318 #undef __FUNCT__ 2319 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2320 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2321 { 2322 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2323 IS iscol = a->col,isrow = a->row; 2324 PetscErrorCode ierr; 2325 const PetscInt *r,*c,*rout,*cout; 2326 PetscInt i,j,n = A->rmap->n; 2327 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2328 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2329 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2330 PetscScalar sum1,sum2,sum3,sum4,sum5; 2331 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2332 const PetscScalar *b; 2333 2334 PetscFunctionBegin; 2335 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2336 node_max = a->inode.node_count; 2337 ns = a->inode.size; /* Node Size array */ 2338 2339 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2340 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2341 tmp = a->solve_work; 2342 2343 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2344 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2345 2346 /* forward solve the lower triangular */ 2347 tmps = tmp; 2348 aa = a_a; 2349 aj = a_j; 2350 ad = a->diag; 2351 2352 for (i = 0,row = 0; i< node_max; ++i) { 2353 nsz = ns[i]; 2354 aii = ai[row]; 2355 v1 = aa + aii; 2356 vi = aj + aii; 2357 nz = ai[row+1]- ai[row]; 2358 2359 if (i < node_max-1) { 2360 /* Prefetch the indices for the next block */ 2361 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2362 /* Prefetch the data for the next block */ 2363 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2364 } 2365 2366 switch (nsz) { /* Each loop in 'case' is unrolled */ 2367 case 1: 2368 sum1 = b[r[row]]; 2369 for (j=0; j<nz-1; j+=2) { 2370 i0 = vi[j]; 2371 i1 = vi[j+1]; 2372 tmp0 = tmps[i0]; 2373 tmp1 = tmps[i1]; 2374 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2375 } 2376 if (j == nz-1) { 2377 tmp0 = tmps[vi[j]]; 2378 sum1 -= v1[j]*tmp0; 2379 } 2380 tmp[row++]=sum1; 2381 break; 2382 case 2: 2383 sum1 = b[r[row]]; 2384 sum2 = b[r[row+1]]; 2385 v2 = aa + ai[row+1]; 2386 2387 for (j=0; j<nz-1; j+=2) { 2388 i0 = vi[j]; 2389 i1 = vi[j+1]; 2390 tmp0 = tmps[i0]; 2391 tmp1 = tmps[i1]; 2392 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2393 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2394 } 2395 if (j == nz-1) { 2396 tmp0 = tmps[vi[j]]; 2397 sum1 -= v1[j] *tmp0; 2398 sum2 -= v2[j] *tmp0; 2399 } 2400 sum2 -= v2[nz] * sum1; 2401 tmp[row++]=sum1; 2402 tmp[row++]=sum2; 2403 break; 2404 case 3: 2405 sum1 = b[r[row]]; 2406 sum2 = b[r[row+1]]; 2407 sum3 = b[r[row+2]]; 2408 v2 = aa + ai[row+1]; 2409 v3 = aa + ai[row+2]; 2410 2411 for (j=0; j<nz-1; j+=2) { 2412 i0 = vi[j]; 2413 i1 = vi[j+1]; 2414 tmp0 = tmps[i0]; 2415 tmp1 = tmps[i1]; 2416 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2417 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2418 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2419 } 2420 if (j == nz-1) { 2421 tmp0 = tmps[vi[j]]; 2422 sum1 -= v1[j] *tmp0; 2423 sum2 -= v2[j] *tmp0; 2424 sum3 -= v3[j] *tmp0; 2425 } 2426 sum2 -= v2[nz] * sum1; 2427 sum3 -= v3[nz] * sum1; 2428 sum3 -= v3[nz+1] * sum2; 2429 tmp[row++]=sum1; 2430 tmp[row++]=sum2; 2431 tmp[row++]=sum3; 2432 break; 2433 2434 case 4: 2435 sum1 = b[r[row]]; 2436 sum2 = b[r[row+1]]; 2437 sum3 = b[r[row+2]]; 2438 sum4 = b[r[row+3]]; 2439 v2 = aa + ai[row+1]; 2440 v3 = aa + ai[row+2]; 2441 v4 = aa + ai[row+3]; 2442 2443 for (j=0; j<nz-1; j+=2) { 2444 i0 = vi[j]; 2445 i1 = vi[j+1]; 2446 tmp0 = tmps[i0]; 2447 tmp1 = tmps[i1]; 2448 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2449 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2450 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2451 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2452 } 2453 if (j == nz-1) { 2454 tmp0 = tmps[vi[j]]; 2455 sum1 -= v1[j] *tmp0; 2456 sum2 -= v2[j] *tmp0; 2457 sum3 -= v3[j] *tmp0; 2458 sum4 -= v4[j] *tmp0; 2459 } 2460 sum2 -= v2[nz] * sum1; 2461 sum3 -= v3[nz] * sum1; 2462 sum4 -= v4[nz] * sum1; 2463 sum3 -= v3[nz+1] * sum2; 2464 sum4 -= v4[nz+1] * sum2; 2465 sum4 -= v4[nz+2] * sum3; 2466 2467 tmp[row++]=sum1; 2468 tmp[row++]=sum2; 2469 tmp[row++]=sum3; 2470 tmp[row++]=sum4; 2471 break; 2472 case 5: 2473 sum1 = b[r[row]]; 2474 sum2 = b[r[row+1]]; 2475 sum3 = b[r[row+2]]; 2476 sum4 = b[r[row+3]]; 2477 sum5 = b[r[row+4]]; 2478 v2 = aa + ai[row+1]; 2479 v3 = aa + ai[row+2]; 2480 v4 = aa + ai[row+3]; 2481 v5 = aa + ai[row+4]; 2482 2483 for (j=0; j<nz-1; j+=2) { 2484 i0 = vi[j]; 2485 i1 = vi[j+1]; 2486 tmp0 = tmps[i0]; 2487 tmp1 = tmps[i1]; 2488 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2489 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2490 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2491 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2492 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2493 } 2494 if (j == nz-1) { 2495 tmp0 = tmps[vi[j]]; 2496 sum1 -= v1[j] *tmp0; 2497 sum2 -= v2[j] *tmp0; 2498 sum3 -= v3[j] *tmp0; 2499 sum4 -= v4[j] *tmp0; 2500 sum5 -= v5[j] *tmp0; 2501 } 2502 2503 sum2 -= v2[nz] * sum1; 2504 sum3 -= v3[nz] * sum1; 2505 sum4 -= v4[nz] * sum1; 2506 sum5 -= v5[nz] * sum1; 2507 sum3 -= v3[nz+1] * sum2; 2508 sum4 -= v4[nz+1] * sum2; 2509 sum5 -= v5[nz+1] * sum2; 2510 sum4 -= v4[nz+2] * sum3; 2511 sum5 -= v5[nz+2] * sum3; 2512 sum5 -= v5[nz+3] * sum4; 2513 2514 tmp[row++]=sum1; 2515 tmp[row++]=sum2; 2516 tmp[row++]=sum3; 2517 tmp[row++]=sum4; 2518 tmp[row++]=sum5; 2519 break; 2520 default: 2521 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2522 } 2523 } 2524 /* backward solve the upper triangular */ 2525 for (i=node_max -1,row = n-1; i>=0; i--) { 2526 nsz = ns[i]; 2527 aii = ad[row+1] + 1; 2528 v1 = aa + aii; 2529 vi = aj + aii; 2530 nz = ad[row]- ad[row+1] - 1; 2531 2532 if (i > 0) { 2533 /* Prefetch the indices for the next block */ 2534 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2535 /* Prefetch the data for the next block */ 2536 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2537 } 2538 2539 switch (nsz) { /* Each loop in 'case' is unrolled */ 2540 case 1: 2541 sum1 = tmp[row]; 2542 2543 for (j=0; j<nz-1; j+=2) { 2544 i0 = vi[j]; 2545 i1 = vi[j+1]; 2546 tmp0 = tmps[i0]; 2547 tmp1 = tmps[i1]; 2548 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2549 } 2550 if (j == nz-1) { 2551 tmp0 = tmps[vi[j]]; 2552 sum1 -= v1[j]*tmp0; 2553 } 2554 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2555 break; 2556 case 2: 2557 sum1 = tmp[row]; 2558 sum2 = tmp[row-1]; 2559 v2 = aa + ad[row] + 1; 2560 for (j=0; j<nz-1; j+=2) { 2561 i0 = vi[j]; 2562 i1 = vi[j+1]; 2563 tmp0 = tmps[i0]; 2564 tmp1 = tmps[i1]; 2565 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2566 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2567 } 2568 if (j == nz-1) { 2569 tmp0 = tmps[vi[j]]; 2570 sum1 -= v1[j]* tmp0; 2571 sum2 -= v2[j+1]* tmp0; 2572 } 2573 2574 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2575 sum2 -= v2[0] * tmp0; 2576 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2577 break; 2578 case 3: 2579 sum1 = tmp[row]; 2580 sum2 = tmp[row -1]; 2581 sum3 = tmp[row -2]; 2582 v2 = aa + ad[row] + 1; 2583 v3 = aa + ad[row -1] + 1; 2584 for (j=0; j<nz-1; j+=2) { 2585 i0 = vi[j]; 2586 i1 = vi[j+1]; 2587 tmp0 = tmps[i0]; 2588 tmp1 = tmps[i1]; 2589 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2590 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2591 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2592 } 2593 if (j== nz-1) { 2594 tmp0 = tmps[vi[j]]; 2595 sum1 -= v1[j] * tmp0; 2596 sum2 -= v2[j+1] * tmp0; 2597 sum3 -= v3[j+2] * tmp0; 2598 } 2599 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2600 sum2 -= v2[0]* tmp0; 2601 sum3 -= v3[1] * tmp0; 2602 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2603 sum3 -= v3[0]* tmp0; 2604 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2605 2606 break; 2607 case 4: 2608 sum1 = tmp[row]; 2609 sum2 = tmp[row -1]; 2610 sum3 = tmp[row -2]; 2611 sum4 = tmp[row -3]; 2612 v2 = aa + ad[row]+1; 2613 v3 = aa + ad[row -1]+1; 2614 v4 = aa + ad[row -2]+1; 2615 2616 for (j=0; j<nz-1; j+=2) { 2617 i0 = vi[j]; 2618 i1 = vi[j+1]; 2619 tmp0 = tmps[i0]; 2620 tmp1 = tmps[i1]; 2621 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2622 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2623 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2624 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2625 } 2626 if (j== nz-1) { 2627 tmp0 = tmps[vi[j]]; 2628 sum1 -= v1[j] * tmp0; 2629 sum2 -= v2[j+1] * tmp0; 2630 sum3 -= v3[j+2] * tmp0; 2631 sum4 -= v4[j+3] * tmp0; 2632 } 2633 2634 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2635 sum2 -= v2[0] * tmp0; 2636 sum3 -= v3[1] * tmp0; 2637 sum4 -= v4[2] * tmp0; 2638 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2639 sum3 -= v3[0] * tmp0; 2640 sum4 -= v4[1] * tmp0; 2641 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2642 sum4 -= v4[0] * tmp0; 2643 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2644 break; 2645 case 5: 2646 sum1 = tmp[row]; 2647 sum2 = tmp[row -1]; 2648 sum3 = tmp[row -2]; 2649 sum4 = tmp[row -3]; 2650 sum5 = tmp[row -4]; 2651 v2 = aa + ad[row]+1; 2652 v3 = aa + ad[row -1]+1; 2653 v4 = aa + ad[row -2]+1; 2654 v5 = aa + ad[row -3]+1; 2655 for (j=0; j<nz-1; j+=2) { 2656 i0 = vi[j]; 2657 i1 = vi[j+1]; 2658 tmp0 = tmps[i0]; 2659 tmp1 = tmps[i1]; 2660 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2661 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2662 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2663 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2664 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2665 } 2666 if (j==nz-1) { 2667 tmp0 = tmps[vi[j]]; 2668 sum1 -= v1[j] * tmp0; 2669 sum2 -= v2[j+1] * tmp0; 2670 sum3 -= v3[j+2] * tmp0; 2671 sum4 -= v4[j+3] * tmp0; 2672 sum5 -= v5[j+4] * tmp0; 2673 } 2674 2675 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2676 sum2 -= v2[0] * tmp0; 2677 sum3 -= v3[1] * tmp0; 2678 sum4 -= v4[2] * tmp0; 2679 sum5 -= v5[3] * tmp0; 2680 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2681 sum3 -= v3[0] * tmp0; 2682 sum4 -= v4[1] * tmp0; 2683 sum5 -= v5[2] * tmp0; 2684 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2685 sum4 -= v4[0] * tmp0; 2686 sum5 -= v5[1] * tmp0; 2687 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2688 sum5 -= v5[0] * tmp0; 2689 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2690 break; 2691 default: 2692 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2693 } 2694 } 2695 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2696 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2697 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2698 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2699 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2700 PetscFunctionReturn(0); 2701 } 2702 2703 2704 /* 2705 Makes a longer coloring[] array and calls the usual code with that 2706 */ 2707 #undef __FUNCT__ 2708 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2709 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2710 { 2711 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2712 PetscErrorCode ierr; 2713 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2714 PetscInt *colorused,i; 2715 ISColoringValue *newcolor; 2716 2717 PetscFunctionBegin; 2718 ierr = PetscMalloc1(n+1,&newcolor);CHKERRQ(ierr); 2719 /* loop over inodes, marking a color for each column*/ 2720 row = 0; 2721 for (i=0; i<m; i++) { 2722 for (j=0; j<ns[i]; j++) { 2723 newcolor[row++] = coloring[i] + j*ncolors; 2724 } 2725 } 2726 2727 /* eliminate unneeded colors */ 2728 ierr = PetscCalloc1(5*ncolors,&colorused);CHKERRQ(ierr); 2729 for (i=0; i<n; i++) { 2730 colorused[newcolor[i]] = 1; 2731 } 2732 2733 for (i=1; i<5*ncolors; i++) { 2734 colorused[i] += colorused[i-1]; 2735 } 2736 ncolors = colorused[5*ncolors-1]; 2737 for (i=0; i<n; i++) { 2738 newcolor[i] = colorused[newcolor[i]]-1; 2739 } 2740 ierr = PetscFree(colorused);CHKERRQ(ierr); 2741 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring);CHKERRQ(ierr); 2742 ierr = PetscFree(coloring);CHKERRQ(ierr); 2743 PetscFunctionReturn(0); 2744 } 2745 2746 #include <petsc/private/kernels/blockinvert.h> 2747 2748 #undef __FUNCT__ 2749 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2750 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2751 { 2752 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2753 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2754 MatScalar *ibdiag,*bdiag,work[25],*t; 2755 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2756 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2757 const PetscScalar *xb, *b; 2758 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2759 PetscErrorCode ierr; 2760 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2761 PetscInt sz,k,ipvt[5]; 2762 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2763 2764 PetscFunctionBegin; 2765 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2766 if (fshift == -1.0) fshift = 0.0; /* negative fshift indicates do not error on zero diagonal; this code never errors on zero diagonal */ 2767 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2768 2769 if (!a->inode.ibdiagvalid) { 2770 if (!a->inode.ibdiag) { 2771 /* calculate space needed for diagonal blocks */ 2772 for (i=0; i<m; i++) { 2773 cnt += sizes[i]*sizes[i]; 2774 } 2775 a->inode.bdiagsize = cnt; 2776 2777 ierr = PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work);CHKERRQ(ierr); 2778 } 2779 2780 /* copy over the diagonal blocks and invert them */ 2781 ibdiag = a->inode.ibdiag; 2782 bdiag = a->inode.bdiag; 2783 cnt = 0; 2784 for (i=0, row = 0; i<m; i++) { 2785 for (j=0; j<sizes[i]; j++) { 2786 for (k=0; k<sizes[i]; k++) { 2787 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2788 } 2789 } 2790 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2791 2792 switch (sizes[i]) { 2793 case 1: 2794 /* Create matrix data structure */ 2795 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2796 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2797 break; 2798 case 2: 2799 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2800 break; 2801 case 3: 2802 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2803 break; 2804 case 4: 2805 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2806 break; 2807 case 5: 2808 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2809 break; 2810 default: 2811 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2812 } 2813 cnt += sizes[i]*sizes[i]; 2814 row += sizes[i]; 2815 } 2816 a->inode.ibdiagvalid = PETSC_TRUE; 2817 } 2818 ibdiag = a->inode.ibdiag; 2819 bdiag = a->inode.bdiag; 2820 t = a->inode.ssor_work; 2821 2822 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2823 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2824 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2825 if (flag & SOR_ZERO_INITIAL_GUESS) { 2826 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2827 2828 for (i=0, row=0; i<m; i++) { 2829 sz = diag[row] - ii[row]; 2830 v1 = a->a + ii[row]; 2831 idx = a->j + ii[row]; 2832 2833 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2834 switch (sizes[i]) { 2835 case 1: 2836 2837 sum1 = b[row]; 2838 for (n = 0; n<sz-1; n+=2) { 2839 i1 = idx[0]; 2840 i2 = idx[1]; 2841 idx += 2; 2842 tmp0 = x[i1]; 2843 tmp1 = x[i2]; 2844 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2845 } 2846 2847 if (n == sz-1) { 2848 tmp0 = x[*idx]; 2849 sum1 -= *v1 * tmp0; 2850 } 2851 t[row] = sum1; 2852 x[row++] = sum1*(*ibdiag++); 2853 break; 2854 case 2: 2855 v2 = a->a + ii[row+1]; 2856 sum1 = b[row]; 2857 sum2 = b[row+1]; 2858 for (n = 0; n<sz-1; n+=2) { 2859 i1 = idx[0]; 2860 i2 = idx[1]; 2861 idx += 2; 2862 tmp0 = x[i1]; 2863 tmp1 = x[i2]; 2864 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2865 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2866 } 2867 2868 if (n == sz-1) { 2869 tmp0 = x[*idx]; 2870 sum1 -= v1[0] * tmp0; 2871 sum2 -= v2[0] * tmp0; 2872 } 2873 t[row] = sum1; 2874 t[row+1] = sum2; 2875 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2876 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2877 ibdiag += 4; 2878 break; 2879 case 3: 2880 v2 = a->a + ii[row+1]; 2881 v3 = a->a + ii[row+2]; 2882 sum1 = b[row]; 2883 sum2 = b[row+1]; 2884 sum3 = b[row+2]; 2885 for (n = 0; n<sz-1; n+=2) { 2886 i1 = idx[0]; 2887 i2 = idx[1]; 2888 idx += 2; 2889 tmp0 = x[i1]; 2890 tmp1 = x[i2]; 2891 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2892 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2893 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2894 } 2895 2896 if (n == sz-1) { 2897 tmp0 = x[*idx]; 2898 sum1 -= v1[0] * tmp0; 2899 sum2 -= v2[0] * tmp0; 2900 sum3 -= v3[0] * tmp0; 2901 } 2902 t[row] = sum1; 2903 t[row+1] = sum2; 2904 t[row+2] = sum3; 2905 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2906 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2907 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2908 ibdiag += 9; 2909 break; 2910 case 4: 2911 v2 = a->a + ii[row+1]; 2912 v3 = a->a + ii[row+2]; 2913 v4 = a->a + ii[row+3]; 2914 sum1 = b[row]; 2915 sum2 = b[row+1]; 2916 sum3 = b[row+2]; 2917 sum4 = b[row+3]; 2918 for (n = 0; n<sz-1; n+=2) { 2919 i1 = idx[0]; 2920 i2 = idx[1]; 2921 idx += 2; 2922 tmp0 = x[i1]; 2923 tmp1 = x[i2]; 2924 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2925 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2926 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2927 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2928 } 2929 2930 if (n == sz-1) { 2931 tmp0 = x[*idx]; 2932 sum1 -= v1[0] * tmp0; 2933 sum2 -= v2[0] * tmp0; 2934 sum3 -= v3[0] * tmp0; 2935 sum4 -= v4[0] * tmp0; 2936 } 2937 t[row] = sum1; 2938 t[row+1] = sum2; 2939 t[row+2] = sum3; 2940 t[row+3] = sum4; 2941 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2942 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2943 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2944 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2945 ibdiag += 16; 2946 break; 2947 case 5: 2948 v2 = a->a + ii[row+1]; 2949 v3 = a->a + ii[row+2]; 2950 v4 = a->a + ii[row+3]; 2951 v5 = a->a + ii[row+4]; 2952 sum1 = b[row]; 2953 sum2 = b[row+1]; 2954 sum3 = b[row+2]; 2955 sum4 = b[row+3]; 2956 sum5 = b[row+4]; 2957 for (n = 0; n<sz-1; n+=2) { 2958 i1 = idx[0]; 2959 i2 = idx[1]; 2960 idx += 2; 2961 tmp0 = x[i1]; 2962 tmp1 = x[i2]; 2963 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2964 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2965 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2966 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2967 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2968 } 2969 2970 if (n == sz-1) { 2971 tmp0 = x[*idx]; 2972 sum1 -= v1[0] * tmp0; 2973 sum2 -= v2[0] * tmp0; 2974 sum3 -= v3[0] * tmp0; 2975 sum4 -= v4[0] * tmp0; 2976 sum5 -= v5[0] * tmp0; 2977 } 2978 t[row] = sum1; 2979 t[row+1] = sum2; 2980 t[row+2] = sum3; 2981 t[row+3] = sum4; 2982 t[row+4] = sum5; 2983 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2984 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2985 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2986 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2987 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2988 ibdiag += 25; 2989 break; 2990 default: 2991 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2992 } 2993 } 2994 2995 xb = t; 2996 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2997 } else xb = b; 2998 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 2999 3000 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3001 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3002 ibdiag -= sizes[i]*sizes[i]; 3003 sz = ii[row+1] - diag[row] - 1; 3004 v1 = a->a + diag[row] + 1; 3005 idx = a->j + diag[row] + 1; 3006 3007 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3008 switch (sizes[i]) { 3009 case 1: 3010 3011 sum1 = xb[row]; 3012 for (n = 0; n<sz-1; n+=2) { 3013 i1 = idx[0]; 3014 i2 = idx[1]; 3015 idx += 2; 3016 tmp0 = x[i1]; 3017 tmp1 = x[i2]; 3018 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3019 } 3020 3021 if (n == sz-1) { 3022 tmp0 = x[*idx]; 3023 sum1 -= *v1*tmp0; 3024 } 3025 x[row--] = sum1*(*ibdiag); 3026 break; 3027 3028 case 2: 3029 3030 sum1 = xb[row]; 3031 sum2 = xb[row-1]; 3032 /* note that sum1 is associated with the second of the two rows */ 3033 v2 = a->a + diag[row-1] + 2; 3034 for (n = 0; n<sz-1; n+=2) { 3035 i1 = idx[0]; 3036 i2 = idx[1]; 3037 idx += 2; 3038 tmp0 = x[i1]; 3039 tmp1 = x[i2]; 3040 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3041 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3042 } 3043 3044 if (n == sz-1) { 3045 tmp0 = x[*idx]; 3046 sum1 -= *v1*tmp0; 3047 sum2 -= *v2*tmp0; 3048 } 3049 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3050 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3051 break; 3052 case 3: 3053 3054 sum1 = xb[row]; 3055 sum2 = xb[row-1]; 3056 sum3 = xb[row-2]; 3057 v2 = a->a + diag[row-1] + 2; 3058 v3 = a->a + diag[row-2] + 3; 3059 for (n = 0; n<sz-1; n+=2) { 3060 i1 = idx[0]; 3061 i2 = idx[1]; 3062 idx += 2; 3063 tmp0 = x[i1]; 3064 tmp1 = x[i2]; 3065 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3066 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3067 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3068 } 3069 3070 if (n == sz-1) { 3071 tmp0 = x[*idx]; 3072 sum1 -= *v1*tmp0; 3073 sum2 -= *v2*tmp0; 3074 sum3 -= *v3*tmp0; 3075 } 3076 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3077 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3078 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3079 break; 3080 case 4: 3081 3082 sum1 = xb[row]; 3083 sum2 = xb[row-1]; 3084 sum3 = xb[row-2]; 3085 sum4 = xb[row-3]; 3086 v2 = a->a + diag[row-1] + 2; 3087 v3 = a->a + diag[row-2] + 3; 3088 v4 = a->a + diag[row-3] + 4; 3089 for (n = 0; n<sz-1; n+=2) { 3090 i1 = idx[0]; 3091 i2 = idx[1]; 3092 idx += 2; 3093 tmp0 = x[i1]; 3094 tmp1 = x[i2]; 3095 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3096 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3097 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3098 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3099 } 3100 3101 if (n == sz-1) { 3102 tmp0 = x[*idx]; 3103 sum1 -= *v1*tmp0; 3104 sum2 -= *v2*tmp0; 3105 sum3 -= *v3*tmp0; 3106 sum4 -= *v4*tmp0; 3107 } 3108 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3109 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3110 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3111 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3112 break; 3113 case 5: 3114 3115 sum1 = xb[row]; 3116 sum2 = xb[row-1]; 3117 sum3 = xb[row-2]; 3118 sum4 = xb[row-3]; 3119 sum5 = xb[row-4]; 3120 v2 = a->a + diag[row-1] + 2; 3121 v3 = a->a + diag[row-2] + 3; 3122 v4 = a->a + diag[row-3] + 4; 3123 v5 = a->a + diag[row-4] + 5; 3124 for (n = 0; n<sz-1; n+=2) { 3125 i1 = idx[0]; 3126 i2 = idx[1]; 3127 idx += 2; 3128 tmp0 = x[i1]; 3129 tmp1 = x[i2]; 3130 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3131 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3132 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3133 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3134 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3135 } 3136 3137 if (n == sz-1) { 3138 tmp0 = x[*idx]; 3139 sum1 -= *v1*tmp0; 3140 sum2 -= *v2*tmp0; 3141 sum3 -= *v3*tmp0; 3142 sum4 -= *v4*tmp0; 3143 sum5 -= *v5*tmp0; 3144 } 3145 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3146 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3147 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3148 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3149 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3150 break; 3151 default: 3152 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3153 } 3154 } 3155 3156 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3157 } 3158 its--; 3159 } 3160 while (its--) { 3161 3162 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3163 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3164 i<m; 3165 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3166 3167 sz = diag[row] - ii[row]; 3168 v1 = a->a + ii[row]; 3169 idx = a->j + ii[row]; 3170 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3171 switch (sizes[i]) { 3172 case 1: 3173 sum1 = b[row]; 3174 for (n = 0; n<sz-1; n+=2) { 3175 i1 = idx[0]; 3176 i2 = idx[1]; 3177 idx += 2; 3178 tmp0 = x[i1]; 3179 tmp1 = x[i2]; 3180 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3181 } 3182 if (n == sz-1) { 3183 tmp0 = x[*idx++]; 3184 sum1 -= *v1 * tmp0; 3185 v1++; 3186 } 3187 t[row] = sum1; 3188 sz = ii[row+1] - diag[row] - 1; 3189 idx = a->j + diag[row] + 1; 3190 v1 += 1; 3191 for (n = 0; n<sz-1; n+=2) { 3192 i1 = idx[0]; 3193 i2 = idx[1]; 3194 idx += 2; 3195 tmp0 = x[i1]; 3196 tmp1 = x[i2]; 3197 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3198 } 3199 if (n == sz-1) { 3200 tmp0 = x[*idx++]; 3201 sum1 -= *v1 * tmp0; 3202 } 3203 /* in MatSOR_SeqAIJ this line would be 3204 * 3205 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3206 * 3207 * but omega == 1, so this becomes 3208 * 3209 * x[row] = sum1*(*ibdiag++); 3210 * 3211 */ 3212 x[row] = sum1*(*ibdiag); 3213 break; 3214 case 2: 3215 v2 = a->a + ii[row+1]; 3216 sum1 = b[row]; 3217 sum2 = b[row+1]; 3218 for (n = 0; n<sz-1; n+=2) { 3219 i1 = idx[0]; 3220 i2 = idx[1]; 3221 idx += 2; 3222 tmp0 = x[i1]; 3223 tmp1 = x[i2]; 3224 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3225 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3226 } 3227 if (n == sz-1) { 3228 tmp0 = x[*idx++]; 3229 sum1 -= v1[0] * tmp0; 3230 sum2 -= v2[0] * tmp0; 3231 v1++; v2++; 3232 } 3233 t[row] = sum1; 3234 t[row+1] = sum2; 3235 sz = ii[row+1] - diag[row] - 2; 3236 idx = a->j + diag[row] + 2; 3237 v1 += 2; 3238 v2 += 2; 3239 for (n = 0; n<sz-1; n+=2) { 3240 i1 = idx[0]; 3241 i2 = idx[1]; 3242 idx += 2; 3243 tmp0 = x[i1]; 3244 tmp1 = x[i2]; 3245 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3246 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3247 } 3248 if (n == sz-1) { 3249 tmp0 = x[*idx]; 3250 sum1 -= v1[0] * tmp0; 3251 sum2 -= v2[0] * tmp0; 3252 } 3253 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3254 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3255 break; 3256 case 3: 3257 v2 = a->a + ii[row+1]; 3258 v3 = a->a + ii[row+2]; 3259 sum1 = b[row]; 3260 sum2 = b[row+1]; 3261 sum3 = b[row+2]; 3262 for (n = 0; n<sz-1; n+=2) { 3263 i1 = idx[0]; 3264 i2 = idx[1]; 3265 idx += 2; 3266 tmp0 = x[i1]; 3267 tmp1 = x[i2]; 3268 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3269 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3270 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3271 } 3272 if (n == sz-1) { 3273 tmp0 = x[*idx++]; 3274 sum1 -= v1[0] * tmp0; 3275 sum2 -= v2[0] * tmp0; 3276 sum3 -= v3[0] * tmp0; 3277 v1++; v2++; v3++; 3278 } 3279 t[row] = sum1; 3280 t[row+1] = sum2; 3281 t[row+2] = sum3; 3282 sz = ii[row+1] - diag[row] - 3; 3283 idx = a->j + diag[row] + 3; 3284 v1 += 3; 3285 v2 += 3; 3286 v3 += 3; 3287 for (n = 0; n<sz-1; n+=2) { 3288 i1 = idx[0]; 3289 i2 = idx[1]; 3290 idx += 2; 3291 tmp0 = x[i1]; 3292 tmp1 = x[i2]; 3293 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3294 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3295 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3296 } 3297 if (n == sz-1) { 3298 tmp0 = x[*idx]; 3299 sum1 -= v1[0] * tmp0; 3300 sum2 -= v2[0] * tmp0; 3301 sum3 -= v3[0] * tmp0; 3302 } 3303 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3304 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3305 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3306 break; 3307 case 4: 3308 v2 = a->a + ii[row+1]; 3309 v3 = a->a + ii[row+2]; 3310 v4 = a->a + ii[row+3]; 3311 sum1 = b[row]; 3312 sum2 = b[row+1]; 3313 sum3 = b[row+2]; 3314 sum4 = b[row+3]; 3315 for (n = 0; n<sz-1; n+=2) { 3316 i1 = idx[0]; 3317 i2 = idx[1]; 3318 idx += 2; 3319 tmp0 = x[i1]; 3320 tmp1 = x[i2]; 3321 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3322 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3323 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3324 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3325 } 3326 if (n == sz-1) { 3327 tmp0 = x[*idx++]; 3328 sum1 -= v1[0] * tmp0; 3329 sum2 -= v2[0] * tmp0; 3330 sum3 -= v3[0] * tmp0; 3331 sum4 -= v4[0] * tmp0; 3332 v1++; v2++; v3++; v4++; 3333 } 3334 t[row] = sum1; 3335 t[row+1] = sum2; 3336 t[row+2] = sum3; 3337 t[row+3] = sum4; 3338 sz = ii[row+1] - diag[row] - 4; 3339 idx = a->j + diag[row] + 4; 3340 v1 += 4; 3341 v2 += 4; 3342 v3 += 4; 3343 v4 += 4; 3344 for (n = 0; n<sz-1; n+=2) { 3345 i1 = idx[0]; 3346 i2 = idx[1]; 3347 idx += 2; 3348 tmp0 = x[i1]; 3349 tmp1 = x[i2]; 3350 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3351 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3352 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3353 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3354 } 3355 if (n == sz-1) { 3356 tmp0 = x[*idx]; 3357 sum1 -= v1[0] * tmp0; 3358 sum2 -= v2[0] * tmp0; 3359 sum3 -= v3[0] * tmp0; 3360 sum4 -= v4[0] * tmp0; 3361 } 3362 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3363 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3364 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3365 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3366 break; 3367 case 5: 3368 v2 = a->a + ii[row+1]; 3369 v3 = a->a + ii[row+2]; 3370 v4 = a->a + ii[row+3]; 3371 v5 = a->a + ii[row+4]; 3372 sum1 = b[row]; 3373 sum2 = b[row+1]; 3374 sum3 = b[row+2]; 3375 sum4 = b[row+3]; 3376 sum5 = b[row+4]; 3377 for (n = 0; n<sz-1; n+=2) { 3378 i1 = idx[0]; 3379 i2 = idx[1]; 3380 idx += 2; 3381 tmp0 = x[i1]; 3382 tmp1 = x[i2]; 3383 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3384 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3385 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3386 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3387 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3388 } 3389 if (n == sz-1) { 3390 tmp0 = x[*idx++]; 3391 sum1 -= v1[0] * tmp0; 3392 sum2 -= v2[0] * tmp0; 3393 sum3 -= v3[0] * tmp0; 3394 sum4 -= v4[0] * tmp0; 3395 sum5 -= v5[0] * tmp0; 3396 v1++; v2++; v3++; v4++; v5++; 3397 } 3398 t[row] = sum1; 3399 t[row+1] = sum2; 3400 t[row+2] = sum3; 3401 t[row+3] = sum4; 3402 t[row+4] = sum5; 3403 sz = ii[row+1] - diag[row] - 5; 3404 idx = a->j + diag[row] + 5; 3405 v1 += 5; 3406 v2 += 5; 3407 v3 += 5; 3408 v4 += 5; 3409 v5 += 5; 3410 for (n = 0; n<sz-1; n+=2) { 3411 i1 = idx[0]; 3412 i2 = idx[1]; 3413 idx += 2; 3414 tmp0 = x[i1]; 3415 tmp1 = x[i2]; 3416 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3417 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3418 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3419 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3420 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3421 } 3422 if (n == sz-1) { 3423 tmp0 = x[*idx]; 3424 sum1 -= v1[0] * tmp0; 3425 sum2 -= v2[0] * tmp0; 3426 sum3 -= v3[0] * tmp0; 3427 sum4 -= v4[0] * tmp0; 3428 sum5 -= v5[0] * tmp0; 3429 } 3430 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3431 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3432 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3433 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3434 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3435 break; 3436 default: 3437 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3438 } 3439 } 3440 xb = t; 3441 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); /* undercounts diag inverse */ 3442 } else xb = b; 3443 3444 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3445 3446 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3447 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3448 ibdiag -= sizes[i]*sizes[i]; 3449 3450 /* set RHS */ 3451 if (xb == b) { 3452 /* whole (old way) */ 3453 sz = ii[row+1] - ii[row]; 3454 idx = a->j + ii[row]; 3455 switch (sizes[i]) { 3456 case 5: 3457 v5 = a->a + ii[row-4]; 3458 case 4: /* fall through */ 3459 v4 = a->a + ii[row-3]; 3460 case 3: 3461 v3 = a->a + ii[row-2]; 3462 case 2: 3463 v2 = a->a + ii[row-1]; 3464 case 1: 3465 v1 = a->a + ii[row]; 3466 break; 3467 default: 3468 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3469 } 3470 } else { 3471 /* upper, no diag */ 3472 sz = ii[row+1] - diag[row] - 1; 3473 idx = a->j + diag[row] + 1; 3474 switch (sizes[i]) { 3475 case 5: 3476 v5 = a->a + diag[row-4] + 5; 3477 case 4: /* fall through */ 3478 v4 = a->a + diag[row-3] + 4; 3479 case 3: 3480 v3 = a->a + diag[row-2] + 3; 3481 case 2: 3482 v2 = a->a + diag[row-1] + 2; 3483 case 1: 3484 v1 = a->a + diag[row] + 1; 3485 } 3486 } 3487 /* set sum */ 3488 switch (sizes[i]) { 3489 case 5: 3490 sum5 = xb[row-4]; 3491 case 4: /* fall through */ 3492 sum4 = xb[row-3]; 3493 case 3: 3494 sum3 = xb[row-2]; 3495 case 2: 3496 sum2 = xb[row-1]; 3497 case 1: 3498 /* note that sum1 is associated with the last row */ 3499 sum1 = xb[row]; 3500 } 3501 /* do sums */ 3502 for (n = 0; n<sz-1; n+=2) { 3503 i1 = idx[0]; 3504 i2 = idx[1]; 3505 idx += 2; 3506 tmp0 = x[i1]; 3507 tmp1 = x[i2]; 3508 switch (sizes[i]) { 3509 case 5: 3510 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3511 case 4: /* fall through */ 3512 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3513 case 3: 3514 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3515 case 2: 3516 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3517 case 1: 3518 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3519 } 3520 } 3521 /* ragged edge */ 3522 if (n == sz-1) { 3523 tmp0 = x[*idx]; 3524 switch (sizes[i]) { 3525 case 5: 3526 sum5 -= *v5*tmp0; 3527 case 4: /* fall through */ 3528 sum4 -= *v4*tmp0; 3529 case 3: 3530 sum3 -= *v3*tmp0; 3531 case 2: 3532 sum2 -= *v2*tmp0; 3533 case 1: 3534 sum1 -= *v1*tmp0; 3535 } 3536 } 3537 /* update */ 3538 if (xb == b) { 3539 /* whole (old way) w/ diag */ 3540 switch (sizes[i]) { 3541 case 5: 3542 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3543 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3544 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3545 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3546 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3547 break; 3548 case 4: 3549 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3550 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3551 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3552 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3553 break; 3554 case 3: 3555 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3556 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3557 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3558 break; 3559 case 2: 3560 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3561 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3562 break; 3563 case 1: 3564 x[row--] += sum1*(*ibdiag); 3565 break; 3566 } 3567 } else { 3568 /* no diag so set = */ 3569 switch (sizes[i]) { 3570 case 5: 3571 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3572 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3573 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3574 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3575 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3576 break; 3577 case 4: 3578 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3579 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3580 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3581 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3582 break; 3583 case 3: 3584 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3585 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3586 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3587 break; 3588 case 2: 3589 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3590 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3591 break; 3592 case 1: 3593 x[row--] = sum1*(*ibdiag); 3594 break; 3595 } 3596 } 3597 } 3598 if (xb == b) { 3599 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3600 } else { 3601 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); /* assumes 1/2 in upper, undercounts diag inverse */ 3602 } 3603 } 3604 } 3605 if (flag & SOR_EISENSTAT) { 3606 /* 3607 Apply (U + D)^-1 where D is now the block diagonal 3608 */ 3609 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3610 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3611 ibdiag -= sizes[i]*sizes[i]; 3612 sz = ii[row+1] - diag[row] - 1; 3613 v1 = a->a + diag[row] + 1; 3614 idx = a->j + diag[row] + 1; 3615 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3616 switch (sizes[i]) { 3617 case 1: 3618 3619 sum1 = b[row]; 3620 for (n = 0; n<sz-1; n+=2) { 3621 i1 = idx[0]; 3622 i2 = idx[1]; 3623 idx += 2; 3624 tmp0 = x[i1]; 3625 tmp1 = x[i2]; 3626 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3627 } 3628 3629 if (n == sz-1) { 3630 tmp0 = x[*idx]; 3631 sum1 -= *v1*tmp0; 3632 } 3633 x[row] = sum1*(*ibdiag);row--; 3634 break; 3635 3636 case 2: 3637 3638 sum1 = b[row]; 3639 sum2 = b[row-1]; 3640 /* note that sum1 is associated with the second of the two rows */ 3641 v2 = a->a + diag[row-1] + 2; 3642 for (n = 0; n<sz-1; n+=2) { 3643 i1 = idx[0]; 3644 i2 = idx[1]; 3645 idx += 2; 3646 tmp0 = x[i1]; 3647 tmp1 = x[i2]; 3648 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3649 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3650 } 3651 3652 if (n == sz-1) { 3653 tmp0 = x[*idx]; 3654 sum1 -= *v1*tmp0; 3655 sum2 -= *v2*tmp0; 3656 } 3657 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3658 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3659 row -= 2; 3660 break; 3661 case 3: 3662 3663 sum1 = b[row]; 3664 sum2 = b[row-1]; 3665 sum3 = b[row-2]; 3666 v2 = a->a + diag[row-1] + 2; 3667 v3 = a->a + diag[row-2] + 3; 3668 for (n = 0; n<sz-1; n+=2) { 3669 i1 = idx[0]; 3670 i2 = idx[1]; 3671 idx += 2; 3672 tmp0 = x[i1]; 3673 tmp1 = x[i2]; 3674 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3675 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3676 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3677 } 3678 3679 if (n == sz-1) { 3680 tmp0 = x[*idx]; 3681 sum1 -= *v1*tmp0; 3682 sum2 -= *v2*tmp0; 3683 sum3 -= *v3*tmp0; 3684 } 3685 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3686 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3687 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3688 row -= 3; 3689 break; 3690 case 4: 3691 3692 sum1 = b[row]; 3693 sum2 = b[row-1]; 3694 sum3 = b[row-2]; 3695 sum4 = b[row-3]; 3696 v2 = a->a + diag[row-1] + 2; 3697 v3 = a->a + diag[row-2] + 3; 3698 v4 = a->a + diag[row-3] + 4; 3699 for (n = 0; n<sz-1; n+=2) { 3700 i1 = idx[0]; 3701 i2 = idx[1]; 3702 idx += 2; 3703 tmp0 = x[i1]; 3704 tmp1 = x[i2]; 3705 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3706 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3707 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3708 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3709 } 3710 3711 if (n == sz-1) { 3712 tmp0 = x[*idx]; 3713 sum1 -= *v1*tmp0; 3714 sum2 -= *v2*tmp0; 3715 sum3 -= *v3*tmp0; 3716 sum4 -= *v4*tmp0; 3717 } 3718 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3719 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3720 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3721 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3722 row -= 4; 3723 break; 3724 case 5: 3725 3726 sum1 = b[row]; 3727 sum2 = b[row-1]; 3728 sum3 = b[row-2]; 3729 sum4 = b[row-3]; 3730 sum5 = b[row-4]; 3731 v2 = a->a + diag[row-1] + 2; 3732 v3 = a->a + diag[row-2] + 3; 3733 v4 = a->a + diag[row-3] + 4; 3734 v5 = a->a + diag[row-4] + 5; 3735 for (n = 0; n<sz-1; n+=2) { 3736 i1 = idx[0]; 3737 i2 = idx[1]; 3738 idx += 2; 3739 tmp0 = x[i1]; 3740 tmp1 = x[i2]; 3741 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3742 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3743 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3744 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3745 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3746 } 3747 3748 if (n == sz-1) { 3749 tmp0 = x[*idx]; 3750 sum1 -= *v1*tmp0; 3751 sum2 -= *v2*tmp0; 3752 sum3 -= *v3*tmp0; 3753 sum4 -= *v4*tmp0; 3754 sum5 -= *v5*tmp0; 3755 } 3756 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3757 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3758 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3759 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3760 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3761 row -= 5; 3762 break; 3763 default: 3764 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3765 } 3766 } 3767 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3768 3769 /* 3770 t = b - D x where D is the block diagonal 3771 */ 3772 cnt = 0; 3773 for (i=0, row=0; i<m; i++) { 3774 switch (sizes[i]) { 3775 case 1: 3776 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3777 break; 3778 case 2: 3779 x1 = x[row]; x2 = x[row+1]; 3780 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3781 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3782 t[row] = b[row] - tmp1; 3783 t[row+1] = b[row+1] - tmp2; row += 2; 3784 cnt += 4; 3785 break; 3786 case 3: 3787 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3788 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3789 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3790 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3791 t[row] = b[row] - tmp1; 3792 t[row+1] = b[row+1] - tmp2; 3793 t[row+2] = b[row+2] - tmp3; row += 3; 3794 cnt += 9; 3795 break; 3796 case 4: 3797 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3798 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3799 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3800 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3801 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3802 t[row] = b[row] - tmp1; 3803 t[row+1] = b[row+1] - tmp2; 3804 t[row+2] = b[row+2] - tmp3; 3805 t[row+3] = b[row+3] - tmp4; row += 4; 3806 cnt += 16; 3807 break; 3808 case 5: 3809 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3810 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3811 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3812 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3813 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3814 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3815 t[row] = b[row] - tmp1; 3816 t[row+1] = b[row+1] - tmp2; 3817 t[row+2] = b[row+2] - tmp3; 3818 t[row+3] = b[row+3] - tmp4; 3819 t[row+4] = b[row+4] - tmp5;row += 5; 3820 cnt += 25; 3821 break; 3822 default: 3823 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3824 } 3825 } 3826 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3827 3828 3829 3830 /* 3831 Apply (L + D)^-1 where D is the block diagonal 3832 */ 3833 for (i=0, row=0; i<m; i++) { 3834 sz = diag[row] - ii[row]; 3835 v1 = a->a + ii[row]; 3836 idx = a->j + ii[row]; 3837 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3838 switch (sizes[i]) { 3839 case 1: 3840 3841 sum1 = t[row]; 3842 for (n = 0; n<sz-1; n+=2) { 3843 i1 = idx[0]; 3844 i2 = idx[1]; 3845 idx += 2; 3846 tmp0 = t[i1]; 3847 tmp1 = t[i2]; 3848 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3849 } 3850 3851 if (n == sz-1) { 3852 tmp0 = t[*idx]; 3853 sum1 -= *v1 * tmp0; 3854 } 3855 x[row] += t[row] = sum1*(*ibdiag++); row++; 3856 break; 3857 case 2: 3858 v2 = a->a + ii[row+1]; 3859 sum1 = t[row]; 3860 sum2 = t[row+1]; 3861 for (n = 0; n<sz-1; n+=2) { 3862 i1 = idx[0]; 3863 i2 = idx[1]; 3864 idx += 2; 3865 tmp0 = t[i1]; 3866 tmp1 = t[i2]; 3867 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3868 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3869 } 3870 3871 if (n == sz-1) { 3872 tmp0 = t[*idx]; 3873 sum1 -= v1[0] * tmp0; 3874 sum2 -= v2[0] * tmp0; 3875 } 3876 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3877 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3878 ibdiag += 4; row += 2; 3879 break; 3880 case 3: 3881 v2 = a->a + ii[row+1]; 3882 v3 = a->a + ii[row+2]; 3883 sum1 = t[row]; 3884 sum2 = t[row+1]; 3885 sum3 = t[row+2]; 3886 for (n = 0; n<sz-1; n+=2) { 3887 i1 = idx[0]; 3888 i2 = idx[1]; 3889 idx += 2; 3890 tmp0 = t[i1]; 3891 tmp1 = t[i2]; 3892 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3893 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3894 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3895 } 3896 3897 if (n == sz-1) { 3898 tmp0 = t[*idx]; 3899 sum1 -= v1[0] * tmp0; 3900 sum2 -= v2[0] * tmp0; 3901 sum3 -= v3[0] * tmp0; 3902 } 3903 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3904 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3905 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3906 ibdiag += 9; row += 3; 3907 break; 3908 case 4: 3909 v2 = a->a + ii[row+1]; 3910 v3 = a->a + ii[row+2]; 3911 v4 = a->a + ii[row+3]; 3912 sum1 = t[row]; 3913 sum2 = t[row+1]; 3914 sum3 = t[row+2]; 3915 sum4 = t[row+3]; 3916 for (n = 0; n<sz-1; n+=2) { 3917 i1 = idx[0]; 3918 i2 = idx[1]; 3919 idx += 2; 3920 tmp0 = t[i1]; 3921 tmp1 = t[i2]; 3922 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3923 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3924 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3925 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3926 } 3927 3928 if (n == sz-1) { 3929 tmp0 = t[*idx]; 3930 sum1 -= v1[0] * tmp0; 3931 sum2 -= v2[0] * tmp0; 3932 sum3 -= v3[0] * tmp0; 3933 sum4 -= v4[0] * tmp0; 3934 } 3935 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3936 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3937 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3938 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3939 ibdiag += 16; row += 4; 3940 break; 3941 case 5: 3942 v2 = a->a + ii[row+1]; 3943 v3 = a->a + ii[row+2]; 3944 v4 = a->a + ii[row+3]; 3945 v5 = a->a + ii[row+4]; 3946 sum1 = t[row]; 3947 sum2 = t[row+1]; 3948 sum3 = t[row+2]; 3949 sum4 = t[row+3]; 3950 sum5 = t[row+4]; 3951 for (n = 0; n<sz-1; n+=2) { 3952 i1 = idx[0]; 3953 i2 = idx[1]; 3954 idx += 2; 3955 tmp0 = t[i1]; 3956 tmp1 = t[i2]; 3957 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3958 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3959 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3960 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3961 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3962 } 3963 3964 if (n == sz-1) { 3965 tmp0 = t[*idx]; 3966 sum1 -= v1[0] * tmp0; 3967 sum2 -= v2[0] * tmp0; 3968 sum3 -= v3[0] * tmp0; 3969 sum4 -= v4[0] * tmp0; 3970 sum5 -= v5[0] * tmp0; 3971 } 3972 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3973 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3974 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3975 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3976 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3977 ibdiag += 25; row += 5; 3978 break; 3979 default: 3980 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3981 } 3982 } 3983 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3984 } 3985 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3986 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3987 PetscFunctionReturn(0); 3988 } 3989 3990 #undef __FUNCT__ 3991 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3992 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3993 { 3994 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3995 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3996 const MatScalar *bdiag = a->inode.bdiag; 3997 const PetscScalar *b; 3998 PetscErrorCode ierr; 3999 PetscInt m = a->inode.node_count,cnt = 0,i,row; 4000 const PetscInt *sizes = a->inode.size; 4001 4002 PetscFunctionBegin; 4003 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4004 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4005 cnt = 0; 4006 for (i=0, row=0; i<m; i++) { 4007 switch (sizes[i]) { 4008 case 1: 4009 x[row] = b[row]*bdiag[cnt++];row++; 4010 break; 4011 case 2: 4012 x1 = b[row]; x2 = b[row+1]; 4013 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 4014 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 4015 x[row++] = tmp1; 4016 x[row++] = tmp2; 4017 cnt += 4; 4018 break; 4019 case 3: 4020 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 4021 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 4022 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 4023 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 4024 x[row++] = tmp1; 4025 x[row++] = tmp2; 4026 x[row++] = tmp3; 4027 cnt += 9; 4028 break; 4029 case 4: 4030 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 4031 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 4032 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 4033 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 4034 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 4035 x[row++] = tmp1; 4036 x[row++] = tmp2; 4037 x[row++] = tmp3; 4038 x[row++] = tmp4; 4039 cnt += 16; 4040 break; 4041 case 5: 4042 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 4043 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 4044 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 4045 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 4046 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4047 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4048 x[row++] = tmp1; 4049 x[row++] = tmp2; 4050 x[row++] = tmp3; 4051 x[row++] = tmp4; 4052 x[row++] = tmp5; 4053 cnt += 25; 4054 break; 4055 default: 4056 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 4057 } 4058 } 4059 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 4060 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4061 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4062 PetscFunctionReturn(0); 4063 } 4064 4065 /* 4066 samestructure indicates that the matrix has not changed its nonzero structure so we 4067 do not need to recompute the inodes 4068 */ 4069 #undef __FUNCT__ 4070 #define __FUNCT__ "MatSeqAIJCheckInode" 4071 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4072 { 4073 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4074 PetscErrorCode ierr; 4075 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4076 PetscBool flag; 4077 const PetscInt *idx,*idy,*ii; 4078 4079 PetscFunctionBegin; 4080 if (!a->inode.use) PetscFunctionReturn(0); 4081 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4082 4083 m = A->rmap->n; 4084 if (a->inode.size) ns = a->inode.size; 4085 else { 4086 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4087 } 4088 4089 i = 0; 4090 node_count = 0; 4091 idx = a->j; 4092 ii = a->i; 4093 while (i < m) { /* For each row */ 4094 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4095 /* Limits the number of elements in a node to 'a->inode.limit' */ 4096 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4097 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4098 if (nzy != nzx) break; 4099 idy += nzx; /* Same nonzero pattern */ 4100 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4101 if (!flag) break; 4102 } 4103 ns[node_count++] = blk_size; 4104 idx += blk_size*nzx; 4105 i = j; 4106 } 4107 /* If not enough inodes found,, do not use inode version of the routines */ 4108 if (!m || node_count > .8*m) { 4109 ierr = PetscFree(ns);CHKERRQ(ierr); 4110 4111 a->inode.node_count = 0; 4112 a->inode.size = NULL; 4113 a->inode.use = PETSC_FALSE; 4114 A->ops->mult = MatMult_SeqAIJ; 4115 A->ops->sor = MatSOR_SeqAIJ; 4116 A->ops->multadd = MatMultAdd_SeqAIJ; 4117 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4118 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4119 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4120 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4121 A->ops->coloringpatch = 0; 4122 A->ops->multdiagonalblock = 0; 4123 4124 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4125 } else { 4126 if (!A->factortype) { 4127 A->ops->mult = MatMult_SeqAIJ_Inode; 4128 A->ops->sor = MatSOR_SeqAIJ_Inode; 4129 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4130 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4131 if (A->rmap->n == A->cmap->n) { 4132 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4133 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4134 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4135 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4136 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4137 } 4138 } else { 4139 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4140 } 4141 a->inode.node_count = node_count; 4142 a->inode.size = ns; 4143 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4144 } 4145 a->inode.checked = PETSC_TRUE; 4146 a->inode.mat_nonzerostate = A->nonzerostate; 4147 PetscFunctionReturn(0); 4148 } 4149 4150 #undef __FUNCT__ 4151 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4152 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4153 { 4154 Mat B =*C; 4155 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4156 PetscErrorCode ierr; 4157 PetscInt m=A->rmap->n; 4158 4159 PetscFunctionBegin; 4160 c->inode.use = a->inode.use; 4161 c->inode.limit = a->inode.limit; 4162 c->inode.max_limit = a->inode.max_limit; 4163 if (a->inode.size) { 4164 ierr = PetscMalloc1(m+1,&c->inode.size);CHKERRQ(ierr); 4165 c->inode.node_count = a->inode.node_count; 4166 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4167 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4168 if (!B->factortype) { 4169 B->ops->mult = MatMult_SeqAIJ_Inode; 4170 B->ops->sor = MatSOR_SeqAIJ_Inode; 4171 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4172 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4173 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4174 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4175 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4176 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4177 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4178 } else { 4179 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4180 } 4181 } else { 4182 c->inode.size = 0; 4183 c->inode.node_count = 0; 4184 } 4185 c->inode.ibdiagvalid = PETSC_FALSE; 4186 c->inode.ibdiag = 0; 4187 c->inode.bdiag = 0; 4188 PetscFunctionReturn(0); 4189 } 4190 4191 #undef __FUNCT__ 4192 #define __FUNCT__ "MatGetRow_FactoredLU" 4193 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4194 { 4195 PetscInt k; 4196 const PetscInt *vi; 4197 4198 PetscFunctionBegin; 4199 vi = aj + ai[row]; 4200 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4201 vi = aj + adiag[row]; 4202 cols[nzl] = vi[0]; 4203 vi = aj + adiag[row+1]+1; 4204 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4205 PetscFunctionReturn(0); 4206 } 4207 /* 4208 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4209 Modified from MatSeqAIJCheckInode(). 4210 4211 Input Parameters: 4212 . Mat A - ILU or LU matrix factor 4213 4214 */ 4215 #undef __FUNCT__ 4216 #define __FUNCT__ "MatSeqAIJCheckInode_FactorLU" 4217 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4218 { 4219 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4220 PetscErrorCode ierr; 4221 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4222 PetscInt *cols1,*cols2,*ns; 4223 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4224 PetscBool flag; 4225 4226 PetscFunctionBegin; 4227 if (!a->inode.use) PetscFunctionReturn(0); 4228 if (a->inode.checked) PetscFunctionReturn(0); 4229 4230 m = A->rmap->n; 4231 if (a->inode.size) ns = a->inode.size; 4232 else { 4233 ierr = PetscMalloc1(m+1,&ns);CHKERRQ(ierr); 4234 } 4235 4236 i = 0; 4237 node_count = 0; 4238 ierr = PetscMalloc2(m,&cols1,m,&cols2);CHKERRQ(ierr); 4239 while (i < m) { /* For each row */ 4240 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4241 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4242 nzx = nzl1 + nzu1 + 1; 4243 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4244 4245 /* Limits the number of elements in a node to 'a->inode.limit' */ 4246 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4247 nzl2 = ai[j+1] - ai[j]; 4248 nzu2 = adiag[j] - adiag[j+1] - 1; 4249 nzy = nzl2 + nzu2 + 1; 4250 if (nzy != nzx) break; 4251 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4252 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4253 if (!flag) break; 4254 } 4255 ns[node_count++] = blk_size; 4256 i = j; 4257 } 4258 ierr = PetscFree2(cols1,cols2);CHKERRQ(ierr); 4259 /* If not enough inodes found,, do not use inode version of the routines */ 4260 if (!m || node_count > .8*m) { 4261 ierr = PetscFree(ns);CHKERRQ(ierr); 4262 4263 a->inode.node_count = 0; 4264 a->inode.size = NULL; 4265 a->inode.use = PETSC_FALSE; 4266 4267 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4268 } else { 4269 A->ops->mult = 0; 4270 A->ops->sor = 0; 4271 A->ops->multadd = 0; 4272 A->ops->getrowij = 0; 4273 A->ops->restorerowij = 0; 4274 A->ops->getcolumnij = 0; 4275 A->ops->restorecolumnij = 0; 4276 A->ops->coloringpatch = 0; 4277 A->ops->multdiagonalblock = 0; 4278 a->inode.node_count = node_count; 4279 a->inode.size = ns; 4280 4281 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4282 } 4283 a->inode.checked = PETSC_TRUE; 4284 PetscFunctionReturn(0); 4285 } 4286 4287 #undef __FUNCT__ 4288 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4289 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4290 { 4291 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4292 4293 PetscFunctionBegin; 4294 a->inode.ibdiagvalid = PETSC_FALSE; 4295 PetscFunctionReturn(0); 4296 } 4297 4298 /* 4299 This is really ugly. if inodes are used this replaces the 4300 permutations with ones that correspond to rows/cols of the matrix 4301 rather then inode blocks 4302 */ 4303 #undef __FUNCT__ 4304 #define __FUNCT__ "MatInodeAdjustForInodes" 4305 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4306 { 4307 PetscErrorCode ierr; 4308 4309 PetscFunctionBegin; 4310 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4311 PetscFunctionReturn(0); 4312 } 4313 4314 #undef __FUNCT__ 4315 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4316 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4317 { 4318 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4319 PetscErrorCode ierr; 4320 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4321 const PetscInt *ridx,*cidx; 4322 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4323 PetscInt nslim_col,*ns_col; 4324 IS ris = *rperm,cis = *cperm; 4325 4326 PetscFunctionBegin; 4327 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4328 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4329 4330 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4331 ierr = PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns);CHKERRQ(ierr); 4332 ierr = PetscMalloc2(m,&permr,n,&permc);CHKERRQ(ierr); 4333 4334 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4335 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4336 4337 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4338 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4339 4340 /* Construct the permutations for rows*/ 4341 for (i=0,row = 0; i<nslim_row; ++i) { 4342 indx = ridx[i]; 4343 start_val = tns[indx]; 4344 end_val = tns[indx + 1]; 4345 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4346 } 4347 4348 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4349 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4350 4351 /* Construct permutations for columns */ 4352 for (i=0,col=0; i<nslim_col; ++i) { 4353 indx = cidx[i]; 4354 start_val = tns[indx]; 4355 end_val = tns[indx + 1]; 4356 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4357 } 4358 4359 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4360 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4361 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4362 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4363 4364 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4365 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4366 4367 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4368 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4369 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4370 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4371 ierr = PetscFree(tns);CHKERRQ(ierr); 4372 PetscFunctionReturn(0); 4373 } 4374 4375 #undef __FUNCT__ 4376 #define __FUNCT__ "MatInodeGetInodeSizes" 4377 /*@C 4378 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4379 4380 Not Collective 4381 4382 Input Parameter: 4383 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4384 4385 Output Parameter: 4386 + node_count - no of inodes present in the matrix. 4387 . sizes - an array of size node_count,with sizes of each inode. 4388 - limit - the max size used to generate the inodes. 4389 4390 Level: advanced 4391 4392 Notes: This routine returns some internal storage information 4393 of the matrix, it is intended to be used by advanced users. 4394 It should be called after the matrix is assembled. 4395 The contents of the sizes[] array should not be changed. 4396 NULL may be passed for information not requested. 4397 4398 .keywords: matrix, seqaij, get, inode 4399 4400 .seealso: MatGetInfo() 4401 @*/ 4402 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4403 { 4404 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4405 4406 PetscFunctionBegin; 4407 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4408 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4409 if (f) { 4410 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4411 } 4412 PetscFunctionReturn(0); 4413 } 4414 4415 #undef __FUNCT__ 4416 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4417 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4418 { 4419 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4420 4421 PetscFunctionBegin; 4422 if (node_count) *node_count = a->inode.node_count; 4423 if (sizes) *sizes = a->inode.size; 4424 if (limit) *limit = a->inode.limit; 4425 PetscFunctionReturn(0); 4426 } 4427