1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 static PetscErrorCode MatCreateColInode_Private(Mat A,PetscInt *size,PetscInt **ns) 9 { 10 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 11 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 12 13 PetscFunctionBegin; 14 n = A->cmap->n; 15 m = A->rmap->n; 16 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 17 ns_row = a->inode.size; 18 19 min_mn = (m < n) ? m : n; 20 if (!ns) { 21 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 22 for (; count+1 < n; count++,i++) ; 23 if (count < n) { 24 i++; 25 } 26 *size = i; 27 PetscFunctionReturn(0); 28 } 29 PetscCall(PetscMalloc1(n+1,&ns_col)); 30 31 /* Use the same row structure wherever feasible. */ 32 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 33 ns_col[i] = ns_row[i]; 34 } 35 36 /* if m < n; pad up the remainder with inode_limit */ 37 for (; count+1 < n; count++,i++) { 38 ns_col[i] = 1; 39 } 40 /* The last node is the odd ball. padd it up with the remaining rows; */ 41 if (count < n) { 42 ns_col[i] = n - count; 43 i++; 44 } else if (count > n) { 45 /* Adjust for the over estimation */ 46 ns_col[i-1] += n - count; 47 } 48 *size = i; 49 *ns = ns_col; 50 PetscFunctionReturn(0); 51 } 52 53 /* 54 This builds symmetric version of nonzero structure, 55 */ 56 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 57 { 58 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 59 PetscInt *work,*ia,*ja,nz,nslim_row,nslim_col,m,row,col,n; 60 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2; 61 const PetscInt *j,*jmax,*ai= a->i,*aj = a->j; 62 63 PetscFunctionBegin; 64 nslim_row = a->inode.node_count; 65 m = A->rmap->n; 66 n = A->cmap->n; 67 PetscCheck(m == n,PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 68 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 69 70 /* Use the row_inode as column_inode */ 71 nslim_col = nslim_row; 72 ns_col = ns_row; 73 74 /* allocate space for reformated inode structure */ 75 PetscCall(PetscMalloc2(nslim_col+1,&tns,n+1,&tvc)); 76 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 77 78 for (i1=0,col=0; i1<nslim_col; ++i1) { 79 nsz = ns_col[i1]; 80 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 81 } 82 /* allocate space for row pointers */ 83 PetscCall(PetscCalloc1(nslim_row+1,&ia)); 84 *iia = ia; 85 PetscCall(PetscMalloc1(nslim_row+1,&work)); 86 87 /* determine the number of columns in each row */ 88 ia[0] = oshift; 89 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 90 91 j = aj + ai[row] + ishift; 92 jmax = aj + ai[row+1] + ishift; 93 if (j==jmax) continue; /* empty row */ 94 col = *j++ + ishift; 95 i2 = tvc[col]; 96 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elements */ 97 ia[i1+1]++; 98 ia[i2+1]++; 99 i2++; /* Start col of next node */ 100 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 101 i2 = tvc[col]; 102 } 103 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 104 } 105 106 /* shift ia[i] to point to next row */ 107 for (i1=1; i1<nslim_row+1; i1++) { 108 row = ia[i1-1]; 109 ia[i1] += row; 110 work[i1-1] = row - oshift; 111 } 112 113 /* allocate space for column pointers */ 114 nz = ia[nslim_row] + (!ishift); 115 PetscCall(PetscMalloc1(nz,&ja)); 116 *jja = ja; 117 118 /* loop over lower triangular part putting into ja */ 119 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 120 j = aj + ai[row] + ishift; 121 jmax = aj + ai[row+1] + ishift; 122 if (j==jmax) continue; /* empty row */ 123 col = *j++ + ishift; 124 i2 = tvc[col]; 125 while (i2<i1 && j<jmax) { 126 ja[work[i2]++] = i1 + oshift; 127 ja[work[i1]++] = i2 + oshift; 128 ++i2; 129 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 130 i2 = tvc[col]; 131 } 132 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 133 134 } 135 PetscCall(PetscFree(work)); 136 PetscCall(PetscFree2(tns,tvc)); 137 PetscFunctionReturn(0); 138 } 139 140 /* 141 This builds nonsymmetric version of nonzero structure, 142 */ 143 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 144 { 145 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 146 PetscInt *work,*ia,*ja,nz,nslim_row,n,row,col,*ns_col,nslim_col; 147 PetscInt *tns,*tvc,nsz,i1,i2; 148 const PetscInt *j,*ai= a->i,*aj = a->j,*ns_row = a->inode.size; 149 150 PetscFunctionBegin; 151 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 152 nslim_row = a->inode.node_count; 153 n = A->cmap->n; 154 155 /* Create The column_inode for this matrix */ 156 PetscCall(MatCreateColInode_Private(A,&nslim_col,&ns_col)); 157 158 /* allocate space for reformated column_inode structure */ 159 PetscCall(PetscMalloc2(nslim_col +1,&tns,n + 1,&tvc)); 160 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 161 162 for (i1=0,col=0; i1<nslim_col; ++i1) { 163 nsz = ns_col[i1]; 164 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 165 } 166 /* allocate space for row pointers */ 167 PetscCall(PetscCalloc1(nslim_row+1,&ia)); 168 *iia = ia; 169 PetscCall(PetscMalloc1(nslim_row+1,&work)); 170 171 /* determine the number of columns in each row */ 172 ia[0] = oshift; 173 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 174 j = aj + ai[row] + ishift; 175 nz = ai[row+1] - ai[row]; 176 if (!nz) continue; /* empty row */ 177 col = *j++ + ishift; 178 i2 = tvc[col]; 179 while (nz-- > 0) { /* off-diagonal elements */ 180 ia[i1+1]++; 181 i2++; /* Start col of next node */ 182 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 183 if (nz > 0) i2 = tvc[col]; 184 } 185 } 186 187 /* shift ia[i] to point to next row */ 188 for (i1=1; i1<nslim_row+1; i1++) { 189 row = ia[i1-1]; 190 ia[i1] += row; 191 work[i1-1] = row - oshift; 192 } 193 194 /* allocate space for column pointers */ 195 nz = ia[nslim_row] + (!ishift); 196 PetscCall(PetscMalloc1(nz,&ja)); 197 *jja = ja; 198 199 /* loop over matrix putting into ja */ 200 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 201 j = aj + ai[row] + ishift; 202 nz = ai[row+1] - ai[row]; 203 if (!nz) continue; /* empty row */ 204 col = *j++ + ishift; 205 i2 = tvc[col]; 206 while (nz-- > 0) { 207 ja[work[i1]++] = i2 + oshift; 208 ++i2; 209 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 210 if (nz > 0) i2 = tvc[col]; 211 } 212 } 213 PetscCall(PetscFree(ns_col)); 214 PetscCall(PetscFree(work)); 215 PetscCall(PetscFree2(tns,tvc)); 216 PetscFunctionReturn(0); 217 } 218 219 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 220 { 221 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 222 223 PetscFunctionBegin; 224 *n = a->inode.node_count; 225 if (!ia) PetscFunctionReturn(0); 226 if (!blockcompressed) { 227 PetscCall(MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done)); 228 } else if (symmetric) { 229 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift)); 230 } else { 231 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift)); 232 } 233 PetscFunctionReturn(0); 234 } 235 236 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 237 { 238 PetscFunctionBegin; 239 if (!ia) PetscFunctionReturn(0); 240 241 if (!blockcompressed) { 242 PetscCall(MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done)); 243 } else { 244 PetscCall(PetscFree(*ia)); 245 PetscCall(PetscFree(*ja)); 246 } 247 PetscFunctionReturn(0); 248 } 249 250 /* ----------------------------------------------------------- */ 251 252 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 253 { 254 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 255 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 256 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 257 258 PetscFunctionBegin; 259 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 260 nslim_row = a->inode.node_count; 261 n = A->cmap->n; 262 263 /* Create The column_inode for this matrix */ 264 PetscCall(MatCreateColInode_Private(A,&nslim_col,&ns_col)); 265 266 /* allocate space for reformated column_inode structure */ 267 PetscCall(PetscMalloc2(nslim_col + 1,&tns,n + 1,&tvc)); 268 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 269 270 for (i1=0,col=0; i1<nslim_col; ++i1) { 271 nsz = ns_col[i1]; 272 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 273 } 274 /* allocate space for column pointers */ 275 PetscCall(PetscCalloc1(nslim_col+1,&ia)); 276 *iia = ia; 277 PetscCall(PetscMalloc1(nslim_col+1,&work)); 278 279 /* determine the number of columns in each row */ 280 ia[0] = oshift; 281 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 282 j = aj + ai[row] + ishift; 283 col = *j++ + ishift; 284 i2 = tvc[col]; 285 nz = ai[row+1] - ai[row]; 286 while (nz-- > 0) { /* off-diagonal elements */ 287 /* ia[i1+1]++; */ 288 ia[i2+1]++; 289 i2++; 290 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 291 if (nz > 0) i2 = tvc[col]; 292 } 293 } 294 295 /* shift ia[i] to point to next col */ 296 for (i1=1; i1<nslim_col+1; i1++) { 297 col = ia[i1-1]; 298 ia[i1] += col; 299 work[i1-1] = col - oshift; 300 } 301 302 /* allocate space for column pointers */ 303 nz = ia[nslim_col] + (!ishift); 304 PetscCall(PetscMalloc1(nz,&ja)); 305 *jja = ja; 306 307 /* loop over matrix putting into ja */ 308 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 309 j = aj + ai[row] + ishift; 310 col = *j++ + ishift; 311 i2 = tvc[col]; 312 nz = ai[row+1] - ai[row]; 313 while (nz-- > 0) { 314 /* ja[work[i1]++] = i2 + oshift; */ 315 ja[work[i2]++] = i1 + oshift; 316 i2++; 317 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 318 if (nz > 0) i2 = tvc[col]; 319 } 320 } 321 PetscCall(PetscFree(ns_col)); 322 PetscCall(PetscFree(work)); 323 PetscCall(PetscFree2(tns,tvc)); 324 PetscFunctionReturn(0); 325 } 326 327 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 328 { 329 PetscFunctionBegin; 330 PetscCall(MatCreateColInode_Private(A,n,NULL)); 331 if (!ia) PetscFunctionReturn(0); 332 333 if (!blockcompressed) { 334 PetscCall(MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done)); 335 } else if (symmetric) { 336 /* Since the indices are symmetric it doesn't matter */ 337 PetscCall(MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift)); 338 } else { 339 PetscCall(MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift)); 340 } 341 PetscFunctionReturn(0); 342 } 343 344 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 345 { 346 PetscFunctionBegin; 347 if (!ia) PetscFunctionReturn(0); 348 if (!blockcompressed) { 349 PetscCall(MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done)); 350 } else { 351 PetscCall(PetscFree(*ia)); 352 PetscCall(PetscFree(*ja)); 353 } 354 PetscFunctionReturn(0); 355 } 356 357 /* ----------------------------------------------------------- */ 358 359 PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 360 { 361 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 362 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 363 PetscScalar *y; 364 const PetscScalar *x; 365 const MatScalar *v1,*v2,*v3,*v4,*v5; 366 PetscInt i1,i2,n,i,row,node_max,nsz,sz,nonzerorow=0; 367 const PetscInt *idx,*ns,*ii; 368 369 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 370 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 371 #endif 372 373 PetscFunctionBegin; 374 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 375 node_max = a->inode.node_count; 376 ns = a->inode.size; /* Node Size array */ 377 PetscCall(VecGetArrayRead(xx,&x)); 378 PetscCall(VecGetArray(yy,&y)); 379 idx = a->j; 380 v1 = a->a; 381 ii = a->i; 382 383 for (i = 0,row = 0; i< node_max; ++i) { 384 nsz = ns[i]; 385 n = ii[1] - ii[0]; 386 nonzerorow += (n>0)*nsz; 387 ii += nsz; 388 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 389 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 390 sz = n; /* No of non zeros in this row */ 391 /* Switch on the size of Node */ 392 switch (nsz) { /* Each loop in 'case' is unrolled */ 393 case 1: 394 sum1 = 0.; 395 396 for (n = 0; n< sz-1; n+=2) { 397 i1 = idx[0]; /* The instructions are ordered to */ 398 i2 = idx[1]; /* make the compiler's job easy */ 399 idx += 2; 400 tmp0 = x[i1]; 401 tmp1 = x[i2]; 402 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 403 } 404 405 if (n == sz-1) { /* Take care of the last nonzero */ 406 tmp0 = x[*idx++]; 407 sum1 += *v1++ *tmp0; 408 } 409 y[row++]=sum1; 410 break; 411 case 2: 412 sum1 = 0.; 413 sum2 = 0.; 414 v2 = v1 + n; 415 416 for (n = 0; n< sz-1; n+=2) { 417 i1 = idx[0]; 418 i2 = idx[1]; 419 idx += 2; 420 tmp0 = x[i1]; 421 tmp1 = x[i2]; 422 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 423 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 424 } 425 if (n == sz-1) { 426 tmp0 = x[*idx++]; 427 sum1 += *v1++ * tmp0; 428 sum2 += *v2++ * tmp0; 429 } 430 y[row++]=sum1; 431 y[row++]=sum2; 432 v1 =v2; /* Since the next block to be processed starts there*/ 433 idx +=sz; 434 break; 435 case 3: 436 sum1 = 0.; 437 sum2 = 0.; 438 sum3 = 0.; 439 v2 = v1 + n; 440 v3 = v2 + n; 441 442 for (n = 0; n< sz-1; n+=2) { 443 i1 = idx[0]; 444 i2 = idx[1]; 445 idx += 2; 446 tmp0 = x[i1]; 447 tmp1 = x[i2]; 448 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 449 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 450 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 451 } 452 if (n == sz-1) { 453 tmp0 = x[*idx++]; 454 sum1 += *v1++ * tmp0; 455 sum2 += *v2++ * tmp0; 456 sum3 += *v3++ * tmp0; 457 } 458 y[row++]=sum1; 459 y[row++]=sum2; 460 y[row++]=sum3; 461 v1 =v3; /* Since the next block to be processed starts there*/ 462 idx +=2*sz; 463 break; 464 case 4: 465 sum1 = 0.; 466 sum2 = 0.; 467 sum3 = 0.; 468 sum4 = 0.; 469 v2 = v1 + n; 470 v3 = v2 + n; 471 v4 = v3 + n; 472 473 for (n = 0; n< sz-1; n+=2) { 474 i1 = idx[0]; 475 i2 = idx[1]; 476 idx += 2; 477 tmp0 = x[i1]; 478 tmp1 = x[i2]; 479 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 480 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 481 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 482 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 sum4 += *v4++ * tmp0; 490 } 491 y[row++]=sum1; 492 y[row++]=sum2; 493 y[row++]=sum3; 494 y[row++]=sum4; 495 v1 =v4; /* Since the next block to be processed starts there*/ 496 idx +=3*sz; 497 break; 498 case 5: 499 sum1 = 0.; 500 sum2 = 0.; 501 sum3 = 0.; 502 sum4 = 0.; 503 sum5 = 0.; 504 v2 = v1 + n; 505 v3 = v2 + n; 506 v4 = v3 + n; 507 v5 = v4 + n; 508 509 for (n = 0; n<sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 520 } 521 if (n == sz-1) { 522 tmp0 = x[*idx++]; 523 sum1 += *v1++ * tmp0; 524 sum2 += *v2++ * tmp0; 525 sum3 += *v3++ * tmp0; 526 sum4 += *v4++ * tmp0; 527 sum5 += *v5++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 y[row++]=sum5; 534 v1 =v5; /* Since the next block to be processed starts there */ 535 idx +=4*sz; 536 break; 537 default: 538 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 539 } 540 } 541 PetscCall(VecRestoreArrayRead(xx,&x)); 542 PetscCall(VecRestoreArray(yy,&y)); 543 PetscCall(PetscLogFlops(2.0*a->nz - nonzerorow)); 544 PetscFunctionReturn(0); 545 } 546 /* ----------------------------------------------------------- */ 547 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 548 PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 549 { 550 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 551 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 552 const MatScalar *v1,*v2,*v3,*v4,*v5; 553 const PetscScalar *x; 554 PetscScalar *y,*z,*zt; 555 PetscInt i1,i2,n,i,row,node_max,nsz,sz; 556 const PetscInt *idx,*ns,*ii; 557 558 PetscFunctionBegin; 559 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 560 node_max = a->inode.node_count; 561 ns = a->inode.size; /* Node Size array */ 562 563 PetscCall(VecGetArrayRead(xx,&x)); 564 PetscCall(VecGetArrayPair(zz,yy,&z,&y)); 565 zt = z; 566 567 idx = a->j; 568 v1 = a->a; 569 ii = a->i; 570 571 for (i = 0,row = 0; i< node_max; ++i) { 572 nsz = ns[i]; 573 n = ii[1] - ii[0]; 574 ii += nsz; 575 sz = n; /* No of non zeros in this row */ 576 /* Switch on the size of Node */ 577 switch (nsz) { /* Each loop in 'case' is unrolled */ 578 case 1: 579 sum1 = *zt++; 580 581 for (n = 0; n< sz-1; n+=2) { 582 i1 = idx[0]; /* The instructions are ordered to */ 583 i2 = idx[1]; /* make the compiler's job easy */ 584 idx += 2; 585 tmp0 = x[i1]; 586 tmp1 = x[i2]; 587 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 588 } 589 590 if (n == sz-1) { /* Take care of the last nonzero */ 591 tmp0 = x[*idx++]; 592 sum1 += *v1++ * tmp0; 593 } 594 y[row++]=sum1; 595 break; 596 case 2: 597 sum1 = *zt++; 598 sum2 = *zt++; 599 v2 = v1 + n; 600 601 for (n = 0; n< sz-1; n+=2) { 602 i1 = idx[0]; 603 i2 = idx[1]; 604 idx += 2; 605 tmp0 = x[i1]; 606 tmp1 = x[i2]; 607 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 608 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 609 } 610 if (n == sz-1) { 611 tmp0 = x[*idx++]; 612 sum1 += *v1++ * tmp0; 613 sum2 += *v2++ * tmp0; 614 } 615 y[row++]=sum1; 616 y[row++]=sum2; 617 v1 =v2; /* Since the next block to be processed starts there*/ 618 idx +=sz; 619 break; 620 case 3: 621 sum1 = *zt++; 622 sum2 = *zt++; 623 sum3 = *zt++; 624 v2 = v1 + n; 625 v3 = v2 + n; 626 627 for (n = 0; n< sz-1; n+=2) { 628 i1 = idx[0]; 629 i2 = idx[1]; 630 idx += 2; 631 tmp0 = x[i1]; 632 tmp1 = x[i2]; 633 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 634 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 635 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 636 } 637 if (n == sz-1) { 638 tmp0 = x[*idx++]; 639 sum1 += *v1++ * tmp0; 640 sum2 += *v2++ * tmp0; 641 sum3 += *v3++ * tmp0; 642 } 643 y[row++]=sum1; 644 y[row++]=sum2; 645 y[row++]=sum3; 646 v1 =v3; /* Since the next block to be processed starts there*/ 647 idx +=2*sz; 648 break; 649 case 4: 650 sum1 = *zt++; 651 sum2 = *zt++; 652 sum3 = *zt++; 653 sum4 = *zt++; 654 v2 = v1 + n; 655 v3 = v2 + n; 656 v4 = v3 + n; 657 658 for (n = 0; n< sz-1; n+=2) { 659 i1 = idx[0]; 660 i2 = idx[1]; 661 idx += 2; 662 tmp0 = x[i1]; 663 tmp1 = x[i2]; 664 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 665 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 666 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 667 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 668 } 669 if (n == sz-1) { 670 tmp0 = x[*idx++]; 671 sum1 += *v1++ * tmp0; 672 sum2 += *v2++ * tmp0; 673 sum3 += *v3++ * tmp0; 674 sum4 += *v4++ * tmp0; 675 } 676 y[row++]=sum1; 677 y[row++]=sum2; 678 y[row++]=sum3; 679 y[row++]=sum4; 680 v1 =v4; /* Since the next block to be processed starts there*/ 681 idx +=3*sz; 682 break; 683 case 5: 684 sum1 = *zt++; 685 sum2 = *zt++; 686 sum3 = *zt++; 687 sum4 = *zt++; 688 sum5 = *zt++; 689 v2 = v1 + n; 690 v3 = v2 + n; 691 v4 = v3 + n; 692 v5 = v4 + n; 693 694 for (n = 0; n<sz-1; n+=2) { 695 i1 = idx[0]; 696 i2 = idx[1]; 697 idx += 2; 698 tmp0 = x[i1]; 699 tmp1 = x[i2]; 700 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 701 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 702 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 703 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 704 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 705 } 706 if (n == sz-1) { 707 tmp0 = x[*idx++]; 708 sum1 += *v1++ * tmp0; 709 sum2 += *v2++ * tmp0; 710 sum3 += *v3++ * tmp0; 711 sum4 += *v4++ * tmp0; 712 sum5 += *v5++ * tmp0; 713 } 714 y[row++]=sum1; 715 y[row++]=sum2; 716 y[row++]=sum3; 717 y[row++]=sum4; 718 y[row++]=sum5; 719 v1 =v5; /* Since the next block to be processed starts there */ 720 idx +=4*sz; 721 break; 722 default: 723 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 724 } 725 } 726 PetscCall(VecRestoreArrayRead(xx,&x)); 727 PetscCall(VecRestoreArrayPair(zz,yy,&z,&y)); 728 PetscCall(PetscLogFlops(2.0*a->nz)); 729 PetscFunctionReturn(0); 730 } 731 732 /* ----------------------------------------------------------- */ 733 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 734 { 735 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 736 IS iscol = a->col,isrow = a->row; 737 const PetscInt *r,*c,*rout,*cout; 738 PetscInt i,j,n = A->rmap->n,nz; 739 PetscInt node_max,*ns,row,nsz,aii,i0,i1; 740 const PetscInt *ai = a->i,*a_j = a->j,*vi,*ad,*aj; 741 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 742 PetscScalar sum1,sum2,sum3,sum4,sum5; 743 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 744 const PetscScalar *b; 745 746 PetscFunctionBegin; 747 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 748 node_max = a->inode.node_count; 749 ns = a->inode.size; /* Node Size array */ 750 751 PetscCall(VecGetArrayRead(bb,&b)); 752 PetscCall(VecGetArrayWrite(xx,&x)); 753 tmp = a->solve_work; 754 755 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 756 PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1); 757 758 /* forward solve the lower triangular */ 759 tmps = tmp; 760 aa = a_a; 761 aj = a_j; 762 ad = a->diag; 763 764 for (i = 0,row = 0; i< node_max; ++i) { 765 nsz = ns[i]; 766 aii = ai[row]; 767 v1 = aa + aii; 768 vi = aj + aii; 769 nz = ad[row]- aii; 770 if (i < node_max-1) { 771 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 772 * but our indexing to determine it's size could. */ 773 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 774 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 775 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 776 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 777 } 778 779 switch (nsz) { /* Each loop in 'case' is unrolled */ 780 case 1: 781 sum1 = b[*r++]; 782 for (j=0; j<nz-1; j+=2) { 783 i0 = vi[0]; 784 i1 = vi[1]; 785 vi +=2; 786 tmp0 = tmps[i0]; 787 tmp1 = tmps[i1]; 788 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 789 } 790 if (j == nz-1) { 791 tmp0 = tmps[*vi++]; 792 sum1 -= *v1++ *tmp0; 793 } 794 tmp[row++]=sum1; 795 break; 796 case 2: 797 sum1 = b[*r++]; 798 sum2 = b[*r++]; 799 v2 = aa + ai[row+1]; 800 801 for (j=0; j<nz-1; j+=2) { 802 i0 = vi[0]; 803 i1 = vi[1]; 804 vi +=2; 805 tmp0 = tmps[i0]; 806 tmp1 = tmps[i1]; 807 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 808 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 809 } 810 if (j == nz-1) { 811 tmp0 = tmps[*vi++]; 812 sum1 -= *v1++ *tmp0; 813 sum2 -= *v2++ *tmp0; 814 } 815 sum2 -= *v2++ *sum1; 816 tmp[row++]=sum1; 817 tmp[row++]=sum2; 818 break; 819 case 3: 820 sum1 = b[*r++]; 821 sum2 = b[*r++]; 822 sum3 = b[*r++]; 823 v2 = aa + ai[row+1]; 824 v3 = aa + ai[row+2]; 825 826 for (j=0; j<nz-1; j+=2) { 827 i0 = vi[0]; 828 i1 = vi[1]; 829 vi +=2; 830 tmp0 = tmps[i0]; 831 tmp1 = tmps[i1]; 832 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 833 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 834 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 835 } 836 if (j == nz-1) { 837 tmp0 = tmps[*vi++]; 838 sum1 -= *v1++ *tmp0; 839 sum2 -= *v2++ *tmp0; 840 sum3 -= *v3++ *tmp0; 841 } 842 sum2 -= *v2++ * sum1; 843 sum3 -= *v3++ * sum1; 844 sum3 -= *v3++ * sum2; 845 846 tmp[row++]=sum1; 847 tmp[row++]=sum2; 848 tmp[row++]=sum3; 849 break; 850 851 case 4: 852 sum1 = b[*r++]; 853 sum2 = b[*r++]; 854 sum3 = b[*r++]; 855 sum4 = b[*r++]; 856 v2 = aa + ai[row+1]; 857 v3 = aa + ai[row+2]; 858 v4 = aa + ai[row+3]; 859 860 for (j=0; j<nz-1; j+=2) { 861 i0 = vi[0]; 862 i1 = vi[1]; 863 vi +=2; 864 tmp0 = tmps[i0]; 865 tmp1 = tmps[i1]; 866 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 867 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 868 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 869 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 870 } 871 if (j == nz-1) { 872 tmp0 = tmps[*vi++]; 873 sum1 -= *v1++ *tmp0; 874 sum2 -= *v2++ *tmp0; 875 sum3 -= *v3++ *tmp0; 876 sum4 -= *v4++ *tmp0; 877 } 878 sum2 -= *v2++ * sum1; 879 sum3 -= *v3++ * sum1; 880 sum4 -= *v4++ * sum1; 881 sum3 -= *v3++ * sum2; 882 sum4 -= *v4++ * sum2; 883 sum4 -= *v4++ * sum3; 884 885 tmp[row++]=sum1; 886 tmp[row++]=sum2; 887 tmp[row++]=sum3; 888 tmp[row++]=sum4; 889 break; 890 case 5: 891 sum1 = b[*r++]; 892 sum2 = b[*r++]; 893 sum3 = b[*r++]; 894 sum4 = b[*r++]; 895 sum5 = b[*r++]; 896 v2 = aa + ai[row+1]; 897 v3 = aa + ai[row+2]; 898 v4 = aa + ai[row+3]; 899 v5 = aa + ai[row+4]; 900 901 for (j=0; j<nz-1; j+=2) { 902 i0 = vi[0]; 903 i1 = vi[1]; 904 vi +=2; 905 tmp0 = tmps[i0]; 906 tmp1 = tmps[i1]; 907 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 908 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 909 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 910 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 911 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 912 } 913 if (j == nz-1) { 914 tmp0 = tmps[*vi++]; 915 sum1 -= *v1++ *tmp0; 916 sum2 -= *v2++ *tmp0; 917 sum3 -= *v3++ *tmp0; 918 sum4 -= *v4++ *tmp0; 919 sum5 -= *v5++ *tmp0; 920 } 921 922 sum2 -= *v2++ * sum1; 923 sum3 -= *v3++ * sum1; 924 sum4 -= *v4++ * sum1; 925 sum5 -= *v5++ * sum1; 926 sum3 -= *v3++ * sum2; 927 sum4 -= *v4++ * sum2; 928 sum5 -= *v5++ * sum2; 929 sum4 -= *v4++ * sum3; 930 sum5 -= *v5++ * sum3; 931 sum5 -= *v5++ * sum4; 932 933 tmp[row++]=sum1; 934 tmp[row++]=sum2; 935 tmp[row++]=sum3; 936 tmp[row++]=sum4; 937 tmp[row++]=sum5; 938 break; 939 default: 940 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported "); 941 } 942 } 943 /* backward solve the upper triangular */ 944 for (i=node_max -1,row = n-1; i>=0; i--) { 945 nsz = ns[i]; 946 aii = ai[row+1] -1; 947 v1 = aa + aii; 948 vi = aj + aii; 949 nz = aii- ad[row]; 950 switch (nsz) { /* Each loop in 'case' is unrolled */ 951 case 1: 952 sum1 = tmp[row]; 953 954 for (j=nz; j>1; j-=2) { 955 vi -=2; 956 i0 = vi[2]; 957 i1 = vi[1]; 958 tmp0 = tmps[i0]; 959 tmp1 = tmps[i1]; 960 v1 -= 2; 961 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 962 } 963 if (j==1) { 964 tmp0 = tmps[*vi--]; 965 sum1 -= *v1-- * tmp0; 966 } 967 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 968 break; 969 case 2: 970 sum1 = tmp[row]; 971 sum2 = tmp[row -1]; 972 v2 = aa + ai[row]-1; 973 for (j=nz; j>1; j-=2) { 974 vi -=2; 975 i0 = vi[2]; 976 i1 = vi[1]; 977 tmp0 = tmps[i0]; 978 tmp1 = tmps[i1]; 979 v1 -= 2; 980 v2 -= 2; 981 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 982 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 983 } 984 if (j==1) { 985 tmp0 = tmps[*vi--]; 986 sum1 -= *v1-- * tmp0; 987 sum2 -= *v2-- * tmp0; 988 } 989 990 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 991 sum2 -= *v2-- * tmp0; 992 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 993 break; 994 case 3: 995 sum1 = tmp[row]; 996 sum2 = tmp[row -1]; 997 sum3 = tmp[row -2]; 998 v2 = aa + ai[row]-1; 999 v3 = aa + ai[row -1]-1; 1000 for (j=nz; j>1; j-=2) { 1001 vi -=2; 1002 i0 = vi[2]; 1003 i1 = vi[1]; 1004 tmp0 = tmps[i0]; 1005 tmp1 = tmps[i1]; 1006 v1 -= 2; 1007 v2 -= 2; 1008 v3 -= 2; 1009 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1010 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1011 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1012 } 1013 if (j==1) { 1014 tmp0 = tmps[*vi--]; 1015 sum1 -= *v1-- * tmp0; 1016 sum2 -= *v2-- * tmp0; 1017 sum3 -= *v3-- * tmp0; 1018 } 1019 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1020 sum2 -= *v2-- * tmp0; 1021 sum3 -= *v3-- * tmp0; 1022 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1023 sum3 -= *v3-- * tmp0; 1024 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1025 1026 break; 1027 case 4: 1028 sum1 = tmp[row]; 1029 sum2 = tmp[row -1]; 1030 sum3 = tmp[row -2]; 1031 sum4 = tmp[row -3]; 1032 v2 = aa + ai[row]-1; 1033 v3 = aa + ai[row -1]-1; 1034 v4 = aa + ai[row -2]-1; 1035 1036 for (j=nz; j>1; j-=2) { 1037 vi -=2; 1038 i0 = vi[2]; 1039 i1 = vi[1]; 1040 tmp0 = tmps[i0]; 1041 tmp1 = tmps[i1]; 1042 v1 -= 2; 1043 v2 -= 2; 1044 v3 -= 2; 1045 v4 -= 2; 1046 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1047 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1048 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1049 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1050 } 1051 if (j==1) { 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 sum4 -= *v4-- * tmp0; 1057 } 1058 1059 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1060 sum2 -= *v2-- * tmp0; 1061 sum3 -= *v3-- * tmp0; 1062 sum4 -= *v4-- * tmp0; 1063 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1064 sum3 -= *v3-- * tmp0; 1065 sum4 -= *v4-- * tmp0; 1066 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1067 sum4 -= *v4-- * tmp0; 1068 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1069 break; 1070 case 5: 1071 sum1 = tmp[row]; 1072 sum2 = tmp[row -1]; 1073 sum3 = tmp[row -2]; 1074 sum4 = tmp[row -3]; 1075 sum5 = tmp[row -4]; 1076 v2 = aa + ai[row]-1; 1077 v3 = aa + ai[row -1]-1; 1078 v4 = aa + ai[row -2]-1; 1079 v5 = aa + ai[row -3]-1; 1080 for (j=nz; j>1; j-=2) { 1081 vi -= 2; 1082 i0 = vi[2]; 1083 i1 = vi[1]; 1084 tmp0 = tmps[i0]; 1085 tmp1 = tmps[i1]; 1086 v1 -= 2; 1087 v2 -= 2; 1088 v3 -= 2; 1089 v4 -= 2; 1090 v5 -= 2; 1091 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1092 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1093 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1094 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1095 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1096 } 1097 if (j==1) { 1098 tmp0 = tmps[*vi--]; 1099 sum1 -= *v1-- * tmp0; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 sum5 -= *v5-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 sum5 -= *v5-- * tmp0; 1111 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1112 sum3 -= *v3-- * tmp0; 1113 sum4 -= *v4-- * tmp0; 1114 sum5 -= *v5-- * tmp0; 1115 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1116 sum4 -= *v4-- * tmp0; 1117 sum5 -= *v5-- * tmp0; 1118 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1119 sum5 -= *v5-- * tmp0; 1120 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1121 break; 1122 default: 1123 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported "); 1124 } 1125 } 1126 PetscCall(ISRestoreIndices(isrow,&rout)); 1127 PetscCall(ISRestoreIndices(iscol,&cout)); 1128 PetscCall(VecRestoreArrayRead(bb,&b)); 1129 PetscCall(VecRestoreArrayWrite(xx,&x)); 1130 PetscCall(PetscLogFlops(2.0*a->nz - A->cmap->n)); 1131 PetscFunctionReturn(0); 1132 } 1133 1134 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1135 { 1136 Mat C =B; 1137 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1138 IS isrow = b->row,isicol = b->icol; 1139 const PetscInt *r,*ic,*ics; 1140 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1141 PetscInt i,j,k,nz,nzL,row,*pj; 1142 const PetscInt *ajtmp,*bjtmp; 1143 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1144 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1145 FactorShiftCtx sctx; 1146 const PetscInt *ddiag; 1147 PetscReal rs; 1148 MatScalar d; 1149 PetscInt inod,nodesz,node_max,col; 1150 const PetscInt *ns; 1151 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1152 1153 PetscFunctionBegin; 1154 /* MatPivotSetUp(): initialize shift context sctx */ 1155 PetscCall(PetscMemzero(&sctx,sizeof(FactorShiftCtx))); 1156 1157 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1158 ddiag = a->diag; 1159 sctx.shift_top = info->zeropivot; 1160 for (i=0; i<n; i++) { 1161 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1162 d = (aa)[ddiag[i]]; 1163 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1164 v = aa+ai[i]; 1165 nz = ai[i+1] - ai[i]; 1166 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1167 if (rs>sctx.shift_top) sctx.shift_top = rs; 1168 } 1169 sctx.shift_top *= 1.1; 1170 sctx.nshift_max = 5; 1171 sctx.shift_lo = 0.; 1172 sctx.shift_hi = 1.; 1173 } 1174 1175 PetscCall(ISGetIndices(isrow,&r)); 1176 PetscCall(ISGetIndices(isicol,&ic)); 1177 1178 PetscCall(PetscCalloc4(n,&rtmp1,n,&rtmp2,n,&rtmp3,n,&rtmp4)); 1179 ics = ic; 1180 1181 node_max = a->inode.node_count; 1182 ns = a->inode.size; 1183 PetscCheck(ns,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1184 1185 /* If max inode size > 4, split it into two inodes.*/ 1186 /* also map the inode sizes according to the ordering */ 1187 PetscCall(PetscMalloc1(n+1,&tmp_vec1)); 1188 for (i=0,j=0; i<node_max; ++i,++j) { 1189 if (ns[i] > 4) { 1190 tmp_vec1[j] = 4; 1191 ++j; 1192 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1193 } else { 1194 tmp_vec1[j] = ns[i]; 1195 } 1196 } 1197 /* Use the correct node_max */ 1198 node_max = j; 1199 1200 /* Now reorder the inode info based on mat re-ordering info */ 1201 /* First create a row -> inode_size_array_index map */ 1202 PetscCall(PetscMalloc1(n+1,&nsmap)); 1203 PetscCall(PetscMalloc1(node_max+1,&tmp_vec2)); 1204 for (i=0,row=0; i<node_max; i++) { 1205 nodesz = tmp_vec1[i]; 1206 for (j=0; j<nodesz; j++,row++) { 1207 nsmap[row] = i; 1208 } 1209 } 1210 /* Using nsmap, create a reordered ns structure */ 1211 for (i=0,j=0; i< node_max; i++) { 1212 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1213 tmp_vec2[i] = nodesz; 1214 j += nodesz; 1215 } 1216 PetscCall(PetscFree(nsmap)); 1217 PetscCall(PetscFree(tmp_vec1)); 1218 1219 /* Now use the correct ns */ 1220 ns = tmp_vec2; 1221 1222 do { 1223 sctx.newshift = PETSC_FALSE; 1224 /* Now loop over each block-row, and do the factorization */ 1225 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1226 nodesz = ns[inod]; 1227 1228 switch (nodesz) { 1229 case 1: 1230 /*----------*/ 1231 /* zero rtmp1 */ 1232 /* L part */ 1233 nz = bi[i+1] - bi[i]; 1234 bjtmp = bj + bi[i]; 1235 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1236 1237 /* U part */ 1238 nz = bdiag[i]-bdiag[i+1]; 1239 bjtmp = bj + bdiag[i+1]+1; 1240 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1241 1242 /* load in initial (unfactored row) */ 1243 nz = ai[r[i]+1] - ai[r[i]]; 1244 ajtmp = aj + ai[r[i]]; 1245 v = aa + ai[r[i]]; 1246 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1247 1248 /* ZeropivotApply() */ 1249 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1250 1251 /* elimination */ 1252 bjtmp = bj + bi[i]; 1253 row = *bjtmp++; 1254 nzL = bi[i+1] - bi[i]; 1255 for (k=0; k < nzL; k++) { 1256 pc = rtmp1 + row; 1257 if (*pc != 0.0) { 1258 pv = b->a + bdiag[row]; 1259 mul1 = *pc * (*pv); 1260 *pc = mul1; 1261 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1262 pv = b->a + bdiag[row+1]+1; 1263 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1264 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1265 PetscCall(PetscLogFlops(1+2.0*nz)); 1266 } 1267 row = *bjtmp++; 1268 } 1269 1270 /* finished row so stick it into b->a */ 1271 rs = 0.0; 1272 /* L part */ 1273 pv = b->a + bi[i]; 1274 pj = b->j + bi[i]; 1275 nz = bi[i+1] - bi[i]; 1276 for (j=0; j<nz; j++) { 1277 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1278 } 1279 1280 /* U part */ 1281 pv = b->a + bdiag[i+1]+1; 1282 pj = b->j + bdiag[i+1]+1; 1283 nz = bdiag[i] - bdiag[i+1]-1; 1284 for (j=0; j<nz; j++) { 1285 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1286 } 1287 1288 /* Check zero pivot */ 1289 sctx.rs = rs; 1290 sctx.pv = rtmp1[i]; 1291 PetscCall(MatPivotCheck(B,A,info,&sctx,i)); 1292 if (sctx.newshift) break; 1293 1294 /* Mark diagonal and invert diagonal for simpler triangular solves */ 1295 pv = b->a + bdiag[i]; 1296 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1297 break; 1298 1299 case 2: 1300 /*----------*/ 1301 /* zero rtmp1 and rtmp2 */ 1302 /* L part */ 1303 nz = bi[i+1] - bi[i]; 1304 bjtmp = bj + bi[i]; 1305 for (j=0; j<nz; j++) { 1306 col = bjtmp[j]; 1307 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1308 } 1309 1310 /* U part */ 1311 nz = bdiag[i]-bdiag[i+1]; 1312 bjtmp = bj + bdiag[i+1]+1; 1313 for (j=0; j<nz; j++) { 1314 col = bjtmp[j]; 1315 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1316 } 1317 1318 /* load in initial (unfactored row) */ 1319 nz = ai[r[i]+1] - ai[r[i]]; 1320 ajtmp = aj + ai[r[i]]; 1321 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1322 for (j=0; j<nz; j++) { 1323 col = ics[ajtmp[j]]; 1324 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1325 } 1326 /* ZeropivotApply(): shift the diagonal of the matrix */ 1327 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1328 1329 /* elimination */ 1330 bjtmp = bj + bi[i]; 1331 row = *bjtmp++; /* pivot row */ 1332 nzL = bi[i+1] - bi[i]; 1333 for (k=0; k < nzL; k++) { 1334 pc1 = rtmp1 + row; 1335 pc2 = rtmp2 + row; 1336 if (*pc1 != 0.0 || *pc2 != 0.0) { 1337 pv = b->a + bdiag[row]; 1338 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1339 *pc1 = mul1; *pc2 = mul2; 1340 1341 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1342 pv = b->a + bdiag[row+1]+1; 1343 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1344 for (j=0; j<nz; j++) { 1345 col = pj[j]; 1346 rtmp1[col] -= mul1 * pv[j]; 1347 rtmp2[col] -= mul2 * pv[j]; 1348 } 1349 PetscCall(PetscLogFlops(2+4.0*nz)); 1350 } 1351 row = *bjtmp++; 1352 } 1353 1354 /* finished row i; check zero pivot, then stick row i into b->a */ 1355 rs = 0.0; 1356 /* L part */ 1357 pc1 = b->a + bi[i]; 1358 pj = b->j + bi[i]; 1359 nz = bi[i+1] - bi[i]; 1360 for (j=0; j<nz; j++) { 1361 col = pj[j]; 1362 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1363 } 1364 /* U part */ 1365 pc1 = b->a + bdiag[i+1]+1; 1366 pj = b->j + bdiag[i+1]+1; 1367 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1368 for (j=0; j<nz; j++) { 1369 col = pj[j]; 1370 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1371 } 1372 1373 sctx.rs = rs; 1374 sctx.pv = rtmp1[i]; 1375 PetscCall(MatPivotCheck(B,A,info,&sctx,i)); 1376 if (sctx.newshift) break; 1377 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1378 *pc1 = 1.0/sctx.pv; 1379 1380 /* Now take care of diagonal 2x2 block. */ 1381 pc2 = rtmp2 + i; 1382 if (*pc2 != 0.0) { 1383 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1384 *pc2 = mul1; /* insert L entry */ 1385 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1386 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1387 for (j=0; j<nz; j++) { 1388 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1389 } 1390 PetscCall(PetscLogFlops(1+2.0*nz)); 1391 } 1392 1393 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1394 rs = 0.0; 1395 /* L part */ 1396 pc2 = b->a + bi[i+1]; 1397 pj = b->j + bi[i+1]; 1398 nz = bi[i+2] - bi[i+1]; 1399 for (j=0; j<nz; j++) { 1400 col = pj[j]; 1401 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1402 } 1403 /* U part */ 1404 pc2 = b->a + bdiag[i+2]+1; 1405 pj = b->j + bdiag[i+2]+1; 1406 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1407 for (j=0; j<nz; j++) { 1408 col = pj[j]; 1409 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1410 } 1411 1412 sctx.rs = rs; 1413 sctx.pv = rtmp2[i+1]; 1414 PetscCall(MatPivotCheck(B,A,info,&sctx,i+1)); 1415 if (sctx.newshift) break; 1416 pc2 = b->a + bdiag[i+1]; 1417 *pc2 = 1.0/sctx.pv; 1418 break; 1419 1420 case 3: 1421 /*----------*/ 1422 /* zero rtmp */ 1423 /* L part */ 1424 nz = bi[i+1] - bi[i]; 1425 bjtmp = bj + bi[i]; 1426 for (j=0; j<nz; j++) { 1427 col = bjtmp[j]; 1428 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1429 } 1430 1431 /* U part */ 1432 nz = bdiag[i]-bdiag[i+1]; 1433 bjtmp = bj + bdiag[i+1]+1; 1434 for (j=0; j<nz; j++) { 1435 col = bjtmp[j]; 1436 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1437 } 1438 1439 /* load in initial (unfactored row) */ 1440 nz = ai[r[i]+1] - ai[r[i]]; 1441 ajtmp = aj + ai[r[i]]; 1442 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1443 for (j=0; j<nz; j++) { 1444 col = ics[ajtmp[j]]; 1445 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1446 } 1447 /* ZeropivotApply(): shift the diagonal of the matrix */ 1448 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1449 1450 /* elimination */ 1451 bjtmp = bj + bi[i]; 1452 row = *bjtmp++; /* pivot row */ 1453 nzL = bi[i+1] - bi[i]; 1454 for (k=0; k < nzL; k++) { 1455 pc1 = rtmp1 + row; 1456 pc2 = rtmp2 + row; 1457 pc3 = rtmp3 + row; 1458 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1459 pv = b->a + bdiag[row]; 1460 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1461 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1462 1463 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1464 pv = b->a + bdiag[row+1]+1; 1465 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1466 for (j=0; j<nz; j++) { 1467 col = pj[j]; 1468 rtmp1[col] -= mul1 * pv[j]; 1469 rtmp2[col] -= mul2 * pv[j]; 1470 rtmp3[col] -= mul3 * pv[j]; 1471 } 1472 PetscCall(PetscLogFlops(3+6.0*nz)); 1473 } 1474 row = *bjtmp++; 1475 } 1476 1477 /* finished row i; check zero pivot, then stick row i into b->a */ 1478 rs = 0.0; 1479 /* L part */ 1480 pc1 = b->a + bi[i]; 1481 pj = b->j + bi[i]; 1482 nz = bi[i+1] - bi[i]; 1483 for (j=0; j<nz; j++) { 1484 col = pj[j]; 1485 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1486 } 1487 /* U part */ 1488 pc1 = b->a + bdiag[i+1]+1; 1489 pj = b->j + bdiag[i+1]+1; 1490 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1491 for (j=0; j<nz; j++) { 1492 col = pj[j]; 1493 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1494 } 1495 1496 sctx.rs = rs; 1497 sctx.pv = rtmp1[i]; 1498 PetscCall(MatPivotCheck(B,A,info,&sctx,i)); 1499 if (sctx.newshift) break; 1500 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1501 *pc1 = 1.0/sctx.pv; 1502 1503 /* Now take care of 1st column of diagonal 3x3 block. */ 1504 pc2 = rtmp2 + i; 1505 pc3 = rtmp3 + i; 1506 if (*pc2 != 0.0 || *pc3 != 0.0) { 1507 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1508 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1509 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1510 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1511 for (j=0; j<nz; j++) { 1512 col = pj[j]; 1513 rtmp2[col] -= mul2 * rtmp1[col]; 1514 rtmp3[col] -= mul3 * rtmp1[col]; 1515 } 1516 PetscCall(PetscLogFlops(2+4.0*nz)); 1517 } 1518 1519 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1520 rs = 0.0; 1521 /* L part */ 1522 pc2 = b->a + bi[i+1]; 1523 pj = b->j + bi[i+1]; 1524 nz = bi[i+2] - bi[i+1]; 1525 for (j=0; j<nz; j++) { 1526 col = pj[j]; 1527 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1528 } 1529 /* U part */ 1530 pc2 = b->a + bdiag[i+2]+1; 1531 pj = b->j + bdiag[i+2]+1; 1532 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1533 for (j=0; j<nz; j++) { 1534 col = pj[j]; 1535 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1536 } 1537 1538 sctx.rs = rs; 1539 sctx.pv = rtmp2[i+1]; 1540 PetscCall(MatPivotCheck(B,A,info,&sctx,i+1)); 1541 if (sctx.newshift) break; 1542 pc2 = b->a + bdiag[i+1]; 1543 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1544 1545 /* Now take care of 2nd column of diagonal 3x3 block. */ 1546 pc3 = rtmp3 + i+1; 1547 if (*pc3 != 0.0) { 1548 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1549 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1550 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1551 for (j=0; j<nz; j++) { 1552 col = pj[j]; 1553 rtmp3[col] -= mul3 * rtmp2[col]; 1554 } 1555 PetscCall(PetscLogFlops(1+2.0*nz)); 1556 } 1557 1558 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1559 rs = 0.0; 1560 /* L part */ 1561 pc3 = b->a + bi[i+2]; 1562 pj = b->j + bi[i+2]; 1563 nz = bi[i+3] - bi[i+2]; 1564 for (j=0; j<nz; j++) { 1565 col = pj[j]; 1566 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1567 } 1568 /* U part */ 1569 pc3 = b->a + bdiag[i+3]+1; 1570 pj = b->j + bdiag[i+3]+1; 1571 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1572 for (j=0; j<nz; j++) { 1573 col = pj[j]; 1574 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1575 } 1576 1577 sctx.rs = rs; 1578 sctx.pv = rtmp3[i+2]; 1579 PetscCall(MatPivotCheck(B,A,info,&sctx,i+2)); 1580 if (sctx.newshift) break; 1581 pc3 = b->a + bdiag[i+2]; 1582 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1583 break; 1584 case 4: 1585 /*----------*/ 1586 /* zero rtmp */ 1587 /* L part */ 1588 nz = bi[i+1] - bi[i]; 1589 bjtmp = bj + bi[i]; 1590 for (j=0; j<nz; j++) { 1591 col = bjtmp[j]; 1592 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1593 } 1594 1595 /* U part */ 1596 nz = bdiag[i]-bdiag[i+1]; 1597 bjtmp = bj + bdiag[i+1]+1; 1598 for (j=0; j<nz; j++) { 1599 col = bjtmp[j]; 1600 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1601 } 1602 1603 /* load in initial (unfactored row) */ 1604 nz = ai[r[i]+1] - ai[r[i]]; 1605 ajtmp = aj + ai[r[i]]; 1606 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1607 for (j=0; j<nz; j++) { 1608 col = ics[ajtmp[j]]; 1609 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1610 } 1611 /* ZeropivotApply(): shift the diagonal of the matrix */ 1612 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1613 1614 /* elimination */ 1615 bjtmp = bj + bi[i]; 1616 row = *bjtmp++; /* pivot row */ 1617 nzL = bi[i+1] - bi[i]; 1618 for (k=0; k < nzL; k++) { 1619 pc1 = rtmp1 + row; 1620 pc2 = rtmp2 + row; 1621 pc3 = rtmp3 + row; 1622 pc4 = rtmp4 + row; 1623 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1624 pv = b->a + bdiag[row]; 1625 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1626 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1627 1628 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1629 pv = b->a + bdiag[row+1]+1; 1630 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1631 for (j=0; j<nz; j++) { 1632 col = pj[j]; 1633 rtmp1[col] -= mul1 * pv[j]; 1634 rtmp2[col] -= mul2 * pv[j]; 1635 rtmp3[col] -= mul3 * pv[j]; 1636 rtmp4[col] -= mul4 * pv[j]; 1637 } 1638 PetscCall(PetscLogFlops(4+8.0*nz)); 1639 } 1640 row = *bjtmp++; 1641 } 1642 1643 /* finished row i; check zero pivot, then stick row i into b->a */ 1644 rs = 0.0; 1645 /* L part */ 1646 pc1 = b->a + bi[i]; 1647 pj = b->j + bi[i]; 1648 nz = bi[i+1] - bi[i]; 1649 for (j=0; j<nz; j++) { 1650 col = pj[j]; 1651 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1652 } 1653 /* U part */ 1654 pc1 = b->a + bdiag[i+1]+1; 1655 pj = b->j + bdiag[i+1]+1; 1656 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1657 for (j=0; j<nz; j++) { 1658 col = pj[j]; 1659 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1660 } 1661 1662 sctx.rs = rs; 1663 sctx.pv = rtmp1[i]; 1664 PetscCall(MatPivotCheck(B,A,info,&sctx,i)); 1665 if (sctx.newshift) break; 1666 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1667 *pc1 = 1.0/sctx.pv; 1668 1669 /* Now take care of 1st column of diagonal 4x4 block. */ 1670 pc2 = rtmp2 + i; 1671 pc3 = rtmp3 + i; 1672 pc4 = rtmp4 + i; 1673 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1674 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1675 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1676 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1677 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1678 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1679 for (j=0; j<nz; j++) { 1680 col = pj[j]; 1681 rtmp2[col] -= mul2 * rtmp1[col]; 1682 rtmp3[col] -= mul3 * rtmp1[col]; 1683 rtmp4[col] -= mul4 * rtmp1[col]; 1684 } 1685 PetscCall(PetscLogFlops(3+6.0*nz)); 1686 } 1687 1688 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1689 rs = 0.0; 1690 /* L part */ 1691 pc2 = b->a + bi[i+1]; 1692 pj = b->j + bi[i+1]; 1693 nz = bi[i+2] - bi[i+1]; 1694 for (j=0; j<nz; j++) { 1695 col = pj[j]; 1696 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1697 } 1698 /* U part */ 1699 pc2 = b->a + bdiag[i+2]+1; 1700 pj = b->j + bdiag[i+2]+1; 1701 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1702 for (j=0; j<nz; j++) { 1703 col = pj[j]; 1704 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1705 } 1706 1707 sctx.rs = rs; 1708 sctx.pv = rtmp2[i+1]; 1709 PetscCall(MatPivotCheck(B,A,info,&sctx,i+1)); 1710 if (sctx.newshift) break; 1711 pc2 = b->a + bdiag[i+1]; 1712 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1713 1714 /* Now take care of 2nd column of diagonal 4x4 block. */ 1715 pc3 = rtmp3 + i+1; 1716 pc4 = rtmp4 + i+1; 1717 if (*pc3 != 0.0 || *pc4 != 0.0) { 1718 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1719 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1720 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1721 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1722 for (j=0; j<nz; j++) { 1723 col = pj[j]; 1724 rtmp3[col] -= mul3 * rtmp2[col]; 1725 rtmp4[col] -= mul4 * rtmp2[col]; 1726 } 1727 PetscCall(PetscLogFlops(4.0*nz)); 1728 } 1729 1730 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1731 rs = 0.0; 1732 /* L part */ 1733 pc3 = b->a + bi[i+2]; 1734 pj = b->j + bi[i+2]; 1735 nz = bi[i+3] - bi[i+2]; 1736 for (j=0; j<nz; j++) { 1737 col = pj[j]; 1738 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1739 } 1740 /* U part */ 1741 pc3 = b->a + bdiag[i+3]+1; 1742 pj = b->j + bdiag[i+3]+1; 1743 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1744 for (j=0; j<nz; j++) { 1745 col = pj[j]; 1746 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1747 } 1748 1749 sctx.rs = rs; 1750 sctx.pv = rtmp3[i+2]; 1751 PetscCall(MatPivotCheck(B,A,info,&sctx,i+2)); 1752 if (sctx.newshift) break; 1753 pc3 = b->a + bdiag[i+2]; 1754 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1755 1756 /* Now take care of 3rd column of diagonal 4x4 block. */ 1757 pc4 = rtmp4 + i+2; 1758 if (*pc4 != 0.0) { 1759 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1760 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1761 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1762 for (j=0; j<nz; j++) { 1763 col = pj[j]; 1764 rtmp4[col] -= mul4 * rtmp3[col]; 1765 } 1766 PetscCall(PetscLogFlops(1+2.0*nz)); 1767 } 1768 1769 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1770 rs = 0.0; 1771 /* L part */ 1772 pc4 = b->a + bi[i+3]; 1773 pj = b->j + bi[i+3]; 1774 nz = bi[i+4] - bi[i+3]; 1775 for (j=0; j<nz; j++) { 1776 col = pj[j]; 1777 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1778 } 1779 /* U part */ 1780 pc4 = b->a + bdiag[i+4]+1; 1781 pj = b->j + bdiag[i+4]+1; 1782 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1783 for (j=0; j<nz; j++) { 1784 col = pj[j]; 1785 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1786 } 1787 1788 sctx.rs = rs; 1789 sctx.pv = rtmp4[i+3]; 1790 PetscCall(MatPivotCheck(B,A,info,&sctx,i+3)); 1791 if (sctx.newshift) break; 1792 pc4 = b->a + bdiag[i+3]; 1793 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1794 break; 1795 1796 default: 1797 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported "); 1798 } 1799 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1800 i += nodesz; /* Update the row */ 1801 } 1802 1803 /* MatPivotRefine() */ 1804 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1805 /* 1806 * if no shift in this attempt & shifting & started shifting & can refine, 1807 * then try lower shift 1808 */ 1809 sctx.shift_hi = sctx.shift_fraction; 1810 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1811 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1812 sctx.newshift = PETSC_TRUE; 1813 sctx.nshift++; 1814 } 1815 } while (sctx.newshift); 1816 1817 PetscCall(PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4)); 1818 PetscCall(PetscFree(tmp_vec2)); 1819 PetscCall(ISRestoreIndices(isicol,&ic)); 1820 PetscCall(ISRestoreIndices(isrow,&r)); 1821 1822 if (b->inode.size) { 1823 C->ops->solve = MatSolve_SeqAIJ_Inode; 1824 } else { 1825 C->ops->solve = MatSolve_SeqAIJ; 1826 } 1827 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1828 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1829 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1830 C->ops->matsolve = MatMatSolve_SeqAIJ; 1831 C->assembled = PETSC_TRUE; 1832 C->preallocated = PETSC_TRUE; 1833 1834 PetscCall(PetscLogFlops(C->cmap->n)); 1835 1836 /* MatShiftView(A,info,&sctx) */ 1837 if (sctx.nshift) { 1838 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1839 PetscCall(PetscInfo(A,"number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top)); 1840 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1841 PetscCall(PetscInfo(A,"number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount)); 1842 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1843 PetscCall(PetscInfo(A,"number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n",sctx.nshift,(double)info->shiftamount)); 1844 } 1845 } 1846 PetscFunctionReturn(0); 1847 } 1848 1849 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1850 { 1851 Mat C = B; 1852 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1853 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1854 const PetscInt *r,*ic,*c,*ics; 1855 PetscInt n = A->rmap->n,*bi = b->i; 1856 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1857 PetscInt i,j,idx,*bd = b->diag,node_max,nodesz; 1858 PetscInt *ai = a->i,*aj = a->j; 1859 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1860 PetscScalar mul1,mul2,mul3,tmp; 1861 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1862 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1863 PetscReal rs=0.0; 1864 FactorShiftCtx sctx; 1865 1866 PetscFunctionBegin; 1867 sctx.shift_top = 0; 1868 sctx.nshift_max = 0; 1869 sctx.shift_lo = 0; 1870 sctx.shift_hi = 0; 1871 sctx.shift_fraction = 0; 1872 1873 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1874 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1875 sctx.shift_top = 0; 1876 for (i=0; i<n; i++) { 1877 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1878 rs = 0.0; 1879 ajtmp = aj + ai[i]; 1880 rtmp1 = aa + ai[i]; 1881 nz = ai[i+1] - ai[i]; 1882 for (j=0; j<nz; j++) { 1883 if (*ajtmp != i) { 1884 rs += PetscAbsScalar(*rtmp1++); 1885 } else { 1886 rs -= PetscRealPart(*rtmp1++); 1887 } 1888 ajtmp++; 1889 } 1890 if (rs>sctx.shift_top) sctx.shift_top = rs; 1891 } 1892 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1893 sctx.shift_top *= 1.1; 1894 sctx.nshift_max = 5; 1895 sctx.shift_lo = 0.; 1896 sctx.shift_hi = 1.; 1897 } 1898 sctx.shift_amount = 0; 1899 sctx.nshift = 0; 1900 1901 PetscCall(ISGetIndices(isrow,&r)); 1902 PetscCall(ISGetIndices(iscol,&c)); 1903 PetscCall(ISGetIndices(isicol,&ic)); 1904 PetscCall(PetscCalloc3(n,&rtmp11,n,&rtmp22,n,&rtmp33)); 1905 ics = ic; 1906 1907 node_max = a->inode.node_count; 1908 ns = a->inode.size; 1909 PetscCheck(ns,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1910 1911 /* If max inode size > 3, split it into two inodes.*/ 1912 /* also map the inode sizes according to the ordering */ 1913 PetscCall(PetscMalloc1(n+1,&tmp_vec1)); 1914 for (i=0,j=0; i<node_max; ++i,++j) { 1915 if (ns[i]>3) { 1916 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1917 ++j; 1918 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1919 } else { 1920 tmp_vec1[j] = ns[i]; 1921 } 1922 } 1923 /* Use the correct node_max */ 1924 node_max = j; 1925 1926 /* Now reorder the inode info based on mat re-ordering info */ 1927 /* First create a row -> inode_size_array_index map */ 1928 PetscCall(PetscMalloc2(n+1,&nsmap,node_max+1,&tmp_vec2)); 1929 for (i=0,row=0; i<node_max; i++) { 1930 nodesz = tmp_vec1[i]; 1931 for (j=0; j<nodesz; j++,row++) { 1932 nsmap[row] = i; 1933 } 1934 } 1935 /* Using nsmap, create a reordered ns structure */ 1936 for (i=0,j=0; i< node_max; i++) { 1937 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1938 tmp_vec2[i] = nodesz; 1939 j += nodesz; 1940 } 1941 PetscCall(PetscFree2(nsmap,tmp_vec1)); 1942 /* Now use the correct ns */ 1943 ns = tmp_vec2; 1944 1945 do { 1946 sctx.newshift = PETSC_FALSE; 1947 /* Now loop over each block-row, and do the factorization */ 1948 for (i=0,row=0; i<node_max; i++) { 1949 nodesz = ns[i]; 1950 nz = bi[row+1] - bi[row]; 1951 bjtmp = bj + bi[row]; 1952 1953 switch (nodesz) { 1954 case 1: 1955 for (j=0; j<nz; j++) { 1956 idx = bjtmp[j]; 1957 rtmp11[idx] = 0.0; 1958 } 1959 1960 /* load in initial (unfactored row) */ 1961 idx = r[row]; 1962 nz_tmp = ai[idx+1] - ai[idx]; 1963 ajtmp = aj + ai[idx]; 1964 v1 = aa + ai[idx]; 1965 1966 for (j=0; j<nz_tmp; j++) { 1967 idx = ics[ajtmp[j]]; 1968 rtmp11[idx] = v1[j]; 1969 } 1970 rtmp11[ics[r[row]]] += sctx.shift_amount; 1971 1972 prow = *bjtmp++; 1973 while (prow < row) { 1974 pc1 = rtmp11 + prow; 1975 if (*pc1 != 0.0) { 1976 pv = ba + bd[prow]; 1977 pj = nbj + bd[prow]; 1978 mul1 = *pc1 * *pv++; 1979 *pc1 = mul1; 1980 nz_tmp = bi[prow+1] - bd[prow] - 1; 1981 PetscCall(PetscLogFlops(1+2.0*nz_tmp)); 1982 for (j=0; j<nz_tmp; j++) { 1983 tmp = pv[j]; 1984 idx = pj[j]; 1985 rtmp11[idx] -= mul1 * tmp; 1986 } 1987 } 1988 prow = *bjtmp++; 1989 } 1990 pj = bj + bi[row]; 1991 pc1 = ba + bi[row]; 1992 1993 sctx.pv = rtmp11[row]; 1994 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1995 rs = 0.0; 1996 for (j=0; j<nz; j++) { 1997 idx = pj[j]; 1998 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1999 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2000 } 2001 sctx.rs = rs; 2002 PetscCall(MatPivotCheck(B,A,info,&sctx,row)); 2003 if (sctx.newshift) goto endofwhile; 2004 break; 2005 2006 case 2: 2007 for (j=0; j<nz; j++) { 2008 idx = bjtmp[j]; 2009 rtmp11[idx] = 0.0; 2010 rtmp22[idx] = 0.0; 2011 } 2012 2013 /* load in initial (unfactored row) */ 2014 idx = r[row]; 2015 nz_tmp = ai[idx+1] - ai[idx]; 2016 ajtmp = aj + ai[idx]; 2017 v1 = aa + ai[idx]; 2018 v2 = aa + ai[idx+1]; 2019 for (j=0; j<nz_tmp; j++) { 2020 idx = ics[ajtmp[j]]; 2021 rtmp11[idx] = v1[j]; 2022 rtmp22[idx] = v2[j]; 2023 } 2024 rtmp11[ics[r[row]]] += sctx.shift_amount; 2025 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2026 2027 prow = *bjtmp++; 2028 while (prow < row) { 2029 pc1 = rtmp11 + prow; 2030 pc2 = rtmp22 + prow; 2031 if (*pc1 != 0.0 || *pc2 != 0.0) { 2032 pv = ba + bd[prow]; 2033 pj = nbj + bd[prow]; 2034 mul1 = *pc1 * *pv; 2035 mul2 = *pc2 * *pv; 2036 ++pv; 2037 *pc1 = mul1; 2038 *pc2 = mul2; 2039 2040 nz_tmp = bi[prow+1] - bd[prow] - 1; 2041 for (j=0; j<nz_tmp; j++) { 2042 tmp = pv[j]; 2043 idx = pj[j]; 2044 rtmp11[idx] -= mul1 * tmp; 2045 rtmp22[idx] -= mul2 * tmp; 2046 } 2047 PetscCall(PetscLogFlops(2+4.0*nz_tmp)); 2048 } 2049 prow = *bjtmp++; 2050 } 2051 2052 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2053 pc1 = rtmp11 + prow; 2054 pc2 = rtmp22 + prow; 2055 2056 sctx.pv = *pc1; 2057 pj = bj + bi[prow]; 2058 rs = 0.0; 2059 for (j=0; j<nz; j++) { 2060 idx = pj[j]; 2061 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2062 } 2063 sctx.rs = rs; 2064 PetscCall(MatPivotCheck(B,A,info,&sctx,row)); 2065 if (sctx.newshift) goto endofwhile; 2066 2067 if (*pc2 != 0.0) { 2068 pj = nbj + bd[prow]; 2069 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2070 *pc2 = mul2; 2071 nz_tmp = bi[prow+1] - bd[prow] - 1; 2072 for (j=0; j<nz_tmp; j++) { 2073 idx = pj[j]; 2074 tmp = rtmp11[idx]; 2075 rtmp22[idx] -= mul2 * tmp; 2076 } 2077 PetscCall(PetscLogFlops(1+2.0*nz_tmp)); 2078 } 2079 2080 pj = bj + bi[row]; 2081 pc1 = ba + bi[row]; 2082 pc2 = ba + bi[row+1]; 2083 2084 sctx.pv = rtmp22[row+1]; 2085 rs = 0.0; 2086 rtmp11[row] = 1.0/rtmp11[row]; 2087 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2088 /* copy row entries from dense representation to sparse */ 2089 for (j=0; j<nz; j++) { 2090 idx = pj[j]; 2091 pc1[j] = rtmp11[idx]; 2092 pc2[j] = rtmp22[idx]; 2093 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2094 } 2095 sctx.rs = rs; 2096 PetscCall(MatPivotCheck(B,A,info,&sctx,row+1)); 2097 if (sctx.newshift) goto endofwhile; 2098 break; 2099 2100 case 3: 2101 for (j=0; j<nz; j++) { 2102 idx = bjtmp[j]; 2103 rtmp11[idx] = 0.0; 2104 rtmp22[idx] = 0.0; 2105 rtmp33[idx] = 0.0; 2106 } 2107 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2108 idx = r[row]; 2109 nz_tmp = ai[idx+1] - ai[idx]; 2110 ajtmp = aj + ai[idx]; 2111 v1 = aa + ai[idx]; 2112 v2 = aa + ai[idx+1]; 2113 v3 = aa + ai[idx+2]; 2114 for (j=0; j<nz_tmp; j++) { 2115 idx = ics[ajtmp[j]]; 2116 rtmp11[idx] = v1[j]; 2117 rtmp22[idx] = v2[j]; 2118 rtmp33[idx] = v3[j]; 2119 } 2120 rtmp11[ics[r[row]]] += sctx.shift_amount; 2121 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2122 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2123 2124 /* loop over all pivot row blocks above this row block */ 2125 prow = *bjtmp++; 2126 while (prow < row) { 2127 pc1 = rtmp11 + prow; 2128 pc2 = rtmp22 + prow; 2129 pc3 = rtmp33 + prow; 2130 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2131 pv = ba + bd[prow]; 2132 pj = nbj + bd[prow]; 2133 mul1 = *pc1 * *pv; 2134 mul2 = *pc2 * *pv; 2135 mul3 = *pc3 * *pv; 2136 ++pv; 2137 *pc1 = mul1; 2138 *pc2 = mul2; 2139 *pc3 = mul3; 2140 2141 nz_tmp = bi[prow+1] - bd[prow] - 1; 2142 /* update this row based on pivot row */ 2143 for (j=0; j<nz_tmp; j++) { 2144 tmp = pv[j]; 2145 idx = pj[j]; 2146 rtmp11[idx] -= mul1 * tmp; 2147 rtmp22[idx] -= mul2 * tmp; 2148 rtmp33[idx] -= mul3 * tmp; 2149 } 2150 PetscCall(PetscLogFlops(3+6.0*nz_tmp)); 2151 } 2152 prow = *bjtmp++; 2153 } 2154 2155 /* Now take care of diagonal 3x3 block in this set of rows */ 2156 /* note: prow = row here */ 2157 pc1 = rtmp11 + prow; 2158 pc2 = rtmp22 + prow; 2159 pc3 = rtmp33 + prow; 2160 2161 sctx.pv = *pc1; 2162 pj = bj + bi[prow]; 2163 rs = 0.0; 2164 for (j=0; j<nz; j++) { 2165 idx = pj[j]; 2166 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2167 } 2168 sctx.rs = rs; 2169 PetscCall(MatPivotCheck(B,A,info,&sctx,row)); 2170 if (sctx.newshift) goto endofwhile; 2171 2172 if (*pc2 != 0.0 || *pc3 != 0.0) { 2173 mul2 = (*pc2)/(*pc1); 2174 mul3 = (*pc3)/(*pc1); 2175 *pc2 = mul2; 2176 *pc3 = mul3; 2177 nz_tmp = bi[prow+1] - bd[prow] - 1; 2178 pj = nbj + bd[prow]; 2179 for (j=0; j<nz_tmp; j++) { 2180 idx = pj[j]; 2181 tmp = rtmp11[idx]; 2182 rtmp22[idx] -= mul2 * tmp; 2183 rtmp33[idx] -= mul3 * tmp; 2184 } 2185 PetscCall(PetscLogFlops(2+4.0*nz_tmp)); 2186 } 2187 ++prow; 2188 2189 pc2 = rtmp22 + prow; 2190 pc3 = rtmp33 + prow; 2191 sctx.pv = *pc2; 2192 pj = bj + bi[prow]; 2193 rs = 0.0; 2194 for (j=0; j<nz; j++) { 2195 idx = pj[j]; 2196 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2197 } 2198 sctx.rs = rs; 2199 PetscCall(MatPivotCheck(B,A,info,&sctx,row+1)); 2200 if (sctx.newshift) goto endofwhile; 2201 2202 if (*pc3 != 0.0) { 2203 mul3 = (*pc3)/(*pc2); 2204 *pc3 = mul3; 2205 pj = nbj + bd[prow]; 2206 nz_tmp = bi[prow+1] - bd[prow] - 1; 2207 for (j=0; j<nz_tmp; j++) { 2208 idx = pj[j]; 2209 tmp = rtmp22[idx]; 2210 rtmp33[idx] -= mul3 * tmp; 2211 } 2212 PetscCall(PetscLogFlops(1+2.0*nz_tmp)); 2213 } 2214 2215 pj = bj + bi[row]; 2216 pc1 = ba + bi[row]; 2217 pc2 = ba + bi[row+1]; 2218 pc3 = ba + bi[row+2]; 2219 2220 sctx.pv = rtmp33[row+2]; 2221 rs = 0.0; 2222 rtmp11[row] = 1.0/rtmp11[row]; 2223 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2224 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2225 /* copy row entries from dense representation to sparse */ 2226 for (j=0; j<nz; j++) { 2227 idx = pj[j]; 2228 pc1[j] = rtmp11[idx]; 2229 pc2[j] = rtmp22[idx]; 2230 pc3[j] = rtmp33[idx]; 2231 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2232 } 2233 2234 sctx.rs = rs; 2235 PetscCall(MatPivotCheck(B,A,info,&sctx,row+2)); 2236 if (sctx.newshift) goto endofwhile; 2237 break; 2238 2239 default: 2240 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported "); 2241 } 2242 row += nodesz; /* Update the row */ 2243 } 2244 endofwhile:; 2245 } while (sctx.newshift); 2246 PetscCall(PetscFree3(rtmp11,rtmp22,rtmp33)); 2247 PetscCall(PetscFree(tmp_vec2)); 2248 PetscCall(ISRestoreIndices(isicol,&ic)); 2249 PetscCall(ISRestoreIndices(isrow,&r)); 2250 PetscCall(ISRestoreIndices(iscol,&c)); 2251 2252 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2253 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2254 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2255 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2256 C->assembled = PETSC_TRUE; 2257 C->preallocated = PETSC_TRUE; 2258 if (sctx.nshift) { 2259 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2260 PetscCall(PetscInfo(A,"number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,(double)sctx.shift_amount,(double)sctx.shift_fraction,(double)sctx.shift_top)); 2261 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2262 PetscCall(PetscInfo(A,"number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n",sctx.nshift,(double)sctx.shift_amount)); 2263 } 2264 } 2265 PetscCall(PetscLogFlops(C->cmap->n)); 2266 PetscCall(MatSeqAIJCheckInode(C)); 2267 PetscFunctionReturn(0); 2268 } 2269 2270 /* ----------------------------------------------------------- */ 2271 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2272 { 2273 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2274 IS iscol = a->col,isrow = a->row; 2275 const PetscInt *r,*c,*rout,*cout; 2276 PetscInt i,j,n = A->rmap->n; 2277 PetscInt node_max,row,nsz,aii,i0,i1,nz; 2278 const PetscInt *ai = a->i,*a_j = a->j,*ns,*vi,*ad,*aj; 2279 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2280 PetscScalar sum1,sum2,sum3,sum4,sum5; 2281 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2282 const PetscScalar *b; 2283 2284 PetscFunctionBegin; 2285 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2286 node_max = a->inode.node_count; 2287 ns = a->inode.size; /* Node Size array */ 2288 2289 PetscCall(VecGetArrayRead(bb,&b)); 2290 PetscCall(VecGetArrayWrite(xx,&x)); 2291 tmp = a->solve_work; 2292 2293 PetscCall(ISGetIndices(isrow,&rout)); r = rout; 2294 PetscCall(ISGetIndices(iscol,&cout)); c = cout; 2295 2296 /* forward solve the lower triangular */ 2297 tmps = tmp; 2298 aa = a_a; 2299 aj = a_j; 2300 ad = a->diag; 2301 2302 for (i = 0,row = 0; i< node_max; ++i) { 2303 nsz = ns[i]; 2304 aii = ai[row]; 2305 v1 = aa + aii; 2306 vi = aj + aii; 2307 nz = ai[row+1]- ai[row]; 2308 2309 if (i < node_max-1) { 2310 /* Prefetch the indices for the next block */ 2311 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2312 /* Prefetch the data for the next block */ 2313 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2314 } 2315 2316 switch (nsz) { /* Each loop in 'case' is unrolled */ 2317 case 1: 2318 sum1 = b[r[row]]; 2319 for (j=0; j<nz-1; j+=2) { 2320 i0 = vi[j]; 2321 i1 = vi[j+1]; 2322 tmp0 = tmps[i0]; 2323 tmp1 = tmps[i1]; 2324 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2325 } 2326 if (j == nz-1) { 2327 tmp0 = tmps[vi[j]]; 2328 sum1 -= v1[j]*tmp0; 2329 } 2330 tmp[row++]=sum1; 2331 break; 2332 case 2: 2333 sum1 = b[r[row]]; 2334 sum2 = b[r[row+1]]; 2335 v2 = aa + ai[row+1]; 2336 2337 for (j=0; j<nz-1; j+=2) { 2338 i0 = vi[j]; 2339 i1 = vi[j+1]; 2340 tmp0 = tmps[i0]; 2341 tmp1 = tmps[i1]; 2342 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2343 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2344 } 2345 if (j == nz-1) { 2346 tmp0 = tmps[vi[j]]; 2347 sum1 -= v1[j] *tmp0; 2348 sum2 -= v2[j] *tmp0; 2349 } 2350 sum2 -= v2[nz] * sum1; 2351 tmp[row++]=sum1; 2352 tmp[row++]=sum2; 2353 break; 2354 case 3: 2355 sum1 = b[r[row]]; 2356 sum2 = b[r[row+1]]; 2357 sum3 = b[r[row+2]]; 2358 v2 = aa + ai[row+1]; 2359 v3 = aa + ai[row+2]; 2360 2361 for (j=0; j<nz-1; j+=2) { 2362 i0 = vi[j]; 2363 i1 = vi[j+1]; 2364 tmp0 = tmps[i0]; 2365 tmp1 = tmps[i1]; 2366 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2367 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2368 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2369 } 2370 if (j == nz-1) { 2371 tmp0 = tmps[vi[j]]; 2372 sum1 -= v1[j] *tmp0; 2373 sum2 -= v2[j] *tmp0; 2374 sum3 -= v3[j] *tmp0; 2375 } 2376 sum2 -= v2[nz] * sum1; 2377 sum3 -= v3[nz] * sum1; 2378 sum3 -= v3[nz+1] * sum2; 2379 tmp[row++]=sum1; 2380 tmp[row++]=sum2; 2381 tmp[row++]=sum3; 2382 break; 2383 2384 case 4: 2385 sum1 = b[r[row]]; 2386 sum2 = b[r[row+1]]; 2387 sum3 = b[r[row+2]]; 2388 sum4 = b[r[row+3]]; 2389 v2 = aa + ai[row+1]; 2390 v3 = aa + ai[row+2]; 2391 v4 = aa + ai[row+3]; 2392 2393 for (j=0; j<nz-1; j+=2) { 2394 i0 = vi[j]; 2395 i1 = vi[j+1]; 2396 tmp0 = tmps[i0]; 2397 tmp1 = tmps[i1]; 2398 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2399 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2400 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2401 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2402 } 2403 if (j == nz-1) { 2404 tmp0 = tmps[vi[j]]; 2405 sum1 -= v1[j] *tmp0; 2406 sum2 -= v2[j] *tmp0; 2407 sum3 -= v3[j] *tmp0; 2408 sum4 -= v4[j] *tmp0; 2409 } 2410 sum2 -= v2[nz] * sum1; 2411 sum3 -= v3[nz] * sum1; 2412 sum4 -= v4[nz] * sum1; 2413 sum3 -= v3[nz+1] * sum2; 2414 sum4 -= v4[nz+1] * sum2; 2415 sum4 -= v4[nz+2] * sum3; 2416 2417 tmp[row++]=sum1; 2418 tmp[row++]=sum2; 2419 tmp[row++]=sum3; 2420 tmp[row++]=sum4; 2421 break; 2422 case 5: 2423 sum1 = b[r[row]]; 2424 sum2 = b[r[row+1]]; 2425 sum3 = b[r[row+2]]; 2426 sum4 = b[r[row+3]]; 2427 sum5 = b[r[row+4]]; 2428 v2 = aa + ai[row+1]; 2429 v3 = aa + ai[row+2]; 2430 v4 = aa + ai[row+3]; 2431 v5 = aa + ai[row+4]; 2432 2433 for (j=0; j<nz-1; j+=2) { 2434 i0 = vi[j]; 2435 i1 = vi[j+1]; 2436 tmp0 = tmps[i0]; 2437 tmp1 = tmps[i1]; 2438 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2439 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2440 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2441 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2442 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2443 } 2444 if (j == nz-1) { 2445 tmp0 = tmps[vi[j]]; 2446 sum1 -= v1[j] *tmp0; 2447 sum2 -= v2[j] *tmp0; 2448 sum3 -= v3[j] *tmp0; 2449 sum4 -= v4[j] *tmp0; 2450 sum5 -= v5[j] *tmp0; 2451 } 2452 2453 sum2 -= v2[nz] * sum1; 2454 sum3 -= v3[nz] * sum1; 2455 sum4 -= v4[nz] * sum1; 2456 sum5 -= v5[nz] * sum1; 2457 sum3 -= v3[nz+1] * sum2; 2458 sum4 -= v4[nz+1] * sum2; 2459 sum5 -= v5[nz+1] * sum2; 2460 sum4 -= v4[nz+2] * sum3; 2461 sum5 -= v5[nz+2] * sum3; 2462 sum5 -= v5[nz+3] * sum4; 2463 2464 tmp[row++]=sum1; 2465 tmp[row++]=sum2; 2466 tmp[row++]=sum3; 2467 tmp[row++]=sum4; 2468 tmp[row++]=sum5; 2469 break; 2470 default: 2471 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported "); 2472 } 2473 } 2474 /* backward solve the upper triangular */ 2475 for (i=node_max -1,row = n-1; i>=0; i--) { 2476 nsz = ns[i]; 2477 aii = ad[row+1] + 1; 2478 v1 = aa + aii; 2479 vi = aj + aii; 2480 nz = ad[row]- ad[row+1] - 1; 2481 2482 if (i > 0) { 2483 /* Prefetch the indices for the next block */ 2484 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2485 /* Prefetch the data for the next block */ 2486 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2487 } 2488 2489 switch (nsz) { /* Each loop in 'case' is unrolled */ 2490 case 1: 2491 sum1 = tmp[row]; 2492 2493 for (j=0; j<nz-1; j+=2) { 2494 i0 = vi[j]; 2495 i1 = vi[j+1]; 2496 tmp0 = tmps[i0]; 2497 tmp1 = tmps[i1]; 2498 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2499 } 2500 if (j == nz-1) { 2501 tmp0 = tmps[vi[j]]; 2502 sum1 -= v1[j]*tmp0; 2503 } 2504 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2505 break; 2506 case 2: 2507 sum1 = tmp[row]; 2508 sum2 = tmp[row-1]; 2509 v2 = aa + ad[row] + 1; 2510 for (j=0; j<nz-1; j+=2) { 2511 i0 = vi[j]; 2512 i1 = vi[j+1]; 2513 tmp0 = tmps[i0]; 2514 tmp1 = tmps[i1]; 2515 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2516 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2517 } 2518 if (j == nz-1) { 2519 tmp0 = tmps[vi[j]]; 2520 sum1 -= v1[j]* tmp0; 2521 sum2 -= v2[j+1]* tmp0; 2522 } 2523 2524 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2525 sum2 -= v2[0] * tmp0; 2526 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2527 break; 2528 case 3: 2529 sum1 = tmp[row]; 2530 sum2 = tmp[row -1]; 2531 sum3 = tmp[row -2]; 2532 v2 = aa + ad[row] + 1; 2533 v3 = aa + ad[row -1] + 1; 2534 for (j=0; j<nz-1; j+=2) { 2535 i0 = vi[j]; 2536 i1 = vi[j+1]; 2537 tmp0 = tmps[i0]; 2538 tmp1 = tmps[i1]; 2539 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2540 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2541 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2542 } 2543 if (j== nz-1) { 2544 tmp0 = tmps[vi[j]]; 2545 sum1 -= v1[j] * tmp0; 2546 sum2 -= v2[j+1] * tmp0; 2547 sum3 -= v3[j+2] * tmp0; 2548 } 2549 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2550 sum2 -= v2[0]* tmp0; 2551 sum3 -= v3[1] * tmp0; 2552 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2553 sum3 -= v3[0]* tmp0; 2554 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2555 2556 break; 2557 case 4: 2558 sum1 = tmp[row]; 2559 sum2 = tmp[row -1]; 2560 sum3 = tmp[row -2]; 2561 sum4 = tmp[row -3]; 2562 v2 = aa + ad[row]+1; 2563 v3 = aa + ad[row -1]+1; 2564 v4 = aa + ad[row -2]+1; 2565 2566 for (j=0; j<nz-1; j+=2) { 2567 i0 = vi[j]; 2568 i1 = vi[j+1]; 2569 tmp0 = tmps[i0]; 2570 tmp1 = tmps[i1]; 2571 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2572 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2573 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2574 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2575 } 2576 if (j== nz-1) { 2577 tmp0 = tmps[vi[j]]; 2578 sum1 -= v1[j] * tmp0; 2579 sum2 -= v2[j+1] * tmp0; 2580 sum3 -= v3[j+2] * tmp0; 2581 sum4 -= v4[j+3] * tmp0; 2582 } 2583 2584 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2585 sum2 -= v2[0] * tmp0; 2586 sum3 -= v3[1] * tmp0; 2587 sum4 -= v4[2] * tmp0; 2588 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2589 sum3 -= v3[0] * tmp0; 2590 sum4 -= v4[1] * tmp0; 2591 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2592 sum4 -= v4[0] * tmp0; 2593 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2594 break; 2595 case 5: 2596 sum1 = tmp[row]; 2597 sum2 = tmp[row -1]; 2598 sum3 = tmp[row -2]; 2599 sum4 = tmp[row -3]; 2600 sum5 = tmp[row -4]; 2601 v2 = aa + ad[row]+1; 2602 v3 = aa + ad[row -1]+1; 2603 v4 = aa + ad[row -2]+1; 2604 v5 = aa + ad[row -3]+1; 2605 for (j=0; j<nz-1; j+=2) { 2606 i0 = vi[j]; 2607 i1 = vi[j+1]; 2608 tmp0 = tmps[i0]; 2609 tmp1 = tmps[i1]; 2610 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2611 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2612 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2613 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2614 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2615 } 2616 if (j==nz-1) { 2617 tmp0 = tmps[vi[j]]; 2618 sum1 -= v1[j] * tmp0; 2619 sum2 -= v2[j+1] * tmp0; 2620 sum3 -= v3[j+2] * tmp0; 2621 sum4 -= v4[j+3] * tmp0; 2622 sum5 -= v5[j+4] * tmp0; 2623 } 2624 2625 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2626 sum2 -= v2[0] * tmp0; 2627 sum3 -= v3[1] * tmp0; 2628 sum4 -= v4[2] * tmp0; 2629 sum5 -= v5[3] * tmp0; 2630 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2631 sum3 -= v3[0] * tmp0; 2632 sum4 -= v4[1] * tmp0; 2633 sum5 -= v5[2] * tmp0; 2634 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2635 sum4 -= v4[0] * tmp0; 2636 sum5 -= v5[1] * tmp0; 2637 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2638 sum5 -= v5[0] * tmp0; 2639 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2640 break; 2641 default: 2642 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported "); 2643 } 2644 } 2645 PetscCall(ISRestoreIndices(isrow,&rout)); 2646 PetscCall(ISRestoreIndices(iscol,&cout)); 2647 PetscCall(VecRestoreArrayRead(bb,&b)); 2648 PetscCall(VecRestoreArrayWrite(xx,&x)); 2649 PetscCall(PetscLogFlops(2.0*a->nz - A->cmap->n)); 2650 PetscFunctionReturn(0); 2651 } 2652 2653 /* 2654 Makes a longer coloring[] array and calls the usual code with that 2655 */ 2656 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2657 { 2658 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2659 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2660 PetscInt *colorused,i; 2661 ISColoringValue *newcolor; 2662 2663 PetscFunctionBegin; 2664 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2665 PetscCall(PetscMalloc1(n+1,&newcolor)); 2666 /* loop over inodes, marking a color for each column*/ 2667 row = 0; 2668 for (i=0; i<m; i++) { 2669 for (j=0; j<ns[i]; j++) { 2670 newcolor[row++] = coloring[i] + j*ncolors; 2671 } 2672 } 2673 2674 /* eliminate unneeded colors */ 2675 PetscCall(PetscCalloc1(5*ncolors,&colorused)); 2676 for (i=0; i<n; i++) { 2677 colorused[newcolor[i]] = 1; 2678 } 2679 2680 for (i=1; i<5*ncolors; i++) { 2681 colorused[i] += colorused[i-1]; 2682 } 2683 ncolors = colorused[5*ncolors-1]; 2684 for (i=0; i<n; i++) { 2685 newcolor[i] = colorused[newcolor[i]]-1; 2686 } 2687 PetscCall(PetscFree(colorused)); 2688 PetscCall(ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,PETSC_OWN_POINTER,iscoloring)); 2689 PetscCall(PetscFree(coloring)); 2690 PetscFunctionReturn(0); 2691 } 2692 2693 #include <petsc/private/kernels/blockinvert.h> 2694 2695 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2696 { 2697 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2698 PetscScalar sum1 = 0.0,sum2 = 0.0,sum3 = 0.0,sum4 = 0.0,sum5 = 0.0,tmp0,tmp1,tmp2,tmp3; 2699 MatScalar *ibdiag,*bdiag,work[25],*t; 2700 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2701 const MatScalar *v = a->a,*v1 = NULL,*v2 = NULL,*v3 = NULL,*v4 = NULL,*v5 = NULL; 2702 const PetscScalar *xb, *b; 2703 PetscReal zeropivot = 100.*PETSC_MACHINE_EPSILON, shift = 0.0; 2704 PetscInt n,m = a->inode.node_count,cnt = 0,i,j,row,i1,i2; 2705 PetscInt sz,k,ipvt[5]; 2706 PetscBool allowzeropivot,zeropivotdetected; 2707 const PetscInt *sizes = a->inode.size,*idx,*diag = a->diag,*ii = a->i; 2708 2709 PetscFunctionBegin; 2710 allowzeropivot = PetscNot(A->erroriffailure); 2711 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2712 PetscCheck(omega == 1.0,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2713 PetscCheck(fshift == 0.0,PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2714 2715 if (!a->inode.ibdiagvalid) { 2716 if (!a->inode.ibdiag) { 2717 /* calculate space needed for diagonal blocks */ 2718 for (i=0; i<m; i++) { 2719 cnt += sizes[i]*sizes[i]; 2720 } 2721 a->inode.bdiagsize = cnt; 2722 2723 PetscCall(PetscMalloc3(cnt,&a->inode.ibdiag,cnt,&a->inode.bdiag,A->rmap->n,&a->inode.ssor_work)); 2724 } 2725 2726 /* copy over the diagonal blocks and invert them */ 2727 ibdiag = a->inode.ibdiag; 2728 bdiag = a->inode.bdiag; 2729 cnt = 0; 2730 for (i=0, row = 0; i<m; i++) { 2731 for (j=0; j<sizes[i]; j++) { 2732 for (k=0; k<sizes[i]; k++) { 2733 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2734 } 2735 } 2736 PetscCall(PetscArraycpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i])); 2737 2738 switch (sizes[i]) { 2739 case 1: 2740 /* Create matrix data structure */ 2741 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) { 2742 if (allowzeropivot) { 2743 A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2744 A->factorerror_zeropivot_value = PetscAbsScalar(ibdiag[cnt]); 2745 A->factorerror_zeropivot_row = row; 2746 PetscCall(PetscInfo(A,"Zero pivot, row %" PetscInt_FMT "\n",row)); 2747 } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %" PetscInt_FMT,row); 2748 } 2749 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2750 break; 2751 case 2: 2752 PetscCall(PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected)); 2753 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2754 break; 2755 case 3: 2756 PetscCall(PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected)); 2757 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2758 break; 2759 case 4: 2760 PetscCall(PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift,allowzeropivot,&zeropivotdetected)); 2761 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2762 break; 2763 case 5: 2764 PetscCall(PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift,allowzeropivot,&zeropivotdetected)); 2765 if (zeropivotdetected) A->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 2766 break; 2767 default: 2768 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]); 2769 } 2770 cnt += sizes[i]*sizes[i]; 2771 row += sizes[i]; 2772 } 2773 a->inode.ibdiagvalid = PETSC_TRUE; 2774 } 2775 ibdiag = a->inode.ibdiag; 2776 bdiag = a->inode.bdiag; 2777 t = a->inode.ssor_work; 2778 2779 PetscCall(VecGetArray(xx,&x)); 2780 PetscCall(VecGetArrayRead(bb,&b)); 2781 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2782 if (flag & SOR_ZERO_INITIAL_GUESS) { 2783 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2784 2785 for (i=0, row=0; i<m; i++) { 2786 sz = diag[row] - ii[row]; 2787 v1 = a->a + ii[row]; 2788 idx = a->j + ii[row]; 2789 2790 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2791 switch (sizes[i]) { 2792 case 1: 2793 2794 sum1 = b[row]; 2795 for (n = 0; n<sz-1; n+=2) { 2796 i1 = idx[0]; 2797 i2 = idx[1]; 2798 idx += 2; 2799 tmp0 = x[i1]; 2800 tmp1 = x[i2]; 2801 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2802 } 2803 2804 if (n == sz-1) { 2805 tmp0 = x[*idx]; 2806 sum1 -= *v1 * tmp0; 2807 } 2808 t[row] = sum1; 2809 x[row++] = sum1*(*ibdiag++); 2810 break; 2811 case 2: 2812 v2 = a->a + ii[row+1]; 2813 sum1 = b[row]; 2814 sum2 = b[row+1]; 2815 for (n = 0; n<sz-1; n+=2) { 2816 i1 = idx[0]; 2817 i2 = idx[1]; 2818 idx += 2; 2819 tmp0 = x[i1]; 2820 tmp1 = x[i2]; 2821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2822 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2823 } 2824 2825 if (n == sz-1) { 2826 tmp0 = x[*idx]; 2827 sum1 -= v1[0] * tmp0; 2828 sum2 -= v2[0] * tmp0; 2829 } 2830 t[row] = sum1; 2831 t[row+1] = sum2; 2832 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2833 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2834 ibdiag += 4; 2835 break; 2836 case 3: 2837 v2 = a->a + ii[row+1]; 2838 v3 = a->a + ii[row+2]; 2839 sum1 = b[row]; 2840 sum2 = b[row+1]; 2841 sum3 = b[row+2]; 2842 for (n = 0; n<sz-1; n+=2) { 2843 i1 = idx[0]; 2844 i2 = idx[1]; 2845 idx += 2; 2846 tmp0 = x[i1]; 2847 tmp1 = x[i2]; 2848 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2849 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2850 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2851 } 2852 2853 if (n == sz-1) { 2854 tmp0 = x[*idx]; 2855 sum1 -= v1[0] * tmp0; 2856 sum2 -= v2[0] * tmp0; 2857 sum3 -= v3[0] * tmp0; 2858 } 2859 t[row] = sum1; 2860 t[row+1] = sum2; 2861 t[row+2] = sum3; 2862 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2863 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2864 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2865 ibdiag += 9; 2866 break; 2867 case 4: 2868 v2 = a->a + ii[row+1]; 2869 v3 = a->a + ii[row+2]; 2870 v4 = a->a + ii[row+3]; 2871 sum1 = b[row]; 2872 sum2 = b[row+1]; 2873 sum3 = b[row+2]; 2874 sum4 = b[row+3]; 2875 for (n = 0; n<sz-1; n+=2) { 2876 i1 = idx[0]; 2877 i2 = idx[1]; 2878 idx += 2; 2879 tmp0 = x[i1]; 2880 tmp1 = x[i2]; 2881 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2882 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2883 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2884 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2885 } 2886 2887 if (n == sz-1) { 2888 tmp0 = x[*idx]; 2889 sum1 -= v1[0] * tmp0; 2890 sum2 -= v2[0] * tmp0; 2891 sum3 -= v3[0] * tmp0; 2892 sum4 -= v4[0] * tmp0; 2893 } 2894 t[row] = sum1; 2895 t[row+1] = sum2; 2896 t[row+2] = sum3; 2897 t[row+3] = sum4; 2898 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2899 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2900 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2901 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2902 ibdiag += 16; 2903 break; 2904 case 5: 2905 v2 = a->a + ii[row+1]; 2906 v3 = a->a + ii[row+2]; 2907 v4 = a->a + ii[row+3]; 2908 v5 = a->a + ii[row+4]; 2909 sum1 = b[row]; 2910 sum2 = b[row+1]; 2911 sum3 = b[row+2]; 2912 sum4 = b[row+3]; 2913 sum5 = b[row+4]; 2914 for (n = 0; n<sz-1; n+=2) { 2915 i1 = idx[0]; 2916 i2 = idx[1]; 2917 idx += 2; 2918 tmp0 = x[i1]; 2919 tmp1 = x[i2]; 2920 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2921 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2922 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2923 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2924 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2925 } 2926 2927 if (n == sz-1) { 2928 tmp0 = x[*idx]; 2929 sum1 -= v1[0] * tmp0; 2930 sum2 -= v2[0] * tmp0; 2931 sum3 -= v3[0] * tmp0; 2932 sum4 -= v4[0] * tmp0; 2933 sum5 -= v5[0] * tmp0; 2934 } 2935 t[row] = sum1; 2936 t[row+1] = sum2; 2937 t[row+2] = sum3; 2938 t[row+3] = sum4; 2939 t[row+4] = sum5; 2940 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2941 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2942 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2943 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2944 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2945 ibdiag += 25; 2946 break; 2947 default: 2948 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]); 2949 } 2950 } 2951 2952 xb = t; 2953 PetscCall(PetscLogFlops(a->nz)); 2954 } else xb = b; 2955 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 2956 2957 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2958 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2959 ibdiag -= sizes[i]*sizes[i]; 2960 sz = ii[row+1] - diag[row] - 1; 2961 v1 = a->a + diag[row] + 1; 2962 idx = a->j + diag[row] + 1; 2963 2964 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2965 switch (sizes[i]) { 2966 case 1: 2967 2968 sum1 = xb[row]; 2969 for (n = 0; n<sz-1; n+=2) { 2970 i1 = idx[0]; 2971 i2 = idx[1]; 2972 idx += 2; 2973 tmp0 = x[i1]; 2974 tmp1 = x[i2]; 2975 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2976 } 2977 2978 if (n == sz-1) { 2979 tmp0 = x[*idx]; 2980 sum1 -= *v1*tmp0; 2981 } 2982 x[row--] = sum1*(*ibdiag); 2983 break; 2984 2985 case 2: 2986 2987 sum1 = xb[row]; 2988 sum2 = xb[row-1]; 2989 /* note that sum1 is associated with the second of the two rows */ 2990 v2 = a->a + diag[row-1] + 2; 2991 for (n = 0; n<sz-1; n+=2) { 2992 i1 = idx[0]; 2993 i2 = idx[1]; 2994 idx += 2; 2995 tmp0 = x[i1]; 2996 tmp1 = x[i2]; 2997 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2998 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2999 } 3000 3001 if (n == sz-1) { 3002 tmp0 = x[*idx]; 3003 sum1 -= *v1*tmp0; 3004 sum2 -= *v2*tmp0; 3005 } 3006 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3007 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3008 break; 3009 case 3: 3010 3011 sum1 = xb[row]; 3012 sum2 = xb[row-1]; 3013 sum3 = xb[row-2]; 3014 v2 = a->a + diag[row-1] + 2; 3015 v3 = a->a + diag[row-2] + 3; 3016 for (n = 0; n<sz-1; n+=2) { 3017 i1 = idx[0]; 3018 i2 = idx[1]; 3019 idx += 2; 3020 tmp0 = x[i1]; 3021 tmp1 = x[i2]; 3022 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3023 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3024 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3025 } 3026 3027 if (n == sz-1) { 3028 tmp0 = x[*idx]; 3029 sum1 -= *v1*tmp0; 3030 sum2 -= *v2*tmp0; 3031 sum3 -= *v3*tmp0; 3032 } 3033 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3034 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3035 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3036 break; 3037 case 4: 3038 3039 sum1 = xb[row]; 3040 sum2 = xb[row-1]; 3041 sum3 = xb[row-2]; 3042 sum4 = xb[row-3]; 3043 v2 = a->a + diag[row-1] + 2; 3044 v3 = a->a + diag[row-2] + 3; 3045 v4 = a->a + diag[row-3] + 4; 3046 for (n = 0; n<sz-1; n+=2) { 3047 i1 = idx[0]; 3048 i2 = idx[1]; 3049 idx += 2; 3050 tmp0 = x[i1]; 3051 tmp1 = x[i2]; 3052 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3053 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3054 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3055 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3056 } 3057 3058 if (n == sz-1) { 3059 tmp0 = x[*idx]; 3060 sum1 -= *v1*tmp0; 3061 sum2 -= *v2*tmp0; 3062 sum3 -= *v3*tmp0; 3063 sum4 -= *v4*tmp0; 3064 } 3065 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3066 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3067 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3068 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3069 break; 3070 case 5: 3071 3072 sum1 = xb[row]; 3073 sum2 = xb[row-1]; 3074 sum3 = xb[row-2]; 3075 sum4 = xb[row-3]; 3076 sum5 = xb[row-4]; 3077 v2 = a->a + diag[row-1] + 2; 3078 v3 = a->a + diag[row-2] + 3; 3079 v4 = a->a + diag[row-3] + 4; 3080 v5 = a->a + diag[row-4] + 5; 3081 for (n = 0; n<sz-1; n+=2) { 3082 i1 = idx[0]; 3083 i2 = idx[1]; 3084 idx += 2; 3085 tmp0 = x[i1]; 3086 tmp1 = x[i2]; 3087 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3088 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3089 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3090 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3091 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3092 } 3093 3094 if (n == sz-1) { 3095 tmp0 = x[*idx]; 3096 sum1 -= *v1*tmp0; 3097 sum2 -= *v2*tmp0; 3098 sum3 -= *v3*tmp0; 3099 sum4 -= *v4*tmp0; 3100 sum5 -= *v5*tmp0; 3101 } 3102 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3103 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3104 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3105 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3106 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3107 break; 3108 default: 3109 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]); 3110 } 3111 } 3112 3113 PetscCall(PetscLogFlops(a->nz)); 3114 } 3115 its--; 3116 } 3117 while (its--) { 3118 3119 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3120 for (i=0, row=0, ibdiag = a->inode.ibdiag; 3121 i<m; 3122 row += sizes[i], ibdiag += sizes[i]*sizes[i], i++) { 3123 3124 sz = diag[row] - ii[row]; 3125 v1 = a->a + ii[row]; 3126 idx = a->j + ii[row]; 3127 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3128 switch (sizes[i]) { 3129 case 1: 3130 sum1 = b[row]; 3131 for (n = 0; n<sz-1; n+=2) { 3132 i1 = idx[0]; 3133 i2 = idx[1]; 3134 idx += 2; 3135 tmp0 = x[i1]; 3136 tmp1 = x[i2]; 3137 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3138 } 3139 if (n == sz-1) { 3140 tmp0 = x[*idx++]; 3141 sum1 -= *v1 * tmp0; 3142 v1++; 3143 } 3144 t[row] = sum1; 3145 sz = ii[row+1] - diag[row] - 1; 3146 idx = a->j + diag[row] + 1; 3147 v1 += 1; 3148 for (n = 0; n<sz-1; n+=2) { 3149 i1 = idx[0]; 3150 i2 = idx[1]; 3151 idx += 2; 3152 tmp0 = x[i1]; 3153 tmp1 = x[i2]; 3154 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3155 } 3156 if (n == sz-1) { 3157 tmp0 = x[*idx++]; 3158 sum1 -= *v1 * tmp0; 3159 } 3160 /* in MatSOR_SeqAIJ this line would be 3161 * 3162 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3163 * 3164 * but omega == 1, so this becomes 3165 * 3166 * x[row] = sum1*(*ibdiag++); 3167 * 3168 */ 3169 x[row] = sum1*(*ibdiag); 3170 break; 3171 case 2: 3172 v2 = a->a + ii[row+1]; 3173 sum1 = b[row]; 3174 sum2 = b[row+1]; 3175 for (n = 0; n<sz-1; n+=2) { 3176 i1 = idx[0]; 3177 i2 = idx[1]; 3178 idx += 2; 3179 tmp0 = x[i1]; 3180 tmp1 = x[i2]; 3181 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3182 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3183 } 3184 if (n == sz-1) { 3185 tmp0 = x[*idx++]; 3186 sum1 -= v1[0] * tmp0; 3187 sum2 -= v2[0] * tmp0; 3188 v1++; v2++; 3189 } 3190 t[row] = sum1; 3191 t[row+1] = sum2; 3192 sz = ii[row+1] - diag[row] - 2; 3193 idx = a->j + diag[row] + 2; 3194 v1 += 2; 3195 v2 += 2; 3196 for (n = 0; n<sz-1; n+=2) { 3197 i1 = idx[0]; 3198 i2 = idx[1]; 3199 idx += 2; 3200 tmp0 = x[i1]; 3201 tmp1 = x[i2]; 3202 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3203 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3204 } 3205 if (n == sz-1) { 3206 tmp0 = x[*idx]; 3207 sum1 -= v1[0] * tmp0; 3208 sum2 -= v2[0] * tmp0; 3209 } 3210 x[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3211 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3212 break; 3213 case 3: 3214 v2 = a->a + ii[row+1]; 3215 v3 = a->a + ii[row+2]; 3216 sum1 = b[row]; 3217 sum2 = b[row+1]; 3218 sum3 = b[row+2]; 3219 for (n = 0; n<sz-1; n+=2) { 3220 i1 = idx[0]; 3221 i2 = idx[1]; 3222 idx += 2; 3223 tmp0 = x[i1]; 3224 tmp1 = x[i2]; 3225 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3226 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3227 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3228 } 3229 if (n == sz-1) { 3230 tmp0 = x[*idx++]; 3231 sum1 -= v1[0] * tmp0; 3232 sum2 -= v2[0] * tmp0; 3233 sum3 -= v3[0] * tmp0; 3234 v1++; v2++; v3++; 3235 } 3236 t[row] = sum1; 3237 t[row+1] = sum2; 3238 t[row+2] = sum3; 3239 sz = ii[row+1] - diag[row] - 3; 3240 idx = a->j + diag[row] + 3; 3241 v1 += 3; 3242 v2 += 3; 3243 v3 += 3; 3244 for (n = 0; n<sz-1; n+=2) { 3245 i1 = idx[0]; 3246 i2 = idx[1]; 3247 idx += 2; 3248 tmp0 = x[i1]; 3249 tmp1 = x[i2]; 3250 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3251 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3252 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3253 } 3254 if (n == sz-1) { 3255 tmp0 = x[*idx]; 3256 sum1 -= v1[0] * tmp0; 3257 sum2 -= v2[0] * tmp0; 3258 sum3 -= v3[0] * tmp0; 3259 } 3260 x[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3261 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3262 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3263 break; 3264 case 4: 3265 v2 = a->a + ii[row+1]; 3266 v3 = a->a + ii[row+2]; 3267 v4 = a->a + ii[row+3]; 3268 sum1 = b[row]; 3269 sum2 = b[row+1]; 3270 sum3 = b[row+2]; 3271 sum4 = b[row+3]; 3272 for (n = 0; n<sz-1; n+=2) { 3273 i1 = idx[0]; 3274 i2 = idx[1]; 3275 idx += 2; 3276 tmp0 = x[i1]; 3277 tmp1 = x[i2]; 3278 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3279 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3280 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3281 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3282 } 3283 if (n == sz-1) { 3284 tmp0 = x[*idx++]; 3285 sum1 -= v1[0] * tmp0; 3286 sum2 -= v2[0] * tmp0; 3287 sum3 -= v3[0] * tmp0; 3288 sum4 -= v4[0] * tmp0; 3289 v1++; v2++; v3++; v4++; 3290 } 3291 t[row] = sum1; 3292 t[row+1] = sum2; 3293 t[row+2] = sum3; 3294 t[row+3] = sum4; 3295 sz = ii[row+1] - diag[row] - 4; 3296 idx = a->j + diag[row] + 4; 3297 v1 += 4; 3298 v2 += 4; 3299 v3 += 4; 3300 v4 += 4; 3301 for (n = 0; n<sz-1; n+=2) { 3302 i1 = idx[0]; 3303 i2 = idx[1]; 3304 idx += 2; 3305 tmp0 = x[i1]; 3306 tmp1 = x[i2]; 3307 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3308 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3309 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3310 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3311 } 3312 if (n == sz-1) { 3313 tmp0 = x[*idx]; 3314 sum1 -= v1[0] * tmp0; 3315 sum2 -= v2[0] * tmp0; 3316 sum3 -= v3[0] * tmp0; 3317 sum4 -= v4[0] * tmp0; 3318 } 3319 x[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3320 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3321 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3322 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3323 break; 3324 case 5: 3325 v2 = a->a + ii[row+1]; 3326 v3 = a->a + ii[row+2]; 3327 v4 = a->a + ii[row+3]; 3328 v5 = a->a + ii[row+4]; 3329 sum1 = b[row]; 3330 sum2 = b[row+1]; 3331 sum3 = b[row+2]; 3332 sum4 = b[row+3]; 3333 sum5 = b[row+4]; 3334 for (n = 0; n<sz-1; n+=2) { 3335 i1 = idx[0]; 3336 i2 = idx[1]; 3337 idx += 2; 3338 tmp0 = x[i1]; 3339 tmp1 = x[i2]; 3340 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3341 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3342 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3343 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3344 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3345 } 3346 if (n == sz-1) { 3347 tmp0 = x[*idx++]; 3348 sum1 -= v1[0] * tmp0; 3349 sum2 -= v2[0] * tmp0; 3350 sum3 -= v3[0] * tmp0; 3351 sum4 -= v4[0] * tmp0; 3352 sum5 -= v5[0] * tmp0; 3353 v1++; v2++; v3++; v4++; v5++; 3354 } 3355 t[row] = sum1; 3356 t[row+1] = sum2; 3357 t[row+2] = sum3; 3358 t[row+3] = sum4; 3359 t[row+4] = sum5; 3360 sz = ii[row+1] - diag[row] - 5; 3361 idx = a->j + diag[row] + 5; 3362 v1 += 5; 3363 v2 += 5; 3364 v3 += 5; 3365 v4 += 5; 3366 v5 += 5; 3367 for (n = 0; n<sz-1; n+=2) { 3368 i1 = idx[0]; 3369 i2 = idx[1]; 3370 idx += 2; 3371 tmp0 = x[i1]; 3372 tmp1 = x[i2]; 3373 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3374 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3375 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3376 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3377 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3378 } 3379 if (n == sz-1) { 3380 tmp0 = x[*idx]; 3381 sum1 -= v1[0] * tmp0; 3382 sum2 -= v2[0] * tmp0; 3383 sum3 -= v3[0] * tmp0; 3384 sum4 -= v4[0] * tmp0; 3385 sum5 -= v5[0] * tmp0; 3386 } 3387 x[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3388 x[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3389 x[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3390 x[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3391 x[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3392 break; 3393 default: 3394 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]); 3395 } 3396 } 3397 xb = t; 3398 PetscCall(PetscLogFlops(2.0*a->nz)); /* undercounts diag inverse */ 3399 } else xb = b; 3400 3401 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3402 3403 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3404 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3405 ibdiag -= sizes[i]*sizes[i]; 3406 3407 /* set RHS */ 3408 if (xb == b) { 3409 /* whole (old way) */ 3410 sz = ii[row+1] - ii[row]; 3411 idx = a->j + ii[row]; 3412 switch (sizes[i]) { 3413 case 5: 3414 v5 = a->a + ii[row-4]; 3415 case 4: /* fall through */ 3416 v4 = a->a + ii[row-3]; 3417 case 3: 3418 v3 = a->a + ii[row-2]; 3419 case 2: 3420 v2 = a->a + ii[row-1]; 3421 case 1: 3422 v1 = a->a + ii[row]; 3423 break; 3424 default: 3425 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]); 3426 } 3427 } else { 3428 /* upper, no diag */ 3429 sz = ii[row+1] - diag[row] - 1; 3430 idx = a->j + diag[row] + 1; 3431 switch (sizes[i]) { 3432 case 5: 3433 v5 = a->a + diag[row-4] + 5; 3434 case 4: /* fall through */ 3435 v4 = a->a + diag[row-3] + 4; 3436 case 3: 3437 v3 = a->a + diag[row-2] + 3; 3438 case 2: 3439 v2 = a->a + diag[row-1] + 2; 3440 case 1: 3441 v1 = a->a + diag[row] + 1; 3442 } 3443 } 3444 /* set sum */ 3445 switch (sizes[i]) { 3446 case 5: 3447 sum5 = xb[row-4]; 3448 case 4: /* fall through */ 3449 sum4 = xb[row-3]; 3450 case 3: 3451 sum3 = xb[row-2]; 3452 case 2: 3453 sum2 = xb[row-1]; 3454 case 1: 3455 /* note that sum1 is associated with the last row */ 3456 sum1 = xb[row]; 3457 } 3458 /* do sums */ 3459 for (n = 0; n<sz-1; n+=2) { 3460 i1 = idx[0]; 3461 i2 = idx[1]; 3462 idx += 2; 3463 tmp0 = x[i1]; 3464 tmp1 = x[i2]; 3465 switch (sizes[i]) { 3466 case 5: 3467 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3468 case 4: /* fall through */ 3469 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3470 case 3: 3471 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3472 case 2: 3473 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3474 case 1: 3475 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3476 } 3477 } 3478 /* ragged edge */ 3479 if (n == sz-1) { 3480 tmp0 = x[*idx]; 3481 switch (sizes[i]) { 3482 case 5: 3483 sum5 -= *v5*tmp0; 3484 case 4: /* fall through */ 3485 sum4 -= *v4*tmp0; 3486 case 3: 3487 sum3 -= *v3*tmp0; 3488 case 2: 3489 sum2 -= *v2*tmp0; 3490 case 1: 3491 sum1 -= *v1*tmp0; 3492 } 3493 } 3494 /* update */ 3495 if (xb == b) { 3496 /* whole (old way) w/ diag */ 3497 switch (sizes[i]) { 3498 case 5: 3499 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3500 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3501 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3502 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3503 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3504 break; 3505 case 4: 3506 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3507 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3508 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3509 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3510 break; 3511 case 3: 3512 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3513 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3514 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3515 break; 3516 case 2: 3517 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3518 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3519 break; 3520 case 1: 3521 x[row--] += sum1*(*ibdiag); 3522 break; 3523 } 3524 } else { 3525 /* no diag so set = */ 3526 switch (sizes[i]) { 3527 case 5: 3528 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3529 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3530 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3531 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3532 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3533 break; 3534 case 4: 3535 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3536 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3537 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3538 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3539 break; 3540 case 3: 3541 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3542 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3543 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3544 break; 3545 case 2: 3546 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3547 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3548 break; 3549 case 1: 3550 x[row--] = sum1*(*ibdiag); 3551 break; 3552 } 3553 } 3554 } 3555 if (xb == b) { 3556 PetscCall(PetscLogFlops(2.0*a->nz)); 3557 } else { 3558 PetscCall(PetscLogFlops(a->nz)); /* assumes 1/2 in upper, undercounts diag inverse */ 3559 } 3560 } 3561 } 3562 if (flag & SOR_EISENSTAT) { 3563 /* 3564 Apply (U + D)^-1 where D is now the block diagonal 3565 */ 3566 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3567 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3568 ibdiag -= sizes[i]*sizes[i]; 3569 sz = ii[row+1] - diag[row] - 1; 3570 v1 = a->a + diag[row] + 1; 3571 idx = a->j + diag[row] + 1; 3572 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3573 switch (sizes[i]) { 3574 case 1: 3575 3576 sum1 = b[row]; 3577 for (n = 0; n<sz-1; n+=2) { 3578 i1 = idx[0]; 3579 i2 = idx[1]; 3580 idx += 2; 3581 tmp0 = x[i1]; 3582 tmp1 = x[i2]; 3583 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3584 } 3585 3586 if (n == sz-1) { 3587 tmp0 = x[*idx]; 3588 sum1 -= *v1*tmp0; 3589 } 3590 x[row] = sum1*(*ibdiag);row--; 3591 break; 3592 3593 case 2: 3594 3595 sum1 = b[row]; 3596 sum2 = b[row-1]; 3597 /* note that sum1 is associated with the second of the two rows */ 3598 v2 = a->a + diag[row-1] + 2; 3599 for (n = 0; n<sz-1; n+=2) { 3600 i1 = idx[0]; 3601 i2 = idx[1]; 3602 idx += 2; 3603 tmp0 = x[i1]; 3604 tmp1 = x[i2]; 3605 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3606 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3607 } 3608 3609 if (n == sz-1) { 3610 tmp0 = x[*idx]; 3611 sum1 -= *v1*tmp0; 3612 sum2 -= *v2*tmp0; 3613 } 3614 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3615 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3616 row -= 2; 3617 break; 3618 case 3: 3619 3620 sum1 = b[row]; 3621 sum2 = b[row-1]; 3622 sum3 = b[row-2]; 3623 v2 = a->a + diag[row-1] + 2; 3624 v3 = a->a + diag[row-2] + 3; 3625 for (n = 0; n<sz-1; n+=2) { 3626 i1 = idx[0]; 3627 i2 = idx[1]; 3628 idx += 2; 3629 tmp0 = x[i1]; 3630 tmp1 = x[i2]; 3631 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3632 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3633 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3634 } 3635 3636 if (n == sz-1) { 3637 tmp0 = x[*idx]; 3638 sum1 -= *v1*tmp0; 3639 sum2 -= *v2*tmp0; 3640 sum3 -= *v3*tmp0; 3641 } 3642 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3643 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3644 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3645 row -= 3; 3646 break; 3647 case 4: 3648 3649 sum1 = b[row]; 3650 sum2 = b[row-1]; 3651 sum3 = b[row-2]; 3652 sum4 = b[row-3]; 3653 v2 = a->a + diag[row-1] + 2; 3654 v3 = a->a + diag[row-2] + 3; 3655 v4 = a->a + diag[row-3] + 4; 3656 for (n = 0; n<sz-1; n+=2) { 3657 i1 = idx[0]; 3658 i2 = idx[1]; 3659 idx += 2; 3660 tmp0 = x[i1]; 3661 tmp1 = x[i2]; 3662 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3663 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3664 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3665 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3666 } 3667 3668 if (n == sz-1) { 3669 tmp0 = x[*idx]; 3670 sum1 -= *v1*tmp0; 3671 sum2 -= *v2*tmp0; 3672 sum3 -= *v3*tmp0; 3673 sum4 -= *v4*tmp0; 3674 } 3675 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3676 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3677 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3678 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3679 row -= 4; 3680 break; 3681 case 5: 3682 3683 sum1 = b[row]; 3684 sum2 = b[row-1]; 3685 sum3 = b[row-2]; 3686 sum4 = b[row-3]; 3687 sum5 = b[row-4]; 3688 v2 = a->a + diag[row-1] + 2; 3689 v3 = a->a + diag[row-2] + 3; 3690 v4 = a->a + diag[row-3] + 4; 3691 v5 = a->a + diag[row-4] + 5; 3692 for (n = 0; n<sz-1; n+=2) { 3693 i1 = idx[0]; 3694 i2 = idx[1]; 3695 idx += 2; 3696 tmp0 = x[i1]; 3697 tmp1 = x[i2]; 3698 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3699 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3700 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3701 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3702 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3703 } 3704 3705 if (n == sz-1) { 3706 tmp0 = x[*idx]; 3707 sum1 -= *v1*tmp0; 3708 sum2 -= *v2*tmp0; 3709 sum3 -= *v3*tmp0; 3710 sum4 -= *v4*tmp0; 3711 sum5 -= *v5*tmp0; 3712 } 3713 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3714 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3715 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3716 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3717 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3718 row -= 5; 3719 break; 3720 default: 3721 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]); 3722 } 3723 } 3724 PetscCall(PetscLogFlops(a->nz)); 3725 3726 /* 3727 t = b - D x where D is the block diagonal 3728 */ 3729 cnt = 0; 3730 for (i=0, row=0; i<m; i++) { 3731 switch (sizes[i]) { 3732 case 1: 3733 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3734 break; 3735 case 2: 3736 x1 = x[row]; x2 = x[row+1]; 3737 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3738 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3739 t[row] = b[row] - tmp1; 3740 t[row+1] = b[row+1] - tmp2; row += 2; 3741 cnt += 4; 3742 break; 3743 case 3: 3744 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3745 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3746 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3747 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3748 t[row] = b[row] - tmp1; 3749 t[row+1] = b[row+1] - tmp2; 3750 t[row+2] = b[row+2] - tmp3; row += 3; 3751 cnt += 9; 3752 break; 3753 case 4: 3754 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3755 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3756 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3757 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3758 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3759 t[row] = b[row] - tmp1; 3760 t[row+1] = b[row+1] - tmp2; 3761 t[row+2] = b[row+2] - tmp3; 3762 t[row+3] = b[row+3] - tmp4; row += 4; 3763 cnt += 16; 3764 break; 3765 case 5: 3766 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3767 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3768 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3769 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3770 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3771 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3772 t[row] = b[row] - tmp1; 3773 t[row+1] = b[row+1] - tmp2; 3774 t[row+2] = b[row+2] - tmp3; 3775 t[row+3] = b[row+3] - tmp4; 3776 t[row+4] = b[row+4] - tmp5;row += 5; 3777 cnt += 25; 3778 break; 3779 default: 3780 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]); 3781 } 3782 } 3783 PetscCall(PetscLogFlops(m)); 3784 3785 /* 3786 Apply (L + D)^-1 where D is the block diagonal 3787 */ 3788 for (i=0, row=0; i<m; i++) { 3789 sz = diag[row] - ii[row]; 3790 v1 = a->a + ii[row]; 3791 idx = a->j + ii[row]; 3792 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3793 switch (sizes[i]) { 3794 case 1: 3795 3796 sum1 = t[row]; 3797 for (n = 0; n<sz-1; n+=2) { 3798 i1 = idx[0]; 3799 i2 = idx[1]; 3800 idx += 2; 3801 tmp0 = t[i1]; 3802 tmp1 = t[i2]; 3803 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3804 } 3805 3806 if (n == sz-1) { 3807 tmp0 = t[*idx]; 3808 sum1 -= *v1 * tmp0; 3809 } 3810 x[row] += t[row] = sum1*(*ibdiag++); row++; 3811 break; 3812 case 2: 3813 v2 = a->a + ii[row+1]; 3814 sum1 = t[row]; 3815 sum2 = t[row+1]; 3816 for (n = 0; n<sz-1; n+=2) { 3817 i1 = idx[0]; 3818 i2 = idx[1]; 3819 idx += 2; 3820 tmp0 = t[i1]; 3821 tmp1 = t[i2]; 3822 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3823 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3824 } 3825 3826 if (n == sz-1) { 3827 tmp0 = t[*idx]; 3828 sum1 -= v1[0] * tmp0; 3829 sum2 -= v2[0] * tmp0; 3830 } 3831 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3832 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3833 ibdiag += 4; row += 2; 3834 break; 3835 case 3: 3836 v2 = a->a + ii[row+1]; 3837 v3 = a->a + ii[row+2]; 3838 sum1 = t[row]; 3839 sum2 = t[row+1]; 3840 sum3 = t[row+2]; 3841 for (n = 0; n<sz-1; n+=2) { 3842 i1 = idx[0]; 3843 i2 = idx[1]; 3844 idx += 2; 3845 tmp0 = t[i1]; 3846 tmp1 = t[i2]; 3847 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3848 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3849 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3850 } 3851 3852 if (n == sz-1) { 3853 tmp0 = t[*idx]; 3854 sum1 -= v1[0] * tmp0; 3855 sum2 -= v2[0] * tmp0; 3856 sum3 -= v3[0] * tmp0; 3857 } 3858 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3859 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3860 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3861 ibdiag += 9; row += 3; 3862 break; 3863 case 4: 3864 v2 = a->a + ii[row+1]; 3865 v3 = a->a + ii[row+2]; 3866 v4 = a->a + ii[row+3]; 3867 sum1 = t[row]; 3868 sum2 = t[row+1]; 3869 sum3 = t[row+2]; 3870 sum4 = t[row+3]; 3871 for (n = 0; n<sz-1; n+=2) { 3872 i1 = idx[0]; 3873 i2 = idx[1]; 3874 idx += 2; 3875 tmp0 = t[i1]; 3876 tmp1 = t[i2]; 3877 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3878 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3879 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3880 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3881 } 3882 3883 if (n == sz-1) { 3884 tmp0 = t[*idx]; 3885 sum1 -= v1[0] * tmp0; 3886 sum2 -= v2[0] * tmp0; 3887 sum3 -= v3[0] * tmp0; 3888 sum4 -= v4[0] * tmp0; 3889 } 3890 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3891 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3892 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3893 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3894 ibdiag += 16; row += 4; 3895 break; 3896 case 5: 3897 v2 = a->a + ii[row+1]; 3898 v3 = a->a + ii[row+2]; 3899 v4 = a->a + ii[row+3]; 3900 v5 = a->a + ii[row+4]; 3901 sum1 = t[row]; 3902 sum2 = t[row+1]; 3903 sum3 = t[row+2]; 3904 sum4 = t[row+3]; 3905 sum5 = t[row+4]; 3906 for (n = 0; n<sz-1; n+=2) { 3907 i1 = idx[0]; 3908 i2 = idx[1]; 3909 idx += 2; 3910 tmp0 = t[i1]; 3911 tmp1 = t[i2]; 3912 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3913 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3914 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3915 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3916 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3917 } 3918 3919 if (n == sz-1) { 3920 tmp0 = t[*idx]; 3921 sum1 -= v1[0] * tmp0; 3922 sum2 -= v2[0] * tmp0; 3923 sum3 -= v3[0] * tmp0; 3924 sum4 -= v4[0] * tmp0; 3925 sum5 -= v5[0] * tmp0; 3926 } 3927 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3928 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3929 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3930 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3931 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3932 ibdiag += 25; row += 5; 3933 break; 3934 default: 3935 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]); 3936 } 3937 } 3938 PetscCall(PetscLogFlops(a->nz)); 3939 } 3940 PetscCall(VecRestoreArray(xx,&x)); 3941 PetscCall(VecRestoreArrayRead(bb,&b)); 3942 PetscFunctionReturn(0); 3943 } 3944 3945 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3946 { 3947 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3948 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3949 const MatScalar *bdiag = a->inode.bdiag; 3950 const PetscScalar *b; 3951 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3952 const PetscInt *sizes = a->inode.size; 3953 3954 PetscFunctionBegin; 3955 PetscCheck(a->inode.size,PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 3956 PetscCall(VecGetArray(xx,&x)); 3957 PetscCall(VecGetArrayRead(bb,&b)); 3958 cnt = 0; 3959 for (i=0, row=0; i<m; i++) { 3960 switch (sizes[i]) { 3961 case 1: 3962 x[row] = b[row]*bdiag[cnt++];row++; 3963 break; 3964 case 2: 3965 x1 = b[row]; x2 = b[row+1]; 3966 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3967 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3968 x[row++] = tmp1; 3969 x[row++] = tmp2; 3970 cnt += 4; 3971 break; 3972 case 3: 3973 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3974 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3975 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3976 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3977 x[row++] = tmp1; 3978 x[row++] = tmp2; 3979 x[row++] = tmp3; 3980 cnt += 9; 3981 break; 3982 case 4: 3983 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3984 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3985 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3986 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3987 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3988 x[row++] = tmp1; 3989 x[row++] = tmp2; 3990 x[row++] = tmp3; 3991 x[row++] = tmp4; 3992 cnt += 16; 3993 break; 3994 case 5: 3995 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3996 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3997 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3998 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3999 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 4000 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 4001 x[row++] = tmp1; 4002 x[row++] = tmp2; 4003 x[row++] = tmp3; 4004 x[row++] = tmp4; 4005 x[row++] = tmp5; 4006 cnt += 25; 4007 break; 4008 default: 4009 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %" PetscInt_FMT " not supported",sizes[i]); 4010 } 4011 } 4012 PetscCall(PetscLogFlops(2.0*cnt)); 4013 PetscCall(VecRestoreArray(xx,&x)); 4014 PetscCall(VecRestoreArrayRead(bb,&b)); 4015 PetscFunctionReturn(0); 4016 } 4017 4018 static PetscErrorCode MatSeqAIJ_Inode_ResetOps(Mat A) 4019 { 4020 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4021 4022 PetscFunctionBegin; 4023 a->inode.node_count = 0; 4024 a->inode.use = PETSC_FALSE; 4025 a->inode.checked = PETSC_FALSE; 4026 a->inode.mat_nonzerostate = -1; 4027 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4028 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4029 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4030 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4031 A->ops->coloringpatch = NULL; 4032 A->ops->multdiagonalblock = NULL; 4033 if (A->factortype) { 4034 A->ops->solve = MatSolve_SeqAIJ_inplace; 4035 } 4036 PetscFunctionReturn(0); 4037 } 4038 4039 /* 4040 samestructure indicates that the matrix has not changed its nonzero structure so we 4041 do not need to recompute the inodes 4042 */ 4043 PetscErrorCode MatSeqAIJCheckInode(Mat A) 4044 { 4045 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4046 PetscInt i,j,m,nzx,nzy,*ns,node_count,blk_size; 4047 PetscBool flag; 4048 const PetscInt *idx,*idy,*ii; 4049 4050 PetscFunctionBegin; 4051 if (!a->inode.use) { 4052 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4053 PetscCall(PetscFree(a->inode.size)); 4054 PetscFunctionReturn(0); 4055 } 4056 if (a->inode.checked && A->nonzerostate == a->inode.mat_nonzerostate) PetscFunctionReturn(0); 4057 4058 m = A->rmap->n; 4059 if (!a->inode.size) PetscCall(PetscMalloc1(m+1,&a->inode.size)); 4060 ns = a->inode.size; 4061 4062 i = 0; 4063 node_count = 0; 4064 idx = a->j; 4065 ii = a->i; 4066 while (i < m) { /* For each row */ 4067 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 4068 /* Limits the number of elements in a node to 'a->inode.limit' */ 4069 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4070 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 4071 if (nzy != nzx) break; 4072 idy += nzx; /* Same nonzero pattern */ 4073 PetscCall(PetscArraycmp(idx,idy,nzx,&flag)); 4074 if (!flag) break; 4075 } 4076 ns[node_count++] = blk_size; 4077 idx += blk_size*nzx; 4078 i = j; 4079 } 4080 4081 /* If not enough inodes found,, do not use inode version of the routines */ 4082 if (!m || node_count > .8*m) { 4083 PetscCall(MatSeqAIJ_Inode_ResetOps(A)); 4084 PetscCall(PetscFree(a->inode.size)); 4085 PetscCall(PetscInfo(A,"Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n",node_count,m)); 4086 } else { 4087 if (!A->factortype) { 4088 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4089 if (A->rmap->n == A->cmap->n) { 4090 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4091 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4092 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4093 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4094 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4095 } 4096 } else { 4097 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4098 } 4099 a->inode.node_count = node_count; 4100 PetscCall(PetscInfo(A,"Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n",node_count,m,a->inode.limit)); 4101 } 4102 a->inode.checked = PETSC_TRUE; 4103 a->inode.mat_nonzerostate = A->nonzerostate; 4104 PetscFunctionReturn(0); 4105 } 4106 4107 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4108 { 4109 Mat B =*C; 4110 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4111 PetscInt m=A->rmap->n; 4112 4113 PetscFunctionBegin; 4114 c->inode.use = a->inode.use; 4115 c->inode.limit = a->inode.limit; 4116 c->inode.max_limit = a->inode.max_limit; 4117 c->inode.checked = PETSC_FALSE; 4118 c->inode.size = NULL; 4119 c->inode.node_count = 0; 4120 c->inode.ibdiagvalid = PETSC_FALSE; 4121 c->inode.ibdiag = NULL; 4122 c->inode.bdiag = NULL; 4123 c->inode.mat_nonzerostate = -1; 4124 if (a->inode.use) { 4125 if (a->inode.checked && a->inode.size) { 4126 PetscCall(PetscMalloc1(m+1,&c->inode.size)); 4127 PetscCall(PetscArraycpy(c->inode.size,a->inode.size,m+1)); 4128 4129 c->inode.checked = PETSC_TRUE; 4130 c->inode.node_count = a->inode.node_count; 4131 c->inode.mat_nonzerostate = (*C)->nonzerostate; 4132 } 4133 /* note the table of functions below should match that in MatSeqAIJCheckInode() */ 4134 if (!B->factortype) { 4135 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4136 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4137 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4138 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4139 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4140 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4141 } else { 4142 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4143 } 4144 } 4145 PetscFunctionReturn(0); 4146 } 4147 4148 static inline PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,const PetscInt *ai,const PetscInt *aj,const PetscInt *adiag,PetscInt row) 4149 { 4150 PetscInt k; 4151 const PetscInt *vi; 4152 4153 PetscFunctionBegin; 4154 vi = aj + ai[row]; 4155 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4156 vi = aj + adiag[row]; 4157 cols[nzl] = vi[0]; 4158 vi = aj + adiag[row+1]+1; 4159 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4160 PetscFunctionReturn(0); 4161 } 4162 /* 4163 MatSeqAIJCheckInode_FactorLU - Check Inode for factored seqaij matrix. 4164 Modified from MatSeqAIJCheckInode(). 4165 4166 Input Parameters: 4167 . Mat A - ILU or LU matrix factor 4168 4169 */ 4170 PetscErrorCode MatSeqAIJCheckInode_FactorLU(Mat A) 4171 { 4172 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4173 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4174 PetscInt *cols1,*cols2,*ns; 4175 const PetscInt *ai = a->i,*aj = a->j, *adiag = a->diag; 4176 PetscBool flag; 4177 4178 PetscFunctionBegin; 4179 if (!a->inode.use) PetscFunctionReturn(0); 4180 if (a->inode.checked) PetscFunctionReturn(0); 4181 4182 m = A->rmap->n; 4183 if (a->inode.size) ns = a->inode.size; 4184 else { 4185 PetscCall(PetscMalloc1(m+1,&ns)); 4186 } 4187 4188 i = 0; 4189 node_count = 0; 4190 PetscCall(PetscMalloc2(m,&cols1,m,&cols2)); 4191 while (i < m) { /* For each row */ 4192 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4193 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4194 nzx = nzl1 + nzu1 + 1; 4195 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4196 4197 /* Limits the number of elements in a node to 'a->inode.limit' */ 4198 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4199 nzl2 = ai[j+1] - ai[j]; 4200 nzu2 = adiag[j] - adiag[j+1] - 1; 4201 nzy = nzl2 + nzu2 + 1; 4202 if (nzy != nzx) break; 4203 PetscCall(MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j)); 4204 PetscCall(PetscArraycmp(cols1,cols2,nzx,&flag)); 4205 if (!flag) break; 4206 } 4207 ns[node_count++] = blk_size; 4208 i = j; 4209 } 4210 PetscCall(PetscFree2(cols1,cols2)); 4211 /* If not enough inodes found,, do not use inode version of the routines */ 4212 if (!m || node_count > .8*m) { 4213 PetscCall(PetscFree(ns)); 4214 4215 a->inode.node_count = 0; 4216 a->inode.size = NULL; 4217 a->inode.use = PETSC_FALSE; 4218 4219 PetscCall(PetscInfo(A,"Found %" PetscInt_FMT " nodes out of %" PetscInt_FMT " rows. Not using Inode routines\n",node_count,m)); 4220 } else { 4221 A->ops->mult = NULL; 4222 A->ops->sor = NULL; 4223 A->ops->multadd = NULL; 4224 A->ops->getrowij = NULL; 4225 A->ops->restorerowij = NULL; 4226 A->ops->getcolumnij = NULL; 4227 A->ops->restorecolumnij = NULL; 4228 A->ops->coloringpatch = NULL; 4229 A->ops->multdiagonalblock = NULL; 4230 a->inode.node_count = node_count; 4231 a->inode.size = ns; 4232 4233 PetscCall(PetscInfo(A,"Found %" PetscInt_FMT " nodes of %" PetscInt_FMT ". Limit used: %" PetscInt_FMT ". Using Inode routines\n",node_count,m,a->inode.limit)); 4234 } 4235 a->inode.checked = PETSC_TRUE; 4236 PetscFunctionReturn(0); 4237 } 4238 4239 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4240 { 4241 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4242 4243 PetscFunctionBegin; 4244 a->inode.ibdiagvalid = PETSC_FALSE; 4245 PetscFunctionReturn(0); 4246 } 4247 4248 /* 4249 This is really ugly. if inodes are used this replaces the 4250 permutations with ones that correspond to rows/cols of the matrix 4251 rather then inode blocks 4252 */ 4253 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4254 { 4255 PetscFunctionBegin; 4256 PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm)); 4257 PetscFunctionReturn(0); 4258 } 4259 4260 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4261 { 4262 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4263 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4264 const PetscInt *ridx,*cidx; 4265 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4266 PetscInt nslim_col,*ns_col; 4267 IS ris = *rperm,cis = *cperm; 4268 4269 PetscFunctionBegin; 4270 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4271 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4272 4273 PetscCall(MatCreateColInode_Private(A,&nslim_col,&ns_col)); 4274 PetscCall(PetscMalloc1(((nslim_row>nslim_col) ? nslim_row : nslim_col)+1,&tns)); 4275 PetscCall(PetscMalloc2(m,&permr,n,&permc)); 4276 4277 PetscCall(ISGetIndices(ris,&ridx)); 4278 PetscCall(ISGetIndices(cis,&cidx)); 4279 4280 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4281 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4282 4283 /* Construct the permutations for rows*/ 4284 for (i=0,row = 0; i<nslim_row; ++i) { 4285 indx = ridx[i]; 4286 start_val = tns[indx]; 4287 end_val = tns[indx + 1]; 4288 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4289 } 4290 4291 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4292 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4293 4294 /* Construct permutations for columns */ 4295 for (i=0,col=0; i<nslim_col; ++i) { 4296 indx = cidx[i]; 4297 start_val = tns[indx]; 4298 end_val = tns[indx + 1]; 4299 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4300 } 4301 4302 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm)); 4303 PetscCall(ISSetPermutation(*rperm)); 4304 PetscCall(ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm)); 4305 PetscCall(ISSetPermutation(*cperm)); 4306 4307 PetscCall(ISRestoreIndices(ris,&ridx)); 4308 PetscCall(ISRestoreIndices(cis,&cidx)); 4309 4310 PetscCall(PetscFree(ns_col)); 4311 PetscCall(PetscFree2(permr,permc)); 4312 PetscCall(ISDestroy(&cis)); 4313 PetscCall(ISDestroy(&ris)); 4314 PetscCall(PetscFree(tns)); 4315 PetscFunctionReturn(0); 4316 } 4317 4318 /*@C 4319 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4320 4321 Not Collective 4322 4323 Input Parameter: 4324 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4325 4326 Output Parameters: 4327 + node_count - no of inodes present in the matrix. 4328 . sizes - an array of size node_count,with sizes of each inode. 4329 - limit - the max size used to generate the inodes. 4330 4331 Level: advanced 4332 4333 Notes: 4334 This routine returns some internal storage information 4335 of the matrix, it is intended to be used by advanced users. 4336 It should be called after the matrix is assembled. 4337 The contents of the sizes[] array should not be changed. 4338 NULL may be passed for information not requested. 4339 4340 .seealso: `MatGetInfo()` 4341 @*/ 4342 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4343 { 4344 PetscErrorCode (*f)(Mat,PetscInt*,PetscInt**,PetscInt*); 4345 4346 PetscFunctionBegin; 4347 PetscCheck(A->assembled,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4348 PetscCall(PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f)); 4349 if (f) PetscCall((*f)(A,node_count,sizes,limit)); 4350 PetscFunctionReturn(0); 4351 } 4352 4353 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4354 { 4355 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4356 4357 PetscFunctionBegin; 4358 if (node_count) *node_count = a->inode.node_count; 4359 if (sizes) *sizes = a->inode.size; 4360 if (limit) *limit = a->inode.limit; 4361 PetscFunctionReturn(0); 4362 } 4363