1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 67 68 PetscFunctionBegin; 69 nslim_row = a->inode.node_count; 70 m = A->rmap->n; 71 n = A->cmap->n; 72 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 73 74 /* Use the row_inode as column_inode */ 75 nslim_col = nslim_row; 76 ns_col = ns_row; 77 78 /* allocate space for reformated inode structure */ 79 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 80 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 81 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 82 83 for (i1=0,col=0; i1<nslim_col; ++i1) { 84 nsz = ns_col[i1]; 85 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 86 } 87 /* allocate space for row pointers */ 88 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 89 *iia = ia; 90 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 91 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 92 93 /* determine the number of columns in each row */ 94 ia[0] = oshift; 95 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 96 97 j = aj + ai[row] + ishift; 98 jmax = aj + ai[row+1] + ishift; 99 col = *j++ + ishift; 100 i2 = tvc[col]; 101 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 102 ia[i1+1]++; 103 ia[i2+1]++; 104 i2++; /* Start col of next node */ 105 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 106 i2 = tvc[col]; 107 } 108 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 109 } 110 111 /* shift ia[i] to point to next row */ 112 for (i1=1; i1<nslim_row+1; i1++) { 113 row = ia[i1-1]; 114 ia[i1] += row; 115 work[i1-1] = row - oshift; 116 } 117 118 /* allocate space for column pointers */ 119 nz = ia[nslim_row] + (!ishift); 120 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 121 *jja = ja; 122 123 /* loop over lower triangular part putting into ja */ 124 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 125 j = aj + ai[row] + ishift; 126 jmax = aj + ai[row+1] + ishift; 127 col = *j++ + ishift; 128 i2 = tvc[col]; 129 while (i2<i1 && j<jmax) { 130 ja[work[i2]++] = i1 + oshift; 131 ja[work[i1]++] = i2 + oshift; 132 ++i2; 133 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 134 i2 = tvc[col]; 135 } 136 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 137 138 } 139 ierr = PetscFree(work);CHKERRQ(ierr); 140 ierr = PetscFree(tns);CHKERRQ(ierr); 141 ierr = PetscFree(tvc);CHKERRQ(ierr); 142 PetscFunctionReturn(0); 143 } 144 145 /* 146 This builds nonsymmetric version of nonzero structure, 147 */ 148 #undef __FUNCT__ 149 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 150 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 151 { 152 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 153 PetscErrorCode ierr; 154 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 155 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 156 157 PetscFunctionBegin; 158 nslim_row = a->inode.node_count; 159 n = A->cmap->n; 160 161 /* Create The column_inode for this matrix */ 162 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 163 164 /* allocate space for reformated column_inode structure */ 165 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 166 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 167 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 168 169 for (i1=0,col=0; i1<nslim_col; ++i1) { 170 nsz = ns_col[i1]; 171 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 172 } 173 /* allocate space for row pointers */ 174 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 175 *iia = ia; 176 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 177 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 178 179 /* determine the number of columns in each row */ 180 ia[0] = oshift; 181 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 182 j = aj + ai[row] + ishift; 183 col = *j++ + ishift; 184 i2 = tvc[col]; 185 nz = ai[row+1] - ai[row]; 186 while (nz-- > 0) { /* off-diagonal elemets */ 187 ia[i1+1]++; 188 i2++; /* Start col of next node */ 189 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 190 if (nz > 0) i2 = tvc[col]; 191 } 192 } 193 194 /* shift ia[i] to point to next row */ 195 for (i1=1; i1<nslim_row+1; i1++) { 196 row = ia[i1-1]; 197 ia[i1] += row; 198 work[i1-1] = row - oshift; 199 } 200 201 /* allocate space for column pointers */ 202 nz = ia[nslim_row] + (!ishift); 203 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 204 *jja = ja; 205 206 /* loop over matrix putting into ja */ 207 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 208 j = aj + ai[row] + ishift; 209 col = *j++ + ishift; 210 i2 = tvc[col]; 211 nz = ai[row+1] - ai[row]; 212 while (nz-- > 0) { 213 ja[work[i1]++] = i2 + oshift; 214 ++i2; 215 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 216 if (nz > 0) i2 = tvc[col]; 217 } 218 } 219 ierr = PetscFree(ns_col);CHKERRQ(ierr); 220 ierr = PetscFree(work);CHKERRQ(ierr); 221 ierr = PetscFree(tns);CHKERRQ(ierr); 222 ierr = PetscFree(tvc);CHKERRQ(ierr); 223 PetscFunctionReturn(0); 224 } 225 226 #undef __FUNCT__ 227 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 228 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 229 { 230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 231 PetscErrorCode ierr; 232 233 PetscFunctionBegin; 234 *n = a->inode.node_count; 235 if (!ia) PetscFunctionReturn(0); 236 if (!blockcompressed) { 237 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 238 } else if (symmetric) { 239 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 240 } else { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } 243 PetscFunctionReturn(0); 244 } 245 246 #undef __FUNCT__ 247 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 248 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 249 { 250 PetscErrorCode ierr; 251 252 PetscFunctionBegin; 253 if (!ia) PetscFunctionReturn(0); 254 255 if (!blockcompressed) { 256 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 257 } else { 258 ierr = PetscFree(*ia);CHKERRQ(ierr); 259 ierr = PetscFree(*ja);CHKERRQ(ierr); 260 } 261 PetscFunctionReturn(0); 262 } 263 264 /* ----------------------------------------------------------- */ 265 266 #undef __FUNCT__ 267 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 268 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 269 { 270 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 271 PetscErrorCode ierr; 272 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 273 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 274 275 PetscFunctionBegin; 276 nslim_row = a->inode.node_count; 277 n = A->cmap->n; 278 279 /* Create The column_inode for this matrix */ 280 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 281 282 /* allocate space for reformated column_inode structure */ 283 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 284 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 285 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 286 287 for (i1=0,col=0; i1<nslim_col; ++i1) { 288 nsz = ns_col[i1]; 289 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 290 } 291 /* allocate space for column pointers */ 292 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 293 *iia = ia; 294 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 295 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 296 297 /* determine the number of columns in each row */ 298 ia[0] = oshift; 299 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 300 j = aj + ai[row] + ishift; 301 col = *j++ + ishift; 302 i2 = tvc[col]; 303 nz = ai[row+1] - ai[row]; 304 while (nz-- > 0) { /* off-diagonal elemets */ 305 /* ia[i1+1]++; */ 306 ia[i2+1]++; 307 i2++; 308 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 309 if (nz > 0) i2 = tvc[col]; 310 } 311 } 312 313 /* shift ia[i] to point to next col */ 314 for (i1=1; i1<nslim_col+1; i1++) { 315 col = ia[i1-1]; 316 ia[i1] += col; 317 work[i1-1] = col - oshift; 318 } 319 320 /* allocate space for column pointers */ 321 nz = ia[nslim_col] + (!ishift); 322 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 323 *jja = ja; 324 325 /* loop over matrix putting into ja */ 326 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 327 j = aj + ai[row] + ishift; 328 col = *j++ + ishift; 329 i2 = tvc[col]; 330 nz = ai[row+1] - ai[row]; 331 while (nz-- > 0) { 332 /* ja[work[i1]++] = i2 + oshift; */ 333 ja[work[i2]++] = i1 + oshift; 334 i2++; 335 while (nz > 0 && ((col = *j++ + ishift) < tns[i2])) nz--; 336 if (nz > 0) i2 = tvc[col]; 337 } 338 } 339 ierr = PetscFree(ns_col);CHKERRQ(ierr); 340 ierr = PetscFree(work);CHKERRQ(ierr); 341 ierr = PetscFree(tns);CHKERRQ(ierr); 342 ierr = PetscFree(tvc);CHKERRQ(ierr); 343 PetscFunctionReturn(0); 344 } 345 346 #undef __FUNCT__ 347 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 348 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 349 { 350 PetscErrorCode ierr; 351 352 PetscFunctionBegin; 353 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 354 if (!ia) PetscFunctionReturn(0); 355 356 if (!blockcompressed) { 357 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 358 } else if (symmetric) { 359 /* Since the indices are symmetric it does'nt matter */ 360 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 361 } else { 362 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } 364 PetscFunctionReturn(0); 365 } 366 367 #undef __FUNCT__ 368 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 369 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 370 { 371 PetscErrorCode ierr; 372 373 PetscFunctionBegin; 374 if (!ia) PetscFunctionReturn(0); 375 if (!blockcompressed) { 376 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 377 } else { 378 ierr = PetscFree(*ia);CHKERRQ(ierr); 379 ierr = PetscFree(*ja);CHKERRQ(ierr); 380 } 381 PetscFunctionReturn(0); 382 } 383 384 /* ----------------------------------------------------------- */ 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 388 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 389 { 390 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 391 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 392 PetscScalar *y; 393 const PetscScalar *x; 394 const MatScalar *v1,*v2,*v3,*v4,*v5; 395 PetscErrorCode ierr; 396 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 397 398 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 399 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 400 #endif 401 402 PetscFunctionBegin; 403 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 404 node_max = a->inode.node_count; 405 ns = a->inode.size; /* Node Size array */ 406 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 407 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 408 idx = a->j; 409 v1 = a->a; 410 ii = a->i; 411 412 for (i = 0,row = 0; i< node_max; ++i) { 413 nsz = ns[i]; 414 n = ii[1] - ii[0]; 415 nonzerorow += (n>0)*nsz; 416 ii += nsz; 417 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 418 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 419 sz = n; /* No of non zeros in this row */ 420 /* Switch on the size of Node */ 421 switch (nsz) { /* Each loop in 'case' is unrolled */ 422 case 1: 423 sum1 = 0.; 424 425 for (n = 0; n< sz-1; n+=2) { 426 i1 = idx[0]; /* The instructions are ordered to */ 427 i2 = idx[1]; /* make the compiler's job easy */ 428 idx += 2; 429 tmp0 = x[i1]; 430 tmp1 = x[i2]; 431 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 432 } 433 434 if (n == sz-1) { /* Take care of the last nonzero */ 435 tmp0 = x[*idx++]; 436 sum1 += *v1++ *tmp0; 437 } 438 y[row++]=sum1; 439 break; 440 case 2: 441 sum1 = 0.; 442 sum2 = 0.; 443 v2 = v1 + n; 444 445 for (n = 0; n< sz-1; n+=2) { 446 i1 = idx[0]; 447 i2 = idx[1]; 448 idx += 2; 449 tmp0 = x[i1]; 450 tmp1 = x[i2]; 451 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 452 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 453 } 454 if (n == sz-1) { 455 tmp0 = x[*idx++]; 456 sum1 += *v1++ * tmp0; 457 sum2 += *v2++ * tmp0; 458 } 459 y[row++]=sum1; 460 y[row++]=sum2; 461 v1 =v2; /* Since the next block to be processed starts there*/ 462 idx +=sz; 463 break; 464 case 3: 465 sum1 = 0.; 466 sum2 = 0.; 467 sum3 = 0.; 468 v2 = v1 + n; 469 v3 = v2 + n; 470 471 for (n = 0; n< sz-1; n+=2) { 472 i1 = idx[0]; 473 i2 = idx[1]; 474 idx += 2; 475 tmp0 = x[i1]; 476 tmp1 = x[i2]; 477 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 478 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 479 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 480 } 481 if (n == sz-1) { 482 tmp0 = x[*idx++]; 483 sum1 += *v1++ * tmp0; 484 sum2 += *v2++ * tmp0; 485 sum3 += *v3++ * tmp0; 486 } 487 y[row++]=sum1; 488 y[row++]=sum2; 489 y[row++]=sum3; 490 v1 =v3; /* Since the next block to be processed starts there*/ 491 idx +=2*sz; 492 break; 493 case 4: 494 sum1 = 0.; 495 sum2 = 0.; 496 sum3 = 0.; 497 sum4 = 0.; 498 v2 = v1 + n; 499 v3 = v2 + n; 500 v4 = v3 + n; 501 502 for (n = 0; n< sz-1; n+=2) { 503 i1 = idx[0]; 504 i2 = idx[1]; 505 idx += 2; 506 tmp0 = x[i1]; 507 tmp1 = x[i2]; 508 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 509 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 510 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 511 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 512 } 513 if (n == sz-1) { 514 tmp0 = x[*idx++]; 515 sum1 += *v1++ * tmp0; 516 sum2 += *v2++ * tmp0; 517 sum3 += *v3++ * tmp0; 518 sum4 += *v4++ * tmp0; 519 } 520 y[row++]=sum1; 521 y[row++]=sum2; 522 y[row++]=sum3; 523 y[row++]=sum4; 524 v1 =v4; /* Since the next block to be processed starts there*/ 525 idx +=3*sz; 526 break; 527 case 5: 528 sum1 = 0.; 529 sum2 = 0.; 530 sum3 = 0.; 531 sum4 = 0.; 532 sum5 = 0.; 533 v2 = v1 + n; 534 v3 = v2 + n; 535 v4 = v3 + n; 536 v5 = v4 + n; 537 538 for (n = 0; n<sz-1; n+=2) { 539 i1 = idx[0]; 540 i2 = idx[1]; 541 idx += 2; 542 tmp0 = x[i1]; 543 tmp1 = x[i2]; 544 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 545 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 546 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 547 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 548 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 549 } 550 if (n == sz-1) { 551 tmp0 = x[*idx++]; 552 sum1 += *v1++ * tmp0; 553 sum2 += *v2++ * tmp0; 554 sum3 += *v3++ * tmp0; 555 sum4 += *v4++ * tmp0; 556 sum5 += *v5++ * tmp0; 557 } 558 y[row++]=sum1; 559 y[row++]=sum2; 560 y[row++]=sum3; 561 y[row++]=sum4; 562 y[row++]=sum5; 563 v1 =v5; /* Since the next block to be processed starts there */ 564 idx +=4*sz; 565 break; 566 default: 567 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 568 } 569 } 570 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 571 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 572 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 /* ----------------------------------------------------------- */ 576 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 577 #undef __FUNCT__ 578 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 579 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 580 { 581 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 582 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 583 MatScalar *v1,*v2,*v3,*v4,*v5; 584 PetscScalar *x,*y,*z,*zt; 585 PetscErrorCode ierr; 586 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 587 588 PetscFunctionBegin; 589 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 590 node_max = a->inode.node_count; 591 ns = a->inode.size; /* Node Size array */ 592 593 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 594 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 595 if (zz != yy) { 596 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 597 } else { 598 z = y; 599 } 600 zt = z; 601 602 idx = a->j; 603 v1 = a->a; 604 ii = a->i; 605 606 for (i = 0,row = 0; i< node_max; ++i) { 607 nsz = ns[i]; 608 n = ii[1] - ii[0]; 609 ii += nsz; 610 sz = n; /* No of non zeros in this row */ 611 /* Switch on the size of Node */ 612 switch (nsz) { /* Each loop in 'case' is unrolled */ 613 case 1: 614 sum1 = *zt++; 615 616 for (n = 0; n< sz-1; n+=2) { 617 i1 = idx[0]; /* The instructions are ordered to */ 618 i2 = idx[1]; /* make the compiler's job easy */ 619 idx += 2; 620 tmp0 = x[i1]; 621 tmp1 = x[i2]; 622 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 623 } 624 625 if (n == sz-1) { /* Take care of the last nonzero */ 626 tmp0 = x[*idx++]; 627 sum1 += *v1++ * tmp0; 628 } 629 y[row++]=sum1; 630 break; 631 case 2: 632 sum1 = *zt++; 633 sum2 = *zt++; 634 v2 = v1 + n; 635 636 for (n = 0; n< sz-1; n+=2) { 637 i1 = idx[0]; 638 i2 = idx[1]; 639 idx += 2; 640 tmp0 = x[i1]; 641 tmp1 = x[i2]; 642 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 643 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 644 } 645 if (n == sz-1) { 646 tmp0 = x[*idx++]; 647 sum1 += *v1++ * tmp0; 648 sum2 += *v2++ * tmp0; 649 } 650 y[row++]=sum1; 651 y[row++]=sum2; 652 v1 =v2; /* Since the next block to be processed starts there*/ 653 idx +=sz; 654 break; 655 case 3: 656 sum1 = *zt++; 657 sum2 = *zt++; 658 sum3 = *zt++; 659 v2 = v1 + n; 660 v3 = v2 + n; 661 662 for (n = 0; n< sz-1; n+=2) { 663 i1 = idx[0]; 664 i2 = idx[1]; 665 idx += 2; 666 tmp0 = x[i1]; 667 tmp1 = x[i2]; 668 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 669 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 670 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 671 } 672 if (n == sz-1) { 673 tmp0 = x[*idx++]; 674 sum1 += *v1++ * tmp0; 675 sum2 += *v2++ * tmp0; 676 sum3 += *v3++ * tmp0; 677 } 678 y[row++]=sum1; 679 y[row++]=sum2; 680 y[row++]=sum3; 681 v1 =v3; /* Since the next block to be processed starts there*/ 682 idx +=2*sz; 683 break; 684 case 4: 685 sum1 = *zt++; 686 sum2 = *zt++; 687 sum3 = *zt++; 688 sum4 = *zt++; 689 v2 = v1 + n; 690 v3 = v2 + n; 691 v4 = v3 + n; 692 693 for (n = 0; n< sz-1; n+=2) { 694 i1 = idx[0]; 695 i2 = idx[1]; 696 idx += 2; 697 tmp0 = x[i1]; 698 tmp1 = x[i2]; 699 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 700 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 701 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 702 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 703 } 704 if (n == sz-1) { 705 tmp0 = x[*idx++]; 706 sum1 += *v1++ * tmp0; 707 sum2 += *v2++ * tmp0; 708 sum3 += *v3++ * tmp0; 709 sum4 += *v4++ * tmp0; 710 } 711 y[row++]=sum1; 712 y[row++]=sum2; 713 y[row++]=sum3; 714 y[row++]=sum4; 715 v1 =v4; /* Since the next block to be processed starts there*/ 716 idx +=3*sz; 717 break; 718 case 5: 719 sum1 = *zt++; 720 sum2 = *zt++; 721 sum3 = *zt++; 722 sum4 = *zt++; 723 sum5 = *zt++; 724 v2 = v1 + n; 725 v3 = v2 + n; 726 v4 = v3 + n; 727 v5 = v4 + n; 728 729 for (n = 0; n<sz-1; n+=2) { 730 i1 = idx[0]; 731 i2 = idx[1]; 732 idx += 2; 733 tmp0 = x[i1]; 734 tmp1 = x[i2]; 735 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 736 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 737 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 738 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 739 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 740 } 741 if (n == sz-1) { 742 tmp0 = x[*idx++]; 743 sum1 += *v1++ * tmp0; 744 sum2 += *v2++ * tmp0; 745 sum3 += *v3++ * tmp0; 746 sum4 += *v4++ * tmp0; 747 sum5 += *v5++ * tmp0; 748 } 749 y[row++]=sum1; 750 y[row++]=sum2; 751 y[row++]=sum3; 752 y[row++]=sum4; 753 y[row++]=sum5; 754 v1 =v5; /* Since the next block to be processed starts there */ 755 idx +=4*sz; 756 break; 757 default: 758 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 759 } 760 } 761 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 762 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 763 if (zz != yy) { 764 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 765 } 766 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 767 PetscFunctionReturn(0); 768 } 769 770 /* ----------------------------------------------------------- */ 771 #undef __FUNCT__ 772 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 773 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 774 { 775 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 776 IS iscol = a->col,isrow = a->row; 777 PetscErrorCode ierr; 778 const PetscInt *r,*c,*rout,*cout; 779 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 780 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 781 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 782 PetscScalar sum1,sum2,sum3,sum4,sum5; 783 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 784 const PetscScalar *b; 785 786 PetscFunctionBegin; 787 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 788 node_max = a->inode.node_count; 789 ns = a->inode.size; /* Node Size array */ 790 791 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 792 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 793 tmp = a->solve_work; 794 795 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 796 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 797 798 /* forward solve the lower triangular */ 799 tmps = tmp; 800 aa = a_a; 801 aj = a_j; 802 ad = a->diag; 803 804 for (i = 0,row = 0; i< node_max; ++i) { 805 nsz = ns[i]; 806 aii = ai[row]; 807 v1 = aa + aii; 808 vi = aj + aii; 809 nz = ad[row]- aii; 810 if (i < node_max-1) { 811 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 812 * but our indexing to determine it's size could. */ 813 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 814 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 815 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 816 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 817 } 818 819 switch (nsz) { /* Each loop in 'case' is unrolled */ 820 case 1: 821 sum1 = b[*r++]; 822 for (j=0; j<nz-1; j+=2) { 823 i0 = vi[0]; 824 i1 = vi[1]; 825 vi +=2; 826 tmp0 = tmps[i0]; 827 tmp1 = tmps[i1]; 828 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 829 } 830 if (j == nz-1) { 831 tmp0 = tmps[*vi++]; 832 sum1 -= *v1++ *tmp0; 833 } 834 tmp[row++]=sum1; 835 break; 836 case 2: 837 sum1 = b[*r++]; 838 sum2 = b[*r++]; 839 v2 = aa + ai[row+1]; 840 841 for (j=0; j<nz-1; j+=2) { 842 i0 = vi[0]; 843 i1 = vi[1]; 844 vi +=2; 845 tmp0 = tmps[i0]; 846 tmp1 = tmps[i1]; 847 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 848 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 849 } 850 if (j == nz-1) { 851 tmp0 = tmps[*vi++]; 852 sum1 -= *v1++ *tmp0; 853 sum2 -= *v2++ *tmp0; 854 } 855 sum2 -= *v2++ *sum1; 856 tmp[row++]=sum1; 857 tmp[row++]=sum2; 858 break; 859 case 3: 860 sum1 = b[*r++]; 861 sum2 = b[*r++]; 862 sum3 = b[*r++]; 863 v2 = aa + ai[row+1]; 864 v3 = aa + ai[row+2]; 865 866 for (j=0; j<nz-1; j+=2) { 867 i0 = vi[0]; 868 i1 = vi[1]; 869 vi +=2; 870 tmp0 = tmps[i0]; 871 tmp1 = tmps[i1]; 872 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 873 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 874 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 875 } 876 if (j == nz-1) { 877 tmp0 = tmps[*vi++]; 878 sum1 -= *v1++ *tmp0; 879 sum2 -= *v2++ *tmp0; 880 sum3 -= *v3++ *tmp0; 881 } 882 sum2 -= *v2++ * sum1; 883 sum3 -= *v3++ * sum1; 884 sum3 -= *v3++ * sum2; 885 886 tmp[row++]=sum1; 887 tmp[row++]=sum2; 888 tmp[row++]=sum3; 889 break; 890 891 case 4: 892 sum1 = b[*r++]; 893 sum2 = b[*r++]; 894 sum3 = b[*r++]; 895 sum4 = b[*r++]; 896 v2 = aa + ai[row+1]; 897 v3 = aa + ai[row+2]; 898 v4 = aa + ai[row+3]; 899 900 for (j=0; j<nz-1; j+=2) { 901 i0 = vi[0]; 902 i1 = vi[1]; 903 vi +=2; 904 tmp0 = tmps[i0]; 905 tmp1 = tmps[i1]; 906 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 908 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 909 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 910 } 911 if (j == nz-1) { 912 tmp0 = tmps[*vi++]; 913 sum1 -= *v1++ *tmp0; 914 sum2 -= *v2++ *tmp0; 915 sum3 -= *v3++ *tmp0; 916 sum4 -= *v4++ *tmp0; 917 } 918 sum2 -= *v2++ * sum1; 919 sum3 -= *v3++ * sum1; 920 sum4 -= *v4++ * sum1; 921 sum3 -= *v3++ * sum2; 922 sum4 -= *v4++ * sum2; 923 sum4 -= *v4++ * sum3; 924 925 tmp[row++]=sum1; 926 tmp[row++]=sum2; 927 tmp[row++]=sum3; 928 tmp[row++]=sum4; 929 break; 930 case 5: 931 sum1 = b[*r++]; 932 sum2 = b[*r++]; 933 sum3 = b[*r++]; 934 sum4 = b[*r++]; 935 sum5 = b[*r++]; 936 v2 = aa + ai[row+1]; 937 v3 = aa + ai[row+2]; 938 v4 = aa + ai[row+3]; 939 v5 = aa + ai[row+4]; 940 941 for (j=0; j<nz-1; j+=2) { 942 i0 = vi[0]; 943 i1 = vi[1]; 944 vi +=2; 945 tmp0 = tmps[i0]; 946 tmp1 = tmps[i1]; 947 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 948 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 949 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 950 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 951 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 952 } 953 if (j == nz-1) { 954 tmp0 = tmps[*vi++]; 955 sum1 -= *v1++ *tmp0; 956 sum2 -= *v2++ *tmp0; 957 sum3 -= *v3++ *tmp0; 958 sum4 -= *v4++ *tmp0; 959 sum5 -= *v5++ *tmp0; 960 } 961 962 sum2 -= *v2++ * sum1; 963 sum3 -= *v3++ * sum1; 964 sum4 -= *v4++ * sum1; 965 sum5 -= *v5++ * sum1; 966 sum3 -= *v3++ * sum2; 967 sum4 -= *v4++ * sum2; 968 sum5 -= *v5++ * sum2; 969 sum4 -= *v4++ * sum3; 970 sum5 -= *v5++ * sum3; 971 sum5 -= *v5++ * sum4; 972 973 tmp[row++]=sum1; 974 tmp[row++]=sum2; 975 tmp[row++]=sum3; 976 tmp[row++]=sum4; 977 tmp[row++]=sum5; 978 break; 979 default: 980 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 981 } 982 } 983 /* backward solve the upper triangular */ 984 for (i=node_max -1,row = n-1; i>=0; i--) { 985 nsz = ns[i]; 986 aii = ai[row+1] -1; 987 v1 = aa + aii; 988 vi = aj + aii; 989 nz = aii- ad[row]; 990 switch (nsz) { /* Each loop in 'case' is unrolled */ 991 case 1: 992 sum1 = tmp[row]; 993 994 for (j=nz; j>1; j-=2) { 995 vi -=2; 996 i0 = vi[2]; 997 i1 = vi[1]; 998 tmp0 = tmps[i0]; 999 tmp1 = tmps[i1]; 1000 v1 -= 2; 1001 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1002 } 1003 if (j==1) { 1004 tmp0 = tmps[*vi--]; 1005 sum1 -= *v1-- * tmp0; 1006 } 1007 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1008 break; 1009 case 2: 1010 sum1 = tmp[row]; 1011 sum2 = tmp[row -1]; 1012 v2 = aa + ai[row]-1; 1013 for (j=nz; j>1; j-=2) { 1014 vi -=2; 1015 i0 = vi[2]; 1016 i1 = vi[1]; 1017 tmp0 = tmps[i0]; 1018 tmp1 = tmps[i1]; 1019 v1 -= 2; 1020 v2 -= 2; 1021 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1022 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1023 } 1024 if (j==1) { 1025 tmp0 = tmps[*vi--]; 1026 sum1 -= *v1-- * tmp0; 1027 sum2 -= *v2-- * tmp0; 1028 } 1029 1030 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1031 sum2 -= *v2-- * tmp0; 1032 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1033 break; 1034 case 3: 1035 sum1 = tmp[row]; 1036 sum2 = tmp[row -1]; 1037 sum3 = tmp[row -2]; 1038 v2 = aa + ai[row]-1; 1039 v3 = aa + ai[row -1]-1; 1040 for (j=nz; j>1; j-=2) { 1041 vi -=2; 1042 i0 = vi[2]; 1043 i1 = vi[1]; 1044 tmp0 = tmps[i0]; 1045 tmp1 = tmps[i1]; 1046 v1 -= 2; 1047 v2 -= 2; 1048 v3 -= 2; 1049 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1050 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1051 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1052 } 1053 if (j==1) { 1054 tmp0 = tmps[*vi--]; 1055 sum1 -= *v1-- * tmp0; 1056 sum2 -= *v2-- * tmp0; 1057 sum3 -= *v3-- * tmp0; 1058 } 1059 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1060 sum2 -= *v2-- * tmp0; 1061 sum3 -= *v3-- * tmp0; 1062 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1063 sum3 -= *v3-- * tmp0; 1064 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1065 1066 break; 1067 case 4: 1068 sum1 = tmp[row]; 1069 sum2 = tmp[row -1]; 1070 sum3 = tmp[row -2]; 1071 sum4 = tmp[row -3]; 1072 v2 = aa + ai[row]-1; 1073 v3 = aa + ai[row -1]-1; 1074 v4 = aa + ai[row -2]-1; 1075 1076 for (j=nz; j>1; j-=2) { 1077 vi -=2; 1078 i0 = vi[2]; 1079 i1 = vi[1]; 1080 tmp0 = tmps[i0]; 1081 tmp1 = tmps[i1]; 1082 v1 -= 2; 1083 v2 -= 2; 1084 v3 -= 2; 1085 v4 -= 2; 1086 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1087 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1088 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1089 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1090 } 1091 if (j==1) { 1092 tmp0 = tmps[*vi--]; 1093 sum1 -= *v1-- * tmp0; 1094 sum2 -= *v2-- * tmp0; 1095 sum3 -= *v3-- * tmp0; 1096 sum4 -= *v4-- * tmp0; 1097 } 1098 1099 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1104 sum3 -= *v3-- * tmp0; 1105 sum4 -= *v4-- * tmp0; 1106 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1107 sum4 -= *v4-- * tmp0; 1108 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1109 break; 1110 case 5: 1111 sum1 = tmp[row]; 1112 sum2 = tmp[row -1]; 1113 sum3 = tmp[row -2]; 1114 sum4 = tmp[row -3]; 1115 sum5 = tmp[row -4]; 1116 v2 = aa + ai[row]-1; 1117 v3 = aa + ai[row -1]-1; 1118 v4 = aa + ai[row -2]-1; 1119 v5 = aa + ai[row -3]-1; 1120 for (j=nz; j>1; j-=2) { 1121 vi -= 2; 1122 i0 = vi[2]; 1123 i1 = vi[1]; 1124 tmp0 = tmps[i0]; 1125 tmp1 = tmps[i1]; 1126 v1 -= 2; 1127 v2 -= 2; 1128 v3 -= 2; 1129 v4 -= 2; 1130 v5 -= 2; 1131 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1132 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1133 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1134 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1135 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1136 } 1137 if (j==1) { 1138 tmp0 = tmps[*vi--]; 1139 sum1 -= *v1-- * tmp0; 1140 sum2 -= *v2-- * tmp0; 1141 sum3 -= *v3-- * tmp0; 1142 sum4 -= *v4-- * tmp0; 1143 sum5 -= *v5-- * tmp0; 1144 } 1145 1146 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1152 sum3 -= *v3-- * tmp0; 1153 sum4 -= *v4-- * tmp0; 1154 sum5 -= *v5-- * tmp0; 1155 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1159 sum5 -= *v5-- * tmp0; 1160 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1161 break; 1162 default: 1163 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1164 } 1165 } 1166 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1167 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1168 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1171 PetscFunctionReturn(0); 1172 } 1173 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1176 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1177 { 1178 Mat C =B; 1179 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1180 IS isrow = b->row,isicol = b->icol; 1181 PetscErrorCode ierr; 1182 const PetscInt *r,*ic,*ics; 1183 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1184 PetscInt i,j,k,nz,nzL,row,*pj; 1185 const PetscInt *ajtmp,*bjtmp; 1186 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1187 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1188 FactorShiftCtx sctx; 1189 const PetscInt *ddiag; 1190 PetscReal rs; 1191 MatScalar d; 1192 PetscInt inod,nodesz,node_max,col; 1193 const PetscInt *ns; 1194 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1195 1196 PetscFunctionBegin; 1197 /* MatPivotSetUp(): initialize shift context sctx */ 1198 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1199 1200 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1201 ddiag = a->diag; 1202 sctx.shift_top = info->zeropivot; 1203 for (i=0; i<n; i++) { 1204 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1205 d = (aa)[ddiag[i]]; 1206 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1207 v = aa+ai[i]; 1208 nz = ai[i+1] - ai[i]; 1209 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1210 if (rs>sctx.shift_top) sctx.shift_top = rs; 1211 } 1212 sctx.shift_top *= 1.1; 1213 sctx.nshift_max = 5; 1214 sctx.shift_lo = 0.; 1215 sctx.shift_hi = 1.; 1216 } 1217 1218 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1219 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1220 1221 ierr = PetscMalloc4(n,PetscScalar,&rtmp1,n,PetscScalar,&rtmp2,n,PetscScalar,&rtmp3,n,PetscScalar,&rtmp4);CHKERRQ(ierr); 1222 ierr = PetscMemzero(rtmp1,n*sizeof(PetscScalar));CHKERRQ(ierr); 1223 ierr = PetscMemzero(rtmp2,n*sizeof(PetscScalar));CHKERRQ(ierr); 1224 ierr = PetscMemzero(rtmp3,n*sizeof(PetscScalar));CHKERRQ(ierr); 1225 ierr = PetscMemzero(rtmp4,n*sizeof(PetscScalar));CHKERRQ(ierr); 1226 ics = ic; 1227 1228 node_max = a->inode.node_count; 1229 ns = a->inode.size; 1230 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1231 1232 /* If max inode size > 4, split it into two inodes.*/ 1233 /* also map the inode sizes according to the ordering */ 1234 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1235 for (i=0,j=0; i<node_max; ++i,++j) { 1236 if (ns[i] > 4) { 1237 tmp_vec1[j] = 4; 1238 ++j; 1239 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1240 } else { 1241 tmp_vec1[j] = ns[i]; 1242 } 1243 } 1244 /* Use the correct node_max */ 1245 node_max = j; 1246 1247 /* Now reorder the inode info based on mat re-ordering info */ 1248 /* First create a row -> inode_size_array_index map */ 1249 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1250 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1251 for (i=0,row=0; i<node_max; i++) { 1252 nodesz = tmp_vec1[i]; 1253 for (j=0; j<nodesz; j++,row++) { 1254 nsmap[row] = i; 1255 } 1256 } 1257 /* Using nsmap, create a reordered ns structure */ 1258 for (i=0,j=0; i< node_max; i++) { 1259 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1260 tmp_vec2[i] = nodesz; 1261 j += nodesz; 1262 } 1263 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1264 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1265 1266 /* Now use the correct ns */ 1267 ns = tmp_vec2; 1268 1269 do { 1270 sctx.newshift = PETSC_FALSE; 1271 /* Now loop over each block-row, and do the factorization */ 1272 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1273 nodesz = ns[inod]; 1274 1275 switch (nodesz) { 1276 case 1: 1277 /*----------*/ 1278 /* zero rtmp1 */ 1279 /* L part */ 1280 nz = bi[i+1] - bi[i]; 1281 bjtmp = bj + bi[i]; 1282 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1283 1284 /* U part */ 1285 nz = bdiag[i]-bdiag[i+1]; 1286 bjtmp = bj + bdiag[i+1]+1; 1287 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1288 1289 /* load in initial (unfactored row) */ 1290 nz = ai[r[i]+1] - ai[r[i]]; 1291 ajtmp = aj + ai[r[i]]; 1292 v = aa + ai[r[i]]; 1293 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1294 1295 /* ZeropivotApply() */ 1296 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1297 1298 /* elimination */ 1299 bjtmp = bj + bi[i]; 1300 row = *bjtmp++; 1301 nzL = bi[i+1] - bi[i]; 1302 for (k=0; k < nzL; k++) { 1303 pc = rtmp1 + row; 1304 if (*pc != 0.0) { 1305 pv = b->a + bdiag[row]; 1306 mul1 = *pc * (*pv); 1307 *pc = mul1; 1308 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1309 pv = b->a + bdiag[row+1]+1; 1310 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1311 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1312 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1313 } 1314 row = *bjtmp++; 1315 } 1316 1317 /* finished row so stick it into b->a */ 1318 rs = 0.0; 1319 /* L part */ 1320 pv = b->a + bi[i]; 1321 pj = b->j + bi[i]; 1322 nz = bi[i+1] - bi[i]; 1323 for (j=0; j<nz; j++) { 1324 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1325 } 1326 1327 /* U part */ 1328 pv = b->a + bdiag[i+1]+1; 1329 pj = b->j + bdiag[i+1]+1; 1330 nz = bdiag[i] - bdiag[i+1]-1; 1331 for (j=0; j<nz; j++) { 1332 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1333 } 1334 1335 /* Check zero pivot */ 1336 sctx.rs = rs; 1337 sctx.pv = rtmp1[i]; 1338 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1339 if (sctx.newshift) break; 1340 1341 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1342 pv = b->a + bdiag[i]; 1343 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1344 break; 1345 1346 case 2: 1347 /*----------*/ 1348 /* zero rtmp1 and rtmp2 */ 1349 /* L part */ 1350 nz = bi[i+1] - bi[i]; 1351 bjtmp = bj + bi[i]; 1352 for (j=0; j<nz; j++) { 1353 col = bjtmp[j]; 1354 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1355 } 1356 1357 /* U part */ 1358 nz = bdiag[i]-bdiag[i+1]; 1359 bjtmp = bj + bdiag[i+1]+1; 1360 for (j=0; j<nz; j++) { 1361 col = bjtmp[j]; 1362 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1363 } 1364 1365 /* load in initial (unfactored row) */ 1366 nz = ai[r[i]+1] - ai[r[i]]; 1367 ajtmp = aj + ai[r[i]]; 1368 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1369 for (j=0; j<nz; j++) { 1370 col = ics[ajtmp[j]]; 1371 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1372 } 1373 /* ZeropivotApply(): shift the diagonal of the matrix */ 1374 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1375 1376 /* elimination */ 1377 bjtmp = bj + bi[i]; 1378 row = *bjtmp++; /* pivot row */ 1379 nzL = bi[i+1] - bi[i]; 1380 for (k=0; k < nzL; k++) { 1381 pc1 = rtmp1 + row; 1382 pc2 = rtmp2 + row; 1383 if (*pc1 != 0.0 || *pc2 != 0.0) { 1384 pv = b->a + bdiag[row]; 1385 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1386 *pc1 = mul1; *pc2 = mul2; 1387 1388 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1389 pv = b->a + bdiag[row+1]+1; 1390 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1391 for (j=0; j<nz; j++) { 1392 col = pj[j]; 1393 rtmp1[col] -= mul1 * pv[j]; 1394 rtmp2[col] -= mul2 * pv[j]; 1395 } 1396 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1397 } 1398 row = *bjtmp++; 1399 } 1400 1401 /* finished row i; check zero pivot, then stick row i into b->a */ 1402 rs = 0.0; 1403 /* L part */ 1404 pc1 = b->a + bi[i]; 1405 pj = b->j + bi[i]; 1406 nz = bi[i+1] - bi[i]; 1407 for (j=0; j<nz; j++) { 1408 col = pj[j]; 1409 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1410 } 1411 /* U part */ 1412 pc1 = b->a + bdiag[i+1]+1; 1413 pj = b->j + bdiag[i+1]+1; 1414 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1415 for (j=0; j<nz; j++) { 1416 col = pj[j]; 1417 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1418 } 1419 1420 sctx.rs = rs; 1421 sctx.pv = rtmp1[i]; 1422 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1423 if (sctx.newshift) break; 1424 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1425 *pc1 = 1.0/sctx.pv; 1426 1427 /* Now take care of diagonal 2x2 block. */ 1428 pc2 = rtmp2 + i; 1429 if (*pc2 != 0.0) { 1430 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1431 *pc2 = mul1; /* insert L entry */ 1432 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1433 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1434 for (j=0; j<nz; j++) { 1435 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1436 } 1437 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1438 } 1439 1440 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1441 rs = 0.0; 1442 /* L part */ 1443 pc2 = b->a + bi[i+1]; 1444 pj = b->j + bi[i+1]; 1445 nz = bi[i+2] - bi[i+1]; 1446 for (j=0; j<nz; j++) { 1447 col = pj[j]; 1448 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1449 } 1450 /* U part */ 1451 pc2 = b->a + bdiag[i+2]+1; 1452 pj = b->j + bdiag[i+2]+1; 1453 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1454 for (j=0; j<nz; j++) { 1455 col = pj[j]; 1456 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1457 } 1458 1459 sctx.rs = rs; 1460 sctx.pv = rtmp2[i+1]; 1461 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1462 if (sctx.newshift) break; 1463 pc2 = b->a + bdiag[i+1]; 1464 *pc2 = 1.0/sctx.pv; 1465 break; 1466 1467 case 3: 1468 /*----------*/ 1469 /* zero rtmp */ 1470 /* L part */ 1471 nz = bi[i+1] - bi[i]; 1472 bjtmp = bj + bi[i]; 1473 for (j=0; j<nz; j++) { 1474 col = bjtmp[j]; 1475 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1476 } 1477 1478 /* U part */ 1479 nz = bdiag[i]-bdiag[i+1]; 1480 bjtmp = bj + bdiag[i+1]+1; 1481 for (j=0; j<nz; j++) { 1482 col = bjtmp[j]; 1483 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1484 } 1485 1486 /* load in initial (unfactored row) */ 1487 nz = ai[r[i]+1] - ai[r[i]]; 1488 ajtmp = aj + ai[r[i]]; 1489 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1490 for (j=0; j<nz; j++) { 1491 col = ics[ajtmp[j]]; 1492 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1493 } 1494 /* ZeropivotApply(): shift the diagonal of the matrix */ 1495 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1496 1497 /* elimination */ 1498 bjtmp = bj + bi[i]; 1499 row = *bjtmp++; /* pivot row */ 1500 nzL = bi[i+1] - bi[i]; 1501 for (k=0; k < nzL; k++) { 1502 pc1 = rtmp1 + row; 1503 pc2 = rtmp2 + row; 1504 pc3 = rtmp3 + row; 1505 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1506 pv = b->a + bdiag[row]; 1507 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1508 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1509 1510 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1511 pv = b->a + bdiag[row+1]+1; 1512 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1513 for (j=0; j<nz; j++) { 1514 col = pj[j]; 1515 rtmp1[col] -= mul1 * pv[j]; 1516 rtmp2[col] -= mul2 * pv[j]; 1517 rtmp3[col] -= mul3 * pv[j]; 1518 } 1519 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1520 } 1521 row = *bjtmp++; 1522 } 1523 1524 /* finished row i; check zero pivot, then stick row i into b->a */ 1525 rs = 0.0; 1526 /* L part */ 1527 pc1 = b->a + bi[i]; 1528 pj = b->j + bi[i]; 1529 nz = bi[i+1] - bi[i]; 1530 for (j=0; j<nz; j++) { 1531 col = pj[j]; 1532 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1533 } 1534 /* U part */ 1535 pc1 = b->a + bdiag[i+1]+1; 1536 pj = b->j + bdiag[i+1]+1; 1537 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1538 for (j=0; j<nz; j++) { 1539 col = pj[j]; 1540 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1541 } 1542 1543 sctx.rs = rs; 1544 sctx.pv = rtmp1[i]; 1545 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1546 if (sctx.newshift) break; 1547 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1548 *pc1 = 1.0/sctx.pv; 1549 1550 /* Now take care of 1st column of diagonal 3x3 block. */ 1551 pc2 = rtmp2 + i; 1552 pc3 = rtmp3 + i; 1553 if (*pc2 != 0.0 || *pc3 != 0.0) { 1554 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1555 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1556 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1557 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1558 for (j=0; j<nz; j++) { 1559 col = pj[j]; 1560 rtmp2[col] -= mul2 * rtmp1[col]; 1561 rtmp3[col] -= mul3 * rtmp1[col]; 1562 } 1563 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1564 } 1565 1566 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1567 rs = 0.0; 1568 /* L part */ 1569 pc2 = b->a + bi[i+1]; 1570 pj = b->j + bi[i+1]; 1571 nz = bi[i+2] - bi[i+1]; 1572 for (j=0; j<nz; j++) { 1573 col = pj[j]; 1574 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1575 } 1576 /* U part */ 1577 pc2 = b->a + bdiag[i+2]+1; 1578 pj = b->j + bdiag[i+2]+1; 1579 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1580 for (j=0; j<nz; j++) { 1581 col = pj[j]; 1582 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1583 } 1584 1585 sctx.rs = rs; 1586 sctx.pv = rtmp2[i+1]; 1587 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1588 if (sctx.newshift) break; 1589 pc2 = b->a + bdiag[i+1]; 1590 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1591 1592 /* Now take care of 2nd column of diagonal 3x3 block. */ 1593 pc3 = rtmp3 + i+1; 1594 if (*pc3 != 0.0) { 1595 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1596 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1597 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1598 for (j=0; j<nz; j++) { 1599 col = pj[j]; 1600 rtmp3[col] -= mul3 * rtmp2[col]; 1601 } 1602 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1603 } 1604 1605 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1606 rs = 0.0; 1607 /* L part */ 1608 pc3 = b->a + bi[i+2]; 1609 pj = b->j + bi[i+2]; 1610 nz = bi[i+3] - bi[i+2]; 1611 for (j=0; j<nz; j++) { 1612 col = pj[j]; 1613 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1614 } 1615 /* U part */ 1616 pc3 = b->a + bdiag[i+3]+1; 1617 pj = b->j + bdiag[i+3]+1; 1618 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1619 for (j=0; j<nz; j++) { 1620 col = pj[j]; 1621 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1622 } 1623 1624 sctx.rs = rs; 1625 sctx.pv = rtmp3[i+2]; 1626 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1627 if (sctx.newshift) break; 1628 pc3 = b->a + bdiag[i+2]; 1629 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1630 break; 1631 case 4: 1632 /*----------*/ 1633 /* zero rtmp */ 1634 /* L part */ 1635 nz = bi[i+1] - bi[i]; 1636 bjtmp = bj + bi[i]; 1637 for (j=0; j<nz; j++) { 1638 col = bjtmp[j]; 1639 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1640 } 1641 1642 /* U part */ 1643 nz = bdiag[i]-bdiag[i+1]; 1644 bjtmp = bj + bdiag[i+1]+1; 1645 for (j=0; j<nz; j++) { 1646 col = bjtmp[j]; 1647 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1648 } 1649 1650 /* load in initial (unfactored row) */ 1651 nz = ai[r[i]+1] - ai[r[i]]; 1652 ajtmp = aj + ai[r[i]]; 1653 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1654 for (j=0; j<nz; j++) { 1655 col = ics[ajtmp[j]]; 1656 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1657 } 1658 /* ZeropivotApply(): shift the diagonal of the matrix */ 1659 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1660 1661 /* elimination */ 1662 bjtmp = bj + bi[i]; 1663 row = *bjtmp++; /* pivot row */ 1664 nzL = bi[i+1] - bi[i]; 1665 for (k=0; k < nzL; k++) { 1666 pc1 = rtmp1 + row; 1667 pc2 = rtmp2 + row; 1668 pc3 = rtmp3 + row; 1669 pc4 = rtmp4 + row; 1670 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1671 pv = b->a + bdiag[row]; 1672 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1673 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1674 1675 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1676 pv = b->a + bdiag[row+1]+1; 1677 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1678 for (j=0; j<nz; j++) { 1679 col = pj[j]; 1680 rtmp1[col] -= mul1 * pv[j]; 1681 rtmp2[col] -= mul2 * pv[j]; 1682 rtmp3[col] -= mul3 * pv[j]; 1683 rtmp4[col] -= mul4 * pv[j]; 1684 } 1685 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1686 } 1687 row = *bjtmp++; 1688 } 1689 1690 /* finished row i; check zero pivot, then stick row i into b->a */ 1691 rs = 0.0; 1692 /* L part */ 1693 pc1 = b->a + bi[i]; 1694 pj = b->j + bi[i]; 1695 nz = bi[i+1] - bi[i]; 1696 for (j=0; j<nz; j++) { 1697 col = pj[j]; 1698 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1699 } 1700 /* U part */ 1701 pc1 = b->a + bdiag[i+1]+1; 1702 pj = b->j + bdiag[i+1]+1; 1703 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1704 for (j=0; j<nz; j++) { 1705 col = pj[j]; 1706 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1707 } 1708 1709 sctx.rs = rs; 1710 sctx.pv = rtmp1[i]; 1711 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1712 if (sctx.newshift) break; 1713 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1714 *pc1 = 1.0/sctx.pv; 1715 1716 /* Now take care of 1st column of diagonal 4x4 block. */ 1717 pc2 = rtmp2 + i; 1718 pc3 = rtmp3 + i; 1719 pc4 = rtmp4 + i; 1720 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1721 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1722 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1723 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1724 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1725 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1726 for (j=0; j<nz; j++) { 1727 col = pj[j]; 1728 rtmp2[col] -= mul2 * rtmp1[col]; 1729 rtmp3[col] -= mul3 * rtmp1[col]; 1730 rtmp4[col] -= mul4 * rtmp1[col]; 1731 } 1732 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1733 } 1734 1735 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1736 rs = 0.0; 1737 /* L part */ 1738 pc2 = b->a + bi[i+1]; 1739 pj = b->j + bi[i+1]; 1740 nz = bi[i+2] - bi[i+1]; 1741 for (j=0; j<nz; j++) { 1742 col = pj[j]; 1743 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1744 } 1745 /* U part */ 1746 pc2 = b->a + bdiag[i+2]+1; 1747 pj = b->j + bdiag[i+2]+1; 1748 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1749 for (j=0; j<nz; j++) { 1750 col = pj[j]; 1751 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1752 } 1753 1754 sctx.rs = rs; 1755 sctx.pv = rtmp2[i+1]; 1756 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1757 if (sctx.newshift) break; 1758 pc2 = b->a + bdiag[i+1]; 1759 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1760 1761 /* Now take care of 2nd column of diagonal 4x4 block. */ 1762 pc3 = rtmp3 + i+1; 1763 pc4 = rtmp4 + i+1; 1764 if (*pc3 != 0.0 || *pc4 != 0.0) { 1765 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1766 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1767 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1768 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1769 for (j=0; j<nz; j++) { 1770 col = pj[j]; 1771 rtmp3[col] -= mul3 * rtmp2[col]; 1772 rtmp4[col] -= mul4 * rtmp2[col]; 1773 } 1774 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1775 } 1776 1777 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1778 rs = 0.0; 1779 /* L part */ 1780 pc3 = b->a + bi[i+2]; 1781 pj = b->j + bi[i+2]; 1782 nz = bi[i+3] - bi[i+2]; 1783 for (j=0; j<nz; j++) { 1784 col = pj[j]; 1785 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1786 } 1787 /* U part */ 1788 pc3 = b->a + bdiag[i+3]+1; 1789 pj = b->j + bdiag[i+3]+1; 1790 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1791 for (j=0; j<nz; j++) { 1792 col = pj[j]; 1793 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1794 } 1795 1796 sctx.rs = rs; 1797 sctx.pv = rtmp3[i+2]; 1798 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1799 if (sctx.newshift) break; 1800 pc3 = b->a + bdiag[i+2]; 1801 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1802 1803 /* Now take care of 3rd column of diagonal 4x4 block. */ 1804 pc4 = rtmp4 + i+2; 1805 if (*pc4 != 0.0) { 1806 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1807 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1808 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1809 for (j=0; j<nz; j++) { 1810 col = pj[j]; 1811 rtmp4[col] -= mul4 * rtmp3[col]; 1812 } 1813 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1814 } 1815 1816 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1817 rs = 0.0; 1818 /* L part */ 1819 pc4 = b->a + bi[i+3]; 1820 pj = b->j + bi[i+3]; 1821 nz = bi[i+4] - bi[i+3]; 1822 for (j=0; j<nz; j++) { 1823 col = pj[j]; 1824 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1825 } 1826 /* U part */ 1827 pc4 = b->a + bdiag[i+4]+1; 1828 pj = b->j + bdiag[i+4]+1; 1829 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1830 for (j=0; j<nz; j++) { 1831 col = pj[j]; 1832 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1833 } 1834 1835 sctx.rs = rs; 1836 sctx.pv = rtmp4[i+3]; 1837 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1838 if (sctx.newshift) break; 1839 pc4 = b->a + bdiag[i+3]; 1840 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1841 break; 1842 1843 default: 1844 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1845 } 1846 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1847 i += nodesz; /* Update the row */ 1848 } 1849 1850 /* MatPivotRefine() */ 1851 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1852 /* 1853 * if no shift in this attempt & shifting & started shifting & can refine, 1854 * then try lower shift 1855 */ 1856 sctx.shift_hi = sctx.shift_fraction; 1857 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1858 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1859 sctx.newshift = PETSC_TRUE; 1860 sctx.nshift++; 1861 } 1862 } while (sctx.newshift); 1863 1864 ierr = PetscFree4(rtmp1,rtmp2,rtmp3,rtmp4);CHKERRQ(ierr); 1865 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1866 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1867 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1868 1869 C->ops->solve = MatSolve_SeqAIJ; 1870 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1871 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1872 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1873 C->ops->matsolve = MatMatSolve_SeqAIJ; 1874 C->assembled = PETSC_TRUE; 1875 C->preallocated = PETSC_TRUE; 1876 1877 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1878 1879 /* MatShiftView(A,info,&sctx) */ 1880 if (sctx.nshift) { 1881 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1882 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1883 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1884 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1885 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1886 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1887 } 1888 } 1889 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1890 PetscFunctionReturn(0); 1891 } 1892 1893 #undef __FUNCT__ 1894 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1895 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1896 { 1897 Mat C = B; 1898 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1899 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1900 PetscErrorCode ierr; 1901 const PetscInt *r,*ic,*c,*ics; 1902 PetscInt n = A->rmap->n,*bi = b->i; 1903 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1904 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1905 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1906 PetscScalar mul1,mul2,mul3,tmp; 1907 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1908 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1909 PetscReal rs=0.0; 1910 FactorShiftCtx sctx; 1911 1912 PetscFunctionBegin; 1913 sctx.shift_top = 0; 1914 sctx.nshift_max = 0; 1915 sctx.shift_lo = 0; 1916 sctx.shift_hi = 0; 1917 sctx.shift_fraction = 0; 1918 1919 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1920 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1921 sctx.shift_top = 0; 1922 for (i=0; i<n; i++) { 1923 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1924 rs = 0.0; 1925 ajtmp = aj + ai[i]; 1926 rtmp1 = aa + ai[i]; 1927 nz = ai[i+1] - ai[i]; 1928 for (j=0; j<nz; j++) { 1929 if (*ajtmp != i) { 1930 rs += PetscAbsScalar(*rtmp1++); 1931 } else { 1932 rs -= PetscRealPart(*rtmp1++); 1933 } 1934 ajtmp++; 1935 } 1936 if (rs>sctx.shift_top) sctx.shift_top = rs; 1937 } 1938 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1939 sctx.shift_top *= 1.1; 1940 sctx.nshift_max = 5; 1941 sctx.shift_lo = 0.; 1942 sctx.shift_hi = 1.; 1943 } 1944 sctx.shift_amount = 0; 1945 sctx.nshift = 0; 1946 1947 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1948 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1949 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1950 ierr = PetscMalloc3(n,PetscScalar,&rtmp11,n,PetscScalar,&rtmp22,n,PetscScalar,&rtmp33);CHKERRQ(ierr); 1951 ierr = PetscMemzero(rtmp11,n*sizeof(PetscScalar));CHKERRQ(ierr); 1952 ierr = PetscMemzero(rtmp22,n*sizeof(PetscScalar));CHKERRQ(ierr); 1953 ierr = PetscMemzero(rtmp33,n*sizeof(PetscScalar));CHKERRQ(ierr); 1954 ics = ic; 1955 1956 node_max = a->inode.node_count; 1957 ns = a->inode.size; 1958 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1959 1960 /* If max inode size > 3, split it into two inodes.*/ 1961 /* also map the inode sizes according to the ordering */ 1962 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1963 for (i=0,j=0; i<node_max; ++i,++j) { 1964 if (ns[i]>3) { 1965 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1966 ++j; 1967 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1968 } else { 1969 tmp_vec1[j] = ns[i]; 1970 } 1971 } 1972 /* Use the correct node_max */ 1973 node_max = j; 1974 1975 /* Now reorder the inode info based on mat re-ordering info */ 1976 /* First create a row -> inode_size_array_index map */ 1977 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1978 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1979 for (i=0,row=0; i<node_max; i++) { 1980 nodesz = tmp_vec1[i]; 1981 for (j=0; j<nodesz; j++,row++) { 1982 nsmap[row] = i; 1983 } 1984 } 1985 /* Using nsmap, create a reordered ns structure */ 1986 for (i=0,j=0; i< node_max; i++) { 1987 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1988 tmp_vec2[i] = nodesz; 1989 j += nodesz; 1990 } 1991 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1992 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1993 /* Now use the correct ns */ 1994 ns = tmp_vec2; 1995 1996 do { 1997 sctx.newshift = PETSC_FALSE; 1998 /* Now loop over each block-row, and do the factorization */ 1999 for (i=0,row=0; i<node_max; i++) { 2000 nodesz = ns[i]; 2001 nz = bi[row+1] - bi[row]; 2002 bjtmp = bj + bi[row]; 2003 2004 switch (nodesz) { 2005 case 1: 2006 for (j=0; j<nz; j++) { 2007 idx = bjtmp[j]; 2008 rtmp11[idx] = 0.0; 2009 } 2010 2011 /* load in initial (unfactored row) */ 2012 idx = r[row]; 2013 nz_tmp = ai[idx+1] - ai[idx]; 2014 ajtmp = aj + ai[idx]; 2015 v1 = aa + ai[idx]; 2016 2017 for (j=0; j<nz_tmp; j++) { 2018 idx = ics[ajtmp[j]]; 2019 rtmp11[idx] = v1[j]; 2020 } 2021 rtmp11[ics[r[row]]] += sctx.shift_amount; 2022 2023 prow = *bjtmp++; 2024 while (prow < row) { 2025 pc1 = rtmp11 + prow; 2026 if (*pc1 != 0.0) { 2027 pv = ba + bd[prow]; 2028 pj = nbj + bd[prow]; 2029 mul1 = *pc1 * *pv++; 2030 *pc1 = mul1; 2031 nz_tmp = bi[prow+1] - bd[prow] - 1; 2032 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2033 for (j=0; j<nz_tmp; j++) { 2034 tmp = pv[j]; 2035 idx = pj[j]; 2036 rtmp11[idx] -= mul1 * tmp; 2037 } 2038 } 2039 prow = *bjtmp++; 2040 } 2041 pj = bj + bi[row]; 2042 pc1 = ba + bi[row]; 2043 2044 sctx.pv = rtmp11[row]; 2045 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2046 rs = 0.0; 2047 for (j=0; j<nz; j++) { 2048 idx = pj[j]; 2049 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2050 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2051 } 2052 sctx.rs = rs; 2053 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2054 if (sctx.newshift) goto endofwhile; 2055 break; 2056 2057 case 2: 2058 for (j=0; j<nz; j++) { 2059 idx = bjtmp[j]; 2060 rtmp11[idx] = 0.0; 2061 rtmp22[idx] = 0.0; 2062 } 2063 2064 /* load in initial (unfactored row) */ 2065 idx = r[row]; 2066 nz_tmp = ai[idx+1] - ai[idx]; 2067 ajtmp = aj + ai[idx]; 2068 v1 = aa + ai[idx]; 2069 v2 = aa + ai[idx+1]; 2070 for (j=0; j<nz_tmp; j++) { 2071 idx = ics[ajtmp[j]]; 2072 rtmp11[idx] = v1[j]; 2073 rtmp22[idx] = v2[j]; 2074 } 2075 rtmp11[ics[r[row]]] += sctx.shift_amount; 2076 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2077 2078 prow = *bjtmp++; 2079 while (prow < row) { 2080 pc1 = rtmp11 + prow; 2081 pc2 = rtmp22 + prow; 2082 if (*pc1 != 0.0 || *pc2 != 0.0) { 2083 pv = ba + bd[prow]; 2084 pj = nbj + bd[prow]; 2085 mul1 = *pc1 * *pv; 2086 mul2 = *pc2 * *pv; 2087 ++pv; 2088 *pc1 = mul1; 2089 *pc2 = mul2; 2090 2091 nz_tmp = bi[prow+1] - bd[prow] - 1; 2092 for (j=0; j<nz_tmp; j++) { 2093 tmp = pv[j]; 2094 idx = pj[j]; 2095 rtmp11[idx] -= mul1 * tmp; 2096 rtmp22[idx] -= mul2 * tmp; 2097 } 2098 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2099 } 2100 prow = *bjtmp++; 2101 } 2102 2103 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2104 pc1 = rtmp11 + prow; 2105 pc2 = rtmp22 + prow; 2106 2107 sctx.pv = *pc1; 2108 pj = bj + bi[prow]; 2109 rs = 0.0; 2110 for (j=0; j<nz; j++) { 2111 idx = pj[j]; 2112 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2113 } 2114 sctx.rs = rs; 2115 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2116 if (sctx.newshift) goto endofwhile; 2117 2118 if (*pc2 != 0.0) { 2119 pj = nbj + bd[prow]; 2120 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2121 *pc2 = mul2; 2122 nz_tmp = bi[prow+1] - bd[prow] - 1; 2123 for (j=0; j<nz_tmp; j++) { 2124 idx = pj[j]; 2125 tmp = rtmp11[idx]; 2126 rtmp22[idx] -= mul2 * tmp; 2127 } 2128 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2129 } 2130 2131 pj = bj + bi[row]; 2132 pc1 = ba + bi[row]; 2133 pc2 = ba + bi[row+1]; 2134 2135 sctx.pv = rtmp22[row+1]; 2136 rs = 0.0; 2137 rtmp11[row] = 1.0/rtmp11[row]; 2138 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2139 /* copy row entries from dense representation to sparse */ 2140 for (j=0; j<nz; j++) { 2141 idx = pj[j]; 2142 pc1[j] = rtmp11[idx]; 2143 pc2[j] = rtmp22[idx]; 2144 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2145 } 2146 sctx.rs = rs; 2147 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2148 if (sctx.newshift) goto endofwhile; 2149 break; 2150 2151 case 3: 2152 for (j=0; j<nz; j++) { 2153 idx = bjtmp[j]; 2154 rtmp11[idx] = 0.0; 2155 rtmp22[idx] = 0.0; 2156 rtmp33[idx] = 0.0; 2157 } 2158 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2159 idx = r[row]; 2160 nz_tmp = ai[idx+1] - ai[idx]; 2161 ajtmp = aj + ai[idx]; 2162 v1 = aa + ai[idx]; 2163 v2 = aa + ai[idx+1]; 2164 v3 = aa + ai[idx+2]; 2165 for (j=0; j<nz_tmp; j++) { 2166 idx = ics[ajtmp[j]]; 2167 rtmp11[idx] = v1[j]; 2168 rtmp22[idx] = v2[j]; 2169 rtmp33[idx] = v3[j]; 2170 } 2171 rtmp11[ics[r[row]]] += sctx.shift_amount; 2172 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2173 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2174 2175 /* loop over all pivot row blocks above this row block */ 2176 prow = *bjtmp++; 2177 while (prow < row) { 2178 pc1 = rtmp11 + prow; 2179 pc2 = rtmp22 + prow; 2180 pc3 = rtmp33 + prow; 2181 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2182 pv = ba + bd[prow]; 2183 pj = nbj + bd[prow]; 2184 mul1 = *pc1 * *pv; 2185 mul2 = *pc2 * *pv; 2186 mul3 = *pc3 * *pv; 2187 ++pv; 2188 *pc1 = mul1; 2189 *pc2 = mul2; 2190 *pc3 = mul3; 2191 2192 nz_tmp = bi[prow+1] - bd[prow] - 1; 2193 /* update this row based on pivot row */ 2194 for (j=0; j<nz_tmp; j++) { 2195 tmp = pv[j]; 2196 idx = pj[j]; 2197 rtmp11[idx] -= mul1 * tmp; 2198 rtmp22[idx] -= mul2 * tmp; 2199 rtmp33[idx] -= mul3 * tmp; 2200 } 2201 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2202 } 2203 prow = *bjtmp++; 2204 } 2205 2206 /* Now take care of diagonal 3x3 block in this set of rows */ 2207 /* note: prow = row here */ 2208 pc1 = rtmp11 + prow; 2209 pc2 = rtmp22 + prow; 2210 pc3 = rtmp33 + prow; 2211 2212 sctx.pv = *pc1; 2213 pj = bj + bi[prow]; 2214 rs = 0.0; 2215 for (j=0; j<nz; j++) { 2216 idx = pj[j]; 2217 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2218 } 2219 sctx.rs = rs; 2220 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2221 if (sctx.newshift) goto endofwhile; 2222 2223 if (*pc2 != 0.0 || *pc3 != 0.0) { 2224 mul2 = (*pc2)/(*pc1); 2225 mul3 = (*pc3)/(*pc1); 2226 *pc2 = mul2; 2227 *pc3 = mul3; 2228 nz_tmp = bi[prow+1] - bd[prow] - 1; 2229 pj = nbj + bd[prow]; 2230 for (j=0; j<nz_tmp; j++) { 2231 idx = pj[j]; 2232 tmp = rtmp11[idx]; 2233 rtmp22[idx] -= mul2 * tmp; 2234 rtmp33[idx] -= mul3 * tmp; 2235 } 2236 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2237 } 2238 ++prow; 2239 2240 pc2 = rtmp22 + prow; 2241 pc3 = rtmp33 + prow; 2242 sctx.pv = *pc2; 2243 pj = bj + bi[prow]; 2244 rs = 0.0; 2245 for (j=0; j<nz; j++) { 2246 idx = pj[j]; 2247 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2248 } 2249 sctx.rs = rs; 2250 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2251 if (sctx.newshift) goto endofwhile; 2252 2253 if (*pc3 != 0.0) { 2254 mul3 = (*pc3)/(*pc2); 2255 *pc3 = mul3; 2256 pj = nbj + bd[prow]; 2257 nz_tmp = bi[prow+1] - bd[prow] - 1; 2258 for (j=0; j<nz_tmp; j++) { 2259 idx = pj[j]; 2260 tmp = rtmp22[idx]; 2261 rtmp33[idx] -= mul3 * tmp; 2262 } 2263 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2264 } 2265 2266 pj = bj + bi[row]; 2267 pc1 = ba + bi[row]; 2268 pc2 = ba + bi[row+1]; 2269 pc3 = ba + bi[row+2]; 2270 2271 sctx.pv = rtmp33[row+2]; 2272 rs = 0.0; 2273 rtmp11[row] = 1.0/rtmp11[row]; 2274 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2275 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2276 /* copy row entries from dense representation to sparse */ 2277 for (j=0; j<nz; j++) { 2278 idx = pj[j]; 2279 pc1[j] = rtmp11[idx]; 2280 pc2[j] = rtmp22[idx]; 2281 pc3[j] = rtmp33[idx]; 2282 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2283 } 2284 2285 sctx.rs = rs; 2286 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2287 if (sctx.newshift) goto endofwhile; 2288 break; 2289 2290 default: 2291 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2292 } 2293 row += nodesz; /* Update the row */ 2294 } 2295 endofwhile:; 2296 } while (sctx.newshift); 2297 ierr = PetscFree3(rtmp11,rtmp22,rtmp33);CHKERRQ(ierr); 2298 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2299 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2300 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2301 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2302 2303 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2304 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2305 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2306 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2307 C->assembled = PETSC_TRUE; 2308 C->preallocated = PETSC_TRUE; 2309 if (sctx.nshift) { 2310 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2311 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2312 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2313 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2314 } 2315 } 2316 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2317 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2318 PetscFunctionReturn(0); 2319 } 2320 2321 2322 /* ----------------------------------------------------------- */ 2323 #undef __FUNCT__ 2324 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2325 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2326 { 2327 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2328 IS iscol = a->col,isrow = a->row; 2329 PetscErrorCode ierr; 2330 const PetscInt *r,*c,*rout,*cout; 2331 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2332 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2333 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2334 PetscScalar sum1,sum2,sum3,sum4,sum5; 2335 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2336 const PetscScalar *b; 2337 2338 PetscFunctionBegin; 2339 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2340 node_max = a->inode.node_count; 2341 ns = a->inode.size; /* Node Size array */ 2342 2343 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2344 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2345 tmp = a->solve_work; 2346 2347 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2348 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2349 2350 /* forward solve the lower triangular */ 2351 tmps = tmp; 2352 aa = a_a; 2353 aj = a_j; 2354 ad = a->diag; 2355 2356 for (i = 0,row = 0; i< node_max; ++i) { 2357 nsz = ns[i]; 2358 aii = ai[row]; 2359 v1 = aa + aii; 2360 vi = aj + aii; 2361 nz = ai[row+1]- ai[row]; 2362 2363 if (i < node_max-1) { 2364 /* Prefetch the indices for the next block */ 2365 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2366 /* Prefetch the data for the next block */ 2367 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2368 } 2369 2370 switch (nsz) { /* Each loop in 'case' is unrolled */ 2371 case 1: 2372 sum1 = b[r[row]]; 2373 for (j=0; j<nz-1; j+=2) { 2374 i0 = vi[j]; 2375 i1 = vi[j+1]; 2376 tmp0 = tmps[i0]; 2377 tmp1 = tmps[i1]; 2378 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2379 } 2380 if (j == nz-1) { 2381 tmp0 = tmps[vi[j]]; 2382 sum1 -= v1[j]*tmp0; 2383 } 2384 tmp[row++]=sum1; 2385 break; 2386 case 2: 2387 sum1 = b[r[row]]; 2388 sum2 = b[r[row+1]]; 2389 v2 = aa + ai[row+1]; 2390 2391 for (j=0; j<nz-1; j+=2) { 2392 i0 = vi[j]; 2393 i1 = vi[j+1]; 2394 tmp0 = tmps[i0]; 2395 tmp1 = tmps[i1]; 2396 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2397 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2398 } 2399 if (j == nz-1) { 2400 tmp0 = tmps[vi[j]]; 2401 sum1 -= v1[j] *tmp0; 2402 sum2 -= v2[j] *tmp0; 2403 } 2404 sum2 -= v2[nz] * sum1; 2405 tmp[row++]=sum1; 2406 tmp[row++]=sum2; 2407 break; 2408 case 3: 2409 sum1 = b[r[row]]; 2410 sum2 = b[r[row+1]]; 2411 sum3 = b[r[row+2]]; 2412 v2 = aa + ai[row+1]; 2413 v3 = aa + ai[row+2]; 2414 2415 for (j=0; j<nz-1; j+=2) { 2416 i0 = vi[j]; 2417 i1 = vi[j+1]; 2418 tmp0 = tmps[i0]; 2419 tmp1 = tmps[i1]; 2420 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2421 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2422 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2423 } 2424 if (j == nz-1) { 2425 tmp0 = tmps[vi[j]]; 2426 sum1 -= v1[j] *tmp0; 2427 sum2 -= v2[j] *tmp0; 2428 sum3 -= v3[j] *tmp0; 2429 } 2430 sum2 -= v2[nz] * sum1; 2431 sum3 -= v3[nz] * sum1; 2432 sum3 -= v3[nz+1] * sum2; 2433 tmp[row++]=sum1; 2434 tmp[row++]=sum2; 2435 tmp[row++]=sum3; 2436 break; 2437 2438 case 4: 2439 sum1 = b[r[row]]; 2440 sum2 = b[r[row+1]]; 2441 sum3 = b[r[row+2]]; 2442 sum4 = b[r[row+3]]; 2443 v2 = aa + ai[row+1]; 2444 v3 = aa + ai[row+2]; 2445 v4 = aa + ai[row+3]; 2446 2447 for (j=0; j<nz-1; j+=2) { 2448 i0 = vi[j]; 2449 i1 = vi[j+1]; 2450 tmp0 = tmps[i0]; 2451 tmp1 = tmps[i1]; 2452 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2453 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2454 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2455 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2456 } 2457 if (j == nz-1) { 2458 tmp0 = tmps[vi[j]]; 2459 sum1 -= v1[j] *tmp0; 2460 sum2 -= v2[j] *tmp0; 2461 sum3 -= v3[j] *tmp0; 2462 sum4 -= v4[j] *tmp0; 2463 } 2464 sum2 -= v2[nz] * sum1; 2465 sum3 -= v3[nz] * sum1; 2466 sum4 -= v4[nz] * sum1; 2467 sum3 -= v3[nz+1] * sum2; 2468 sum4 -= v4[nz+1] * sum2; 2469 sum4 -= v4[nz+2] * sum3; 2470 2471 tmp[row++]=sum1; 2472 tmp[row++]=sum2; 2473 tmp[row++]=sum3; 2474 tmp[row++]=sum4; 2475 break; 2476 case 5: 2477 sum1 = b[r[row]]; 2478 sum2 = b[r[row+1]]; 2479 sum3 = b[r[row+2]]; 2480 sum4 = b[r[row+3]]; 2481 sum5 = b[r[row+4]]; 2482 v2 = aa + ai[row+1]; 2483 v3 = aa + ai[row+2]; 2484 v4 = aa + ai[row+3]; 2485 v5 = aa + ai[row+4]; 2486 2487 for (j=0; j<nz-1; j+=2) { 2488 i0 = vi[j]; 2489 i1 = vi[j+1]; 2490 tmp0 = tmps[i0]; 2491 tmp1 = tmps[i1]; 2492 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2493 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2494 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2495 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2496 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2497 } 2498 if (j == nz-1) { 2499 tmp0 = tmps[vi[j]]; 2500 sum1 -= v1[j] *tmp0; 2501 sum2 -= v2[j] *tmp0; 2502 sum3 -= v3[j] *tmp0; 2503 sum4 -= v4[j] *tmp0; 2504 sum5 -= v5[j] *tmp0; 2505 } 2506 2507 sum2 -= v2[nz] * sum1; 2508 sum3 -= v3[nz] * sum1; 2509 sum4 -= v4[nz] * sum1; 2510 sum5 -= v5[nz] * sum1; 2511 sum3 -= v3[nz+1] * sum2; 2512 sum4 -= v4[nz+1] * sum2; 2513 sum5 -= v5[nz+1] * sum2; 2514 sum4 -= v4[nz+2] * sum3; 2515 sum5 -= v5[nz+2] * sum3; 2516 sum5 -= v5[nz+3] * sum4; 2517 2518 tmp[row++]=sum1; 2519 tmp[row++]=sum2; 2520 tmp[row++]=sum3; 2521 tmp[row++]=sum4; 2522 tmp[row++]=sum5; 2523 break; 2524 default: 2525 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2526 } 2527 } 2528 /* backward solve the upper triangular */ 2529 for (i=node_max -1,row = n-1; i>=0; i--) { 2530 nsz = ns[i]; 2531 aii = ad[row+1] + 1; 2532 v1 = aa + aii; 2533 vi = aj + aii; 2534 nz = ad[row]- ad[row+1] - 1; 2535 2536 if (i > 0) { 2537 /* Prefetch the indices for the next block */ 2538 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2539 /* Prefetch the data for the next block */ 2540 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2541 } 2542 2543 switch (nsz) { /* Each loop in 'case' is unrolled */ 2544 case 1: 2545 sum1 = tmp[row]; 2546 2547 for (j=0; j<nz-1; j+=2) { 2548 i0 = vi[j]; 2549 i1 = vi[j+1]; 2550 tmp0 = tmps[i0]; 2551 tmp1 = tmps[i1]; 2552 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2553 } 2554 if (j == nz-1) { 2555 tmp0 = tmps[vi[j]]; 2556 sum1 -= v1[j]*tmp0; 2557 } 2558 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2559 break; 2560 case 2: 2561 sum1 = tmp[row]; 2562 sum2 = tmp[row-1]; 2563 v2 = aa + ad[row] + 1; 2564 for (j=0; j<nz-1; j+=2) { 2565 i0 = vi[j]; 2566 i1 = vi[j+1]; 2567 tmp0 = tmps[i0]; 2568 tmp1 = tmps[i1]; 2569 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2570 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2571 } 2572 if (j == nz-1) { 2573 tmp0 = tmps[vi[j]]; 2574 sum1 -= v1[j]* tmp0; 2575 sum2 -= v2[j+1]* tmp0; 2576 } 2577 2578 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2579 sum2 -= v2[0] * tmp0; 2580 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2581 break; 2582 case 3: 2583 sum1 = tmp[row]; 2584 sum2 = tmp[row -1]; 2585 sum3 = tmp[row -2]; 2586 v2 = aa + ad[row] + 1; 2587 v3 = aa + ad[row -1] + 1; 2588 for (j=0; j<nz-1; j+=2) { 2589 i0 = vi[j]; 2590 i1 = vi[j+1]; 2591 tmp0 = tmps[i0]; 2592 tmp1 = tmps[i1]; 2593 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2594 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2595 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2596 } 2597 if (j== nz-1) { 2598 tmp0 = tmps[vi[j]]; 2599 sum1 -= v1[j] * tmp0; 2600 sum2 -= v2[j+1] * tmp0; 2601 sum3 -= v3[j+2] * tmp0; 2602 } 2603 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2604 sum2 -= v2[0]* tmp0; 2605 sum3 -= v3[1] * tmp0; 2606 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2607 sum3 -= v3[0]* tmp0; 2608 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2609 2610 break; 2611 case 4: 2612 sum1 = tmp[row]; 2613 sum2 = tmp[row -1]; 2614 sum3 = tmp[row -2]; 2615 sum4 = tmp[row -3]; 2616 v2 = aa + ad[row]+1; 2617 v3 = aa + ad[row -1]+1; 2618 v4 = aa + ad[row -2]+1; 2619 2620 for (j=0; j<nz-1; j+=2) { 2621 i0 = vi[j]; 2622 i1 = vi[j+1]; 2623 tmp0 = tmps[i0]; 2624 tmp1 = tmps[i1]; 2625 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2626 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2627 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2628 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2629 } 2630 if (j== nz-1) { 2631 tmp0 = tmps[vi[j]]; 2632 sum1 -= v1[j] * tmp0; 2633 sum2 -= v2[j+1] * tmp0; 2634 sum3 -= v3[j+2] * tmp0; 2635 sum4 -= v4[j+3] * tmp0; 2636 } 2637 2638 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2639 sum2 -= v2[0] * tmp0; 2640 sum3 -= v3[1] * tmp0; 2641 sum4 -= v4[2] * tmp0; 2642 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2643 sum3 -= v3[0] * tmp0; 2644 sum4 -= v4[1] * tmp0; 2645 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2646 sum4 -= v4[0] * tmp0; 2647 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2648 break; 2649 case 5: 2650 sum1 = tmp[row]; 2651 sum2 = tmp[row -1]; 2652 sum3 = tmp[row -2]; 2653 sum4 = tmp[row -3]; 2654 sum5 = tmp[row -4]; 2655 v2 = aa + ad[row]+1; 2656 v3 = aa + ad[row -1]+1; 2657 v4 = aa + ad[row -2]+1; 2658 v5 = aa + ad[row -3]+1; 2659 for (j=0; j<nz-1; j+=2) { 2660 i0 = vi[j]; 2661 i1 = vi[j+1]; 2662 tmp0 = tmps[i0]; 2663 tmp1 = tmps[i1]; 2664 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2665 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2666 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2667 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2668 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2669 } 2670 if (j==nz-1) { 2671 tmp0 = tmps[vi[j]]; 2672 sum1 -= v1[j] * tmp0; 2673 sum2 -= v2[j+1] * tmp0; 2674 sum3 -= v3[j+2] * tmp0; 2675 sum4 -= v4[j+3] * tmp0; 2676 sum5 -= v5[j+4] * tmp0; 2677 } 2678 2679 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2680 sum2 -= v2[0] * tmp0; 2681 sum3 -= v3[1] * tmp0; 2682 sum4 -= v4[2] * tmp0; 2683 sum5 -= v5[3] * tmp0; 2684 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2685 sum3 -= v3[0] * tmp0; 2686 sum4 -= v4[1] * tmp0; 2687 sum5 -= v5[2] * tmp0; 2688 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2689 sum4 -= v4[0] * tmp0; 2690 sum5 -= v5[1] * tmp0; 2691 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2692 sum5 -= v5[0] * tmp0; 2693 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2694 break; 2695 default: 2696 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2697 } 2698 } 2699 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2700 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2701 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2703 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2704 PetscFunctionReturn(0); 2705 } 2706 2707 2708 /* 2709 Makes a longer coloring[] array and calls the usual code with that 2710 */ 2711 #undef __FUNCT__ 2712 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2713 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2714 { 2715 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2716 PetscErrorCode ierr; 2717 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2718 PetscInt *colorused,i; 2719 ISColoringValue *newcolor; 2720 2721 PetscFunctionBegin; 2722 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2723 /* loop over inodes, marking a color for each column*/ 2724 row = 0; 2725 for (i=0; i<m; i++) { 2726 for (j=0; j<ns[i]; j++) { 2727 newcolor[row++] = coloring[i] + j*ncolors; 2728 } 2729 } 2730 2731 /* eliminate unneeded colors */ 2732 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2733 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2734 for (i=0; i<n; i++) { 2735 colorused[newcolor[i]] = 1; 2736 } 2737 2738 for (i=1; i<5*ncolors; i++) { 2739 colorused[i] += colorused[i-1]; 2740 } 2741 ncolors = colorused[5*ncolors-1]; 2742 for (i=0; i<n; i++) { 2743 newcolor[i] = colorused[newcolor[i]]-1; 2744 } 2745 ierr = PetscFree(colorused);CHKERRQ(ierr); 2746 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2747 ierr = PetscFree(coloring);CHKERRQ(ierr); 2748 PetscFunctionReturn(0); 2749 } 2750 2751 #include <petsc-private/kernels/blockinvert.h> 2752 2753 #undef __FUNCT__ 2754 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2755 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2756 { 2757 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2758 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2759 MatScalar *ibdiag,*bdiag,work[25],*t; 2760 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2761 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2762 const PetscScalar *xb, *b; 2763 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2764 PetscErrorCode ierr; 2765 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2766 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2767 2768 PetscFunctionBegin; 2769 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2770 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2771 2772 if (!a->inode.ibdiagvalid) { 2773 if (!a->inode.ibdiag) { 2774 /* calculate space needed for diagonal blocks */ 2775 for (i=0; i<m; i++) { 2776 cnt += sizes[i]*sizes[i]; 2777 } 2778 a->inode.bdiagsize = cnt; 2779 2780 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2781 } 2782 2783 /* copy over the diagonal blocks and invert them */ 2784 ibdiag = a->inode.ibdiag; 2785 bdiag = a->inode.bdiag; 2786 cnt = 0; 2787 for (i=0, row = 0; i<m; i++) { 2788 for (j=0; j<sizes[i]; j++) { 2789 for (k=0; k<sizes[i]; k++) { 2790 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2791 } 2792 } 2793 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2794 2795 switch (sizes[i]) { 2796 case 1: 2797 /* Create matrix data structure */ 2798 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2799 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2800 break; 2801 case 2: 2802 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2803 break; 2804 case 3: 2805 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2806 break; 2807 case 4: 2808 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2809 break; 2810 case 5: 2811 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2812 break; 2813 default: 2814 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2815 } 2816 cnt += sizes[i]*sizes[i]; 2817 row += sizes[i]; 2818 } 2819 a->inode.ibdiagvalid = PETSC_TRUE; 2820 } 2821 ibdiag = a->inode.ibdiag; 2822 bdiag = a->inode.bdiag; 2823 t = a->inode.ssor_work; 2824 2825 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2826 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2827 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2828 if (flag & SOR_ZERO_INITIAL_GUESS) { 2829 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2830 2831 for (i=0, row=0; i<m; i++) { 2832 sz = diag[row] - ii[row]; 2833 v1 = a->a + ii[row]; 2834 idx = a->j + ii[row]; 2835 2836 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2837 switch (sizes[i]) { 2838 case 1: 2839 2840 sum1 = b[row]; 2841 for (n = 0; n<sz-1; n+=2) { 2842 i1 = idx[0]; 2843 i2 = idx[1]; 2844 idx += 2; 2845 tmp0 = x[i1]; 2846 tmp1 = x[i2]; 2847 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2848 } 2849 2850 if (n == sz-1) { 2851 tmp0 = x[*idx]; 2852 sum1 -= *v1 * tmp0; 2853 } 2854 t[row] = sum1; 2855 x[row++] = sum1*(*ibdiag++); 2856 break; 2857 case 2: 2858 v2 = a->a + ii[row+1]; 2859 sum1 = b[row]; 2860 sum2 = b[row+1]; 2861 for (n = 0; n<sz-1; n+=2) { 2862 i1 = idx[0]; 2863 i2 = idx[1]; 2864 idx += 2; 2865 tmp0 = x[i1]; 2866 tmp1 = x[i2]; 2867 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2868 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2869 } 2870 2871 if (n == sz-1) { 2872 tmp0 = x[*idx]; 2873 sum1 -= v1[0] * tmp0; 2874 sum2 -= v2[0] * tmp0; 2875 } 2876 t[row] = sum1; 2877 t[row+1] = sum2; 2878 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2879 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2880 ibdiag += 4; 2881 break; 2882 case 3: 2883 v2 = a->a + ii[row+1]; 2884 v3 = a->a + ii[row+2]; 2885 sum1 = b[row]; 2886 sum2 = b[row+1]; 2887 sum3 = b[row+2]; 2888 for (n = 0; n<sz-1; n+=2) { 2889 i1 = idx[0]; 2890 i2 = idx[1]; 2891 idx += 2; 2892 tmp0 = x[i1]; 2893 tmp1 = x[i2]; 2894 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2895 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2896 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2897 } 2898 2899 if (n == sz-1) { 2900 tmp0 = x[*idx]; 2901 sum1 -= v1[0] * tmp0; 2902 sum2 -= v2[0] * tmp0; 2903 sum3 -= v3[0] * tmp0; 2904 } 2905 t[row] = sum1; 2906 t[row+1] = sum2; 2907 t[row+2] = sum3; 2908 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2909 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2910 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2911 ibdiag += 9; 2912 break; 2913 case 4: 2914 v2 = a->a + ii[row+1]; 2915 v3 = a->a + ii[row+2]; 2916 v4 = a->a + ii[row+3]; 2917 sum1 = b[row]; 2918 sum2 = b[row+1]; 2919 sum3 = b[row+2]; 2920 sum4 = b[row+3]; 2921 for (n = 0; n<sz-1; n+=2) { 2922 i1 = idx[0]; 2923 i2 = idx[1]; 2924 idx += 2; 2925 tmp0 = x[i1]; 2926 tmp1 = x[i2]; 2927 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2928 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2929 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2930 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2931 } 2932 2933 if (n == sz-1) { 2934 tmp0 = x[*idx]; 2935 sum1 -= v1[0] * tmp0; 2936 sum2 -= v2[0] * tmp0; 2937 sum3 -= v3[0] * tmp0; 2938 sum4 -= v4[0] * tmp0; 2939 } 2940 t[row] = sum1; 2941 t[row+1] = sum2; 2942 t[row+2] = sum3; 2943 t[row+3] = sum4; 2944 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2945 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2946 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2947 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2948 ibdiag += 16; 2949 break; 2950 case 5: 2951 v2 = a->a + ii[row+1]; 2952 v3 = a->a + ii[row+2]; 2953 v4 = a->a + ii[row+3]; 2954 v5 = a->a + ii[row+4]; 2955 sum1 = b[row]; 2956 sum2 = b[row+1]; 2957 sum3 = b[row+2]; 2958 sum4 = b[row+3]; 2959 sum5 = b[row+4]; 2960 for (n = 0; n<sz-1; n+=2) { 2961 i1 = idx[0]; 2962 i2 = idx[1]; 2963 idx += 2; 2964 tmp0 = x[i1]; 2965 tmp1 = x[i2]; 2966 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2967 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2968 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2969 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2970 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2971 } 2972 2973 if (n == sz-1) { 2974 tmp0 = x[*idx]; 2975 sum1 -= v1[0] * tmp0; 2976 sum2 -= v2[0] * tmp0; 2977 sum3 -= v3[0] * tmp0; 2978 sum4 -= v4[0] * tmp0; 2979 sum5 -= v5[0] * tmp0; 2980 } 2981 t[row] = sum1; 2982 t[row+1] = sum2; 2983 t[row+2] = sum3; 2984 t[row+3] = sum4; 2985 t[row+4] = sum5; 2986 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2987 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2988 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2989 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2990 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2991 ibdiag += 25; 2992 break; 2993 default: 2994 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2995 } 2996 } 2997 2998 xb = t; 2999 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3000 } else xb = b; 3001 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3002 3003 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3004 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3005 ibdiag -= sizes[i]*sizes[i]; 3006 sz = ii[row+1] - diag[row] - 1; 3007 v1 = a->a + diag[row] + 1; 3008 idx = a->j + diag[row] + 1; 3009 3010 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3011 switch (sizes[i]) { 3012 case 1: 3013 3014 sum1 = xb[row]; 3015 for (n = 0; n<sz-1; n+=2) { 3016 i1 = idx[0]; 3017 i2 = idx[1]; 3018 idx += 2; 3019 tmp0 = x[i1]; 3020 tmp1 = x[i2]; 3021 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3022 } 3023 3024 if (n == sz-1) { 3025 tmp0 = x[*idx]; 3026 sum1 -= *v1*tmp0; 3027 } 3028 x[row--] = sum1*(*ibdiag); 3029 break; 3030 3031 case 2: 3032 3033 sum1 = xb[row]; 3034 sum2 = xb[row-1]; 3035 /* note that sum1 is associated with the second of the two rows */ 3036 v2 = a->a + diag[row-1] + 2; 3037 for (n = 0; n<sz-1; n+=2) { 3038 i1 = idx[0]; 3039 i2 = idx[1]; 3040 idx += 2; 3041 tmp0 = x[i1]; 3042 tmp1 = x[i2]; 3043 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3044 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3045 } 3046 3047 if (n == sz-1) { 3048 tmp0 = x[*idx]; 3049 sum1 -= *v1*tmp0; 3050 sum2 -= *v2*tmp0; 3051 } 3052 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3053 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3054 break; 3055 case 3: 3056 3057 sum1 = xb[row]; 3058 sum2 = xb[row-1]; 3059 sum3 = xb[row-2]; 3060 v2 = a->a + diag[row-1] + 2; 3061 v3 = a->a + diag[row-2] + 3; 3062 for (n = 0; n<sz-1; n+=2) { 3063 i1 = idx[0]; 3064 i2 = idx[1]; 3065 idx += 2; 3066 tmp0 = x[i1]; 3067 tmp1 = x[i2]; 3068 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3069 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3070 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3071 } 3072 3073 if (n == sz-1) { 3074 tmp0 = x[*idx]; 3075 sum1 -= *v1*tmp0; 3076 sum2 -= *v2*tmp0; 3077 sum3 -= *v3*tmp0; 3078 } 3079 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3080 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3081 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3082 break; 3083 case 4: 3084 3085 sum1 = xb[row]; 3086 sum2 = xb[row-1]; 3087 sum3 = xb[row-2]; 3088 sum4 = xb[row-3]; 3089 v2 = a->a + diag[row-1] + 2; 3090 v3 = a->a + diag[row-2] + 3; 3091 v4 = a->a + diag[row-3] + 4; 3092 for (n = 0; n<sz-1; n+=2) { 3093 i1 = idx[0]; 3094 i2 = idx[1]; 3095 idx += 2; 3096 tmp0 = x[i1]; 3097 tmp1 = x[i2]; 3098 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3099 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3100 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3101 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3102 } 3103 3104 if (n == sz-1) { 3105 tmp0 = x[*idx]; 3106 sum1 -= *v1*tmp0; 3107 sum2 -= *v2*tmp0; 3108 sum3 -= *v3*tmp0; 3109 sum4 -= *v4*tmp0; 3110 } 3111 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3112 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3113 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3114 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3115 break; 3116 case 5: 3117 3118 sum1 = xb[row]; 3119 sum2 = xb[row-1]; 3120 sum3 = xb[row-2]; 3121 sum4 = xb[row-3]; 3122 sum5 = xb[row-4]; 3123 v2 = a->a + diag[row-1] + 2; 3124 v3 = a->a + diag[row-2] + 3; 3125 v4 = a->a + diag[row-3] + 4; 3126 v5 = a->a + diag[row-4] + 5; 3127 for (n = 0; n<sz-1; n+=2) { 3128 i1 = idx[0]; 3129 i2 = idx[1]; 3130 idx += 2; 3131 tmp0 = x[i1]; 3132 tmp1 = x[i2]; 3133 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3134 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3135 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3136 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3137 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3138 } 3139 3140 if (n == sz-1) { 3141 tmp0 = x[*idx]; 3142 sum1 -= *v1*tmp0; 3143 sum2 -= *v2*tmp0; 3144 sum3 -= *v3*tmp0; 3145 sum4 -= *v4*tmp0; 3146 sum5 -= *v5*tmp0; 3147 } 3148 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3149 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3150 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3151 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3152 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3153 break; 3154 default: 3155 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3156 } 3157 } 3158 3159 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3160 } 3161 its--; 3162 } 3163 while (its--) { 3164 3165 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3166 ibdiag = a->inode.ibdiag; 3167 3168 for (i=0, row=0; i<m; i++) { 3169 sz = ii[row + 1] - ii[row]; 3170 v1 = a->a + ii[row]; 3171 idx = a->j + ii[row]; 3172 3173 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3174 switch (sizes[i]) { 3175 case 1: 3176 3177 sum1 = b[row]; 3178 for (n = 0; n<sz-1; n+=2) { 3179 i1 = idx[0]; 3180 i2 = idx[1]; 3181 idx += 2; 3182 tmp0 = x[i1]; 3183 tmp1 = x[i2]; 3184 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3185 } 3186 3187 if (n == sz-1) { 3188 tmp0 = x[*idx]; 3189 sum1 -= *v1 * tmp0; 3190 } 3191 3192 /* in MatSOR_SeqAIJ this line would be 3193 * 3194 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3195 * 3196 * but omega == 1, so this becomes 3197 * 3198 * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++); 3199 * 3200 * but bdiag and ibdiag cancel each other, so we can change this 3201 * to adding sum1*(*ibdiag++). We can skip bdiag for the larger 3202 * block sizes as well 3203 */ 3204 x[row++] += sum1*(*ibdiag++); 3205 break; 3206 case 2: 3207 v2 = a->a + ii[row+1]; 3208 sum1 = b[row]; 3209 sum2 = b[row+1]; 3210 for (n = 0; n<sz-1; n+=2) { 3211 i1 = idx[0]; 3212 i2 = idx[1]; 3213 idx += 2; 3214 tmp0 = x[i1]; 3215 tmp1 = x[i2]; 3216 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3217 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3218 } 3219 3220 if (n == sz-1) { 3221 tmp0 = x[*idx]; 3222 sum1 -= v1[0] * tmp0; 3223 sum2 -= v2[0] * tmp0; 3224 } 3225 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2]; 3226 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3]; 3227 ibdiag += 4; 3228 break; 3229 case 3: 3230 v2 = a->a + ii[row+1]; 3231 v3 = a->a + ii[row+2]; 3232 sum1 = b[row]; 3233 sum2 = b[row+1]; 3234 sum3 = b[row+2]; 3235 for (n = 0; n<sz-1; n+=2) { 3236 i1 = idx[0]; 3237 i2 = idx[1]; 3238 idx += 2; 3239 tmp0 = x[i1]; 3240 tmp1 = x[i2]; 3241 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3242 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3243 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3244 } 3245 3246 if (n == sz-1) { 3247 tmp0 = x[*idx]; 3248 sum1 -= v1[0] * tmp0; 3249 sum2 -= v2[0] * tmp0; 3250 sum3 -= v3[0] * tmp0; 3251 } 3252 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3253 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3254 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3255 ibdiag += 9; 3256 break; 3257 case 4: 3258 v2 = a->a + ii[row+1]; 3259 v3 = a->a + ii[row+2]; 3260 v4 = a->a + ii[row+3]; 3261 sum1 = b[row]; 3262 sum2 = b[row+1]; 3263 sum3 = b[row+2]; 3264 sum4 = b[row+3]; 3265 for (n = 0; n<sz-1; n+=2) { 3266 i1 = idx[0]; 3267 i2 = idx[1]; 3268 idx += 2; 3269 tmp0 = x[i1]; 3270 tmp1 = x[i2]; 3271 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3272 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3273 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3274 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3275 } 3276 3277 if (n == sz-1) { 3278 tmp0 = x[*idx]; 3279 sum1 -= v1[0] * tmp0; 3280 sum2 -= v2[0] * tmp0; 3281 sum3 -= v3[0] * tmp0; 3282 sum4 -= v4[0] * tmp0; 3283 } 3284 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3285 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3286 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3287 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3288 ibdiag += 16; 3289 break; 3290 case 5: 3291 v2 = a->a + ii[row+1]; 3292 v3 = a->a + ii[row+2]; 3293 v4 = a->a + ii[row+3]; 3294 v5 = a->a + ii[row+4]; 3295 sum1 = b[row]; 3296 sum2 = b[row+1]; 3297 sum3 = b[row+2]; 3298 sum4 = b[row+3]; 3299 sum5 = b[row+4]; 3300 for (n = 0; n<sz-1; n+=2) { 3301 i1 = idx[0]; 3302 i2 = idx[1]; 3303 idx += 2; 3304 tmp0 = x[i1]; 3305 tmp1 = x[i2]; 3306 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3307 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3308 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3309 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3310 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3311 } 3312 3313 if (n == sz-1) { 3314 tmp0 = x[*idx]; 3315 sum1 -= v1[0] * tmp0; 3316 sum2 -= v2[0] * tmp0; 3317 sum3 -= v3[0] * tmp0; 3318 sum4 -= v4[0] * tmp0; 3319 sum5 -= v5[0] * tmp0; 3320 } 3321 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3322 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3323 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3324 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3325 x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3326 ibdiag += 25; 3327 break; 3328 default: 3329 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3330 } 3331 } 3332 3333 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3334 } 3335 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3336 3337 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3338 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3339 ibdiag -= sizes[i]*sizes[i]; 3340 sz = ii[row+1] - ii[row]; 3341 v1 = a->a + ii[row]; 3342 idx = a->j + ii[row]; 3343 3344 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3345 switch (sizes[i]) { 3346 case 1: 3347 3348 sum1 = b[row]; 3349 for (n = 0; n<sz-1; n+=2) { 3350 i1 = idx[0]; 3351 i2 = idx[1]; 3352 idx += 2; 3353 tmp0 = x[i1]; 3354 tmp1 = x[i2]; 3355 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3356 } 3357 3358 if (n == sz-1) { 3359 tmp0 = x[*idx]; 3360 sum1 -= *v1*tmp0; 3361 } 3362 x[row--] += sum1*(*ibdiag); 3363 break; 3364 3365 case 2: 3366 3367 sum1 = b[row]; 3368 sum2 = b[row-1]; 3369 /* note that sum1 is associated with the second of the two rows */ 3370 v2 = a->a + ii[row - 1]; 3371 for (n = 0; n<sz-1; n+=2) { 3372 i1 = idx[0]; 3373 i2 = idx[1]; 3374 idx += 2; 3375 tmp0 = x[i1]; 3376 tmp1 = x[i2]; 3377 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3378 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3379 } 3380 3381 if (n == sz-1) { 3382 tmp0 = x[*idx]; 3383 sum1 -= *v1*tmp0; 3384 sum2 -= *v2*tmp0; 3385 } 3386 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3387 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3388 break; 3389 case 3: 3390 3391 sum1 = b[row]; 3392 sum2 = b[row-1]; 3393 sum3 = b[row-2]; 3394 v2 = a->a + ii[row-1]; 3395 v3 = a->a + ii[row-2]; 3396 for (n = 0; n<sz-1; n+=2) { 3397 i1 = idx[0]; 3398 i2 = idx[1]; 3399 idx += 2; 3400 tmp0 = x[i1]; 3401 tmp1 = x[i2]; 3402 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3403 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3404 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3405 } 3406 3407 if (n == sz-1) { 3408 tmp0 = x[*idx]; 3409 sum1 -= *v1*tmp0; 3410 sum2 -= *v2*tmp0; 3411 sum3 -= *v3*tmp0; 3412 } 3413 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3414 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3415 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3416 break; 3417 case 4: 3418 3419 sum1 = b[row]; 3420 sum2 = b[row-1]; 3421 sum3 = b[row-2]; 3422 sum4 = b[row-3]; 3423 v2 = a->a + ii[row-1]; 3424 v3 = a->a + ii[row-2]; 3425 v4 = a->a + ii[row-3]; 3426 for (n = 0; n<sz-1; n+=2) { 3427 i1 = idx[0]; 3428 i2 = idx[1]; 3429 idx += 2; 3430 tmp0 = x[i1]; 3431 tmp1 = x[i2]; 3432 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3433 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3434 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3435 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3436 } 3437 3438 if (n == sz-1) { 3439 tmp0 = x[*idx]; 3440 sum1 -= *v1*tmp0; 3441 sum2 -= *v2*tmp0; 3442 sum3 -= *v3*tmp0; 3443 sum4 -= *v4*tmp0; 3444 } 3445 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3446 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3447 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3448 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3449 break; 3450 case 5: 3451 3452 sum1 = b[row]; 3453 sum2 = b[row-1]; 3454 sum3 = b[row-2]; 3455 sum4 = b[row-3]; 3456 sum5 = b[row-4]; 3457 v2 = a->a + ii[row-1]; 3458 v3 = a->a + ii[row-2]; 3459 v4 = a->a + ii[row-3]; 3460 v5 = a->a + ii[row-4]; 3461 for (n = 0; n<sz-1; n+=2) { 3462 i1 = idx[0]; 3463 i2 = idx[1]; 3464 idx += 2; 3465 tmp0 = x[i1]; 3466 tmp1 = x[i2]; 3467 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3468 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3469 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3470 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3471 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3472 } 3473 3474 if (n == sz-1) { 3475 tmp0 = x[*idx]; 3476 sum1 -= *v1*tmp0; 3477 sum2 -= *v2*tmp0; 3478 sum3 -= *v3*tmp0; 3479 sum4 -= *v4*tmp0; 3480 sum5 -= *v5*tmp0; 3481 } 3482 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3483 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3484 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3485 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3486 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3487 break; 3488 default: 3489 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3490 } 3491 } 3492 3493 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3494 } 3495 } 3496 if (flag & SOR_EISENSTAT) { 3497 /* 3498 Apply (U + D)^-1 where D is now the block diagonal 3499 */ 3500 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3501 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3502 ibdiag -= sizes[i]*sizes[i]; 3503 sz = ii[row+1] - diag[row] - 1; 3504 v1 = a->a + diag[row] + 1; 3505 idx = a->j + diag[row] + 1; 3506 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3507 switch (sizes[i]) { 3508 case 1: 3509 3510 sum1 = b[row]; 3511 for (n = 0; n<sz-1; n+=2) { 3512 i1 = idx[0]; 3513 i2 = idx[1]; 3514 idx += 2; 3515 tmp0 = x[i1]; 3516 tmp1 = x[i2]; 3517 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3518 } 3519 3520 if (n == sz-1) { 3521 tmp0 = x[*idx]; 3522 sum1 -= *v1*tmp0; 3523 } 3524 x[row] = sum1*(*ibdiag);row--; 3525 break; 3526 3527 case 2: 3528 3529 sum1 = b[row]; 3530 sum2 = b[row-1]; 3531 /* note that sum1 is associated with the second of the two rows */ 3532 v2 = a->a + diag[row-1] + 2; 3533 for (n = 0; n<sz-1; n+=2) { 3534 i1 = idx[0]; 3535 i2 = idx[1]; 3536 idx += 2; 3537 tmp0 = x[i1]; 3538 tmp1 = x[i2]; 3539 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3540 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3541 } 3542 3543 if (n == sz-1) { 3544 tmp0 = x[*idx]; 3545 sum1 -= *v1*tmp0; 3546 sum2 -= *v2*tmp0; 3547 } 3548 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3549 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3550 row -= 2; 3551 break; 3552 case 3: 3553 3554 sum1 = b[row]; 3555 sum2 = b[row-1]; 3556 sum3 = b[row-2]; 3557 v2 = a->a + diag[row-1] + 2; 3558 v3 = a->a + diag[row-2] + 3; 3559 for (n = 0; n<sz-1; n+=2) { 3560 i1 = idx[0]; 3561 i2 = idx[1]; 3562 idx += 2; 3563 tmp0 = x[i1]; 3564 tmp1 = x[i2]; 3565 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3566 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3567 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3568 } 3569 3570 if (n == sz-1) { 3571 tmp0 = x[*idx]; 3572 sum1 -= *v1*tmp0; 3573 sum2 -= *v2*tmp0; 3574 sum3 -= *v3*tmp0; 3575 } 3576 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3577 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3578 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3579 row -= 3; 3580 break; 3581 case 4: 3582 3583 sum1 = b[row]; 3584 sum2 = b[row-1]; 3585 sum3 = b[row-2]; 3586 sum4 = b[row-3]; 3587 v2 = a->a + diag[row-1] + 2; 3588 v3 = a->a + diag[row-2] + 3; 3589 v4 = a->a + diag[row-3] + 4; 3590 for (n = 0; n<sz-1; n+=2) { 3591 i1 = idx[0]; 3592 i2 = idx[1]; 3593 idx += 2; 3594 tmp0 = x[i1]; 3595 tmp1 = x[i2]; 3596 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3597 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3598 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3599 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3600 } 3601 3602 if (n == sz-1) { 3603 tmp0 = x[*idx]; 3604 sum1 -= *v1*tmp0; 3605 sum2 -= *v2*tmp0; 3606 sum3 -= *v3*tmp0; 3607 sum4 -= *v4*tmp0; 3608 } 3609 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3610 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3611 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3612 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3613 row -= 4; 3614 break; 3615 case 5: 3616 3617 sum1 = b[row]; 3618 sum2 = b[row-1]; 3619 sum3 = b[row-2]; 3620 sum4 = b[row-3]; 3621 sum5 = b[row-4]; 3622 v2 = a->a + diag[row-1] + 2; 3623 v3 = a->a + diag[row-2] + 3; 3624 v4 = a->a + diag[row-3] + 4; 3625 v5 = a->a + diag[row-4] + 5; 3626 for (n = 0; n<sz-1; n+=2) { 3627 i1 = idx[0]; 3628 i2 = idx[1]; 3629 idx += 2; 3630 tmp0 = x[i1]; 3631 tmp1 = x[i2]; 3632 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3633 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3634 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3635 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3636 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3637 } 3638 3639 if (n == sz-1) { 3640 tmp0 = x[*idx]; 3641 sum1 -= *v1*tmp0; 3642 sum2 -= *v2*tmp0; 3643 sum3 -= *v3*tmp0; 3644 sum4 -= *v4*tmp0; 3645 sum5 -= *v5*tmp0; 3646 } 3647 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3648 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3649 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3650 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3651 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3652 row -= 5; 3653 break; 3654 default: 3655 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3656 } 3657 } 3658 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3659 3660 /* 3661 t = b - D x where D is the block diagonal 3662 */ 3663 cnt = 0; 3664 for (i=0, row=0; i<m; i++) { 3665 switch (sizes[i]) { 3666 case 1: 3667 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3668 break; 3669 case 2: 3670 x1 = x[row]; x2 = x[row+1]; 3671 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3672 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3673 t[row] = b[row] - tmp1; 3674 t[row+1] = b[row+1] - tmp2; row += 2; 3675 cnt += 4; 3676 break; 3677 case 3: 3678 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3679 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3680 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3681 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3682 t[row] = b[row] - tmp1; 3683 t[row+1] = b[row+1] - tmp2; 3684 t[row+2] = b[row+2] - tmp3; row += 3; 3685 cnt += 9; 3686 break; 3687 case 4: 3688 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3689 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3690 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3691 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3692 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3693 t[row] = b[row] - tmp1; 3694 t[row+1] = b[row+1] - tmp2; 3695 t[row+2] = b[row+2] - tmp3; 3696 t[row+3] = b[row+3] - tmp4; row += 4; 3697 cnt += 16; 3698 break; 3699 case 5: 3700 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3701 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3702 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3703 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3704 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3705 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3706 t[row] = b[row] - tmp1; 3707 t[row+1] = b[row+1] - tmp2; 3708 t[row+2] = b[row+2] - tmp3; 3709 t[row+3] = b[row+3] - tmp4; 3710 t[row+4] = b[row+4] - tmp5;row += 5; 3711 cnt += 25; 3712 break; 3713 default: 3714 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3715 } 3716 } 3717 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3718 3719 3720 3721 /* 3722 Apply (L + D)^-1 where D is the block diagonal 3723 */ 3724 for (i=0, row=0; i<m; i++) { 3725 sz = diag[row] - ii[row]; 3726 v1 = a->a + ii[row]; 3727 idx = a->j + ii[row]; 3728 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3729 switch (sizes[i]) { 3730 case 1: 3731 3732 sum1 = t[row]; 3733 for (n = 0; n<sz-1; n+=2) { 3734 i1 = idx[0]; 3735 i2 = idx[1]; 3736 idx += 2; 3737 tmp0 = t[i1]; 3738 tmp1 = t[i2]; 3739 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3740 } 3741 3742 if (n == sz-1) { 3743 tmp0 = t[*idx]; 3744 sum1 -= *v1 * tmp0; 3745 } 3746 x[row] += t[row] = sum1*(*ibdiag++); row++; 3747 break; 3748 case 2: 3749 v2 = a->a + ii[row+1]; 3750 sum1 = t[row]; 3751 sum2 = t[row+1]; 3752 for (n = 0; n<sz-1; n+=2) { 3753 i1 = idx[0]; 3754 i2 = idx[1]; 3755 idx += 2; 3756 tmp0 = t[i1]; 3757 tmp1 = t[i2]; 3758 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3759 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3760 } 3761 3762 if (n == sz-1) { 3763 tmp0 = t[*idx]; 3764 sum1 -= v1[0] * tmp0; 3765 sum2 -= v2[0] * tmp0; 3766 } 3767 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3768 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3769 ibdiag += 4; row += 2; 3770 break; 3771 case 3: 3772 v2 = a->a + ii[row+1]; 3773 v3 = a->a + ii[row+2]; 3774 sum1 = t[row]; 3775 sum2 = t[row+1]; 3776 sum3 = t[row+2]; 3777 for (n = 0; n<sz-1; n+=2) { 3778 i1 = idx[0]; 3779 i2 = idx[1]; 3780 idx += 2; 3781 tmp0 = t[i1]; 3782 tmp1 = t[i2]; 3783 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3784 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3785 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3786 } 3787 3788 if (n == sz-1) { 3789 tmp0 = t[*idx]; 3790 sum1 -= v1[0] * tmp0; 3791 sum2 -= v2[0] * tmp0; 3792 sum3 -= v3[0] * tmp0; 3793 } 3794 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3795 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3796 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3797 ibdiag += 9; row += 3; 3798 break; 3799 case 4: 3800 v2 = a->a + ii[row+1]; 3801 v3 = a->a + ii[row+2]; 3802 v4 = a->a + ii[row+3]; 3803 sum1 = t[row]; 3804 sum2 = t[row+1]; 3805 sum3 = t[row+2]; 3806 sum4 = t[row+3]; 3807 for (n = 0; n<sz-1; n+=2) { 3808 i1 = idx[0]; 3809 i2 = idx[1]; 3810 idx += 2; 3811 tmp0 = t[i1]; 3812 tmp1 = t[i2]; 3813 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3814 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3815 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3816 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3817 } 3818 3819 if (n == sz-1) { 3820 tmp0 = t[*idx]; 3821 sum1 -= v1[0] * tmp0; 3822 sum2 -= v2[0] * tmp0; 3823 sum3 -= v3[0] * tmp0; 3824 sum4 -= v4[0] * tmp0; 3825 } 3826 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3827 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3828 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3829 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3830 ibdiag += 16; row += 4; 3831 break; 3832 case 5: 3833 v2 = a->a + ii[row+1]; 3834 v3 = a->a + ii[row+2]; 3835 v4 = a->a + ii[row+3]; 3836 v5 = a->a + ii[row+4]; 3837 sum1 = t[row]; 3838 sum2 = t[row+1]; 3839 sum3 = t[row+2]; 3840 sum4 = t[row+3]; 3841 sum5 = t[row+4]; 3842 for (n = 0; n<sz-1; n+=2) { 3843 i1 = idx[0]; 3844 i2 = idx[1]; 3845 idx += 2; 3846 tmp0 = t[i1]; 3847 tmp1 = t[i2]; 3848 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3849 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3850 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3851 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3852 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3853 } 3854 3855 if (n == sz-1) { 3856 tmp0 = t[*idx]; 3857 sum1 -= v1[0] * tmp0; 3858 sum2 -= v2[0] * tmp0; 3859 sum3 -= v3[0] * tmp0; 3860 sum4 -= v4[0] * tmp0; 3861 sum5 -= v5[0] * tmp0; 3862 } 3863 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3864 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3865 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3866 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3867 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3868 ibdiag += 25; row += 5; 3869 break; 3870 default: 3871 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3872 } 3873 } 3874 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3875 } 3876 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3877 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3878 PetscFunctionReturn(0); 3879 } 3880 3881 #undef __FUNCT__ 3882 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3883 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3884 { 3885 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3886 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3887 const MatScalar *bdiag = a->inode.bdiag; 3888 const PetscScalar *b; 3889 PetscErrorCode ierr; 3890 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3891 const PetscInt *sizes = a->inode.size; 3892 3893 PetscFunctionBegin; 3894 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3895 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3896 cnt = 0; 3897 for (i=0, row=0; i<m; i++) { 3898 switch (sizes[i]) { 3899 case 1: 3900 x[row] = b[row]*bdiag[cnt++];row++; 3901 break; 3902 case 2: 3903 x1 = b[row]; x2 = b[row+1]; 3904 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3905 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3906 x[row++] = tmp1; 3907 x[row++] = tmp2; 3908 cnt += 4; 3909 break; 3910 case 3: 3911 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3912 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3913 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3914 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3915 x[row++] = tmp1; 3916 x[row++] = tmp2; 3917 x[row++] = tmp3; 3918 cnt += 9; 3919 break; 3920 case 4: 3921 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3922 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3923 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3924 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3925 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3926 x[row++] = tmp1; 3927 x[row++] = tmp2; 3928 x[row++] = tmp3; 3929 x[row++] = tmp4; 3930 cnt += 16; 3931 break; 3932 case 5: 3933 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3934 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3935 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3936 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3937 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3938 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3939 x[row++] = tmp1; 3940 x[row++] = tmp2; 3941 x[row++] = tmp3; 3942 x[row++] = tmp4; 3943 x[row++] = tmp5; 3944 cnt += 25; 3945 break; 3946 default: 3947 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3948 } 3949 } 3950 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3951 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3952 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3953 PetscFunctionReturn(0); 3954 } 3955 3956 /* 3957 samestructure indicates that the matrix has not changed its nonzero structure so we 3958 do not need to recompute the inodes 3959 */ 3960 #undef __FUNCT__ 3961 #define __FUNCT__ "Mat_CheckInode" 3962 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 3963 { 3964 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3965 PetscErrorCode ierr; 3966 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3967 PetscBool flag; 3968 3969 PetscFunctionBegin; 3970 if (!a->inode.use) PetscFunctionReturn(0); 3971 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3972 3973 3974 m = A->rmap->n; 3975 if (a->inode.size) ns = a->inode.size; 3976 else { 3977 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 3978 } 3979 3980 i = 0; 3981 node_count = 0; 3982 idx = a->j; 3983 ii = a->i; 3984 while (i < m) { /* For each row */ 3985 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3986 /* Limits the number of elements in a node to 'a->inode.limit' */ 3987 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3988 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3989 if (nzy != nzx) break; 3990 idy += nzx; /* Same nonzero pattern */ 3991 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3992 if (!flag) break; 3993 } 3994 ns[node_count++] = blk_size; 3995 idx += blk_size*nzx; 3996 i = j; 3997 } 3998 /* If not enough inodes found,, do not use inode version of the routines */ 3999 if (!m || node_count > .8*m) { 4000 ierr = PetscFree(ns);CHKERRQ(ierr); 4001 4002 a->inode.node_count = 0; 4003 a->inode.size = NULL; 4004 a->inode.use = PETSC_FALSE; 4005 A->ops->mult = MatMult_SeqAIJ; 4006 A->ops->sor = MatSOR_SeqAIJ; 4007 A->ops->multadd = MatMultAdd_SeqAIJ; 4008 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4009 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4010 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4011 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4012 A->ops->coloringpatch = 0; 4013 A->ops->multdiagonalblock = 0; 4014 4015 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4016 } else { 4017 if (!A->factortype) { 4018 A->ops->mult = MatMult_SeqAIJ_Inode; 4019 A->ops->sor = MatSOR_SeqAIJ_Inode; 4020 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4021 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4022 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4023 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4024 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4025 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4026 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4027 } else { 4028 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4029 } 4030 a->inode.node_count = node_count; 4031 a->inode.size = ns; 4032 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4033 } 4034 a->inode.checked = PETSC_TRUE; 4035 PetscFunctionReturn(0); 4036 } 4037 4038 #undef __FUNCT__ 4039 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4040 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4041 { 4042 Mat B =*C; 4043 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4044 PetscErrorCode ierr; 4045 PetscInt m=A->rmap->n; 4046 4047 PetscFunctionBegin; 4048 c->inode.use = a->inode.use; 4049 c->inode.limit = a->inode.limit; 4050 c->inode.max_limit = a->inode.max_limit; 4051 if (a->inode.size) { 4052 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr); 4053 c->inode.node_count = a->inode.node_count; 4054 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4055 /* note the table of functions below should match that in Mat_CheckInode() */ 4056 if (!B->factortype) { 4057 B->ops->mult = MatMult_SeqAIJ_Inode; 4058 B->ops->sor = MatSOR_SeqAIJ_Inode; 4059 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4060 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4061 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4062 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4063 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4064 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4065 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4066 } else { 4067 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4068 } 4069 } else { 4070 c->inode.size = 0; 4071 c->inode.node_count = 0; 4072 } 4073 c->inode.ibdiagvalid = PETSC_FALSE; 4074 c->inode.ibdiag = 0; 4075 c->inode.bdiag = 0; 4076 PetscFunctionReturn(0); 4077 } 4078 4079 #undef __FUNCT__ 4080 #define __FUNCT__ "MatGetRow_FactoredLU" 4081 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row) 4082 { 4083 PetscInt k, *vi; 4084 4085 PetscFunctionBegin; 4086 vi = aj + ai[row]; 4087 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4088 vi = aj + adiag[row]; 4089 cols[nzl] = vi[0]; 4090 vi = aj + adiag[row+1]+1; 4091 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4092 PetscFunctionReturn(0); 4093 } 4094 /* 4095 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4096 Modified from Mat_CheckInode(). 4097 4098 Input Parameters: 4099 + Mat A - ILU or LU matrix factor 4100 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 4101 do not need to recompute the inodes 4102 */ 4103 #undef __FUNCT__ 4104 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4105 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure) 4106 { 4107 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4108 PetscErrorCode ierr; 4109 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4110 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 4111 PetscBool flag; 4112 4113 PetscFunctionBegin; 4114 if (!a->inode.use) PetscFunctionReturn(0); 4115 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4116 4117 m = A->rmap->n; 4118 if (a->inode.size) ns = a->inode.size; 4119 else { 4120 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 4121 } 4122 4123 i = 0; 4124 node_count = 0; 4125 while (i < m) { /* For each row */ 4126 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4127 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4128 nzx = nzl1 + nzu1 + 1; 4129 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 4130 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4131 4132 /* Limits the number of elements in a node to 'a->inode.limit' */ 4133 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4134 nzl2 = ai[j+1] - ai[j]; 4135 nzu2 = adiag[j] - adiag[j+1] - 1; 4136 nzy = nzl2 + nzu2 + 1; 4137 if (nzy != nzx) break; 4138 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 4139 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4140 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4141 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 4142 ierr = PetscFree(cols2);CHKERRQ(ierr); 4143 } 4144 ns[node_count++] = blk_size; 4145 ierr = PetscFree(cols1);CHKERRQ(ierr); 4146 i = j; 4147 } 4148 /* If not enough inodes found,, do not use inode version of the routines */ 4149 if (!m || node_count > .8*m) { 4150 ierr = PetscFree(ns);CHKERRQ(ierr); 4151 4152 a->inode.node_count = 0; 4153 a->inode.size = NULL; 4154 a->inode.use = PETSC_FALSE; 4155 4156 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4157 } else { 4158 A->ops->mult = 0; 4159 A->ops->sor = 0; 4160 A->ops->multadd = 0; 4161 A->ops->getrowij = 0; 4162 A->ops->restorerowij = 0; 4163 A->ops->getcolumnij = 0; 4164 A->ops->restorecolumnij = 0; 4165 A->ops->coloringpatch = 0; 4166 A->ops->multdiagonalblock = 0; 4167 A->ops->solve = MatSolve_SeqAIJ_Inode; 4168 a->inode.node_count = node_count; 4169 a->inode.size = ns; 4170 4171 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4172 } 4173 a->inode.checked = PETSC_TRUE; 4174 PetscFunctionReturn(0); 4175 } 4176 4177 #undef __FUNCT__ 4178 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4179 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4180 { 4181 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4182 4183 PetscFunctionBegin; 4184 a->inode.ibdiagvalid = PETSC_FALSE; 4185 PetscFunctionReturn(0); 4186 } 4187 4188 /* 4189 This is really ugly. if inodes are used this replaces the 4190 permutations with ones that correspond to rows/cols of the matrix 4191 rather then inode blocks 4192 */ 4193 #undef __FUNCT__ 4194 #define __FUNCT__ "MatInodeAdjustForInodes" 4195 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4196 { 4197 PetscErrorCode ierr; 4198 4199 PetscFunctionBegin; 4200 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4201 PetscFunctionReturn(0); 4202 } 4203 4204 #undef __FUNCT__ 4205 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4206 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4207 { 4208 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4209 PetscErrorCode ierr; 4210 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4211 const PetscInt *ridx,*cidx; 4212 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4213 PetscInt nslim_col,*ns_col; 4214 IS ris = *rperm,cis = *cperm; 4215 4216 PetscFunctionBegin; 4217 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4218 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4219 4220 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4221 ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 4222 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 4223 4224 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4225 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4226 4227 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4228 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4229 4230 /* Construct the permutations for rows*/ 4231 for (i=0,row = 0; i<nslim_row; ++i) { 4232 indx = ridx[i]; 4233 start_val = tns[indx]; 4234 end_val = tns[indx + 1]; 4235 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4236 } 4237 4238 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4239 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4240 4241 /* Construct permutations for columns */ 4242 for (i=0,col=0; i<nslim_col; ++i) { 4243 indx = cidx[i]; 4244 start_val = tns[indx]; 4245 end_val = tns[indx + 1]; 4246 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4247 } 4248 4249 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4250 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4251 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4252 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4253 4254 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4255 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4256 4257 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4258 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4259 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4260 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4261 ierr = PetscFree(tns);CHKERRQ(ierr); 4262 PetscFunctionReturn(0); 4263 } 4264 4265 #undef __FUNCT__ 4266 #define __FUNCT__ "MatInodeGetInodeSizes" 4267 /*@C 4268 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4269 4270 Not Collective 4271 4272 Input Parameter: 4273 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4274 4275 Output Parameter: 4276 + node_count - no of inodes present in the matrix. 4277 . sizes - an array of size node_count,with sizes of each inode. 4278 - limit - the max size used to generate the inodes. 4279 4280 Level: advanced 4281 4282 Notes: This routine returns some internal storage information 4283 of the matrix, it is intended to be used by advanced users. 4284 It should be called after the matrix is assembled. 4285 The contents of the sizes[] array should not be changed. 4286 NULL may be passed for information not requested. 4287 4288 .keywords: matrix, seqaij, get, inode 4289 4290 .seealso: MatGetInfo() 4291 @*/ 4292 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4293 { 4294 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4295 4296 PetscFunctionBegin; 4297 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4298 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",&f);CHKERRQ(ierr); 4299 if (f) { 4300 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4301 } 4302 PetscFunctionReturn(0); 4303 } 4304 4305 #undef __FUNCT__ 4306 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4307 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4308 { 4309 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4310 4311 PetscFunctionBegin; 4312 if (node_count) *node_count = a->inode.node_count; 4313 if (sizes) *sizes = a->inode.size; 4314 if (limit) *limit = a->inode.limit; 4315 PetscFunctionReturn(0); 4316 } 4317