1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt *size,PetscInt **ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) ; 24 for (; count+1 < n; count++,i++) ; 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 67 68 PetscFunctionBegin; 69 nslim_row = a->inode.node_count; 70 m = A->rmap->n; 71 n = A->cmap->n; 72 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 73 74 /* Use the row_inode as column_inode */ 75 nslim_col = nslim_row; 76 ns_col = ns_row; 77 78 /* allocate space for reformated inode structure */ 79 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 80 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 81 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 82 83 for (i1=0,col=0; i1<nslim_col; ++i1) { 84 nsz = ns_col[i1]; 85 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 86 } 87 /* allocate space for row pointers */ 88 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 89 *iia = ia; 90 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 91 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 92 93 /* determine the number of columns in each row */ 94 ia[0] = oshift; 95 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 96 97 j = aj + ai[row] + ishift; 98 jmax = aj + ai[row+1] + ishift; 99 col = *j++ + ishift; 100 i2 = tvc[col]; 101 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 102 ia[i1+1]++; 103 ia[i2+1]++; 104 i2++; /* Start col of next node */ 105 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 106 i2 = tvc[col]; 107 } 108 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 109 } 110 111 /* shift ia[i] to point to next row */ 112 for (i1=1; i1<nslim_row+1; i1++) { 113 row = ia[i1-1]; 114 ia[i1] += row; 115 work[i1-1] = row - oshift; 116 } 117 118 /* allocate space for column pointers */ 119 nz = ia[nslim_row] + (!ishift); 120 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 121 *jja = ja; 122 123 /* loop over lower triangular part putting into ja */ 124 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 125 j = aj + ai[row] + ishift; 126 jmax = aj + ai[row+1] + ishift; 127 col = *j++ + ishift; 128 i2 = tvc[col]; 129 while (i2<i1 && j<jmax) { 130 ja[work[i2]++] = i1 + oshift; 131 ja[work[i1]++] = i2 + oshift; 132 ++i2; 133 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 134 i2 = tvc[col]; 135 } 136 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 137 138 } 139 ierr = PetscFree(work);CHKERRQ(ierr); 140 ierr = PetscFree(tns);CHKERRQ(ierr); 141 ierr = PetscFree(tvc);CHKERRQ(ierr); 142 PetscFunctionReturn(0); 143 } 144 145 /* 146 This builds nonsymmetric version of nonzero structure, 147 */ 148 #undef __FUNCT__ 149 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 150 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 151 { 152 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 153 PetscErrorCode ierr; 154 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 155 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 156 157 PetscFunctionBegin; 158 nslim_row = a->inode.node_count; 159 n = A->cmap->n; 160 161 /* Create The column_inode for this matrix */ 162 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 163 164 /* allocate space for reformated column_inode structure */ 165 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 166 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 167 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 168 169 for (i1=0,col=0; i1<nslim_col; ++i1) { 170 nsz = ns_col[i1]; 171 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 172 } 173 /* allocate space for row pointers */ 174 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 175 *iia = ia; 176 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 177 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 178 179 /* determine the number of columns in each row */ 180 ia[0] = oshift; 181 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 182 j = aj + ai[row] + ishift; 183 col = *j++ + ishift; 184 i2 = tvc[col]; 185 nz = ai[row+1] - ai[row]; 186 while (nz-- > 0) { /* off-diagonal elemets */ 187 ia[i1+1]++; 188 i2++; /* Start col of next node */ 189 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) nz--; 190 if (nz > 0) i2 = tvc[col]; 191 } 192 } 193 194 /* shift ia[i] to point to next row */ 195 for (i1=1; i1<nslim_row+1; i1++) { 196 row = ia[i1-1]; 197 ia[i1] += row; 198 work[i1-1] = row - oshift; 199 } 200 201 /* allocate space for column pointers */ 202 nz = ia[nslim_row] + (!ishift); 203 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 204 *jja = ja; 205 206 /* loop over matrix putting into ja */ 207 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 208 j = aj + ai[row] + ishift; 209 col = *j++ + ishift; 210 i2 = tvc[col]; 211 nz = ai[row+1] - ai[row]; 212 while (nz-- > 0) { 213 ja[work[i1]++] = i2 + oshift; 214 ++i2; 215 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) nz--; 216 if (nz > 0) i2 = tvc[col]; 217 } 218 } 219 ierr = PetscFree(ns_col);CHKERRQ(ierr); 220 ierr = PetscFree(work);CHKERRQ(ierr); 221 ierr = PetscFree(tns);CHKERRQ(ierr); 222 ierr = PetscFree(tvc);CHKERRQ(ierr); 223 PetscFunctionReturn(0); 224 } 225 226 #undef __FUNCT__ 227 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 228 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 229 { 230 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 231 PetscErrorCode ierr; 232 233 PetscFunctionBegin; 234 *n = a->inode.node_count; 235 if (!ia) PetscFunctionReturn(0); 236 if (!blockcompressed) { 237 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 238 } else if (symmetric) { 239 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 240 } else { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } 243 PetscFunctionReturn(0); 244 } 245 246 #undef __FUNCT__ 247 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 248 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 249 { 250 PetscErrorCode ierr; 251 252 PetscFunctionBegin; 253 if (!ia) PetscFunctionReturn(0); 254 255 if (!blockcompressed) { 256 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 257 } else { 258 ierr = PetscFree(*ia);CHKERRQ(ierr); 259 ierr = PetscFree(*ja);CHKERRQ(ierr); 260 } 261 PetscFunctionReturn(0); 262 } 263 264 /* ----------------------------------------------------------- */ 265 266 #undef __FUNCT__ 267 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 268 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 269 { 270 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 271 PetscErrorCode ierr; 272 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 273 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 274 275 PetscFunctionBegin; 276 nslim_row = a->inode.node_count; 277 n = A->cmap->n; 278 279 /* Create The column_inode for this matrix */ 280 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 281 282 /* allocate space for reformated column_inode structure */ 283 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 284 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 285 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 286 287 for (i1=0,col=0; i1<nslim_col; ++i1) { 288 nsz = ns_col[i1]; 289 for (i2=0; i2<nsz; ++i2,++col) tvc[col] = i1; 290 } 291 /* allocate space for column pointers */ 292 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 293 *iia = ia; 294 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 295 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 296 297 /* determine the number of columns in each row */ 298 ia[0] = oshift; 299 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 300 j = aj + ai[row] + ishift; 301 col = *j++ + ishift; 302 i2 = tvc[col]; 303 nz = ai[row+1] - ai[row]; 304 while (nz-- > 0) { /* off-diagonal elemets */ 305 /* ia[i1+1]++; */ 306 ia[i2+1]++; 307 i2++; 308 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) nz--; 309 if (nz > 0) i2 = tvc[col]; 310 } 311 } 312 313 /* shift ia[i] to point to next col */ 314 for (i1=1; i1<nslim_col+1; i1++) { 315 col = ia[i1-1]; 316 ia[i1] += col; 317 work[i1-1] = col - oshift; 318 } 319 320 /* allocate space for column pointers */ 321 nz = ia[nslim_col] + (!ishift); 322 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 323 *jja = ja; 324 325 /* loop over matrix putting into ja */ 326 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 327 j = aj + ai[row] + ishift; 328 col = *j++ + ishift; 329 i2 = tvc[col]; 330 nz = ai[row+1] - ai[row]; 331 while (nz-- > 0) { 332 /* ja[work[i1]++] = i2 + oshift; */ 333 ja[work[i2]++] = i1 + oshift; 334 i2++; 335 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) nz--; 336 if (nz > 0) i2 = tvc[col]; 337 } 338 } 339 ierr = PetscFree(ns_col);CHKERRQ(ierr); 340 ierr = PetscFree(work);CHKERRQ(ierr); 341 ierr = PetscFree(tns);CHKERRQ(ierr); 342 ierr = PetscFree(tvc);CHKERRQ(ierr); 343 PetscFunctionReturn(0); 344 } 345 346 #undef __FUNCT__ 347 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 348 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 349 { 350 PetscErrorCode ierr; 351 352 PetscFunctionBegin; 353 ierr = Mat_CreateColInode(A,n,NULL);CHKERRQ(ierr); 354 if (!ia) PetscFunctionReturn(0); 355 356 if (!blockcompressed) { 357 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 358 } else if (symmetric) { 359 /* Since the indices are symmetric it does'nt matter */ 360 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 361 } else { 362 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } 364 PetscFunctionReturn(0); 365 } 366 367 #undef __FUNCT__ 368 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 369 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 370 { 371 PetscErrorCode ierr; 372 373 PetscFunctionBegin; 374 if (!ia) PetscFunctionReturn(0); 375 if (!blockcompressed) { 376 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 377 } else { 378 ierr = PetscFree(*ia);CHKERRQ(ierr); 379 ierr = PetscFree(*ja);CHKERRQ(ierr); 380 } 381 PetscFunctionReturn(0); 382 } 383 384 /* ----------------------------------------------------------- */ 385 386 #undef __FUNCT__ 387 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 388 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 389 { 390 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 391 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 392 PetscScalar *y; 393 const PetscScalar *x; 394 const MatScalar *v1,*v2,*v3,*v4,*v5; 395 PetscErrorCode ierr; 396 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 397 398 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 399 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 400 #endif 401 402 PetscFunctionBegin; 403 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 404 node_max = a->inode.node_count; 405 ns = a->inode.size; /* Node Size array */ 406 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 407 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 408 idx = a->j; 409 v1 = a->a; 410 ii = a->i; 411 412 for (i = 0,row = 0; i< node_max; ++i) { 413 nsz = ns[i]; 414 n = ii[1] - ii[0]; 415 nonzerorow += (n>0)*nsz; 416 ii += nsz; 417 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 418 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 419 sz = n; /* No of non zeros in this row */ 420 /* Switch on the size of Node */ 421 switch (nsz) { /* Each loop in 'case' is unrolled */ 422 case 1: 423 sum1 = 0.; 424 425 for (n = 0; n< sz-1; n+=2) { 426 i1 = idx[0]; /* The instructions are ordered to */ 427 i2 = idx[1]; /* make the compiler's job easy */ 428 idx += 2; 429 tmp0 = x[i1]; 430 tmp1 = x[i2]; 431 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 432 } 433 434 if (n == sz-1) { /* Take care of the last nonzero */ 435 tmp0 = x[*idx++]; 436 sum1 += *v1++ *tmp0; 437 } 438 y[row++]=sum1; 439 break; 440 case 2: 441 sum1 = 0.; 442 sum2 = 0.; 443 v2 = v1 + n; 444 445 for (n = 0; n< sz-1; n+=2) { 446 i1 = idx[0]; 447 i2 = idx[1]; 448 idx += 2; 449 tmp0 = x[i1]; 450 tmp1 = x[i2]; 451 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 452 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 453 } 454 if (n == sz-1) { 455 tmp0 = x[*idx++]; 456 sum1 += *v1++ * tmp0; 457 sum2 += *v2++ * tmp0; 458 } 459 y[row++]=sum1; 460 y[row++]=sum2; 461 v1 =v2; /* Since the next block to be processed starts there*/ 462 idx +=sz; 463 break; 464 case 3: 465 sum1 = 0.; 466 sum2 = 0.; 467 sum3 = 0.; 468 v2 = v1 + n; 469 v3 = v2 + n; 470 471 for (n = 0; n< sz-1; n+=2) { 472 i1 = idx[0]; 473 i2 = idx[1]; 474 idx += 2; 475 tmp0 = x[i1]; 476 tmp1 = x[i2]; 477 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 478 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 479 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 480 } 481 if (n == sz-1) { 482 tmp0 = x[*idx++]; 483 sum1 += *v1++ * tmp0; 484 sum2 += *v2++ * tmp0; 485 sum3 += *v3++ * tmp0; 486 } 487 y[row++]=sum1; 488 y[row++]=sum2; 489 y[row++]=sum3; 490 v1 =v3; /* Since the next block to be processed starts there*/ 491 idx +=2*sz; 492 break; 493 case 4: 494 sum1 = 0.; 495 sum2 = 0.; 496 sum3 = 0.; 497 sum4 = 0.; 498 v2 = v1 + n; 499 v3 = v2 + n; 500 v4 = v3 + n; 501 502 for (n = 0; n< sz-1; n+=2) { 503 i1 = idx[0]; 504 i2 = idx[1]; 505 idx += 2; 506 tmp0 = x[i1]; 507 tmp1 = x[i2]; 508 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 509 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 510 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 511 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 512 } 513 if (n == sz-1) { 514 tmp0 = x[*idx++]; 515 sum1 += *v1++ * tmp0; 516 sum2 += *v2++ * tmp0; 517 sum3 += *v3++ * tmp0; 518 sum4 += *v4++ * tmp0; 519 } 520 y[row++]=sum1; 521 y[row++]=sum2; 522 y[row++]=sum3; 523 y[row++]=sum4; 524 v1 =v4; /* Since the next block to be processed starts there*/ 525 idx +=3*sz; 526 break; 527 case 5: 528 sum1 = 0.; 529 sum2 = 0.; 530 sum3 = 0.; 531 sum4 = 0.; 532 sum5 = 0.; 533 v2 = v1 + n; 534 v3 = v2 + n; 535 v4 = v3 + n; 536 v5 = v4 + n; 537 538 for (n = 0; n<sz-1; n+=2) { 539 i1 = idx[0]; 540 i2 = idx[1]; 541 idx += 2; 542 tmp0 = x[i1]; 543 tmp1 = x[i2]; 544 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 545 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 546 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 547 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 548 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 549 } 550 if (n == sz-1) { 551 tmp0 = x[*idx++]; 552 sum1 += *v1++ * tmp0; 553 sum2 += *v2++ * tmp0; 554 sum3 += *v3++ * tmp0; 555 sum4 += *v4++ * tmp0; 556 sum5 += *v5++ * tmp0; 557 } 558 y[row++]=sum1; 559 y[row++]=sum2; 560 y[row++]=sum3; 561 y[row++]=sum4; 562 y[row++]=sum5; 563 v1 =v5; /* Since the next block to be processed starts there */ 564 idx +=4*sz; 565 break; 566 default: 567 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 568 } 569 } 570 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 571 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 572 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 573 PetscFunctionReturn(0); 574 } 575 /* ----------------------------------------------------------- */ 576 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 577 #undef __FUNCT__ 578 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 579 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 580 { 581 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 582 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 583 MatScalar *v1,*v2,*v3,*v4,*v5; 584 PetscScalar *x,*y,*z,*zt; 585 PetscErrorCode ierr; 586 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 587 588 PetscFunctionBegin; 589 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 590 node_max = a->inode.node_count; 591 ns = a->inode.size; /* Node Size array */ 592 593 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 594 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 595 if (zz != yy) { 596 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 597 } else { 598 z = y; 599 } 600 zt = z; 601 602 idx = a->j; 603 v1 = a->a; 604 ii = a->i; 605 606 for (i = 0,row = 0; i< node_max; ++i) { 607 nsz = ns[i]; 608 n = ii[1] - ii[0]; 609 ii += nsz; 610 sz = n; /* No of non zeros in this row */ 611 /* Switch on the size of Node */ 612 switch (nsz) { /* Each loop in 'case' is unrolled */ 613 case 1: 614 sum1 = *zt++; 615 616 for (n = 0; n< sz-1; n+=2) { 617 i1 = idx[0]; /* The instructions are ordered to */ 618 i2 = idx[1]; /* make the compiler's job easy */ 619 idx += 2; 620 tmp0 = x[i1]; 621 tmp1 = x[i2]; 622 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 623 } 624 625 if (n == sz-1) { /* Take care of the last nonzero */ 626 tmp0 = x[*idx++]; 627 sum1 += *v1++ * tmp0; 628 } 629 y[row++]=sum1; 630 break; 631 case 2: 632 sum1 = *zt++; 633 sum2 = *zt++; 634 v2 = v1 + n; 635 636 for (n = 0; n< sz-1; n+=2) { 637 i1 = idx[0]; 638 i2 = idx[1]; 639 idx += 2; 640 tmp0 = x[i1]; 641 tmp1 = x[i2]; 642 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 643 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 644 } 645 if (n == sz-1) { 646 tmp0 = x[*idx++]; 647 sum1 += *v1++ * tmp0; 648 sum2 += *v2++ * tmp0; 649 } 650 y[row++]=sum1; 651 y[row++]=sum2; 652 v1 =v2; /* Since the next block to be processed starts there*/ 653 idx +=sz; 654 break; 655 case 3: 656 sum1 = *zt++; 657 sum2 = *zt++; 658 sum3 = *zt++; 659 v2 = v1 + n; 660 v3 = v2 + n; 661 662 for (n = 0; n< sz-1; n+=2) { 663 i1 = idx[0]; 664 i2 = idx[1]; 665 idx += 2; 666 tmp0 = x[i1]; 667 tmp1 = x[i2]; 668 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 669 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 670 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 671 } 672 if (n == sz-1) { 673 tmp0 = x[*idx++]; 674 sum1 += *v1++ * tmp0; 675 sum2 += *v2++ * tmp0; 676 sum3 += *v3++ * tmp0; 677 } 678 y[row++]=sum1; 679 y[row++]=sum2; 680 y[row++]=sum3; 681 v1 =v3; /* Since the next block to be processed starts there*/ 682 idx +=2*sz; 683 break; 684 case 4: 685 sum1 = *zt++; 686 sum2 = *zt++; 687 sum3 = *zt++; 688 sum4 = *zt++; 689 v2 = v1 + n; 690 v3 = v2 + n; 691 v4 = v3 + n; 692 693 for (n = 0; n< sz-1; n+=2) { 694 i1 = idx[0]; 695 i2 = idx[1]; 696 idx += 2; 697 tmp0 = x[i1]; 698 tmp1 = x[i2]; 699 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 700 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 701 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 702 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 703 } 704 if (n == sz-1) { 705 tmp0 = x[*idx++]; 706 sum1 += *v1++ * tmp0; 707 sum2 += *v2++ * tmp0; 708 sum3 += *v3++ * tmp0; 709 sum4 += *v4++ * tmp0; 710 } 711 y[row++]=sum1; 712 y[row++]=sum2; 713 y[row++]=sum3; 714 y[row++]=sum4; 715 v1 =v4; /* Since the next block to be processed starts there*/ 716 idx +=3*sz; 717 break; 718 case 5: 719 sum1 = *zt++; 720 sum2 = *zt++; 721 sum3 = *zt++; 722 sum4 = *zt++; 723 sum5 = *zt++; 724 v2 = v1 + n; 725 v3 = v2 + n; 726 v4 = v3 + n; 727 v5 = v4 + n; 728 729 for (n = 0; n<sz-1; n+=2) { 730 i1 = idx[0]; 731 i2 = idx[1]; 732 idx += 2; 733 tmp0 = x[i1]; 734 tmp1 = x[i2]; 735 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 736 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 737 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 738 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 739 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 740 } 741 if (n == sz-1) { 742 tmp0 = x[*idx++]; 743 sum1 += *v1++ * tmp0; 744 sum2 += *v2++ * tmp0; 745 sum3 += *v3++ * tmp0; 746 sum4 += *v4++ * tmp0; 747 sum5 += *v5++ * tmp0; 748 } 749 y[row++]=sum1; 750 y[row++]=sum2; 751 y[row++]=sum3; 752 y[row++]=sum4; 753 y[row++]=sum5; 754 v1 =v5; /* Since the next block to be processed starts there */ 755 idx +=4*sz; 756 break; 757 default: 758 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 759 } 760 } 761 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 762 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 763 if (zz != yy) { 764 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 765 } 766 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 767 PetscFunctionReturn(0); 768 } 769 770 /* ----------------------------------------------------------- */ 771 #undef __FUNCT__ 772 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 773 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 774 { 775 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 776 IS iscol = a->col,isrow = a->row; 777 PetscErrorCode ierr; 778 const PetscInt *r,*c,*rout,*cout; 779 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 780 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 781 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 782 PetscScalar sum1,sum2,sum3,sum4,sum5; 783 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 784 const PetscScalar *b; 785 786 PetscFunctionBegin; 787 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 788 node_max = a->inode.node_count; 789 ns = a->inode.size; /* Node Size array */ 790 791 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 792 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 793 tmp = a->solve_work; 794 795 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 796 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 797 798 /* forward solve the lower triangular */ 799 tmps = tmp; 800 aa = a_a; 801 aj = a_j; 802 ad = a->diag; 803 804 for (i = 0,row = 0; i< node_max; ++i) { 805 nsz = ns[i]; 806 aii = ai[row]; 807 v1 = aa + aii; 808 vi = aj + aii; 809 nz = ad[row]- aii; 810 if (i < node_max-1) { 811 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 812 * but our indexing to determine it's size could. */ 813 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 814 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 815 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 816 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 817 } 818 819 switch (nsz) { /* Each loop in 'case' is unrolled */ 820 case 1: 821 sum1 = b[*r++]; 822 for (j=0; j<nz-1; j+=2) { 823 i0 = vi[0]; 824 i1 = vi[1]; 825 vi +=2; 826 tmp0 = tmps[i0]; 827 tmp1 = tmps[i1]; 828 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 829 } 830 if (j == nz-1) { 831 tmp0 = tmps[*vi++]; 832 sum1 -= *v1++ *tmp0; 833 } 834 tmp[row++]=sum1; 835 break; 836 case 2: 837 sum1 = b[*r++]; 838 sum2 = b[*r++]; 839 v2 = aa + ai[row+1]; 840 841 for (j=0; j<nz-1; j+=2) { 842 i0 = vi[0]; 843 i1 = vi[1]; 844 vi +=2; 845 tmp0 = tmps[i0]; 846 tmp1 = tmps[i1]; 847 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 848 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 849 } 850 if (j == nz-1) { 851 tmp0 = tmps[*vi++]; 852 sum1 -= *v1++ *tmp0; 853 sum2 -= *v2++ *tmp0; 854 } 855 sum2 -= *v2++ *sum1; 856 tmp[row++]=sum1; 857 tmp[row++]=sum2; 858 break; 859 case 3: 860 sum1 = b[*r++]; 861 sum2 = b[*r++]; 862 sum3 = b[*r++]; 863 v2 = aa + ai[row+1]; 864 v3 = aa + ai[row+2]; 865 866 for (j=0; j<nz-1; j+=2) { 867 i0 = vi[0]; 868 i1 = vi[1]; 869 vi +=2; 870 tmp0 = tmps[i0]; 871 tmp1 = tmps[i1]; 872 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 873 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 874 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 875 } 876 if (j == nz-1) { 877 tmp0 = tmps[*vi++]; 878 sum1 -= *v1++ *tmp0; 879 sum2 -= *v2++ *tmp0; 880 sum3 -= *v3++ *tmp0; 881 } 882 sum2 -= *v2++ * sum1; 883 sum3 -= *v3++ * sum1; 884 sum3 -= *v3++ * sum2; 885 886 tmp[row++]=sum1; 887 tmp[row++]=sum2; 888 tmp[row++]=sum3; 889 break; 890 891 case 4: 892 sum1 = b[*r++]; 893 sum2 = b[*r++]; 894 sum3 = b[*r++]; 895 sum4 = b[*r++]; 896 v2 = aa + ai[row+1]; 897 v3 = aa + ai[row+2]; 898 v4 = aa + ai[row+3]; 899 900 for (j=0; j<nz-1; j+=2) { 901 i0 = vi[0]; 902 i1 = vi[1]; 903 vi +=2; 904 tmp0 = tmps[i0]; 905 tmp1 = tmps[i1]; 906 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 907 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 908 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 909 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 910 } 911 if (j == nz-1) { 912 tmp0 = tmps[*vi++]; 913 sum1 -= *v1++ *tmp0; 914 sum2 -= *v2++ *tmp0; 915 sum3 -= *v3++ *tmp0; 916 sum4 -= *v4++ *tmp0; 917 } 918 sum2 -= *v2++ * sum1; 919 sum3 -= *v3++ * sum1; 920 sum4 -= *v4++ * sum1; 921 sum3 -= *v3++ * sum2; 922 sum4 -= *v4++ * sum2; 923 sum4 -= *v4++ * sum3; 924 925 tmp[row++]=sum1; 926 tmp[row++]=sum2; 927 tmp[row++]=sum3; 928 tmp[row++]=sum4; 929 break; 930 case 5: 931 sum1 = b[*r++]; 932 sum2 = b[*r++]; 933 sum3 = b[*r++]; 934 sum4 = b[*r++]; 935 sum5 = b[*r++]; 936 v2 = aa + ai[row+1]; 937 v3 = aa + ai[row+2]; 938 v4 = aa + ai[row+3]; 939 v5 = aa + ai[row+4]; 940 941 for (j=0; j<nz-1; j+=2) { 942 i0 = vi[0]; 943 i1 = vi[1]; 944 vi +=2; 945 tmp0 = tmps[i0]; 946 tmp1 = tmps[i1]; 947 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 948 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 949 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 950 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 951 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 952 } 953 if (j == nz-1) { 954 tmp0 = tmps[*vi++]; 955 sum1 -= *v1++ *tmp0; 956 sum2 -= *v2++ *tmp0; 957 sum3 -= *v3++ *tmp0; 958 sum4 -= *v4++ *tmp0; 959 sum5 -= *v5++ *tmp0; 960 } 961 962 sum2 -= *v2++ * sum1; 963 sum3 -= *v3++ * sum1; 964 sum4 -= *v4++ * sum1; 965 sum5 -= *v5++ * sum1; 966 sum3 -= *v3++ * sum2; 967 sum4 -= *v4++ * sum2; 968 sum5 -= *v5++ * sum2; 969 sum4 -= *v4++ * sum3; 970 sum5 -= *v5++ * sum3; 971 sum5 -= *v5++ * sum4; 972 973 tmp[row++]=sum1; 974 tmp[row++]=sum2; 975 tmp[row++]=sum3; 976 tmp[row++]=sum4; 977 tmp[row++]=sum5; 978 break; 979 default: 980 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 981 } 982 } 983 /* backward solve the upper triangular */ 984 for (i=node_max -1,row = n-1; i>=0; i--) { 985 nsz = ns[i]; 986 aii = ai[row+1] -1; 987 v1 = aa + aii; 988 vi = aj + aii; 989 nz = aii- ad[row]; 990 switch (nsz) { /* Each loop in 'case' is unrolled */ 991 case 1: 992 sum1 = tmp[row]; 993 994 for (j=nz; j>1; j-=2) { 995 vi -=2; 996 i0 = vi[2]; 997 i1 = vi[1]; 998 tmp0 = tmps[i0]; 999 tmp1 = tmps[i1]; 1000 v1 -= 2; 1001 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1002 } 1003 if (j==1) { 1004 tmp0 = tmps[*vi--]; 1005 sum1 -= *v1-- * tmp0; 1006 } 1007 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1008 break; 1009 case 2: 1010 sum1 = tmp[row]; 1011 sum2 = tmp[row -1]; 1012 v2 = aa + ai[row]-1; 1013 for (j=nz; j>1; j-=2) { 1014 vi -=2; 1015 i0 = vi[2]; 1016 i1 = vi[1]; 1017 tmp0 = tmps[i0]; 1018 tmp1 = tmps[i1]; 1019 v1 -= 2; 1020 v2 -= 2; 1021 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1022 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1023 } 1024 if (j==1) { 1025 tmp0 = tmps[*vi--]; 1026 sum1 -= *v1-- * tmp0; 1027 sum2 -= *v2-- * tmp0; 1028 } 1029 1030 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1031 sum2 -= *v2-- * tmp0; 1032 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1033 break; 1034 case 3: 1035 sum1 = tmp[row]; 1036 sum2 = tmp[row -1]; 1037 sum3 = tmp[row -2]; 1038 v2 = aa + ai[row]-1; 1039 v3 = aa + ai[row -1]-1; 1040 for (j=nz; j>1; j-=2) { 1041 vi -=2; 1042 i0 = vi[2]; 1043 i1 = vi[1]; 1044 tmp0 = tmps[i0]; 1045 tmp1 = tmps[i1]; 1046 v1 -= 2; 1047 v2 -= 2; 1048 v3 -= 2; 1049 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1050 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1051 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1052 } 1053 if (j==1) { 1054 tmp0 = tmps[*vi--]; 1055 sum1 -= *v1-- * tmp0; 1056 sum2 -= *v2-- * tmp0; 1057 sum3 -= *v3-- * tmp0; 1058 } 1059 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1060 sum2 -= *v2-- * tmp0; 1061 sum3 -= *v3-- * tmp0; 1062 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1063 sum3 -= *v3-- * tmp0; 1064 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1065 1066 break; 1067 case 4: 1068 sum1 = tmp[row]; 1069 sum2 = tmp[row -1]; 1070 sum3 = tmp[row -2]; 1071 sum4 = tmp[row -3]; 1072 v2 = aa + ai[row]-1; 1073 v3 = aa + ai[row -1]-1; 1074 v4 = aa + ai[row -2]-1; 1075 1076 for (j=nz; j>1; j-=2) { 1077 vi -=2; 1078 i0 = vi[2]; 1079 i1 = vi[1]; 1080 tmp0 = tmps[i0]; 1081 tmp1 = tmps[i1]; 1082 v1 -= 2; 1083 v2 -= 2; 1084 v3 -= 2; 1085 v4 -= 2; 1086 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1087 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1088 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1089 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1090 } 1091 if (j==1) { 1092 tmp0 = tmps[*vi--]; 1093 sum1 -= *v1-- * tmp0; 1094 sum2 -= *v2-- * tmp0; 1095 sum3 -= *v3-- * tmp0; 1096 sum4 -= *v4-- * tmp0; 1097 } 1098 1099 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1104 sum3 -= *v3-- * tmp0; 1105 sum4 -= *v4-- * tmp0; 1106 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1107 sum4 -= *v4-- * tmp0; 1108 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1109 break; 1110 case 5: 1111 sum1 = tmp[row]; 1112 sum2 = tmp[row -1]; 1113 sum3 = tmp[row -2]; 1114 sum4 = tmp[row -3]; 1115 sum5 = tmp[row -4]; 1116 v2 = aa + ai[row]-1; 1117 v3 = aa + ai[row -1]-1; 1118 v4 = aa + ai[row -2]-1; 1119 v5 = aa + ai[row -3]-1; 1120 for (j=nz; j>1; j-=2) { 1121 vi -= 2; 1122 i0 = vi[2]; 1123 i1 = vi[1]; 1124 tmp0 = tmps[i0]; 1125 tmp1 = tmps[i1]; 1126 v1 -= 2; 1127 v2 -= 2; 1128 v3 -= 2; 1129 v4 -= 2; 1130 v5 -= 2; 1131 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1132 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1133 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1134 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1135 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1136 } 1137 if (j==1) { 1138 tmp0 = tmps[*vi--]; 1139 sum1 -= *v1-- * tmp0; 1140 sum2 -= *v2-- * tmp0; 1141 sum3 -= *v3-- * tmp0; 1142 sum4 -= *v4-- * tmp0; 1143 sum5 -= *v5-- * tmp0; 1144 } 1145 1146 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1152 sum3 -= *v3-- * tmp0; 1153 sum4 -= *v4-- * tmp0; 1154 sum5 -= *v5-- * tmp0; 1155 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1159 sum5 -= *v5-- * tmp0; 1160 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1161 break; 1162 default: 1163 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1164 } 1165 } 1166 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1167 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1168 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1171 PetscFunctionReturn(0); 1172 } 1173 1174 #undef __FUNCT__ 1175 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1176 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1177 { 1178 Mat C =B; 1179 Mat_SeqAIJ *a =(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ*)C->data; 1180 IS isrow = b->row,isicol = b->icol; 1181 PetscErrorCode ierr; 1182 const PetscInt *r,*ic,*ics; 1183 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1184 PetscInt i,j,k,nz,nzL,row,*pj; 1185 const PetscInt *ajtmp,*bjtmp; 1186 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1187 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1188 FactorShiftCtx sctx; 1189 const PetscInt *ddiag; 1190 PetscReal rs; 1191 MatScalar d; 1192 PetscInt inod,nodesz,node_max,col; 1193 const PetscInt *ns; 1194 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1195 1196 PetscFunctionBegin; 1197 /* MatPivotSetUp(): initialize shift context sctx */ 1198 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1199 1200 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1201 ddiag = a->diag; 1202 sctx.shift_top = info->zeropivot; 1203 for (i=0; i<n; i++) { 1204 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1205 d = (aa)[ddiag[i]]; 1206 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1207 v = aa+ai[i]; 1208 nz = ai[i+1] - ai[i]; 1209 for (j=0; j<nz; j++) rs += PetscAbsScalar(v[j]); 1210 if (rs>sctx.shift_top) sctx.shift_top = rs; 1211 } 1212 sctx.shift_top *= 1.1; 1213 sctx.nshift_max = 5; 1214 sctx.shift_lo = 0.; 1215 sctx.shift_hi = 1.; 1216 } 1217 1218 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1219 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1220 1221 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1222 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1223 rtmp2 = rtmp1 + n; 1224 rtmp3 = rtmp2 + n; 1225 rtmp4 = rtmp3 + n; 1226 ics = ic; 1227 1228 node_max = a->inode.node_count; 1229 ns = a->inode.size; 1230 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1231 1232 /* If max inode size > 4, split it into two inodes.*/ 1233 /* also map the inode sizes according to the ordering */ 1234 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1235 for (i=0,j=0; i<node_max; ++i,++j) { 1236 if (ns[i] > 4) { 1237 tmp_vec1[j] = 4; 1238 ++j; 1239 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1240 } else { 1241 tmp_vec1[j] = ns[i]; 1242 } 1243 } 1244 /* Use the correct node_max */ 1245 node_max = j; 1246 1247 /* Now reorder the inode info based on mat re-ordering info */ 1248 /* First create a row -> inode_size_array_index map */ 1249 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1250 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1251 for (i=0,row=0; i<node_max; i++) { 1252 nodesz = tmp_vec1[i]; 1253 for (j=0; j<nodesz; j++,row++) { 1254 nsmap[row] = i; 1255 } 1256 } 1257 /* Using nsmap, create a reordered ns structure */ 1258 for (i=0,j=0; i< node_max; i++) { 1259 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1260 tmp_vec2[i] = nodesz; 1261 j += nodesz; 1262 } 1263 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1264 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1265 1266 /* Now use the correct ns */ 1267 ns = tmp_vec2; 1268 1269 do { 1270 sctx.newshift = PETSC_FALSE; 1271 /* Now loop over each block-row, and do the factorization */ 1272 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1273 nodesz = ns[inod]; 1274 1275 switch (nodesz) { 1276 case 1: 1277 /*----------*/ 1278 /* zero rtmp1 */ 1279 /* L part */ 1280 nz = bi[i+1] - bi[i]; 1281 bjtmp = bj + bi[i]; 1282 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1283 1284 /* U part */ 1285 nz = bdiag[i]-bdiag[i+1]; 1286 bjtmp = bj + bdiag[i+1]+1; 1287 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1288 1289 /* load in initial (unfactored row) */ 1290 nz = ai[r[i]+1] - ai[r[i]]; 1291 ajtmp = aj + ai[r[i]]; 1292 v = aa + ai[r[i]]; 1293 for (j=0; j<nz; j++) rtmp1[ics[ajtmp[j]]] = v[j]; 1294 1295 /* ZeropivotApply() */ 1296 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1297 1298 /* elimination */ 1299 bjtmp = bj + bi[i]; 1300 row = *bjtmp++; 1301 nzL = bi[i+1] - bi[i]; 1302 for (k=0; k < nzL; k++) { 1303 pc = rtmp1 + row; 1304 if (*pc != 0.0) { 1305 pv = b->a + bdiag[row]; 1306 mul1 = *pc * (*pv); 1307 *pc = mul1; 1308 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1309 pv = b->a + bdiag[row+1]+1; 1310 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1311 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1312 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1313 } 1314 row = *bjtmp++; 1315 } 1316 1317 /* finished row so stick it into b->a */ 1318 rs = 0.0; 1319 /* L part */ 1320 pv = b->a + bi[i]; 1321 pj = b->j + bi[i]; 1322 nz = bi[i+1] - bi[i]; 1323 for (j=0; j<nz; j++) { 1324 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1325 } 1326 1327 /* U part */ 1328 pv = b->a + bdiag[i+1]+1; 1329 pj = b->j + bdiag[i+1]+1; 1330 nz = bdiag[i] - bdiag[i+1]-1; 1331 for (j=0; j<nz; j++) { 1332 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1333 } 1334 1335 /* Check zero pivot */ 1336 sctx.rs = rs; 1337 sctx.pv = rtmp1[i]; 1338 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1339 if (sctx.newshift) break; 1340 1341 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1342 pv = b->a + bdiag[i]; 1343 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1344 break; 1345 1346 case 2: 1347 /*----------*/ 1348 /* zero rtmp1 and rtmp2 */ 1349 /* L part */ 1350 nz = bi[i+1] - bi[i]; 1351 bjtmp = bj + bi[i]; 1352 for (j=0; j<nz; j++) { 1353 col = bjtmp[j]; 1354 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1355 } 1356 1357 /* U part */ 1358 nz = bdiag[i]-bdiag[i+1]; 1359 bjtmp = bj + bdiag[i+1]+1; 1360 for (j=0; j<nz; j++) { 1361 col = bjtmp[j]; 1362 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1363 } 1364 1365 /* load in initial (unfactored row) */ 1366 nz = ai[r[i]+1] - ai[r[i]]; 1367 ajtmp = aj + ai[r[i]]; 1368 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1369 for (j=0; j<nz; j++) { 1370 col = ics[ajtmp[j]]; 1371 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1372 } 1373 /* ZeropivotApply(): shift the diagonal of the matrix */ 1374 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1375 1376 /* elimination */ 1377 bjtmp = bj + bi[i]; 1378 row = *bjtmp++; /* pivot row */ 1379 nzL = bi[i+1] - bi[i]; 1380 for (k=0; k < nzL; k++) { 1381 pc1 = rtmp1 + row; 1382 pc2 = rtmp2 + row; 1383 if (*pc1 != 0.0 || *pc2 != 0.0) { 1384 pv = b->a + bdiag[row]; 1385 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1386 *pc1 = mul1; *pc2 = mul2; 1387 1388 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1389 pv = b->a + bdiag[row+1]+1; 1390 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1391 for (j=0; j<nz; j++) { 1392 col = pj[j]; 1393 rtmp1[col] -= mul1 * pv[j]; 1394 rtmp2[col] -= mul2 * pv[j]; 1395 } 1396 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1397 } 1398 row = *bjtmp++; 1399 } 1400 1401 /* finished row i; check zero pivot, then stick row i into b->a */ 1402 rs = 0.0; 1403 /* L part */ 1404 pc1 = b->a + bi[i]; 1405 pj = b->j + bi[i]; 1406 nz = bi[i+1] - bi[i]; 1407 for (j=0; j<nz; j++) { 1408 col = pj[j]; 1409 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1410 } 1411 /* U part */ 1412 pc1 = b->a + bdiag[i+1]+1; 1413 pj = b->j + bdiag[i+1]+1; 1414 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1415 for (j=0; j<nz; j++) { 1416 col = pj[j]; 1417 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1418 } 1419 1420 sctx.rs = rs; 1421 sctx.pv = rtmp1[i]; 1422 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1423 if (sctx.newshift) break; 1424 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1425 *pc1 = 1.0/sctx.pv; 1426 1427 /* Now take care of diagonal 2x2 block. */ 1428 pc2 = rtmp2 + i; 1429 if (*pc2 != 0.0) { 1430 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1431 *pc2 = mul1; /* insert L entry */ 1432 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1433 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1434 for (j=0; j<nz; j++) { 1435 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1436 } 1437 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1438 } 1439 1440 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1441 rs = 0.0; 1442 /* L part */ 1443 pc2 = b->a + bi[i+1]; 1444 pj = b->j + bi[i+1]; 1445 nz = bi[i+2] - bi[i+1]; 1446 for (j=0; j<nz; j++) { 1447 col = pj[j]; 1448 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1449 } 1450 /* U part */ 1451 pc2 = b->a + bdiag[i+2]+1; 1452 pj = b->j + bdiag[i+2]+1; 1453 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1454 for (j=0; j<nz; j++) { 1455 col = pj[j]; 1456 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1457 } 1458 1459 sctx.rs = rs; 1460 sctx.pv = rtmp2[i+1]; 1461 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1462 if (sctx.newshift) break; 1463 pc2 = b->a + bdiag[i+1]; 1464 *pc2 = 1.0/sctx.pv; 1465 break; 1466 1467 case 3: 1468 /*----------*/ 1469 /* zero rtmp */ 1470 /* L part */ 1471 nz = bi[i+1] - bi[i]; 1472 bjtmp = bj + bi[i]; 1473 for (j=0; j<nz; j++) { 1474 col = bjtmp[j]; 1475 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1476 } 1477 1478 /* U part */ 1479 nz = bdiag[i]-bdiag[i+1]; 1480 bjtmp = bj + bdiag[i+1]+1; 1481 for (j=0; j<nz; j++) { 1482 col = bjtmp[j]; 1483 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1484 } 1485 1486 /* load in initial (unfactored row) */ 1487 nz = ai[r[i]+1] - ai[r[i]]; 1488 ajtmp = aj + ai[r[i]]; 1489 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1490 for (j=0; j<nz; j++) { 1491 col = ics[ajtmp[j]]; 1492 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1493 } 1494 /* ZeropivotApply(): shift the diagonal of the matrix */ 1495 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1496 1497 /* elimination */ 1498 bjtmp = bj + bi[i]; 1499 row = *bjtmp++; /* pivot row */ 1500 nzL = bi[i+1] - bi[i]; 1501 for (k=0; k < nzL; k++) { 1502 pc1 = rtmp1 + row; 1503 pc2 = rtmp2 + row; 1504 pc3 = rtmp3 + row; 1505 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1506 pv = b->a + bdiag[row]; 1507 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1508 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1509 1510 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1511 pv = b->a + bdiag[row+1]+1; 1512 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1513 for (j=0; j<nz; j++) { 1514 col = pj[j]; 1515 rtmp1[col] -= mul1 * pv[j]; 1516 rtmp2[col] -= mul2 * pv[j]; 1517 rtmp3[col] -= mul3 * pv[j]; 1518 } 1519 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1520 } 1521 row = *bjtmp++; 1522 } 1523 1524 /* finished row i; check zero pivot, then stick row i into b->a */ 1525 rs = 0.0; 1526 /* L part */ 1527 pc1 = b->a + bi[i]; 1528 pj = b->j + bi[i]; 1529 nz = bi[i+1] - bi[i]; 1530 for (j=0; j<nz; j++) { 1531 col = pj[j]; 1532 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1533 } 1534 /* U part */ 1535 pc1 = b->a + bdiag[i+1]+1; 1536 pj = b->j + bdiag[i+1]+1; 1537 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1538 for (j=0; j<nz; j++) { 1539 col = pj[j]; 1540 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1541 } 1542 1543 sctx.rs = rs; 1544 sctx.pv = rtmp1[i]; 1545 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1546 if (sctx.newshift) break; 1547 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1548 *pc1 = 1.0/sctx.pv; 1549 1550 /* Now take care of 1st column of diagonal 3x3 block. */ 1551 pc2 = rtmp2 + i; 1552 pc3 = rtmp3 + i; 1553 if (*pc2 != 0.0 || *pc3 != 0.0) { 1554 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1555 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1556 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1557 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1558 for (j=0; j<nz; j++) { 1559 col = pj[j]; 1560 rtmp2[col] -= mul2 * rtmp1[col]; 1561 rtmp3[col] -= mul3 * rtmp1[col]; 1562 } 1563 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1564 } 1565 1566 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1567 rs = 0.0; 1568 /* L part */ 1569 pc2 = b->a + bi[i+1]; 1570 pj = b->j + bi[i+1]; 1571 nz = bi[i+2] - bi[i+1]; 1572 for (j=0; j<nz; j++) { 1573 col = pj[j]; 1574 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1575 } 1576 /* U part */ 1577 pc2 = b->a + bdiag[i+2]+1; 1578 pj = b->j + bdiag[i+2]+1; 1579 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1580 for (j=0; j<nz; j++) { 1581 col = pj[j]; 1582 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1583 } 1584 1585 sctx.rs = rs; 1586 sctx.pv = rtmp2[i+1]; 1587 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1588 if (sctx.newshift) break; 1589 pc2 = b->a + bdiag[i+1]; 1590 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1591 1592 /* Now take care of 2nd column of diagonal 3x3 block. */ 1593 pc3 = rtmp3 + i+1; 1594 if (*pc3 != 0.0) { 1595 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1596 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1597 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1598 for (j=0; j<nz; j++) { 1599 col = pj[j]; 1600 rtmp3[col] -= mul3 * rtmp2[col]; 1601 } 1602 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1603 } 1604 1605 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1606 rs = 0.0; 1607 /* L part */ 1608 pc3 = b->a + bi[i+2]; 1609 pj = b->j + bi[i+2]; 1610 nz = bi[i+3] - bi[i+2]; 1611 for (j=0; j<nz; j++) { 1612 col = pj[j]; 1613 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1614 } 1615 /* U part */ 1616 pc3 = b->a + bdiag[i+3]+1; 1617 pj = b->j + bdiag[i+3]+1; 1618 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1619 for (j=0; j<nz; j++) { 1620 col = pj[j]; 1621 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1622 } 1623 1624 sctx.rs = rs; 1625 sctx.pv = rtmp3[i+2]; 1626 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1627 if (sctx.newshift) break; 1628 pc3 = b->a + bdiag[i+2]; 1629 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1630 break; 1631 case 4: 1632 /*----------*/ 1633 /* zero rtmp */ 1634 /* L part */ 1635 nz = bi[i+1] - bi[i]; 1636 bjtmp = bj + bi[i]; 1637 for (j=0; j<nz; j++) { 1638 col = bjtmp[j]; 1639 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1640 } 1641 1642 /* U part */ 1643 nz = bdiag[i]-bdiag[i+1]; 1644 bjtmp = bj + bdiag[i+1]+1; 1645 for (j=0; j<nz; j++) { 1646 col = bjtmp[j]; 1647 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1648 } 1649 1650 /* load in initial (unfactored row) */ 1651 nz = ai[r[i]+1] - ai[r[i]]; 1652 ajtmp = aj + ai[r[i]]; 1653 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1654 for (j=0; j<nz; j++) { 1655 col = ics[ajtmp[j]]; 1656 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1657 } 1658 /* ZeropivotApply(): shift the diagonal of the matrix */ 1659 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1660 1661 /* elimination */ 1662 bjtmp = bj + bi[i]; 1663 row = *bjtmp++; /* pivot row */ 1664 nzL = bi[i+1] - bi[i]; 1665 for (k=0; k < nzL; k++) { 1666 pc1 = rtmp1 + row; 1667 pc2 = rtmp2 + row; 1668 pc3 = rtmp3 + row; 1669 pc4 = rtmp4 + row; 1670 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1671 pv = b->a + bdiag[row]; 1672 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1673 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1674 1675 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1676 pv = b->a + bdiag[row+1]+1; 1677 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1678 for (j=0; j<nz; j++) { 1679 col = pj[j]; 1680 rtmp1[col] -= mul1 * pv[j]; 1681 rtmp2[col] -= mul2 * pv[j]; 1682 rtmp3[col] -= mul3 * pv[j]; 1683 rtmp4[col] -= mul4 * pv[j]; 1684 } 1685 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1686 } 1687 row = *bjtmp++; 1688 } 1689 1690 /* finished row i; check zero pivot, then stick row i into b->a */ 1691 rs = 0.0; 1692 /* L part */ 1693 pc1 = b->a + bi[i]; 1694 pj = b->j + bi[i]; 1695 nz = bi[i+1] - bi[i]; 1696 for (j=0; j<nz; j++) { 1697 col = pj[j]; 1698 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1699 } 1700 /* U part */ 1701 pc1 = b->a + bdiag[i+1]+1; 1702 pj = b->j + bdiag[i+1]+1; 1703 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1704 for (j=0; j<nz; j++) { 1705 col = pj[j]; 1706 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1707 } 1708 1709 sctx.rs = rs; 1710 sctx.pv = rtmp1[i]; 1711 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1712 if (sctx.newshift) break; 1713 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1714 *pc1 = 1.0/sctx.pv; 1715 1716 /* Now take care of 1st column of diagonal 4x4 block. */ 1717 pc2 = rtmp2 + i; 1718 pc3 = rtmp3 + i; 1719 pc4 = rtmp4 + i; 1720 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1721 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1722 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1723 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1724 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1725 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1726 for (j=0; j<nz; j++) { 1727 col = pj[j]; 1728 rtmp2[col] -= mul2 * rtmp1[col]; 1729 rtmp3[col] -= mul3 * rtmp1[col]; 1730 rtmp4[col] -= mul4 * rtmp1[col]; 1731 } 1732 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1733 } 1734 1735 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1736 rs = 0.0; 1737 /* L part */ 1738 pc2 = b->a + bi[i+1]; 1739 pj = b->j + bi[i+1]; 1740 nz = bi[i+2] - bi[i+1]; 1741 for (j=0; j<nz; j++) { 1742 col = pj[j]; 1743 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1744 } 1745 /* U part */ 1746 pc2 = b->a + bdiag[i+2]+1; 1747 pj = b->j + bdiag[i+2]+1; 1748 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1749 for (j=0; j<nz; j++) { 1750 col = pj[j]; 1751 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1752 } 1753 1754 sctx.rs = rs; 1755 sctx.pv = rtmp2[i+1]; 1756 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1757 if (sctx.newshift) break; 1758 pc2 = b->a + bdiag[i+1]; 1759 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1760 1761 /* Now take care of 2nd column of diagonal 4x4 block. */ 1762 pc3 = rtmp3 + i+1; 1763 pc4 = rtmp4 + i+1; 1764 if (*pc3 != 0.0 || *pc4 != 0.0) { 1765 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1766 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1767 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1768 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1769 for (j=0; j<nz; j++) { 1770 col = pj[j]; 1771 rtmp3[col] -= mul3 * rtmp2[col]; 1772 rtmp4[col] -= mul4 * rtmp2[col]; 1773 } 1774 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1775 } 1776 1777 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1778 rs = 0.0; 1779 /* L part */ 1780 pc3 = b->a + bi[i+2]; 1781 pj = b->j + bi[i+2]; 1782 nz = bi[i+3] - bi[i+2]; 1783 for (j=0; j<nz; j++) { 1784 col = pj[j]; 1785 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1786 } 1787 /* U part */ 1788 pc3 = b->a + bdiag[i+3]+1; 1789 pj = b->j + bdiag[i+3]+1; 1790 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1791 for (j=0; j<nz; j++) { 1792 col = pj[j]; 1793 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1794 } 1795 1796 sctx.rs = rs; 1797 sctx.pv = rtmp3[i+2]; 1798 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1799 if (sctx.newshift) break; 1800 pc3 = b->a + bdiag[i+2]; 1801 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1802 1803 /* Now take care of 3rd column of diagonal 4x4 block. */ 1804 pc4 = rtmp4 + i+2; 1805 if (*pc4 != 0.0) { 1806 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1807 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1808 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1809 for (j=0; j<nz; j++) { 1810 col = pj[j]; 1811 rtmp4[col] -= mul4 * rtmp3[col]; 1812 } 1813 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1814 } 1815 1816 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1817 rs = 0.0; 1818 /* L part */ 1819 pc4 = b->a + bi[i+3]; 1820 pj = b->j + bi[i+3]; 1821 nz = bi[i+4] - bi[i+3]; 1822 for (j=0; j<nz; j++) { 1823 col = pj[j]; 1824 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1825 } 1826 /* U part */ 1827 pc4 = b->a + bdiag[i+4]+1; 1828 pj = b->j + bdiag[i+4]+1; 1829 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1830 for (j=0; j<nz; j++) { 1831 col = pj[j]; 1832 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1833 } 1834 1835 sctx.rs = rs; 1836 sctx.pv = rtmp4[i+3]; 1837 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1838 if (sctx.newshift) break; 1839 pc4 = b->a + bdiag[i+3]; 1840 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1841 break; 1842 1843 default: 1844 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1845 } 1846 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1847 i += nodesz; /* Update the row */ 1848 } 1849 1850 /* MatPivotRefine() */ 1851 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1852 /* 1853 * if no shift in this attempt & shifting & started shifting & can refine, 1854 * then try lower shift 1855 */ 1856 sctx.shift_hi = sctx.shift_fraction; 1857 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1858 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1859 sctx.newshift = PETSC_TRUE; 1860 sctx.nshift++; 1861 } 1862 } while (sctx.newshift); 1863 1864 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1865 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1866 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1867 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1868 1869 C->ops->solve = MatSolve_SeqAIJ; 1870 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1871 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1872 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1873 C->ops->matsolve = MatMatSolve_SeqAIJ; 1874 C->assembled = PETSC_TRUE; 1875 C->preallocated = PETSC_TRUE; 1876 1877 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1878 1879 /* MatShiftView(A,info,&sctx) */ 1880 if (sctx.nshift) { 1881 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1882 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1883 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1884 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1885 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1886 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1887 } 1888 } 1889 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1890 PetscFunctionReturn(0); 1891 } 1892 1893 #undef __FUNCT__ 1894 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1895 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1896 { 1897 Mat C = B; 1898 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1899 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1900 PetscErrorCode ierr; 1901 const PetscInt *r,*ic,*c,*ics; 1902 PetscInt n = A->rmap->n,*bi = b->i; 1903 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1904 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1905 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1906 PetscScalar mul1,mul2,mul3,tmp; 1907 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1908 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1909 PetscReal rs=0.0; 1910 FactorShiftCtx sctx; 1911 1912 PetscFunctionBegin; 1913 sctx.shift_top = 0; 1914 sctx.nshift_max = 0; 1915 sctx.shift_lo = 0; 1916 sctx.shift_hi = 0; 1917 sctx.shift_fraction = 0; 1918 1919 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1920 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1921 sctx.shift_top = 0; 1922 for (i=0; i<n; i++) { 1923 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1924 rs = 0.0; 1925 ajtmp = aj + ai[i]; 1926 rtmp1 = aa + ai[i]; 1927 nz = ai[i+1] - ai[i]; 1928 for (j=0; j<nz; j++) { 1929 if (*ajtmp != i) { 1930 rs += PetscAbsScalar(*rtmp1++); 1931 } else { 1932 rs -= PetscRealPart(*rtmp1++); 1933 } 1934 ajtmp++; 1935 } 1936 if (rs>sctx.shift_top) sctx.shift_top = rs; 1937 } 1938 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1939 sctx.shift_top *= 1.1; 1940 sctx.nshift_max = 5; 1941 sctx.shift_lo = 0.; 1942 sctx.shift_hi = 1.; 1943 } 1944 sctx.shift_amount = 0; 1945 sctx.nshift = 0; 1946 1947 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1948 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1949 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1950 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1951 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1952 ics = ic; 1953 rtmp22 = rtmp11 + n; 1954 rtmp33 = rtmp22 + n; 1955 1956 node_max = a->inode.node_count; 1957 ns = a->inode.size; 1958 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1959 1960 /* If max inode size > 3, split it into two inodes.*/ 1961 /* also map the inode sizes according to the ordering */ 1962 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1963 for (i=0,j=0; i<node_max; ++i,++j) { 1964 if (ns[i]>3) { 1965 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1966 ++j; 1967 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1968 } else { 1969 tmp_vec1[j] = ns[i]; 1970 } 1971 } 1972 /* Use the correct node_max */ 1973 node_max = j; 1974 1975 /* Now reorder the inode info based on mat re-ordering info */ 1976 /* First create a row -> inode_size_array_index map */ 1977 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1978 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1979 for (i=0,row=0; i<node_max; i++) { 1980 nodesz = tmp_vec1[i]; 1981 for (j=0; j<nodesz; j++,row++) { 1982 nsmap[row] = i; 1983 } 1984 } 1985 /* Using nsmap, create a reordered ns structure */ 1986 for (i=0,j=0; i< node_max; i++) { 1987 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1988 tmp_vec2[i] = nodesz; 1989 j += nodesz; 1990 } 1991 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1992 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1993 /* Now use the correct ns */ 1994 ns = tmp_vec2; 1995 1996 do { 1997 sctx.newshift = PETSC_FALSE; 1998 /* Now loop over each block-row, and do the factorization */ 1999 for (i=0,row=0; i<node_max; i++) { 2000 nodesz = ns[i]; 2001 nz = bi[row+1] - bi[row]; 2002 bjtmp = bj + bi[row]; 2003 2004 switch (nodesz) { 2005 case 1: 2006 for (j=0; j<nz; j++) { 2007 idx = bjtmp[j]; 2008 rtmp11[idx] = 0.0; 2009 } 2010 2011 /* load in initial (unfactored row) */ 2012 idx = r[row]; 2013 nz_tmp = ai[idx+1] - ai[idx]; 2014 ajtmp = aj + ai[idx]; 2015 v1 = aa + ai[idx]; 2016 2017 for (j=0; j<nz_tmp; j++) { 2018 idx = ics[ajtmp[j]]; 2019 rtmp11[idx] = v1[j]; 2020 } 2021 rtmp11[ics[r[row]]] += sctx.shift_amount; 2022 2023 prow = *bjtmp++; 2024 while (prow < row) { 2025 pc1 = rtmp11 + prow; 2026 if (*pc1 != 0.0) { 2027 pv = ba + bd[prow]; 2028 pj = nbj + bd[prow]; 2029 mul1 = *pc1 * *pv++; 2030 *pc1 = mul1; 2031 nz_tmp = bi[prow+1] - bd[prow] - 1; 2032 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2033 for (j=0; j<nz_tmp; j++) { 2034 tmp = pv[j]; 2035 idx = pj[j]; 2036 rtmp11[idx] -= mul1 * tmp; 2037 } 2038 } 2039 prow = *bjtmp++; 2040 } 2041 pj = bj + bi[row]; 2042 pc1 = ba + bi[row]; 2043 2044 sctx.pv = rtmp11[row]; 2045 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2046 rs = 0.0; 2047 for (j=0; j<nz; j++) { 2048 idx = pj[j]; 2049 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2050 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2051 } 2052 sctx.rs = rs; 2053 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2054 if (sctx.newshift) goto endofwhile; 2055 break; 2056 2057 case 2: 2058 for (j=0; j<nz; j++) { 2059 idx = bjtmp[j]; 2060 rtmp11[idx] = 0.0; 2061 rtmp22[idx] = 0.0; 2062 } 2063 2064 /* load in initial (unfactored row) */ 2065 idx = r[row]; 2066 nz_tmp = ai[idx+1] - ai[idx]; 2067 ajtmp = aj + ai[idx]; 2068 v1 = aa + ai[idx]; 2069 v2 = aa + ai[idx+1]; 2070 for (j=0; j<nz_tmp; j++) { 2071 idx = ics[ajtmp[j]]; 2072 rtmp11[idx] = v1[j]; 2073 rtmp22[idx] = v2[j]; 2074 } 2075 rtmp11[ics[r[row]]] += sctx.shift_amount; 2076 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2077 2078 prow = *bjtmp++; 2079 while (prow < row) { 2080 pc1 = rtmp11 + prow; 2081 pc2 = rtmp22 + prow; 2082 if (*pc1 != 0.0 || *pc2 != 0.0) { 2083 pv = ba + bd[prow]; 2084 pj = nbj + bd[prow]; 2085 mul1 = *pc1 * *pv; 2086 mul2 = *pc2 * *pv; 2087 ++pv; 2088 *pc1 = mul1; 2089 *pc2 = mul2; 2090 2091 nz_tmp = bi[prow+1] - bd[prow] - 1; 2092 for (j=0; j<nz_tmp; j++) { 2093 tmp = pv[j]; 2094 idx = pj[j]; 2095 rtmp11[idx] -= mul1 * tmp; 2096 rtmp22[idx] -= mul2 * tmp; 2097 } 2098 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2099 } 2100 prow = *bjtmp++; 2101 } 2102 2103 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2104 pc1 = rtmp11 + prow; 2105 pc2 = rtmp22 + prow; 2106 2107 sctx.pv = *pc1; 2108 pj = bj + bi[prow]; 2109 rs = 0.0; 2110 for (j=0; j<nz; j++) { 2111 idx = pj[j]; 2112 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2113 } 2114 sctx.rs = rs; 2115 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2116 if (sctx.newshift) goto endofwhile; 2117 2118 if (*pc2 != 0.0) { 2119 pj = nbj + bd[prow]; 2120 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2121 *pc2 = mul2; 2122 nz_tmp = bi[prow+1] - bd[prow] - 1; 2123 for (j=0; j<nz_tmp; j++) { 2124 idx = pj[j]; 2125 tmp = rtmp11[idx]; 2126 rtmp22[idx] -= mul2 * tmp; 2127 } 2128 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2129 } 2130 2131 pj = bj + bi[row]; 2132 pc1 = ba + bi[row]; 2133 pc2 = ba + bi[row+1]; 2134 2135 sctx.pv = rtmp22[row+1]; 2136 rs = 0.0; 2137 rtmp11[row] = 1.0/rtmp11[row]; 2138 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2139 /* copy row entries from dense representation to sparse */ 2140 for (j=0; j<nz; j++) { 2141 idx = pj[j]; 2142 pc1[j] = rtmp11[idx]; 2143 pc2[j] = rtmp22[idx]; 2144 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2145 } 2146 sctx.rs = rs; 2147 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2148 if (sctx.newshift) goto endofwhile; 2149 break; 2150 2151 case 3: 2152 for (j=0; j<nz; j++) { 2153 idx = bjtmp[j]; 2154 rtmp11[idx] = 0.0; 2155 rtmp22[idx] = 0.0; 2156 rtmp33[idx] = 0.0; 2157 } 2158 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2159 idx = r[row]; 2160 nz_tmp = ai[idx+1] - ai[idx]; 2161 ajtmp = aj + ai[idx]; 2162 v1 = aa + ai[idx]; 2163 v2 = aa + ai[idx+1]; 2164 v3 = aa + ai[idx+2]; 2165 for (j=0; j<nz_tmp; j++) { 2166 idx = ics[ajtmp[j]]; 2167 rtmp11[idx] = v1[j]; 2168 rtmp22[idx] = v2[j]; 2169 rtmp33[idx] = v3[j]; 2170 } 2171 rtmp11[ics[r[row]]] += sctx.shift_amount; 2172 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2173 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2174 2175 /* loop over all pivot row blocks above this row block */ 2176 prow = *bjtmp++; 2177 while (prow < row) { 2178 pc1 = rtmp11 + prow; 2179 pc2 = rtmp22 + prow; 2180 pc3 = rtmp33 + prow; 2181 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2182 pv = ba + bd[prow]; 2183 pj = nbj + bd[prow]; 2184 mul1 = *pc1 * *pv; 2185 mul2 = *pc2 * *pv; 2186 mul3 = *pc3 * *pv; 2187 ++pv; 2188 *pc1 = mul1; 2189 *pc2 = mul2; 2190 *pc3 = mul3; 2191 2192 nz_tmp = bi[prow+1] - bd[prow] - 1; 2193 /* update this row based on pivot row */ 2194 for (j=0; j<nz_tmp; j++) { 2195 tmp = pv[j]; 2196 idx = pj[j]; 2197 rtmp11[idx] -= mul1 * tmp; 2198 rtmp22[idx] -= mul2 * tmp; 2199 rtmp33[idx] -= mul3 * tmp; 2200 } 2201 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2202 } 2203 prow = *bjtmp++; 2204 } 2205 2206 /* Now take care of diagonal 3x3 block in this set of rows */ 2207 /* note: prow = row here */ 2208 pc1 = rtmp11 + prow; 2209 pc2 = rtmp22 + prow; 2210 pc3 = rtmp33 + prow; 2211 2212 sctx.pv = *pc1; 2213 pj = bj + bi[prow]; 2214 rs = 0.0; 2215 for (j=0; j<nz; j++) { 2216 idx = pj[j]; 2217 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2218 } 2219 sctx.rs = rs; 2220 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2221 if (sctx.newshift) goto endofwhile; 2222 2223 if (*pc2 != 0.0 || *pc3 != 0.0) { 2224 mul2 = (*pc2)/(*pc1); 2225 mul3 = (*pc3)/(*pc1); 2226 *pc2 = mul2; 2227 *pc3 = mul3; 2228 nz_tmp = bi[prow+1] - bd[prow] - 1; 2229 pj = nbj + bd[prow]; 2230 for (j=0; j<nz_tmp; j++) { 2231 idx = pj[j]; 2232 tmp = rtmp11[idx]; 2233 rtmp22[idx] -= mul2 * tmp; 2234 rtmp33[idx] -= mul3 * tmp; 2235 } 2236 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2237 } 2238 ++prow; 2239 2240 pc2 = rtmp22 + prow; 2241 pc3 = rtmp33 + prow; 2242 sctx.pv = *pc2; 2243 pj = bj + bi[prow]; 2244 rs = 0.0; 2245 for (j=0; j<nz; j++) { 2246 idx = pj[j]; 2247 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2248 } 2249 sctx.rs = rs; 2250 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2251 if (sctx.newshift) goto endofwhile; 2252 2253 if (*pc3 != 0.0) { 2254 mul3 = (*pc3)/(*pc2); 2255 *pc3 = mul3; 2256 pj = nbj + bd[prow]; 2257 nz_tmp = bi[prow+1] - bd[prow] - 1; 2258 for (j=0; j<nz_tmp; j++) { 2259 idx = pj[j]; 2260 tmp = rtmp22[idx]; 2261 rtmp33[idx] -= mul3 * tmp; 2262 } 2263 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2264 } 2265 2266 pj = bj + bi[row]; 2267 pc1 = ba + bi[row]; 2268 pc2 = ba + bi[row+1]; 2269 pc3 = ba + bi[row+2]; 2270 2271 sctx.pv = rtmp33[row+2]; 2272 rs = 0.0; 2273 rtmp11[row] = 1.0/rtmp11[row]; 2274 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2275 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2276 /* copy row entries from dense representation to sparse */ 2277 for (j=0; j<nz; j++) { 2278 idx = pj[j]; 2279 pc1[j] = rtmp11[idx]; 2280 pc2[j] = rtmp22[idx]; 2281 pc3[j] = rtmp33[idx]; 2282 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2283 } 2284 2285 sctx.rs = rs; 2286 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2287 if (sctx.newshift) goto endofwhile; 2288 break; 2289 2290 default: 2291 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2292 } 2293 row += nodesz; /* Update the row */ 2294 } 2295 endofwhile:; 2296 } while (sctx.newshift); 2297 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2298 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2299 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2300 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2301 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2302 2303 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2304 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2305 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2306 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2307 C->assembled = PETSC_TRUE; 2308 C->preallocated = PETSC_TRUE; 2309 if (sctx.nshift) { 2310 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2311 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2312 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2313 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2314 } 2315 } 2316 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2317 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2318 PetscFunctionReturn(0); 2319 } 2320 2321 2322 /* ----------------------------------------------------------- */ 2323 #undef __FUNCT__ 2324 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2325 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2326 { 2327 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2328 IS iscol = a->col,isrow = a->row; 2329 PetscErrorCode ierr; 2330 const PetscInt *r,*c,*rout,*cout; 2331 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2332 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2333 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2334 PetscScalar sum1,sum2,sum3,sum4,sum5; 2335 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2336 const PetscScalar *b; 2337 2338 PetscFunctionBegin; 2339 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2340 node_max = a->inode.node_count; 2341 ns = a->inode.size; /* Node Size array */ 2342 2343 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2344 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2345 tmp = a->solve_work; 2346 2347 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2348 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2349 2350 /* forward solve the lower triangular */ 2351 tmps = tmp; 2352 aa = a_a; 2353 aj = a_j; 2354 ad = a->diag; 2355 2356 for (i = 0,row = 0; i< node_max; ++i) { 2357 nsz = ns[i]; 2358 aii = ai[row]; 2359 v1 = aa + aii; 2360 vi = aj + aii; 2361 nz = ai[row+1]- ai[row]; 2362 2363 if (i < node_max-1) { 2364 /* Prefetch the indices for the next block */ 2365 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2366 /* Prefetch the data for the next block */ 2367 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2368 } 2369 2370 switch (nsz) { /* Each loop in 'case' is unrolled */ 2371 case 1: 2372 sum1 = b[r[row]]; 2373 for (j=0; j<nz-1; j+=2) { 2374 i0 = vi[j]; 2375 i1 = vi[j+1]; 2376 tmp0 = tmps[i0]; 2377 tmp1 = tmps[i1]; 2378 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2379 } 2380 if (j == nz-1) { 2381 tmp0 = tmps[vi[j]]; 2382 sum1 -= v1[j]*tmp0; 2383 } 2384 tmp[row++]=sum1; 2385 break; 2386 case 2: 2387 sum1 = b[r[row]]; 2388 sum2 = b[r[row+1]]; 2389 v2 = aa + ai[row+1]; 2390 2391 for (j=0; j<nz-1; j+=2) { 2392 i0 = vi[j]; 2393 i1 = vi[j+1]; 2394 tmp0 = tmps[i0]; 2395 tmp1 = tmps[i1]; 2396 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2397 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2398 } 2399 if (j == nz-1) { 2400 tmp0 = tmps[vi[j]]; 2401 sum1 -= v1[j] *tmp0; 2402 sum2 -= v2[j] *tmp0; 2403 } 2404 sum2 -= v2[nz] * sum1; 2405 tmp[row++]=sum1; 2406 tmp[row++]=sum2; 2407 break; 2408 case 3: 2409 sum1 = b[r[row]]; 2410 sum2 = b[r[row+1]]; 2411 sum3 = b[r[row+2]]; 2412 v2 = aa + ai[row+1]; 2413 v3 = aa + ai[row+2]; 2414 2415 for (j=0; j<nz-1; j+=2) { 2416 i0 = vi[j]; 2417 i1 = vi[j+1]; 2418 tmp0 = tmps[i0]; 2419 tmp1 = tmps[i1]; 2420 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2421 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2422 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2423 } 2424 if (j == nz-1) { 2425 tmp0 = tmps[vi[j]]; 2426 sum1 -= v1[j] *tmp0; 2427 sum2 -= v2[j] *tmp0; 2428 sum3 -= v3[j] *tmp0; 2429 } 2430 sum2 -= v2[nz] * sum1; 2431 sum3 -= v3[nz] * sum1; 2432 sum3 -= v3[nz+1] * sum2; 2433 tmp[row++]=sum1; 2434 tmp[row++]=sum2; 2435 tmp[row++]=sum3; 2436 break; 2437 2438 case 4: 2439 sum1 = b[r[row]]; 2440 sum2 = b[r[row+1]]; 2441 sum3 = b[r[row+2]]; 2442 sum4 = b[r[row+3]]; 2443 v2 = aa + ai[row+1]; 2444 v3 = aa + ai[row+2]; 2445 v4 = aa + ai[row+3]; 2446 2447 for (j=0; j<nz-1; j+=2) { 2448 i0 = vi[j]; 2449 i1 = vi[j+1]; 2450 tmp0 = tmps[i0]; 2451 tmp1 = tmps[i1]; 2452 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2453 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2454 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2455 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2456 } 2457 if (j == nz-1) { 2458 tmp0 = tmps[vi[j]]; 2459 sum1 -= v1[j] *tmp0; 2460 sum2 -= v2[j] *tmp0; 2461 sum3 -= v3[j] *tmp0; 2462 sum4 -= v4[j] *tmp0; 2463 } 2464 sum2 -= v2[nz] * sum1; 2465 sum3 -= v3[nz] * sum1; 2466 sum4 -= v4[nz] * sum1; 2467 sum3 -= v3[nz+1] * sum2; 2468 sum4 -= v4[nz+1] * sum2; 2469 sum4 -= v4[nz+2] * sum3; 2470 2471 tmp[row++]=sum1; 2472 tmp[row++]=sum2; 2473 tmp[row++]=sum3; 2474 tmp[row++]=sum4; 2475 break; 2476 case 5: 2477 sum1 = b[r[row]]; 2478 sum2 = b[r[row+1]]; 2479 sum3 = b[r[row+2]]; 2480 sum4 = b[r[row+3]]; 2481 sum5 = b[r[row+4]]; 2482 v2 = aa + ai[row+1]; 2483 v3 = aa + ai[row+2]; 2484 v4 = aa + ai[row+3]; 2485 v5 = aa + ai[row+4]; 2486 2487 for (j=0; j<nz-1; j+=2) { 2488 i0 = vi[j]; 2489 i1 = vi[j+1]; 2490 tmp0 = tmps[i0]; 2491 tmp1 = tmps[i1]; 2492 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2493 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2494 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2495 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2496 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2497 } 2498 if (j == nz-1) { 2499 tmp0 = tmps[vi[j]]; 2500 sum1 -= v1[j] *tmp0; 2501 sum2 -= v2[j] *tmp0; 2502 sum3 -= v3[j] *tmp0; 2503 sum4 -= v4[j] *tmp0; 2504 sum5 -= v5[j] *tmp0; 2505 } 2506 2507 sum2 -= v2[nz] * sum1; 2508 sum3 -= v3[nz] * sum1; 2509 sum4 -= v4[nz] * sum1; 2510 sum5 -= v5[nz] * sum1; 2511 sum3 -= v3[nz+1] * sum2; 2512 sum4 -= v4[nz+1] * sum2; 2513 sum5 -= v5[nz+1] * sum2; 2514 sum4 -= v4[nz+2] * sum3; 2515 sum5 -= v5[nz+2] * sum3; 2516 sum5 -= v5[nz+3] * sum4; 2517 2518 tmp[row++]=sum1; 2519 tmp[row++]=sum2; 2520 tmp[row++]=sum3; 2521 tmp[row++]=sum4; 2522 tmp[row++]=sum5; 2523 break; 2524 default: 2525 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2526 } 2527 } 2528 /* backward solve the upper triangular */ 2529 for (i=node_max -1,row = n-1; i>=0; i--) { 2530 nsz = ns[i]; 2531 aii = ad[row+1] + 1; 2532 v1 = aa + aii; 2533 vi = aj + aii; 2534 nz = ad[row]- ad[row+1] - 1; 2535 2536 if (i > 0) { 2537 /* Prefetch the indices for the next block */ 2538 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2539 /* Prefetch the data for the next block */ 2540 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2541 } 2542 2543 switch (nsz) { /* Each loop in 'case' is unrolled */ 2544 case 1: 2545 sum1 = tmp[row]; 2546 2547 for (j=0; j<nz-1; j+=2) { 2548 i0 = vi[j]; 2549 i1 = vi[j+1]; 2550 tmp0 = tmps[i0]; 2551 tmp1 = tmps[i1]; 2552 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2553 } 2554 if (j == nz-1) { 2555 tmp0 = tmps[vi[j]]; 2556 sum1 -= v1[j]*tmp0; 2557 } 2558 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2559 break; 2560 case 2: 2561 sum1 = tmp[row]; 2562 sum2 = tmp[row-1]; 2563 v2 = aa + ad[row] + 1; 2564 for (j=0; j<nz-1; j+=2) { 2565 i0 = vi[j]; 2566 i1 = vi[j+1]; 2567 tmp0 = tmps[i0]; 2568 tmp1 = tmps[i1]; 2569 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2570 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2571 } 2572 if (j == nz-1) { 2573 tmp0 = tmps[vi[j]]; 2574 sum1 -= v1[j]* tmp0; 2575 sum2 -= v2[j+1]* tmp0; 2576 } 2577 2578 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2579 sum2 -= v2[0] * tmp0; 2580 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2581 break; 2582 case 3: 2583 sum1 = tmp[row]; 2584 sum2 = tmp[row -1]; 2585 sum3 = tmp[row -2]; 2586 v2 = aa + ad[row] + 1; 2587 v3 = aa + ad[row -1] + 1; 2588 for (j=0; j<nz-1; j+=2) { 2589 i0 = vi[j]; 2590 i1 = vi[j+1]; 2591 tmp0 = tmps[i0]; 2592 tmp1 = tmps[i1]; 2593 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2594 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2595 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2596 } 2597 if (j== nz-1) { 2598 tmp0 = tmps[vi[j]]; 2599 sum1 -= v1[j] * tmp0; 2600 sum2 -= v2[j+1] * tmp0; 2601 sum3 -= v3[j+2] * tmp0; 2602 } 2603 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2604 sum2 -= v2[0]* tmp0; 2605 sum3 -= v3[1] * tmp0; 2606 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2607 sum3 -= v3[0]* tmp0; 2608 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2609 2610 break; 2611 case 4: 2612 sum1 = tmp[row]; 2613 sum2 = tmp[row -1]; 2614 sum3 = tmp[row -2]; 2615 sum4 = tmp[row -3]; 2616 v2 = aa + ad[row]+1; 2617 v3 = aa + ad[row -1]+1; 2618 v4 = aa + ad[row -2]+1; 2619 2620 for (j=0; j<nz-1; j+=2) { 2621 i0 = vi[j]; 2622 i1 = vi[j+1]; 2623 tmp0 = tmps[i0]; 2624 tmp1 = tmps[i1]; 2625 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2626 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2627 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2628 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2629 } 2630 if (j== nz-1) { 2631 tmp0 = tmps[vi[j]]; 2632 sum1 -= v1[j] * tmp0; 2633 sum2 -= v2[j+1] * tmp0; 2634 sum3 -= v3[j+2] * tmp0; 2635 sum4 -= v4[j+3] * tmp0; 2636 } 2637 2638 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2639 sum2 -= v2[0] * tmp0; 2640 sum3 -= v3[1] * tmp0; 2641 sum4 -= v4[2] * tmp0; 2642 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2643 sum3 -= v3[0] * tmp0; 2644 sum4 -= v4[1] * tmp0; 2645 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2646 sum4 -= v4[0] * tmp0; 2647 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2648 break; 2649 case 5: 2650 sum1 = tmp[row]; 2651 sum2 = tmp[row -1]; 2652 sum3 = tmp[row -2]; 2653 sum4 = tmp[row -3]; 2654 sum5 = tmp[row -4]; 2655 v2 = aa + ad[row]+1; 2656 v3 = aa + ad[row -1]+1; 2657 v4 = aa + ad[row -2]+1; 2658 v5 = aa + ad[row -3]+1; 2659 for (j=0; j<nz-1; j+=2) { 2660 i0 = vi[j]; 2661 i1 = vi[j+1]; 2662 tmp0 = tmps[i0]; 2663 tmp1 = tmps[i1]; 2664 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2665 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2666 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2667 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2668 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2669 } 2670 if (j==nz-1) { 2671 tmp0 = tmps[vi[j]]; 2672 sum1 -= v1[j] * tmp0; 2673 sum2 -= v2[j+1] * tmp0; 2674 sum3 -= v3[j+2] * tmp0; 2675 sum4 -= v4[j+3] * tmp0; 2676 sum5 -= v5[j+4] * tmp0; 2677 } 2678 2679 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2680 sum2 -= v2[0] * tmp0; 2681 sum3 -= v3[1] * tmp0; 2682 sum4 -= v4[2] * tmp0; 2683 sum5 -= v5[3] * tmp0; 2684 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2685 sum3 -= v3[0] * tmp0; 2686 sum4 -= v4[1] * tmp0; 2687 sum5 -= v5[2] * tmp0; 2688 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2689 sum4 -= v4[0] * tmp0; 2690 sum5 -= v5[1] * tmp0; 2691 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2692 sum5 -= v5[0] * tmp0; 2693 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2694 break; 2695 default: 2696 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2697 } 2698 } 2699 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2700 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2701 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2702 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2703 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2704 PetscFunctionReturn(0); 2705 } 2706 2707 2708 /* 2709 Makes a longer coloring[] array and calls the usual code with that 2710 */ 2711 #undef __FUNCT__ 2712 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2713 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2714 { 2715 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2716 PetscErrorCode ierr; 2717 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2718 PetscInt *colorused,i; 2719 ISColoringValue *newcolor; 2720 2721 PetscFunctionBegin; 2722 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2723 /* loop over inodes, marking a color for each column*/ 2724 row = 0; 2725 for (i=0; i<m; i++) { 2726 for (j=0; j<ns[i]; j++) { 2727 newcolor[row++] = coloring[i] + j*ncolors; 2728 } 2729 } 2730 2731 /* eliminate unneeded colors */ 2732 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2733 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2734 for (i=0; i<n; i++) { 2735 colorused[newcolor[i]] = 1; 2736 } 2737 2738 for (i=1; i<5*ncolors; i++) { 2739 colorused[i] += colorused[i-1]; 2740 } 2741 ncolors = colorused[5*ncolors-1]; 2742 for (i=0; i<n; i++) { 2743 newcolor[i] = colorused[newcolor[i]]-1; 2744 } 2745 ierr = PetscFree(colorused);CHKERRQ(ierr); 2746 ierr = ISColoringCreate(PetscObjectComm((PetscObject)mat),ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2747 ierr = PetscFree(coloring);CHKERRQ(ierr); 2748 PetscFunctionReturn(0); 2749 } 2750 2751 #include <../src/mat/blockinvert.h> 2752 2753 #undef __FUNCT__ 2754 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2755 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2756 { 2757 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2758 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2759 MatScalar *ibdiag,*bdiag,work[25],*t; 2760 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2761 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2762 const PetscScalar *xb, *b; 2763 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2764 PetscErrorCode ierr; 2765 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2766 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2767 2768 PetscFunctionBegin; 2769 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2770 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2771 2772 if (!a->inode.ibdiagvalid) { 2773 if (!a->inode.ibdiag) { 2774 /* calculate space needed for diagonal blocks */ 2775 for (i=0; i<m; i++) { 2776 cnt += sizes[i]*sizes[i]; 2777 } 2778 a->inode.bdiagsize = cnt; 2779 2780 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2781 } 2782 2783 /* copy over the diagonal blocks and invert them */ 2784 ibdiag = a->inode.ibdiag; 2785 bdiag = a->inode.bdiag; 2786 cnt = 0; 2787 for (i=0, row = 0; i<m; i++) { 2788 for (j=0; j<sizes[i]; j++) { 2789 for (k=0; k<sizes[i]; k++) { 2790 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2791 } 2792 } 2793 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2794 2795 switch (sizes[i]) { 2796 case 1: 2797 /* Create matrix data structure */ 2798 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2799 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2800 break; 2801 case 2: 2802 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2803 break; 2804 case 3: 2805 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2806 break; 2807 case 4: 2808 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2809 break; 2810 case 5: 2811 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2812 break; 2813 default: 2814 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2815 } 2816 cnt += sizes[i]*sizes[i]; 2817 row += sizes[i]; 2818 } 2819 a->inode.ibdiagvalid = PETSC_TRUE; 2820 } 2821 ibdiag = a->inode.ibdiag; 2822 bdiag = a->inode.bdiag; 2823 t = a->inode.ssor_work; 2824 2825 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2826 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2827 CHKMEMQ; 2828 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2829 if (flag & SOR_ZERO_INITIAL_GUESS) { 2830 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2831 2832 for (i=0, row=0; i<m; i++) { 2833 sz = diag[row] - ii[row]; 2834 v1 = a->a + ii[row]; 2835 idx = a->j + ii[row]; 2836 2837 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2838 switch (sizes[i]) { 2839 case 1: 2840 2841 sum1 = b[row]; 2842 for (n = 0; n<sz-1; n+=2) { 2843 i1 = idx[0]; 2844 i2 = idx[1]; 2845 idx += 2; 2846 tmp0 = x[i1]; 2847 tmp1 = x[i2]; 2848 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2849 } 2850 2851 if (n == sz-1) { 2852 tmp0 = x[*idx]; 2853 sum1 -= *v1 * tmp0; 2854 } 2855 t[row] = sum1; 2856 x[row++] = sum1*(*ibdiag++); 2857 break; 2858 case 2: 2859 v2 = a->a + ii[row+1]; 2860 sum1 = b[row]; 2861 sum2 = b[row+1]; 2862 for (n = 0; n<sz-1; n+=2) { 2863 i1 = idx[0]; 2864 i2 = idx[1]; 2865 idx += 2; 2866 tmp0 = x[i1]; 2867 tmp1 = x[i2]; 2868 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2869 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2870 } 2871 2872 if (n == sz-1) { 2873 tmp0 = x[*idx]; 2874 sum1 -= v1[0] * tmp0; 2875 sum2 -= v2[0] * tmp0; 2876 } 2877 t[row] = sum1; 2878 t[row+1] = sum2; 2879 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2880 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2881 ibdiag += 4; 2882 break; 2883 case 3: 2884 v2 = a->a + ii[row+1]; 2885 v3 = a->a + ii[row+2]; 2886 sum1 = b[row]; 2887 sum2 = b[row+1]; 2888 sum3 = b[row+2]; 2889 for (n = 0; n<sz-1; n+=2) { 2890 i1 = idx[0]; 2891 i2 = idx[1]; 2892 idx += 2; 2893 tmp0 = x[i1]; 2894 tmp1 = x[i2]; 2895 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2896 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2897 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2898 } 2899 2900 if (n == sz-1) { 2901 tmp0 = x[*idx]; 2902 sum1 -= v1[0] * tmp0; 2903 sum2 -= v2[0] * tmp0; 2904 sum3 -= v3[0] * tmp0; 2905 } 2906 t[row] = sum1; 2907 t[row+1] = sum2; 2908 t[row+2] = sum3; 2909 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2910 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2911 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2912 ibdiag += 9; 2913 break; 2914 case 4: 2915 v2 = a->a + ii[row+1]; 2916 v3 = a->a + ii[row+2]; 2917 v4 = a->a + ii[row+3]; 2918 sum1 = b[row]; 2919 sum2 = b[row+1]; 2920 sum3 = b[row+2]; 2921 sum4 = b[row+3]; 2922 for (n = 0; n<sz-1; n+=2) { 2923 i1 = idx[0]; 2924 i2 = idx[1]; 2925 idx += 2; 2926 tmp0 = x[i1]; 2927 tmp1 = x[i2]; 2928 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2929 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2930 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2931 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2932 } 2933 2934 if (n == sz-1) { 2935 tmp0 = x[*idx]; 2936 sum1 -= v1[0] * tmp0; 2937 sum2 -= v2[0] * tmp0; 2938 sum3 -= v3[0] * tmp0; 2939 sum4 -= v4[0] * tmp0; 2940 } 2941 t[row] = sum1; 2942 t[row+1] = sum2; 2943 t[row+2] = sum3; 2944 t[row+3] = sum4; 2945 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2946 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2947 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2948 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2949 ibdiag += 16; 2950 break; 2951 case 5: 2952 v2 = a->a + ii[row+1]; 2953 v3 = a->a + ii[row+2]; 2954 v4 = a->a + ii[row+3]; 2955 v5 = a->a + ii[row+4]; 2956 sum1 = b[row]; 2957 sum2 = b[row+1]; 2958 sum3 = b[row+2]; 2959 sum4 = b[row+3]; 2960 sum5 = b[row+4]; 2961 for (n = 0; n<sz-1; n+=2) { 2962 i1 = idx[0]; 2963 i2 = idx[1]; 2964 idx += 2; 2965 tmp0 = x[i1]; 2966 tmp1 = x[i2]; 2967 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2968 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2969 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2970 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2971 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2972 } 2973 2974 if (n == sz-1) { 2975 tmp0 = x[*idx]; 2976 sum1 -= v1[0] * tmp0; 2977 sum2 -= v2[0] * tmp0; 2978 sum3 -= v3[0] * tmp0; 2979 sum4 -= v4[0] * tmp0; 2980 sum5 -= v5[0] * tmp0; 2981 } 2982 t[row] = sum1; 2983 t[row+1] = sum2; 2984 t[row+2] = sum3; 2985 t[row+3] = sum4; 2986 t[row+4] = sum5; 2987 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2988 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2989 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2990 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2991 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2992 ibdiag += 25; 2993 break; 2994 default: 2995 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2996 } 2997 } 2998 2999 xb = t; 3000 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3001 } else xb = b; 3002 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3003 3004 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3005 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3006 ibdiag -= sizes[i]*sizes[i]; 3007 sz = ii[row+1] - diag[row] - 1; 3008 v1 = a->a + diag[row] + 1; 3009 idx = a->j + diag[row] + 1; 3010 3011 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3012 switch (sizes[i]) { 3013 case 1: 3014 3015 sum1 = xb[row]; 3016 for (n = 0; n<sz-1; n+=2) { 3017 i1 = idx[0]; 3018 i2 = idx[1]; 3019 idx += 2; 3020 tmp0 = x[i1]; 3021 tmp1 = x[i2]; 3022 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3023 } 3024 3025 if (n == sz-1) { 3026 tmp0 = x[*idx]; 3027 sum1 -= *v1*tmp0; 3028 } 3029 x[row--] = sum1*(*ibdiag); 3030 break; 3031 3032 case 2: 3033 3034 sum1 = xb[row]; 3035 sum2 = xb[row-1]; 3036 /* note that sum1 is associated with the second of the two rows */ 3037 v2 = a->a + diag[row-1] + 2; 3038 for (n = 0; n<sz-1; n+=2) { 3039 i1 = idx[0]; 3040 i2 = idx[1]; 3041 idx += 2; 3042 tmp0 = x[i1]; 3043 tmp1 = x[i2]; 3044 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3045 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3046 } 3047 3048 if (n == sz-1) { 3049 tmp0 = x[*idx]; 3050 sum1 -= *v1*tmp0; 3051 sum2 -= *v2*tmp0; 3052 } 3053 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3054 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3055 break; 3056 case 3: 3057 3058 sum1 = xb[row]; 3059 sum2 = xb[row-1]; 3060 sum3 = xb[row-2]; 3061 v2 = a->a + diag[row-1] + 2; 3062 v3 = a->a + diag[row-2] + 3; 3063 for (n = 0; n<sz-1; n+=2) { 3064 i1 = idx[0]; 3065 i2 = idx[1]; 3066 idx += 2; 3067 tmp0 = x[i1]; 3068 tmp1 = x[i2]; 3069 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3070 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3071 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3072 } 3073 3074 if (n == sz-1) { 3075 tmp0 = x[*idx]; 3076 sum1 -= *v1*tmp0; 3077 sum2 -= *v2*tmp0; 3078 sum3 -= *v3*tmp0; 3079 } 3080 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3081 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3082 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3083 break; 3084 case 4: 3085 3086 sum1 = xb[row]; 3087 sum2 = xb[row-1]; 3088 sum3 = xb[row-2]; 3089 sum4 = xb[row-3]; 3090 v2 = a->a + diag[row-1] + 2; 3091 v3 = a->a + diag[row-2] + 3; 3092 v4 = a->a + diag[row-3] + 4; 3093 for (n = 0; n<sz-1; n+=2) { 3094 i1 = idx[0]; 3095 i2 = idx[1]; 3096 idx += 2; 3097 tmp0 = x[i1]; 3098 tmp1 = x[i2]; 3099 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3100 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3101 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3102 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3103 } 3104 3105 if (n == sz-1) { 3106 tmp0 = x[*idx]; 3107 sum1 -= *v1*tmp0; 3108 sum2 -= *v2*tmp0; 3109 sum3 -= *v3*tmp0; 3110 sum4 -= *v4*tmp0; 3111 } 3112 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3113 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3114 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3115 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3116 break; 3117 case 5: 3118 3119 sum1 = xb[row]; 3120 sum2 = xb[row-1]; 3121 sum3 = xb[row-2]; 3122 sum4 = xb[row-3]; 3123 sum5 = xb[row-4]; 3124 v2 = a->a + diag[row-1] + 2; 3125 v3 = a->a + diag[row-2] + 3; 3126 v4 = a->a + diag[row-3] + 4; 3127 v5 = a->a + diag[row-4] + 5; 3128 for (n = 0; n<sz-1; n+=2) { 3129 i1 = idx[0]; 3130 i2 = idx[1]; 3131 idx += 2; 3132 tmp0 = x[i1]; 3133 tmp1 = x[i2]; 3134 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3135 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3136 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3137 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3138 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3139 } 3140 3141 if (n == sz-1) { 3142 tmp0 = x[*idx]; 3143 sum1 -= *v1*tmp0; 3144 sum2 -= *v2*tmp0; 3145 sum3 -= *v3*tmp0; 3146 sum4 -= *v4*tmp0; 3147 sum5 -= *v5*tmp0; 3148 } 3149 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3150 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3151 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3152 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3153 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3154 break; 3155 default: 3156 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3157 } 3158 } 3159 3160 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3161 } 3162 its--; 3163 } 3164 while (its--) { 3165 3166 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3167 ibdiag = a->inode.ibdiag; 3168 3169 for (i=0, row=0; i<m; i++) { 3170 sz = ii[row + 1] - ii[row]; 3171 v1 = a->a + ii[row]; 3172 idx = a->j + ii[row]; 3173 3174 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3175 switch (sizes[i]) { 3176 case 1: 3177 3178 sum1 = b[row]; 3179 for (n = 0; n<sz-1; n+=2) { 3180 i1 = idx[0]; 3181 i2 = idx[1]; 3182 idx += 2; 3183 tmp0 = x[i1]; 3184 tmp1 = x[i2]; 3185 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3186 } 3187 3188 if (n == sz-1) { 3189 tmp0 = x[*idx]; 3190 sum1 -= *v1 * tmp0; 3191 } 3192 3193 /* in MatSOR_SeqAIJ this line would be 3194 * 3195 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3196 * 3197 * but omega == 1, so this becomes 3198 * 3199 * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++); 3200 * 3201 * but bdiag and ibdiag cancel each other, so we can change this 3202 * to adding sum1*(*ibdiag++). We can skip bdiag for the larger 3203 * block sizes as well 3204 */ 3205 x[row++] += sum1*(*ibdiag++); 3206 break; 3207 case 2: 3208 v2 = a->a + ii[row+1]; 3209 sum1 = b[row]; 3210 sum2 = b[row+1]; 3211 for (n = 0; n<sz-1; n+=2) { 3212 i1 = idx[0]; 3213 i2 = idx[1]; 3214 idx += 2; 3215 tmp0 = x[i1]; 3216 tmp1 = x[i2]; 3217 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3218 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3219 } 3220 3221 if (n == sz-1) { 3222 tmp0 = x[*idx]; 3223 sum1 -= v1[0] * tmp0; 3224 sum2 -= v2[0] * tmp0; 3225 } 3226 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2]; 3227 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3]; 3228 ibdiag += 4; 3229 break; 3230 case 3: 3231 v2 = a->a + ii[row+1]; 3232 v3 = a->a + ii[row+2]; 3233 sum1 = b[row]; 3234 sum2 = b[row+1]; 3235 sum3 = b[row+2]; 3236 for (n = 0; n<sz-1; n+=2) { 3237 i1 = idx[0]; 3238 i2 = idx[1]; 3239 idx += 2; 3240 tmp0 = x[i1]; 3241 tmp1 = x[i2]; 3242 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3243 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3244 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3245 } 3246 3247 if (n == sz-1) { 3248 tmp0 = x[*idx]; 3249 sum1 -= v1[0] * tmp0; 3250 sum2 -= v2[0] * tmp0; 3251 sum3 -= v3[0] * tmp0; 3252 } 3253 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3254 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3255 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3256 ibdiag += 9; 3257 break; 3258 case 4: 3259 v2 = a->a + ii[row+1]; 3260 v3 = a->a + ii[row+2]; 3261 v4 = a->a + ii[row+3]; 3262 sum1 = b[row]; 3263 sum2 = b[row+1]; 3264 sum3 = b[row+2]; 3265 sum4 = b[row+3]; 3266 for (n = 0; n<sz-1; n+=2) { 3267 i1 = idx[0]; 3268 i2 = idx[1]; 3269 idx += 2; 3270 tmp0 = x[i1]; 3271 tmp1 = x[i2]; 3272 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3273 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3274 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3275 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3276 } 3277 3278 if (n == sz-1) { 3279 tmp0 = x[*idx]; 3280 sum1 -= v1[0] * tmp0; 3281 sum2 -= v2[0] * tmp0; 3282 sum3 -= v3[0] * tmp0; 3283 sum4 -= v4[0] * tmp0; 3284 } 3285 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3286 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3287 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3288 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3289 ibdiag += 16; 3290 break; 3291 case 5: 3292 v2 = a->a + ii[row+1]; 3293 v3 = a->a + ii[row+2]; 3294 v4 = a->a + ii[row+3]; 3295 v5 = a->a + ii[row+4]; 3296 sum1 = b[row]; 3297 sum2 = b[row+1]; 3298 sum3 = b[row+2]; 3299 sum4 = b[row+3]; 3300 sum5 = b[row+4]; 3301 for (n = 0; n<sz-1; n+=2) { 3302 i1 = idx[0]; 3303 i2 = idx[1]; 3304 idx += 2; 3305 tmp0 = x[i1]; 3306 tmp1 = x[i2]; 3307 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3308 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3309 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3310 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3311 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3312 } 3313 3314 if (n == sz-1) { 3315 tmp0 = x[*idx]; 3316 sum1 -= v1[0] * tmp0; 3317 sum2 -= v2[0] * tmp0; 3318 sum3 -= v3[0] * tmp0; 3319 sum4 -= v4[0] * tmp0; 3320 sum5 -= v5[0] * tmp0; 3321 } 3322 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3323 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3324 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3325 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3326 x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3327 ibdiag += 25; 3328 break; 3329 default: 3330 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3331 } 3332 } 3333 3334 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3335 } 3336 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3337 3338 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3339 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3340 ibdiag -= sizes[i]*sizes[i]; 3341 sz = ii[row+1] - ii[row]; 3342 v1 = a->a + ii[row]; 3343 idx = a->j + ii[row]; 3344 3345 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3346 switch (sizes[i]) { 3347 case 1: 3348 3349 sum1 = b[row]; 3350 for (n = 0; n<sz-1; n+=2) { 3351 i1 = idx[0]; 3352 i2 = idx[1]; 3353 idx += 2; 3354 tmp0 = x[i1]; 3355 tmp1 = x[i2]; 3356 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3357 } 3358 3359 if (n == sz-1) { 3360 tmp0 = x[*idx]; 3361 sum1 -= *v1*tmp0; 3362 } 3363 x[row--] += sum1*(*ibdiag); 3364 break; 3365 3366 case 2: 3367 3368 sum1 = b[row]; 3369 sum2 = b[row-1]; 3370 /* note that sum1 is associated with the second of the two rows */ 3371 v2 = a->a + ii[row - 1]; 3372 for (n = 0; n<sz-1; n+=2) { 3373 i1 = idx[0]; 3374 i2 = idx[1]; 3375 idx += 2; 3376 tmp0 = x[i1]; 3377 tmp1 = x[i2]; 3378 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3379 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3380 } 3381 3382 if (n == sz-1) { 3383 tmp0 = x[*idx]; 3384 sum1 -= *v1*tmp0; 3385 sum2 -= *v2*tmp0; 3386 } 3387 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3388 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3389 break; 3390 case 3: 3391 3392 sum1 = b[row]; 3393 sum2 = b[row-1]; 3394 sum3 = b[row-2]; 3395 v2 = a->a + ii[row-1]; 3396 v3 = a->a + ii[row-2]; 3397 for (n = 0; n<sz-1; n+=2) { 3398 i1 = idx[0]; 3399 i2 = idx[1]; 3400 idx += 2; 3401 tmp0 = x[i1]; 3402 tmp1 = x[i2]; 3403 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3404 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3405 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3406 } 3407 3408 if (n == sz-1) { 3409 tmp0 = x[*idx]; 3410 sum1 -= *v1*tmp0; 3411 sum2 -= *v2*tmp0; 3412 sum3 -= *v3*tmp0; 3413 } 3414 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3415 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3416 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3417 break; 3418 case 4: 3419 3420 sum1 = b[row]; 3421 sum2 = b[row-1]; 3422 sum3 = b[row-2]; 3423 sum4 = b[row-3]; 3424 v2 = a->a + ii[row-1]; 3425 v3 = a->a + ii[row-2]; 3426 v4 = a->a + ii[row-3]; 3427 for (n = 0; n<sz-1; n+=2) { 3428 i1 = idx[0]; 3429 i2 = idx[1]; 3430 idx += 2; 3431 tmp0 = x[i1]; 3432 tmp1 = x[i2]; 3433 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3434 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3435 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3436 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3437 } 3438 3439 if (n == sz-1) { 3440 tmp0 = x[*idx]; 3441 sum1 -= *v1*tmp0; 3442 sum2 -= *v2*tmp0; 3443 sum3 -= *v3*tmp0; 3444 sum4 -= *v4*tmp0; 3445 } 3446 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3447 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3448 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3449 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3450 break; 3451 case 5: 3452 3453 sum1 = b[row]; 3454 sum2 = b[row-1]; 3455 sum3 = b[row-2]; 3456 sum4 = b[row-3]; 3457 sum5 = b[row-4]; 3458 v2 = a->a + ii[row-1]; 3459 v3 = a->a + ii[row-2]; 3460 v4 = a->a + ii[row-3]; 3461 v5 = a->a + ii[row-4]; 3462 for (n = 0; n<sz-1; n+=2) { 3463 i1 = idx[0]; 3464 i2 = idx[1]; 3465 idx += 2; 3466 tmp0 = x[i1]; 3467 tmp1 = x[i2]; 3468 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3469 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3470 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3471 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3472 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3473 } 3474 3475 if (n == sz-1) { 3476 tmp0 = x[*idx]; 3477 sum1 -= *v1*tmp0; 3478 sum2 -= *v2*tmp0; 3479 sum3 -= *v3*tmp0; 3480 sum4 -= *v4*tmp0; 3481 sum5 -= *v5*tmp0; 3482 } 3483 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3484 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3485 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3486 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3487 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3488 break; 3489 default: 3490 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3491 } 3492 } 3493 3494 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3495 } 3496 } 3497 if (flag & SOR_EISENSTAT) { 3498 /* 3499 Apply (U + D)^-1 where D is now the block diagonal 3500 */ 3501 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3502 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3503 ibdiag -= sizes[i]*sizes[i]; 3504 sz = ii[row+1] - diag[row] - 1; 3505 v1 = a->a + diag[row] + 1; 3506 idx = a->j + diag[row] + 1; 3507 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3508 switch (sizes[i]) { 3509 case 1: 3510 3511 sum1 = b[row]; 3512 for (n = 0; n<sz-1; n+=2) { 3513 i1 = idx[0]; 3514 i2 = idx[1]; 3515 idx += 2; 3516 tmp0 = x[i1]; 3517 tmp1 = x[i2]; 3518 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3519 } 3520 3521 if (n == sz-1) { 3522 tmp0 = x[*idx]; 3523 sum1 -= *v1*tmp0; 3524 } 3525 x[row] = sum1*(*ibdiag);row--; 3526 break; 3527 3528 case 2: 3529 3530 sum1 = b[row]; 3531 sum2 = b[row-1]; 3532 /* note that sum1 is associated with the second of the two rows */ 3533 v2 = a->a + diag[row-1] + 2; 3534 for (n = 0; n<sz-1; n+=2) { 3535 i1 = idx[0]; 3536 i2 = idx[1]; 3537 idx += 2; 3538 tmp0 = x[i1]; 3539 tmp1 = x[i2]; 3540 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3541 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3542 } 3543 3544 if (n == sz-1) { 3545 tmp0 = x[*idx]; 3546 sum1 -= *v1*tmp0; 3547 sum2 -= *v2*tmp0; 3548 } 3549 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3550 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3551 row -= 2; 3552 break; 3553 case 3: 3554 3555 sum1 = b[row]; 3556 sum2 = b[row-1]; 3557 sum3 = b[row-2]; 3558 v2 = a->a + diag[row-1] + 2; 3559 v3 = a->a + diag[row-2] + 3; 3560 for (n = 0; n<sz-1; n+=2) { 3561 i1 = idx[0]; 3562 i2 = idx[1]; 3563 idx += 2; 3564 tmp0 = x[i1]; 3565 tmp1 = x[i2]; 3566 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3567 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3568 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3569 } 3570 3571 if (n == sz-1) { 3572 tmp0 = x[*idx]; 3573 sum1 -= *v1*tmp0; 3574 sum2 -= *v2*tmp0; 3575 sum3 -= *v3*tmp0; 3576 } 3577 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3578 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3579 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3580 row -= 3; 3581 break; 3582 case 4: 3583 3584 sum1 = b[row]; 3585 sum2 = b[row-1]; 3586 sum3 = b[row-2]; 3587 sum4 = b[row-3]; 3588 v2 = a->a + diag[row-1] + 2; 3589 v3 = a->a + diag[row-2] + 3; 3590 v4 = a->a + diag[row-3] + 4; 3591 for (n = 0; n<sz-1; n+=2) { 3592 i1 = idx[0]; 3593 i2 = idx[1]; 3594 idx += 2; 3595 tmp0 = x[i1]; 3596 tmp1 = x[i2]; 3597 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3598 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3599 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3600 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3601 } 3602 3603 if (n == sz-1) { 3604 tmp0 = x[*idx]; 3605 sum1 -= *v1*tmp0; 3606 sum2 -= *v2*tmp0; 3607 sum3 -= *v3*tmp0; 3608 sum4 -= *v4*tmp0; 3609 } 3610 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3611 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3612 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3613 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3614 row -= 4; 3615 break; 3616 case 5: 3617 3618 sum1 = b[row]; 3619 sum2 = b[row-1]; 3620 sum3 = b[row-2]; 3621 sum4 = b[row-3]; 3622 sum5 = b[row-4]; 3623 v2 = a->a + diag[row-1] + 2; 3624 v3 = a->a + diag[row-2] + 3; 3625 v4 = a->a + diag[row-3] + 4; 3626 v5 = a->a + diag[row-4] + 5; 3627 for (n = 0; n<sz-1; n+=2) { 3628 i1 = idx[0]; 3629 i2 = idx[1]; 3630 idx += 2; 3631 tmp0 = x[i1]; 3632 tmp1 = x[i2]; 3633 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3634 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3635 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3636 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3637 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3638 } 3639 3640 if (n == sz-1) { 3641 tmp0 = x[*idx]; 3642 sum1 -= *v1*tmp0; 3643 sum2 -= *v2*tmp0; 3644 sum3 -= *v3*tmp0; 3645 sum4 -= *v4*tmp0; 3646 sum5 -= *v5*tmp0; 3647 } 3648 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3649 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3650 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3651 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3652 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3653 row -= 5; 3654 break; 3655 default: 3656 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3657 } 3658 } 3659 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3660 3661 /* 3662 t = b - D x where D is the block diagonal 3663 */ 3664 cnt = 0; 3665 for (i=0, row=0; i<m; i++) { 3666 switch (sizes[i]) { 3667 case 1: 3668 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3669 break; 3670 case 2: 3671 x1 = x[row]; x2 = x[row+1]; 3672 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3673 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3674 t[row] = b[row] - tmp1; 3675 t[row+1] = b[row+1] - tmp2; row += 2; 3676 cnt += 4; 3677 break; 3678 case 3: 3679 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3680 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3681 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3682 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3683 t[row] = b[row] - tmp1; 3684 t[row+1] = b[row+1] - tmp2; 3685 t[row+2] = b[row+2] - tmp3; row += 3; 3686 cnt += 9; 3687 break; 3688 case 4: 3689 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3690 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3691 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3692 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3693 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3694 t[row] = b[row] - tmp1; 3695 t[row+1] = b[row+1] - tmp2; 3696 t[row+2] = b[row+2] - tmp3; 3697 t[row+3] = b[row+3] - tmp4; row += 4; 3698 cnt += 16; 3699 break; 3700 case 5: 3701 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3702 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3703 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3704 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3705 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3706 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3707 t[row] = b[row] - tmp1; 3708 t[row+1] = b[row+1] - tmp2; 3709 t[row+2] = b[row+2] - tmp3; 3710 t[row+3] = b[row+3] - tmp4; 3711 t[row+4] = b[row+4] - tmp5;row += 5; 3712 cnt += 25; 3713 break; 3714 default: 3715 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3716 } 3717 } 3718 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3719 3720 3721 3722 /* 3723 Apply (L + D)^-1 where D is the block diagonal 3724 */ 3725 for (i=0, row=0; i<m; i++) { 3726 sz = diag[row] - ii[row]; 3727 v1 = a->a + ii[row]; 3728 idx = a->j + ii[row]; 3729 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3730 switch (sizes[i]) { 3731 case 1: 3732 3733 sum1 = t[row]; 3734 for (n = 0; n<sz-1; n+=2) { 3735 i1 = idx[0]; 3736 i2 = idx[1]; 3737 idx += 2; 3738 tmp0 = t[i1]; 3739 tmp1 = t[i2]; 3740 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3741 } 3742 3743 if (n == sz-1) { 3744 tmp0 = t[*idx]; 3745 sum1 -= *v1 * tmp0; 3746 } 3747 x[row] += t[row] = sum1*(*ibdiag++); row++; 3748 break; 3749 case 2: 3750 v2 = a->a + ii[row+1]; 3751 sum1 = t[row]; 3752 sum2 = t[row+1]; 3753 for (n = 0; n<sz-1; n+=2) { 3754 i1 = idx[0]; 3755 i2 = idx[1]; 3756 idx += 2; 3757 tmp0 = t[i1]; 3758 tmp1 = t[i2]; 3759 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3760 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3761 } 3762 3763 if (n == sz-1) { 3764 tmp0 = t[*idx]; 3765 sum1 -= v1[0] * tmp0; 3766 sum2 -= v2[0] * tmp0; 3767 } 3768 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3769 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3770 ibdiag += 4; row += 2; 3771 break; 3772 case 3: 3773 v2 = a->a + ii[row+1]; 3774 v3 = a->a + ii[row+2]; 3775 sum1 = t[row]; 3776 sum2 = t[row+1]; 3777 sum3 = t[row+2]; 3778 for (n = 0; n<sz-1; n+=2) { 3779 i1 = idx[0]; 3780 i2 = idx[1]; 3781 idx += 2; 3782 tmp0 = t[i1]; 3783 tmp1 = t[i2]; 3784 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3785 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3786 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3787 } 3788 3789 if (n == sz-1) { 3790 tmp0 = t[*idx]; 3791 sum1 -= v1[0] * tmp0; 3792 sum2 -= v2[0] * tmp0; 3793 sum3 -= v3[0] * tmp0; 3794 } 3795 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3796 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3797 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3798 ibdiag += 9; row += 3; 3799 break; 3800 case 4: 3801 v2 = a->a + ii[row+1]; 3802 v3 = a->a + ii[row+2]; 3803 v4 = a->a + ii[row+3]; 3804 sum1 = t[row]; 3805 sum2 = t[row+1]; 3806 sum3 = t[row+2]; 3807 sum4 = t[row+3]; 3808 for (n = 0; n<sz-1; n+=2) { 3809 i1 = idx[0]; 3810 i2 = idx[1]; 3811 idx += 2; 3812 tmp0 = t[i1]; 3813 tmp1 = t[i2]; 3814 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3815 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3816 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3817 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3818 } 3819 3820 if (n == sz-1) { 3821 tmp0 = t[*idx]; 3822 sum1 -= v1[0] * tmp0; 3823 sum2 -= v2[0] * tmp0; 3824 sum3 -= v3[0] * tmp0; 3825 sum4 -= v4[0] * tmp0; 3826 } 3827 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3828 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3829 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3830 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3831 ibdiag += 16; row += 4; 3832 break; 3833 case 5: 3834 v2 = a->a + ii[row+1]; 3835 v3 = a->a + ii[row+2]; 3836 v4 = a->a + ii[row+3]; 3837 v5 = a->a + ii[row+4]; 3838 sum1 = t[row]; 3839 sum2 = t[row+1]; 3840 sum3 = t[row+2]; 3841 sum4 = t[row+3]; 3842 sum5 = t[row+4]; 3843 for (n = 0; n<sz-1; n+=2) { 3844 i1 = idx[0]; 3845 i2 = idx[1]; 3846 idx += 2; 3847 tmp0 = t[i1]; 3848 tmp1 = t[i2]; 3849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3850 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3851 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3852 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3853 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3854 } 3855 3856 if (n == sz-1) { 3857 tmp0 = t[*idx]; 3858 sum1 -= v1[0] * tmp0; 3859 sum2 -= v2[0] * tmp0; 3860 sum3 -= v3[0] * tmp0; 3861 sum4 -= v4[0] * tmp0; 3862 sum5 -= v5[0] * tmp0; 3863 } 3864 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3865 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3866 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3867 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3868 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3869 ibdiag += 25; row += 5; 3870 break; 3871 default: 3872 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3873 } 3874 } 3875 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3876 } 3877 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3878 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3879 CHKMEMQ; PetscFunctionReturn(0); 3880 } 3881 3882 #undef __FUNCT__ 3883 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3884 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3885 { 3886 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3887 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3888 const MatScalar *bdiag = a->inode.bdiag; 3889 const PetscScalar *b; 3890 PetscErrorCode ierr; 3891 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3892 const PetscInt *sizes = a->inode.size; 3893 3894 PetscFunctionBegin; 3895 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3896 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3897 cnt = 0; 3898 for (i=0, row=0; i<m; i++) { 3899 switch (sizes[i]) { 3900 case 1: 3901 x[row] = b[row]*bdiag[cnt++];row++; 3902 break; 3903 case 2: 3904 x1 = b[row]; x2 = b[row+1]; 3905 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3906 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3907 x[row++] = tmp1; 3908 x[row++] = tmp2; 3909 cnt += 4; 3910 break; 3911 case 3: 3912 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3913 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3914 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3915 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3916 x[row++] = tmp1; 3917 x[row++] = tmp2; 3918 x[row++] = tmp3; 3919 cnt += 9; 3920 break; 3921 case 4: 3922 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3923 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3924 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3925 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3926 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3927 x[row++] = tmp1; 3928 x[row++] = tmp2; 3929 x[row++] = tmp3; 3930 x[row++] = tmp4; 3931 cnt += 16; 3932 break; 3933 case 5: 3934 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3935 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3936 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3937 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3938 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3939 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3940 x[row++] = tmp1; 3941 x[row++] = tmp2; 3942 x[row++] = tmp3; 3943 x[row++] = tmp4; 3944 x[row++] = tmp5; 3945 cnt += 25; 3946 break; 3947 default: 3948 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3949 } 3950 } 3951 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3952 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3953 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3954 PetscFunctionReturn(0); 3955 } 3956 3957 /* 3958 samestructure indicates that the matrix has not changed its nonzero structure so we 3959 do not need to recompute the inodes 3960 */ 3961 #undef __FUNCT__ 3962 #define __FUNCT__ "Mat_CheckInode" 3963 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 3964 { 3965 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3966 PetscErrorCode ierr; 3967 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3968 PetscBool flag; 3969 3970 PetscFunctionBegin; 3971 if (!a->inode.use) PetscFunctionReturn(0); 3972 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3973 3974 3975 m = A->rmap->n; 3976 if (a->inode.size) ns = a->inode.size; 3977 else { 3978 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 3979 } 3980 3981 i = 0; 3982 node_count = 0; 3983 idx = a->j; 3984 ii = a->i; 3985 while (i < m) { /* For each row */ 3986 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3987 /* Limits the number of elements in a node to 'a->inode.limit' */ 3988 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3989 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3990 if (nzy != nzx) break; 3991 idy += nzx; /* Same nonzero pattern */ 3992 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3993 if (!flag) break; 3994 } 3995 ns[node_count++] = blk_size; 3996 idx += blk_size*nzx; 3997 i = j; 3998 } 3999 /* If not enough inodes found,, do not use inode version of the routines */ 4000 if (!m || node_count > .8*m) { 4001 ierr = PetscFree(ns);CHKERRQ(ierr); 4002 4003 a->inode.node_count = 0; 4004 a->inode.size = NULL; 4005 a->inode.use = PETSC_FALSE; 4006 A->ops->mult = MatMult_SeqAIJ; 4007 A->ops->sor = MatSOR_SeqAIJ; 4008 A->ops->multadd = MatMultAdd_SeqAIJ; 4009 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4010 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4011 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4012 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4013 A->ops->coloringpatch = 0; 4014 A->ops->multdiagonalblock = 0; 4015 4016 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4017 } else { 4018 if (!A->factortype) { 4019 A->ops->mult = MatMult_SeqAIJ_Inode; 4020 A->ops->sor = MatSOR_SeqAIJ_Inode; 4021 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4022 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4023 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4024 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4025 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4026 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4027 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4028 } else { 4029 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4030 } 4031 a->inode.node_count = node_count; 4032 a->inode.size = ns; 4033 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4034 } 4035 a->inode.checked = PETSC_TRUE; 4036 PetscFunctionReturn(0); 4037 } 4038 4039 #undef __FUNCT__ 4040 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4041 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4042 { 4043 Mat B =*C; 4044 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4045 PetscErrorCode ierr; 4046 PetscInt m=A->rmap->n; 4047 4048 PetscFunctionBegin; 4049 c->inode.use = a->inode.use; 4050 c->inode.limit = a->inode.limit; 4051 c->inode.max_limit = a->inode.max_limit; 4052 if (a->inode.size) { 4053 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr); 4054 c->inode.node_count = a->inode.node_count; 4055 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4056 /* note the table of functions below should match that in Mat_CheckInode() */ 4057 if (!B->factortype) { 4058 B->ops->mult = MatMult_SeqAIJ_Inode; 4059 B->ops->sor = MatSOR_SeqAIJ_Inode; 4060 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4061 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4062 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4063 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4064 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4065 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4066 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4067 } else { 4068 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4069 } 4070 } else { 4071 c->inode.size = 0; 4072 c->inode.node_count = 0; 4073 } 4074 c->inode.ibdiagvalid = PETSC_FALSE; 4075 c->inode.ibdiag = 0; 4076 c->inode.bdiag = 0; 4077 PetscFunctionReturn(0); 4078 } 4079 4080 #undef __FUNCT__ 4081 #define __FUNCT__ "MatGetRow_FactoredLU" 4082 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row) 4083 { 4084 PetscInt k, *vi; 4085 4086 PetscFunctionBegin; 4087 vi = aj + ai[row]; 4088 for (k=0; k<nzl; k++) cols[k] = vi[k]; 4089 vi = aj + adiag[row]; 4090 cols[nzl] = vi[0]; 4091 vi = aj + adiag[row+1]+1; 4092 for (k=0; k<nzu; k++) cols[nzl+1+k] = vi[k]; 4093 PetscFunctionReturn(0); 4094 } 4095 /* 4096 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4097 Modified from Mat_CheckInode(). 4098 4099 Input Parameters: 4100 + Mat A - ILU or LU matrix factor 4101 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 4102 do not need to recompute the inodes 4103 */ 4104 #undef __FUNCT__ 4105 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4106 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure) 4107 { 4108 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4109 PetscErrorCode ierr; 4110 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4111 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 4112 PetscBool flag; 4113 4114 PetscFunctionBegin; 4115 if (!a->inode.use) PetscFunctionReturn(0); 4116 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4117 4118 m = A->rmap->n; 4119 if (a->inode.size) ns = a->inode.size; 4120 else { 4121 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr); 4122 } 4123 4124 i = 0; 4125 node_count = 0; 4126 while (i < m) { /* For each row */ 4127 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4128 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4129 nzx = nzl1 + nzu1 + 1; 4130 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 4131 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4132 4133 /* Limits the number of elements in a node to 'a->inode.limit' */ 4134 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4135 nzl2 = ai[j+1] - ai[j]; 4136 nzu2 = adiag[j] - adiag[j+1] - 1; 4137 nzy = nzl2 + nzu2 + 1; 4138 if (nzy != nzx) break; 4139 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 4140 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4141 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4142 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 4143 ierr = PetscFree(cols2);CHKERRQ(ierr); 4144 } 4145 ns[node_count++] = blk_size; 4146 ierr = PetscFree(cols1);CHKERRQ(ierr); 4147 i = j; 4148 } 4149 /* If not enough inodes found,, do not use inode version of the routines */ 4150 if (!m || node_count > .8*m) { 4151 ierr = PetscFree(ns);CHKERRQ(ierr); 4152 4153 a->inode.node_count = 0; 4154 a->inode.size = NULL; 4155 a->inode.use = PETSC_FALSE; 4156 4157 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4158 } else { 4159 A->ops->mult = 0; 4160 A->ops->sor = 0; 4161 A->ops->multadd = 0; 4162 A->ops->getrowij = 0; 4163 A->ops->restorerowij = 0; 4164 A->ops->getcolumnij = 0; 4165 A->ops->restorecolumnij = 0; 4166 A->ops->coloringpatch = 0; 4167 A->ops->multdiagonalblock = 0; 4168 A->ops->solve = MatSolve_SeqAIJ_Inode; 4169 a->inode.node_count = node_count; 4170 a->inode.size = ns; 4171 4172 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4173 } 4174 a->inode.checked = PETSC_TRUE; 4175 PetscFunctionReturn(0); 4176 } 4177 4178 #undef __FUNCT__ 4179 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4180 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4181 { 4182 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4183 4184 PetscFunctionBegin; 4185 a->inode.ibdiagvalid = PETSC_FALSE; 4186 PetscFunctionReturn(0); 4187 } 4188 4189 /* 4190 This is really ugly. if inodes are used this replaces the 4191 permutations with ones that correspond to rows/cols of the matrix 4192 rather then inode blocks 4193 */ 4194 #undef __FUNCT__ 4195 #define __FUNCT__ "MatInodeAdjustForInodes" 4196 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4197 { 4198 PetscErrorCode ierr; 4199 4200 PetscFunctionBegin; 4201 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4202 PetscFunctionReturn(0); 4203 } 4204 4205 #undef __FUNCT__ 4206 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4207 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4208 { 4209 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4210 PetscErrorCode ierr; 4211 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4212 const PetscInt *ridx,*cidx; 4213 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4214 PetscInt nslim_col,*ns_col; 4215 IS ris = *rperm,cis = *cperm; 4216 4217 PetscFunctionBegin; 4218 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4219 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4220 4221 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4222 ierr = PetscMalloc((((nslim_row>nslim_col) ? nslim_row : nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 4223 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 4224 4225 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4226 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4227 4228 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4229 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4230 4231 /* Construct the permutations for rows*/ 4232 for (i=0,row = 0; i<nslim_row; ++i) { 4233 indx = ridx[i]; 4234 start_val = tns[indx]; 4235 end_val = tns[indx + 1]; 4236 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4237 } 4238 4239 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4240 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4241 4242 /* Construct permutations for columns */ 4243 for (i=0,col=0; i<nslim_col; ++i) { 4244 indx = cidx[i]; 4245 start_val = tns[indx]; 4246 end_val = tns[indx + 1]; 4247 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4248 } 4249 4250 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4251 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4252 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4253 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4254 4255 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4256 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4257 4258 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4259 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4260 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4261 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4262 ierr = PetscFree(tns);CHKERRQ(ierr); 4263 PetscFunctionReturn(0); 4264 } 4265 4266 #undef __FUNCT__ 4267 #define __FUNCT__ "MatInodeGetInodeSizes" 4268 /*@C 4269 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4270 4271 Not Collective 4272 4273 Input Parameter: 4274 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4275 4276 Output Parameter: 4277 + node_count - no of inodes present in the matrix. 4278 . sizes - an array of size node_count,with sizes of each inode. 4279 - limit - the max size used to generate the inodes. 4280 4281 Level: advanced 4282 4283 Notes: This routine returns some internal storage information 4284 of the matrix, it is intended to be used by advanced users. 4285 It should be called after the matrix is assembled. 4286 The contents of the sizes[] array should not be changed. 4287 NULL may be passed for information not requested. 4288 4289 .keywords: matrix, seqaij, get, inode 4290 4291 .seealso: MatGetInfo() 4292 @*/ 4293 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4294 { 4295 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4296 4297 PetscFunctionBegin; 4298 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4299 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 4300 if (f) { 4301 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4302 } 4303 PetscFunctionReturn(0); 4304 } 4305 4306 #undef __FUNCT__ 4307 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4308 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4309 { 4310 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4311 4312 PetscFunctionBegin; 4313 if (node_count) *node_count = a->inode.node_count; 4314 if (sizes) *sizes = a->inode.size; 4315 if (limit) *limit = a->inode.limit; 4316 PetscFunctionReturn(0); 4317 } 4318