1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 24 for (; count+1 < n; count++,i++); 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 67 68 PetscFunctionBegin; 69 nslim_row = a->inode.node_count; 70 m = A->rmap->n; 71 n = A->cmap->n; 72 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 73 74 /* Use the row_inode as column_inode */ 75 nslim_col = nslim_row; 76 ns_col = ns_row; 77 78 /* allocate space for reformated inode structure */ 79 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 80 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 81 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 82 83 for (i1=0,col=0; i1<nslim_col; ++i1) { 84 nsz = ns_col[i1]; 85 for (i2=0; i2<nsz; ++i2,++col) 86 tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 157 158 PetscFunctionBegin; 159 nslim_row = a->inode.node_count; 160 n = A->cmap->n; 161 162 /* Create The column_inode for this matrix */ 163 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 164 165 /* allocate space for reformated column_inode structure */ 166 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 167 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 168 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 169 170 for (i1=0,col=0; i1<nslim_col; ++i1) { 171 nsz = ns_col[i1]; 172 for (i2=0; i2<nsz; ++i2,++col) 173 tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 PetscFunctionReturn(0); 264 } 265 266 /* ----------------------------------------------------------- */ 267 268 #undef __FUNCT__ 269 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 270 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 271 { 272 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 273 PetscErrorCode ierr; 274 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 275 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 276 277 PetscFunctionBegin; 278 nslim_row = a->inode.node_count; 279 n = A->cmap->n; 280 281 /* Create The column_inode for this matrix */ 282 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 283 284 /* allocate space for reformated column_inode structure */ 285 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 286 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 287 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 288 289 for (i1=0,col=0; i1<nslim_col; ++i1) { 290 nsz = ns_col[i1]; 291 for (i2=0; i2<nsz; ++i2,++col) 292 tvc[col] = i1; 293 } 294 /* allocate space for column pointers */ 295 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 296 *iia = ia; 297 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 298 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 299 300 /* determine the number of columns in each row */ 301 ia[0] = oshift; 302 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 303 j = aj + ai[row] + ishift; 304 col = *j++ + ishift; 305 i2 = tvc[col]; 306 nz = ai[row+1] - ai[row]; 307 while (nz-- > 0) { /* off-diagonal elemets */ 308 /* ia[i1+1]++; */ 309 ia[i2+1]++; 310 i2++; 311 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 312 if (nz > 0) i2 = tvc[col]; 313 } 314 } 315 316 /* shift ia[i] to point to next col */ 317 for (i1=1; i1<nslim_col+1; i1++) { 318 col = ia[i1-1]; 319 ia[i1] += col; 320 work[i1-1] = col - oshift; 321 } 322 323 /* allocate space for column pointers */ 324 nz = ia[nslim_col] + (!ishift); 325 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 326 *jja = ja; 327 328 /* loop over matrix putting into ja */ 329 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 330 j = aj + ai[row] + ishift; 331 col = *j++ + ishift; 332 i2 = tvc[col]; 333 nz = ai[row+1] - ai[row]; 334 while (nz-- > 0) { 335 /* ja[work[i1]++] = i2 + oshift; */ 336 ja[work[i2]++] = i1 + oshift; 337 i2++; 338 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 339 if (nz > 0) i2 = tvc[col]; 340 } 341 } 342 ierr = PetscFree(ns_col);CHKERRQ(ierr); 343 ierr = PetscFree(work);CHKERRQ(ierr); 344 ierr = PetscFree(tns);CHKERRQ(ierr); 345 ierr = PetscFree(tvc);CHKERRQ(ierr); 346 PetscFunctionReturn(0); 347 } 348 349 #undef __FUNCT__ 350 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 351 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 352 { 353 PetscErrorCode ierr; 354 355 PetscFunctionBegin; 356 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 357 if (!ia) PetscFunctionReturn(0); 358 359 if (!blockcompressed) { 360 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 361 } else if (symmetric) { 362 /* Since the indices are symmetric it does'nt matter */ 363 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 364 } else { 365 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 366 } 367 PetscFunctionReturn(0); 368 } 369 370 #undef __FUNCT__ 371 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 372 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 373 { 374 PetscErrorCode ierr; 375 376 PetscFunctionBegin; 377 if (!ia) PetscFunctionReturn(0); 378 if (!blockcompressed) { 379 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 380 } else { 381 ierr = PetscFree(*ia);CHKERRQ(ierr); 382 ierr = PetscFree(*ja);CHKERRQ(ierr); 383 } 384 PetscFunctionReturn(0); 385 } 386 387 /* ----------------------------------------------------------- */ 388 389 #undef __FUNCT__ 390 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 391 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 392 { 393 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 394 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 395 PetscScalar *y; 396 const PetscScalar *x; 397 const MatScalar *v1,*v2,*v3,*v4,*v5; 398 PetscErrorCode ierr; 399 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 400 401 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 402 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 403 #endif 404 405 PetscFunctionBegin; 406 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 407 node_max = a->inode.node_count; 408 ns = a->inode.size; /* Node Size array */ 409 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 410 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 411 idx = a->j; 412 v1 = a->a; 413 ii = a->i; 414 415 for (i = 0,row = 0; i< node_max; ++i) { 416 nsz = ns[i]; 417 n = ii[1] - ii[0]; 418 nonzerorow += (n>0)*nsz; 419 ii += nsz; 420 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 421 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 422 sz = n; /* No of non zeros in this row */ 423 /* Switch on the size of Node */ 424 switch (nsz) { /* Each loop in 'case' is unrolled */ 425 case 1 : 426 sum1 = 0.; 427 428 for (n = 0; n< sz-1; n+=2) { 429 i1 = idx[0]; /* The instructions are ordered to */ 430 i2 = idx[1]; /* make the compiler's job easy */ 431 idx += 2; 432 tmp0 = x[i1]; 433 tmp1 = x[i2]; 434 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 435 } 436 437 if (n == sz-1) { /* Take care of the last nonzero */ 438 tmp0 = x[*idx++]; 439 sum1 += *v1++ * tmp0; 440 } 441 y[row++]=sum1; 442 break; 443 case 2: 444 sum1 = 0.; 445 sum2 = 0.; 446 v2 = v1 + n; 447 448 for (n = 0; n< sz-1; n+=2) { 449 i1 = idx[0]; 450 i2 = idx[1]; 451 idx += 2; 452 tmp0 = x[i1]; 453 tmp1 = x[i2]; 454 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 455 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 456 } 457 if (n == sz-1) { 458 tmp0 = x[*idx++]; 459 sum1 += *v1++ * tmp0; 460 sum2 += *v2++ * tmp0; 461 } 462 y[row++]=sum1; 463 y[row++]=sum2; 464 v1 =v2; /* Since the next block to be processed starts there*/ 465 idx +=sz; 466 break; 467 case 3: 468 sum1 = 0.; 469 sum2 = 0.; 470 sum3 = 0.; 471 v2 = v1 + n; 472 v3 = v2 + n; 473 474 for (n = 0; n< sz-1; n+=2) { 475 i1 = idx[0]; 476 i2 = idx[1]; 477 idx += 2; 478 tmp0 = x[i1]; 479 tmp1 = x[i2]; 480 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 481 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 482 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 483 } 484 if (n == sz-1) { 485 tmp0 = x[*idx++]; 486 sum1 += *v1++ * tmp0; 487 sum2 += *v2++ * tmp0; 488 sum3 += *v3++ * tmp0; 489 } 490 y[row++]=sum1; 491 y[row++]=sum2; 492 y[row++]=sum3; 493 v1 =v3; /* Since the next block to be processed starts there*/ 494 idx +=2*sz; 495 break; 496 case 4: 497 sum1 = 0.; 498 sum2 = 0.; 499 sum3 = 0.; 500 sum4 = 0.; 501 v2 = v1 + n; 502 v3 = v2 + n; 503 v4 = v3 + n; 504 505 for (n = 0; n< sz-1; n+=2) { 506 i1 = idx[0]; 507 i2 = idx[1]; 508 idx += 2; 509 tmp0 = x[i1]; 510 tmp1 = x[i2]; 511 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 512 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 513 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 514 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 515 } 516 if (n == sz-1) { 517 tmp0 = x[*idx++]; 518 sum1 += *v1++ * tmp0; 519 sum2 += *v2++ * tmp0; 520 sum3 += *v3++ * tmp0; 521 sum4 += *v4++ * tmp0; 522 } 523 y[row++]=sum1; 524 y[row++]=sum2; 525 y[row++]=sum3; 526 y[row++]=sum4; 527 v1 =v4; /* Since the next block to be processed starts there*/ 528 idx +=3*sz; 529 break; 530 case 5: 531 sum1 = 0.; 532 sum2 = 0.; 533 sum3 = 0.; 534 sum4 = 0.; 535 sum5 = 0.; 536 v2 = v1 + n; 537 v3 = v2 + n; 538 v4 = v3 + n; 539 v5 = v4 + n; 540 541 for (n = 0; n<sz-1; n+=2) { 542 i1 = idx[0]; 543 i2 = idx[1]; 544 idx += 2; 545 tmp0 = x[i1]; 546 tmp1 = x[i2]; 547 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 548 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 549 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 550 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 551 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 552 } 553 if (n == sz-1) { 554 tmp0 = x[*idx++]; 555 sum1 += *v1++ * tmp0; 556 sum2 += *v2++ * tmp0; 557 sum3 += *v3++ * tmp0; 558 sum4 += *v4++ * tmp0; 559 sum5 += *v5++ * tmp0; 560 } 561 y[row++]=sum1; 562 y[row++]=sum2; 563 y[row++]=sum3; 564 y[row++]=sum4; 565 y[row++]=sum5; 566 v1 =v5; /* Since the next block to be processed starts there */ 567 idx +=4*sz; 568 break; 569 default : 570 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 571 } 572 } 573 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 574 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 575 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 576 PetscFunctionReturn(0); 577 } 578 /* ----------------------------------------------------------- */ 579 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 580 #undef __FUNCT__ 581 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 582 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 583 { 584 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 585 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 586 MatScalar *v1,*v2,*v3,*v4,*v5; 587 PetscScalar *x,*y,*z,*zt; 588 PetscErrorCode ierr; 589 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 590 591 PetscFunctionBegin; 592 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 593 node_max = a->inode.node_count; 594 ns = a->inode.size; /* Node Size array */ 595 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 596 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 597 if (zz != yy) { 598 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 599 } else { 600 z = y; 601 } 602 zt = z; 603 604 idx = a->j; 605 v1 = a->a; 606 ii = a->i; 607 608 for (i = 0,row = 0; i< node_max; ++i) { 609 nsz = ns[i]; 610 n = ii[1] - ii[0]; 611 ii += nsz; 612 sz = n; /* No of non zeros in this row */ 613 /* Switch on the size of Node */ 614 switch (nsz) { /* Each loop in 'case' is unrolled */ 615 case 1 : 616 sum1 = *zt++; 617 618 for (n = 0; n< sz-1; n+=2) { 619 i1 = idx[0]; /* The instructions are ordered to */ 620 i2 = idx[1]; /* make the compiler's job easy */ 621 idx += 2; 622 tmp0 = x[i1]; 623 tmp1 = x[i2]; 624 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 625 } 626 627 if (n == sz-1) { /* Take care of the last nonzero */ 628 tmp0 = x[*idx++]; 629 sum1 += *v1++ * tmp0; 630 } 631 y[row++]=sum1; 632 break; 633 case 2: 634 sum1 = *zt++; 635 sum2 = *zt++; 636 v2 = v1 + n; 637 638 for (n = 0; n< sz-1; n+=2) { 639 i1 = idx[0]; 640 i2 = idx[1]; 641 idx += 2; 642 tmp0 = x[i1]; 643 tmp1 = x[i2]; 644 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 645 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 646 } 647 if (n == sz-1) { 648 tmp0 = x[*idx++]; 649 sum1 += *v1++ * tmp0; 650 sum2 += *v2++ * tmp0; 651 } 652 y[row++]=sum1; 653 y[row++]=sum2; 654 v1 =v2; /* Since the next block to be processed starts there*/ 655 idx +=sz; 656 break; 657 case 3: 658 sum1 = *zt++; 659 sum2 = *zt++; 660 sum3 = *zt++; 661 v2 = v1 + n; 662 v3 = v2 + n; 663 664 for (n = 0; n< sz-1; n+=2) { 665 i1 = idx[0]; 666 i2 = idx[1]; 667 idx += 2; 668 tmp0 = x[i1]; 669 tmp1 = x[i2]; 670 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 671 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 672 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 673 } 674 if (n == sz-1) { 675 tmp0 = x[*idx++]; 676 sum1 += *v1++ * tmp0; 677 sum2 += *v2++ * tmp0; 678 sum3 += *v3++ * tmp0; 679 } 680 y[row++]=sum1; 681 y[row++]=sum2; 682 y[row++]=sum3; 683 v1 =v3; /* Since the next block to be processed starts there*/ 684 idx +=2*sz; 685 break; 686 case 4: 687 sum1 = *zt++; 688 sum2 = *zt++; 689 sum3 = *zt++; 690 sum4 = *zt++; 691 v2 = v1 + n; 692 v3 = v2 + n; 693 v4 = v3 + n; 694 695 for (n = 0; n< sz-1; n+=2) { 696 i1 = idx[0]; 697 i2 = idx[1]; 698 idx += 2; 699 tmp0 = x[i1]; 700 tmp1 = x[i2]; 701 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 702 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 703 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 704 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 705 } 706 if (n == sz-1) { 707 tmp0 = x[*idx++]; 708 sum1 += *v1++ * tmp0; 709 sum2 += *v2++ * tmp0; 710 sum3 += *v3++ * tmp0; 711 sum4 += *v4++ * tmp0; 712 } 713 y[row++]=sum1; 714 y[row++]=sum2; 715 y[row++]=sum3; 716 y[row++]=sum4; 717 v1 =v4; /* Since the next block to be processed starts there*/ 718 idx +=3*sz; 719 break; 720 case 5: 721 sum1 = *zt++; 722 sum2 = *zt++; 723 sum3 = *zt++; 724 sum4 = *zt++; 725 sum5 = *zt++; 726 v2 = v1 + n; 727 v3 = v2 + n; 728 v4 = v3 + n; 729 v5 = v4 + n; 730 731 for (n = 0; n<sz-1; n+=2) { 732 i1 = idx[0]; 733 i2 = idx[1]; 734 idx += 2; 735 tmp0 = x[i1]; 736 tmp1 = x[i2]; 737 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 738 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 739 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 740 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 741 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 742 } 743 if (n == sz-1) { 744 tmp0 = x[*idx++]; 745 sum1 += *v1++ * tmp0; 746 sum2 += *v2++ * tmp0; 747 sum3 += *v3++ * tmp0; 748 sum4 += *v4++ * tmp0; 749 sum5 += *v5++ * tmp0; 750 } 751 y[row++]=sum1; 752 y[row++]=sum2; 753 y[row++]=sum3; 754 y[row++]=sum4; 755 y[row++]=sum5; 756 v1 =v5; /* Since the next block to be processed starts there */ 757 idx +=4*sz; 758 break; 759 default : 760 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 761 } 762 } 763 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 764 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 765 if (zz != yy) { 766 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 767 } 768 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 769 PetscFunctionReturn(0); 770 } 771 772 /* ----------------------------------------------------------- */ 773 #undef __FUNCT__ 774 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 775 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 776 { 777 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 778 IS iscol = a->col,isrow = a->row; 779 PetscErrorCode ierr; 780 const PetscInt *r,*c,*rout,*cout; 781 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 782 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 783 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 784 PetscScalar sum1,sum2,sum3,sum4,sum5; 785 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 786 const PetscScalar *b; 787 788 PetscFunctionBegin; 789 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 790 node_max = a->inode.node_count; 791 ns = a->inode.size; /* Node Size array */ 792 793 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 794 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 795 tmp = a->solve_work; 796 797 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 798 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 799 800 /* forward solve the lower triangular */ 801 tmps = tmp ; 802 aa = a_a ; 803 aj = a_j ; 804 ad = a->diag; 805 806 for (i = 0,row = 0; i< node_max; ++i) { 807 nsz = ns[i]; 808 aii = ai[row]; 809 v1 = aa + aii; 810 vi = aj + aii; 811 nz = ad[row]- aii; 812 if (i < node_max-1) { 813 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 814 * but our indexing to determine it's size could. */ 815 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 816 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 817 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 818 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 819 } 820 821 switch (nsz) { /* Each loop in 'case' is unrolled */ 822 case 1 : 823 sum1 = b[*r++]; 824 for (j=0; j<nz-1; j+=2) { 825 i0 = vi[0]; 826 i1 = vi[1]; 827 vi +=2; 828 tmp0 = tmps[i0]; 829 tmp1 = tmps[i1]; 830 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 831 } 832 if (j == nz-1) { 833 tmp0 = tmps[*vi++]; 834 sum1 -= *v1++ *tmp0; 835 } 836 tmp[row ++]=sum1; 837 break; 838 case 2: 839 sum1 = b[*r++]; 840 sum2 = b[*r++]; 841 v2 = aa + ai[row+1]; 842 843 for (j=0; j<nz-1; j+=2) { 844 i0 = vi[0]; 845 i1 = vi[1]; 846 vi +=2; 847 tmp0 = tmps[i0]; 848 tmp1 = tmps[i1]; 849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 850 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 851 } 852 if (j == nz-1) { 853 tmp0 = tmps[*vi++]; 854 sum1 -= *v1++ *tmp0; 855 sum2 -= *v2++ *tmp0; 856 } 857 sum2 -= *v2++ * sum1; 858 tmp[row ++]=sum1; 859 tmp[row ++]=sum2; 860 break; 861 case 3: 862 sum1 = b[*r++]; 863 sum2 = b[*r++]; 864 sum3 = b[*r++]; 865 v2 = aa + ai[row+1]; 866 v3 = aa + ai[row+2]; 867 868 for (j=0; j<nz-1; j+=2) { 869 i0 = vi[0]; 870 i1 = vi[1]; 871 vi +=2; 872 tmp0 = tmps[i0]; 873 tmp1 = tmps[i1]; 874 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 875 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 876 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 877 } 878 if (j == nz-1) { 879 tmp0 = tmps[*vi++]; 880 sum1 -= *v1++ *tmp0; 881 sum2 -= *v2++ *tmp0; 882 sum3 -= *v3++ *tmp0; 883 } 884 sum2 -= *v2++ * sum1; 885 sum3 -= *v3++ * sum1; 886 sum3 -= *v3++ * sum2; 887 tmp[row ++]=sum1; 888 tmp[row ++]=sum2; 889 tmp[row ++]=sum3; 890 break; 891 892 case 4: 893 sum1 = b[*r++]; 894 sum2 = b[*r++]; 895 sum3 = b[*r++]; 896 sum4 = b[*r++]; 897 v2 = aa + ai[row+1]; 898 v3 = aa + ai[row+2]; 899 v4 = aa + ai[row+3]; 900 901 for (j=0; j<nz-1; j+=2) { 902 i0 = vi[0]; 903 i1 = vi[1]; 904 vi +=2; 905 tmp0 = tmps[i0]; 906 tmp1 = tmps[i1]; 907 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 908 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 909 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 910 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 911 } 912 if (j == nz-1) { 913 tmp0 = tmps[*vi++]; 914 sum1 -= *v1++ *tmp0; 915 sum2 -= *v2++ *tmp0; 916 sum3 -= *v3++ *tmp0; 917 sum4 -= *v4++ *tmp0; 918 } 919 sum2 -= *v2++ * sum1; 920 sum3 -= *v3++ * sum1; 921 sum4 -= *v4++ * sum1; 922 sum3 -= *v3++ * sum2; 923 sum4 -= *v4++ * sum2; 924 sum4 -= *v4++ * sum3; 925 926 tmp[row ++]=sum1; 927 tmp[row ++]=sum2; 928 tmp[row ++]=sum3; 929 tmp[row ++]=sum4; 930 break; 931 case 5: 932 sum1 = b[*r++]; 933 sum2 = b[*r++]; 934 sum3 = b[*r++]; 935 sum4 = b[*r++]; 936 sum5 = b[*r++]; 937 v2 = aa + ai[row+1]; 938 v3 = aa + ai[row+2]; 939 v4 = aa + ai[row+3]; 940 v5 = aa + ai[row+4]; 941 942 for (j=0; j<nz-1; j+=2) { 943 i0 = vi[0]; 944 i1 = vi[1]; 945 vi +=2; 946 tmp0 = tmps[i0]; 947 tmp1 = tmps[i1]; 948 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 949 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 950 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 951 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 952 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 953 } 954 if (j == nz-1) { 955 tmp0 = tmps[*vi++]; 956 sum1 -= *v1++ *tmp0; 957 sum2 -= *v2++ *tmp0; 958 sum3 -= *v3++ *tmp0; 959 sum4 -= *v4++ *tmp0; 960 sum5 -= *v5++ *tmp0; 961 } 962 963 sum2 -= *v2++ * sum1; 964 sum3 -= *v3++ * sum1; 965 sum4 -= *v4++ * sum1; 966 sum5 -= *v5++ * sum1; 967 sum3 -= *v3++ * sum2; 968 sum4 -= *v4++ * sum2; 969 sum5 -= *v5++ * sum2; 970 sum4 -= *v4++ * sum3; 971 sum5 -= *v5++ * sum3; 972 sum5 -= *v5++ * sum4; 973 974 tmp[row ++]=sum1; 975 tmp[row ++]=sum2; 976 tmp[row ++]=sum3; 977 tmp[row ++]=sum4; 978 tmp[row ++]=sum5; 979 break; 980 default: 981 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 982 } 983 } 984 /* backward solve the upper triangular */ 985 for (i=node_max -1 ,row = n-1 ; i>=0; i--) { 986 nsz = ns[i]; 987 aii = ai[row+1] -1; 988 v1 = aa + aii; 989 vi = aj + aii; 990 nz = aii- ad[row]; 991 switch (nsz) { /* Each loop in 'case' is unrolled */ 992 case 1 : 993 sum1 = tmp[row]; 994 995 for (j=nz ; j>1; j-=2) { 996 vi -=2; 997 i0 = vi[2]; 998 i1 = vi[1]; 999 tmp0 = tmps[i0]; 1000 tmp1 = tmps[i1]; 1001 v1 -= 2; 1002 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1003 } 1004 if (j==1) { 1005 tmp0 = tmps[*vi--]; 1006 sum1 -= *v1-- * tmp0; 1007 } 1008 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1009 break; 1010 case 2 : 1011 sum1 = tmp[row]; 1012 sum2 = tmp[row -1]; 1013 v2 = aa + ai[row]-1; 1014 for (j=nz ; j>1; j-=2) { 1015 vi -=2; 1016 i0 = vi[2]; 1017 i1 = vi[1]; 1018 tmp0 = tmps[i0]; 1019 tmp1 = tmps[i1]; 1020 v1 -= 2; 1021 v2 -= 2; 1022 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1023 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1024 } 1025 if (j==1) { 1026 tmp0 = tmps[*vi--]; 1027 sum1 -= *v1-- * tmp0; 1028 sum2 -= *v2-- * tmp0; 1029 } 1030 1031 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1032 sum2 -= *v2-- * tmp0; 1033 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1034 break; 1035 case 3 : 1036 sum1 = tmp[row]; 1037 sum2 = tmp[row -1]; 1038 sum3 = tmp[row -2]; 1039 v2 = aa + ai[row]-1; 1040 v3 = aa + ai[row -1]-1; 1041 for (j=nz ; j>1; j-=2) { 1042 vi -=2; 1043 i0 = vi[2]; 1044 i1 = vi[1]; 1045 tmp0 = tmps[i0]; 1046 tmp1 = tmps[i1]; 1047 v1 -= 2; 1048 v2 -= 2; 1049 v3 -= 2; 1050 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1051 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1052 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1053 } 1054 if (j==1) { 1055 tmp0 = tmps[*vi--]; 1056 sum1 -= *v1-- * tmp0; 1057 sum2 -= *v2-- * tmp0; 1058 sum3 -= *v3-- * tmp0; 1059 } 1060 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1061 sum2 -= *v2-- * tmp0; 1062 sum3 -= *v3-- * tmp0; 1063 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1064 sum3 -= *v3-- * tmp0; 1065 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1066 1067 break; 1068 case 4 : 1069 sum1 = tmp[row]; 1070 sum2 = tmp[row -1]; 1071 sum3 = tmp[row -2]; 1072 sum4 = tmp[row -3]; 1073 v2 = aa + ai[row]-1; 1074 v3 = aa + ai[row -1]-1; 1075 v4 = aa + ai[row -2]-1; 1076 1077 for (j=nz ; j>1; j-=2) { 1078 vi -=2; 1079 i0 = vi[2]; 1080 i1 = vi[1]; 1081 tmp0 = tmps[i0]; 1082 tmp1 = tmps[i1]; 1083 v1 -= 2; 1084 v2 -= 2; 1085 v3 -= 2; 1086 v4 -= 2; 1087 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1088 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1089 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1090 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1091 } 1092 if (j==1) { 1093 tmp0 = tmps[*vi--]; 1094 sum1 -= *v1-- * tmp0; 1095 sum2 -= *v2-- * tmp0; 1096 sum3 -= *v3-- * tmp0; 1097 sum4 -= *v4-- * tmp0; 1098 } 1099 1100 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1105 sum3 -= *v3-- * tmp0; 1106 sum4 -= *v4-- * tmp0; 1107 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1108 sum4 -= *v4-- * tmp0; 1109 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1110 break; 1111 case 5 : 1112 sum1 = tmp[row]; 1113 sum2 = tmp[row -1]; 1114 sum3 = tmp[row -2]; 1115 sum4 = tmp[row -3]; 1116 sum5 = tmp[row -4]; 1117 v2 = aa + ai[row]-1; 1118 v3 = aa + ai[row -1]-1; 1119 v4 = aa + ai[row -2]-1; 1120 v5 = aa + ai[row -3]-1; 1121 for (j=nz ; j>1; j-=2) { 1122 vi -= 2; 1123 i0 = vi[2]; 1124 i1 = vi[1]; 1125 tmp0 = tmps[i0]; 1126 tmp1 = tmps[i1]; 1127 v1 -= 2; 1128 v2 -= 2; 1129 v3 -= 2; 1130 v4 -= 2; 1131 v5 -= 2; 1132 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1133 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1134 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1135 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1136 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1137 } 1138 if (j==1) { 1139 tmp0 = tmps[*vi--]; 1140 sum1 -= *v1-- * tmp0; 1141 sum2 -= *v2-- * tmp0; 1142 sum3 -= *v3-- * tmp0; 1143 sum4 -= *v4-- * tmp0; 1144 sum5 -= *v5-- * tmp0; 1145 } 1146 1147 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1148 sum2 -= *v2-- * tmp0; 1149 sum3 -= *v3-- * tmp0; 1150 sum4 -= *v4-- * tmp0; 1151 sum5 -= *v5-- * tmp0; 1152 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1153 sum3 -= *v3-- * tmp0; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1157 sum4 -= *v4-- * tmp0; 1158 sum5 -= *v5-- * tmp0; 1159 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1160 sum5 -= *v5-- * tmp0; 1161 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1162 break; 1163 default: 1164 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1165 } 1166 } 1167 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1168 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1169 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1170 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1171 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1172 PetscFunctionReturn(0); 1173 } 1174 1175 #undef __FUNCT__ 1176 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1177 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1178 { 1179 Mat C=B; 1180 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1181 IS isrow = b->row,isicol = b->icol; 1182 PetscErrorCode ierr; 1183 const PetscInt *r,*ic,*ics; 1184 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1185 PetscInt i,j,k,nz,nzL,row,*pj; 1186 const PetscInt *ajtmp,*bjtmp; 1187 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1188 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1189 FactorShiftCtx sctx; 1190 const PetscInt *ddiag; 1191 PetscReal rs; 1192 MatScalar d; 1193 PetscInt inod,nodesz,node_max,col; 1194 const PetscInt *ns; 1195 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1196 1197 PetscFunctionBegin; 1198 /* MatPivotSetUp(): initialize shift context sctx */ 1199 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1200 1201 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1202 ddiag = a->diag; 1203 sctx.shift_top = info->zeropivot; 1204 for (i=0; i<n; i++) { 1205 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1206 d = (aa)[ddiag[i]]; 1207 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1208 v = aa+ai[i]; 1209 nz = ai[i+1] - ai[i]; 1210 for (j=0; j<nz; j++) 1211 rs += PetscAbsScalar(v[j]); 1212 if (rs>sctx.shift_top) sctx.shift_top = rs; 1213 } 1214 sctx.shift_top *= 1.1; 1215 sctx.nshift_max = 5; 1216 sctx.shift_lo = 0.; 1217 sctx.shift_hi = 1.; 1218 } 1219 1220 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1221 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1222 1223 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1224 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1225 rtmp2 = rtmp1 + n; 1226 rtmp3 = rtmp2 + n; 1227 rtmp4 = rtmp3 + n; 1228 ics = ic; 1229 1230 node_max = a->inode.node_count; 1231 ns = a->inode.size; 1232 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1233 1234 /* If max inode size > 4, split it into two inodes.*/ 1235 /* also map the inode sizes according to the ordering */ 1236 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1237 for (i=0,j=0; i<node_max; ++i,++j) { 1238 if (ns[i] > 4) { 1239 tmp_vec1[j] = 4; 1240 ++j; 1241 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1242 } else { 1243 tmp_vec1[j] = ns[i]; 1244 } 1245 } 1246 /* Use the correct node_max */ 1247 node_max = j; 1248 1249 /* Now reorder the inode info based on mat re-ordering info */ 1250 /* First create a row -> inode_size_array_index map */ 1251 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1252 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1253 for (i=0,row=0; i<node_max; i++) { 1254 nodesz = tmp_vec1[i]; 1255 for (j=0; j<nodesz; j++,row++) { 1256 nsmap[row] = i; 1257 } 1258 } 1259 /* Using nsmap, create a reordered ns structure */ 1260 for (i=0,j=0; i< node_max; i++) { 1261 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1262 tmp_vec2[i] = nodesz; 1263 j += nodesz; 1264 } 1265 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1266 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1267 1268 /* Now use the correct ns */ 1269 ns = tmp_vec2; 1270 1271 do { 1272 sctx.newshift = PETSC_FALSE; 1273 /* Now loop over each block-row, and do the factorization */ 1274 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1275 nodesz = ns[inod]; 1276 1277 switch (nodesz) { 1278 case 1: 1279 /*----------*/ 1280 /* zero rtmp1 */ 1281 /* L part */ 1282 nz = bi[i+1] - bi[i]; 1283 bjtmp = bj + bi[i]; 1284 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1285 1286 /* U part */ 1287 nz = bdiag[i]-bdiag[i+1]; 1288 bjtmp = bj + bdiag[i+1]+1; 1289 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1290 1291 /* load in initial (unfactored row) */ 1292 nz = ai[r[i]+1] - ai[r[i]]; 1293 ajtmp = aj + ai[r[i]]; 1294 v = aa + ai[r[i]]; 1295 for (j=0; j<nz; j++) { 1296 rtmp1[ics[ajtmp[j]]] = v[j]; 1297 } 1298 /* ZeropivotApply() */ 1299 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1300 1301 /* elimination */ 1302 bjtmp = bj + bi[i]; 1303 row = *bjtmp++; 1304 nzL = bi[i+1] - bi[i]; 1305 for (k=0; k < nzL;k++) { 1306 pc = rtmp1 + row; 1307 if (*pc != 0.0) { 1308 pv = b->a + bdiag[row]; 1309 mul1 = *pc * (*pv); 1310 *pc = mul1; 1311 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1312 pv = b->a + bdiag[row+1]+1; 1313 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1314 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1315 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1316 } 1317 row = *bjtmp++; 1318 } 1319 1320 /* finished row so stick it into b->a */ 1321 rs = 0.0; 1322 /* L part */ 1323 pv = b->a + bi[i] ; 1324 pj = b->j + bi[i] ; 1325 nz = bi[i+1] - bi[i]; 1326 for (j=0; j<nz; j++) { 1327 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1328 } 1329 1330 /* U part */ 1331 pv = b->a + bdiag[i+1]+1; 1332 pj = b->j + bdiag[i+1]+1; 1333 nz = bdiag[i] - bdiag[i+1]-1; 1334 for (j=0; j<nz; j++) { 1335 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1336 } 1337 1338 /* Check zero pivot */ 1339 sctx.rs = rs; 1340 sctx.pv = rtmp1[i]; 1341 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1342 if (sctx.newshift) break; 1343 1344 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1345 pv = b->a + bdiag[i]; 1346 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1347 break; 1348 1349 case 2: 1350 /*----------*/ 1351 /* zero rtmp1 and rtmp2 */ 1352 /* L part */ 1353 nz = bi[i+1] - bi[i]; 1354 bjtmp = bj + bi[i]; 1355 for (j=0; j<nz; j++) { 1356 col = bjtmp[j]; 1357 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1358 } 1359 1360 /* U part */ 1361 nz = bdiag[i]-bdiag[i+1]; 1362 bjtmp = bj + bdiag[i+1]+1; 1363 for (j=0; j<nz; j++) { 1364 col = bjtmp[j]; 1365 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1366 } 1367 1368 /* load in initial (unfactored row) */ 1369 nz = ai[r[i]+1] - ai[r[i]]; 1370 ajtmp = aj + ai[r[i]]; 1371 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1372 for (j=0; j<nz; j++) { 1373 col = ics[ajtmp[j]]; 1374 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1375 } 1376 /* ZeropivotApply(): shift the diagonal of the matrix */ 1377 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1378 1379 /* elimination */ 1380 bjtmp = bj + bi[i]; 1381 row = *bjtmp++; /* pivot row */ 1382 nzL = bi[i+1] - bi[i]; 1383 for (k=0; k < nzL;k++) { 1384 pc1 = rtmp1 + row; 1385 pc2 = rtmp2 + row; 1386 if (*pc1 != 0.0 || *pc2 != 0.0) { 1387 pv = b->a + bdiag[row]; 1388 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1389 *pc1 = mul1; *pc2 = mul2; 1390 1391 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1392 pv = b->a + bdiag[row+1]+1; 1393 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1394 for (j=0; j<nz; j++) { 1395 col = pj[j]; 1396 rtmp1[col] -= mul1 * pv[j]; 1397 rtmp2[col] -= mul2 * pv[j]; 1398 } 1399 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1400 } 1401 row = *bjtmp++; 1402 } 1403 1404 /* finished row i; check zero pivot, then stick row i into b->a */ 1405 rs = 0.0; 1406 /* L part */ 1407 pc1 = b->a + bi[i]; 1408 pj = b->j + bi[i] ; 1409 nz = bi[i+1] - bi[i]; 1410 for (j=0; j<nz; j++) { 1411 col = pj[j]; 1412 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1413 } 1414 /* U part */ 1415 pc1 = b->a + bdiag[i+1]+1; 1416 pj = b->j + bdiag[i+1]+1; 1417 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1418 for (j=0; j<nz; j++) { 1419 col = pj[j]; 1420 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1421 } 1422 1423 sctx.rs = rs; 1424 sctx.pv = rtmp1[i]; 1425 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1426 if (sctx.newshift) break; 1427 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1428 *pc1 = 1.0/sctx.pv; 1429 1430 /* Now take care of diagonal 2x2 block. */ 1431 pc2 = rtmp2 + i; 1432 if (*pc2 != 0.0) { 1433 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1434 *pc2 = mul1; /* insert L entry */ 1435 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1436 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1437 for (j=0; j<nz; j++) { 1438 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1439 } 1440 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1441 } 1442 1443 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1444 rs = 0.0; 1445 /* L part */ 1446 pc2 = b->a + bi[i+1]; 1447 pj = b->j + bi[i+1] ; 1448 nz = bi[i+2] - bi[i+1]; 1449 for (j=0; j<nz; j++) { 1450 col = pj[j]; 1451 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1452 } 1453 /* U part */ 1454 pc2 = b->a + bdiag[i+2]+1; 1455 pj = b->j + bdiag[i+2]+1; 1456 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1457 for (j=0; j<nz; j++) { 1458 col = pj[j]; 1459 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1460 } 1461 1462 sctx.rs = rs; 1463 sctx.pv = rtmp2[i+1]; 1464 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1465 if (sctx.newshift) break; 1466 pc2 = b->a + bdiag[i+1]; 1467 *pc2 = 1.0/sctx.pv; 1468 break; 1469 1470 case 3: 1471 /*----------*/ 1472 /* zero rtmp */ 1473 /* L part */ 1474 nz = bi[i+1] - bi[i]; 1475 bjtmp = bj + bi[i]; 1476 for (j=0; j<nz; j++) { 1477 col = bjtmp[j]; 1478 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1479 } 1480 1481 /* U part */ 1482 nz = bdiag[i]-bdiag[i+1]; 1483 bjtmp = bj + bdiag[i+1]+1; 1484 for (j=0; j<nz; j++) { 1485 col = bjtmp[j]; 1486 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1487 } 1488 1489 /* load in initial (unfactored row) */ 1490 nz = ai[r[i]+1] - ai[r[i]]; 1491 ajtmp = aj + ai[r[i]]; 1492 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1493 for (j=0; j<nz; j++) { 1494 col = ics[ajtmp[j]]; 1495 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1496 } 1497 /* ZeropivotApply(): shift the diagonal of the matrix */ 1498 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1499 1500 /* elimination */ 1501 bjtmp = bj + bi[i]; 1502 row = *bjtmp++; /* pivot row */ 1503 nzL = bi[i+1] - bi[i]; 1504 for (k=0; k < nzL;k++) { 1505 pc1 = rtmp1 + row; 1506 pc2 = rtmp2 + row; 1507 pc3 = rtmp3 + row; 1508 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1509 pv = b->a + bdiag[row]; 1510 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1511 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1512 1513 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1514 pv = b->a + bdiag[row+1]+1; 1515 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1516 for (j=0; j<nz; j++) { 1517 col = pj[j]; 1518 rtmp1[col] -= mul1 * pv[j]; 1519 rtmp2[col] -= mul2 * pv[j]; 1520 rtmp3[col] -= mul3 * pv[j]; 1521 } 1522 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1523 } 1524 row = *bjtmp++; 1525 } 1526 1527 /* finished row i; check zero pivot, then stick row i into b->a */ 1528 rs = 0.0; 1529 /* L part */ 1530 pc1 = b->a + bi[i]; 1531 pj = b->j + bi[i] ; 1532 nz = bi[i+1] - bi[i]; 1533 for (j=0; j<nz; j++) { 1534 col = pj[j]; 1535 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1536 } 1537 /* U part */ 1538 pc1 = b->a + bdiag[i+1]+1; 1539 pj = b->j + bdiag[i+1]+1; 1540 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1541 for (j=0; j<nz; j++) { 1542 col = pj[j]; 1543 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1544 } 1545 1546 sctx.rs = rs; 1547 sctx.pv = rtmp1[i]; 1548 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1549 if (sctx.newshift) break; 1550 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1551 *pc1 = 1.0/sctx.pv; 1552 1553 /* Now take care of 1st column of diagonal 3x3 block. */ 1554 pc2 = rtmp2 + i; 1555 pc3 = rtmp3 + i; 1556 if (*pc2 != 0.0 || *pc3 != 0.0) { 1557 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1558 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1559 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1560 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1561 for (j=0; j<nz; j++) { 1562 col = pj[j]; 1563 rtmp2[col] -= mul2 * rtmp1[col]; 1564 rtmp3[col] -= mul3 * rtmp1[col]; 1565 } 1566 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1567 } 1568 1569 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1570 rs = 0.0; 1571 /* L part */ 1572 pc2 = b->a + bi[i+1]; 1573 pj = b->j + bi[i+1] ; 1574 nz = bi[i+2] - bi[i+1]; 1575 for (j=0; j<nz; j++) { 1576 col = pj[j]; 1577 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1578 } 1579 /* U part */ 1580 pc2 = b->a + bdiag[i+2]+1; 1581 pj = b->j + bdiag[i+2]+1; 1582 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1583 for (j=0; j<nz; j++) { 1584 col = pj[j]; 1585 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1586 } 1587 1588 sctx.rs = rs; 1589 sctx.pv = rtmp2[i+1]; 1590 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1591 if (sctx.newshift) break; 1592 pc2 = b->a + bdiag[i+1]; 1593 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1594 1595 /* Now take care of 2nd column of diagonal 3x3 block. */ 1596 pc3 = rtmp3 + i+1; 1597 if (*pc3 != 0.0) { 1598 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1599 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1600 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1601 for (j=0; j<nz; j++) { 1602 col = pj[j]; 1603 rtmp3[col] -= mul3 * rtmp2[col]; 1604 } 1605 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1606 } 1607 1608 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1609 rs = 0.0; 1610 /* L part */ 1611 pc3 = b->a + bi[i+2]; 1612 pj = b->j + bi[i+2] ; 1613 nz = bi[i+3] - bi[i+2]; 1614 for (j=0; j<nz; j++) { 1615 col = pj[j]; 1616 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1617 } 1618 /* U part */ 1619 pc3 = b->a + bdiag[i+3]+1; 1620 pj = b->j + bdiag[i+3]+1; 1621 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1622 for (j=0; j<nz; j++) { 1623 col = pj[j]; 1624 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1625 } 1626 1627 sctx.rs = rs; 1628 sctx.pv = rtmp3[i+2]; 1629 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1630 if (sctx.newshift) break; 1631 pc3 = b->a + bdiag[i+2]; 1632 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1633 break; 1634 case 4: 1635 /*----------*/ 1636 /* zero rtmp */ 1637 /* L part */ 1638 nz = bi[i+1] - bi[i]; 1639 bjtmp = bj + bi[i]; 1640 for (j=0; j<nz; j++) { 1641 col = bjtmp[j]; 1642 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1643 } 1644 1645 /* U part */ 1646 nz = bdiag[i]-bdiag[i+1]; 1647 bjtmp = bj + bdiag[i+1]+1; 1648 for (j=0; j<nz; j++) { 1649 col = bjtmp[j]; 1650 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1651 } 1652 1653 /* load in initial (unfactored row) */ 1654 nz = ai[r[i]+1] - ai[r[i]]; 1655 ajtmp = aj + ai[r[i]]; 1656 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1657 for (j=0; j<nz; j++) { 1658 col = ics[ajtmp[j]]; 1659 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1660 } 1661 /* ZeropivotApply(): shift the diagonal of the matrix */ 1662 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1663 1664 /* elimination */ 1665 bjtmp = bj + bi[i]; 1666 row = *bjtmp++; /* pivot row */ 1667 nzL = bi[i+1] - bi[i]; 1668 for (k=0; k < nzL;k++) { 1669 pc1 = rtmp1 + row; 1670 pc2 = rtmp2 + row; 1671 pc3 = rtmp3 + row; 1672 pc4 = rtmp4 + row; 1673 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1674 pv = b->a + bdiag[row]; 1675 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1676 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1677 1678 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1679 pv = b->a + bdiag[row+1]+1; 1680 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1681 for (j=0; j<nz; j++) { 1682 col = pj[j]; 1683 rtmp1[col] -= mul1 * pv[j]; 1684 rtmp2[col] -= mul2 * pv[j]; 1685 rtmp3[col] -= mul3 * pv[j]; 1686 rtmp4[col] -= mul4 * pv[j]; 1687 } 1688 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1689 } 1690 row = *bjtmp++; 1691 } 1692 1693 /* finished row i; check zero pivot, then stick row i into b->a */ 1694 rs = 0.0; 1695 /* L part */ 1696 pc1 = b->a + bi[i]; 1697 pj = b->j + bi[i] ; 1698 nz = bi[i+1] - bi[i]; 1699 for (j=0; j<nz; j++) { 1700 col = pj[j]; 1701 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1702 } 1703 /* U part */ 1704 pc1 = b->a + bdiag[i+1]+1; 1705 pj = b->j + bdiag[i+1]+1; 1706 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1707 for (j=0; j<nz; j++) { 1708 col = pj[j]; 1709 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1710 } 1711 1712 sctx.rs = rs; 1713 sctx.pv = rtmp1[i]; 1714 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1715 if (sctx.newshift) break; 1716 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1717 *pc1 = 1.0/sctx.pv; 1718 1719 /* Now take care of 1st column of diagonal 4x4 block. */ 1720 pc2 = rtmp2 + i; 1721 pc3 = rtmp3 + i; 1722 pc4 = rtmp4 + i; 1723 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1724 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1725 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1726 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1727 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1728 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1729 for (j=0; j<nz; j++) { 1730 col = pj[j]; 1731 rtmp2[col] -= mul2 * rtmp1[col]; 1732 rtmp3[col] -= mul3 * rtmp1[col]; 1733 rtmp4[col] -= mul4 * rtmp1[col]; 1734 } 1735 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1736 } 1737 1738 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1739 rs = 0.0; 1740 /* L part */ 1741 pc2 = b->a + bi[i+1]; 1742 pj = b->j + bi[i+1] ; 1743 nz = bi[i+2] - bi[i+1]; 1744 for (j=0; j<nz; j++) { 1745 col = pj[j]; 1746 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1747 } 1748 /* U part */ 1749 pc2 = b->a + bdiag[i+2]+1; 1750 pj = b->j + bdiag[i+2]+1; 1751 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1752 for (j=0; j<nz; j++) { 1753 col = pj[j]; 1754 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1755 } 1756 1757 sctx.rs = rs; 1758 sctx.pv = rtmp2[i+1]; 1759 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1760 if (sctx.newshift) break; 1761 pc2 = b->a + bdiag[i+1]; 1762 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1763 1764 /* Now take care of 2nd column of diagonal 4x4 block. */ 1765 pc3 = rtmp3 + i+1; 1766 pc4 = rtmp4 + i+1; 1767 if (*pc3 != 0.0 || *pc4 != 0.0) { 1768 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1769 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1770 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1771 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1772 for (j=0; j<nz; j++) { 1773 col = pj[j]; 1774 rtmp3[col] -= mul3 * rtmp2[col]; 1775 rtmp4[col] -= mul4 * rtmp2[col]; 1776 } 1777 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1778 } 1779 1780 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1781 rs = 0.0; 1782 /* L part */ 1783 pc3 = b->a + bi[i+2]; 1784 pj = b->j + bi[i+2] ; 1785 nz = bi[i+3] - bi[i+2]; 1786 for (j=0; j<nz; j++) { 1787 col = pj[j]; 1788 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1789 } 1790 /* U part */ 1791 pc3 = b->a + bdiag[i+3]+1; 1792 pj = b->j + bdiag[i+3]+1; 1793 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1794 for (j=0; j<nz; j++) { 1795 col = pj[j]; 1796 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1797 } 1798 1799 sctx.rs = rs; 1800 sctx.pv = rtmp3[i+2]; 1801 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1802 if (sctx.newshift) break; 1803 pc3 = b->a + bdiag[i+2]; 1804 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1805 1806 /* Now take care of 3rd column of diagonal 4x4 block. */ 1807 pc4 = rtmp4 + i+2; 1808 if (*pc4 != 0.0) { 1809 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1810 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1811 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1812 for (j=0; j<nz; j++) { 1813 col = pj[j]; 1814 rtmp4[col] -= mul4 * rtmp3[col]; 1815 } 1816 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1817 } 1818 1819 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1820 rs = 0.0; 1821 /* L part */ 1822 pc4 = b->a + bi[i+3]; 1823 pj = b->j + bi[i+3] ; 1824 nz = bi[i+4] - bi[i+3]; 1825 for (j=0; j<nz; j++) { 1826 col = pj[j]; 1827 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1828 } 1829 /* U part */ 1830 pc4 = b->a + bdiag[i+4]+1; 1831 pj = b->j + bdiag[i+4]+1; 1832 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1833 for (j=0; j<nz; j++) { 1834 col = pj[j]; 1835 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1836 } 1837 1838 sctx.rs = rs; 1839 sctx.pv = rtmp4[i+3]; 1840 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1841 if (sctx.newshift) break; 1842 pc4 = b->a + bdiag[i+3]; 1843 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1844 break; 1845 1846 default: 1847 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1848 } 1849 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1850 i += nodesz; /* Update the row */ 1851 } 1852 1853 /* MatPivotRefine() */ 1854 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1855 /* 1856 * if no shift in this attempt & shifting & started shifting & can refine, 1857 * then try lower shift 1858 */ 1859 sctx.shift_hi = sctx.shift_fraction; 1860 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1861 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1862 sctx.newshift = PETSC_TRUE; 1863 sctx.nshift++; 1864 } 1865 } while (sctx.newshift); 1866 1867 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1868 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1869 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1870 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1871 1872 C->ops->solve = MatSolve_SeqAIJ; 1873 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1874 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1875 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1876 C->ops->matsolve = MatMatSolve_SeqAIJ; 1877 C->assembled = PETSC_TRUE; 1878 C->preallocated = PETSC_TRUE; 1879 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1880 1881 /* MatShiftView(A,info,&sctx) */ 1882 if (sctx.nshift) { 1883 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1884 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1885 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1886 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1887 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1888 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1889 } 1890 } 1891 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1892 PetscFunctionReturn(0); 1893 } 1894 1895 #undef __FUNCT__ 1896 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1897 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1898 { 1899 Mat C = B; 1900 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1901 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1902 PetscErrorCode ierr; 1903 const PetscInt *r,*ic,*c,*ics; 1904 PetscInt n = A->rmap->n,*bi = b->i; 1905 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1906 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1907 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1908 PetscScalar mul1,mul2,mul3,tmp; 1909 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1910 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1911 PetscReal rs=0.0; 1912 FactorShiftCtx sctx; 1913 1914 PetscFunctionBegin; 1915 sctx.shift_top = 0; 1916 sctx.nshift_max = 0; 1917 sctx.shift_lo = 0; 1918 sctx.shift_hi = 0; 1919 sctx.shift_fraction = 0; 1920 1921 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1922 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1923 sctx.shift_top = 0; 1924 for (i=0; i<n; i++) { 1925 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1926 rs = 0.0; 1927 ajtmp = aj + ai[i]; 1928 rtmp1 = aa + ai[i]; 1929 nz = ai[i+1] - ai[i]; 1930 for (j=0; j<nz; j++) { 1931 if (*ajtmp != i) { 1932 rs += PetscAbsScalar(*rtmp1++); 1933 } else { 1934 rs -= PetscRealPart(*rtmp1++); 1935 } 1936 ajtmp++; 1937 } 1938 if (rs>sctx.shift_top) sctx.shift_top = rs; 1939 } 1940 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1941 sctx.shift_top *= 1.1; 1942 sctx.nshift_max = 5; 1943 sctx.shift_lo = 0.; 1944 sctx.shift_hi = 1.; 1945 } 1946 sctx.shift_amount = 0; 1947 sctx.nshift = 0; 1948 1949 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1950 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1951 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1952 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1953 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1954 ics = ic ; 1955 rtmp22 = rtmp11 + n; 1956 rtmp33 = rtmp22 + n; 1957 1958 node_max = a->inode.node_count; 1959 ns = a->inode.size; 1960 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1961 1962 /* If max inode size > 3, split it into two inodes.*/ 1963 /* also map the inode sizes according to the ordering */ 1964 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1965 for (i=0,j=0; i<node_max; ++i,++j) { 1966 if (ns[i]>3) { 1967 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1968 ++j; 1969 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1970 } else { 1971 tmp_vec1[j] = ns[i]; 1972 } 1973 } 1974 /* Use the correct node_max */ 1975 node_max = j; 1976 1977 /* Now reorder the inode info based on mat re-ordering info */ 1978 /* First create a row -> inode_size_array_index map */ 1979 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1980 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1981 for (i=0,row=0; i<node_max; i++) { 1982 nodesz = tmp_vec1[i]; 1983 for (j=0; j<nodesz; j++,row++) { 1984 nsmap[row] = i; 1985 } 1986 } 1987 /* Using nsmap, create a reordered ns structure */ 1988 for (i=0,j=0; i< node_max; i++) { 1989 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1990 tmp_vec2[i] = nodesz; 1991 j += nodesz; 1992 } 1993 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1994 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1995 /* Now use the correct ns */ 1996 ns = tmp_vec2; 1997 1998 do { 1999 sctx.newshift = PETSC_FALSE; 2000 /* Now loop over each block-row, and do the factorization */ 2001 for (i=0,row=0; i<node_max; i++) { 2002 nodesz = ns[i]; 2003 nz = bi[row+1] - bi[row]; 2004 bjtmp = bj + bi[row]; 2005 2006 switch (nodesz) { 2007 case 1: 2008 for (j=0; j<nz; j++) { 2009 idx = bjtmp[j]; 2010 rtmp11[idx] = 0.0; 2011 } 2012 2013 /* load in initial (unfactored row) */ 2014 idx = r[row]; 2015 nz_tmp = ai[idx+1] - ai[idx]; 2016 ajtmp = aj + ai[idx]; 2017 v1 = aa + ai[idx]; 2018 2019 for (j=0; j<nz_tmp; j++) { 2020 idx = ics[ajtmp[j]]; 2021 rtmp11[idx] = v1[j]; 2022 } 2023 rtmp11[ics[r[row]]] += sctx.shift_amount; 2024 2025 prow = *bjtmp++ ; 2026 while (prow < row) { 2027 pc1 = rtmp11 + prow; 2028 if (*pc1 != 0.0) { 2029 pv = ba + bd[prow]; 2030 pj = nbj + bd[prow]; 2031 mul1 = *pc1 * *pv++; 2032 *pc1 = mul1; 2033 nz_tmp = bi[prow+1] - bd[prow] - 1; 2034 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2035 for (j=0; j<nz_tmp; j++) { 2036 tmp = pv[j]; 2037 idx = pj[j]; 2038 rtmp11[idx] -= mul1 * tmp; 2039 } 2040 } 2041 prow = *bjtmp++ ; 2042 } 2043 pj = bj + bi[row]; 2044 pc1 = ba + bi[row]; 2045 2046 sctx.pv = rtmp11[row]; 2047 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2048 rs = 0.0; 2049 for (j=0; j<nz; j++) { 2050 idx = pj[j]; 2051 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2052 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2053 } 2054 sctx.rs = rs; 2055 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2056 if (sctx.newshift) goto endofwhile; 2057 break; 2058 2059 case 2: 2060 for (j=0; j<nz; j++) { 2061 idx = bjtmp[j]; 2062 rtmp11[idx] = 0.0; 2063 rtmp22[idx] = 0.0; 2064 } 2065 2066 /* load in initial (unfactored row) */ 2067 idx = r[row]; 2068 nz_tmp = ai[idx+1] - ai[idx]; 2069 ajtmp = aj + ai[idx]; 2070 v1 = aa + ai[idx]; 2071 v2 = aa + ai[idx+1]; 2072 for (j=0; j<nz_tmp; j++) { 2073 idx = ics[ajtmp[j]]; 2074 rtmp11[idx] = v1[j]; 2075 rtmp22[idx] = v2[j]; 2076 } 2077 rtmp11[ics[r[row]]] += sctx.shift_amount; 2078 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2079 2080 prow = *bjtmp++ ; 2081 while (prow < row) { 2082 pc1 = rtmp11 + prow; 2083 pc2 = rtmp22 + prow; 2084 if (*pc1 != 0.0 || *pc2 != 0.0) { 2085 pv = ba + bd[prow]; 2086 pj = nbj + bd[prow]; 2087 mul1 = *pc1 * *pv; 2088 mul2 = *pc2 * *pv; 2089 ++pv; 2090 *pc1 = mul1; 2091 *pc2 = mul2; 2092 2093 nz_tmp = bi[prow+1] - bd[prow] - 1; 2094 for (j=0; j<nz_tmp; j++) { 2095 tmp = pv[j]; 2096 idx = pj[j]; 2097 rtmp11[idx] -= mul1 * tmp; 2098 rtmp22[idx] -= mul2 * tmp; 2099 } 2100 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2101 } 2102 prow = *bjtmp++ ; 2103 } 2104 2105 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2106 pc1 = rtmp11 + prow; 2107 pc2 = rtmp22 + prow; 2108 2109 sctx.pv = *pc1; 2110 pj = bj + bi[prow]; 2111 rs = 0.0; 2112 for (j=0; j<nz; j++) { 2113 idx = pj[j]; 2114 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2115 } 2116 sctx.rs = rs; 2117 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2118 if (sctx.newshift) goto endofwhile; 2119 2120 if (*pc2 != 0.0) { 2121 pj = nbj + bd[prow]; 2122 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2123 *pc2 = mul2; 2124 nz_tmp = bi[prow+1] - bd[prow] - 1; 2125 for (j=0; j<nz_tmp; j++) { 2126 idx = pj[j] ; 2127 tmp = rtmp11[idx]; 2128 rtmp22[idx] -= mul2 * tmp; 2129 } 2130 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2131 } 2132 2133 pj = bj + bi[row]; 2134 pc1 = ba + bi[row]; 2135 pc2 = ba + bi[row+1]; 2136 2137 sctx.pv = rtmp22[row+1]; 2138 rs = 0.0; 2139 rtmp11[row] = 1.0/rtmp11[row]; 2140 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2141 /* copy row entries from dense representation to sparse */ 2142 for (j=0; j<nz; j++) { 2143 idx = pj[j]; 2144 pc1[j] = rtmp11[idx]; 2145 pc2[j] = rtmp22[idx]; 2146 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2147 } 2148 sctx.rs = rs; 2149 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2150 if (sctx.newshift) goto endofwhile; 2151 break; 2152 2153 case 3: 2154 for (j=0; j<nz; j++) { 2155 idx = bjtmp[j]; 2156 rtmp11[idx] = 0.0; 2157 rtmp22[idx] = 0.0; 2158 rtmp33[idx] = 0.0; 2159 } 2160 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2161 idx = r[row]; 2162 nz_tmp = ai[idx+1] - ai[idx]; 2163 ajtmp = aj + ai[idx]; 2164 v1 = aa + ai[idx]; 2165 v2 = aa + ai[idx+1]; 2166 v3 = aa + ai[idx+2]; 2167 for (j=0; j<nz_tmp; j++) { 2168 idx = ics[ajtmp[j]]; 2169 rtmp11[idx] = v1[j]; 2170 rtmp22[idx] = v2[j]; 2171 rtmp33[idx] = v3[j]; 2172 } 2173 rtmp11[ics[r[row]]] += sctx.shift_amount; 2174 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2175 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2176 2177 /* loop over all pivot row blocks above this row block */ 2178 prow = *bjtmp++ ; 2179 while (prow < row) { 2180 pc1 = rtmp11 + prow; 2181 pc2 = rtmp22 + prow; 2182 pc3 = rtmp33 + prow; 2183 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2184 pv = ba + bd[prow]; 2185 pj = nbj + bd[prow]; 2186 mul1 = *pc1 * *pv; 2187 mul2 = *pc2 * *pv; 2188 mul3 = *pc3 * *pv; 2189 ++pv; 2190 *pc1 = mul1; 2191 *pc2 = mul2; 2192 *pc3 = mul3; 2193 2194 nz_tmp = bi[prow+1] - bd[prow] - 1; 2195 /* update this row based on pivot row */ 2196 for (j=0; j<nz_tmp; j++) { 2197 tmp = pv[j]; 2198 idx = pj[j]; 2199 rtmp11[idx] -= mul1 * tmp; 2200 rtmp22[idx] -= mul2 * tmp; 2201 rtmp33[idx] -= mul3 * tmp; 2202 } 2203 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2204 } 2205 prow = *bjtmp++ ; 2206 } 2207 2208 /* Now take care of diagonal 3x3 block in this set of rows */ 2209 /* note: prow = row here */ 2210 pc1 = rtmp11 + prow; 2211 pc2 = rtmp22 + prow; 2212 pc3 = rtmp33 + prow; 2213 2214 sctx.pv = *pc1; 2215 pj = bj + bi[prow]; 2216 rs = 0.0; 2217 for (j=0; j<nz; j++) { 2218 idx = pj[j]; 2219 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2220 } 2221 sctx.rs = rs; 2222 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2223 if (sctx.newshift) goto endofwhile; 2224 2225 if (*pc2 != 0.0 || *pc3 != 0.0) { 2226 mul2 = (*pc2)/(*pc1); 2227 mul3 = (*pc3)/(*pc1); 2228 *pc2 = mul2; 2229 *pc3 = mul3; 2230 nz_tmp = bi[prow+1] - bd[prow] - 1; 2231 pj = nbj + bd[prow]; 2232 for (j=0; j<nz_tmp; j++) { 2233 idx = pj[j] ; 2234 tmp = rtmp11[idx]; 2235 rtmp22[idx] -= mul2 * tmp; 2236 rtmp33[idx] -= mul3 * tmp; 2237 } 2238 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2239 } 2240 ++prow; 2241 2242 pc2 = rtmp22 + prow; 2243 pc3 = rtmp33 + prow; 2244 sctx.pv = *pc2; 2245 pj = bj + bi[prow]; 2246 rs = 0.0; 2247 for (j=0; j<nz; j++) { 2248 idx = pj[j]; 2249 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2250 } 2251 sctx.rs = rs; 2252 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2253 if (sctx.newshift) goto endofwhile; 2254 2255 if (*pc3 != 0.0) { 2256 mul3 = (*pc3)/(*pc2); 2257 *pc3 = mul3; 2258 pj = nbj + bd[prow]; 2259 nz_tmp = bi[prow+1] - bd[prow] - 1; 2260 for (j=0; j<nz_tmp; j++) { 2261 idx = pj[j] ; 2262 tmp = rtmp22[idx]; 2263 rtmp33[idx] -= mul3 * tmp; 2264 } 2265 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2266 } 2267 2268 pj = bj + bi[row]; 2269 pc1 = ba + bi[row]; 2270 pc2 = ba + bi[row+1]; 2271 pc3 = ba + bi[row+2]; 2272 2273 sctx.pv = rtmp33[row+2]; 2274 rs = 0.0; 2275 rtmp11[row] = 1.0/rtmp11[row]; 2276 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2277 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2278 /* copy row entries from dense representation to sparse */ 2279 for (j=0; j<nz; j++) { 2280 idx = pj[j]; 2281 pc1[j] = rtmp11[idx]; 2282 pc2[j] = rtmp22[idx]; 2283 pc3[j] = rtmp33[idx]; 2284 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2285 } 2286 2287 sctx.rs = rs; 2288 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2289 if (sctx.newshift) goto endofwhile; 2290 break; 2291 2292 default: 2293 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2294 } 2295 row += nodesz; /* Update the row */ 2296 } 2297 endofwhile:; 2298 } while (sctx.newshift); 2299 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2300 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2301 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2302 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2303 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2304 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2305 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2306 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2307 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2308 C->assembled = PETSC_TRUE; 2309 C->preallocated = PETSC_TRUE; 2310 if (sctx.nshift) { 2311 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2312 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2313 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2314 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2315 } 2316 } 2317 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2318 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2319 PetscFunctionReturn(0); 2320 } 2321 2322 2323 /* ----------------------------------------------------------- */ 2324 #undef __FUNCT__ 2325 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2326 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2327 { 2328 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2329 IS iscol = a->col,isrow = a->row; 2330 PetscErrorCode ierr; 2331 const PetscInt *r,*c,*rout,*cout; 2332 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2333 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2334 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2335 PetscScalar sum1,sum2,sum3,sum4,sum5; 2336 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2337 const PetscScalar *b; 2338 2339 PetscFunctionBegin; 2340 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2341 node_max = a->inode.node_count; 2342 ns = a->inode.size; /* Node Size array */ 2343 2344 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2345 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2346 tmp = a->solve_work; 2347 2348 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2349 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2350 2351 /* forward solve the lower triangular */ 2352 tmps = tmp ; 2353 aa = a_a ; 2354 aj = a_j ; 2355 ad = a->diag; 2356 2357 for (i = 0,row = 0; i< node_max; ++i) { 2358 nsz = ns[i]; 2359 aii = ai[row]; 2360 v1 = aa + aii; 2361 vi = aj + aii; 2362 nz = ai[row+1]- ai[row]; 2363 2364 if (i < node_max-1) { 2365 /* Prefetch the indices for the next block */ 2366 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2367 /* Prefetch the data for the next block */ 2368 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2369 } 2370 2371 switch (nsz) { /* Each loop in 'case' is unrolled */ 2372 case 1 : 2373 sum1 = b[r[row]]; 2374 for (j=0; j<nz-1; j+=2) { 2375 i0 = vi[j]; 2376 i1 = vi[j+1]; 2377 tmp0 = tmps[i0]; 2378 tmp1 = tmps[i1]; 2379 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2380 } 2381 if (j == nz-1) { 2382 tmp0 = tmps[vi[j]]; 2383 sum1 -= v1[j]*tmp0; 2384 } 2385 tmp[row++]=sum1; 2386 break; 2387 case 2: 2388 sum1 = b[r[row]]; 2389 sum2 = b[r[row+1]]; 2390 v2 = aa + ai[row+1]; 2391 2392 for (j=0; j<nz-1; j+=2) { 2393 i0 = vi[j]; 2394 i1 = vi[j+1]; 2395 tmp0 = tmps[i0]; 2396 tmp1 = tmps[i1]; 2397 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2398 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2399 } 2400 if (j == nz-1) { 2401 tmp0 = tmps[vi[j]]; 2402 sum1 -= v1[j] *tmp0; 2403 sum2 -= v2[j] *tmp0; 2404 } 2405 sum2 -= v2[nz] * sum1; 2406 tmp[row ++]=sum1; 2407 tmp[row ++]=sum2; 2408 break; 2409 case 3: 2410 sum1 = b[r[row]]; 2411 sum2 = b[r[row+1]]; 2412 sum3 = b[r[row+2]]; 2413 v2 = aa + ai[row+1]; 2414 v3 = aa + ai[row+2]; 2415 2416 for (j=0; j<nz-1; j+=2) { 2417 i0 = vi[j]; 2418 i1 = vi[j+1]; 2419 tmp0 = tmps[i0]; 2420 tmp1 = tmps[i1]; 2421 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2422 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2423 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2424 } 2425 if (j == nz-1) { 2426 tmp0 = tmps[vi[j]]; 2427 sum1 -= v1[j] *tmp0; 2428 sum2 -= v2[j] *tmp0; 2429 sum3 -= v3[j] *tmp0; 2430 } 2431 sum2 -= v2[nz] * sum1; 2432 sum3 -= v3[nz] * sum1; 2433 sum3 -= v3[nz+1] * sum2; 2434 tmp[row ++]=sum1; 2435 tmp[row ++]=sum2; 2436 tmp[row ++]=sum3; 2437 break; 2438 2439 case 4: 2440 sum1 = b[r[row]]; 2441 sum2 = b[r[row+1]]; 2442 sum3 = b[r[row+2]]; 2443 sum4 = b[r[row+3]]; 2444 v2 = aa + ai[row+1]; 2445 v3 = aa + ai[row+2]; 2446 v4 = aa + ai[row+3]; 2447 2448 for (j=0; j<nz-1; j+=2) { 2449 i0 = vi[j]; 2450 i1 = vi[j+1]; 2451 tmp0 = tmps[i0]; 2452 tmp1 = tmps[i1]; 2453 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2454 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2455 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2456 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2457 } 2458 if (j == nz-1) { 2459 tmp0 = tmps[vi[j]]; 2460 sum1 -= v1[j] *tmp0; 2461 sum2 -= v2[j] *tmp0; 2462 sum3 -= v3[j] *tmp0; 2463 sum4 -= v4[j] *tmp0; 2464 } 2465 sum2 -= v2[nz] * sum1; 2466 sum3 -= v3[nz] * sum1; 2467 sum4 -= v4[nz] * sum1; 2468 sum3 -= v3[nz+1] * sum2; 2469 sum4 -= v4[nz+1] * sum2; 2470 sum4 -= v4[nz+2] * sum3; 2471 2472 tmp[row ++]=sum1; 2473 tmp[row ++]=sum2; 2474 tmp[row ++]=sum3; 2475 tmp[row ++]=sum4; 2476 break; 2477 case 5: 2478 sum1 = b[r[row]]; 2479 sum2 = b[r[row+1]]; 2480 sum3 = b[r[row+2]]; 2481 sum4 = b[r[row+3]]; 2482 sum5 = b[r[row+4]]; 2483 v2 = aa + ai[row+1]; 2484 v3 = aa + ai[row+2]; 2485 v4 = aa + ai[row+3]; 2486 v5 = aa + ai[row+4]; 2487 2488 for (j=0; j<nz-1; j+=2) { 2489 i0 = vi[j]; 2490 i1 = vi[j+1]; 2491 tmp0 = tmps[i0]; 2492 tmp1 = tmps[i1]; 2493 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2494 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2495 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2496 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2497 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2498 } 2499 if (j == nz-1) { 2500 tmp0 = tmps[vi[j]]; 2501 sum1 -= v1[j] *tmp0; 2502 sum2 -= v2[j] *tmp0; 2503 sum3 -= v3[j] *tmp0; 2504 sum4 -= v4[j] *tmp0; 2505 sum5 -= v5[j] *tmp0; 2506 } 2507 2508 sum2 -= v2[nz] * sum1; 2509 sum3 -= v3[nz] * sum1; 2510 sum4 -= v4[nz] * sum1; 2511 sum5 -= v5[nz] * sum1; 2512 sum3 -= v3[nz+1] * sum2; 2513 sum4 -= v4[nz+1] * sum2; 2514 sum5 -= v5[nz+1] * sum2; 2515 sum4 -= v4[nz+2] * sum3; 2516 sum5 -= v5[nz+2] * sum3; 2517 sum5 -= v5[nz+3] * sum4; 2518 2519 tmp[row ++]=sum1; 2520 tmp[row ++]=sum2; 2521 tmp[row ++]=sum3; 2522 tmp[row ++]=sum4; 2523 tmp[row ++]=sum5; 2524 break; 2525 default: 2526 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2527 } 2528 } 2529 /* backward solve the upper triangular */ 2530 for (i=node_max -1 ,row = n-1 ; i>=0; i--) { 2531 nsz = ns[i]; 2532 aii = ad[row+1] + 1; 2533 v1 = aa + aii; 2534 vi = aj + aii; 2535 nz = ad[row]- ad[row+1] - 1; 2536 2537 if (i > 0) { 2538 /* Prefetch the indices for the next block */ 2539 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2540 /* Prefetch the data for the next block */ 2541 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2542 } 2543 2544 switch (nsz) { /* Each loop in 'case' is unrolled */ 2545 case 1 : 2546 sum1 = tmp[row]; 2547 2548 for (j=0 ; j<nz-1; j+=2) { 2549 i0 = vi[j]; 2550 i1 = vi[j+1]; 2551 tmp0 = tmps[i0]; 2552 tmp1 = tmps[i1]; 2553 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2554 } 2555 if (j == nz-1) { 2556 tmp0 = tmps[vi[j]]; 2557 sum1 -= v1[j]*tmp0; 2558 } 2559 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2560 break; 2561 case 2 : 2562 sum1 = tmp[row]; 2563 sum2 = tmp[row-1]; 2564 v2 = aa + ad[row] + 1; 2565 for (j=0 ; j<nz-1; j+=2) { 2566 i0 = vi[j]; 2567 i1 = vi[j+1]; 2568 tmp0 = tmps[i0]; 2569 tmp1 = tmps[i1]; 2570 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2571 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2572 } 2573 if (j == nz-1) { 2574 tmp0 = tmps[vi[j]]; 2575 sum1 -= v1[j]* tmp0; 2576 sum2 -= v2[j+1]* tmp0; 2577 } 2578 2579 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2580 sum2 -= v2[0] * tmp0; 2581 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2582 break; 2583 case 3 : 2584 sum1 = tmp[row]; 2585 sum2 = tmp[row -1]; 2586 sum3 = tmp[row -2]; 2587 v2 = aa + ad[row] + 1; 2588 v3 = aa + ad[row -1] + 1; 2589 for (j=0 ; j<nz-1; j+=2) { 2590 i0 = vi[j]; 2591 i1 = vi[j+1]; 2592 tmp0 = tmps[i0]; 2593 tmp1 = tmps[i1]; 2594 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2595 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2596 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2597 } 2598 if (j== nz-1) { 2599 tmp0 = tmps[vi[j]]; 2600 sum1 -= v1[j] * tmp0; 2601 sum2 -= v2[j+1] * tmp0; 2602 sum3 -= v3[j+2] * tmp0; 2603 } 2604 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2605 sum2 -= v2[0]* tmp0; 2606 sum3 -= v3[1] * tmp0; 2607 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2608 sum3 -= v3[0]* tmp0; 2609 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2610 2611 break; 2612 case 4 : 2613 sum1 = tmp[row]; 2614 sum2 = tmp[row -1]; 2615 sum3 = tmp[row -2]; 2616 sum4 = tmp[row -3]; 2617 v2 = aa + ad[row]+1; 2618 v3 = aa + ad[row -1]+1; 2619 v4 = aa + ad[row -2]+1; 2620 2621 for (j=0 ; j<nz-1; j+=2) { 2622 i0 = vi[j]; 2623 i1 = vi[j+1]; 2624 tmp0 = tmps[i0]; 2625 tmp1 = tmps[i1]; 2626 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2627 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2628 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2629 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2630 } 2631 if (j== nz-1) { 2632 tmp0 = tmps[vi[j]]; 2633 sum1 -= v1[j] * tmp0; 2634 sum2 -= v2[j+1] * tmp0; 2635 sum3 -= v3[j+2] * tmp0; 2636 sum4 -= v4[j+3] * tmp0; 2637 } 2638 2639 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2640 sum2 -= v2[0] * tmp0; 2641 sum3 -= v3[1] * tmp0; 2642 sum4 -= v4[2] * tmp0; 2643 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2644 sum3 -= v3[0] * tmp0; 2645 sum4 -= v4[1] * tmp0; 2646 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2647 sum4 -= v4[0] * tmp0; 2648 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2649 break; 2650 case 5 : 2651 sum1 = tmp[row]; 2652 sum2 = tmp[row -1]; 2653 sum3 = tmp[row -2]; 2654 sum4 = tmp[row -3]; 2655 sum5 = tmp[row -4]; 2656 v2 = aa + ad[row]+1; 2657 v3 = aa + ad[row -1]+1; 2658 v4 = aa + ad[row -2]+1; 2659 v5 = aa + ad[row -3]+1; 2660 for (j=0 ; j<nz-1; j+=2) { 2661 i0 = vi[j]; 2662 i1 = vi[j+1]; 2663 tmp0 = tmps[i0]; 2664 tmp1 = tmps[i1]; 2665 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2666 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2667 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2668 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2669 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2670 } 2671 if (j==nz-1) { 2672 tmp0 = tmps[vi[j]]; 2673 sum1 -= v1[j] * tmp0; 2674 sum2 -= v2[j+1] * tmp0; 2675 sum3 -= v3[j+2] * tmp0; 2676 sum4 -= v4[j+3] * tmp0; 2677 sum5 -= v5[j+4] * tmp0; 2678 } 2679 2680 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2681 sum2 -= v2[0] * tmp0; 2682 sum3 -= v3[1] * tmp0; 2683 sum4 -= v4[2] * tmp0; 2684 sum5 -= v5[3] * tmp0; 2685 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2686 sum3 -= v3[0] * tmp0; 2687 sum4 -= v4[1] * tmp0; 2688 sum5 -= v5[2] * tmp0; 2689 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2690 sum4 -= v4[0] * tmp0; 2691 sum5 -= v5[1] * tmp0; 2692 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2693 sum5 -= v5[0] * tmp0; 2694 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2695 break; 2696 default: 2697 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2698 } 2699 } 2700 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2701 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2702 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2703 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2704 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2705 PetscFunctionReturn(0); 2706 } 2707 2708 2709 /* 2710 Makes a longer coloring[] array and calls the usual code with that 2711 */ 2712 #undef __FUNCT__ 2713 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2714 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2715 { 2716 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2717 PetscErrorCode ierr; 2718 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2719 PetscInt *colorused,i; 2720 ISColoringValue *newcolor; 2721 2722 PetscFunctionBegin; 2723 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2724 /* loop over inodes, marking a color for each column*/ 2725 row = 0; 2726 for (i=0; i<m; i++) { 2727 for (j=0; j<ns[i]; j++) { 2728 newcolor[row++] = coloring[i] + j*ncolors; 2729 } 2730 } 2731 2732 /* eliminate unneeded colors */ 2733 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2734 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2735 for (i=0; i<n; i++) { 2736 colorused[newcolor[i]] = 1; 2737 } 2738 2739 for (i=1; i<5*ncolors; i++) { 2740 colorused[i] += colorused[i-1]; 2741 } 2742 ncolors = colorused[5*ncolors-1]; 2743 for (i=0; i<n; i++) { 2744 newcolor[i] = colorused[newcolor[i]]-1; 2745 } 2746 ierr = PetscFree(colorused);CHKERRQ(ierr); 2747 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2748 ierr = PetscFree(coloring);CHKERRQ(ierr); 2749 PetscFunctionReturn(0); 2750 } 2751 2752 #include <../src/mat/blockinvert.h> 2753 2754 #undef __FUNCT__ 2755 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2756 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2757 { 2758 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2759 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2760 MatScalar *ibdiag,*bdiag,work[25],*t; 2761 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2762 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2763 const PetscScalar *xb, *b; 2764 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2765 PetscErrorCode ierr; 2766 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2767 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2768 2769 PetscFunctionBegin; 2770 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2771 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2772 2773 if (!a->inode.ibdiagvalid) { 2774 if (!a->inode.ibdiag) { 2775 /* calculate space needed for diagonal blocks */ 2776 for (i=0; i<m; i++) { 2777 cnt += sizes[i]*sizes[i]; 2778 } 2779 a->inode.bdiagsize = cnt; 2780 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2781 } 2782 2783 /* copy over the diagonal blocks and invert them */ 2784 ibdiag = a->inode.ibdiag; 2785 bdiag = a->inode.bdiag; 2786 cnt = 0; 2787 for (i=0, row = 0; i<m; i++) { 2788 for (j=0; j<sizes[i]; j++) { 2789 for (k=0; k<sizes[i]; k++) { 2790 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2791 } 2792 } 2793 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2794 2795 switch (sizes[i]) { 2796 case 1: 2797 /* Create matrix data structure */ 2798 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2799 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2800 break; 2801 case 2: 2802 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2803 break; 2804 case 3: 2805 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2806 break; 2807 case 4: 2808 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2809 break; 2810 case 5: 2811 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2812 break; 2813 default: 2814 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2815 } 2816 cnt += sizes[i]*sizes[i]; 2817 row += sizes[i]; 2818 } 2819 a->inode.ibdiagvalid = PETSC_TRUE; 2820 } 2821 ibdiag = a->inode.ibdiag; 2822 bdiag = a->inode.bdiag; 2823 t = a->inode.ssor_work; 2824 2825 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2826 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2827 CHKMEMQ; 2828 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2829 if (flag & SOR_ZERO_INITIAL_GUESS) { 2830 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2831 2832 for (i=0, row=0; i<m; i++) { 2833 sz = diag[row] - ii[row]; 2834 v1 = a->a + ii[row]; 2835 idx = a->j + ii[row]; 2836 2837 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2838 switch (sizes[i]) { 2839 case 1: 2840 2841 sum1 = b[row]; 2842 for (n = 0; n<sz-1; n+=2) { 2843 i1 = idx[0]; 2844 i2 = idx[1]; 2845 idx += 2; 2846 tmp0 = x[i1]; 2847 tmp1 = x[i2]; 2848 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2849 } 2850 2851 if (n == sz-1) { 2852 tmp0 = x[*idx]; 2853 sum1 -= *v1 * tmp0; 2854 } 2855 t[row] = sum1; 2856 x[row++] = sum1*(*ibdiag++); 2857 break; 2858 case 2: 2859 v2 = a->a + ii[row+1]; 2860 sum1 = b[row]; 2861 sum2 = b[row+1]; 2862 for (n = 0; n<sz-1; n+=2) { 2863 i1 = idx[0]; 2864 i2 = idx[1]; 2865 idx += 2; 2866 tmp0 = x[i1]; 2867 tmp1 = x[i2]; 2868 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2869 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2870 } 2871 2872 if (n == sz-1) { 2873 tmp0 = x[*idx]; 2874 sum1 -= v1[0] * tmp0; 2875 sum2 -= v2[0] * tmp0; 2876 } 2877 t[row] = sum1; 2878 t[row+1] = sum2; 2879 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2880 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2881 ibdiag += 4; 2882 break; 2883 case 3: 2884 v2 = a->a + ii[row+1]; 2885 v3 = a->a + ii[row+2]; 2886 sum1 = b[row]; 2887 sum2 = b[row+1]; 2888 sum3 = b[row+2]; 2889 for (n = 0; n<sz-1; n+=2) { 2890 i1 = idx[0]; 2891 i2 = idx[1]; 2892 idx += 2; 2893 tmp0 = x[i1]; 2894 tmp1 = x[i2]; 2895 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2896 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2897 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2898 } 2899 2900 if (n == sz-1) { 2901 tmp0 = x[*idx]; 2902 sum1 -= v1[0] * tmp0; 2903 sum2 -= v2[0] * tmp0; 2904 sum3 -= v3[0] * tmp0; 2905 } 2906 t[row] = sum1; 2907 t[row+1] = sum2; 2908 t[row+2] = sum3; 2909 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2910 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2911 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2912 ibdiag += 9; 2913 break; 2914 case 4: 2915 v2 = a->a + ii[row+1]; 2916 v3 = a->a + ii[row+2]; 2917 v4 = a->a + ii[row+3]; 2918 sum1 = b[row]; 2919 sum2 = b[row+1]; 2920 sum3 = b[row+2]; 2921 sum4 = b[row+3]; 2922 for (n = 0; n<sz-1; n+=2) { 2923 i1 = idx[0]; 2924 i2 = idx[1]; 2925 idx += 2; 2926 tmp0 = x[i1]; 2927 tmp1 = x[i2]; 2928 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2929 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2930 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2931 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2932 } 2933 2934 if (n == sz-1) { 2935 tmp0 = x[*idx]; 2936 sum1 -= v1[0] * tmp0; 2937 sum2 -= v2[0] * tmp0; 2938 sum3 -= v3[0] * tmp0; 2939 sum4 -= v4[0] * tmp0; 2940 } 2941 t[row] = sum1; 2942 t[row+1] = sum2; 2943 t[row+2] = sum3; 2944 t[row+3] = sum4; 2945 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2946 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2947 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2948 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2949 ibdiag += 16; 2950 break; 2951 case 5: 2952 v2 = a->a + ii[row+1]; 2953 v3 = a->a + ii[row+2]; 2954 v4 = a->a + ii[row+3]; 2955 v5 = a->a + ii[row+4]; 2956 sum1 = b[row]; 2957 sum2 = b[row+1]; 2958 sum3 = b[row+2]; 2959 sum4 = b[row+3]; 2960 sum5 = b[row+4]; 2961 for (n = 0; n<sz-1; n+=2) { 2962 i1 = idx[0]; 2963 i2 = idx[1]; 2964 idx += 2; 2965 tmp0 = x[i1]; 2966 tmp1 = x[i2]; 2967 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2968 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2969 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2970 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2971 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2972 } 2973 2974 if (n == sz-1) { 2975 tmp0 = x[*idx]; 2976 sum1 -= v1[0] * tmp0; 2977 sum2 -= v2[0] * tmp0; 2978 sum3 -= v3[0] * tmp0; 2979 sum4 -= v4[0] * tmp0; 2980 sum5 -= v5[0] * tmp0; 2981 } 2982 t[row] = sum1; 2983 t[row+1] = sum2; 2984 t[row+2] = sum3; 2985 t[row+3] = sum4; 2986 t[row+4] = sum5; 2987 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2988 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2989 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2990 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2991 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2992 ibdiag += 25; 2993 break; 2994 default: 2995 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2996 } 2997 } 2998 2999 xb = t; 3000 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3001 } else xb = b; 3002 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3003 3004 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3005 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3006 ibdiag -= sizes[i]*sizes[i]; 3007 sz = ii[row+1] - diag[row] - 1; 3008 v1 = a->a + diag[row] + 1; 3009 idx = a->j + diag[row] + 1; 3010 3011 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3012 switch (sizes[i]) { 3013 case 1: 3014 3015 sum1 = xb[row]; 3016 for (n = 0; n<sz-1; n+=2) { 3017 i1 = idx[0]; 3018 i2 = idx[1]; 3019 idx += 2; 3020 tmp0 = x[i1]; 3021 tmp1 = x[i2]; 3022 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3023 } 3024 3025 if (n == sz-1) { 3026 tmp0 = x[*idx]; 3027 sum1 -= *v1*tmp0; 3028 } 3029 x[row--] = sum1*(*ibdiag); 3030 break; 3031 3032 case 2: 3033 3034 sum1 = xb[row]; 3035 sum2 = xb[row-1]; 3036 /* note that sum1 is associated with the second of the two rows */ 3037 v2 = a->a + diag[row-1] + 2; 3038 for (n = 0; n<sz-1; n+=2) { 3039 i1 = idx[0]; 3040 i2 = idx[1]; 3041 idx += 2; 3042 tmp0 = x[i1]; 3043 tmp1 = x[i2]; 3044 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3045 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3046 } 3047 3048 if (n == sz-1) { 3049 tmp0 = x[*idx]; 3050 sum1 -= *v1*tmp0; 3051 sum2 -= *v2*tmp0; 3052 } 3053 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3054 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3055 break; 3056 case 3: 3057 3058 sum1 = xb[row]; 3059 sum2 = xb[row-1]; 3060 sum3 = xb[row-2]; 3061 v2 = a->a + diag[row-1] + 2; 3062 v3 = a->a + diag[row-2] + 3; 3063 for (n = 0; n<sz-1; n+=2) { 3064 i1 = idx[0]; 3065 i2 = idx[1]; 3066 idx += 2; 3067 tmp0 = x[i1]; 3068 tmp1 = x[i2]; 3069 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3070 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3071 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3072 } 3073 3074 if (n == sz-1) { 3075 tmp0 = x[*idx]; 3076 sum1 -= *v1*tmp0; 3077 sum2 -= *v2*tmp0; 3078 sum3 -= *v3*tmp0; 3079 } 3080 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3081 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3082 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3083 break; 3084 case 4: 3085 3086 sum1 = xb[row]; 3087 sum2 = xb[row-1]; 3088 sum3 = xb[row-2]; 3089 sum4 = xb[row-3]; 3090 v2 = a->a + diag[row-1] + 2; 3091 v3 = a->a + diag[row-2] + 3; 3092 v4 = a->a + diag[row-3] + 4; 3093 for (n = 0; n<sz-1; n+=2) { 3094 i1 = idx[0]; 3095 i2 = idx[1]; 3096 idx += 2; 3097 tmp0 = x[i1]; 3098 tmp1 = x[i2]; 3099 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3100 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3101 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3102 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3103 } 3104 3105 if (n == sz-1) { 3106 tmp0 = x[*idx]; 3107 sum1 -= *v1*tmp0; 3108 sum2 -= *v2*tmp0; 3109 sum3 -= *v3*tmp0; 3110 sum4 -= *v4*tmp0; 3111 } 3112 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3113 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3114 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3115 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3116 break; 3117 case 5: 3118 3119 sum1 = xb[row]; 3120 sum2 = xb[row-1]; 3121 sum3 = xb[row-2]; 3122 sum4 = xb[row-3]; 3123 sum5 = xb[row-4]; 3124 v2 = a->a + diag[row-1] + 2; 3125 v3 = a->a + diag[row-2] + 3; 3126 v4 = a->a + diag[row-3] + 4; 3127 v5 = a->a + diag[row-4] + 5; 3128 for (n = 0; n<sz-1; n+=2) { 3129 i1 = idx[0]; 3130 i2 = idx[1]; 3131 idx += 2; 3132 tmp0 = x[i1]; 3133 tmp1 = x[i2]; 3134 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3135 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3136 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3137 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3138 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3139 } 3140 3141 if (n == sz-1) { 3142 tmp0 = x[*idx]; 3143 sum1 -= *v1*tmp0; 3144 sum2 -= *v2*tmp0; 3145 sum3 -= *v3*tmp0; 3146 sum4 -= *v4*tmp0; 3147 sum5 -= *v5*tmp0; 3148 } 3149 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3150 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3151 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3152 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3153 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3154 break; 3155 default: 3156 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3157 } 3158 } 3159 3160 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3161 } 3162 its--; 3163 } 3164 while (its--) { 3165 3166 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3167 ibdiag = a->inode.ibdiag; 3168 3169 for (i=0, row=0; i<m; i++) { 3170 sz = ii[row + 1] - ii[row]; 3171 v1 = a->a + ii[row]; 3172 idx = a->j + ii[row]; 3173 3174 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3175 switch (sizes[i]) { 3176 case 1: 3177 3178 sum1 = b[row]; 3179 for (n = 0; n<sz-1; n+=2) { 3180 i1 = idx[0]; 3181 i2 = idx[1]; 3182 idx += 2; 3183 tmp0 = x[i1]; 3184 tmp1 = x[i2]; 3185 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3186 } 3187 3188 if (n == sz-1) { 3189 tmp0 = x[*idx]; 3190 sum1 -= *v1 * tmp0; 3191 } 3192 3193 /* in MatSOR_SeqAIJ this line would be 3194 * 3195 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3196 * 3197 * but omega == 1, so this becomes 3198 * 3199 * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++); 3200 * 3201 * but bdiag and ibdiag cancel each other, so we can change this 3202 * to adding sum1*(*ibdiag++). We can skip bdiag for the larger 3203 * block sizes as well 3204 */ 3205 x[row++] += sum1*(*ibdiag++); 3206 break; 3207 case 2: 3208 v2 = a->a + ii[row+1]; 3209 sum1 = b[row]; 3210 sum2 = b[row+1]; 3211 for (n = 0; n<sz-1; n+=2) { 3212 i1 = idx[0]; 3213 i2 = idx[1]; 3214 idx += 2; 3215 tmp0 = x[i1]; 3216 tmp1 = x[i2]; 3217 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3218 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3219 } 3220 3221 if (n == sz-1) { 3222 tmp0 = x[*idx]; 3223 sum1 -= v1[0] * tmp0; 3224 sum2 -= v2[0] * tmp0; 3225 } 3226 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2]; 3227 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3]; 3228 ibdiag += 4; 3229 break; 3230 case 3: 3231 v2 = a->a + ii[row+1]; 3232 v3 = a->a + ii[row+2]; 3233 sum1 = b[row]; 3234 sum2 = b[row+1]; 3235 sum3 = b[row+2]; 3236 for (n = 0; n<sz-1; n+=2) { 3237 i1 = idx[0]; 3238 i2 = idx[1]; 3239 idx += 2; 3240 tmp0 = x[i1]; 3241 tmp1 = x[i2]; 3242 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3243 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3244 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3245 } 3246 3247 if (n == sz-1) { 3248 tmp0 = x[*idx]; 3249 sum1 -= v1[0] * tmp0; 3250 sum2 -= v2[0] * tmp0; 3251 sum3 -= v3[0] * tmp0; 3252 } 3253 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3254 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3255 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3256 ibdiag += 9; 3257 break; 3258 case 4: 3259 v2 = a->a + ii[row+1]; 3260 v3 = a->a + ii[row+2]; 3261 v4 = a->a + ii[row+3]; 3262 sum1 = b[row]; 3263 sum2 = b[row+1]; 3264 sum3 = b[row+2]; 3265 sum4 = b[row+3]; 3266 for (n = 0; n<sz-1; n+=2) { 3267 i1 = idx[0]; 3268 i2 = idx[1]; 3269 idx += 2; 3270 tmp0 = x[i1]; 3271 tmp1 = x[i2]; 3272 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3273 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3274 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3275 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3276 } 3277 3278 if (n == sz-1) { 3279 tmp0 = x[*idx]; 3280 sum1 -= v1[0] * tmp0; 3281 sum2 -= v2[0] * tmp0; 3282 sum3 -= v3[0] * tmp0; 3283 sum4 -= v4[0] * tmp0; 3284 } 3285 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3286 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3287 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3288 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3289 ibdiag += 16; 3290 break; 3291 case 5: 3292 v2 = a->a + ii[row+1]; 3293 v3 = a->a + ii[row+2]; 3294 v4 = a->a + ii[row+3]; 3295 v5 = a->a + ii[row+4]; 3296 sum1 = b[row]; 3297 sum2 = b[row+1]; 3298 sum3 = b[row+2]; 3299 sum4 = b[row+3]; 3300 sum5 = b[row+4]; 3301 for (n = 0; n<sz-1; n+=2) { 3302 i1 = idx[0]; 3303 i2 = idx[1]; 3304 idx += 2; 3305 tmp0 = x[i1]; 3306 tmp1 = x[i2]; 3307 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3308 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3309 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3310 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3311 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3312 } 3313 3314 if (n == sz-1) { 3315 tmp0 = x[*idx]; 3316 sum1 -= v1[0] * tmp0; 3317 sum2 -= v2[0] * tmp0; 3318 sum3 -= v3[0] * tmp0; 3319 sum4 -= v4[0] * tmp0; 3320 sum5 -= v5[0] * tmp0; 3321 } 3322 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3323 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3324 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3325 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3326 x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3327 ibdiag += 25; 3328 break; 3329 default: 3330 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3331 } 3332 } 3333 3334 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3335 } 3336 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3337 3338 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3339 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3340 ibdiag -= sizes[i]*sizes[i]; 3341 sz = ii[row+1] - ii[row]; 3342 v1 = a->a + ii[row]; 3343 idx = a->j + ii[row]; 3344 3345 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3346 switch (sizes[i]) { 3347 case 1: 3348 3349 sum1 = b[row]; 3350 for (n = 0; n<sz-1; n+=2) { 3351 i1 = idx[0]; 3352 i2 = idx[1]; 3353 idx += 2; 3354 tmp0 = x[i1]; 3355 tmp1 = x[i2]; 3356 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3357 } 3358 3359 if (n == sz-1) { 3360 tmp0 = x[*idx]; 3361 sum1 -= *v1*tmp0; 3362 } 3363 x[row--] += sum1*(*ibdiag); 3364 break; 3365 3366 case 2: 3367 3368 sum1 = b[row]; 3369 sum2 = b[row-1]; 3370 /* note that sum1 is associated with the second of the two rows */ 3371 v2 = a->a + ii[row - 1]; 3372 for (n = 0; n<sz-1; n+=2) { 3373 i1 = idx[0]; 3374 i2 = idx[1]; 3375 idx += 2; 3376 tmp0 = x[i1]; 3377 tmp1 = x[i2]; 3378 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3379 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3380 } 3381 3382 if (n == sz-1) { 3383 tmp0 = x[*idx]; 3384 sum1 -= *v1*tmp0; 3385 sum2 -= *v2*tmp0; 3386 } 3387 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3388 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3389 break; 3390 case 3: 3391 3392 sum1 = b[row]; 3393 sum2 = b[row-1]; 3394 sum3 = b[row-2]; 3395 v2 = a->a + ii[row-1]; 3396 v3 = a->a + ii[row-2]; 3397 for (n = 0; n<sz-1; n+=2) { 3398 i1 = idx[0]; 3399 i2 = idx[1]; 3400 idx += 2; 3401 tmp0 = x[i1]; 3402 tmp1 = x[i2]; 3403 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3404 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3405 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3406 } 3407 3408 if (n == sz-1) { 3409 tmp0 = x[*idx]; 3410 sum1 -= *v1*tmp0; 3411 sum2 -= *v2*tmp0; 3412 sum3 -= *v3*tmp0; 3413 } 3414 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3415 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3416 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3417 break; 3418 case 4: 3419 3420 sum1 = b[row]; 3421 sum2 = b[row-1]; 3422 sum3 = b[row-2]; 3423 sum4 = b[row-3]; 3424 v2 = a->a + ii[row-1]; 3425 v3 = a->a + ii[row-2]; 3426 v4 = a->a + ii[row-3]; 3427 for (n = 0; n<sz-1; n+=2) { 3428 i1 = idx[0]; 3429 i2 = idx[1]; 3430 idx += 2; 3431 tmp0 = x[i1]; 3432 tmp1 = x[i2]; 3433 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3434 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3435 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3436 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3437 } 3438 3439 if (n == sz-1) { 3440 tmp0 = x[*idx]; 3441 sum1 -= *v1*tmp0; 3442 sum2 -= *v2*tmp0; 3443 sum3 -= *v3*tmp0; 3444 sum4 -= *v4*tmp0; 3445 } 3446 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3447 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3448 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3449 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3450 break; 3451 case 5: 3452 3453 sum1 = b[row]; 3454 sum2 = b[row-1]; 3455 sum3 = b[row-2]; 3456 sum4 = b[row-3]; 3457 sum5 = b[row-4]; 3458 v2 = a->a + ii[row-1]; 3459 v3 = a->a + ii[row-2]; 3460 v4 = a->a + ii[row-3]; 3461 v5 = a->a + ii[row-4]; 3462 for (n = 0; n<sz-1; n+=2) { 3463 i1 = idx[0]; 3464 i2 = idx[1]; 3465 idx += 2; 3466 tmp0 = x[i1]; 3467 tmp1 = x[i2]; 3468 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3469 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3470 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3471 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3472 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3473 } 3474 3475 if (n == sz-1) { 3476 tmp0 = x[*idx]; 3477 sum1 -= *v1*tmp0; 3478 sum2 -= *v2*tmp0; 3479 sum3 -= *v3*tmp0; 3480 sum4 -= *v4*tmp0; 3481 sum5 -= *v5*tmp0; 3482 } 3483 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3484 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3485 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3486 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3487 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3488 break; 3489 default: 3490 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3491 } 3492 } 3493 3494 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3495 } 3496 } 3497 if (flag & SOR_EISENSTAT) { 3498 /* 3499 Apply (U + D)^-1 where D is now the block diagonal 3500 */ 3501 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3502 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3503 ibdiag -= sizes[i]*sizes[i]; 3504 sz = ii[row+1] - diag[row] - 1; 3505 v1 = a->a + diag[row] + 1; 3506 idx = a->j + diag[row] + 1; 3507 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3508 switch (sizes[i]) { 3509 case 1: 3510 3511 sum1 = b[row]; 3512 for (n = 0; n<sz-1; n+=2) { 3513 i1 = idx[0]; 3514 i2 = idx[1]; 3515 idx += 2; 3516 tmp0 = x[i1]; 3517 tmp1 = x[i2]; 3518 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3519 } 3520 3521 if (n == sz-1) { 3522 tmp0 = x[*idx]; 3523 sum1 -= *v1*tmp0; 3524 } 3525 x[row] = sum1*(*ibdiag);row--; 3526 break; 3527 3528 case 2: 3529 3530 sum1 = b[row]; 3531 sum2 = b[row-1]; 3532 /* note that sum1 is associated with the second of the two rows */ 3533 v2 = a->a + diag[row-1] + 2; 3534 for (n = 0; n<sz-1; n+=2) { 3535 i1 = idx[0]; 3536 i2 = idx[1]; 3537 idx += 2; 3538 tmp0 = x[i1]; 3539 tmp1 = x[i2]; 3540 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3541 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3542 } 3543 3544 if (n == sz-1) { 3545 tmp0 = x[*idx]; 3546 sum1 -= *v1*tmp0; 3547 sum2 -= *v2*tmp0; 3548 } 3549 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3550 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3551 row -= 2; 3552 break; 3553 case 3: 3554 3555 sum1 = b[row]; 3556 sum2 = b[row-1]; 3557 sum3 = b[row-2]; 3558 v2 = a->a + diag[row-1] + 2; 3559 v3 = a->a + diag[row-2] + 3; 3560 for (n = 0; n<sz-1; n+=2) { 3561 i1 = idx[0]; 3562 i2 = idx[1]; 3563 idx += 2; 3564 tmp0 = x[i1]; 3565 tmp1 = x[i2]; 3566 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3567 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3568 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3569 } 3570 3571 if (n == sz-1) { 3572 tmp0 = x[*idx]; 3573 sum1 -= *v1*tmp0; 3574 sum2 -= *v2*tmp0; 3575 sum3 -= *v3*tmp0; 3576 } 3577 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3578 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3579 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3580 row -= 3; 3581 break; 3582 case 4: 3583 3584 sum1 = b[row]; 3585 sum2 = b[row-1]; 3586 sum3 = b[row-2]; 3587 sum4 = b[row-3]; 3588 v2 = a->a + diag[row-1] + 2; 3589 v3 = a->a + diag[row-2] + 3; 3590 v4 = a->a + diag[row-3] + 4; 3591 for (n = 0; n<sz-1; n+=2) { 3592 i1 = idx[0]; 3593 i2 = idx[1]; 3594 idx += 2; 3595 tmp0 = x[i1]; 3596 tmp1 = x[i2]; 3597 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3598 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3599 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3600 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3601 } 3602 3603 if (n == sz-1) { 3604 tmp0 = x[*idx]; 3605 sum1 -= *v1*tmp0; 3606 sum2 -= *v2*tmp0; 3607 sum3 -= *v3*tmp0; 3608 sum4 -= *v4*tmp0; 3609 } 3610 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3611 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3612 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3613 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3614 row -= 4; 3615 break; 3616 case 5: 3617 3618 sum1 = b[row]; 3619 sum2 = b[row-1]; 3620 sum3 = b[row-2]; 3621 sum4 = b[row-3]; 3622 sum5 = b[row-4]; 3623 v2 = a->a + diag[row-1] + 2; 3624 v3 = a->a + diag[row-2] + 3; 3625 v4 = a->a + diag[row-3] + 4; 3626 v5 = a->a + diag[row-4] + 5; 3627 for (n = 0; n<sz-1; n+=2) { 3628 i1 = idx[0]; 3629 i2 = idx[1]; 3630 idx += 2; 3631 tmp0 = x[i1]; 3632 tmp1 = x[i2]; 3633 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3634 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3635 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3636 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3637 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3638 } 3639 3640 if (n == sz-1) { 3641 tmp0 = x[*idx]; 3642 sum1 -= *v1*tmp0; 3643 sum2 -= *v2*tmp0; 3644 sum3 -= *v3*tmp0; 3645 sum4 -= *v4*tmp0; 3646 sum5 -= *v5*tmp0; 3647 } 3648 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3649 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3650 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3651 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3652 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3653 row -= 5; 3654 break; 3655 default: 3656 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3657 } 3658 } 3659 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3660 3661 /* 3662 t = b - D x where D is the block diagonal 3663 */ 3664 cnt = 0; 3665 for (i=0, row=0; i<m; i++) { 3666 switch (sizes[i]) { 3667 case 1: 3668 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3669 break; 3670 case 2: 3671 x1 = x[row]; x2 = x[row+1]; 3672 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3673 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3674 t[row] = b[row] - tmp1; 3675 t[row+1] = b[row+1] - tmp2; row += 2; 3676 cnt += 4; 3677 break; 3678 case 3: 3679 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3680 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3681 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3682 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3683 t[row] = b[row] - tmp1; 3684 t[row+1] = b[row+1] - tmp2; 3685 t[row+2] = b[row+2] - tmp3; row += 3; 3686 cnt += 9; 3687 break; 3688 case 4: 3689 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3690 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3691 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3692 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3693 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3694 t[row] = b[row] - tmp1; 3695 t[row+1] = b[row+1] - tmp2; 3696 t[row+2] = b[row+2] - tmp3; 3697 t[row+3] = b[row+3] - tmp4; row += 4; 3698 cnt += 16; 3699 break; 3700 case 5: 3701 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3702 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3703 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3704 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3705 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3706 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3707 t[row] = b[row] - tmp1; 3708 t[row+1] = b[row+1] - tmp2; 3709 t[row+2] = b[row+2] - tmp3; 3710 t[row+3] = b[row+3] - tmp4; 3711 t[row+4] = b[row+4] - tmp5;row += 5; 3712 cnt += 25; 3713 break; 3714 default: 3715 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3716 } 3717 } 3718 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3719 3720 3721 3722 /* 3723 Apply (L + D)^-1 where D is the block diagonal 3724 */ 3725 for (i=0, row=0; i<m; i++) { 3726 sz = diag[row] - ii[row]; 3727 v1 = a->a + ii[row]; 3728 idx = a->j + ii[row]; 3729 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3730 switch (sizes[i]) { 3731 case 1: 3732 3733 sum1 = t[row]; 3734 for (n = 0; n<sz-1; n+=2) { 3735 i1 = idx[0]; 3736 i2 = idx[1]; 3737 idx += 2; 3738 tmp0 = t[i1]; 3739 tmp1 = t[i2]; 3740 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3741 } 3742 3743 if (n == sz-1) { 3744 tmp0 = t[*idx]; 3745 sum1 -= *v1 * tmp0; 3746 } 3747 x[row] += t[row] = sum1*(*ibdiag++); row++; 3748 break; 3749 case 2: 3750 v2 = a->a + ii[row+1]; 3751 sum1 = t[row]; 3752 sum2 = t[row+1]; 3753 for (n = 0; n<sz-1; n+=2) { 3754 i1 = idx[0]; 3755 i2 = idx[1]; 3756 idx += 2; 3757 tmp0 = t[i1]; 3758 tmp1 = t[i2]; 3759 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3760 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3761 } 3762 3763 if (n == sz-1) { 3764 tmp0 = t[*idx]; 3765 sum1 -= v1[0] * tmp0; 3766 sum2 -= v2[0] * tmp0; 3767 } 3768 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3769 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3770 ibdiag += 4; row += 2; 3771 break; 3772 case 3: 3773 v2 = a->a + ii[row+1]; 3774 v3 = a->a + ii[row+2]; 3775 sum1 = t[row]; 3776 sum2 = t[row+1]; 3777 sum3 = t[row+2]; 3778 for (n = 0; n<sz-1; n+=2) { 3779 i1 = idx[0]; 3780 i2 = idx[1]; 3781 idx += 2; 3782 tmp0 = t[i1]; 3783 tmp1 = t[i2]; 3784 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3785 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3786 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3787 } 3788 3789 if (n == sz-1) { 3790 tmp0 = t[*idx]; 3791 sum1 -= v1[0] * tmp0; 3792 sum2 -= v2[0] * tmp0; 3793 sum3 -= v3[0] * tmp0; 3794 } 3795 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3796 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3797 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3798 ibdiag += 9; row += 3; 3799 break; 3800 case 4: 3801 v2 = a->a + ii[row+1]; 3802 v3 = a->a + ii[row+2]; 3803 v4 = a->a + ii[row+3]; 3804 sum1 = t[row]; 3805 sum2 = t[row+1]; 3806 sum3 = t[row+2]; 3807 sum4 = t[row+3]; 3808 for (n = 0; n<sz-1; n+=2) { 3809 i1 = idx[0]; 3810 i2 = idx[1]; 3811 idx += 2; 3812 tmp0 = t[i1]; 3813 tmp1 = t[i2]; 3814 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3815 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3816 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3817 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3818 } 3819 3820 if (n == sz-1) { 3821 tmp0 = t[*idx]; 3822 sum1 -= v1[0] * tmp0; 3823 sum2 -= v2[0] * tmp0; 3824 sum3 -= v3[0] * tmp0; 3825 sum4 -= v4[0] * tmp0; 3826 } 3827 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3828 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3829 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3830 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3831 ibdiag += 16; row += 4; 3832 break; 3833 case 5: 3834 v2 = a->a + ii[row+1]; 3835 v3 = a->a + ii[row+2]; 3836 v4 = a->a + ii[row+3]; 3837 v5 = a->a + ii[row+4]; 3838 sum1 = t[row]; 3839 sum2 = t[row+1]; 3840 sum3 = t[row+2]; 3841 sum4 = t[row+3]; 3842 sum5 = t[row+4]; 3843 for (n = 0; n<sz-1; n+=2) { 3844 i1 = idx[0]; 3845 i2 = idx[1]; 3846 idx += 2; 3847 tmp0 = t[i1]; 3848 tmp1 = t[i2]; 3849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3850 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3851 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3852 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3853 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3854 } 3855 3856 if (n == sz-1) { 3857 tmp0 = t[*idx]; 3858 sum1 -= v1[0] * tmp0; 3859 sum2 -= v2[0] * tmp0; 3860 sum3 -= v3[0] * tmp0; 3861 sum4 -= v4[0] * tmp0; 3862 sum5 -= v5[0] * tmp0; 3863 } 3864 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3865 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3866 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3867 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3868 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3869 ibdiag += 25; row += 5; 3870 break; 3871 default: 3872 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3873 } 3874 } 3875 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3876 } 3877 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3878 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3879 CHKMEMQ; PetscFunctionReturn(0); 3880 } 3881 3882 #undef __FUNCT__ 3883 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3884 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3885 { 3886 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3887 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3888 const MatScalar *bdiag = a->inode.bdiag; 3889 const PetscScalar *b; 3890 PetscErrorCode ierr; 3891 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3892 const PetscInt *sizes = a->inode.size; 3893 3894 PetscFunctionBegin; 3895 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3896 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3897 cnt = 0; 3898 for (i=0, row=0; i<m; i++) { 3899 switch (sizes[i]) { 3900 case 1: 3901 x[row] = b[row]*bdiag[cnt++];row++; 3902 break; 3903 case 2: 3904 x1 = b[row]; x2 = b[row+1]; 3905 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3906 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3907 x[row++] = tmp1; 3908 x[row++] = tmp2; 3909 cnt += 4; 3910 break; 3911 case 3: 3912 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3913 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3914 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3915 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3916 x[row++] = tmp1; 3917 x[row++] = tmp2; 3918 x[row++] = tmp3; 3919 cnt += 9; 3920 break; 3921 case 4: 3922 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3923 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3924 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3925 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3926 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3927 x[row++] = tmp1; 3928 x[row++] = tmp2; 3929 x[row++] = tmp3; 3930 x[row++] = tmp4; 3931 cnt += 16; 3932 break; 3933 case 5: 3934 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3935 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3936 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3937 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3938 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3939 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3940 x[row++] = tmp1; 3941 x[row++] = tmp2; 3942 x[row++] = tmp3; 3943 x[row++] = tmp4; 3944 x[row++] = tmp5; 3945 cnt += 25; 3946 break; 3947 default: 3948 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3949 } 3950 } 3951 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3952 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3953 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3954 PetscFunctionReturn(0); 3955 } 3956 3957 /* 3958 samestructure indicates that the matrix has not changed its nonzero structure so we 3959 do not need to recompute the inodes 3960 */ 3961 #undef __FUNCT__ 3962 #define __FUNCT__ "Mat_CheckInode" 3963 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 3964 { 3965 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3966 PetscErrorCode ierr; 3967 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3968 PetscBool flag; 3969 3970 PetscFunctionBegin; 3971 if (!a->inode.use) PetscFunctionReturn(0); 3972 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3973 3974 3975 m = A->rmap->n; 3976 if (a->inode.size) {ns = a->inode.size;} 3977 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3978 3979 i = 0; 3980 node_count = 0; 3981 idx = a->j; 3982 ii = a->i; 3983 while (i < m) { /* For each row */ 3984 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3985 /* Limits the number of elements in a node to 'a->inode.limit' */ 3986 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3987 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3988 if (nzy != nzx) break; 3989 idy += nzx; /* Same nonzero pattern */ 3990 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3991 if (!flag) break; 3992 } 3993 ns[node_count++] = blk_size; 3994 idx += blk_size*nzx; 3995 i = j; 3996 } 3997 /* If not enough inodes found,, do not use inode version of the routines */ 3998 if (!m || node_count > .8*m) { 3999 ierr = PetscFree(ns);CHKERRQ(ierr); 4000 a->inode.node_count = 0; 4001 a->inode.size = PETSC_NULL; 4002 a->inode.use = PETSC_FALSE; 4003 A->ops->mult = MatMult_SeqAIJ; 4004 A->ops->sor = MatSOR_SeqAIJ; 4005 A->ops->multadd = MatMultAdd_SeqAIJ; 4006 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4007 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4008 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4009 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4010 A->ops->coloringpatch = 0; 4011 A->ops->multdiagonalblock = 0; 4012 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4013 } else { 4014 if (!A->factortype) { 4015 A->ops->mult = MatMult_SeqAIJ_Inode; 4016 A->ops->sor = MatSOR_SeqAIJ_Inode; 4017 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4018 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4019 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4020 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4021 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4022 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4023 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4024 } else { 4025 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4026 } 4027 a->inode.node_count = node_count; 4028 a->inode.size = ns; 4029 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4030 } 4031 a->inode.checked = PETSC_TRUE; 4032 PetscFunctionReturn(0); 4033 } 4034 4035 #undef __FUNCT__ 4036 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4037 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4038 { 4039 Mat B=*C; 4040 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4041 PetscErrorCode ierr; 4042 PetscInt m=A->rmap->n; 4043 4044 PetscFunctionBegin; 4045 c->inode.use = a->inode.use; 4046 c->inode.limit = a->inode.limit; 4047 c->inode.max_limit = a->inode.max_limit; 4048 if (a->inode.size) { 4049 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr); 4050 c->inode.node_count = a->inode.node_count; 4051 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4052 /* note the table of functions below should match that in Mat_CheckInode() */ 4053 if (!B->factortype) { 4054 B->ops->mult = MatMult_SeqAIJ_Inode; 4055 B->ops->sor = MatSOR_SeqAIJ_Inode; 4056 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4057 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4058 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4059 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4060 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4061 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4062 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4063 } else { 4064 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4065 } 4066 } else { 4067 c->inode.size = 0; 4068 c->inode.node_count = 0; 4069 } 4070 c->inode.ibdiagvalid = PETSC_FALSE; 4071 c->inode.ibdiag = 0; 4072 c->inode.bdiag = 0; 4073 PetscFunctionReturn(0); 4074 } 4075 4076 #undef __FUNCT__ 4077 #define __FUNCT__ "MatGetRow_FactoredLU" 4078 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row) 4079 { 4080 PetscInt k, *vi; 4081 4082 PetscFunctionBegin; 4083 vi = aj + ai[row]; 4084 for (k=0;k<nzl;k++) cols[k] = vi[k]; 4085 vi = aj + adiag[row]; 4086 cols[nzl] = vi[0]; 4087 vi = aj + adiag[row+1]+1; 4088 for (k=0;k<nzu;k++) cols[nzl+1+k] = vi[k]; 4089 PetscFunctionReturn(0); 4090 } 4091 /* 4092 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4093 Modified from Mat_CheckInode(). 4094 4095 Input Parameters: 4096 + Mat A - ILU or LU matrix factor 4097 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 4098 do not need to recompute the inodes 4099 */ 4100 #undef __FUNCT__ 4101 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4102 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure) 4103 { 4104 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4105 PetscErrorCode ierr; 4106 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4107 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 4108 PetscBool flag; 4109 4110 PetscFunctionBegin; 4111 if (!a->inode.use) PetscFunctionReturn(0); 4112 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4113 4114 m = A->rmap->n; 4115 if (a->inode.size) {ns = a->inode.size;} 4116 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 4117 4118 i = 0; 4119 node_count = 0; 4120 while (i < m) { /* For each row */ 4121 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4122 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4123 nzx = nzl1 + nzu1 + 1; 4124 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 4125 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4126 4127 /* Limits the number of elements in a node to 'a->inode.limit' */ 4128 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4129 nzl2 = ai[j+1] - ai[j]; 4130 nzu2 = adiag[j] - adiag[j+1] - 1; 4131 nzy = nzl2 + nzu2 + 1; 4132 if (nzy != nzx) break; 4133 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 4134 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4135 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4136 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 4137 ierr = PetscFree(cols2);CHKERRQ(ierr); 4138 } 4139 ns[node_count++] = blk_size; 4140 ierr = PetscFree(cols1);CHKERRQ(ierr); 4141 i = j; 4142 } 4143 /* If not enough inodes found,, do not use inode version of the routines */ 4144 if (!m || node_count > .8*m) { 4145 ierr = PetscFree(ns);CHKERRQ(ierr); 4146 a->inode.node_count = 0; 4147 a->inode.size = PETSC_NULL; 4148 a->inode.use = PETSC_FALSE; 4149 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4150 } else { 4151 A->ops->mult = 0; 4152 A->ops->sor = 0; 4153 A->ops->multadd = 0; 4154 A->ops->getrowij = 0; 4155 A->ops->restorerowij = 0; 4156 A->ops->getcolumnij = 0; 4157 A->ops->restorecolumnij = 0; 4158 A->ops->coloringpatch = 0; 4159 A->ops->multdiagonalblock = 0; 4160 A->ops->solve = MatSolve_SeqAIJ_Inode; 4161 a->inode.node_count = node_count; 4162 a->inode.size = ns; 4163 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4164 } 4165 a->inode.checked = PETSC_TRUE; 4166 PetscFunctionReturn(0); 4167 } 4168 4169 #undef __FUNCT__ 4170 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4171 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4172 { 4173 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4174 4175 PetscFunctionBegin; 4176 a->inode.ibdiagvalid = PETSC_FALSE; 4177 PetscFunctionReturn(0); 4178 } 4179 4180 /* 4181 This is really ugly. if inodes are used this replaces the 4182 permutations with ones that correspond to rows/cols of the matrix 4183 rather then inode blocks 4184 */ 4185 #undef __FUNCT__ 4186 #define __FUNCT__ "MatInodeAdjustForInodes" 4187 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4188 { 4189 PetscErrorCode ierr; 4190 4191 PetscFunctionBegin; 4192 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4193 PetscFunctionReturn(0); 4194 } 4195 4196 EXTERN_C_BEGIN 4197 #undef __FUNCT__ 4198 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4199 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4200 { 4201 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4202 PetscErrorCode ierr; 4203 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4204 const PetscInt *ridx,*cidx; 4205 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4206 PetscInt nslim_col,*ns_col; 4207 IS ris = *rperm,cis = *cperm; 4208 4209 PetscFunctionBegin; 4210 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4211 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4212 4213 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4214 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 4215 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 4216 4217 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4218 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4219 4220 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4221 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4222 4223 /* Construct the permutations for rows*/ 4224 for (i=0,row = 0; i<nslim_row; ++i) { 4225 indx = ridx[i]; 4226 start_val = tns[indx]; 4227 end_val = tns[indx + 1]; 4228 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4229 } 4230 4231 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4232 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4233 4234 /* Construct permutations for columns */ 4235 for (i=0,col=0; i<nslim_col; ++i) { 4236 indx = cidx[i]; 4237 start_val = tns[indx]; 4238 end_val = tns[indx + 1]; 4239 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4240 } 4241 4242 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4243 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4244 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4245 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4246 4247 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4248 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4249 4250 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4251 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4252 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4253 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4254 ierr = PetscFree(tns);CHKERRQ(ierr); 4255 PetscFunctionReturn(0); 4256 } 4257 EXTERN_C_END 4258 4259 #undef __FUNCT__ 4260 #define __FUNCT__ "MatInodeGetInodeSizes" 4261 /*@C 4262 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4263 4264 Not Collective 4265 4266 Input Parameter: 4267 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4268 4269 Output Parameter: 4270 + node_count - no of inodes present in the matrix. 4271 . sizes - an array of size node_count,with sizes of each inode. 4272 - limit - the max size used to generate the inodes. 4273 4274 Level: advanced 4275 4276 Notes: This routine returns some internal storage information 4277 of the matrix, it is intended to be used by advanced users. 4278 It should be called after the matrix is assembled. 4279 The contents of the sizes[] array should not be changed. 4280 PETSC_NULL may be passed for information not requested. 4281 4282 .keywords: matrix, seqaij, get, inode 4283 4284 .seealso: MatGetInfo() 4285 @*/ 4286 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4287 { 4288 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4289 4290 PetscFunctionBegin; 4291 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4292 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 4293 if (f) { 4294 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4295 } 4296 PetscFunctionReturn(0); 4297 } 4298 4299 EXTERN_C_BEGIN 4300 #undef __FUNCT__ 4301 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4302 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4303 { 4304 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4305 4306 PetscFunctionBegin; 4307 if (node_count) *node_count = a->inode.node_count; 4308 if (sizes) *sizes = a->inode.size; 4309 if (limit) *limit = a->inode.limit; 4310 PetscFunctionReturn(0); 4311 } 4312 EXTERN_C_END 4313