1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 24 for (; count+1 < n; count++,i++); 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for (; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 67 68 PetscFunctionBegin; 69 nslim_row = a->inode.node_count; 70 m = A->rmap->n; 71 n = A->cmap->n; 72 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 73 74 /* Use the row_inode as column_inode */ 75 nslim_col = nslim_row; 76 ns_col = ns_row; 77 78 /* allocate space for reformated inode structure */ 79 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 80 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 81 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 82 83 for (i1=0,col=0; i1<nslim_col; ++i1) { 84 nsz = ns_col[i1]; 85 for (i2=0; i2<nsz; ++i2,++col) 86 tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 col = *j++ + ishift; 101 i2 = tvc[col]; 102 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 103 ia[i1+1]++; 104 ia[i2+1]++; 105 i2++; /* Start col of next node */ 106 while ((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 107 i2 = tvc[col]; 108 } 109 if (i2 == i1) ia[i2+1]++; /* now the diagonal element */ 110 } 111 112 /* shift ia[i] to point to next row */ 113 for (i1=1; i1<nslim_row+1; i1++) { 114 row = ia[i1-1]; 115 ia[i1] += row; 116 work[i1-1] = row - oshift; 117 } 118 119 /* allocate space for column pointers */ 120 nz = ia[nslim_row] + (!ishift); 121 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 122 *jja = ja; 123 124 /* loop over lower triangular part putting into ja */ 125 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 126 j = aj + ai[row] + ishift; 127 jmax = aj + ai[row+1] + ishift; 128 col = *j++ + ishift; 129 i2 = tvc[col]; 130 while (i2<i1 && j<jmax) { 131 ja[work[i2]++] = i1 + oshift; 132 ja[work[i1]++] = i2 + oshift; 133 ++i2; 134 while ((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 135 i2 = tvc[col]; 136 } 137 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 138 139 } 140 ierr = PetscFree(work);CHKERRQ(ierr); 141 ierr = PetscFree(tns);CHKERRQ(ierr); 142 ierr = PetscFree(tvc);CHKERRQ(ierr); 143 PetscFunctionReturn(0); 144 } 145 146 /* 147 This builds nonsymmetric version of nonzero structure, 148 */ 149 #undef __FUNCT__ 150 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 151 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 152 { 153 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 154 PetscErrorCode ierr; 155 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 156 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 157 158 PetscFunctionBegin; 159 nslim_row = a->inode.node_count; 160 n = A->cmap->n; 161 162 /* Create The column_inode for this matrix */ 163 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 164 165 /* allocate space for reformated column_inode structure */ 166 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 167 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 168 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 169 170 for (i1=0,col=0; i1<nslim_col; ++i1) { 171 nsz = ns_col[i1]; 172 for (i2=0; i2<nsz; ++i2,++col) 173 tvc[col] = i1; 174 } 175 /* allocate space for row pointers */ 176 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 177 *iia = ia; 178 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 180 181 /* determine the number of columns in each row */ 182 ia[0] = oshift; 183 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 184 j = aj + ai[row] + ishift; 185 col = *j++ + ishift; 186 i2 = tvc[col]; 187 nz = ai[row+1] - ai[row]; 188 while (nz-- > 0) { /* off-diagonal elemets */ 189 ia[i1+1]++; 190 i2++; /* Start col of next node */ 191 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 192 if (nz > 0) i2 = tvc[col]; 193 } 194 } 195 196 /* shift ia[i] to point to next row */ 197 for (i1=1; i1<nslim_row+1; i1++) { 198 row = ia[i1-1]; 199 ia[i1] += row; 200 work[i1-1] = row - oshift; 201 } 202 203 /* allocate space for column pointers */ 204 nz = ia[nslim_row] + (!ishift); 205 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 206 *jja = ja; 207 208 /* loop over matrix putting into ja */ 209 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 210 j = aj + ai[row] + ishift; 211 col = *j++ + ishift; 212 i2 = tvc[col]; 213 nz = ai[row+1] - ai[row]; 214 while (nz-- > 0) { 215 ja[work[i1]++] = i2 + oshift; 216 ++i2; 217 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 218 if (nz > 0) i2 = tvc[col]; 219 } 220 } 221 ierr = PetscFree(ns_col);CHKERRQ(ierr); 222 ierr = PetscFree(work);CHKERRQ(ierr); 223 ierr = PetscFree(tns);CHKERRQ(ierr); 224 ierr = PetscFree(tvc);CHKERRQ(ierr); 225 PetscFunctionReturn(0); 226 } 227 228 #undef __FUNCT__ 229 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 230 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 231 { 232 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 233 PetscErrorCode ierr; 234 235 PetscFunctionBegin; 236 *n = a->inode.node_count; 237 if (!ia) PetscFunctionReturn(0); 238 if (!blockcompressed) { 239 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 240 } else if (symmetric) { 241 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 242 } else { 243 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 244 } 245 PetscFunctionReturn(0); 246 } 247 248 #undef __FUNCT__ 249 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 250 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 251 { 252 PetscErrorCode ierr; 253 254 PetscFunctionBegin; 255 if (!ia) PetscFunctionReturn(0); 256 257 if (!blockcompressed) { 258 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 259 } else { 260 ierr = PetscFree(*ia);CHKERRQ(ierr); 261 ierr = PetscFree(*ja);CHKERRQ(ierr); 262 } 263 264 PetscFunctionReturn(0); 265 } 266 267 /* ----------------------------------------------------------- */ 268 269 #undef __FUNCT__ 270 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 271 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,const PetscInt *iia[],const PetscInt *jja[],PetscInt ishift,PetscInt oshift) 272 { 273 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 274 PetscErrorCode ierr; 275 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 276 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 277 278 PetscFunctionBegin; 279 nslim_row = a->inode.node_count; 280 n = A->cmap->n; 281 282 /* Create The column_inode for this matrix */ 283 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 284 285 /* allocate space for reformated column_inode structure */ 286 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 287 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 288 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 289 290 for (i1=0,col=0; i1<nslim_col; ++i1) { 291 nsz = ns_col[i1]; 292 for (i2=0; i2<nsz; ++i2,++col) 293 tvc[col] = i1; 294 } 295 /* allocate space for column pointers */ 296 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 297 *iia = ia; 298 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 299 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 300 301 /* determine the number of columns in each row */ 302 ia[0] = oshift; 303 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 304 j = aj + ai[row] + ishift; 305 col = *j++ + ishift; 306 i2 = tvc[col]; 307 nz = ai[row+1] - ai[row]; 308 while (nz-- > 0) { /* off-diagonal elemets */ 309 /* ia[i1+1]++; */ 310 ia[i2+1]++; 311 i2++; 312 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 313 if (nz > 0) i2 = tvc[col]; 314 } 315 } 316 317 /* shift ia[i] to point to next col */ 318 for (i1=1; i1<nslim_col+1; i1++) { 319 col = ia[i1-1]; 320 ia[i1] += col; 321 work[i1-1] = col - oshift; 322 } 323 324 /* allocate space for column pointers */ 325 nz = ia[nslim_col] + (!ishift); 326 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 327 *jja = ja; 328 329 /* loop over matrix putting into ja */ 330 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 331 j = aj + ai[row] + ishift; 332 col = *j++ + ishift; 333 i2 = tvc[col]; 334 nz = ai[row+1] - ai[row]; 335 while (nz-- > 0) { 336 /* ja[work[i1]++] = i2 + oshift; */ 337 ja[work[i2]++] = i1 + oshift; 338 i2++; 339 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 340 if (nz > 0) i2 = tvc[col]; 341 } 342 } 343 ierr = PetscFree(ns_col);CHKERRQ(ierr); 344 ierr = PetscFree(work);CHKERRQ(ierr); 345 ierr = PetscFree(tns);CHKERRQ(ierr); 346 ierr = PetscFree(tvc);CHKERRQ(ierr); 347 PetscFunctionReturn(0); 348 } 349 350 #undef __FUNCT__ 351 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 352 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 353 { 354 PetscErrorCode ierr; 355 356 PetscFunctionBegin; 357 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 358 if (!ia) PetscFunctionReturn(0); 359 360 if (!blockcompressed) { 361 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 362 } else if (symmetric) { 363 /* Since the indices are symmetric it does'nt matter */ 364 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 365 } else { 366 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 367 } 368 PetscFunctionReturn(0); 369 } 370 371 #undef __FUNCT__ 372 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 373 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,const PetscInt *ia[],const PetscInt *ja[],PetscBool *done) 374 { 375 PetscErrorCode ierr; 376 377 PetscFunctionBegin; 378 if (!ia) PetscFunctionReturn(0); 379 if (!blockcompressed) { 380 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 381 } else { 382 ierr = PetscFree(*ia);CHKERRQ(ierr); 383 ierr = PetscFree(*ja);CHKERRQ(ierr); 384 } 385 PetscFunctionReturn(0); 386 } 387 388 /* ----------------------------------------------------------- */ 389 390 #undef __FUNCT__ 391 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 392 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 393 { 394 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 395 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 396 PetscScalar *y; 397 const PetscScalar *x; 398 const MatScalar *v1,*v2,*v3,*v4,*v5; 399 PetscErrorCode ierr; 400 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 401 402 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 403 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 404 #endif 405 406 PetscFunctionBegin; 407 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 408 node_max = a->inode.node_count; 409 ns = a->inode.size; /* Node Size array */ 410 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 411 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 412 idx = a->j; 413 v1 = a->a; 414 ii = a->i; 415 416 for (i = 0,row = 0; i< node_max; ++i) { 417 nsz = ns[i]; 418 n = ii[1] - ii[0]; 419 nonzerorow += (n>0)*nsz; 420 ii += nsz; 421 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 422 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 423 sz = n; /* No of non zeros in this row */ 424 /* Switch on the size of Node */ 425 switch (nsz) { /* Each loop in 'case' is unrolled */ 426 case 1 : 427 sum1 = 0.; 428 429 for (n = 0; n< sz-1; n+=2) { 430 i1 = idx[0]; /* The instructions are ordered to */ 431 i2 = idx[1]; /* make the compiler's job easy */ 432 idx += 2; 433 tmp0 = x[i1]; 434 tmp1 = x[i2]; 435 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 436 } 437 438 if (n == sz-1) { /* Take care of the last nonzero */ 439 tmp0 = x[*idx++]; 440 sum1 += *v1++ * tmp0; 441 } 442 y[row++]=sum1; 443 break; 444 case 2: 445 sum1 = 0.; 446 sum2 = 0.; 447 v2 = v1 + n; 448 449 for (n = 0; n< sz-1; n+=2) { 450 i1 = idx[0]; 451 i2 = idx[1]; 452 idx += 2; 453 tmp0 = x[i1]; 454 tmp1 = x[i2]; 455 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 456 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 457 } 458 if (n == sz-1) { 459 tmp0 = x[*idx++]; 460 sum1 += *v1++ * tmp0; 461 sum2 += *v2++ * tmp0; 462 } 463 y[row++]=sum1; 464 y[row++]=sum2; 465 v1 =v2; /* Since the next block to be processed starts there*/ 466 idx +=sz; 467 break; 468 case 3: 469 sum1 = 0.; 470 sum2 = 0.; 471 sum3 = 0.; 472 v2 = v1 + n; 473 v3 = v2 + n; 474 475 for (n = 0; n< sz-1; n+=2) { 476 i1 = idx[0]; 477 i2 = idx[1]; 478 idx += 2; 479 tmp0 = x[i1]; 480 tmp1 = x[i2]; 481 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 482 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 483 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 484 } 485 if (n == sz-1) { 486 tmp0 = x[*idx++]; 487 sum1 += *v1++ * tmp0; 488 sum2 += *v2++ * tmp0; 489 sum3 += *v3++ * tmp0; 490 } 491 y[row++]=sum1; 492 y[row++]=sum2; 493 y[row++]=sum3; 494 v1 =v3; /* Since the next block to be processed starts there*/ 495 idx +=2*sz; 496 break; 497 case 4: 498 sum1 = 0.; 499 sum2 = 0.; 500 sum3 = 0.; 501 sum4 = 0.; 502 v2 = v1 + n; 503 v3 = v2 + n; 504 v4 = v3 + n; 505 506 for (n = 0; n< sz-1; n+=2) { 507 i1 = idx[0]; 508 i2 = idx[1]; 509 idx += 2; 510 tmp0 = x[i1]; 511 tmp1 = x[i2]; 512 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 513 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 514 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 515 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 516 } 517 if (n == sz-1) { 518 tmp0 = x[*idx++]; 519 sum1 += *v1++ * tmp0; 520 sum2 += *v2++ * tmp0; 521 sum3 += *v3++ * tmp0; 522 sum4 += *v4++ * tmp0; 523 } 524 y[row++]=sum1; 525 y[row++]=sum2; 526 y[row++]=sum3; 527 y[row++]=sum4; 528 v1 =v4; /* Since the next block to be processed starts there*/ 529 idx +=3*sz; 530 break; 531 case 5: 532 sum1 = 0.; 533 sum2 = 0.; 534 sum3 = 0.; 535 sum4 = 0.; 536 sum5 = 0.; 537 v2 = v1 + n; 538 v3 = v2 + n; 539 v4 = v3 + n; 540 v5 = v4 + n; 541 542 for (n = 0; n<sz-1; n+=2) { 543 i1 = idx[0]; 544 i2 = idx[1]; 545 idx += 2; 546 tmp0 = x[i1]; 547 tmp1 = x[i2]; 548 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 549 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 550 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 551 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 552 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 553 } 554 if (n == sz-1) { 555 tmp0 = x[*idx++]; 556 sum1 += *v1++ * tmp0; 557 sum2 += *v2++ * tmp0; 558 sum3 += *v3++ * tmp0; 559 sum4 += *v4++ * tmp0; 560 sum5 += *v5++ * tmp0; 561 } 562 y[row++]=sum1; 563 y[row++]=sum2; 564 y[row++]=sum3; 565 y[row++]=sum4; 566 y[row++]=sum5; 567 v1 =v5; /* Since the next block to be processed starts there */ 568 idx +=4*sz; 569 break; 570 default : 571 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 572 } 573 } 574 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 575 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 576 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 577 PetscFunctionReturn(0); 578 } 579 /* ----------------------------------------------------------- */ 580 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 581 #undef __FUNCT__ 582 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 583 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 584 { 585 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 586 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 587 MatScalar *v1,*v2,*v3,*v4,*v5; 588 PetscScalar *x,*y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 591 592 PetscFunctionBegin; 593 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 594 node_max = a->inode.node_count; 595 ns = a->inode.size; /* Node Size array */ 596 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 597 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 598 if (zz != yy) { 599 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 600 } else { 601 z = y; 602 } 603 zt = z; 604 605 idx = a->j; 606 v1 = a->a; 607 ii = a->i; 608 609 for (i = 0,row = 0; i< node_max; ++i) { 610 nsz = ns[i]; 611 n = ii[1] - ii[0]; 612 ii += nsz; 613 sz = n; /* No of non zeros in this row */ 614 /* Switch on the size of Node */ 615 switch (nsz) { /* Each loop in 'case' is unrolled */ 616 case 1 : 617 sum1 = *zt++; 618 619 for (n = 0; n< sz-1; n+=2) { 620 i1 = idx[0]; /* The instructions are ordered to */ 621 i2 = idx[1]; /* make the compiler's job easy */ 622 idx += 2; 623 tmp0 = x[i1]; 624 tmp1 = x[i2]; 625 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 626 } 627 628 if (n == sz-1) { /* Take care of the last nonzero */ 629 tmp0 = x[*idx++]; 630 sum1 += *v1++ * tmp0; 631 } 632 y[row++]=sum1; 633 break; 634 case 2: 635 sum1 = *zt++; 636 sum2 = *zt++; 637 v2 = v1 + n; 638 639 for (n = 0; n< sz-1; n+=2) { 640 i1 = idx[0]; 641 i2 = idx[1]; 642 idx += 2; 643 tmp0 = x[i1]; 644 tmp1 = x[i2]; 645 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 646 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 647 } 648 if (n == sz-1) { 649 tmp0 = x[*idx++]; 650 sum1 += *v1++ * tmp0; 651 sum2 += *v2++ * tmp0; 652 } 653 y[row++]=sum1; 654 y[row++]=sum2; 655 v1 =v2; /* Since the next block to be processed starts there*/ 656 idx +=sz; 657 break; 658 case 3: 659 sum1 = *zt++; 660 sum2 = *zt++; 661 sum3 = *zt++; 662 v2 = v1 + n; 663 v3 = v2 + n; 664 665 for (n = 0; n< sz-1; n+=2) { 666 i1 = idx[0]; 667 i2 = idx[1]; 668 idx += 2; 669 tmp0 = x[i1]; 670 tmp1 = x[i2]; 671 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 672 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 673 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 674 } 675 if (n == sz-1) { 676 tmp0 = x[*idx++]; 677 sum1 += *v1++ * tmp0; 678 sum2 += *v2++ * tmp0; 679 sum3 += *v3++ * tmp0; 680 } 681 y[row++]=sum1; 682 y[row++]=sum2; 683 y[row++]=sum3; 684 v1 =v3; /* Since the next block to be processed starts there*/ 685 idx +=2*sz; 686 break; 687 case 4: 688 sum1 = *zt++; 689 sum2 = *zt++; 690 sum3 = *zt++; 691 sum4 = *zt++; 692 v2 = v1 + n; 693 v3 = v2 + n; 694 v4 = v3 + n; 695 696 for (n = 0; n< sz-1; n+=2) { 697 i1 = idx[0]; 698 i2 = idx[1]; 699 idx += 2; 700 tmp0 = x[i1]; 701 tmp1 = x[i2]; 702 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 703 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 704 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 705 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 706 } 707 if (n == sz-1) { 708 tmp0 = x[*idx++]; 709 sum1 += *v1++ * tmp0; 710 sum2 += *v2++ * tmp0; 711 sum3 += *v3++ * tmp0; 712 sum4 += *v4++ * tmp0; 713 } 714 y[row++]=sum1; 715 y[row++]=sum2; 716 y[row++]=sum3; 717 y[row++]=sum4; 718 v1 =v4; /* Since the next block to be processed starts there*/ 719 idx +=3*sz; 720 break; 721 case 5: 722 sum1 = *zt++; 723 sum2 = *zt++; 724 sum3 = *zt++; 725 sum4 = *zt++; 726 sum5 = *zt++; 727 v2 = v1 + n; 728 v3 = v2 + n; 729 v4 = v3 + n; 730 v5 = v4 + n; 731 732 for (n = 0; n<sz-1; n+=2) { 733 i1 = idx[0]; 734 i2 = idx[1]; 735 idx += 2; 736 tmp0 = x[i1]; 737 tmp1 = x[i2]; 738 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 739 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 740 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 741 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 742 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 743 } 744 if (n == sz-1) { 745 tmp0 = x[*idx++]; 746 sum1 += *v1++ * tmp0; 747 sum2 += *v2++ * tmp0; 748 sum3 += *v3++ * tmp0; 749 sum4 += *v4++ * tmp0; 750 sum5 += *v5++ * tmp0; 751 } 752 y[row++]=sum1; 753 y[row++]=sum2; 754 y[row++]=sum3; 755 y[row++]=sum4; 756 y[row++]=sum5; 757 v1 =v5; /* Since the next block to be processed starts there */ 758 idx +=4*sz; 759 break; 760 default : 761 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 762 } 763 } 764 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 765 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 766 if (zz != yy) { 767 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 768 } 769 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 770 PetscFunctionReturn(0); 771 } 772 773 /* ----------------------------------------------------------- */ 774 #undef __FUNCT__ 775 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 776 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 777 { 778 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 779 IS iscol = a->col,isrow = a->row; 780 PetscErrorCode ierr; 781 const PetscInt *r,*c,*rout,*cout; 782 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 783 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 784 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 785 PetscScalar sum1,sum2,sum3,sum4,sum5; 786 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 787 const PetscScalar *b; 788 789 PetscFunctionBegin; 790 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 791 node_max = a->inode.node_count; 792 ns = a->inode.size; /* Node Size array */ 793 794 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 795 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 796 tmp = a->solve_work; 797 798 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 799 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 800 801 /* forward solve the lower triangular */ 802 tmps = tmp ; 803 aa = a_a ; 804 aj = a_j ; 805 ad = a->diag; 806 807 for (i = 0,row = 0; i< node_max; ++i) { 808 nsz = ns[i]; 809 aii = ai[row]; 810 v1 = aa + aii; 811 vi = aj + aii; 812 nz = ad[row]- aii; 813 if (i < node_max-1) { 814 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 815 * but our indexing to determine it's size could. */ 816 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 817 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 818 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 819 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 820 } 821 822 switch (nsz) { /* Each loop in 'case' is unrolled */ 823 case 1 : 824 sum1 = b[*r++]; 825 for (j=0; j<nz-1; j+=2) { 826 i0 = vi[0]; 827 i1 = vi[1]; 828 vi +=2; 829 tmp0 = tmps[i0]; 830 tmp1 = tmps[i1]; 831 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 832 } 833 if (j == nz-1) { 834 tmp0 = tmps[*vi++]; 835 sum1 -= *v1++ *tmp0; 836 } 837 tmp[row ++]=sum1; 838 break; 839 case 2: 840 sum1 = b[*r++]; 841 sum2 = b[*r++]; 842 v2 = aa + ai[row+1]; 843 844 for (j=0; j<nz-1; j+=2) { 845 i0 = vi[0]; 846 i1 = vi[1]; 847 vi +=2; 848 tmp0 = tmps[i0]; 849 tmp1 = tmps[i1]; 850 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 851 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 852 } 853 if (j == nz-1) { 854 tmp0 = tmps[*vi++]; 855 sum1 -= *v1++ *tmp0; 856 sum2 -= *v2++ *tmp0; 857 } 858 sum2 -= *v2++ * sum1; 859 tmp[row ++]=sum1; 860 tmp[row ++]=sum2; 861 break; 862 case 3: 863 sum1 = b[*r++]; 864 sum2 = b[*r++]; 865 sum3 = b[*r++]; 866 v2 = aa + ai[row+1]; 867 v3 = aa + ai[row+2]; 868 869 for (j=0; j<nz-1; j+=2) { 870 i0 = vi[0]; 871 i1 = vi[1]; 872 vi +=2; 873 tmp0 = tmps[i0]; 874 tmp1 = tmps[i1]; 875 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 876 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 877 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 878 } 879 if (j == nz-1) { 880 tmp0 = tmps[*vi++]; 881 sum1 -= *v1++ *tmp0; 882 sum2 -= *v2++ *tmp0; 883 sum3 -= *v3++ *tmp0; 884 } 885 sum2 -= *v2++ * sum1; 886 sum3 -= *v3++ * sum1; 887 sum3 -= *v3++ * sum2; 888 tmp[row ++]=sum1; 889 tmp[row ++]=sum2; 890 tmp[row ++]=sum3; 891 break; 892 893 case 4: 894 sum1 = b[*r++]; 895 sum2 = b[*r++]; 896 sum3 = b[*r++]; 897 sum4 = b[*r++]; 898 v2 = aa + ai[row+1]; 899 v3 = aa + ai[row+2]; 900 v4 = aa + ai[row+3]; 901 902 for (j=0; j<nz-1; j+=2) { 903 i0 = vi[0]; 904 i1 = vi[1]; 905 vi +=2; 906 tmp0 = tmps[i0]; 907 tmp1 = tmps[i1]; 908 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 909 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 910 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 911 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 912 } 913 if (j == nz-1) { 914 tmp0 = tmps[*vi++]; 915 sum1 -= *v1++ *tmp0; 916 sum2 -= *v2++ *tmp0; 917 sum3 -= *v3++ *tmp0; 918 sum4 -= *v4++ *tmp0; 919 } 920 sum2 -= *v2++ * sum1; 921 sum3 -= *v3++ * sum1; 922 sum4 -= *v4++ * sum1; 923 sum3 -= *v3++ * sum2; 924 sum4 -= *v4++ * sum2; 925 sum4 -= *v4++ * sum3; 926 927 tmp[row ++]=sum1; 928 tmp[row ++]=sum2; 929 tmp[row ++]=sum3; 930 tmp[row ++]=sum4; 931 break; 932 case 5: 933 sum1 = b[*r++]; 934 sum2 = b[*r++]; 935 sum3 = b[*r++]; 936 sum4 = b[*r++]; 937 sum5 = b[*r++]; 938 v2 = aa + ai[row+1]; 939 v3 = aa + ai[row+2]; 940 v4 = aa + ai[row+3]; 941 v5 = aa + ai[row+4]; 942 943 for (j=0; j<nz-1; j+=2) { 944 i0 = vi[0]; 945 i1 = vi[1]; 946 vi +=2; 947 tmp0 = tmps[i0]; 948 tmp1 = tmps[i1]; 949 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 950 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 951 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 952 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 953 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 954 } 955 if (j == nz-1) { 956 tmp0 = tmps[*vi++]; 957 sum1 -= *v1++ *tmp0; 958 sum2 -= *v2++ *tmp0; 959 sum3 -= *v3++ *tmp0; 960 sum4 -= *v4++ *tmp0; 961 sum5 -= *v5++ *tmp0; 962 } 963 964 sum2 -= *v2++ * sum1; 965 sum3 -= *v3++ * sum1; 966 sum4 -= *v4++ * sum1; 967 sum5 -= *v5++ * sum1; 968 sum3 -= *v3++ * sum2; 969 sum4 -= *v4++ * sum2; 970 sum5 -= *v5++ * sum2; 971 sum4 -= *v4++ * sum3; 972 sum5 -= *v5++ * sum3; 973 sum5 -= *v5++ * sum4; 974 975 tmp[row ++]=sum1; 976 tmp[row ++]=sum2; 977 tmp[row ++]=sum3; 978 tmp[row ++]=sum4; 979 tmp[row ++]=sum5; 980 break; 981 default: 982 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 983 } 984 } 985 /* backward solve the upper triangular */ 986 for (i=node_max -1 ,row = n-1 ; i>=0; i--) { 987 nsz = ns[i]; 988 aii = ai[row+1] -1; 989 v1 = aa + aii; 990 vi = aj + aii; 991 nz = aii- ad[row]; 992 switch (nsz) { /* Each loop in 'case' is unrolled */ 993 case 1 : 994 sum1 = tmp[row]; 995 996 for (j=nz ; j>1; j-=2) { 997 vi -=2; 998 i0 = vi[2]; 999 i1 = vi[1]; 1000 tmp0 = tmps[i0]; 1001 tmp1 = tmps[i1]; 1002 v1 -= 2; 1003 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1004 } 1005 if (j==1) { 1006 tmp0 = tmps[*vi--]; 1007 sum1 -= *v1-- * tmp0; 1008 } 1009 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1010 break; 1011 case 2 : 1012 sum1 = tmp[row]; 1013 sum2 = tmp[row -1]; 1014 v2 = aa + ai[row]-1; 1015 for (j=nz ; j>1; j-=2) { 1016 vi -=2; 1017 i0 = vi[2]; 1018 i1 = vi[1]; 1019 tmp0 = tmps[i0]; 1020 tmp1 = tmps[i1]; 1021 v1 -= 2; 1022 v2 -= 2; 1023 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1024 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1025 } 1026 if (j==1) { 1027 tmp0 = tmps[*vi--]; 1028 sum1 -= *v1-- * tmp0; 1029 sum2 -= *v2-- * tmp0; 1030 } 1031 1032 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1033 sum2 -= *v2-- * tmp0; 1034 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1035 break; 1036 case 3 : 1037 sum1 = tmp[row]; 1038 sum2 = tmp[row -1]; 1039 sum3 = tmp[row -2]; 1040 v2 = aa + ai[row]-1; 1041 v3 = aa + ai[row -1]-1; 1042 for (j=nz ; j>1; j-=2) { 1043 vi -=2; 1044 i0 = vi[2]; 1045 i1 = vi[1]; 1046 tmp0 = tmps[i0]; 1047 tmp1 = tmps[i1]; 1048 v1 -= 2; 1049 v2 -= 2; 1050 v3 -= 2; 1051 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1052 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1053 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1054 } 1055 if (j==1) { 1056 tmp0 = tmps[*vi--]; 1057 sum1 -= *v1-- * tmp0; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 } 1061 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1062 sum2 -= *v2-- * tmp0; 1063 sum3 -= *v3-- * tmp0; 1064 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1065 sum3 -= *v3-- * tmp0; 1066 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1067 1068 break; 1069 case 4 : 1070 sum1 = tmp[row]; 1071 sum2 = tmp[row -1]; 1072 sum3 = tmp[row -2]; 1073 sum4 = tmp[row -3]; 1074 v2 = aa + ai[row]-1; 1075 v3 = aa + ai[row -1]-1; 1076 v4 = aa + ai[row -2]-1; 1077 1078 for (j=nz ; j>1; j-=2) { 1079 vi -=2; 1080 i0 = vi[2]; 1081 i1 = vi[1]; 1082 tmp0 = tmps[i0]; 1083 tmp1 = tmps[i1]; 1084 v1 -= 2; 1085 v2 -= 2; 1086 v3 -= 2; 1087 v4 -= 2; 1088 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1089 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1090 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1091 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1092 } 1093 if (j==1) { 1094 tmp0 = tmps[*vi--]; 1095 sum1 -= *v1-- * tmp0; 1096 sum2 -= *v2-- * tmp0; 1097 sum3 -= *v3-- * tmp0; 1098 sum4 -= *v4-- * tmp0; 1099 } 1100 1101 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1102 sum2 -= *v2-- * tmp0; 1103 sum3 -= *v3-- * tmp0; 1104 sum4 -= *v4-- * tmp0; 1105 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1106 sum3 -= *v3-- * tmp0; 1107 sum4 -= *v4-- * tmp0; 1108 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1109 sum4 -= *v4-- * tmp0; 1110 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1111 break; 1112 case 5 : 1113 sum1 = tmp[row]; 1114 sum2 = tmp[row -1]; 1115 sum3 = tmp[row -2]; 1116 sum4 = tmp[row -3]; 1117 sum5 = tmp[row -4]; 1118 v2 = aa + ai[row]-1; 1119 v3 = aa + ai[row -1]-1; 1120 v4 = aa + ai[row -2]-1; 1121 v5 = aa + ai[row -3]-1; 1122 for (j=nz ; j>1; j-=2) { 1123 vi -= 2; 1124 i0 = vi[2]; 1125 i1 = vi[1]; 1126 tmp0 = tmps[i0]; 1127 tmp1 = tmps[i1]; 1128 v1 -= 2; 1129 v2 -= 2; 1130 v3 -= 2; 1131 v4 -= 2; 1132 v5 -= 2; 1133 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1134 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1135 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1136 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1137 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1138 } 1139 if (j==1) { 1140 tmp0 = tmps[*vi--]; 1141 sum1 -= *v1-- * tmp0; 1142 sum2 -= *v2-- * tmp0; 1143 sum3 -= *v3-- * tmp0; 1144 sum4 -= *v4-- * tmp0; 1145 sum5 -= *v5-- * tmp0; 1146 } 1147 1148 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1149 sum2 -= *v2-- * tmp0; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1154 sum3 -= *v3-- * tmp0; 1155 sum4 -= *v4-- * tmp0; 1156 sum5 -= *v5-- * tmp0; 1157 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1158 sum4 -= *v4-- * tmp0; 1159 sum5 -= *v5-- * tmp0; 1160 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1161 sum5 -= *v5-- * tmp0; 1162 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1163 break; 1164 default: 1165 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1166 } 1167 } 1168 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1169 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1170 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1172 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1173 PetscFunctionReturn(0); 1174 } 1175 1176 #undef __FUNCT__ 1177 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1178 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1179 { 1180 Mat C=B; 1181 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1182 IS isrow = b->row,isicol = b->icol; 1183 PetscErrorCode ierr; 1184 const PetscInt *r,*ic,*ics; 1185 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1186 PetscInt i,j,k,nz,nzL,row,*pj; 1187 const PetscInt *ajtmp,*bjtmp; 1188 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1189 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1190 FactorShiftCtx sctx; 1191 const PetscInt *ddiag; 1192 PetscReal rs; 1193 MatScalar d; 1194 PetscInt inod,nodesz,node_max,col; 1195 const PetscInt *ns; 1196 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1197 1198 PetscFunctionBegin; 1199 /* MatPivotSetUp(): initialize shift context sctx */ 1200 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1201 1202 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1203 ddiag = a->diag; 1204 sctx.shift_top = info->zeropivot; 1205 for (i=0; i<n; i++) { 1206 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1207 d = (aa)[ddiag[i]]; 1208 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1209 v = aa+ai[i]; 1210 nz = ai[i+1] - ai[i]; 1211 for (j=0; j<nz; j++) 1212 rs += PetscAbsScalar(v[j]); 1213 if (rs>sctx.shift_top) sctx.shift_top = rs; 1214 } 1215 sctx.shift_top *= 1.1; 1216 sctx.nshift_max = 5; 1217 sctx.shift_lo = 0.; 1218 sctx.shift_hi = 1.; 1219 } 1220 1221 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1222 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1223 1224 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1225 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1226 rtmp2 = rtmp1 + n; 1227 rtmp3 = rtmp2 + n; 1228 rtmp4 = rtmp3 + n; 1229 ics = ic; 1230 1231 node_max = a->inode.node_count; 1232 ns = a->inode.size; 1233 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1234 1235 /* If max inode size > 4, split it into two inodes.*/ 1236 /* also map the inode sizes according to the ordering */ 1237 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1238 for (i=0,j=0; i<node_max; ++i,++j) { 1239 if (ns[i] > 4) { 1240 tmp_vec1[j] = 4; 1241 ++j; 1242 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1243 } else { 1244 tmp_vec1[j] = ns[i]; 1245 } 1246 } 1247 /* Use the correct node_max */ 1248 node_max = j; 1249 1250 /* Now reorder the inode info based on mat re-ordering info */ 1251 /* First create a row -> inode_size_array_index map */ 1252 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1253 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1254 for (i=0,row=0; i<node_max; i++) { 1255 nodesz = tmp_vec1[i]; 1256 for (j=0; j<nodesz; j++,row++) { 1257 nsmap[row] = i; 1258 } 1259 } 1260 /* Using nsmap, create a reordered ns structure */ 1261 for (i=0,j=0; i< node_max; i++) { 1262 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1263 tmp_vec2[i] = nodesz; 1264 j += nodesz; 1265 } 1266 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1267 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1268 1269 /* Now use the correct ns */ 1270 ns = tmp_vec2; 1271 1272 do { 1273 sctx.newshift = PETSC_FALSE; 1274 /* Now loop over each block-row, and do the factorization */ 1275 for (inod=0,i=0; inod<node_max; inod++) { /* i: row index; inod: inode index */ 1276 nodesz = ns[inod]; 1277 1278 switch (nodesz) { 1279 case 1: 1280 /*----------*/ 1281 /* zero rtmp1 */ 1282 /* L part */ 1283 nz = bi[i+1] - bi[i]; 1284 bjtmp = bj + bi[i]; 1285 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1286 1287 /* U part */ 1288 nz = bdiag[i]-bdiag[i+1]; 1289 bjtmp = bj + bdiag[i+1]+1; 1290 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1291 1292 /* load in initial (unfactored row) */ 1293 nz = ai[r[i]+1] - ai[r[i]]; 1294 ajtmp = aj + ai[r[i]]; 1295 v = aa + ai[r[i]]; 1296 for (j=0; j<nz; j++) { 1297 rtmp1[ics[ajtmp[j]]] = v[j]; 1298 } 1299 /* ZeropivotApply() */ 1300 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1301 1302 /* elimination */ 1303 bjtmp = bj + bi[i]; 1304 row = *bjtmp++; 1305 nzL = bi[i+1] - bi[i]; 1306 for (k=0; k < nzL;k++) { 1307 pc = rtmp1 + row; 1308 if (*pc != 0.0) { 1309 pv = b->a + bdiag[row]; 1310 mul1 = *pc * (*pv); 1311 *pc = mul1; 1312 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1313 pv = b->a + bdiag[row+1]+1; 1314 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1315 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1316 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1317 } 1318 row = *bjtmp++; 1319 } 1320 1321 /* finished row so stick it into b->a */ 1322 rs = 0.0; 1323 /* L part */ 1324 pv = b->a + bi[i] ; 1325 pj = b->j + bi[i] ; 1326 nz = bi[i+1] - bi[i]; 1327 for (j=0; j<nz; j++) { 1328 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1329 } 1330 1331 /* U part */ 1332 pv = b->a + bdiag[i+1]+1; 1333 pj = b->j + bdiag[i+1]+1; 1334 nz = bdiag[i] - bdiag[i+1]-1; 1335 for (j=0; j<nz; j++) { 1336 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1337 } 1338 1339 /* Check zero pivot */ 1340 sctx.rs = rs; 1341 sctx.pv = rtmp1[i]; 1342 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1343 if (sctx.newshift) break; 1344 1345 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1346 pv = b->a + bdiag[i]; 1347 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1348 break; 1349 1350 case 2: 1351 /*----------*/ 1352 /* zero rtmp1 and rtmp2 */ 1353 /* L part */ 1354 nz = bi[i+1] - bi[i]; 1355 bjtmp = bj + bi[i]; 1356 for (j=0; j<nz; j++) { 1357 col = bjtmp[j]; 1358 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1359 } 1360 1361 /* U part */ 1362 nz = bdiag[i]-bdiag[i+1]; 1363 bjtmp = bj + bdiag[i+1]+1; 1364 for (j=0; j<nz; j++) { 1365 col = bjtmp[j]; 1366 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1367 } 1368 1369 /* load in initial (unfactored row) */ 1370 nz = ai[r[i]+1] - ai[r[i]]; 1371 ajtmp = aj + ai[r[i]]; 1372 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1373 for (j=0; j<nz; j++) { 1374 col = ics[ajtmp[j]]; 1375 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1376 } 1377 /* ZeropivotApply(): shift the diagonal of the matrix */ 1378 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1379 1380 /* elimination */ 1381 bjtmp = bj + bi[i]; 1382 row = *bjtmp++; /* pivot row */ 1383 nzL = bi[i+1] - bi[i]; 1384 for (k=0; k < nzL;k++) { 1385 pc1 = rtmp1 + row; 1386 pc2 = rtmp2 + row; 1387 if (*pc1 != 0.0 || *pc2 != 0.0) { 1388 pv = b->a + bdiag[row]; 1389 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1390 *pc1 = mul1; *pc2 = mul2; 1391 1392 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1393 pv = b->a + bdiag[row+1]+1; 1394 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1395 for (j=0; j<nz; j++) { 1396 col = pj[j]; 1397 rtmp1[col] -= mul1 * pv[j]; 1398 rtmp2[col] -= mul2 * pv[j]; 1399 } 1400 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1401 } 1402 row = *bjtmp++; 1403 } 1404 1405 /* finished row i; check zero pivot, then stick row i into b->a */ 1406 rs = 0.0; 1407 /* L part */ 1408 pc1 = b->a + bi[i]; 1409 pj = b->j + bi[i] ; 1410 nz = bi[i+1] - bi[i]; 1411 for (j=0; j<nz; j++) { 1412 col = pj[j]; 1413 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1414 } 1415 /* U part */ 1416 pc1 = b->a + bdiag[i+1]+1; 1417 pj = b->j + bdiag[i+1]+1; 1418 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1419 for (j=0; j<nz; j++) { 1420 col = pj[j]; 1421 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1422 } 1423 1424 sctx.rs = rs; 1425 sctx.pv = rtmp1[i]; 1426 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1427 if (sctx.newshift) break; 1428 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1429 *pc1 = 1.0/sctx.pv; 1430 1431 /* Now take care of diagonal 2x2 block. */ 1432 pc2 = rtmp2 + i; 1433 if (*pc2 != 0.0) { 1434 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1435 *pc2 = mul1; /* insert L entry */ 1436 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1437 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1438 for (j=0; j<nz; j++) { 1439 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1440 } 1441 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1442 } 1443 1444 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1445 rs = 0.0; 1446 /* L part */ 1447 pc2 = b->a + bi[i+1]; 1448 pj = b->j + bi[i+1] ; 1449 nz = bi[i+2] - bi[i+1]; 1450 for (j=0; j<nz; j++) { 1451 col = pj[j]; 1452 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1453 } 1454 /* U part */ 1455 pc2 = b->a + bdiag[i+2]+1; 1456 pj = b->j + bdiag[i+2]+1; 1457 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1458 for (j=0; j<nz; j++) { 1459 col = pj[j]; 1460 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1461 } 1462 1463 sctx.rs = rs; 1464 sctx.pv = rtmp2[i+1]; 1465 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1466 if (sctx.newshift) break; 1467 pc2 = b->a + bdiag[i+1]; 1468 *pc2 = 1.0/sctx.pv; 1469 break; 1470 1471 case 3: 1472 /*----------*/ 1473 /* zero rtmp */ 1474 /* L part */ 1475 nz = bi[i+1] - bi[i]; 1476 bjtmp = bj + bi[i]; 1477 for (j=0; j<nz; j++) { 1478 col = bjtmp[j]; 1479 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1480 } 1481 1482 /* U part */ 1483 nz = bdiag[i]-bdiag[i+1]; 1484 bjtmp = bj + bdiag[i+1]+1; 1485 for (j=0; j<nz; j++) { 1486 col = bjtmp[j]; 1487 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1488 } 1489 1490 /* load in initial (unfactored row) */ 1491 nz = ai[r[i]+1] - ai[r[i]]; 1492 ajtmp = aj + ai[r[i]]; 1493 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1494 for (j=0; j<nz; j++) { 1495 col = ics[ajtmp[j]]; 1496 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1497 } 1498 /* ZeropivotApply(): shift the diagonal of the matrix */ 1499 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1500 1501 /* elimination */ 1502 bjtmp = bj + bi[i]; 1503 row = *bjtmp++; /* pivot row */ 1504 nzL = bi[i+1] - bi[i]; 1505 for (k=0; k < nzL;k++) { 1506 pc1 = rtmp1 + row; 1507 pc2 = rtmp2 + row; 1508 pc3 = rtmp3 + row; 1509 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1510 pv = b->a + bdiag[row]; 1511 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1512 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1513 1514 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1515 pv = b->a + bdiag[row+1]+1; 1516 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1517 for (j=0; j<nz; j++) { 1518 col = pj[j]; 1519 rtmp1[col] -= mul1 * pv[j]; 1520 rtmp2[col] -= mul2 * pv[j]; 1521 rtmp3[col] -= mul3 * pv[j]; 1522 } 1523 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1524 } 1525 row = *bjtmp++; 1526 } 1527 1528 /* finished row i; check zero pivot, then stick row i into b->a */ 1529 rs = 0.0; 1530 /* L part */ 1531 pc1 = b->a + bi[i]; 1532 pj = b->j + bi[i] ; 1533 nz = bi[i+1] - bi[i]; 1534 for (j=0; j<nz; j++) { 1535 col = pj[j]; 1536 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1537 } 1538 /* U part */ 1539 pc1 = b->a + bdiag[i+1]+1; 1540 pj = b->j + bdiag[i+1]+1; 1541 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1542 for (j=0; j<nz; j++) { 1543 col = pj[j]; 1544 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1545 } 1546 1547 sctx.rs = rs; 1548 sctx.pv = rtmp1[i]; 1549 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1550 if (sctx.newshift) break; 1551 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1552 *pc1 = 1.0/sctx.pv; 1553 1554 /* Now take care of 1st column of diagonal 3x3 block. */ 1555 pc2 = rtmp2 + i; 1556 pc3 = rtmp3 + i; 1557 if (*pc2 != 0.0 || *pc3 != 0.0) { 1558 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1559 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1560 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1561 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1562 for (j=0; j<nz; j++) { 1563 col = pj[j]; 1564 rtmp2[col] -= mul2 * rtmp1[col]; 1565 rtmp3[col] -= mul3 * rtmp1[col]; 1566 } 1567 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1568 } 1569 1570 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1571 rs = 0.0; 1572 /* L part */ 1573 pc2 = b->a + bi[i+1]; 1574 pj = b->j + bi[i+1] ; 1575 nz = bi[i+2] - bi[i+1]; 1576 for (j=0; j<nz; j++) { 1577 col = pj[j]; 1578 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1579 } 1580 /* U part */ 1581 pc2 = b->a + bdiag[i+2]+1; 1582 pj = b->j + bdiag[i+2]+1; 1583 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1584 for (j=0; j<nz; j++) { 1585 col = pj[j]; 1586 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1587 } 1588 1589 sctx.rs = rs; 1590 sctx.pv = rtmp2[i+1]; 1591 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1592 if (sctx.newshift) break; 1593 pc2 = b->a + bdiag[i+1]; 1594 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1595 1596 /* Now take care of 2nd column of diagonal 3x3 block. */ 1597 pc3 = rtmp3 + i+1; 1598 if (*pc3 != 0.0) { 1599 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1600 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1601 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1602 for (j=0; j<nz; j++) { 1603 col = pj[j]; 1604 rtmp3[col] -= mul3 * rtmp2[col]; 1605 } 1606 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1607 } 1608 1609 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1610 rs = 0.0; 1611 /* L part */ 1612 pc3 = b->a + bi[i+2]; 1613 pj = b->j + bi[i+2] ; 1614 nz = bi[i+3] - bi[i+2]; 1615 for (j=0; j<nz; j++) { 1616 col = pj[j]; 1617 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1618 } 1619 /* U part */ 1620 pc3 = b->a + bdiag[i+3]+1; 1621 pj = b->j + bdiag[i+3]+1; 1622 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1623 for (j=0; j<nz; j++) { 1624 col = pj[j]; 1625 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1626 } 1627 1628 sctx.rs = rs; 1629 sctx.pv = rtmp3[i+2]; 1630 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1631 if (sctx.newshift) break; 1632 pc3 = b->a + bdiag[i+2]; 1633 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1634 break; 1635 case 4: 1636 /*----------*/ 1637 /* zero rtmp */ 1638 /* L part */ 1639 nz = bi[i+1] - bi[i]; 1640 bjtmp = bj + bi[i]; 1641 for (j=0; j<nz; j++) { 1642 col = bjtmp[j]; 1643 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1644 } 1645 1646 /* U part */ 1647 nz = bdiag[i]-bdiag[i+1]; 1648 bjtmp = bj + bdiag[i+1]+1; 1649 for (j=0; j<nz; j++) { 1650 col = bjtmp[j]; 1651 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1652 } 1653 1654 /* load in initial (unfactored row) */ 1655 nz = ai[r[i]+1] - ai[r[i]]; 1656 ajtmp = aj + ai[r[i]]; 1657 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1658 for (j=0; j<nz; j++) { 1659 col = ics[ajtmp[j]]; 1660 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1661 } 1662 /* ZeropivotApply(): shift the diagonal of the matrix */ 1663 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1664 1665 /* elimination */ 1666 bjtmp = bj + bi[i]; 1667 row = *bjtmp++; /* pivot row */ 1668 nzL = bi[i+1] - bi[i]; 1669 for (k=0; k < nzL;k++) { 1670 pc1 = rtmp1 + row; 1671 pc2 = rtmp2 + row; 1672 pc3 = rtmp3 + row; 1673 pc4 = rtmp4 + row; 1674 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1675 pv = b->a + bdiag[row]; 1676 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1677 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1678 1679 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1680 pv = b->a + bdiag[row+1]+1; 1681 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1682 for (j=0; j<nz; j++) { 1683 col = pj[j]; 1684 rtmp1[col] -= mul1 * pv[j]; 1685 rtmp2[col] -= mul2 * pv[j]; 1686 rtmp3[col] -= mul3 * pv[j]; 1687 rtmp4[col] -= mul4 * pv[j]; 1688 } 1689 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1690 } 1691 row = *bjtmp++; 1692 } 1693 1694 /* finished row i; check zero pivot, then stick row i into b->a */ 1695 rs = 0.0; 1696 /* L part */ 1697 pc1 = b->a + bi[i]; 1698 pj = b->j + bi[i] ; 1699 nz = bi[i+1] - bi[i]; 1700 for (j=0; j<nz; j++) { 1701 col = pj[j]; 1702 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1703 } 1704 /* U part */ 1705 pc1 = b->a + bdiag[i+1]+1; 1706 pj = b->j + bdiag[i+1]+1; 1707 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1708 for (j=0; j<nz; j++) { 1709 col = pj[j]; 1710 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1711 } 1712 1713 sctx.rs = rs; 1714 sctx.pv = rtmp1[i]; 1715 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1716 if (sctx.newshift) break; 1717 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1718 *pc1 = 1.0/sctx.pv; 1719 1720 /* Now take care of 1st column of diagonal 4x4 block. */ 1721 pc2 = rtmp2 + i; 1722 pc3 = rtmp3 + i; 1723 pc4 = rtmp4 + i; 1724 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1725 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1726 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1727 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1728 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1729 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1730 for (j=0; j<nz; j++) { 1731 col = pj[j]; 1732 rtmp2[col] -= mul2 * rtmp1[col]; 1733 rtmp3[col] -= mul3 * rtmp1[col]; 1734 rtmp4[col] -= mul4 * rtmp1[col]; 1735 } 1736 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1737 } 1738 1739 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1740 rs = 0.0; 1741 /* L part */ 1742 pc2 = b->a + bi[i+1]; 1743 pj = b->j + bi[i+1] ; 1744 nz = bi[i+2] - bi[i+1]; 1745 for (j=0; j<nz; j++) { 1746 col = pj[j]; 1747 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1748 } 1749 /* U part */ 1750 pc2 = b->a + bdiag[i+2]+1; 1751 pj = b->j + bdiag[i+2]+1; 1752 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1753 for (j=0; j<nz; j++) { 1754 col = pj[j]; 1755 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1756 } 1757 1758 sctx.rs = rs; 1759 sctx.pv = rtmp2[i+1]; 1760 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1761 if (sctx.newshift) break; 1762 pc2 = b->a + bdiag[i+1]; 1763 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1764 1765 /* Now take care of 2nd column of diagonal 4x4 block. */ 1766 pc3 = rtmp3 + i+1; 1767 pc4 = rtmp4 + i+1; 1768 if (*pc3 != 0.0 || *pc4 != 0.0) { 1769 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1770 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1771 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1772 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1773 for (j=0; j<nz; j++) { 1774 col = pj[j]; 1775 rtmp3[col] -= mul3 * rtmp2[col]; 1776 rtmp4[col] -= mul4 * rtmp2[col]; 1777 } 1778 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1779 } 1780 1781 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1782 rs = 0.0; 1783 /* L part */ 1784 pc3 = b->a + bi[i+2]; 1785 pj = b->j + bi[i+2] ; 1786 nz = bi[i+3] - bi[i+2]; 1787 for (j=0; j<nz; j++) { 1788 col = pj[j]; 1789 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1790 } 1791 /* U part */ 1792 pc3 = b->a + bdiag[i+3]+1; 1793 pj = b->j + bdiag[i+3]+1; 1794 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1795 for (j=0; j<nz; j++) { 1796 col = pj[j]; 1797 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1798 } 1799 1800 sctx.rs = rs; 1801 sctx.pv = rtmp3[i+2]; 1802 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1803 if (sctx.newshift) break; 1804 pc3 = b->a + bdiag[i+2]; 1805 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1806 1807 /* Now take care of 3rd column of diagonal 4x4 block. */ 1808 pc4 = rtmp4 + i+2; 1809 if (*pc4 != 0.0) { 1810 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1811 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1812 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1813 for (j=0; j<nz; j++) { 1814 col = pj[j]; 1815 rtmp4[col] -= mul4 * rtmp3[col]; 1816 } 1817 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1818 } 1819 1820 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1821 rs = 0.0; 1822 /* L part */ 1823 pc4 = b->a + bi[i+3]; 1824 pj = b->j + bi[i+3] ; 1825 nz = bi[i+4] - bi[i+3]; 1826 for (j=0; j<nz; j++) { 1827 col = pj[j]; 1828 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1829 } 1830 /* U part */ 1831 pc4 = b->a + bdiag[i+4]+1; 1832 pj = b->j + bdiag[i+4]+1; 1833 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1834 for (j=0; j<nz; j++) { 1835 col = pj[j]; 1836 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1837 } 1838 1839 sctx.rs = rs; 1840 sctx.pv = rtmp4[i+3]; 1841 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1842 if (sctx.newshift) break; 1843 pc4 = b->a + bdiag[i+3]; 1844 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1845 break; 1846 1847 default: 1848 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1849 } 1850 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1851 i += nodesz; /* Update the row */ 1852 } 1853 1854 /* MatPivotRefine() */ 1855 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max) { 1856 /* 1857 * if no shift in this attempt & shifting & started shifting & can refine, 1858 * then try lower shift 1859 */ 1860 sctx.shift_hi = sctx.shift_fraction; 1861 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1862 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1863 sctx.newshift = PETSC_TRUE; 1864 sctx.nshift++; 1865 } 1866 } while (sctx.newshift); 1867 1868 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1869 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1870 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1871 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1872 1873 C->ops->solve = MatSolve_SeqAIJ; 1874 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1875 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1876 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1877 C->ops->matsolve = MatMatSolve_SeqAIJ; 1878 C->assembled = PETSC_TRUE; 1879 C->preallocated = PETSC_TRUE; 1880 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1881 1882 /* MatShiftView(A,info,&sctx) */ 1883 if (sctx.nshift) { 1884 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1885 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1886 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1887 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1888 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 1889 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1890 } 1891 } 1892 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1893 PetscFunctionReturn(0); 1894 } 1895 1896 #undef __FUNCT__ 1897 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1898 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1899 { 1900 Mat C = B; 1901 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1902 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1903 PetscErrorCode ierr; 1904 const PetscInt *r,*ic,*c,*ics; 1905 PetscInt n = A->rmap->n,*bi = b->i; 1906 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1907 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1908 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1909 PetscScalar mul1,mul2,mul3,tmp; 1910 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1911 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1912 PetscReal rs=0.0; 1913 FactorShiftCtx sctx; 1914 1915 PetscFunctionBegin; 1916 sctx.shift_top = 0; 1917 sctx.nshift_max = 0; 1918 sctx.shift_lo = 0; 1919 sctx.shift_hi = 0; 1920 sctx.shift_fraction = 0; 1921 1922 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1923 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1924 sctx.shift_top = 0; 1925 for (i=0; i<n; i++) { 1926 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1927 rs = 0.0; 1928 ajtmp = aj + ai[i]; 1929 rtmp1 = aa + ai[i]; 1930 nz = ai[i+1] - ai[i]; 1931 for (j=0; j<nz; j++) { 1932 if (*ajtmp != i) { 1933 rs += PetscAbsScalar(*rtmp1++); 1934 } else { 1935 rs -= PetscRealPart(*rtmp1++); 1936 } 1937 ajtmp++; 1938 } 1939 if (rs>sctx.shift_top) sctx.shift_top = rs; 1940 } 1941 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1942 sctx.shift_top *= 1.1; 1943 sctx.nshift_max = 5; 1944 sctx.shift_lo = 0.; 1945 sctx.shift_hi = 1.; 1946 } 1947 sctx.shift_amount = 0; 1948 sctx.nshift = 0; 1949 1950 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1951 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1952 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1953 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1954 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1955 ics = ic ; 1956 rtmp22 = rtmp11 + n; 1957 rtmp33 = rtmp22 + n; 1958 1959 node_max = a->inode.node_count; 1960 ns = a->inode.size; 1961 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1962 1963 /* If max inode size > 3, split it into two inodes.*/ 1964 /* also map the inode sizes according to the ordering */ 1965 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1966 for (i=0,j=0; i<node_max; ++i,++j) { 1967 if (ns[i]>3) { 1968 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1969 ++j; 1970 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1971 } else { 1972 tmp_vec1[j] = ns[i]; 1973 } 1974 } 1975 /* Use the correct node_max */ 1976 node_max = j; 1977 1978 /* Now reorder the inode info based on mat re-ordering info */ 1979 /* First create a row -> inode_size_array_index map */ 1980 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1981 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1982 for (i=0,row=0; i<node_max; i++) { 1983 nodesz = tmp_vec1[i]; 1984 for (j=0; j<nodesz; j++,row++) { 1985 nsmap[row] = i; 1986 } 1987 } 1988 /* Using nsmap, create a reordered ns structure */ 1989 for (i=0,j=0; i< node_max; i++) { 1990 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1991 tmp_vec2[i] = nodesz; 1992 j += nodesz; 1993 } 1994 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1995 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1996 /* Now use the correct ns */ 1997 ns = tmp_vec2; 1998 1999 do { 2000 sctx.newshift = PETSC_FALSE; 2001 /* Now loop over each block-row, and do the factorization */ 2002 for (i=0,row=0; i<node_max; i++) { 2003 nodesz = ns[i]; 2004 nz = bi[row+1] - bi[row]; 2005 bjtmp = bj + bi[row]; 2006 2007 switch (nodesz) { 2008 case 1: 2009 for (j=0; j<nz; j++) { 2010 idx = bjtmp[j]; 2011 rtmp11[idx] = 0.0; 2012 } 2013 2014 /* load in initial (unfactored row) */ 2015 idx = r[row]; 2016 nz_tmp = ai[idx+1] - ai[idx]; 2017 ajtmp = aj + ai[idx]; 2018 v1 = aa + ai[idx]; 2019 2020 for (j=0; j<nz_tmp; j++) { 2021 idx = ics[ajtmp[j]]; 2022 rtmp11[idx] = v1[j]; 2023 } 2024 rtmp11[ics[r[row]]] += sctx.shift_amount; 2025 2026 prow = *bjtmp++ ; 2027 while (prow < row) { 2028 pc1 = rtmp11 + prow; 2029 if (*pc1 != 0.0) { 2030 pv = ba + bd[prow]; 2031 pj = nbj + bd[prow]; 2032 mul1 = *pc1 * *pv++; 2033 *pc1 = mul1; 2034 nz_tmp = bi[prow+1] - bd[prow] - 1; 2035 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2036 for (j=0; j<nz_tmp; j++) { 2037 tmp = pv[j]; 2038 idx = pj[j]; 2039 rtmp11[idx] -= mul1 * tmp; 2040 } 2041 } 2042 prow = *bjtmp++ ; 2043 } 2044 pj = bj + bi[row]; 2045 pc1 = ba + bi[row]; 2046 2047 sctx.pv = rtmp11[row]; 2048 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2049 rs = 0.0; 2050 for (j=0; j<nz; j++) { 2051 idx = pj[j]; 2052 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2053 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2054 } 2055 sctx.rs = rs; 2056 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2057 if (sctx.newshift) goto endofwhile; 2058 break; 2059 2060 case 2: 2061 for (j=0; j<nz; j++) { 2062 idx = bjtmp[j]; 2063 rtmp11[idx] = 0.0; 2064 rtmp22[idx] = 0.0; 2065 } 2066 2067 /* load in initial (unfactored row) */ 2068 idx = r[row]; 2069 nz_tmp = ai[idx+1] - ai[idx]; 2070 ajtmp = aj + ai[idx]; 2071 v1 = aa + ai[idx]; 2072 v2 = aa + ai[idx+1]; 2073 for (j=0; j<nz_tmp; j++) { 2074 idx = ics[ajtmp[j]]; 2075 rtmp11[idx] = v1[j]; 2076 rtmp22[idx] = v2[j]; 2077 } 2078 rtmp11[ics[r[row]]] += sctx.shift_amount; 2079 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2080 2081 prow = *bjtmp++ ; 2082 while (prow < row) { 2083 pc1 = rtmp11 + prow; 2084 pc2 = rtmp22 + prow; 2085 if (*pc1 != 0.0 || *pc2 != 0.0) { 2086 pv = ba + bd[prow]; 2087 pj = nbj + bd[prow]; 2088 mul1 = *pc1 * *pv; 2089 mul2 = *pc2 * *pv; 2090 ++pv; 2091 *pc1 = mul1; 2092 *pc2 = mul2; 2093 2094 nz_tmp = bi[prow+1] - bd[prow] - 1; 2095 for (j=0; j<nz_tmp; j++) { 2096 tmp = pv[j]; 2097 idx = pj[j]; 2098 rtmp11[idx] -= mul1 * tmp; 2099 rtmp22[idx] -= mul2 * tmp; 2100 } 2101 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2102 } 2103 prow = *bjtmp++ ; 2104 } 2105 2106 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2107 pc1 = rtmp11 + prow; 2108 pc2 = rtmp22 + prow; 2109 2110 sctx.pv = *pc1; 2111 pj = bj + bi[prow]; 2112 rs = 0.0; 2113 for (j=0; j<nz; j++) { 2114 idx = pj[j]; 2115 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2116 } 2117 sctx.rs = rs; 2118 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2119 if (sctx.newshift) goto endofwhile; 2120 2121 if (*pc2 != 0.0) { 2122 pj = nbj + bd[prow]; 2123 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2124 *pc2 = mul2; 2125 nz_tmp = bi[prow+1] - bd[prow] - 1; 2126 for (j=0; j<nz_tmp; j++) { 2127 idx = pj[j] ; 2128 tmp = rtmp11[idx]; 2129 rtmp22[idx] -= mul2 * tmp; 2130 } 2131 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2132 } 2133 2134 pj = bj + bi[row]; 2135 pc1 = ba + bi[row]; 2136 pc2 = ba + bi[row+1]; 2137 2138 sctx.pv = rtmp22[row+1]; 2139 rs = 0.0; 2140 rtmp11[row] = 1.0/rtmp11[row]; 2141 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2142 /* copy row entries from dense representation to sparse */ 2143 for (j=0; j<nz; j++) { 2144 idx = pj[j]; 2145 pc1[j] = rtmp11[idx]; 2146 pc2[j] = rtmp22[idx]; 2147 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2148 } 2149 sctx.rs = rs; 2150 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2151 if (sctx.newshift) goto endofwhile; 2152 break; 2153 2154 case 3: 2155 for (j=0; j<nz; j++) { 2156 idx = bjtmp[j]; 2157 rtmp11[idx] = 0.0; 2158 rtmp22[idx] = 0.0; 2159 rtmp33[idx] = 0.0; 2160 } 2161 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2162 idx = r[row]; 2163 nz_tmp = ai[idx+1] - ai[idx]; 2164 ajtmp = aj + ai[idx]; 2165 v1 = aa + ai[idx]; 2166 v2 = aa + ai[idx+1]; 2167 v3 = aa + ai[idx+2]; 2168 for (j=0; j<nz_tmp; j++) { 2169 idx = ics[ajtmp[j]]; 2170 rtmp11[idx] = v1[j]; 2171 rtmp22[idx] = v2[j]; 2172 rtmp33[idx] = v3[j]; 2173 } 2174 rtmp11[ics[r[row]]] += sctx.shift_amount; 2175 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2176 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2177 2178 /* loop over all pivot row blocks above this row block */ 2179 prow = *bjtmp++ ; 2180 while (prow < row) { 2181 pc1 = rtmp11 + prow; 2182 pc2 = rtmp22 + prow; 2183 pc3 = rtmp33 + prow; 2184 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0) { 2185 pv = ba + bd[prow]; 2186 pj = nbj + bd[prow]; 2187 mul1 = *pc1 * *pv; 2188 mul2 = *pc2 * *pv; 2189 mul3 = *pc3 * *pv; 2190 ++pv; 2191 *pc1 = mul1; 2192 *pc2 = mul2; 2193 *pc3 = mul3; 2194 2195 nz_tmp = bi[prow+1] - bd[prow] - 1; 2196 /* update this row based on pivot row */ 2197 for (j=0; j<nz_tmp; j++) { 2198 tmp = pv[j]; 2199 idx = pj[j]; 2200 rtmp11[idx] -= mul1 * tmp; 2201 rtmp22[idx] -= mul2 * tmp; 2202 rtmp33[idx] -= mul3 * tmp; 2203 } 2204 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2205 } 2206 prow = *bjtmp++ ; 2207 } 2208 2209 /* Now take care of diagonal 3x3 block in this set of rows */ 2210 /* note: prow = row here */ 2211 pc1 = rtmp11 + prow; 2212 pc2 = rtmp22 + prow; 2213 pc3 = rtmp33 + prow; 2214 2215 sctx.pv = *pc1; 2216 pj = bj + bi[prow]; 2217 rs = 0.0; 2218 for (j=0; j<nz; j++) { 2219 idx = pj[j]; 2220 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2221 } 2222 sctx.rs = rs; 2223 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2224 if (sctx.newshift) goto endofwhile; 2225 2226 if (*pc2 != 0.0 || *pc3 != 0.0) { 2227 mul2 = (*pc2)/(*pc1); 2228 mul3 = (*pc3)/(*pc1); 2229 *pc2 = mul2; 2230 *pc3 = mul3; 2231 nz_tmp = bi[prow+1] - bd[prow] - 1; 2232 pj = nbj + bd[prow]; 2233 for (j=0; j<nz_tmp; j++) { 2234 idx = pj[j] ; 2235 tmp = rtmp11[idx]; 2236 rtmp22[idx] -= mul2 * tmp; 2237 rtmp33[idx] -= mul3 * tmp; 2238 } 2239 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2240 } 2241 ++prow; 2242 2243 pc2 = rtmp22 + prow; 2244 pc3 = rtmp33 + prow; 2245 sctx.pv = *pc2; 2246 pj = bj + bi[prow]; 2247 rs = 0.0; 2248 for (j=0; j<nz; j++) { 2249 idx = pj[j]; 2250 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2251 } 2252 sctx.rs = rs; 2253 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2254 if (sctx.newshift) goto endofwhile; 2255 2256 if (*pc3 != 0.0) { 2257 mul3 = (*pc3)/(*pc2); 2258 *pc3 = mul3; 2259 pj = nbj + bd[prow]; 2260 nz_tmp = bi[prow+1] - bd[prow] - 1; 2261 for (j=0; j<nz_tmp; j++) { 2262 idx = pj[j] ; 2263 tmp = rtmp22[idx]; 2264 rtmp33[idx] -= mul3 * tmp; 2265 } 2266 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2267 } 2268 2269 pj = bj + bi[row]; 2270 pc1 = ba + bi[row]; 2271 pc2 = ba + bi[row+1]; 2272 pc3 = ba + bi[row+2]; 2273 2274 sctx.pv = rtmp33[row+2]; 2275 rs = 0.0; 2276 rtmp11[row] = 1.0/rtmp11[row]; 2277 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2278 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2279 /* copy row entries from dense representation to sparse */ 2280 for (j=0; j<nz; j++) { 2281 idx = pj[j]; 2282 pc1[j] = rtmp11[idx]; 2283 pc2[j] = rtmp22[idx]; 2284 pc3[j] = rtmp33[idx]; 2285 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2286 } 2287 2288 sctx.rs = rs; 2289 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2290 if (sctx.newshift) goto endofwhile; 2291 break; 2292 2293 default: 2294 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2295 } 2296 row += nodesz; /* Update the row */ 2297 } 2298 endofwhile:; 2299 } while (sctx.newshift); 2300 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2301 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2302 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2303 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2304 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2305 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2306 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2307 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2308 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2309 C->assembled = PETSC_TRUE; 2310 C->preallocated = PETSC_TRUE; 2311 if (sctx.nshift) { 2312 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2313 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2314 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2315 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2316 } 2317 } 2318 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2319 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2320 PetscFunctionReturn(0); 2321 } 2322 2323 2324 /* ----------------------------------------------------------- */ 2325 #undef __FUNCT__ 2326 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2327 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2328 { 2329 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2330 IS iscol = a->col,isrow = a->row; 2331 PetscErrorCode ierr; 2332 const PetscInt *r,*c,*rout,*cout; 2333 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2334 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2335 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2336 PetscScalar sum1,sum2,sum3,sum4,sum5; 2337 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2338 const PetscScalar *b; 2339 2340 PetscFunctionBegin; 2341 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2342 node_max = a->inode.node_count; 2343 ns = a->inode.size; /* Node Size array */ 2344 2345 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2346 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2347 tmp = a->solve_work; 2348 2349 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2350 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2351 2352 /* forward solve the lower triangular */ 2353 tmps = tmp ; 2354 aa = a_a ; 2355 aj = a_j ; 2356 ad = a->diag; 2357 2358 for (i = 0,row = 0; i< node_max; ++i) { 2359 nsz = ns[i]; 2360 aii = ai[row]; 2361 v1 = aa + aii; 2362 vi = aj + aii; 2363 nz = ai[row+1]- ai[row]; 2364 2365 if (i < node_max-1) { 2366 /* Prefetch the indices for the next block */ 2367 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2368 /* Prefetch the data for the next block */ 2369 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2370 } 2371 2372 switch (nsz) { /* Each loop in 'case' is unrolled */ 2373 case 1 : 2374 sum1 = b[r[row]]; 2375 for (j=0; j<nz-1; j+=2) { 2376 i0 = vi[j]; 2377 i1 = vi[j+1]; 2378 tmp0 = tmps[i0]; 2379 tmp1 = tmps[i1]; 2380 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2381 } 2382 if (j == nz-1) { 2383 tmp0 = tmps[vi[j]]; 2384 sum1 -= v1[j]*tmp0; 2385 } 2386 tmp[row++]=sum1; 2387 break; 2388 case 2: 2389 sum1 = b[r[row]]; 2390 sum2 = b[r[row+1]]; 2391 v2 = aa + ai[row+1]; 2392 2393 for (j=0; j<nz-1; j+=2) { 2394 i0 = vi[j]; 2395 i1 = vi[j+1]; 2396 tmp0 = tmps[i0]; 2397 tmp1 = tmps[i1]; 2398 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2399 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2400 } 2401 if (j == nz-1) { 2402 tmp0 = tmps[vi[j]]; 2403 sum1 -= v1[j] *tmp0; 2404 sum2 -= v2[j] *tmp0; 2405 } 2406 sum2 -= v2[nz] * sum1; 2407 tmp[row ++]=sum1; 2408 tmp[row ++]=sum2; 2409 break; 2410 case 3: 2411 sum1 = b[r[row]]; 2412 sum2 = b[r[row+1]]; 2413 sum3 = b[r[row+2]]; 2414 v2 = aa + ai[row+1]; 2415 v3 = aa + ai[row+2]; 2416 2417 for (j=0; j<nz-1; j+=2) { 2418 i0 = vi[j]; 2419 i1 = vi[j+1]; 2420 tmp0 = tmps[i0]; 2421 tmp1 = tmps[i1]; 2422 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2423 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2424 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2425 } 2426 if (j == nz-1) { 2427 tmp0 = tmps[vi[j]]; 2428 sum1 -= v1[j] *tmp0; 2429 sum2 -= v2[j] *tmp0; 2430 sum3 -= v3[j] *tmp0; 2431 } 2432 sum2 -= v2[nz] * sum1; 2433 sum3 -= v3[nz] * sum1; 2434 sum3 -= v3[nz+1] * sum2; 2435 tmp[row ++]=sum1; 2436 tmp[row ++]=sum2; 2437 tmp[row ++]=sum3; 2438 break; 2439 2440 case 4: 2441 sum1 = b[r[row]]; 2442 sum2 = b[r[row+1]]; 2443 sum3 = b[r[row+2]]; 2444 sum4 = b[r[row+3]]; 2445 v2 = aa + ai[row+1]; 2446 v3 = aa + ai[row+2]; 2447 v4 = aa + ai[row+3]; 2448 2449 for (j=0; j<nz-1; j+=2) { 2450 i0 = vi[j]; 2451 i1 = vi[j+1]; 2452 tmp0 = tmps[i0]; 2453 tmp1 = tmps[i1]; 2454 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2455 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2456 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2457 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2458 } 2459 if (j == nz-1) { 2460 tmp0 = tmps[vi[j]]; 2461 sum1 -= v1[j] *tmp0; 2462 sum2 -= v2[j] *tmp0; 2463 sum3 -= v3[j] *tmp0; 2464 sum4 -= v4[j] *tmp0; 2465 } 2466 sum2 -= v2[nz] * sum1; 2467 sum3 -= v3[nz] * sum1; 2468 sum4 -= v4[nz] * sum1; 2469 sum3 -= v3[nz+1] * sum2; 2470 sum4 -= v4[nz+1] * sum2; 2471 sum4 -= v4[nz+2] * sum3; 2472 2473 tmp[row ++]=sum1; 2474 tmp[row ++]=sum2; 2475 tmp[row ++]=sum3; 2476 tmp[row ++]=sum4; 2477 break; 2478 case 5: 2479 sum1 = b[r[row]]; 2480 sum2 = b[r[row+1]]; 2481 sum3 = b[r[row+2]]; 2482 sum4 = b[r[row+3]]; 2483 sum5 = b[r[row+4]]; 2484 v2 = aa + ai[row+1]; 2485 v3 = aa + ai[row+2]; 2486 v4 = aa + ai[row+3]; 2487 v5 = aa + ai[row+4]; 2488 2489 for (j=0; j<nz-1; j+=2) { 2490 i0 = vi[j]; 2491 i1 = vi[j+1]; 2492 tmp0 = tmps[i0]; 2493 tmp1 = tmps[i1]; 2494 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2495 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2496 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2497 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2498 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2499 } 2500 if (j == nz-1) { 2501 tmp0 = tmps[vi[j]]; 2502 sum1 -= v1[j] *tmp0; 2503 sum2 -= v2[j] *tmp0; 2504 sum3 -= v3[j] *tmp0; 2505 sum4 -= v4[j] *tmp0; 2506 sum5 -= v5[j] *tmp0; 2507 } 2508 2509 sum2 -= v2[nz] * sum1; 2510 sum3 -= v3[nz] * sum1; 2511 sum4 -= v4[nz] * sum1; 2512 sum5 -= v5[nz] * sum1; 2513 sum3 -= v3[nz+1] * sum2; 2514 sum4 -= v4[nz+1] * sum2; 2515 sum5 -= v5[nz+1] * sum2; 2516 sum4 -= v4[nz+2] * sum3; 2517 sum5 -= v5[nz+2] * sum3; 2518 sum5 -= v5[nz+3] * sum4; 2519 2520 tmp[row ++]=sum1; 2521 tmp[row ++]=sum2; 2522 tmp[row ++]=sum3; 2523 tmp[row ++]=sum4; 2524 tmp[row ++]=sum5; 2525 break; 2526 default: 2527 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2528 } 2529 } 2530 /* backward solve the upper triangular */ 2531 for (i=node_max -1 ,row = n-1 ; i>=0; i--) { 2532 nsz = ns[i]; 2533 aii = ad[row+1] + 1; 2534 v1 = aa + aii; 2535 vi = aj + aii; 2536 nz = ad[row]- ad[row+1] - 1; 2537 2538 if (i > 0) { 2539 /* Prefetch the indices for the next block */ 2540 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2541 /* Prefetch the data for the next block */ 2542 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2543 } 2544 2545 switch (nsz) { /* Each loop in 'case' is unrolled */ 2546 case 1 : 2547 sum1 = tmp[row]; 2548 2549 for (j=0 ; j<nz-1; j+=2) { 2550 i0 = vi[j]; 2551 i1 = vi[j+1]; 2552 tmp0 = tmps[i0]; 2553 tmp1 = tmps[i1]; 2554 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2555 } 2556 if (j == nz-1) { 2557 tmp0 = tmps[vi[j]]; 2558 sum1 -= v1[j]*tmp0; 2559 } 2560 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2561 break; 2562 case 2 : 2563 sum1 = tmp[row]; 2564 sum2 = tmp[row-1]; 2565 v2 = aa + ad[row] + 1; 2566 for (j=0 ; j<nz-1; j+=2) { 2567 i0 = vi[j]; 2568 i1 = vi[j+1]; 2569 tmp0 = tmps[i0]; 2570 tmp1 = tmps[i1]; 2571 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2572 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2573 } 2574 if (j == nz-1) { 2575 tmp0 = tmps[vi[j]]; 2576 sum1 -= v1[j]* tmp0; 2577 sum2 -= v2[j+1]* tmp0; 2578 } 2579 2580 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2581 sum2 -= v2[0] * tmp0; 2582 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2583 break; 2584 case 3 : 2585 sum1 = tmp[row]; 2586 sum2 = tmp[row -1]; 2587 sum3 = tmp[row -2]; 2588 v2 = aa + ad[row] + 1; 2589 v3 = aa + ad[row -1] + 1; 2590 for (j=0 ; j<nz-1; j+=2) { 2591 i0 = vi[j]; 2592 i1 = vi[j+1]; 2593 tmp0 = tmps[i0]; 2594 tmp1 = tmps[i1]; 2595 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2596 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2597 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2598 } 2599 if (j== nz-1) { 2600 tmp0 = tmps[vi[j]]; 2601 sum1 -= v1[j] * tmp0; 2602 sum2 -= v2[j+1] * tmp0; 2603 sum3 -= v3[j+2] * tmp0; 2604 } 2605 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2606 sum2 -= v2[0]* tmp0; 2607 sum3 -= v3[1] * tmp0; 2608 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2609 sum3 -= v3[0]* tmp0; 2610 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2611 2612 break; 2613 case 4 : 2614 sum1 = tmp[row]; 2615 sum2 = tmp[row -1]; 2616 sum3 = tmp[row -2]; 2617 sum4 = tmp[row -3]; 2618 v2 = aa + ad[row]+1; 2619 v3 = aa + ad[row -1]+1; 2620 v4 = aa + ad[row -2]+1; 2621 2622 for (j=0 ; j<nz-1; j+=2) { 2623 i0 = vi[j]; 2624 i1 = vi[j+1]; 2625 tmp0 = tmps[i0]; 2626 tmp1 = tmps[i1]; 2627 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2628 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2629 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2630 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2631 } 2632 if (j== nz-1) { 2633 tmp0 = tmps[vi[j]]; 2634 sum1 -= v1[j] * tmp0; 2635 sum2 -= v2[j+1] * tmp0; 2636 sum3 -= v3[j+2] * tmp0; 2637 sum4 -= v4[j+3] * tmp0; 2638 } 2639 2640 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2641 sum2 -= v2[0] * tmp0; 2642 sum3 -= v3[1] * tmp0; 2643 sum4 -= v4[2] * tmp0; 2644 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2645 sum3 -= v3[0] * tmp0; 2646 sum4 -= v4[1] * tmp0; 2647 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2648 sum4 -= v4[0] * tmp0; 2649 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2650 break; 2651 case 5 : 2652 sum1 = tmp[row]; 2653 sum2 = tmp[row -1]; 2654 sum3 = tmp[row -2]; 2655 sum4 = tmp[row -3]; 2656 sum5 = tmp[row -4]; 2657 v2 = aa + ad[row]+1; 2658 v3 = aa + ad[row -1]+1; 2659 v4 = aa + ad[row -2]+1; 2660 v5 = aa + ad[row -3]+1; 2661 for (j=0 ; j<nz-1; j+=2) { 2662 i0 = vi[j]; 2663 i1 = vi[j+1]; 2664 tmp0 = tmps[i0]; 2665 tmp1 = tmps[i1]; 2666 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2667 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2668 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2669 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2670 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2671 } 2672 if (j==nz-1) { 2673 tmp0 = tmps[vi[j]]; 2674 sum1 -= v1[j] * tmp0; 2675 sum2 -= v2[j+1] * tmp0; 2676 sum3 -= v3[j+2] * tmp0; 2677 sum4 -= v4[j+3] * tmp0; 2678 sum5 -= v5[j+4] * tmp0; 2679 } 2680 2681 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2682 sum2 -= v2[0] * tmp0; 2683 sum3 -= v3[1] * tmp0; 2684 sum4 -= v4[2] * tmp0; 2685 sum5 -= v5[3] * tmp0; 2686 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2687 sum3 -= v3[0] * tmp0; 2688 sum4 -= v4[1] * tmp0; 2689 sum5 -= v5[2] * tmp0; 2690 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2691 sum4 -= v4[0] * tmp0; 2692 sum5 -= v5[1] * tmp0; 2693 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2694 sum5 -= v5[0] * tmp0; 2695 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2696 break; 2697 default: 2698 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2699 } 2700 } 2701 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2702 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2703 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2704 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2705 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2706 PetscFunctionReturn(0); 2707 } 2708 2709 2710 /* 2711 Makes a longer coloring[] array and calls the usual code with that 2712 */ 2713 #undef __FUNCT__ 2714 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2715 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2716 { 2717 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2718 PetscErrorCode ierr; 2719 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2720 PetscInt *colorused,i; 2721 ISColoringValue *newcolor; 2722 2723 PetscFunctionBegin; 2724 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2725 /* loop over inodes, marking a color for each column*/ 2726 row = 0; 2727 for (i=0; i<m; i++) { 2728 for (j=0; j<ns[i]; j++) { 2729 newcolor[row++] = coloring[i] + j*ncolors; 2730 } 2731 } 2732 2733 /* eliminate unneeded colors */ 2734 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2735 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2736 for (i=0; i<n; i++) { 2737 colorused[newcolor[i]] = 1; 2738 } 2739 2740 for (i=1; i<5*ncolors; i++) { 2741 colorused[i] += colorused[i-1]; 2742 } 2743 ncolors = colorused[5*ncolors-1]; 2744 for (i=0; i<n; i++) { 2745 newcolor[i] = colorused[newcolor[i]]-1; 2746 } 2747 ierr = PetscFree(colorused);CHKERRQ(ierr); 2748 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2749 ierr = PetscFree(coloring);CHKERRQ(ierr); 2750 PetscFunctionReturn(0); 2751 } 2752 2753 #include <../src/mat/blockinvert.h> 2754 2755 #undef __FUNCT__ 2756 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2757 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2758 { 2759 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2760 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2761 MatScalar *ibdiag,*bdiag,work[25],*t; 2762 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2763 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2764 const PetscScalar *xb, *b; 2765 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2766 PetscErrorCode ierr; 2767 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2768 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2769 2770 PetscFunctionBegin; 2771 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2772 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2773 2774 if (!a->inode.ibdiagvalid) { 2775 if (!a->inode.ibdiag) { 2776 /* calculate space needed for diagonal blocks */ 2777 for (i=0; i<m; i++) { 2778 cnt += sizes[i]*sizes[i]; 2779 } 2780 a->inode.bdiagsize = cnt; 2781 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2782 } 2783 2784 /* copy over the diagonal blocks and invert them */ 2785 ibdiag = a->inode.ibdiag; 2786 bdiag = a->inode.bdiag; 2787 cnt = 0; 2788 for (i=0, row = 0; i<m; i++) { 2789 for (j=0; j<sizes[i]; j++) { 2790 for (k=0; k<sizes[i]; k++) { 2791 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2792 } 2793 } 2794 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2795 2796 switch (sizes[i]) { 2797 case 1: 2798 /* Create matrix data structure */ 2799 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2800 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2801 break; 2802 case 2: 2803 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2804 break; 2805 case 3: 2806 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2807 break; 2808 case 4: 2809 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2810 break; 2811 case 5: 2812 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2813 break; 2814 default: 2815 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2816 } 2817 cnt += sizes[i]*sizes[i]; 2818 row += sizes[i]; 2819 } 2820 a->inode.ibdiagvalid = PETSC_TRUE; 2821 } 2822 ibdiag = a->inode.ibdiag; 2823 bdiag = a->inode.bdiag; 2824 t = a->inode.ssor_work; 2825 2826 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2827 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2828 CHKMEMQ; 2829 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2830 if (flag & SOR_ZERO_INITIAL_GUESS) { 2831 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 2832 2833 for (i=0, row=0; i<m; i++) { 2834 sz = diag[row] - ii[row]; 2835 v1 = a->a + ii[row]; 2836 idx = a->j + ii[row]; 2837 2838 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2839 switch (sizes[i]) { 2840 case 1: 2841 2842 sum1 = b[row]; 2843 for (n = 0; n<sz-1; n+=2) { 2844 i1 = idx[0]; 2845 i2 = idx[1]; 2846 idx += 2; 2847 tmp0 = x[i1]; 2848 tmp1 = x[i2]; 2849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2850 } 2851 2852 if (n == sz-1) { 2853 tmp0 = x[*idx]; 2854 sum1 -= *v1 * tmp0; 2855 } 2856 t[row] = sum1; 2857 x[row++] = sum1*(*ibdiag++); 2858 break; 2859 case 2: 2860 v2 = a->a + ii[row+1]; 2861 sum1 = b[row]; 2862 sum2 = b[row+1]; 2863 for (n = 0; n<sz-1; n+=2) { 2864 i1 = idx[0]; 2865 i2 = idx[1]; 2866 idx += 2; 2867 tmp0 = x[i1]; 2868 tmp1 = x[i2]; 2869 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2870 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2871 } 2872 2873 if (n == sz-1) { 2874 tmp0 = x[*idx]; 2875 sum1 -= v1[0] * tmp0; 2876 sum2 -= v2[0] * tmp0; 2877 } 2878 t[row] = sum1; 2879 t[row+1] = sum2; 2880 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2881 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2882 ibdiag += 4; 2883 break; 2884 case 3: 2885 v2 = a->a + ii[row+1]; 2886 v3 = a->a + ii[row+2]; 2887 sum1 = b[row]; 2888 sum2 = b[row+1]; 2889 sum3 = b[row+2]; 2890 for (n = 0; n<sz-1; n+=2) { 2891 i1 = idx[0]; 2892 i2 = idx[1]; 2893 idx += 2; 2894 tmp0 = x[i1]; 2895 tmp1 = x[i2]; 2896 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2897 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2898 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2899 } 2900 2901 if (n == sz-1) { 2902 tmp0 = x[*idx]; 2903 sum1 -= v1[0] * tmp0; 2904 sum2 -= v2[0] * tmp0; 2905 sum3 -= v3[0] * tmp0; 2906 } 2907 t[row] = sum1; 2908 t[row+1] = sum2; 2909 t[row+2] = sum3; 2910 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2911 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2912 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2913 ibdiag += 9; 2914 break; 2915 case 4: 2916 v2 = a->a + ii[row+1]; 2917 v3 = a->a + ii[row+2]; 2918 v4 = a->a + ii[row+3]; 2919 sum1 = b[row]; 2920 sum2 = b[row+1]; 2921 sum3 = b[row+2]; 2922 sum4 = b[row+3]; 2923 for (n = 0; n<sz-1; n+=2) { 2924 i1 = idx[0]; 2925 i2 = idx[1]; 2926 idx += 2; 2927 tmp0 = x[i1]; 2928 tmp1 = x[i2]; 2929 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2930 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2931 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2932 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2933 } 2934 2935 if (n == sz-1) { 2936 tmp0 = x[*idx]; 2937 sum1 -= v1[0] * tmp0; 2938 sum2 -= v2[0] * tmp0; 2939 sum3 -= v3[0] * tmp0; 2940 sum4 -= v4[0] * tmp0; 2941 } 2942 t[row] = sum1; 2943 t[row+1] = sum2; 2944 t[row+2] = sum3; 2945 t[row+3] = sum4; 2946 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2947 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2948 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2949 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2950 ibdiag += 16; 2951 break; 2952 case 5: 2953 v2 = a->a + ii[row+1]; 2954 v3 = a->a + ii[row+2]; 2955 v4 = a->a + ii[row+3]; 2956 v5 = a->a + ii[row+4]; 2957 sum1 = b[row]; 2958 sum2 = b[row+1]; 2959 sum3 = b[row+2]; 2960 sum4 = b[row+3]; 2961 sum5 = b[row+4]; 2962 for (n = 0; n<sz-1; n+=2) { 2963 i1 = idx[0]; 2964 i2 = idx[1]; 2965 idx += 2; 2966 tmp0 = x[i1]; 2967 tmp1 = x[i2]; 2968 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2969 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2970 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2971 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2972 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2973 } 2974 2975 if (n == sz-1) { 2976 tmp0 = x[*idx]; 2977 sum1 -= v1[0] * tmp0; 2978 sum2 -= v2[0] * tmp0; 2979 sum3 -= v3[0] * tmp0; 2980 sum4 -= v4[0] * tmp0; 2981 sum5 -= v5[0] * tmp0; 2982 } 2983 t[row] = sum1; 2984 t[row+1] = sum2; 2985 t[row+2] = sum3; 2986 t[row+3] = sum4; 2987 t[row+4] = sum5; 2988 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2989 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2990 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2991 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2992 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2993 ibdiag += 25; 2994 break; 2995 default: 2996 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2997 } 2998 } 2999 3000 xb = t; 3001 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3002 } else xb = b; 3003 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3004 3005 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3006 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3007 ibdiag -= sizes[i]*sizes[i]; 3008 sz = ii[row+1] - diag[row] - 1; 3009 v1 = a->a + diag[row] + 1; 3010 idx = a->j + diag[row] + 1; 3011 3012 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3013 switch (sizes[i]) { 3014 case 1: 3015 3016 sum1 = xb[row]; 3017 for (n = 0; n<sz-1; n+=2) { 3018 i1 = idx[0]; 3019 i2 = idx[1]; 3020 idx += 2; 3021 tmp0 = x[i1]; 3022 tmp1 = x[i2]; 3023 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3024 } 3025 3026 if (n == sz-1) { 3027 tmp0 = x[*idx]; 3028 sum1 -= *v1*tmp0; 3029 } 3030 x[row--] = sum1*(*ibdiag); 3031 break; 3032 3033 case 2: 3034 3035 sum1 = xb[row]; 3036 sum2 = xb[row-1]; 3037 /* note that sum1 is associated with the second of the two rows */ 3038 v2 = a->a + diag[row-1] + 2; 3039 for (n = 0; n<sz-1; n+=2) { 3040 i1 = idx[0]; 3041 i2 = idx[1]; 3042 idx += 2; 3043 tmp0 = x[i1]; 3044 tmp1 = x[i2]; 3045 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3046 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3047 } 3048 3049 if (n == sz-1) { 3050 tmp0 = x[*idx]; 3051 sum1 -= *v1*tmp0; 3052 sum2 -= *v2*tmp0; 3053 } 3054 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3055 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3056 break; 3057 case 3: 3058 3059 sum1 = xb[row]; 3060 sum2 = xb[row-1]; 3061 sum3 = xb[row-2]; 3062 v2 = a->a + diag[row-1] + 2; 3063 v3 = a->a + diag[row-2] + 3; 3064 for (n = 0; n<sz-1; n+=2) { 3065 i1 = idx[0]; 3066 i2 = idx[1]; 3067 idx += 2; 3068 tmp0 = x[i1]; 3069 tmp1 = x[i2]; 3070 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3071 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3072 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3073 } 3074 3075 if (n == sz-1) { 3076 tmp0 = x[*idx]; 3077 sum1 -= *v1*tmp0; 3078 sum2 -= *v2*tmp0; 3079 sum3 -= *v3*tmp0; 3080 } 3081 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3082 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3083 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3084 break; 3085 case 4: 3086 3087 sum1 = xb[row]; 3088 sum2 = xb[row-1]; 3089 sum3 = xb[row-2]; 3090 sum4 = xb[row-3]; 3091 v2 = a->a + diag[row-1] + 2; 3092 v3 = a->a + diag[row-2] + 3; 3093 v4 = a->a + diag[row-3] + 4; 3094 for (n = 0; n<sz-1; n+=2) { 3095 i1 = idx[0]; 3096 i2 = idx[1]; 3097 idx += 2; 3098 tmp0 = x[i1]; 3099 tmp1 = x[i2]; 3100 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3101 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3102 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3103 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3104 } 3105 3106 if (n == sz-1) { 3107 tmp0 = x[*idx]; 3108 sum1 -= *v1*tmp0; 3109 sum2 -= *v2*tmp0; 3110 sum3 -= *v3*tmp0; 3111 sum4 -= *v4*tmp0; 3112 } 3113 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3114 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3115 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3116 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3117 break; 3118 case 5: 3119 3120 sum1 = xb[row]; 3121 sum2 = xb[row-1]; 3122 sum3 = xb[row-2]; 3123 sum4 = xb[row-3]; 3124 sum5 = xb[row-4]; 3125 v2 = a->a + diag[row-1] + 2; 3126 v3 = a->a + diag[row-2] + 3; 3127 v4 = a->a + diag[row-3] + 4; 3128 v5 = a->a + diag[row-4] + 5; 3129 for (n = 0; n<sz-1; n+=2) { 3130 i1 = idx[0]; 3131 i2 = idx[1]; 3132 idx += 2; 3133 tmp0 = x[i1]; 3134 tmp1 = x[i2]; 3135 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3136 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3137 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3138 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3139 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3140 } 3141 3142 if (n == sz-1) { 3143 tmp0 = x[*idx]; 3144 sum1 -= *v1*tmp0; 3145 sum2 -= *v2*tmp0; 3146 sum3 -= *v3*tmp0; 3147 sum4 -= *v4*tmp0; 3148 sum5 -= *v5*tmp0; 3149 } 3150 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3151 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3152 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3153 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3154 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3155 break; 3156 default: 3157 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3158 } 3159 } 3160 3161 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3162 } 3163 its--; 3164 } 3165 while (its--) { 3166 3167 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) { 3168 ibdiag = a->inode.ibdiag; 3169 3170 for (i=0, row=0; i<m; i++) { 3171 sz = ii[row + 1] - ii[row]; 3172 v1 = a->a + ii[row]; 3173 idx = a->j + ii[row]; 3174 3175 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3176 switch (sizes[i]) { 3177 case 1: 3178 3179 sum1 = b[row]; 3180 for (n = 0; n<sz-1; n+=2) { 3181 i1 = idx[0]; 3182 i2 = idx[1]; 3183 idx += 2; 3184 tmp0 = x[i1]; 3185 tmp1 = x[i2]; 3186 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3187 } 3188 3189 if (n == sz-1) { 3190 tmp0 = x[*idx]; 3191 sum1 -= *v1 * tmp0; 3192 } 3193 3194 /* in MatSOR_SeqAIJ this line would be 3195 * 3196 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3197 * 3198 * but omega == 1, so this becomes 3199 * 3200 * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++); 3201 * 3202 * but bdiag and ibdiag cancel each other, so we can change this 3203 * to adding sum1*(*ibdiag++). We can skip bdiag for the larger 3204 * block sizes as well 3205 */ 3206 x[row++] += sum1*(*ibdiag++); 3207 break; 3208 case 2: 3209 v2 = a->a + ii[row+1]; 3210 sum1 = b[row]; 3211 sum2 = b[row+1]; 3212 for (n = 0; n<sz-1; n+=2) { 3213 i1 = idx[0]; 3214 i2 = idx[1]; 3215 idx += 2; 3216 tmp0 = x[i1]; 3217 tmp1 = x[i2]; 3218 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3219 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3220 } 3221 3222 if (n == sz-1) { 3223 tmp0 = x[*idx]; 3224 sum1 -= v1[0] * tmp0; 3225 sum2 -= v2[0] * tmp0; 3226 } 3227 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2]; 3228 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3]; 3229 ibdiag += 4; 3230 break; 3231 case 3: 3232 v2 = a->a + ii[row+1]; 3233 v3 = a->a + ii[row+2]; 3234 sum1 = b[row]; 3235 sum2 = b[row+1]; 3236 sum3 = b[row+2]; 3237 for (n = 0; n<sz-1; n+=2) { 3238 i1 = idx[0]; 3239 i2 = idx[1]; 3240 idx += 2; 3241 tmp0 = x[i1]; 3242 tmp1 = x[i2]; 3243 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3244 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3245 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3246 } 3247 3248 if (n == sz-1) { 3249 tmp0 = x[*idx]; 3250 sum1 -= v1[0] * tmp0; 3251 sum2 -= v2[0] * tmp0; 3252 sum3 -= v3[0] * tmp0; 3253 } 3254 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3255 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3256 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3257 ibdiag += 9; 3258 break; 3259 case 4: 3260 v2 = a->a + ii[row+1]; 3261 v3 = a->a + ii[row+2]; 3262 v4 = a->a + ii[row+3]; 3263 sum1 = b[row]; 3264 sum2 = b[row+1]; 3265 sum3 = b[row+2]; 3266 sum4 = b[row+3]; 3267 for (n = 0; n<sz-1; n+=2) { 3268 i1 = idx[0]; 3269 i2 = idx[1]; 3270 idx += 2; 3271 tmp0 = x[i1]; 3272 tmp1 = x[i2]; 3273 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3274 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3275 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3276 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3277 } 3278 3279 if (n == sz-1) { 3280 tmp0 = x[*idx]; 3281 sum1 -= v1[0] * tmp0; 3282 sum2 -= v2[0] * tmp0; 3283 sum3 -= v3[0] * tmp0; 3284 sum4 -= v4[0] * tmp0; 3285 } 3286 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3287 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3288 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3289 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3290 ibdiag += 16; 3291 break; 3292 case 5: 3293 v2 = a->a + ii[row+1]; 3294 v3 = a->a + ii[row+2]; 3295 v4 = a->a + ii[row+3]; 3296 v5 = a->a + ii[row+4]; 3297 sum1 = b[row]; 3298 sum2 = b[row+1]; 3299 sum3 = b[row+2]; 3300 sum4 = b[row+3]; 3301 sum5 = b[row+4]; 3302 for (n = 0; n<sz-1; n+=2) { 3303 i1 = idx[0]; 3304 i2 = idx[1]; 3305 idx += 2; 3306 tmp0 = x[i1]; 3307 tmp1 = x[i2]; 3308 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3309 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3310 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3311 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3312 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3313 } 3314 3315 if (n == sz-1) { 3316 tmp0 = x[*idx]; 3317 sum1 -= v1[0] * tmp0; 3318 sum2 -= v2[0] * tmp0; 3319 sum3 -= v3[0] * tmp0; 3320 sum4 -= v4[0] * tmp0; 3321 sum5 -= v5[0] * tmp0; 3322 } 3323 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3324 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3325 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3326 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3327 x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3328 ibdiag += 25; 3329 break; 3330 default: 3331 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3332 } 3333 } 3334 3335 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3336 } 3337 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP) { 3338 3339 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3340 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3341 ibdiag -= sizes[i]*sizes[i]; 3342 sz = ii[row+1] - ii[row]; 3343 v1 = a->a + ii[row]; 3344 idx = a->j + ii[row]; 3345 3346 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3347 switch (sizes[i]) { 3348 case 1: 3349 3350 sum1 = b[row]; 3351 for (n = 0; n<sz-1; n+=2) { 3352 i1 = idx[0]; 3353 i2 = idx[1]; 3354 idx += 2; 3355 tmp0 = x[i1]; 3356 tmp1 = x[i2]; 3357 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3358 } 3359 3360 if (n == sz-1) { 3361 tmp0 = x[*idx]; 3362 sum1 -= *v1*tmp0; 3363 } 3364 x[row--] += sum1*(*ibdiag); 3365 break; 3366 3367 case 2: 3368 3369 sum1 = b[row]; 3370 sum2 = b[row-1]; 3371 /* note that sum1 is associated with the second of the two rows */ 3372 v2 = a->a + ii[row - 1]; 3373 for (n = 0; n<sz-1; n+=2) { 3374 i1 = idx[0]; 3375 i2 = idx[1]; 3376 idx += 2; 3377 tmp0 = x[i1]; 3378 tmp1 = x[i2]; 3379 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3380 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3381 } 3382 3383 if (n == sz-1) { 3384 tmp0 = x[*idx]; 3385 sum1 -= *v1*tmp0; 3386 sum2 -= *v2*tmp0; 3387 } 3388 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3389 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3390 break; 3391 case 3: 3392 3393 sum1 = b[row]; 3394 sum2 = b[row-1]; 3395 sum3 = b[row-2]; 3396 v2 = a->a + ii[row-1]; 3397 v3 = a->a + ii[row-2]; 3398 for (n = 0; n<sz-1; n+=2) { 3399 i1 = idx[0]; 3400 i2 = idx[1]; 3401 idx += 2; 3402 tmp0 = x[i1]; 3403 tmp1 = x[i2]; 3404 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3405 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3406 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3407 } 3408 3409 if (n == sz-1) { 3410 tmp0 = x[*idx]; 3411 sum1 -= *v1*tmp0; 3412 sum2 -= *v2*tmp0; 3413 sum3 -= *v3*tmp0; 3414 } 3415 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3416 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3417 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3418 break; 3419 case 4: 3420 3421 sum1 = b[row]; 3422 sum2 = b[row-1]; 3423 sum3 = b[row-2]; 3424 sum4 = b[row-3]; 3425 v2 = a->a + ii[row-1]; 3426 v3 = a->a + ii[row-2]; 3427 v4 = a->a + ii[row-3]; 3428 for (n = 0; n<sz-1; n+=2) { 3429 i1 = idx[0]; 3430 i2 = idx[1]; 3431 idx += 2; 3432 tmp0 = x[i1]; 3433 tmp1 = x[i2]; 3434 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3435 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3436 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3437 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3438 } 3439 3440 if (n == sz-1) { 3441 tmp0 = x[*idx]; 3442 sum1 -= *v1*tmp0; 3443 sum2 -= *v2*tmp0; 3444 sum3 -= *v3*tmp0; 3445 sum4 -= *v4*tmp0; 3446 } 3447 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3448 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3449 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3450 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3451 break; 3452 case 5: 3453 3454 sum1 = b[row]; 3455 sum2 = b[row-1]; 3456 sum3 = b[row-2]; 3457 sum4 = b[row-3]; 3458 sum5 = b[row-4]; 3459 v2 = a->a + ii[row-1]; 3460 v3 = a->a + ii[row-2]; 3461 v4 = a->a + ii[row-3]; 3462 v5 = a->a + ii[row-4]; 3463 for (n = 0; n<sz-1; n+=2) { 3464 i1 = idx[0]; 3465 i2 = idx[1]; 3466 idx += 2; 3467 tmp0 = x[i1]; 3468 tmp1 = x[i2]; 3469 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3470 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3471 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3472 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3473 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3474 } 3475 3476 if (n == sz-1) { 3477 tmp0 = x[*idx]; 3478 sum1 -= *v1*tmp0; 3479 sum2 -= *v2*tmp0; 3480 sum3 -= *v3*tmp0; 3481 sum4 -= *v4*tmp0; 3482 sum5 -= *v5*tmp0; 3483 } 3484 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3485 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3486 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3487 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3488 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3489 break; 3490 default: 3491 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3492 } 3493 } 3494 3495 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3496 } 3497 } 3498 if (flag & SOR_EISENSTAT) { 3499 /* 3500 Apply (U + D)^-1 where D is now the block diagonal 3501 */ 3502 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3503 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3504 ibdiag -= sizes[i]*sizes[i]; 3505 sz = ii[row+1] - diag[row] - 1; 3506 v1 = a->a + diag[row] + 1; 3507 idx = a->j + diag[row] + 1; 3508 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3509 switch (sizes[i]) { 3510 case 1: 3511 3512 sum1 = b[row]; 3513 for (n = 0; n<sz-1; n+=2) { 3514 i1 = idx[0]; 3515 i2 = idx[1]; 3516 idx += 2; 3517 tmp0 = x[i1]; 3518 tmp1 = x[i2]; 3519 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3520 } 3521 3522 if (n == sz-1) { 3523 tmp0 = x[*idx]; 3524 sum1 -= *v1*tmp0; 3525 } 3526 x[row] = sum1*(*ibdiag);row--; 3527 break; 3528 3529 case 2: 3530 3531 sum1 = b[row]; 3532 sum2 = b[row-1]; 3533 /* note that sum1 is associated with the second of the two rows */ 3534 v2 = a->a + diag[row-1] + 2; 3535 for (n = 0; n<sz-1; n+=2) { 3536 i1 = idx[0]; 3537 i2 = idx[1]; 3538 idx += 2; 3539 tmp0 = x[i1]; 3540 tmp1 = x[i2]; 3541 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3542 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3543 } 3544 3545 if (n == sz-1) { 3546 tmp0 = x[*idx]; 3547 sum1 -= *v1*tmp0; 3548 sum2 -= *v2*tmp0; 3549 } 3550 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3551 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3552 row -= 2; 3553 break; 3554 case 3: 3555 3556 sum1 = b[row]; 3557 sum2 = b[row-1]; 3558 sum3 = b[row-2]; 3559 v2 = a->a + diag[row-1] + 2; 3560 v3 = a->a + diag[row-2] + 3; 3561 for (n = 0; n<sz-1; n+=2) { 3562 i1 = idx[0]; 3563 i2 = idx[1]; 3564 idx += 2; 3565 tmp0 = x[i1]; 3566 tmp1 = x[i2]; 3567 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3568 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3569 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3570 } 3571 3572 if (n == sz-1) { 3573 tmp0 = x[*idx]; 3574 sum1 -= *v1*tmp0; 3575 sum2 -= *v2*tmp0; 3576 sum3 -= *v3*tmp0; 3577 } 3578 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3579 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3580 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3581 row -= 3; 3582 break; 3583 case 4: 3584 3585 sum1 = b[row]; 3586 sum2 = b[row-1]; 3587 sum3 = b[row-2]; 3588 sum4 = b[row-3]; 3589 v2 = a->a + diag[row-1] + 2; 3590 v3 = a->a + diag[row-2] + 3; 3591 v4 = a->a + diag[row-3] + 4; 3592 for (n = 0; n<sz-1; n+=2) { 3593 i1 = idx[0]; 3594 i2 = idx[1]; 3595 idx += 2; 3596 tmp0 = x[i1]; 3597 tmp1 = x[i2]; 3598 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3599 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3600 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3601 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3602 } 3603 3604 if (n == sz-1) { 3605 tmp0 = x[*idx]; 3606 sum1 -= *v1*tmp0; 3607 sum2 -= *v2*tmp0; 3608 sum3 -= *v3*tmp0; 3609 sum4 -= *v4*tmp0; 3610 } 3611 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3612 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3613 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3614 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3615 row -= 4; 3616 break; 3617 case 5: 3618 3619 sum1 = b[row]; 3620 sum2 = b[row-1]; 3621 sum3 = b[row-2]; 3622 sum4 = b[row-3]; 3623 sum5 = b[row-4]; 3624 v2 = a->a + diag[row-1] + 2; 3625 v3 = a->a + diag[row-2] + 3; 3626 v4 = a->a + diag[row-3] + 4; 3627 v5 = a->a + diag[row-4] + 5; 3628 for (n = 0; n<sz-1; n+=2) { 3629 i1 = idx[0]; 3630 i2 = idx[1]; 3631 idx += 2; 3632 tmp0 = x[i1]; 3633 tmp1 = x[i2]; 3634 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3635 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3636 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3637 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3638 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3639 } 3640 3641 if (n == sz-1) { 3642 tmp0 = x[*idx]; 3643 sum1 -= *v1*tmp0; 3644 sum2 -= *v2*tmp0; 3645 sum3 -= *v3*tmp0; 3646 sum4 -= *v4*tmp0; 3647 sum5 -= *v5*tmp0; 3648 } 3649 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3650 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3651 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3652 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3653 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3654 row -= 5; 3655 break; 3656 default: 3657 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3658 } 3659 } 3660 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3661 3662 /* 3663 t = b - D x where D is the block diagonal 3664 */ 3665 cnt = 0; 3666 for (i=0, row=0; i<m; i++) { 3667 switch (sizes[i]) { 3668 case 1: 3669 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3670 break; 3671 case 2: 3672 x1 = x[row]; x2 = x[row+1]; 3673 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3674 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3675 t[row] = b[row] - tmp1; 3676 t[row+1] = b[row+1] - tmp2; row += 2; 3677 cnt += 4; 3678 break; 3679 case 3: 3680 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3681 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3682 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3683 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3684 t[row] = b[row] - tmp1; 3685 t[row+1] = b[row+1] - tmp2; 3686 t[row+2] = b[row+2] - tmp3; row += 3; 3687 cnt += 9; 3688 break; 3689 case 4: 3690 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3691 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3692 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3693 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3694 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3695 t[row] = b[row] - tmp1; 3696 t[row+1] = b[row+1] - tmp2; 3697 t[row+2] = b[row+2] - tmp3; 3698 t[row+3] = b[row+3] - tmp4; row += 4; 3699 cnt += 16; 3700 break; 3701 case 5: 3702 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3703 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3704 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3705 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3706 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3707 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3708 t[row] = b[row] - tmp1; 3709 t[row+1] = b[row+1] - tmp2; 3710 t[row+2] = b[row+2] - tmp3; 3711 t[row+3] = b[row+3] - tmp4; 3712 t[row+4] = b[row+4] - tmp5;row += 5; 3713 cnt += 25; 3714 break; 3715 default: 3716 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3717 } 3718 } 3719 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3720 3721 3722 3723 /* 3724 Apply (L + D)^-1 where D is the block diagonal 3725 */ 3726 for (i=0, row=0; i<m; i++) { 3727 sz = diag[row] - ii[row]; 3728 v1 = a->a + ii[row]; 3729 idx = a->j + ii[row]; 3730 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3731 switch (sizes[i]) { 3732 case 1: 3733 3734 sum1 = t[row]; 3735 for (n = 0; n<sz-1; n+=2) { 3736 i1 = idx[0]; 3737 i2 = idx[1]; 3738 idx += 2; 3739 tmp0 = t[i1]; 3740 tmp1 = t[i2]; 3741 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3742 } 3743 3744 if (n == sz-1) { 3745 tmp0 = t[*idx]; 3746 sum1 -= *v1 * tmp0; 3747 } 3748 x[row] += t[row] = sum1*(*ibdiag++); row++; 3749 break; 3750 case 2: 3751 v2 = a->a + ii[row+1]; 3752 sum1 = t[row]; 3753 sum2 = t[row+1]; 3754 for (n = 0; n<sz-1; n+=2) { 3755 i1 = idx[0]; 3756 i2 = idx[1]; 3757 idx += 2; 3758 tmp0 = t[i1]; 3759 tmp1 = t[i2]; 3760 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3761 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3762 } 3763 3764 if (n == sz-1) { 3765 tmp0 = t[*idx]; 3766 sum1 -= v1[0] * tmp0; 3767 sum2 -= v2[0] * tmp0; 3768 } 3769 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3770 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3771 ibdiag += 4; row += 2; 3772 break; 3773 case 3: 3774 v2 = a->a + ii[row+1]; 3775 v3 = a->a + ii[row+2]; 3776 sum1 = t[row]; 3777 sum2 = t[row+1]; 3778 sum3 = t[row+2]; 3779 for (n = 0; n<sz-1; n+=2) { 3780 i1 = idx[0]; 3781 i2 = idx[1]; 3782 idx += 2; 3783 tmp0 = t[i1]; 3784 tmp1 = t[i2]; 3785 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3786 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3787 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3788 } 3789 3790 if (n == sz-1) { 3791 tmp0 = t[*idx]; 3792 sum1 -= v1[0] * tmp0; 3793 sum2 -= v2[0] * tmp0; 3794 sum3 -= v3[0] * tmp0; 3795 } 3796 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3797 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3798 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3799 ibdiag += 9; row += 3; 3800 break; 3801 case 4: 3802 v2 = a->a + ii[row+1]; 3803 v3 = a->a + ii[row+2]; 3804 v4 = a->a + ii[row+3]; 3805 sum1 = t[row]; 3806 sum2 = t[row+1]; 3807 sum3 = t[row+2]; 3808 sum4 = t[row+3]; 3809 for (n = 0; n<sz-1; n+=2) { 3810 i1 = idx[0]; 3811 i2 = idx[1]; 3812 idx += 2; 3813 tmp0 = t[i1]; 3814 tmp1 = t[i2]; 3815 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3816 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3817 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3818 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3819 } 3820 3821 if (n == sz-1) { 3822 tmp0 = t[*idx]; 3823 sum1 -= v1[0] * tmp0; 3824 sum2 -= v2[0] * tmp0; 3825 sum3 -= v3[0] * tmp0; 3826 sum4 -= v4[0] * tmp0; 3827 } 3828 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3829 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3830 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3831 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3832 ibdiag += 16; row += 4; 3833 break; 3834 case 5: 3835 v2 = a->a + ii[row+1]; 3836 v3 = a->a + ii[row+2]; 3837 v4 = a->a + ii[row+3]; 3838 v5 = a->a + ii[row+4]; 3839 sum1 = t[row]; 3840 sum2 = t[row+1]; 3841 sum3 = t[row+2]; 3842 sum4 = t[row+3]; 3843 sum5 = t[row+4]; 3844 for (n = 0; n<sz-1; n+=2) { 3845 i1 = idx[0]; 3846 i2 = idx[1]; 3847 idx += 2; 3848 tmp0 = t[i1]; 3849 tmp1 = t[i2]; 3850 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3851 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3852 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3853 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3854 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3855 } 3856 3857 if (n == sz-1) { 3858 tmp0 = t[*idx]; 3859 sum1 -= v1[0] * tmp0; 3860 sum2 -= v2[0] * tmp0; 3861 sum3 -= v3[0] * tmp0; 3862 sum4 -= v4[0] * tmp0; 3863 sum5 -= v5[0] * tmp0; 3864 } 3865 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3866 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3867 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3868 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3869 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3870 ibdiag += 25; row += 5; 3871 break; 3872 default: 3873 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3874 } 3875 } 3876 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3877 } 3878 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3879 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3880 CHKMEMQ; PetscFunctionReturn(0); 3881 } 3882 3883 #undef __FUNCT__ 3884 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3885 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3886 { 3887 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3888 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3889 const MatScalar *bdiag = a->inode.bdiag; 3890 const PetscScalar *b; 3891 PetscErrorCode ierr; 3892 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3893 const PetscInt *sizes = a->inode.size; 3894 3895 PetscFunctionBegin; 3896 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3897 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3898 cnt = 0; 3899 for (i=0, row=0; i<m; i++) { 3900 switch (sizes[i]) { 3901 case 1: 3902 x[row] = b[row]*bdiag[cnt++];row++; 3903 break; 3904 case 2: 3905 x1 = b[row]; x2 = b[row+1]; 3906 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3907 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3908 x[row++] = tmp1; 3909 x[row++] = tmp2; 3910 cnt += 4; 3911 break; 3912 case 3: 3913 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3914 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3915 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3916 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3917 x[row++] = tmp1; 3918 x[row++] = tmp2; 3919 x[row++] = tmp3; 3920 cnt += 9; 3921 break; 3922 case 4: 3923 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3924 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3925 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3926 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3927 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3928 x[row++] = tmp1; 3929 x[row++] = tmp2; 3930 x[row++] = tmp3; 3931 x[row++] = tmp4; 3932 cnt += 16; 3933 break; 3934 case 5: 3935 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3936 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3937 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3938 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3939 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3940 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3941 x[row++] = tmp1; 3942 x[row++] = tmp2; 3943 x[row++] = tmp3; 3944 x[row++] = tmp4; 3945 x[row++] = tmp5; 3946 cnt += 25; 3947 break; 3948 default: 3949 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3950 } 3951 } 3952 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3953 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3954 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3955 PetscFunctionReturn(0); 3956 } 3957 3958 /* 3959 samestructure indicates that the matrix has not changed its nonzero structure so we 3960 do not need to recompute the inodes 3961 */ 3962 #undef __FUNCT__ 3963 #define __FUNCT__ "Mat_CheckInode" 3964 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 3965 { 3966 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3967 PetscErrorCode ierr; 3968 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3969 PetscBool flag; 3970 3971 PetscFunctionBegin; 3972 if (!a->inode.use) PetscFunctionReturn(0); 3973 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3974 3975 3976 m = A->rmap->n; 3977 if (a->inode.size) {ns = a->inode.size;} 3978 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3979 3980 i = 0; 3981 node_count = 0; 3982 idx = a->j; 3983 ii = a->i; 3984 while (i < m) { /* For each row */ 3985 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3986 /* Limits the number of elements in a node to 'a->inode.limit' */ 3987 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3988 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3989 if (nzy != nzx) break; 3990 idy += nzx; /* Same nonzero pattern */ 3991 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3992 if (!flag) break; 3993 } 3994 ns[node_count++] = blk_size; 3995 idx += blk_size*nzx; 3996 i = j; 3997 } 3998 /* If not enough inodes found,, do not use inode version of the routines */ 3999 if (!m || node_count > .8*m) { 4000 ierr = PetscFree(ns);CHKERRQ(ierr); 4001 a->inode.node_count = 0; 4002 a->inode.size = PETSC_NULL; 4003 a->inode.use = PETSC_FALSE; 4004 A->ops->mult = MatMult_SeqAIJ; 4005 A->ops->sor = MatSOR_SeqAIJ; 4006 A->ops->multadd = MatMultAdd_SeqAIJ; 4007 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4008 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4009 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4010 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4011 A->ops->coloringpatch = 0; 4012 A->ops->multdiagonalblock = 0; 4013 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4014 } else { 4015 if (!A->factortype) { 4016 A->ops->mult = MatMult_SeqAIJ_Inode; 4017 A->ops->sor = MatSOR_SeqAIJ_Inode; 4018 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4019 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4020 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4021 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4022 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4023 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4024 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4025 } else { 4026 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4027 } 4028 a->inode.node_count = node_count; 4029 a->inode.size = ns; 4030 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4031 } 4032 a->inode.checked = PETSC_TRUE; 4033 PetscFunctionReturn(0); 4034 } 4035 4036 #undef __FUNCT__ 4037 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4038 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4039 { 4040 Mat B=*C; 4041 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4042 PetscErrorCode ierr; 4043 PetscInt m=A->rmap->n; 4044 4045 PetscFunctionBegin; 4046 c->inode.use = a->inode.use; 4047 c->inode.limit = a->inode.limit; 4048 c->inode.max_limit = a->inode.max_limit; 4049 if (a->inode.size) { 4050 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr); 4051 c->inode.node_count = a->inode.node_count; 4052 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4053 /* note the table of functions below should match that in Mat_CheckInode() */ 4054 if (!B->factortype) { 4055 B->ops->mult = MatMult_SeqAIJ_Inode; 4056 B->ops->sor = MatSOR_SeqAIJ_Inode; 4057 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4058 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4059 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4060 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4061 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4062 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4063 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4064 } else { 4065 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4066 } 4067 } else { 4068 c->inode.size = 0; 4069 c->inode.node_count = 0; 4070 } 4071 c->inode.ibdiagvalid = PETSC_FALSE; 4072 c->inode.ibdiag = 0; 4073 c->inode.bdiag = 0; 4074 PetscFunctionReturn(0); 4075 } 4076 4077 #undef __FUNCT__ 4078 #define __FUNCT__ "MatGetRow_FactoredLU" 4079 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row) 4080 { 4081 PetscInt k, *vi; 4082 4083 PetscFunctionBegin; 4084 vi = aj + ai[row]; 4085 for (k=0;k<nzl;k++) cols[k] = vi[k]; 4086 vi = aj + adiag[row]; 4087 cols[nzl] = vi[0]; 4088 vi = aj + adiag[row+1]+1; 4089 for (k=0;k<nzu;k++) cols[nzl+1+k] = vi[k]; 4090 PetscFunctionReturn(0); 4091 } 4092 /* 4093 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4094 Modified from Mat_CheckInode(). 4095 4096 Input Parameters: 4097 + Mat A - ILU or LU matrix factor 4098 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 4099 do not need to recompute the inodes 4100 */ 4101 #undef __FUNCT__ 4102 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4103 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure) 4104 { 4105 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4106 PetscErrorCode ierr; 4107 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4108 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 4109 PetscBool flag; 4110 4111 PetscFunctionBegin; 4112 if (!a->inode.use) PetscFunctionReturn(0); 4113 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4114 4115 m = A->rmap->n; 4116 if (a->inode.size) {ns = a->inode.size;} 4117 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 4118 4119 i = 0; 4120 node_count = 0; 4121 while (i < m) { /* For each row */ 4122 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4123 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4124 nzx = nzl1 + nzu1 + 1; 4125 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 4126 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4127 4128 /* Limits the number of elements in a node to 'a->inode.limit' */ 4129 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4130 nzl2 = ai[j+1] - ai[j]; 4131 nzu2 = adiag[j] - adiag[j+1] - 1; 4132 nzy = nzl2 + nzu2 + 1; 4133 if (nzy != nzx) break; 4134 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 4135 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4136 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4137 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 4138 ierr = PetscFree(cols2);CHKERRQ(ierr); 4139 } 4140 ns[node_count++] = blk_size; 4141 ierr = PetscFree(cols1);CHKERRQ(ierr); 4142 i = j; 4143 } 4144 /* If not enough inodes found,, do not use inode version of the routines */ 4145 if (!m || node_count > .8*m) { 4146 ierr = PetscFree(ns);CHKERRQ(ierr); 4147 a->inode.node_count = 0; 4148 a->inode.size = PETSC_NULL; 4149 a->inode.use = PETSC_FALSE; 4150 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4151 } else { 4152 A->ops->mult = 0; 4153 A->ops->sor = 0; 4154 A->ops->multadd = 0; 4155 A->ops->getrowij = 0; 4156 A->ops->restorerowij = 0; 4157 A->ops->getcolumnij = 0; 4158 A->ops->restorecolumnij = 0; 4159 A->ops->coloringpatch = 0; 4160 A->ops->multdiagonalblock = 0; 4161 A->ops->solve = MatSolve_SeqAIJ_Inode; 4162 a->inode.node_count = node_count; 4163 a->inode.size = ns; 4164 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4165 } 4166 a->inode.checked = PETSC_TRUE; 4167 PetscFunctionReturn(0); 4168 } 4169 4170 #undef __FUNCT__ 4171 #define __FUNCT__ "MatSeqAIJInvalidateDiagonal_Inode" 4172 PetscErrorCode MatSeqAIJInvalidateDiagonal_Inode(Mat A) 4173 { 4174 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4175 4176 PetscFunctionBegin; 4177 a->inode.ibdiagvalid = PETSC_FALSE; 4178 PetscFunctionReturn(0); 4179 } 4180 4181 /* 4182 This is really ugly. if inodes are used this replaces the 4183 permutations with ones that correspond to rows/cols of the matrix 4184 rather then inode blocks 4185 */ 4186 #undef __FUNCT__ 4187 #define __FUNCT__ "MatInodeAdjustForInodes" 4188 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4189 { 4190 PetscErrorCode ierr; 4191 4192 PetscFunctionBegin; 4193 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4194 PetscFunctionReturn(0); 4195 } 4196 4197 EXTERN_C_BEGIN 4198 #undef __FUNCT__ 4199 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4200 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4201 { 4202 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4203 PetscErrorCode ierr; 4204 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4205 const PetscInt *ridx,*cidx; 4206 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4207 PetscInt nslim_col,*ns_col; 4208 IS ris = *rperm,cis = *cperm; 4209 4210 PetscFunctionBegin; 4211 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4212 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4213 4214 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4215 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 4216 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 4217 4218 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4219 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4220 4221 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4222 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4223 4224 /* Construct the permutations for rows*/ 4225 for (i=0,row = 0; i<nslim_row; ++i) { 4226 indx = ridx[i]; 4227 start_val = tns[indx]; 4228 end_val = tns[indx + 1]; 4229 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4230 } 4231 4232 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4233 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4234 4235 /* Construct permutations for columns */ 4236 for (i=0,col=0; i<nslim_col; ++i) { 4237 indx = cidx[i]; 4238 start_val = tns[indx]; 4239 end_val = tns[indx + 1]; 4240 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4241 } 4242 4243 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4244 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4245 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4246 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4247 4248 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4249 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4250 4251 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4252 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4253 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4254 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4255 ierr = PetscFree(tns);CHKERRQ(ierr); 4256 PetscFunctionReturn(0); 4257 } 4258 EXTERN_C_END 4259 4260 #undef __FUNCT__ 4261 #define __FUNCT__ "MatInodeGetInodeSizes" 4262 /*@C 4263 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4264 4265 Not Collective 4266 4267 Input Parameter: 4268 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4269 4270 Output Parameter: 4271 + node_count - no of inodes present in the matrix. 4272 . sizes - an array of size node_count,with sizes of each inode. 4273 - limit - the max size used to generate the inodes. 4274 4275 Level: advanced 4276 4277 Notes: This routine returns some internal storage information 4278 of the matrix, it is intended to be used by advanced users. 4279 It should be called after the matrix is assembled. 4280 The contents of the sizes[] array should not be changed. 4281 PETSC_NULL may be passed for information not requested. 4282 4283 .keywords: matrix, seqaij, get, inode 4284 4285 .seealso: MatGetInfo() 4286 @*/ 4287 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4288 { 4289 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4290 4291 PetscFunctionBegin; 4292 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4293 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 4294 if (f) { 4295 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4296 } 4297 PetscFunctionReturn(0); 4298 } 4299 4300 EXTERN_C_BEGIN 4301 #undef __FUNCT__ 4302 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4303 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4304 { 4305 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4306 4307 PetscFunctionBegin; 4308 if (node_count) *node_count = a->inode.node_count; 4309 if (sizes) *sizes = a->inode.size; 4310 if (limit) *limit = a->inode.limit; 4311 PetscFunctionReturn(0); 4312 } 4313 EXTERN_C_END 4314