1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *pc,*pc1,*pc2,*pc3,mul1,mul2,mul3,*pv,*rtmp1,*rtmp2,*rtmp3; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3; 1195 FactorShiftCtx sctx; 1196 const PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,node_max,col; 1200 const PetscInt *ns; 1201 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1202 1203 PetscFunctionBegin; 1204 /* MatPivotSetUp(): initialize shift context sctx */ 1205 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1206 1207 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1208 ddiag = a->diag; 1209 sctx.shift_top = info->zeropivot; 1210 for (i=0; i<n; i++) { 1211 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1212 d = (aa)[ddiag[i]]; 1213 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1214 v = aa+ai[i]; 1215 nz = ai[i+1] - ai[i]; 1216 for (j=0; j<nz; j++) 1217 rs += PetscAbsScalar(v[j]); 1218 if (rs>sctx.shift_top) sctx.shift_top = rs; 1219 } 1220 sctx.shift_top *= 1.1; 1221 sctx.nshift_max = 5; 1222 sctx.shift_lo = 0.; 1223 sctx.shift_hi = 1.; 1224 } 1225 1226 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1227 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1228 1229 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1230 ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1231 rtmp2 = rtmp1 + n; 1232 rtmp3 = rtmp2 + n; 1233 ics = ic; 1234 1235 node_max = a->inode.node_count; 1236 ns = a->inode.size; 1237 if (!ns){ 1238 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1239 } 1240 1241 /* If max inode size > 3, split it into two inodes.*/ 1242 /* also map the inode sizes according to the ordering */ 1243 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1244 for (i=0,j=0; i<node_max; ++i,++j){ 1245 if (ns[i]>3) { 1246 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1247 ++j; 1248 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1249 } else { 1250 tmp_vec1[j] = ns[i]; 1251 } 1252 } 1253 /* Use the correct node_max */ 1254 node_max = j; 1255 1256 /* Now reorder the inode info based on mat re-ordering info */ 1257 /* First create a row -> inode_size_array_index map */ 1258 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1259 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1260 for (i=0,row=0; i<node_max; i++) { 1261 nodesz = tmp_vec1[i]; 1262 for (j=0; j<nodesz; j++,row++) { 1263 nsmap[row] = i; 1264 } 1265 } 1266 /* Using nsmap, create a reordered ns structure */ 1267 for (i=0,j=0; i< node_max; i++) { 1268 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1269 tmp_vec2[i] = nodesz; 1270 j += nodesz; 1271 } 1272 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1273 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1274 1275 /* Now use the correct ns */ 1276 ns = tmp_vec2; 1277 1278 do { 1279 sctx.useshift = PETSC_FALSE; 1280 /* Now loop over each block-row, and do the factorization */ 1281 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1282 nodesz = ns[inod]; 1283 1284 switch (nodesz){ 1285 case 1: 1286 /*----------*/ 1287 /* zero rtmp1 */ 1288 /* L part */ 1289 nz = bi[i+1] - bi[i]; 1290 bjtmp = bj + bi[i]; 1291 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1292 1293 /* U part */ 1294 nz = bdiag[i]-bdiag[i+1]; 1295 bjtmp = bj + bdiag[i+1]+1; 1296 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1297 1298 /* load in initial (unfactored row) */ 1299 nz = ai[r[i]+1] - ai[r[i]]; 1300 ajtmp = aj + ai[r[i]]; 1301 v = aa + ai[r[i]]; 1302 for (j=0; j<nz; j++) { 1303 rtmp1[ics[ajtmp[j]]] = v[j]; 1304 } 1305 /* ZeropivotApply() */ 1306 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1307 1308 /* elimination */ 1309 bjtmp = bj + bi[i]; 1310 row = *bjtmp++; 1311 nzL = bi[i+1] - bi[i]; 1312 for(k=0; k < nzL;k++) { 1313 pc = rtmp1 + row; 1314 if (*pc != 0.0) { 1315 pv = b->a + bdiag[row]; 1316 mul1 = *pc * (*pv); 1317 *pc = mul1; 1318 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1319 pv = b->a + bdiag[row+1]+1; 1320 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1321 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1322 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1323 } 1324 row = *bjtmp++; 1325 } 1326 1327 /* finished row so stick it into b->a */ 1328 rs = 0.0; 1329 /* L part */ 1330 pv = b->a + bi[i] ; 1331 pj = b->j + bi[i] ; 1332 nz = bi[i+1] - bi[i]; 1333 for (j=0; j<nz; j++) { 1334 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1335 } 1336 1337 /* U part */ 1338 pv = b->a + bdiag[i+1]+1; 1339 pj = b->j + bdiag[i+1]+1; 1340 nz = bdiag[i] - bdiag[i+1]-1; 1341 for (j=0; j<nz; j++) { 1342 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1343 } 1344 1345 /* Check zero pivot */ 1346 sctx.rs = rs; 1347 sctx.pv = rtmp1[i]; 1348 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1349 1350 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1351 pv = b->a + bdiag[i]; 1352 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1353 break; 1354 1355 case 2: 1356 /*----------*/ 1357 /* zero rtmp1 and rtmp2 */ 1358 /* L part */ 1359 nz = bi[i+1] - bi[i]; 1360 bjtmp = bj + bi[i]; 1361 for (j=0; j<nz; j++) { 1362 col = bjtmp[j]; 1363 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1364 } 1365 1366 /* U part */ 1367 nz = bdiag[i]-bdiag[i+1]; 1368 bjtmp = bj + bdiag[i+1]+1; 1369 for (j=0; j<nz; j++) { 1370 col = bjtmp[j]; 1371 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1372 } 1373 1374 /* load in initial (unfactored row) */ 1375 nz = ai[r[i]+1] - ai[r[i]]; 1376 ajtmp = aj + ai[r[i]]; 1377 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1378 for (j=0; j<nz; j++) { 1379 col = ics[ajtmp[j]]; 1380 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1381 } 1382 /* ZeropivotApply(): shift the diagonal of the matrix */ 1383 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1384 1385 /* elimination */ 1386 bjtmp = bj + bi[i]; 1387 row = *bjtmp++; /* pivot row */ 1388 nzL = bi[i+1] - bi[i]; 1389 for(k=0; k < nzL;k++) { 1390 pc1 = rtmp1 + row; 1391 pc2 = rtmp2 + row; 1392 if (*pc1 != 0.0 || *pc2 != 0.0) { 1393 pv = b->a + bdiag[row]; 1394 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1395 *pc1 = mul1; *pc2 = mul2; 1396 1397 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1398 pv = b->a + bdiag[row+1]+1; 1399 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1400 for (j=0; j<nz; j++){ 1401 col = pj[j]; 1402 rtmp1[col] -= mul1 * pv[j]; 1403 rtmp2[col] -= mul2 * pv[j]; 1404 } 1405 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1406 } 1407 row = *bjtmp++; 1408 } 1409 1410 /* finished row i; check zero pivot, then stick row i into b->a */ 1411 rs = 0.0; 1412 /* L part */ 1413 pc1 = b->a + bi[i]; 1414 pj = b->j + bi[i] ; 1415 nz = bi[i+1] - bi[i]; 1416 for (j=0; j<nz; j++) { 1417 col = pj[j]; 1418 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1419 } 1420 /* U part */ 1421 pc1 = b->a + bdiag[i+1]+1; 1422 pj = b->j + bdiag[i+1]+1; 1423 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1424 for (j=0; j<nz; j++) { 1425 col = pj[j]; 1426 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1427 } 1428 1429 sctx.rs = rs; 1430 sctx.pv = rtmp1[i]; 1431 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1432 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1433 *pc1 = 1.0/sctx.pv; 1434 1435 /* Now take care of diagonal 2x2 block. */ 1436 pc2 = rtmp2 + i; 1437 if (*pc2 != 0.0){ 1438 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1439 *pc2 = mul1; /* insert L entry */ 1440 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1441 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1442 for (j=0; j<nz; j++) { 1443 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1444 } 1445 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1446 } 1447 1448 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1449 rs = 0.0; 1450 /* L part */ 1451 pc2 = b->a + bi[i+1]; 1452 pj = b->j + bi[i+1] ; 1453 nz = bi[i+2] - bi[i+1]; 1454 for (j=0; j<nz; j++) { 1455 col = pj[j]; 1456 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1457 } 1458 /* U part */ 1459 pc2 = b->a + bdiag[i+2]+1; 1460 pj = b->j + bdiag[i+2]+1; 1461 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1462 for (j=0; j<nz; j++) { 1463 col = pj[j]; 1464 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1465 } 1466 1467 sctx.rs = rs; 1468 sctx.pv = rtmp2[i+1]; 1469 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1470 pc2 = b->a + bdiag[i+1]; 1471 *pc2 = 1.0/sctx.pv; 1472 break; 1473 1474 case 3: 1475 /*----------*/ 1476 /* zero rtmp */ 1477 /* L part */ 1478 nz = bi[i+1] - bi[i]; 1479 bjtmp = bj + bi[i]; 1480 for (j=0; j<nz; j++) { 1481 col = bjtmp[j]; 1482 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1483 } 1484 1485 /* U part */ 1486 nz = bdiag[i]-bdiag[i+1]; 1487 bjtmp = bj + bdiag[i+1]+1; 1488 for (j=0; j<nz; j++) { 1489 col = bjtmp[j]; 1490 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1491 } 1492 1493 /* load in initial (unfactored row) */ 1494 nz = ai[r[i]+1] - ai[r[i]]; 1495 ajtmp = aj + ai[r[i]]; 1496 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1497 for (j=0; j<nz; j++) { 1498 col = ics[ajtmp[j]]; 1499 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1500 } 1501 /* ZeropivotApply(): shift the diagonal of the matrix */ 1502 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1503 1504 /* elimination */ 1505 bjtmp = bj + bi[i]; 1506 row = *bjtmp++; /* pivot row */ 1507 nzL = bi[i+1] - bi[i]; 1508 for(k=0; k < nzL;k++) { 1509 pc1 = rtmp1 + row; 1510 pc2 = rtmp2 + row; 1511 pc3 = rtmp3 + row; 1512 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1513 pv = b->a + bdiag[row]; 1514 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1515 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1516 1517 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1518 pv = b->a + bdiag[row+1]+1; 1519 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1520 for (j=0; j<nz; j++){ 1521 col = pj[j]; 1522 rtmp1[col] -= mul1 * pv[j]; 1523 rtmp2[col] -= mul2 * pv[j]; 1524 rtmp3[col] -= mul3 * pv[j]; 1525 } 1526 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1527 } 1528 row = *bjtmp++; 1529 } 1530 1531 /* finished row i; check zero pivot, then stick row i into b->a */ 1532 rs = 0.0; 1533 /* L part */ 1534 pc1 = b->a + bi[i]; 1535 pj = b->j + bi[i] ; 1536 nz = bi[i+1] - bi[i]; 1537 for (j=0; j<nz; j++) { 1538 col = pj[j]; 1539 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1540 } 1541 /* U part */ 1542 pc1 = b->a + bdiag[i+1]+1; 1543 pj = b->j + bdiag[i+1]+1; 1544 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1545 for (j=0; j<nz; j++) { 1546 col = pj[j]; 1547 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1548 } 1549 1550 sctx.rs = rs; 1551 sctx.pv = rtmp1[i]; 1552 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1553 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1554 *pc1 = 1.0/sctx.pv; 1555 1556 /* Now take care of 1st column of diagonal 3x3 block. */ 1557 pc2 = rtmp2 + i; 1558 pc3 = rtmp3 + i; 1559 if (*pc2 != 0.0 || *pc3 != 0.0){ 1560 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1561 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1562 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1563 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1564 for (j=0; j<nz; j++) { 1565 col = pj[j]; 1566 rtmp2[col] -= mul2 * rtmp1[col]; 1567 rtmp3[col] -= mul3 * rtmp1[col]; 1568 } 1569 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1570 } 1571 1572 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1573 rs = 0.0; 1574 /* L part */ 1575 pc2 = b->a + bi[i+1]; 1576 pj = b->j + bi[i+1] ; 1577 nz = bi[i+2] - bi[i+1]; 1578 for (j=0; j<nz; j++) { 1579 col = pj[j]; 1580 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1581 } 1582 /* U part */ 1583 pc2 = b->a + bdiag[i+2]+1; 1584 pj = b->j + bdiag[i+2]+1; 1585 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1586 for (j=0; j<nz; j++) { 1587 col = pj[j]; 1588 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1589 } 1590 1591 sctx.rs = rs; 1592 sctx.pv = rtmp2[i+1]; 1593 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1594 pc2 = b->a + bdiag[i+1]; 1595 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1596 1597 /* Now take care of 2nd column of diagonal 3x3 block. */ 1598 pc3 = rtmp3 + i+1; 1599 if (*pc3 != 0.0){ 1600 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1601 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1602 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1603 for (j=0; j<nz; j++) { 1604 col = pj[j]; 1605 rtmp3[col] -= mul3 * rtmp2[col]; 1606 } 1607 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1608 } 1609 1610 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1611 rs = 0.0; 1612 /* L part */ 1613 pc3 = b->a + bi[i+2]; 1614 pj = b->j + bi[i+2] ; 1615 nz = bi[i+3] - bi[i+2]; 1616 for (j=0; j<nz; j++) { 1617 col = pj[j]; 1618 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1619 } 1620 /* U part */ 1621 pc3 = b->a + bdiag[i+3]+1; 1622 pj = b->j + bdiag[i+3]+1; 1623 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1624 for (j=0; j<nz; j++) { 1625 col = pj[j]; 1626 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1627 } 1628 1629 sctx.rs = rs; 1630 sctx.pv = rtmp3[i+2]; 1631 ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr); 1632 pc3 = b->a + bdiag[i+2]; 1633 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1634 break; 1635 1636 default: 1637 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 1638 } 1639 i += nodesz; /* Update the row */ 1640 } 1641 1642 /* MatPivotRefine() */ 1643 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1644 /* 1645 * if no shift in this attempt & shifting & started shifting & can refine, 1646 * then try lower shift 1647 */ 1648 sctx.shift_hi = sctx.shift_fraction; 1649 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1650 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1651 sctx.useshift = PETSC_TRUE; 1652 sctx.nshift++; 1653 } 1654 } while (sctx.useshift); 1655 1656 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1657 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1658 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1659 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1660 1661 C->ops->solve = MatSolve_SeqAIJ; 1662 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1663 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1664 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1665 C->ops->matsolve = MatMatSolve_SeqAIJ; 1666 C->assembled = PETSC_TRUE; 1667 C->preallocated = PETSC_TRUE; 1668 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1669 1670 /* MatShiftView(A,info,&sctx) */ 1671 if (sctx.nshift){ 1672 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1673 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1674 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1675 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1676 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){ 1677 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1678 } 1679 } 1680 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1681 PetscFunctionReturn(0); 1682 } 1683 1684 #undef __FUNCT__ 1685 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1686 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1687 { 1688 Mat C = B; 1689 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1690 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1691 PetscErrorCode ierr; 1692 const PetscInt *r,*ic,*c,*ics; 1693 PetscInt n = A->rmap->n,*bi = b->i; 1694 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1695 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1696 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1697 PetscScalar mul1,mul2,mul3,tmp; 1698 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1699 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1700 PetscReal rs=0.0; 1701 FactorShiftCtx sctx; 1702 PetscInt newshift; 1703 1704 PetscFunctionBegin; 1705 sctx.shift_top = 0; 1706 sctx.nshift_max = 0; 1707 sctx.shift_lo = 0; 1708 sctx.shift_hi = 0; 1709 sctx.shift_fraction = 0; 1710 1711 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1712 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1713 sctx.shift_top = 0; 1714 for (i=0; i<n; i++) { 1715 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1716 rs = 0.0; 1717 ajtmp = aj + ai[i]; 1718 rtmp1 = aa + ai[i]; 1719 nz = ai[i+1] - ai[i]; 1720 for (j=0; j<nz; j++){ 1721 if (*ajtmp != i){ 1722 rs += PetscAbsScalar(*rtmp1++); 1723 } else { 1724 rs -= PetscRealPart(*rtmp1++); 1725 } 1726 ajtmp++; 1727 } 1728 if (rs>sctx.shift_top) sctx.shift_top = rs; 1729 } 1730 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1731 sctx.shift_top *= 1.1; 1732 sctx.nshift_max = 5; 1733 sctx.shift_lo = 0.; 1734 sctx.shift_hi = 1.; 1735 } 1736 sctx.shift_amount = 0; 1737 sctx.nshift = 0; 1738 1739 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1740 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1741 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1742 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1743 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1744 ics = ic ; 1745 rtmp22 = rtmp11 + n; 1746 rtmp33 = rtmp22 + n; 1747 1748 node_max = a->inode.node_count; 1749 ns = a->inode.size; 1750 if (!ns){ 1751 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1752 } 1753 1754 /* If max inode size > 3, split it into two inodes.*/ 1755 /* also map the inode sizes according to the ordering */ 1756 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1757 for (i=0,j=0; i<node_max; ++i,++j){ 1758 if (ns[i]>3) { 1759 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1760 ++j; 1761 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1762 } else { 1763 tmp_vec1[j] = ns[i]; 1764 } 1765 } 1766 /* Use the correct node_max */ 1767 node_max = j; 1768 1769 /* Now reorder the inode info based on mat re-ordering info */ 1770 /* First create a row -> inode_size_array_index map */ 1771 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1772 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1773 for (i=0,row=0; i<node_max; i++) { 1774 nodesz = tmp_vec1[i]; 1775 for (j=0; j<nodesz; j++,row++) { 1776 nsmap[row] = i; 1777 } 1778 } 1779 /* Using nsmap, create a reordered ns structure */ 1780 for (i=0,j=0; i< node_max; i++) { 1781 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1782 tmp_vec2[i] = nodesz; 1783 j += nodesz; 1784 } 1785 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1786 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1787 /* Now use the correct ns */ 1788 ns = tmp_vec2; 1789 1790 do { 1791 sctx.useshift = PETSC_FALSE; 1792 /* Now loop over each block-row, and do the factorization */ 1793 for (i=0,row=0; i<node_max; i++) { 1794 nodesz = ns[i]; 1795 nz = bi[row+1] - bi[row]; 1796 bjtmp = bj + bi[row]; 1797 1798 switch (nodesz){ 1799 case 1: 1800 for (j=0; j<nz; j++){ 1801 idx = bjtmp[j]; 1802 rtmp11[idx] = 0.0; 1803 } 1804 1805 /* load in initial (unfactored row) */ 1806 idx = r[row]; 1807 nz_tmp = ai[idx+1] - ai[idx]; 1808 ajtmp = aj + ai[idx]; 1809 v1 = aa + ai[idx]; 1810 1811 for (j=0; j<nz_tmp; j++) { 1812 idx = ics[ajtmp[j]]; 1813 rtmp11[idx] = v1[j]; 1814 } 1815 rtmp11[ics[r[row]]] += sctx.shift_amount; 1816 1817 prow = *bjtmp++ ; 1818 while (prow < row) { 1819 pc1 = rtmp11 + prow; 1820 if (*pc1 != 0.0){ 1821 pv = ba + bd[prow]; 1822 pj = nbj + bd[prow]; 1823 mul1 = *pc1 * *pv++; 1824 *pc1 = mul1; 1825 nz_tmp = bi[prow+1] - bd[prow] - 1; 1826 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1827 for (j=0; j<nz_tmp; j++) { 1828 tmp = pv[j]; 1829 idx = pj[j]; 1830 rtmp11[idx] -= mul1 * tmp; 1831 } 1832 } 1833 prow = *bjtmp++ ; 1834 } 1835 pj = bj + bi[row]; 1836 pc1 = ba + bi[row]; 1837 1838 sctx.pv = rtmp11[row]; 1839 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1840 rs = 0.0; 1841 for (j=0; j<nz; j++) { 1842 idx = pj[j]; 1843 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1844 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1845 } 1846 sctx.rs = rs; 1847 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1848 if (newshift == 1) goto endofwhile; 1849 break; 1850 1851 case 2: 1852 for (j=0; j<nz; j++) { 1853 idx = bjtmp[j]; 1854 rtmp11[idx] = 0.0; 1855 rtmp22[idx] = 0.0; 1856 } 1857 1858 /* load in initial (unfactored row) */ 1859 idx = r[row]; 1860 nz_tmp = ai[idx+1] - ai[idx]; 1861 ajtmp = aj + ai[idx]; 1862 v1 = aa + ai[idx]; 1863 v2 = aa + ai[idx+1]; 1864 for (j=0; j<nz_tmp; j++) { 1865 idx = ics[ajtmp[j]]; 1866 rtmp11[idx] = v1[j]; 1867 rtmp22[idx] = v2[j]; 1868 } 1869 rtmp11[ics[r[row]]] += sctx.shift_amount; 1870 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1871 1872 prow = *bjtmp++ ; 1873 while (prow < row) { 1874 pc1 = rtmp11 + prow; 1875 pc2 = rtmp22 + prow; 1876 if (*pc1 != 0.0 || *pc2 != 0.0){ 1877 pv = ba + bd[prow]; 1878 pj = nbj + bd[prow]; 1879 mul1 = *pc1 * *pv; 1880 mul2 = *pc2 * *pv; 1881 ++pv; 1882 *pc1 = mul1; 1883 *pc2 = mul2; 1884 1885 nz_tmp = bi[prow+1] - bd[prow] - 1; 1886 for (j=0; j<nz_tmp; j++) { 1887 tmp = pv[j]; 1888 idx = pj[j]; 1889 rtmp11[idx] -= mul1 * tmp; 1890 rtmp22[idx] -= mul2 * tmp; 1891 } 1892 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1893 } 1894 prow = *bjtmp++ ; 1895 } 1896 1897 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1898 pc1 = rtmp11 + prow; 1899 pc2 = rtmp22 + prow; 1900 1901 sctx.pv = *pc1; 1902 pj = bj + bi[prow]; 1903 rs = 0.0; 1904 for (j=0; j<nz; j++){ 1905 idx = pj[j]; 1906 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1907 } 1908 sctx.rs = rs; 1909 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1910 if (newshift == 1) goto endofwhile; 1911 1912 if (*pc2 != 0.0){ 1913 pj = nbj + bd[prow]; 1914 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1915 *pc2 = mul2; 1916 nz_tmp = bi[prow+1] - bd[prow] - 1; 1917 for (j=0; j<nz_tmp; j++) { 1918 idx = pj[j] ; 1919 tmp = rtmp11[idx]; 1920 rtmp22[idx] -= mul2 * tmp; 1921 } 1922 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1923 } 1924 1925 pj = bj + bi[row]; 1926 pc1 = ba + bi[row]; 1927 pc2 = ba + bi[row+1]; 1928 1929 sctx.pv = rtmp22[row+1]; 1930 rs = 0.0; 1931 rtmp11[row] = 1.0/rtmp11[row]; 1932 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1933 /* copy row entries from dense representation to sparse */ 1934 for (j=0; j<nz; j++) { 1935 idx = pj[j]; 1936 pc1[j] = rtmp11[idx]; 1937 pc2[j] = rtmp22[idx]; 1938 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1939 } 1940 sctx.rs = rs; 1941 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1942 if (newshift == 1) goto endofwhile; 1943 break; 1944 1945 case 3: 1946 for (j=0; j<nz; j++) { 1947 idx = bjtmp[j]; 1948 rtmp11[idx] = 0.0; 1949 rtmp22[idx] = 0.0; 1950 rtmp33[idx] = 0.0; 1951 } 1952 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1953 idx = r[row]; 1954 nz_tmp = ai[idx+1] - ai[idx]; 1955 ajtmp = aj + ai[idx]; 1956 v1 = aa + ai[idx]; 1957 v2 = aa + ai[idx+1]; 1958 v3 = aa + ai[idx+2]; 1959 for (j=0; j<nz_tmp; j++) { 1960 idx = ics[ajtmp[j]]; 1961 rtmp11[idx] = v1[j]; 1962 rtmp22[idx] = v2[j]; 1963 rtmp33[idx] = v3[j]; 1964 } 1965 rtmp11[ics[r[row]]] += sctx.shift_amount; 1966 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1967 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1968 1969 /* loop over all pivot row blocks above this row block */ 1970 prow = *bjtmp++ ; 1971 while (prow < row) { 1972 pc1 = rtmp11 + prow; 1973 pc2 = rtmp22 + prow; 1974 pc3 = rtmp33 + prow; 1975 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1976 pv = ba + bd[prow]; 1977 pj = nbj + bd[prow]; 1978 mul1 = *pc1 * *pv; 1979 mul2 = *pc2 * *pv; 1980 mul3 = *pc3 * *pv; 1981 ++pv; 1982 *pc1 = mul1; 1983 *pc2 = mul2; 1984 *pc3 = mul3; 1985 1986 nz_tmp = bi[prow+1] - bd[prow] - 1; 1987 /* update this row based on pivot row */ 1988 for (j=0; j<nz_tmp; j++) { 1989 tmp = pv[j]; 1990 idx = pj[j]; 1991 rtmp11[idx] -= mul1 * tmp; 1992 rtmp22[idx] -= mul2 * tmp; 1993 rtmp33[idx] -= mul3 * tmp; 1994 } 1995 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1996 } 1997 prow = *bjtmp++ ; 1998 } 1999 2000 /* Now take care of diagonal 3x3 block in this set of rows */ 2001 /* note: prow = row here */ 2002 pc1 = rtmp11 + prow; 2003 pc2 = rtmp22 + prow; 2004 pc3 = rtmp33 + prow; 2005 2006 sctx.pv = *pc1; 2007 pj = bj + bi[prow]; 2008 rs = 0.0; 2009 for (j=0; j<nz; j++){ 2010 idx = pj[j]; 2011 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2012 } 2013 sctx.rs = rs; 2014 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2015 if (newshift == 1) goto endofwhile; 2016 2017 if (*pc2 != 0.0 || *pc3 != 0.0){ 2018 mul2 = (*pc2)/(*pc1); 2019 mul3 = (*pc3)/(*pc1); 2020 *pc2 = mul2; 2021 *pc3 = mul3; 2022 nz_tmp = bi[prow+1] - bd[prow] - 1; 2023 pj = nbj + bd[prow]; 2024 for (j=0; j<nz_tmp; j++) { 2025 idx = pj[j] ; 2026 tmp = rtmp11[idx]; 2027 rtmp22[idx] -= mul2 * tmp; 2028 rtmp33[idx] -= mul3 * tmp; 2029 } 2030 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2031 } 2032 ++prow; 2033 2034 pc2 = rtmp22 + prow; 2035 pc3 = rtmp33 + prow; 2036 sctx.pv = *pc2; 2037 pj = bj + bi[prow]; 2038 rs = 0.0; 2039 for (j=0; j<nz; j++){ 2040 idx = pj[j]; 2041 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2042 } 2043 sctx.rs = rs; 2044 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2045 if (newshift == 1) goto endofwhile; 2046 2047 if (*pc3 != 0.0){ 2048 mul3 = (*pc3)/(*pc2); 2049 *pc3 = mul3; 2050 pj = nbj + bd[prow]; 2051 nz_tmp = bi[prow+1] - bd[prow] - 1; 2052 for (j=0; j<nz_tmp; j++) { 2053 idx = pj[j] ; 2054 tmp = rtmp22[idx]; 2055 rtmp33[idx] -= mul3 * tmp; 2056 } 2057 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2058 } 2059 2060 pj = bj + bi[row]; 2061 pc1 = ba + bi[row]; 2062 pc2 = ba + bi[row+1]; 2063 pc3 = ba + bi[row+2]; 2064 2065 sctx.pv = rtmp33[row+2]; 2066 rs = 0.0; 2067 rtmp11[row] = 1.0/rtmp11[row]; 2068 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2069 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2070 /* copy row entries from dense representation to sparse */ 2071 for (j=0; j<nz; j++) { 2072 idx = pj[j]; 2073 pc1[j] = rtmp11[idx]; 2074 pc2[j] = rtmp22[idx]; 2075 pc3[j] = rtmp33[idx]; 2076 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2077 } 2078 2079 sctx.rs = rs; 2080 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 2081 if (newshift == 1) goto endofwhile; 2082 break; 2083 2084 default: 2085 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 2086 } 2087 row += nodesz; /* Update the row */ 2088 } 2089 endofwhile:; 2090 } while (sctx.useshift); 2091 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2092 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2093 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2094 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2095 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2096 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2097 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2098 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2099 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2100 C->assembled = PETSC_TRUE; 2101 C->preallocated = PETSC_TRUE; 2102 if (sctx.nshift) { 2103 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2104 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2105 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2106 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2107 } 2108 } 2109 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2110 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2111 PetscFunctionReturn(0); 2112 } 2113 2114 2115 /* ----------------------------------------------------------- */ 2116 #undef __FUNCT__ 2117 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2118 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2119 { 2120 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2121 IS iscol = a->col,isrow = a->row; 2122 PetscErrorCode ierr; 2123 const PetscInt *r,*c,*rout,*cout; 2124 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2125 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2126 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2127 PetscScalar sum1,sum2,sum3,sum4,sum5; 2128 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2129 const PetscScalar *b; 2130 2131 PetscFunctionBegin; 2132 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 2133 node_max = a->inode.node_count; 2134 ns = a->inode.size; /* Node Size array */ 2135 2136 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2137 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2138 tmp = a->solve_work; 2139 2140 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2141 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2142 2143 /* forward solve the lower triangular */ 2144 tmps = tmp ; 2145 aa = a_a ; 2146 aj = a_j ; 2147 ad = a->diag; 2148 2149 for (i = 0,row = 0; i< node_max; ++i){ 2150 nsz = ns[i]; 2151 aii = ai[row]; 2152 v1 = aa + aii; 2153 vi = aj + aii; 2154 nz = ai[row+1]- ai[row]; 2155 2156 if (i < node_max-1) { 2157 /* Prefetch the indices for the next block */ 2158 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2159 /* Prefetch the data for the next block */ 2160 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2161 } 2162 2163 switch (nsz){ /* Each loop in 'case' is unrolled */ 2164 case 1 : 2165 sum1 = b[r[row]]; 2166 for(j=0; j<nz-1; j+=2){ 2167 i0 = vi[j]; 2168 i1 = vi[j+1]; 2169 tmp0 = tmps[i0]; 2170 tmp1 = tmps[i1]; 2171 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2172 } 2173 if(j == nz-1){ 2174 tmp0 = tmps[vi[j]]; 2175 sum1 -= v1[j]*tmp0; 2176 } 2177 tmp[row++]=sum1; 2178 break; 2179 case 2: 2180 sum1 = b[r[row]]; 2181 sum2 = b[r[row+1]]; 2182 v2 = aa + ai[row+1]; 2183 2184 for(j=0; j<nz-1; j+=2){ 2185 i0 = vi[j]; 2186 i1 = vi[j+1]; 2187 tmp0 = tmps[i0]; 2188 tmp1 = tmps[i1]; 2189 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2190 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2191 } 2192 if(j == nz-1){ 2193 tmp0 = tmps[vi[j]]; 2194 sum1 -= v1[j] *tmp0; 2195 sum2 -= v2[j] *tmp0; 2196 } 2197 sum2 -= v2[nz] * sum1; 2198 tmp[row ++]=sum1; 2199 tmp[row ++]=sum2; 2200 break; 2201 case 3: 2202 sum1 = b[r[row]]; 2203 sum2 = b[r[row+1]]; 2204 sum3 = b[r[row+2]]; 2205 v2 = aa + ai[row+1]; 2206 v3 = aa + ai[row+2]; 2207 2208 for (j=0; j<nz-1; j+=2){ 2209 i0 = vi[j]; 2210 i1 = vi[j+1]; 2211 tmp0 = tmps[i0]; 2212 tmp1 = tmps[i1]; 2213 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2214 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2215 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2216 } 2217 if (j == nz-1){ 2218 tmp0 = tmps[vi[j]]; 2219 sum1 -= v1[j] *tmp0; 2220 sum2 -= v2[j] *tmp0; 2221 sum3 -= v3[j] *tmp0; 2222 } 2223 sum2 -= v2[nz] * sum1; 2224 sum3 -= v3[nz] * sum1; 2225 sum3 -= v3[nz+1] * sum2; 2226 tmp[row ++]=sum1; 2227 tmp[row ++]=sum2; 2228 tmp[row ++]=sum3; 2229 break; 2230 2231 case 4: 2232 sum1 = b[r[row]]; 2233 sum2 = b[r[row+1]]; 2234 sum3 = b[r[row+2]]; 2235 sum4 = b[r[row+3]]; 2236 v2 = aa + ai[row+1]; 2237 v3 = aa + ai[row+2]; 2238 v4 = aa + ai[row+3]; 2239 2240 for (j=0; j<nz-1; j+=2){ 2241 i0 = vi[j]; 2242 i1 = vi[j+1]; 2243 tmp0 = tmps[i0]; 2244 tmp1 = tmps[i1]; 2245 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2246 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2247 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2248 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2249 } 2250 if (j == nz-1){ 2251 tmp0 = tmps[vi[j]]; 2252 sum1 -= v1[j] *tmp0; 2253 sum2 -= v2[j] *tmp0; 2254 sum3 -= v3[j] *tmp0; 2255 sum4 -= v4[j] *tmp0; 2256 } 2257 sum2 -= v2[nz] * sum1; 2258 sum3 -= v3[nz] * sum1; 2259 sum4 -= v4[nz] * sum1; 2260 sum3 -= v3[nz+1] * sum2; 2261 sum4 -= v4[nz+1] * sum2; 2262 sum4 -= v4[nz+2] * sum3; 2263 2264 tmp[row ++]=sum1; 2265 tmp[row ++]=sum2; 2266 tmp[row ++]=sum3; 2267 tmp[row ++]=sum4; 2268 break; 2269 case 5: 2270 sum1 = b[r[row]]; 2271 sum2 = b[r[row+1]]; 2272 sum3 = b[r[row+2]]; 2273 sum4 = b[r[row+3]]; 2274 sum5 = b[r[row+4]]; 2275 v2 = aa + ai[row+1]; 2276 v3 = aa + ai[row+2]; 2277 v4 = aa + ai[row+3]; 2278 v5 = aa + ai[row+4]; 2279 2280 for (j=0; j<nz-1; j+=2){ 2281 i0 = vi[j]; 2282 i1 = vi[j+1]; 2283 tmp0 = tmps[i0]; 2284 tmp1 = tmps[i1]; 2285 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2286 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2287 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2288 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2289 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2290 } 2291 if (j == nz-1){ 2292 tmp0 = tmps[vi[j]]; 2293 sum1 -= v1[j] *tmp0; 2294 sum2 -= v2[j] *tmp0; 2295 sum3 -= v3[j] *tmp0; 2296 sum4 -= v4[j] *tmp0; 2297 sum5 -= v5[j] *tmp0; 2298 } 2299 2300 sum2 -= v2[nz] * sum1; 2301 sum3 -= v3[nz] * sum1; 2302 sum4 -= v4[nz] * sum1; 2303 sum5 -= v5[nz] * sum1; 2304 sum3 -= v3[nz+1] * sum2; 2305 sum4 -= v4[nz+1] * sum2; 2306 sum5 -= v5[nz+1] * sum2; 2307 sum4 -= v4[nz+2] * sum3; 2308 sum5 -= v5[nz+2] * sum3; 2309 sum5 -= v5[nz+3] * sum4; 2310 2311 tmp[row ++]=sum1; 2312 tmp[row ++]=sum2; 2313 tmp[row ++]=sum3; 2314 tmp[row ++]=sum4; 2315 tmp[row ++]=sum5; 2316 break; 2317 default: 2318 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2319 } 2320 } 2321 /* backward solve the upper triangular */ 2322 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2323 nsz = ns[i]; 2324 aii = ad[row+1] + 1; 2325 v1 = aa + aii; 2326 vi = aj + aii; 2327 nz = ad[row]- ad[row+1] - 1; 2328 2329 if (i > 0) { 2330 /* Prefetch the indices for the next block */ 2331 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2332 /* Prefetch the data for the next block */ 2333 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2334 } 2335 2336 switch (nsz){ /* Each loop in 'case' is unrolled */ 2337 case 1 : 2338 sum1 = tmp[row]; 2339 2340 for(j=0 ; j<nz-1; j+=2){ 2341 i0 = vi[j]; 2342 i1 = vi[j+1]; 2343 tmp0 = tmps[i0]; 2344 tmp1 = tmps[i1]; 2345 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2346 } 2347 if (j == nz-1){ 2348 tmp0 = tmps[vi[j]]; 2349 sum1 -= v1[j]*tmp0; 2350 } 2351 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2352 break; 2353 case 2 : 2354 sum1 = tmp[row]; 2355 sum2 = tmp[row-1]; 2356 v2 = aa + ad[row] + 1; 2357 for (j=0 ; j<nz-1; j+=2){ 2358 i0 = vi[j]; 2359 i1 = vi[j+1]; 2360 tmp0 = tmps[i0]; 2361 tmp1 = tmps[i1]; 2362 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2363 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2364 } 2365 if (j == nz-1){ 2366 tmp0 = tmps[vi[j]]; 2367 sum1 -= v1[j]* tmp0; 2368 sum2 -= v2[j+1]* tmp0; 2369 } 2370 2371 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2372 sum2 -= v2[0] * tmp0; 2373 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2374 break; 2375 case 3 : 2376 sum1 = tmp[row]; 2377 sum2 = tmp[row -1]; 2378 sum3 = tmp[row -2]; 2379 v2 = aa + ad[row] + 1; 2380 v3 = aa + ad[row -1] + 1; 2381 for (j=0 ; j<nz-1; j+=2){ 2382 i0 = vi[j]; 2383 i1 = vi[j+1]; 2384 tmp0 = tmps[i0]; 2385 tmp1 = tmps[i1]; 2386 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2387 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2388 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2389 } 2390 if (j== nz-1){ 2391 tmp0 = tmps[vi[j]]; 2392 sum1 -= v1[j] * tmp0; 2393 sum2 -= v2[j+1] * tmp0; 2394 sum3 -= v3[j+2] * tmp0; 2395 } 2396 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2397 sum2 -= v2[0]* tmp0; 2398 sum3 -= v3[1] * tmp0; 2399 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2400 sum3 -= v3[0]* tmp0; 2401 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2402 2403 break; 2404 case 4 : 2405 sum1 = tmp[row]; 2406 sum2 = tmp[row -1]; 2407 sum3 = tmp[row -2]; 2408 sum4 = tmp[row -3]; 2409 v2 = aa + ad[row]+1; 2410 v3 = aa + ad[row -1]+1; 2411 v4 = aa + ad[row -2]+1; 2412 2413 for (j=0 ; j<nz-1; j+=2){ 2414 i0 = vi[j]; 2415 i1 = vi[j+1]; 2416 tmp0 = tmps[i0]; 2417 tmp1 = tmps[i1]; 2418 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2419 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2420 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2421 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2422 } 2423 if (j== nz-1){ 2424 tmp0 = tmps[vi[j]]; 2425 sum1 -= v1[j] * tmp0; 2426 sum2 -= v2[j+1] * tmp0; 2427 sum3 -= v3[j+2] * tmp0; 2428 sum4 -= v4[j+3] * tmp0; 2429 } 2430 2431 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2432 sum2 -= v2[0] * tmp0; 2433 sum3 -= v3[1] * tmp0; 2434 sum4 -= v4[2] * tmp0; 2435 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2436 sum3 -= v3[0] * tmp0; 2437 sum4 -= v4[1] * tmp0; 2438 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2439 sum4 -= v4[0] * tmp0; 2440 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2441 break; 2442 case 5 : 2443 sum1 = tmp[row]; 2444 sum2 = tmp[row -1]; 2445 sum3 = tmp[row -2]; 2446 sum4 = tmp[row -3]; 2447 sum5 = tmp[row -4]; 2448 v2 = aa + ad[row]+1; 2449 v3 = aa + ad[row -1]+1; 2450 v4 = aa + ad[row -2]+1; 2451 v5 = aa + ad[row -3]+1; 2452 for (j=0 ; j<nz-1; j+=2){ 2453 i0 = vi[j]; 2454 i1 = vi[j+1]; 2455 tmp0 = tmps[i0]; 2456 tmp1 = tmps[i1]; 2457 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2458 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2459 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2460 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2461 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2462 } 2463 if (j==nz-1){ 2464 tmp0 = tmps[vi[j]]; 2465 sum1 -= v1[j] * tmp0; 2466 sum2 -= v2[j+1] * tmp0; 2467 sum3 -= v3[j+2] * tmp0; 2468 sum4 -= v4[j+3] * tmp0; 2469 sum5 -= v5[j+4] * tmp0; 2470 } 2471 2472 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2473 sum2 -= v2[0] * tmp0; 2474 sum3 -= v3[1] * tmp0; 2475 sum4 -= v4[2] * tmp0; 2476 sum5 -= v5[3] * tmp0; 2477 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2478 sum3 -= v3[0] * tmp0; 2479 sum4 -= v4[1] * tmp0; 2480 sum5 -= v5[2] * tmp0; 2481 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2482 sum4 -= v4[0] * tmp0; 2483 sum5 -= v5[1] * tmp0; 2484 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2485 sum5 -= v5[0] * tmp0; 2486 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2487 break; 2488 default: 2489 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2490 } 2491 } 2492 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2493 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2494 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2495 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2496 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2497 PetscFunctionReturn(0); 2498 } 2499 2500 2501 /* 2502 Makes a longer coloring[] array and calls the usual code with that 2503 */ 2504 #undef __FUNCT__ 2505 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2506 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2507 { 2508 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2509 PetscErrorCode ierr; 2510 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2511 PetscInt *colorused,i; 2512 ISColoringValue *newcolor; 2513 2514 PetscFunctionBegin; 2515 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2516 /* loop over inodes, marking a color for each column*/ 2517 row = 0; 2518 for (i=0; i<m; i++){ 2519 for (j=0; j<ns[i]; j++) { 2520 newcolor[row++] = coloring[i] + j*ncolors; 2521 } 2522 } 2523 2524 /* eliminate unneeded colors */ 2525 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2526 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2527 for (i=0; i<n; i++) { 2528 colorused[newcolor[i]] = 1; 2529 } 2530 2531 for (i=1; i<5*ncolors; i++) { 2532 colorused[i] += colorused[i-1]; 2533 } 2534 ncolors = colorused[5*ncolors-1]; 2535 for (i=0; i<n; i++) { 2536 newcolor[i] = colorused[newcolor[i]]-1; 2537 } 2538 ierr = PetscFree(colorused);CHKERRQ(ierr); 2539 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2540 ierr = PetscFree(coloring);CHKERRQ(ierr); 2541 PetscFunctionReturn(0); 2542 } 2543 2544 #include "../src/mat/blockinvert.h" 2545 2546 #undef __FUNCT__ 2547 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2548 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2549 { 2550 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2551 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2552 MatScalar *ibdiag,*bdiag,work[25]; 2553 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2554 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2555 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2556 PetscErrorCode ierr; 2557 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2558 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2559 2560 PetscFunctionBegin; 2561 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2562 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2563 if (its > 1) { 2564 /* switch to non-inode version */ 2565 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2566 PetscFunctionReturn(0); 2567 } 2568 2569 if (!a->inode.ibdiagvalid) { 2570 if (!a->inode.ibdiag) { 2571 /* calculate space needed for diagonal blocks */ 2572 for (i=0; i<m; i++) { 2573 cnt += sizes[i]*sizes[i]; 2574 } 2575 a->inode.bdiagsize = cnt; 2576 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2577 } 2578 2579 /* copy over the diagonal blocks and invert them */ 2580 ibdiag = a->inode.ibdiag; 2581 bdiag = a->inode.bdiag; 2582 cnt = 0; 2583 for (i=0, row = 0; i<m; i++) { 2584 for (j=0; j<sizes[i]; j++) { 2585 for (k=0; k<sizes[i]; k++) { 2586 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2587 } 2588 } 2589 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2590 2591 switch(sizes[i]) { 2592 case 1: 2593 /* Create matrix data structure */ 2594 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2595 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2596 break; 2597 case 2: 2598 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2599 break; 2600 case 3: 2601 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2602 break; 2603 case 4: 2604 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2605 break; 2606 case 5: 2607 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2608 break; 2609 default: 2610 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2611 } 2612 cnt += sizes[i]*sizes[i]; 2613 row += sizes[i]; 2614 } 2615 a->inode.ibdiagvalid = PETSC_TRUE; 2616 } 2617 ibdiag = a->inode.ibdiag; 2618 bdiag = a->inode.bdiag; 2619 2620 2621 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2622 if (flag & SOR_ZERO_INITIAL_GUESS) { 2623 PetscScalar *b; 2624 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2625 if (xx != bb) { 2626 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2627 } else { 2628 b = x; 2629 } 2630 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2631 2632 for (i=0, row=0; i<m; i++) { 2633 sz = diag[row] - ii[row]; 2634 v1 = a->a + ii[row]; 2635 idx = a->j + ii[row]; 2636 2637 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2638 switch (sizes[i]){ 2639 case 1: 2640 2641 sum1 = b[row]; 2642 for(n = 0; n<sz-1; n+=2) { 2643 i1 = idx[0]; 2644 i2 = idx[1]; 2645 idx += 2; 2646 tmp0 = x[i1]; 2647 tmp1 = x[i2]; 2648 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2649 } 2650 2651 if (n == sz-1){ 2652 tmp0 = x[*idx]; 2653 sum1 -= *v1 * tmp0; 2654 } 2655 x[row++] = sum1*(*ibdiag++); 2656 break; 2657 case 2: 2658 v2 = a->a + ii[row+1]; 2659 sum1 = b[row]; 2660 sum2 = b[row+1]; 2661 for(n = 0; n<sz-1; n+=2) { 2662 i1 = idx[0]; 2663 i2 = idx[1]; 2664 idx += 2; 2665 tmp0 = x[i1]; 2666 tmp1 = x[i2]; 2667 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2668 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2669 } 2670 2671 if (n == sz-1){ 2672 tmp0 = x[*idx]; 2673 sum1 -= v1[0] * tmp0; 2674 sum2 -= v2[0] * tmp0; 2675 } 2676 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2677 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2678 ibdiag += 4; 2679 break; 2680 case 3: 2681 v2 = a->a + ii[row+1]; 2682 v3 = a->a + ii[row+2]; 2683 sum1 = b[row]; 2684 sum2 = b[row+1]; 2685 sum3 = b[row+2]; 2686 for(n = 0; n<sz-1; n+=2) { 2687 i1 = idx[0]; 2688 i2 = idx[1]; 2689 idx += 2; 2690 tmp0 = x[i1]; 2691 tmp1 = x[i2]; 2692 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2693 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2694 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2695 } 2696 2697 if (n == sz-1){ 2698 tmp0 = x[*idx]; 2699 sum1 -= v1[0] * tmp0; 2700 sum2 -= v2[0] * tmp0; 2701 sum3 -= v3[0] * tmp0; 2702 } 2703 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2704 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2705 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2706 ibdiag += 9; 2707 break; 2708 case 4: 2709 v2 = a->a + ii[row+1]; 2710 v3 = a->a + ii[row+2]; 2711 v4 = a->a + ii[row+3]; 2712 sum1 = b[row]; 2713 sum2 = b[row+1]; 2714 sum3 = b[row+2]; 2715 sum4 = b[row+3]; 2716 for(n = 0; n<sz-1; n+=2) { 2717 i1 = idx[0]; 2718 i2 = idx[1]; 2719 idx += 2; 2720 tmp0 = x[i1]; 2721 tmp1 = x[i2]; 2722 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2723 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2724 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2725 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2726 } 2727 2728 if (n == sz-1){ 2729 tmp0 = x[*idx]; 2730 sum1 -= v1[0] * tmp0; 2731 sum2 -= v2[0] * tmp0; 2732 sum3 -= v3[0] * tmp0; 2733 sum4 -= v4[0] * tmp0; 2734 } 2735 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2736 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2737 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2738 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2739 ibdiag += 16; 2740 break; 2741 case 5: 2742 v2 = a->a + ii[row+1]; 2743 v3 = a->a + ii[row+2]; 2744 v4 = a->a + ii[row+3]; 2745 v5 = a->a + ii[row+4]; 2746 sum1 = b[row]; 2747 sum2 = b[row+1]; 2748 sum3 = b[row+2]; 2749 sum4 = b[row+3]; 2750 sum5 = b[row+4]; 2751 for(n = 0; n<sz-1; n+=2) { 2752 i1 = idx[0]; 2753 i2 = idx[1]; 2754 idx += 2; 2755 tmp0 = x[i1]; 2756 tmp1 = x[i2]; 2757 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2758 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2759 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2760 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2761 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2762 } 2763 2764 if (n == sz-1){ 2765 tmp0 = x[*idx]; 2766 sum1 -= v1[0] * tmp0; 2767 sum2 -= v2[0] * tmp0; 2768 sum3 -= v3[0] * tmp0; 2769 sum4 -= v4[0] * tmp0; 2770 sum5 -= v5[0] * tmp0; 2771 } 2772 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2773 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2774 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2775 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2776 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2777 ibdiag += 25; 2778 break; 2779 default: 2780 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2781 } 2782 } 2783 2784 xb = x; 2785 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2786 } else xb = b; 2787 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2788 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2789 cnt = 0; 2790 for (i=0, row=0; i<m; i++) { 2791 2792 switch (sizes[i]){ 2793 case 1: 2794 x[row++] *= bdiag[cnt++]; 2795 break; 2796 case 2: 2797 x1 = x[row]; x2 = x[row+1]; 2798 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2799 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2800 x[row++] = tmp1; 2801 x[row++] = tmp2; 2802 cnt += 4; 2803 break; 2804 case 3: 2805 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2806 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2807 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2808 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2809 x[row++] = tmp1; 2810 x[row++] = tmp2; 2811 x[row++] = tmp3; 2812 cnt += 9; 2813 break; 2814 case 4: 2815 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2816 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2817 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2818 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2819 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2820 x[row++] = tmp1; 2821 x[row++] = tmp2; 2822 x[row++] = tmp3; 2823 x[row++] = tmp4; 2824 cnt += 16; 2825 break; 2826 case 5: 2827 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2828 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2829 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2830 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2831 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2832 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2833 x[row++] = tmp1; 2834 x[row++] = tmp2; 2835 x[row++] = tmp3; 2836 x[row++] = tmp4; 2837 x[row++] = tmp5; 2838 cnt += 25; 2839 break; 2840 default: 2841 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2842 } 2843 } 2844 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2845 } 2846 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 2847 2848 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2849 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2850 ibdiag -= sizes[i]*sizes[i]; 2851 sz = ii[row+1] - diag[row] - 1; 2852 v1 = a->a + diag[row] + 1; 2853 idx = a->j + diag[row] + 1; 2854 2855 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2856 switch (sizes[i]){ 2857 case 1: 2858 2859 sum1 = xb[row]; 2860 for(n = 0; n<sz-1; n+=2) { 2861 i1 = idx[0]; 2862 i2 = idx[1]; 2863 idx += 2; 2864 tmp0 = x[i1]; 2865 tmp1 = x[i2]; 2866 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2867 } 2868 2869 if (n == sz-1){ 2870 tmp0 = x[*idx]; 2871 sum1 -= *v1*tmp0; 2872 } 2873 x[row--] = sum1*(*ibdiag); 2874 break; 2875 2876 case 2: 2877 2878 sum1 = xb[row]; 2879 sum2 = xb[row-1]; 2880 /* note that sum1 is associated with the second of the two rows */ 2881 v2 = a->a + diag[row-1] + 2; 2882 for(n = 0; n<sz-1; n+=2) { 2883 i1 = idx[0]; 2884 i2 = idx[1]; 2885 idx += 2; 2886 tmp0 = x[i1]; 2887 tmp1 = x[i2]; 2888 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2889 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2890 } 2891 2892 if (n == sz-1){ 2893 tmp0 = x[*idx]; 2894 sum1 -= *v1*tmp0; 2895 sum2 -= *v2*tmp0; 2896 } 2897 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2898 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2899 break; 2900 case 3: 2901 2902 sum1 = xb[row]; 2903 sum2 = xb[row-1]; 2904 sum3 = xb[row-2]; 2905 v2 = a->a + diag[row-1] + 2; 2906 v3 = a->a + diag[row-2] + 3; 2907 for(n = 0; n<sz-1; n+=2) { 2908 i1 = idx[0]; 2909 i2 = idx[1]; 2910 idx += 2; 2911 tmp0 = x[i1]; 2912 tmp1 = x[i2]; 2913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2916 } 2917 2918 if (n == sz-1){ 2919 tmp0 = x[*idx]; 2920 sum1 -= *v1*tmp0; 2921 sum2 -= *v2*tmp0; 2922 sum3 -= *v3*tmp0; 2923 } 2924 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2925 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2926 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2927 break; 2928 case 4: 2929 2930 sum1 = xb[row]; 2931 sum2 = xb[row-1]; 2932 sum3 = xb[row-2]; 2933 sum4 = xb[row-3]; 2934 v2 = a->a + diag[row-1] + 2; 2935 v3 = a->a + diag[row-2] + 3; 2936 v4 = a->a + diag[row-3] + 4; 2937 for(n = 0; n<sz-1; n+=2) { 2938 i1 = idx[0]; 2939 i2 = idx[1]; 2940 idx += 2; 2941 tmp0 = x[i1]; 2942 tmp1 = x[i2]; 2943 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2944 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2945 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2946 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2947 } 2948 2949 if (n == sz-1){ 2950 tmp0 = x[*idx]; 2951 sum1 -= *v1*tmp0; 2952 sum2 -= *v2*tmp0; 2953 sum3 -= *v3*tmp0; 2954 sum4 -= *v4*tmp0; 2955 } 2956 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2957 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2958 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2959 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2960 break; 2961 case 5: 2962 2963 sum1 = xb[row]; 2964 sum2 = xb[row-1]; 2965 sum3 = xb[row-2]; 2966 sum4 = xb[row-3]; 2967 sum5 = xb[row-4]; 2968 v2 = a->a + diag[row-1] + 2; 2969 v3 = a->a + diag[row-2] + 3; 2970 v4 = a->a + diag[row-3] + 4; 2971 v5 = a->a + diag[row-4] + 5; 2972 for(n = 0; n<sz-1; n+=2) { 2973 i1 = idx[0]; 2974 i2 = idx[1]; 2975 idx += 2; 2976 tmp0 = x[i1]; 2977 tmp1 = x[i2]; 2978 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2979 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2980 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2981 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2982 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2983 } 2984 2985 if (n == sz-1){ 2986 tmp0 = x[*idx]; 2987 sum1 -= *v1*tmp0; 2988 sum2 -= *v2*tmp0; 2989 sum3 -= *v3*tmp0; 2990 sum4 -= *v4*tmp0; 2991 sum5 -= *v5*tmp0; 2992 } 2993 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2994 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2995 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2996 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2997 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2998 break; 2999 default: 3000 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3001 } 3002 } 3003 3004 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3005 } 3006 its--; 3007 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3008 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 3009 } 3010 if (flag & SOR_EISENSTAT) { 3011 const PetscScalar *b; 3012 MatScalar *t = a->inode.ssor_work; 3013 3014 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3015 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3016 /* 3017 Apply (U + D)^-1 where D is now the block diagonal 3018 */ 3019 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3020 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3021 ibdiag -= sizes[i]*sizes[i]; 3022 sz = ii[row+1] - diag[row] - 1; 3023 v1 = a->a + diag[row] + 1; 3024 idx = a->j + diag[row] + 1; 3025 CHKMEMQ; 3026 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3027 switch (sizes[i]){ 3028 case 1: 3029 3030 sum1 = b[row]; 3031 for(n = 0; n<sz-1; n+=2) { 3032 i1 = idx[0]; 3033 i2 = idx[1]; 3034 idx += 2; 3035 tmp0 = x[i1]; 3036 tmp1 = x[i2]; 3037 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3038 } 3039 3040 if (n == sz-1){ 3041 tmp0 = x[*idx]; 3042 sum1 -= *v1*tmp0; 3043 } 3044 x[row] = sum1*(*ibdiag);row--; 3045 break; 3046 3047 case 2: 3048 3049 sum1 = b[row]; 3050 sum2 = b[row-1]; 3051 /* note that sum1 is associated with the second of the two rows */ 3052 v2 = a->a + diag[row-1] + 2; 3053 for(n = 0; n<sz-1; n+=2) { 3054 i1 = idx[0]; 3055 i2 = idx[1]; 3056 idx += 2; 3057 tmp0 = x[i1]; 3058 tmp1 = x[i2]; 3059 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3060 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3061 } 3062 3063 if (n == sz-1){ 3064 tmp0 = x[*idx]; 3065 sum1 -= *v1*tmp0; 3066 sum2 -= *v2*tmp0; 3067 } 3068 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3069 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3070 row -= 2; 3071 break; 3072 case 3: 3073 3074 sum1 = b[row]; 3075 sum2 = b[row-1]; 3076 sum3 = b[row-2]; 3077 v2 = a->a + diag[row-1] + 2; 3078 v3 = a->a + diag[row-2] + 3; 3079 for(n = 0; n<sz-1; n+=2) { 3080 i1 = idx[0]; 3081 i2 = idx[1]; 3082 idx += 2; 3083 tmp0 = x[i1]; 3084 tmp1 = x[i2]; 3085 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3086 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3087 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3088 } 3089 3090 if (n == sz-1){ 3091 tmp0 = x[*idx]; 3092 sum1 -= *v1*tmp0; 3093 sum2 -= *v2*tmp0; 3094 sum3 -= *v3*tmp0; 3095 } 3096 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3097 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3098 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3099 row -= 3; 3100 break; 3101 case 4: 3102 3103 sum1 = b[row]; 3104 sum2 = b[row-1]; 3105 sum3 = b[row-2]; 3106 sum4 = b[row-3]; 3107 v2 = a->a + diag[row-1] + 2; 3108 v3 = a->a + diag[row-2] + 3; 3109 v4 = a->a + diag[row-3] + 4; 3110 for(n = 0; n<sz-1; n+=2) { 3111 i1 = idx[0]; 3112 i2 = idx[1]; 3113 idx += 2; 3114 tmp0 = x[i1]; 3115 tmp1 = x[i2]; 3116 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3117 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3118 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3119 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3120 } 3121 3122 if (n == sz-1){ 3123 tmp0 = x[*idx]; 3124 sum1 -= *v1*tmp0; 3125 sum2 -= *v2*tmp0; 3126 sum3 -= *v3*tmp0; 3127 sum4 -= *v4*tmp0; 3128 } 3129 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3130 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3131 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3132 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3133 row -= 4; 3134 break; 3135 case 5: 3136 3137 sum1 = b[row]; 3138 sum2 = b[row-1]; 3139 sum3 = b[row-2]; 3140 sum4 = b[row-3]; 3141 sum5 = b[row-4]; 3142 v2 = a->a + diag[row-1] + 2; 3143 v3 = a->a + diag[row-2] + 3; 3144 v4 = a->a + diag[row-3] + 4; 3145 v5 = a->a + diag[row-4] + 5; 3146 for(n = 0; n<sz-1; n+=2) { 3147 i1 = idx[0]; 3148 i2 = idx[1]; 3149 idx += 2; 3150 tmp0 = x[i1]; 3151 tmp1 = x[i2]; 3152 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3153 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3154 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3155 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3156 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3157 } 3158 3159 if (n == sz-1){ 3160 tmp0 = x[*idx]; 3161 sum1 -= *v1*tmp0; 3162 sum2 -= *v2*tmp0; 3163 sum3 -= *v3*tmp0; 3164 sum4 -= *v4*tmp0; 3165 sum5 -= *v5*tmp0; 3166 } 3167 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3168 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3169 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3170 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3171 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3172 row -= 5; 3173 break; 3174 default: 3175 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3176 } 3177 CHKMEMQ; 3178 } 3179 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3180 3181 /* 3182 t = b - D x where D is the block diagonal 3183 */ 3184 cnt = 0; 3185 for (i=0, row=0; i<m; i++) { 3186 CHKMEMQ; 3187 switch (sizes[i]){ 3188 case 1: 3189 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3190 break; 3191 case 2: 3192 x1 = x[row]; x2 = x[row+1]; 3193 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3194 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3195 t[row] = b[row] - tmp1; 3196 t[row+1] = b[row+1] - tmp2; row += 2; 3197 cnt += 4; 3198 break; 3199 case 3: 3200 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3201 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3202 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3203 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3204 t[row] = b[row] - tmp1; 3205 t[row+1] = b[row+1] - tmp2; 3206 t[row+2] = b[row+2] - tmp3; row += 3; 3207 cnt += 9; 3208 break; 3209 case 4: 3210 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3211 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3212 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3213 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3214 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3215 t[row] = b[row] - tmp1; 3216 t[row+1] = b[row+1] - tmp2; 3217 t[row+2] = b[row+2] - tmp3; 3218 t[row+3] = b[row+3] - tmp4; row += 4; 3219 cnt += 16; 3220 break; 3221 case 5: 3222 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3223 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3224 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3225 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3226 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3227 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3228 t[row] = b[row] - tmp1; 3229 t[row+1] = b[row+1] - tmp2; 3230 t[row+2] = b[row+2] - tmp3; 3231 t[row+3] = b[row+3] - tmp4; 3232 t[row+4] = b[row+4] - tmp5;row += 5; 3233 cnt += 25; 3234 break; 3235 default: 3236 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3237 } 3238 CHKMEMQ; 3239 } 3240 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3241 3242 3243 3244 /* 3245 Apply (L + D)^-1 where D is the block diagonal 3246 */ 3247 for (i=0, row=0; i<m; i++) { 3248 sz = diag[row] - ii[row]; 3249 v1 = a->a + ii[row]; 3250 idx = a->j + ii[row]; 3251 CHKMEMQ; 3252 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3253 switch (sizes[i]){ 3254 case 1: 3255 3256 sum1 = t[row]; 3257 for(n = 0; n<sz-1; n+=2) { 3258 i1 = idx[0]; 3259 i2 = idx[1]; 3260 idx += 2; 3261 tmp0 = t[i1]; 3262 tmp1 = t[i2]; 3263 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3264 } 3265 3266 if (n == sz-1){ 3267 tmp0 = t[*idx]; 3268 sum1 -= *v1 * tmp0; 3269 } 3270 x[row] += t[row] = sum1*(*ibdiag++); row++; 3271 break; 3272 case 2: 3273 v2 = a->a + ii[row+1]; 3274 sum1 = t[row]; 3275 sum2 = t[row+1]; 3276 for(n = 0; n<sz-1; n+=2) { 3277 i1 = idx[0]; 3278 i2 = idx[1]; 3279 idx += 2; 3280 tmp0 = t[i1]; 3281 tmp1 = t[i2]; 3282 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3283 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3284 } 3285 3286 if (n == sz-1){ 3287 tmp0 = t[*idx]; 3288 sum1 -= v1[0] * tmp0; 3289 sum2 -= v2[0] * tmp0; 3290 } 3291 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3292 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3293 ibdiag += 4; row += 2; 3294 break; 3295 case 3: 3296 v2 = a->a + ii[row+1]; 3297 v3 = a->a + ii[row+2]; 3298 sum1 = t[row]; 3299 sum2 = t[row+1]; 3300 sum3 = t[row+2]; 3301 for(n = 0; n<sz-1; n+=2) { 3302 i1 = idx[0]; 3303 i2 = idx[1]; 3304 idx += 2; 3305 tmp0 = t[i1]; 3306 tmp1 = t[i2]; 3307 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3308 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3309 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3310 } 3311 3312 if (n == sz-1){ 3313 tmp0 = t[*idx]; 3314 sum1 -= v1[0] * tmp0; 3315 sum2 -= v2[0] * tmp0; 3316 sum3 -= v3[0] * tmp0; 3317 } 3318 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3319 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3320 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3321 ibdiag += 9; row += 3; 3322 break; 3323 case 4: 3324 v2 = a->a + ii[row+1]; 3325 v3 = a->a + ii[row+2]; 3326 v4 = a->a + ii[row+3]; 3327 sum1 = t[row]; 3328 sum2 = t[row+1]; 3329 sum3 = t[row+2]; 3330 sum4 = t[row+3]; 3331 for(n = 0; n<sz-1; n+=2) { 3332 i1 = idx[0]; 3333 i2 = idx[1]; 3334 idx += 2; 3335 tmp0 = t[i1]; 3336 tmp1 = t[i2]; 3337 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3338 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3339 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3340 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3341 } 3342 3343 if (n == sz-1){ 3344 tmp0 = t[*idx]; 3345 sum1 -= v1[0] * tmp0; 3346 sum2 -= v2[0] * tmp0; 3347 sum3 -= v3[0] * tmp0; 3348 sum4 -= v4[0] * tmp0; 3349 } 3350 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3351 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3352 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3353 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3354 ibdiag += 16; row += 4; 3355 break; 3356 case 5: 3357 v2 = a->a + ii[row+1]; 3358 v3 = a->a + ii[row+2]; 3359 v4 = a->a + ii[row+3]; 3360 v5 = a->a + ii[row+4]; 3361 sum1 = t[row]; 3362 sum2 = t[row+1]; 3363 sum3 = t[row+2]; 3364 sum4 = t[row+3]; 3365 sum5 = t[row+4]; 3366 for(n = 0; n<sz-1; n+=2) { 3367 i1 = idx[0]; 3368 i2 = idx[1]; 3369 idx += 2; 3370 tmp0 = t[i1]; 3371 tmp1 = t[i2]; 3372 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3373 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3374 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3375 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3376 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3377 } 3378 3379 if (n == sz-1){ 3380 tmp0 = t[*idx]; 3381 sum1 -= v1[0] * tmp0; 3382 sum2 -= v2[0] * tmp0; 3383 sum3 -= v3[0] * tmp0; 3384 sum4 -= v4[0] * tmp0; 3385 sum5 -= v5[0] * tmp0; 3386 } 3387 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3388 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3389 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3390 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3391 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3392 ibdiag += 25; row += 5; 3393 break; 3394 default: 3395 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3396 } 3397 CHKMEMQ; 3398 } 3399 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3400 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3401 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3402 } 3403 PetscFunctionReturn(0); 3404 } 3405 3406 #undef __FUNCT__ 3407 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3408 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3409 { 3410 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3411 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3412 const MatScalar *bdiag = a->inode.bdiag; 3413 const PetscScalar *b; 3414 PetscErrorCode ierr; 3415 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3416 const PetscInt *sizes = a->inode.size; 3417 3418 PetscFunctionBegin; 3419 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3420 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3421 cnt = 0; 3422 for (i=0, row=0; i<m; i++) { 3423 switch (sizes[i]){ 3424 case 1: 3425 x[row] = b[row]*bdiag[cnt++];row++; 3426 break; 3427 case 2: 3428 x1 = b[row]; x2 = b[row+1]; 3429 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3430 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3431 x[row++] = tmp1; 3432 x[row++] = tmp2; 3433 cnt += 4; 3434 break; 3435 case 3: 3436 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3437 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3438 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3439 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3440 x[row++] = tmp1; 3441 x[row++] = tmp2; 3442 x[row++] = tmp3; 3443 cnt += 9; 3444 break; 3445 case 4: 3446 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3447 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3448 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3449 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3450 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3451 x[row++] = tmp1; 3452 x[row++] = tmp2; 3453 x[row++] = tmp3; 3454 x[row++] = tmp4; 3455 cnt += 16; 3456 break; 3457 case 5: 3458 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3459 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3460 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3461 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3462 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3463 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3464 x[row++] = tmp1; 3465 x[row++] = tmp2; 3466 x[row++] = tmp3; 3467 x[row++] = tmp4; 3468 x[row++] = tmp5; 3469 cnt += 25; 3470 break; 3471 default: 3472 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3473 } 3474 } 3475 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3476 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3477 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3478 PetscFunctionReturn(0); 3479 } 3480 3481 /* 3482 samestructure indicates that the matrix has not changed its nonzero structure so we 3483 do not need to recompute the inodes 3484 */ 3485 #undef __FUNCT__ 3486 #define __FUNCT__ "Mat_CheckInode" 3487 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3488 { 3489 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3490 PetscErrorCode ierr; 3491 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3492 PetscTruth flag; 3493 3494 PetscFunctionBegin; 3495 if (!a->inode.use) PetscFunctionReturn(0); 3496 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3497 3498 3499 m = A->rmap->n; 3500 if (a->inode.size) {ns = a->inode.size;} 3501 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3502 3503 i = 0; 3504 node_count = 0; 3505 idx = a->j; 3506 ii = a->i; 3507 while (i < m){ /* For each row */ 3508 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3509 /* Limits the number of elements in a node to 'a->inode.limit' */ 3510 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3511 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3512 if (nzy != nzx) break; 3513 idy += nzx; /* Same nonzero pattern */ 3514 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3515 if (!flag) break; 3516 } 3517 ns[node_count++] = blk_size; 3518 idx += blk_size*nzx; 3519 i = j; 3520 } 3521 /* If not enough inodes found,, do not use inode version of the routines */ 3522 if (!a->inode.size && m && node_count > .9*m) { 3523 ierr = PetscFree(ns);CHKERRQ(ierr); 3524 a->inode.node_count = 0; 3525 a->inode.size = PETSC_NULL; 3526 a->inode.use = PETSC_FALSE; 3527 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3528 } else { 3529 if (!A->factor) { 3530 A->ops->mult = MatMult_SeqAIJ_Inode; 3531 A->ops->sor = MatSOR_SeqAIJ_Inode; 3532 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3533 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3534 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3535 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3536 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3537 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3538 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3539 } else { 3540 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 3541 } 3542 a->inode.node_count = node_count; 3543 a->inode.size = ns; 3544 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3545 } 3546 PetscFunctionReturn(0); 3547 } 3548 3549 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3550 PetscInt __k, *__vi; \ 3551 __vi = aj + ai[row]; \ 3552 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3553 __vi = aj + adiag[row]; \ 3554 cols[nzl] = __vi[0];\ 3555 __vi = aj + adiag[row+1]+1;\ 3556 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3557 3558 3559 /* 3560 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3561 Modified from Mat_CheckInode(). 3562 3563 Input Parameters: 3564 + Mat A - ILU or LU matrix factor 3565 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3566 do not need to recompute the inodes 3567 */ 3568 #undef __FUNCT__ 3569 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3570 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3571 { 3572 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3573 PetscErrorCode ierr; 3574 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3575 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3576 PetscTruth flag; 3577 3578 PetscFunctionBegin; 3579 if (!a->inode.use) PetscFunctionReturn(0); 3580 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3581 3582 m = A->rmap->n; 3583 if (a->inode.size) {ns = a->inode.size;} 3584 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3585 3586 i = 0; 3587 node_count = 0; 3588 while (i < m){ /* For each row */ 3589 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3590 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3591 nzx = nzl1 + nzu1 + 1; 3592 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3593 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3594 3595 /* Limits the number of elements in a node to 'a->inode.limit' */ 3596 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3597 nzl2 = ai[j+1] - ai[j]; 3598 nzu2 = adiag[j] - adiag[j+1] - 1; 3599 nzy = nzl2 + nzu2 + 1; 3600 if( nzy != nzx) break; 3601 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3602 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3603 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3604 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3605 ierr = PetscFree(cols2);CHKERRQ(ierr); 3606 } 3607 ns[node_count++] = blk_size; 3608 ierr = PetscFree(cols1);CHKERRQ(ierr); 3609 i = j; 3610 } 3611 /* If not enough inodes found,, do not use inode version of the routines */ 3612 if (!a->inode.size && m && node_count > .9*m) { 3613 ierr = PetscFree(ns);CHKERRQ(ierr); 3614 a->inode.node_count = 0; 3615 a->inode.size = PETSC_NULL; 3616 a->inode.use = PETSC_FALSE; 3617 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3618 } else { 3619 A->ops->mult = 0; 3620 A->ops->sor = 0; 3621 A->ops->multadd = 0; 3622 A->ops->getrowij = 0; 3623 A->ops->restorerowij = 0; 3624 A->ops->getcolumnij = 0; 3625 A->ops->restorecolumnij = 0; 3626 A->ops->coloringpatch = 0; 3627 A->ops->multdiagonalblock = 0; 3628 A->ops->solve = MatSolve_SeqAIJ_Inode; 3629 a->inode.node_count = node_count; 3630 a->inode.size = ns; 3631 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3632 } 3633 PetscFunctionReturn(0); 3634 } 3635 3636 /* 3637 This is really ugly. if inodes are used this replaces the 3638 permutations with ones that correspond to rows/cols of the matrix 3639 rather then inode blocks 3640 */ 3641 #undef __FUNCT__ 3642 #define __FUNCT__ "MatInodeAdjustForInodes" 3643 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3644 { 3645 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3646 3647 PetscFunctionBegin; 3648 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3649 if (f) { 3650 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3651 } 3652 PetscFunctionReturn(0); 3653 } 3654 3655 EXTERN_C_BEGIN 3656 #undef __FUNCT__ 3657 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3658 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3659 { 3660 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3661 PetscErrorCode ierr; 3662 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3663 const PetscInt *ridx,*cidx; 3664 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3665 PetscInt nslim_col,*ns_col; 3666 IS ris = *rperm,cis = *cperm; 3667 3668 PetscFunctionBegin; 3669 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3670 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3671 3672 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3673 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3674 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3675 3676 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3677 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3678 3679 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3680 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3681 3682 /* Construct the permutations for rows*/ 3683 for (i=0,row = 0; i<nslim_row; ++i){ 3684 indx = ridx[i]; 3685 start_val = tns[indx]; 3686 end_val = tns[indx + 1]; 3687 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3688 } 3689 3690 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3691 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3692 3693 /* Construct permutations for columns */ 3694 for (i=0,col=0; i<nslim_col; ++i){ 3695 indx = cidx[i]; 3696 start_val = tns[indx]; 3697 end_val = tns[indx + 1]; 3698 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3699 } 3700 3701 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3702 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3703 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3704 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3705 3706 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3707 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3708 3709 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3710 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3711 ierr = ISDestroy(cis);CHKERRQ(ierr); 3712 ierr = ISDestroy(ris);CHKERRQ(ierr); 3713 ierr = PetscFree(tns);CHKERRQ(ierr); 3714 PetscFunctionReturn(0); 3715 } 3716 EXTERN_C_END 3717 3718 #undef __FUNCT__ 3719 #define __FUNCT__ "MatInodeGetInodeSizes" 3720 /*@C 3721 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3722 3723 Collective on Mat 3724 3725 Input Parameter: 3726 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3727 3728 Output Parameter: 3729 + node_count - no of inodes present in the matrix. 3730 . sizes - an array of size node_count,with sizes of each inode. 3731 - limit - the max size used to generate the inodes. 3732 3733 Level: advanced 3734 3735 Notes: This routine returns some internal storage information 3736 of the matrix, it is intended to be used by advanced users. 3737 It should be called after the matrix is assembled. 3738 The contents of the sizes[] array should not be changed. 3739 PETSC_NULL may be passed for information not requested. 3740 3741 .keywords: matrix, seqaij, get, inode 3742 3743 .seealso: MatGetInfo() 3744 @*/ 3745 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3746 { 3747 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3748 3749 PetscFunctionBegin; 3750 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3751 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3752 if (f) { 3753 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3754 } 3755 PetscFunctionReturn(0); 3756 } 3757 3758 EXTERN_C_BEGIN 3759 #undef __FUNCT__ 3760 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3761 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3762 { 3763 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3764 3765 PetscFunctionBegin; 3766 if (node_count) *node_count = a->inode.node_count; 3767 if (sizes) *sizes = a->inode.size; 3768 if (limit) *limit = a->inode.limit; 3769 PetscFunctionReturn(0); 3770 } 3771 EXTERN_C_END 3772