1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1195 FactorShiftCtx sctx; 1196 const PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,node_max,col; 1200 const PetscInt *ns; 1201 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1202 1203 PetscFunctionBegin; 1204 /* MatPivotSetUp(): initialize shift context sctx */ 1205 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1206 1207 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1208 ddiag = a->diag; 1209 sctx.shift_top = info->zeropivot; 1210 for (i=0; i<n; i++) { 1211 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1212 d = (aa)[ddiag[i]]; 1213 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1214 v = aa+ai[i]; 1215 nz = ai[i+1] - ai[i]; 1216 for (j=0; j<nz; j++) 1217 rs += PetscAbsScalar(v[j]); 1218 if (rs>sctx.shift_top) sctx.shift_top = rs; 1219 } 1220 sctx.shift_top *= 1.1; 1221 sctx.nshift_max = 5; 1222 sctx.shift_lo = 0.; 1223 sctx.shift_hi = 1.; 1224 } 1225 1226 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1227 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1228 1229 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1230 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1231 rtmp2 = rtmp1 + n; 1232 rtmp3 = rtmp2 + n; 1233 rtmp4 = rtmp3 + n; 1234 ics = ic; 1235 1236 node_max = a->inode.node_count; 1237 ns = a->inode.size; 1238 if (!ns){ 1239 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1240 } 1241 1242 /* If max inode size > 4, split it into two inodes.*/ 1243 /* also map the inode sizes according to the ordering */ 1244 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1245 for (i=0,j=0; i<node_max; ++i,++j){ 1246 if (ns[i]>4) { 1247 tmp_vec1[j] = 4; 1248 ++j; 1249 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1250 } else { 1251 tmp_vec1[j] = ns[i]; 1252 } 1253 } 1254 /* Use the correct node_max */ 1255 node_max = j; 1256 1257 /* Now reorder the inode info based on mat re-ordering info */ 1258 /* First create a row -> inode_size_array_index map */ 1259 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1260 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1261 for (i=0,row=0; i<node_max; i++) { 1262 nodesz = tmp_vec1[i]; 1263 for (j=0; j<nodesz; j++,row++) { 1264 nsmap[row] = i; 1265 } 1266 } 1267 /* Using nsmap, create a reordered ns structure */ 1268 for (i=0,j=0; i< node_max; i++) { 1269 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1270 tmp_vec2[i] = nodesz; 1271 j += nodesz; 1272 } 1273 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1274 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1275 1276 /* Now use the correct ns */ 1277 ns = tmp_vec2; 1278 1279 do { 1280 sctx.useshift = PETSC_FALSE; 1281 /* Now loop over each block-row, and do the factorization */ 1282 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1283 nodesz = ns[inod]; 1284 1285 switch (nodesz){ 1286 case 1: 1287 /*----------*/ 1288 /* zero rtmp1 */ 1289 /* L part */ 1290 nz = bi[i+1] - bi[i]; 1291 bjtmp = bj + bi[i]; 1292 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1293 1294 /* U part */ 1295 nz = bdiag[i]-bdiag[i+1]; 1296 bjtmp = bj + bdiag[i+1]+1; 1297 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1298 1299 /* load in initial (unfactored row) */ 1300 nz = ai[r[i]+1] - ai[r[i]]; 1301 ajtmp = aj + ai[r[i]]; 1302 v = aa + ai[r[i]]; 1303 for (j=0; j<nz; j++) { 1304 rtmp1[ics[ajtmp[j]]] = v[j]; 1305 } 1306 /* ZeropivotApply() */ 1307 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1308 1309 /* elimination */ 1310 bjtmp = bj + bi[i]; 1311 row = *bjtmp++; 1312 nzL = bi[i+1] - bi[i]; 1313 for(k=0; k < nzL;k++) { 1314 pc = rtmp1 + row; 1315 if (*pc != 0.0) { 1316 pv = b->a + bdiag[row]; 1317 mul1 = *pc * (*pv); 1318 *pc = mul1; 1319 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1320 pv = b->a + bdiag[row+1]+1; 1321 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1322 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1323 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1324 } 1325 row = *bjtmp++; 1326 } 1327 1328 /* finished row so stick it into b->a */ 1329 rs = 0.0; 1330 /* L part */ 1331 pv = b->a + bi[i] ; 1332 pj = b->j + bi[i] ; 1333 nz = bi[i+1] - bi[i]; 1334 for (j=0; j<nz; j++) { 1335 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1336 } 1337 1338 /* U part */ 1339 pv = b->a + bdiag[i+1]+1; 1340 pj = b->j + bdiag[i+1]+1; 1341 nz = bdiag[i] - bdiag[i+1]-1; 1342 for (j=0; j<nz; j++) { 1343 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1344 } 1345 1346 /* Check zero pivot */ 1347 sctx.rs = rs; 1348 sctx.pv = rtmp1[i]; 1349 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1350 1351 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1352 pv = b->a + bdiag[i]; 1353 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1354 break; 1355 1356 case 2: 1357 /*----------*/ 1358 /* zero rtmp1 and rtmp2 */ 1359 /* L part */ 1360 nz = bi[i+1] - bi[i]; 1361 bjtmp = bj + bi[i]; 1362 for (j=0; j<nz; j++) { 1363 col = bjtmp[j]; 1364 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1365 } 1366 1367 /* U part */ 1368 nz = bdiag[i]-bdiag[i+1]; 1369 bjtmp = bj + bdiag[i+1]+1; 1370 for (j=0; j<nz; j++) { 1371 col = bjtmp[j]; 1372 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1373 } 1374 1375 /* load in initial (unfactored row) */ 1376 nz = ai[r[i]+1] - ai[r[i]]; 1377 ajtmp = aj + ai[r[i]]; 1378 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1379 for (j=0; j<nz; j++) { 1380 col = ics[ajtmp[j]]; 1381 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1382 } 1383 /* ZeropivotApply(): shift the diagonal of the matrix */ 1384 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1385 1386 /* elimination */ 1387 bjtmp = bj + bi[i]; 1388 row = *bjtmp++; /* pivot row */ 1389 nzL = bi[i+1] - bi[i]; 1390 for(k=0; k < nzL;k++) { 1391 pc1 = rtmp1 + row; 1392 pc2 = rtmp2 + row; 1393 if (*pc1 != 0.0 || *pc2 != 0.0) { 1394 pv = b->a + bdiag[row]; 1395 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1396 *pc1 = mul1; *pc2 = mul2; 1397 1398 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1399 pv = b->a + bdiag[row+1]+1; 1400 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1401 for (j=0; j<nz; j++){ 1402 col = pj[j]; 1403 rtmp1[col] -= mul1 * pv[j]; 1404 rtmp2[col] -= mul2 * pv[j]; 1405 } 1406 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1407 } 1408 row = *bjtmp++; 1409 } 1410 1411 /* finished row i; check zero pivot, then stick row i into b->a */ 1412 rs = 0.0; 1413 /* L part */ 1414 pc1 = b->a + bi[i]; 1415 pj = b->j + bi[i] ; 1416 nz = bi[i+1] - bi[i]; 1417 for (j=0; j<nz; j++) { 1418 col = pj[j]; 1419 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1420 } 1421 /* U part */ 1422 pc1 = b->a + bdiag[i+1]+1; 1423 pj = b->j + bdiag[i+1]+1; 1424 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1425 for (j=0; j<nz; j++) { 1426 col = pj[j]; 1427 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1428 } 1429 1430 sctx.rs = rs; 1431 sctx.pv = rtmp1[i]; 1432 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1433 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1434 *pc1 = 1.0/sctx.pv; 1435 1436 /* Now take care of diagonal 2x2 block. */ 1437 pc2 = rtmp2 + i; 1438 if (*pc2 != 0.0){ 1439 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1440 *pc2 = mul1; /* insert L entry */ 1441 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1442 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1443 for (j=0; j<nz; j++) { 1444 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1445 } 1446 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1447 } 1448 1449 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1450 rs = 0.0; 1451 /* L part */ 1452 pc2 = b->a + bi[i+1]; 1453 pj = b->j + bi[i+1] ; 1454 nz = bi[i+2] - bi[i+1]; 1455 for (j=0; j<nz; j++) { 1456 col = pj[j]; 1457 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1458 } 1459 /* U part */ 1460 pc2 = b->a + bdiag[i+2]+1; 1461 pj = b->j + bdiag[i+2]+1; 1462 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1463 for (j=0; j<nz; j++) { 1464 col = pj[j]; 1465 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1466 } 1467 1468 sctx.rs = rs; 1469 sctx.pv = rtmp2[i+1]; 1470 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1471 pc2 = b->a + bdiag[i+1]; 1472 *pc2 = 1.0/sctx.pv; 1473 break; 1474 1475 case 3: 1476 /*----------*/ 1477 /* zero rtmp */ 1478 /* L part */ 1479 nz = bi[i+1] - bi[i]; 1480 bjtmp = bj + bi[i]; 1481 for (j=0; j<nz; j++) { 1482 col = bjtmp[j]; 1483 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1484 } 1485 1486 /* U part */ 1487 nz = bdiag[i]-bdiag[i+1]; 1488 bjtmp = bj + bdiag[i+1]+1; 1489 for (j=0; j<nz; j++) { 1490 col = bjtmp[j]; 1491 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1492 } 1493 1494 /* load in initial (unfactored row) */ 1495 nz = ai[r[i]+1] - ai[r[i]]; 1496 ajtmp = aj + ai[r[i]]; 1497 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1498 for (j=0; j<nz; j++) { 1499 col = ics[ajtmp[j]]; 1500 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1501 } 1502 /* ZeropivotApply(): shift the diagonal of the matrix */ 1503 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1504 1505 /* elimination */ 1506 bjtmp = bj + bi[i]; 1507 row = *bjtmp++; /* pivot row */ 1508 nzL = bi[i+1] - bi[i]; 1509 for(k=0; k < nzL;k++) { 1510 pc1 = rtmp1 + row; 1511 pc2 = rtmp2 + row; 1512 pc3 = rtmp3 + row; 1513 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1514 pv = b->a + bdiag[row]; 1515 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1516 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1517 1518 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1519 pv = b->a + bdiag[row+1]+1; 1520 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1521 for (j=0; j<nz; j++){ 1522 col = pj[j]; 1523 rtmp1[col] -= mul1 * pv[j]; 1524 rtmp2[col] -= mul2 * pv[j]; 1525 rtmp3[col] -= mul3 * pv[j]; 1526 } 1527 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1528 } 1529 row = *bjtmp++; 1530 } 1531 1532 /* finished row i; check zero pivot, then stick row i into b->a */ 1533 rs = 0.0; 1534 /* L part */ 1535 pc1 = b->a + bi[i]; 1536 pj = b->j + bi[i] ; 1537 nz = bi[i+1] - bi[i]; 1538 for (j=0; j<nz; j++) { 1539 col = pj[j]; 1540 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1541 } 1542 /* U part */ 1543 pc1 = b->a + bdiag[i+1]+1; 1544 pj = b->j + bdiag[i+1]+1; 1545 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1546 for (j=0; j<nz; j++) { 1547 col = pj[j]; 1548 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1549 } 1550 1551 sctx.rs = rs; 1552 sctx.pv = rtmp1[i]; 1553 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1554 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1555 *pc1 = 1.0/sctx.pv; 1556 1557 /* Now take care of 1st column of diagonal 3x3 block. */ 1558 pc2 = rtmp2 + i; 1559 pc3 = rtmp3 + i; 1560 if (*pc2 != 0.0 || *pc3 != 0.0){ 1561 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1562 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1563 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1564 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1565 for (j=0; j<nz; j++) { 1566 col = pj[j]; 1567 rtmp2[col] -= mul2 * rtmp1[col]; 1568 rtmp3[col] -= mul3 * rtmp1[col]; 1569 } 1570 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1571 } 1572 1573 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1574 rs = 0.0; 1575 /* L part */ 1576 pc2 = b->a + bi[i+1]; 1577 pj = b->j + bi[i+1] ; 1578 nz = bi[i+2] - bi[i+1]; 1579 for (j=0; j<nz; j++) { 1580 col = pj[j]; 1581 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1582 } 1583 /* U part */ 1584 pc2 = b->a + bdiag[i+2]+1; 1585 pj = b->j + bdiag[i+2]+1; 1586 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1587 for (j=0; j<nz; j++) { 1588 col = pj[j]; 1589 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1590 } 1591 1592 sctx.rs = rs; 1593 sctx.pv = rtmp2[i+1]; 1594 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1595 pc2 = b->a + bdiag[i+1]; 1596 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1597 1598 /* Now take care of 2nd column of diagonal 3x3 block. */ 1599 pc3 = rtmp3 + i+1; 1600 if (*pc3 != 0.0){ 1601 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1602 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1603 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1604 for (j=0; j<nz; j++) { 1605 col = pj[j]; 1606 rtmp3[col] -= mul3 * rtmp2[col]; 1607 } 1608 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1609 } 1610 1611 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1612 rs = 0.0; 1613 /* L part */ 1614 pc3 = b->a + bi[i+2]; 1615 pj = b->j + bi[i+2] ; 1616 nz = bi[i+3] - bi[i+2]; 1617 for (j=0; j<nz; j++) { 1618 col = pj[j]; 1619 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1620 } 1621 /* U part */ 1622 pc3 = b->a + bdiag[i+3]+1; 1623 pj = b->j + bdiag[i+3]+1; 1624 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1625 for (j=0; j<nz; j++) { 1626 col = pj[j]; 1627 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1628 } 1629 1630 sctx.rs = rs; 1631 sctx.pv = rtmp3[i+2]; 1632 ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr); 1633 pc3 = b->a + bdiag[i+2]; 1634 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1635 break; 1636 case 4: 1637 /*----------*/ 1638 /* zero rtmp */ 1639 /* L part */ 1640 nz = bi[i+1] - bi[i]; 1641 bjtmp = bj + bi[i]; 1642 for (j=0; j<nz; j++) { 1643 col = bjtmp[j]; 1644 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1645 } 1646 1647 /* U part */ 1648 nz = bdiag[i]-bdiag[i+1]; 1649 bjtmp = bj + bdiag[i+1]+1; 1650 for (j=0; j<nz; j++) { 1651 col = bjtmp[j]; 1652 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1653 } 1654 1655 /* load in initial (unfactored row) */ 1656 nz = ai[r[i]+1] - ai[r[i]]; 1657 ajtmp = aj + ai[r[i]]; 1658 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1659 for (j=0; j<nz; j++) { 1660 col = ics[ajtmp[j]]; 1661 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1662 } 1663 /* ZeropivotApply(): shift the diagonal of the matrix */ 1664 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1665 1666 /* elimination */ 1667 bjtmp = bj + bi[i]; 1668 row = *bjtmp++; /* pivot row */ 1669 nzL = bi[i+1] - bi[i]; 1670 for(k=0; k < nzL;k++) { 1671 pc1 = rtmp1 + row; 1672 pc2 = rtmp2 + row; 1673 pc3 = rtmp3 + row; 1674 pc4 = rtmp4 + row; 1675 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1676 pv = b->a + bdiag[row]; 1677 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1678 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1679 1680 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1681 pv = b->a + bdiag[row+1]+1; 1682 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1683 for (j=0; j<nz; j++){ 1684 col = pj[j]; 1685 rtmp1[col] -= mul1 * pv[j]; 1686 rtmp2[col] -= mul2 * pv[j]; 1687 rtmp3[col] -= mul3 * pv[j]; 1688 rtmp4[col] -= mul4 * pv[j]; 1689 } 1690 ierr = PetscLogFlops(8*nz);CHKERRQ(ierr); 1691 } 1692 row = *bjtmp++; 1693 } 1694 1695 /* finished row i; check zero pivot, then stick row i into b->a */ 1696 rs = 0.0; 1697 /* L part */ 1698 pc1 = b->a + bi[i]; 1699 pj = b->j + bi[i] ; 1700 nz = bi[i+1] - bi[i]; 1701 for (j=0; j<nz; j++) { 1702 col = pj[j]; 1703 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1704 } 1705 /* U part */ 1706 pc1 = b->a + bdiag[i+1]+1; 1707 pj = b->j + bdiag[i+1]+1; 1708 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1709 for (j=0; j<nz; j++) { 1710 col = pj[j]; 1711 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1712 } 1713 1714 sctx.rs = rs; 1715 sctx.pv = rtmp1[i]; 1716 ierr = MatPivotCheck(info,sctx,i);CHKERRQ(ierr); 1717 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1718 *pc1 = 1.0/sctx.pv; 1719 1720 /* Now take care of 1st column of diagonal 4x4 block. */ 1721 pc2 = rtmp2 + i; 1722 pc3 = rtmp3 + i; 1723 pc4 = rtmp4 + i; 1724 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){ 1725 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1726 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1727 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1728 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1729 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1730 for (j=0; j<nz; j++) { 1731 col = pj[j]; 1732 rtmp2[col] -= mul2 * rtmp1[col]; 1733 rtmp3[col] -= mul3 * rtmp1[col]; 1734 rtmp4[col] -= mul4 * rtmp1[col]; 1735 } 1736 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1737 } 1738 1739 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1740 rs = 0.0; 1741 /* L part */ 1742 pc2 = b->a + bi[i+1]; 1743 pj = b->j + bi[i+1] ; 1744 nz = bi[i+2] - bi[i+1]; 1745 for (j=0; j<nz; j++) { 1746 col = pj[j]; 1747 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1748 } 1749 /* U part */ 1750 pc2 = b->a + bdiag[i+2]+1; 1751 pj = b->j + bdiag[i+2]+1; 1752 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1753 for (j=0; j<nz; j++) { 1754 col = pj[j]; 1755 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1756 } 1757 1758 sctx.rs = rs; 1759 sctx.pv = rtmp2[i+1]; 1760 ierr = MatPivotCheck(info,sctx,i+1);CHKERRQ(ierr); 1761 pc2 = b->a + bdiag[i+1]; 1762 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1763 1764 /* Now take care of 2nd column of diagonal 4x4 block. */ 1765 pc3 = rtmp3 + i+1; 1766 pc4 = rtmp4 + i+1; 1767 if (*pc3 != 0.0 || *pc4 != 0.0){ 1768 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1769 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1770 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1771 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1772 for (j=0; j<nz; j++) { 1773 col = pj[j]; 1774 rtmp3[col] -= mul3 * rtmp2[col]; 1775 rtmp4[col] -= mul4 * rtmp2[col]; 1776 } 1777 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1778 } 1779 1780 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1781 rs = 0.0; 1782 /* L part */ 1783 pc3 = b->a + bi[i+2]; 1784 pj = b->j + bi[i+2] ; 1785 nz = bi[i+3] - bi[i+2]; 1786 for (j=0; j<nz; j++) { 1787 col = pj[j]; 1788 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1789 } 1790 /* U part */ 1791 pc3 = b->a + bdiag[i+3]+1; 1792 pj = b->j + bdiag[i+3]+1; 1793 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1794 for (j=0; j<nz; j++) { 1795 col = pj[j]; 1796 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1797 } 1798 1799 sctx.rs = rs; 1800 sctx.pv = rtmp3[i+2]; 1801 ierr = MatPivotCheck(info,sctx,i+2);CHKERRQ(ierr); 1802 pc3 = b->a + bdiag[i+2]; 1803 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1804 1805 /* Now take care of 3rd column of diagonal 4x4 block. */ 1806 pc4 = rtmp4 + i+2; 1807 if (*pc4 != 0.0){ 1808 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1809 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1810 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1811 for (j=0; j<nz; j++) { 1812 col = pj[j]; 1813 rtmp4[col] -= mul4 * rtmp3[col]; 1814 } 1815 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1816 } 1817 1818 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1819 rs = 0.0; 1820 /* L part */ 1821 pc4 = b->a + bi[i+3]; 1822 pj = b->j + bi[i+3] ; 1823 nz = bi[i+4] - bi[i+3]; 1824 for (j=0; j<nz; j++) { 1825 col = pj[j]; 1826 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1827 } 1828 /* U part */ 1829 pc4 = b->a + bdiag[i+4]+1; 1830 pj = b->j + bdiag[i+4]+1; 1831 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1832 for (j=0; j<nz; j++) { 1833 col = pj[j]; 1834 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1835 } 1836 1837 sctx.rs = rs; 1838 sctx.pv = rtmp4[i+3]; 1839 ierr = MatPivotCheck(info,sctx,i+3);CHKERRQ(ierr); 1840 pc4 = b->a + bdiag[i+3]; 1841 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1842 break; 1843 1844 default: 1845 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 1846 } 1847 i += nodesz; /* Update the row */ 1848 } 1849 1850 /* MatPivotRefine() */ 1851 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1852 /* 1853 * if no shift in this attempt & shifting & started shifting & can refine, 1854 * then try lower shift 1855 */ 1856 sctx.shift_hi = sctx.shift_fraction; 1857 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1858 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1859 sctx.useshift = PETSC_TRUE; 1860 sctx.nshift++; 1861 } 1862 } while (sctx.useshift); 1863 1864 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1865 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1866 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1867 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1868 1869 C->ops->solve = MatSolve_SeqAIJ; 1870 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1871 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1872 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1873 C->ops->matsolve = MatMatSolve_SeqAIJ; 1874 C->assembled = PETSC_TRUE; 1875 C->preallocated = PETSC_TRUE; 1876 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1877 1878 /* MatShiftView(A,info,&sctx) */ 1879 if (sctx.nshift){ 1880 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1881 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1882 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1883 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1884 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){ 1885 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1886 } 1887 } 1888 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1889 PetscFunctionReturn(0); 1890 } 1891 1892 #undef __FUNCT__ 1893 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1894 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1895 { 1896 Mat C = B; 1897 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1898 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1899 PetscErrorCode ierr; 1900 const PetscInt *r,*ic,*c,*ics; 1901 PetscInt n = A->rmap->n,*bi = b->i; 1902 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1903 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1904 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1905 PetscScalar mul1,mul2,mul3,tmp; 1906 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1907 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1908 PetscReal rs=0.0; 1909 FactorShiftCtx sctx; 1910 PetscInt newshift; 1911 1912 PetscFunctionBegin; 1913 sctx.shift_top = 0; 1914 sctx.nshift_max = 0; 1915 sctx.shift_lo = 0; 1916 sctx.shift_hi = 0; 1917 sctx.shift_fraction = 0; 1918 1919 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1920 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1921 sctx.shift_top = 0; 1922 for (i=0; i<n; i++) { 1923 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1924 rs = 0.0; 1925 ajtmp = aj + ai[i]; 1926 rtmp1 = aa + ai[i]; 1927 nz = ai[i+1] - ai[i]; 1928 for (j=0; j<nz; j++){ 1929 if (*ajtmp != i){ 1930 rs += PetscAbsScalar(*rtmp1++); 1931 } else { 1932 rs -= PetscRealPart(*rtmp1++); 1933 } 1934 ajtmp++; 1935 } 1936 if (rs>sctx.shift_top) sctx.shift_top = rs; 1937 } 1938 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1939 sctx.shift_top *= 1.1; 1940 sctx.nshift_max = 5; 1941 sctx.shift_lo = 0.; 1942 sctx.shift_hi = 1.; 1943 } 1944 sctx.shift_amount = 0; 1945 sctx.nshift = 0; 1946 1947 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1948 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1949 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1950 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1951 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1952 ics = ic ; 1953 rtmp22 = rtmp11 + n; 1954 rtmp33 = rtmp22 + n; 1955 1956 node_max = a->inode.node_count; 1957 ns = a->inode.size; 1958 if (!ns){ 1959 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1960 } 1961 1962 /* If max inode size > 3, split it into two inodes.*/ 1963 /* also map the inode sizes according to the ordering */ 1964 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1965 for (i=0,j=0; i<node_max; ++i,++j){ 1966 if (ns[i]>3) { 1967 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1968 ++j; 1969 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1970 } else { 1971 tmp_vec1[j] = ns[i]; 1972 } 1973 } 1974 /* Use the correct node_max */ 1975 node_max = j; 1976 1977 /* Now reorder the inode info based on mat re-ordering info */ 1978 /* First create a row -> inode_size_array_index map */ 1979 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1980 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1981 for (i=0,row=0; i<node_max; i++) { 1982 nodesz = tmp_vec1[i]; 1983 for (j=0; j<nodesz; j++,row++) { 1984 nsmap[row] = i; 1985 } 1986 } 1987 /* Using nsmap, create a reordered ns structure */ 1988 for (i=0,j=0; i< node_max; i++) { 1989 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1990 tmp_vec2[i] = nodesz; 1991 j += nodesz; 1992 } 1993 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1994 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1995 /* Now use the correct ns */ 1996 ns = tmp_vec2; 1997 1998 do { 1999 sctx.useshift = PETSC_FALSE; 2000 /* Now loop over each block-row, and do the factorization */ 2001 for (i=0,row=0; i<node_max; i++) { 2002 nodesz = ns[i]; 2003 nz = bi[row+1] - bi[row]; 2004 bjtmp = bj + bi[row]; 2005 2006 switch (nodesz){ 2007 case 1: 2008 for (j=0; j<nz; j++){ 2009 idx = bjtmp[j]; 2010 rtmp11[idx] = 0.0; 2011 } 2012 2013 /* load in initial (unfactored row) */ 2014 idx = r[row]; 2015 nz_tmp = ai[idx+1] - ai[idx]; 2016 ajtmp = aj + ai[idx]; 2017 v1 = aa + ai[idx]; 2018 2019 for (j=0; j<nz_tmp; j++) { 2020 idx = ics[ajtmp[j]]; 2021 rtmp11[idx] = v1[j]; 2022 } 2023 rtmp11[ics[r[row]]] += sctx.shift_amount; 2024 2025 prow = *bjtmp++ ; 2026 while (prow < row) { 2027 pc1 = rtmp11 + prow; 2028 if (*pc1 != 0.0){ 2029 pv = ba + bd[prow]; 2030 pj = nbj + bd[prow]; 2031 mul1 = *pc1 * *pv++; 2032 *pc1 = mul1; 2033 nz_tmp = bi[prow+1] - bd[prow] - 1; 2034 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2035 for (j=0; j<nz_tmp; j++) { 2036 tmp = pv[j]; 2037 idx = pj[j]; 2038 rtmp11[idx] -= mul1 * tmp; 2039 } 2040 } 2041 prow = *bjtmp++ ; 2042 } 2043 pj = bj + bi[row]; 2044 pc1 = ba + bi[row]; 2045 2046 sctx.pv = rtmp11[row]; 2047 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2048 rs = 0.0; 2049 for (j=0; j<nz; j++) { 2050 idx = pj[j]; 2051 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2052 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2053 } 2054 sctx.rs = rs; 2055 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2056 if (newshift == 1) goto endofwhile; 2057 break; 2058 2059 case 2: 2060 for (j=0; j<nz; j++) { 2061 idx = bjtmp[j]; 2062 rtmp11[idx] = 0.0; 2063 rtmp22[idx] = 0.0; 2064 } 2065 2066 /* load in initial (unfactored row) */ 2067 idx = r[row]; 2068 nz_tmp = ai[idx+1] - ai[idx]; 2069 ajtmp = aj + ai[idx]; 2070 v1 = aa + ai[idx]; 2071 v2 = aa + ai[idx+1]; 2072 for (j=0; j<nz_tmp; j++) { 2073 idx = ics[ajtmp[j]]; 2074 rtmp11[idx] = v1[j]; 2075 rtmp22[idx] = v2[j]; 2076 } 2077 rtmp11[ics[r[row]]] += sctx.shift_amount; 2078 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2079 2080 prow = *bjtmp++ ; 2081 while (prow < row) { 2082 pc1 = rtmp11 + prow; 2083 pc2 = rtmp22 + prow; 2084 if (*pc1 != 0.0 || *pc2 != 0.0){ 2085 pv = ba + bd[prow]; 2086 pj = nbj + bd[prow]; 2087 mul1 = *pc1 * *pv; 2088 mul2 = *pc2 * *pv; 2089 ++pv; 2090 *pc1 = mul1; 2091 *pc2 = mul2; 2092 2093 nz_tmp = bi[prow+1] - bd[prow] - 1; 2094 for (j=0; j<nz_tmp; j++) { 2095 tmp = pv[j]; 2096 idx = pj[j]; 2097 rtmp11[idx] -= mul1 * tmp; 2098 rtmp22[idx] -= mul2 * tmp; 2099 } 2100 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2101 } 2102 prow = *bjtmp++ ; 2103 } 2104 2105 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2106 pc1 = rtmp11 + prow; 2107 pc2 = rtmp22 + prow; 2108 2109 sctx.pv = *pc1; 2110 pj = bj + bi[prow]; 2111 rs = 0.0; 2112 for (j=0; j<nz; j++){ 2113 idx = pj[j]; 2114 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2115 } 2116 sctx.rs = rs; 2117 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2118 if (newshift == 1) goto endofwhile; 2119 2120 if (*pc2 != 0.0){ 2121 pj = nbj + bd[prow]; 2122 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2123 *pc2 = mul2; 2124 nz_tmp = bi[prow+1] - bd[prow] - 1; 2125 for (j=0; j<nz_tmp; j++) { 2126 idx = pj[j] ; 2127 tmp = rtmp11[idx]; 2128 rtmp22[idx] -= mul2 * tmp; 2129 } 2130 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2131 } 2132 2133 pj = bj + bi[row]; 2134 pc1 = ba + bi[row]; 2135 pc2 = ba + bi[row+1]; 2136 2137 sctx.pv = rtmp22[row+1]; 2138 rs = 0.0; 2139 rtmp11[row] = 1.0/rtmp11[row]; 2140 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2141 /* copy row entries from dense representation to sparse */ 2142 for (j=0; j<nz; j++) { 2143 idx = pj[j]; 2144 pc1[j] = rtmp11[idx]; 2145 pc2[j] = rtmp22[idx]; 2146 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2147 } 2148 sctx.rs = rs; 2149 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2150 if (newshift == 1) goto endofwhile; 2151 break; 2152 2153 case 3: 2154 for (j=0; j<nz; j++) { 2155 idx = bjtmp[j]; 2156 rtmp11[idx] = 0.0; 2157 rtmp22[idx] = 0.0; 2158 rtmp33[idx] = 0.0; 2159 } 2160 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2161 idx = r[row]; 2162 nz_tmp = ai[idx+1] - ai[idx]; 2163 ajtmp = aj + ai[idx]; 2164 v1 = aa + ai[idx]; 2165 v2 = aa + ai[idx+1]; 2166 v3 = aa + ai[idx+2]; 2167 for (j=0; j<nz_tmp; j++) { 2168 idx = ics[ajtmp[j]]; 2169 rtmp11[idx] = v1[j]; 2170 rtmp22[idx] = v2[j]; 2171 rtmp33[idx] = v3[j]; 2172 } 2173 rtmp11[ics[r[row]]] += sctx.shift_amount; 2174 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2175 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2176 2177 /* loop over all pivot row blocks above this row block */ 2178 prow = *bjtmp++ ; 2179 while (prow < row) { 2180 pc1 = rtmp11 + prow; 2181 pc2 = rtmp22 + prow; 2182 pc3 = rtmp33 + prow; 2183 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 2184 pv = ba + bd[prow]; 2185 pj = nbj + bd[prow]; 2186 mul1 = *pc1 * *pv; 2187 mul2 = *pc2 * *pv; 2188 mul3 = *pc3 * *pv; 2189 ++pv; 2190 *pc1 = mul1; 2191 *pc2 = mul2; 2192 *pc3 = mul3; 2193 2194 nz_tmp = bi[prow+1] - bd[prow] - 1; 2195 /* update this row based on pivot row */ 2196 for (j=0; j<nz_tmp; j++) { 2197 tmp = pv[j]; 2198 idx = pj[j]; 2199 rtmp11[idx] -= mul1 * tmp; 2200 rtmp22[idx] -= mul2 * tmp; 2201 rtmp33[idx] -= mul3 * tmp; 2202 } 2203 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 2204 } 2205 prow = *bjtmp++ ; 2206 } 2207 2208 /* Now take care of diagonal 3x3 block in this set of rows */ 2209 /* note: prow = row here */ 2210 pc1 = rtmp11 + prow; 2211 pc2 = rtmp22 + prow; 2212 pc3 = rtmp33 + prow; 2213 2214 sctx.pv = *pc1; 2215 pj = bj + bi[prow]; 2216 rs = 0.0; 2217 for (j=0; j<nz; j++){ 2218 idx = pj[j]; 2219 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2220 } 2221 sctx.rs = rs; 2222 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2223 if (newshift == 1) goto endofwhile; 2224 2225 if (*pc2 != 0.0 || *pc3 != 0.0){ 2226 mul2 = (*pc2)/(*pc1); 2227 mul3 = (*pc3)/(*pc1); 2228 *pc2 = mul2; 2229 *pc3 = mul3; 2230 nz_tmp = bi[prow+1] - bd[prow] - 1; 2231 pj = nbj + bd[prow]; 2232 for (j=0; j<nz_tmp; j++) { 2233 idx = pj[j] ; 2234 tmp = rtmp11[idx]; 2235 rtmp22[idx] -= mul2 * tmp; 2236 rtmp33[idx] -= mul3 * tmp; 2237 } 2238 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2239 } 2240 ++prow; 2241 2242 pc2 = rtmp22 + prow; 2243 pc3 = rtmp33 + prow; 2244 sctx.pv = *pc2; 2245 pj = bj + bi[prow]; 2246 rs = 0.0; 2247 for (j=0; j<nz; j++){ 2248 idx = pj[j]; 2249 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2250 } 2251 sctx.rs = rs; 2252 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2253 if (newshift == 1) goto endofwhile; 2254 2255 if (*pc3 != 0.0){ 2256 mul3 = (*pc3)/(*pc2); 2257 *pc3 = mul3; 2258 pj = nbj + bd[prow]; 2259 nz_tmp = bi[prow+1] - bd[prow] - 1; 2260 for (j=0; j<nz_tmp; j++) { 2261 idx = pj[j] ; 2262 tmp = rtmp22[idx]; 2263 rtmp33[idx] -= mul3 * tmp; 2264 } 2265 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2266 } 2267 2268 pj = bj + bi[row]; 2269 pc1 = ba + bi[row]; 2270 pc2 = ba + bi[row+1]; 2271 pc3 = ba + bi[row+2]; 2272 2273 sctx.pv = rtmp33[row+2]; 2274 rs = 0.0; 2275 rtmp11[row] = 1.0/rtmp11[row]; 2276 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2277 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2278 /* copy row entries from dense representation to sparse */ 2279 for (j=0; j<nz; j++) { 2280 idx = pj[j]; 2281 pc1[j] = rtmp11[idx]; 2282 pc2[j] = rtmp22[idx]; 2283 pc3[j] = rtmp33[idx]; 2284 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2285 } 2286 2287 sctx.rs = rs; 2288 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 2289 if (newshift == 1) goto endofwhile; 2290 break; 2291 2292 default: 2293 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 2294 } 2295 row += nodesz; /* Update the row */ 2296 } 2297 endofwhile:; 2298 } while (sctx.useshift); 2299 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2300 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2301 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2302 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2303 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2304 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2305 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2306 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2307 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2308 C->assembled = PETSC_TRUE; 2309 C->preallocated = PETSC_TRUE; 2310 if (sctx.nshift) { 2311 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2312 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2313 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2314 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2315 } 2316 } 2317 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2318 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2319 PetscFunctionReturn(0); 2320 } 2321 2322 2323 /* ----------------------------------------------------------- */ 2324 #undef __FUNCT__ 2325 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2326 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2327 { 2328 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2329 IS iscol = a->col,isrow = a->row; 2330 PetscErrorCode ierr; 2331 const PetscInt *r,*c,*rout,*cout; 2332 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2333 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2334 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2335 PetscScalar sum1,sum2,sum3,sum4,sum5; 2336 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2337 const PetscScalar *b; 2338 2339 PetscFunctionBegin; 2340 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 2341 node_max = a->inode.node_count; 2342 ns = a->inode.size; /* Node Size array */ 2343 2344 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2345 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2346 tmp = a->solve_work; 2347 2348 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2349 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2350 2351 /* forward solve the lower triangular */ 2352 tmps = tmp ; 2353 aa = a_a ; 2354 aj = a_j ; 2355 ad = a->diag; 2356 2357 for (i = 0,row = 0; i< node_max; ++i){ 2358 nsz = ns[i]; 2359 aii = ai[row]; 2360 v1 = aa + aii; 2361 vi = aj + aii; 2362 nz = ai[row+1]- ai[row]; 2363 2364 if (i < node_max-1) { 2365 /* Prefetch the indices for the next block */ 2366 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2367 /* Prefetch the data for the next block */ 2368 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2369 } 2370 2371 switch (nsz){ /* Each loop in 'case' is unrolled */ 2372 case 1 : 2373 sum1 = b[r[row]]; 2374 for(j=0; j<nz-1; j+=2){ 2375 i0 = vi[j]; 2376 i1 = vi[j+1]; 2377 tmp0 = tmps[i0]; 2378 tmp1 = tmps[i1]; 2379 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2380 } 2381 if(j == nz-1){ 2382 tmp0 = tmps[vi[j]]; 2383 sum1 -= v1[j]*tmp0; 2384 } 2385 tmp[row++]=sum1; 2386 break; 2387 case 2: 2388 sum1 = b[r[row]]; 2389 sum2 = b[r[row+1]]; 2390 v2 = aa + ai[row+1]; 2391 2392 for(j=0; j<nz-1; j+=2){ 2393 i0 = vi[j]; 2394 i1 = vi[j+1]; 2395 tmp0 = tmps[i0]; 2396 tmp1 = tmps[i1]; 2397 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2398 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2399 } 2400 if(j == nz-1){ 2401 tmp0 = tmps[vi[j]]; 2402 sum1 -= v1[j] *tmp0; 2403 sum2 -= v2[j] *tmp0; 2404 } 2405 sum2 -= v2[nz] * sum1; 2406 tmp[row ++]=sum1; 2407 tmp[row ++]=sum2; 2408 break; 2409 case 3: 2410 sum1 = b[r[row]]; 2411 sum2 = b[r[row+1]]; 2412 sum3 = b[r[row+2]]; 2413 v2 = aa + ai[row+1]; 2414 v3 = aa + ai[row+2]; 2415 2416 for (j=0; j<nz-1; j+=2){ 2417 i0 = vi[j]; 2418 i1 = vi[j+1]; 2419 tmp0 = tmps[i0]; 2420 tmp1 = tmps[i1]; 2421 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2422 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2423 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2424 } 2425 if (j == nz-1){ 2426 tmp0 = tmps[vi[j]]; 2427 sum1 -= v1[j] *tmp0; 2428 sum2 -= v2[j] *tmp0; 2429 sum3 -= v3[j] *tmp0; 2430 } 2431 sum2 -= v2[nz] * sum1; 2432 sum3 -= v3[nz] * sum1; 2433 sum3 -= v3[nz+1] * sum2; 2434 tmp[row ++]=sum1; 2435 tmp[row ++]=sum2; 2436 tmp[row ++]=sum3; 2437 break; 2438 2439 case 4: 2440 sum1 = b[r[row]]; 2441 sum2 = b[r[row+1]]; 2442 sum3 = b[r[row+2]]; 2443 sum4 = b[r[row+3]]; 2444 v2 = aa + ai[row+1]; 2445 v3 = aa + ai[row+2]; 2446 v4 = aa + ai[row+3]; 2447 2448 for (j=0; j<nz-1; j+=2){ 2449 i0 = vi[j]; 2450 i1 = vi[j+1]; 2451 tmp0 = tmps[i0]; 2452 tmp1 = tmps[i1]; 2453 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2454 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2455 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2456 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2457 } 2458 if (j == nz-1){ 2459 tmp0 = tmps[vi[j]]; 2460 sum1 -= v1[j] *tmp0; 2461 sum2 -= v2[j] *tmp0; 2462 sum3 -= v3[j] *tmp0; 2463 sum4 -= v4[j] *tmp0; 2464 } 2465 sum2 -= v2[nz] * sum1; 2466 sum3 -= v3[nz] * sum1; 2467 sum4 -= v4[nz] * sum1; 2468 sum3 -= v3[nz+1] * sum2; 2469 sum4 -= v4[nz+1] * sum2; 2470 sum4 -= v4[nz+2] * sum3; 2471 2472 tmp[row ++]=sum1; 2473 tmp[row ++]=sum2; 2474 tmp[row ++]=sum3; 2475 tmp[row ++]=sum4; 2476 break; 2477 case 5: 2478 sum1 = b[r[row]]; 2479 sum2 = b[r[row+1]]; 2480 sum3 = b[r[row+2]]; 2481 sum4 = b[r[row+3]]; 2482 sum5 = b[r[row+4]]; 2483 v2 = aa + ai[row+1]; 2484 v3 = aa + ai[row+2]; 2485 v4 = aa + ai[row+3]; 2486 v5 = aa + ai[row+4]; 2487 2488 for (j=0; j<nz-1; j+=2){ 2489 i0 = vi[j]; 2490 i1 = vi[j+1]; 2491 tmp0 = tmps[i0]; 2492 tmp1 = tmps[i1]; 2493 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2494 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2495 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2496 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2497 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2498 } 2499 if (j == nz-1){ 2500 tmp0 = tmps[vi[j]]; 2501 sum1 -= v1[j] *tmp0; 2502 sum2 -= v2[j] *tmp0; 2503 sum3 -= v3[j] *tmp0; 2504 sum4 -= v4[j] *tmp0; 2505 sum5 -= v5[j] *tmp0; 2506 } 2507 2508 sum2 -= v2[nz] * sum1; 2509 sum3 -= v3[nz] * sum1; 2510 sum4 -= v4[nz] * sum1; 2511 sum5 -= v5[nz] * sum1; 2512 sum3 -= v3[nz+1] * sum2; 2513 sum4 -= v4[nz+1] * sum2; 2514 sum5 -= v5[nz+1] * sum2; 2515 sum4 -= v4[nz+2] * sum3; 2516 sum5 -= v5[nz+2] * sum3; 2517 sum5 -= v5[nz+3] * sum4; 2518 2519 tmp[row ++]=sum1; 2520 tmp[row ++]=sum2; 2521 tmp[row ++]=sum3; 2522 tmp[row ++]=sum4; 2523 tmp[row ++]=sum5; 2524 break; 2525 default: 2526 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2527 } 2528 } 2529 /* backward solve the upper triangular */ 2530 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2531 nsz = ns[i]; 2532 aii = ad[row+1] + 1; 2533 v1 = aa + aii; 2534 vi = aj + aii; 2535 nz = ad[row]- ad[row+1] - 1; 2536 2537 if (i > 0) { 2538 /* Prefetch the indices for the next block */ 2539 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2540 /* Prefetch the data for the next block */ 2541 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2542 } 2543 2544 switch (nsz){ /* Each loop in 'case' is unrolled */ 2545 case 1 : 2546 sum1 = tmp[row]; 2547 2548 for(j=0 ; j<nz-1; j+=2){ 2549 i0 = vi[j]; 2550 i1 = vi[j+1]; 2551 tmp0 = tmps[i0]; 2552 tmp1 = tmps[i1]; 2553 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2554 } 2555 if (j == nz-1){ 2556 tmp0 = tmps[vi[j]]; 2557 sum1 -= v1[j]*tmp0; 2558 } 2559 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2560 break; 2561 case 2 : 2562 sum1 = tmp[row]; 2563 sum2 = tmp[row-1]; 2564 v2 = aa + ad[row] + 1; 2565 for (j=0 ; j<nz-1; j+=2){ 2566 i0 = vi[j]; 2567 i1 = vi[j+1]; 2568 tmp0 = tmps[i0]; 2569 tmp1 = tmps[i1]; 2570 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2571 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2572 } 2573 if (j == nz-1){ 2574 tmp0 = tmps[vi[j]]; 2575 sum1 -= v1[j]* tmp0; 2576 sum2 -= v2[j+1]* tmp0; 2577 } 2578 2579 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2580 sum2 -= v2[0] * tmp0; 2581 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2582 break; 2583 case 3 : 2584 sum1 = tmp[row]; 2585 sum2 = tmp[row -1]; 2586 sum3 = tmp[row -2]; 2587 v2 = aa + ad[row] + 1; 2588 v3 = aa + ad[row -1] + 1; 2589 for (j=0 ; j<nz-1; j+=2){ 2590 i0 = vi[j]; 2591 i1 = vi[j+1]; 2592 tmp0 = tmps[i0]; 2593 tmp1 = tmps[i1]; 2594 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2595 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2596 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2597 } 2598 if (j== nz-1){ 2599 tmp0 = tmps[vi[j]]; 2600 sum1 -= v1[j] * tmp0; 2601 sum2 -= v2[j+1] * tmp0; 2602 sum3 -= v3[j+2] * tmp0; 2603 } 2604 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2605 sum2 -= v2[0]* tmp0; 2606 sum3 -= v3[1] * tmp0; 2607 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2608 sum3 -= v3[0]* tmp0; 2609 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2610 2611 break; 2612 case 4 : 2613 sum1 = tmp[row]; 2614 sum2 = tmp[row -1]; 2615 sum3 = tmp[row -2]; 2616 sum4 = tmp[row -3]; 2617 v2 = aa + ad[row]+1; 2618 v3 = aa + ad[row -1]+1; 2619 v4 = aa + ad[row -2]+1; 2620 2621 for (j=0 ; j<nz-1; j+=2){ 2622 i0 = vi[j]; 2623 i1 = vi[j+1]; 2624 tmp0 = tmps[i0]; 2625 tmp1 = tmps[i1]; 2626 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2627 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2628 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2629 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2630 } 2631 if (j== nz-1){ 2632 tmp0 = tmps[vi[j]]; 2633 sum1 -= v1[j] * tmp0; 2634 sum2 -= v2[j+1] * tmp0; 2635 sum3 -= v3[j+2] * tmp0; 2636 sum4 -= v4[j+3] * tmp0; 2637 } 2638 2639 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2640 sum2 -= v2[0] * tmp0; 2641 sum3 -= v3[1] * tmp0; 2642 sum4 -= v4[2] * tmp0; 2643 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2644 sum3 -= v3[0] * tmp0; 2645 sum4 -= v4[1] * tmp0; 2646 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2647 sum4 -= v4[0] * tmp0; 2648 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2649 break; 2650 case 5 : 2651 sum1 = tmp[row]; 2652 sum2 = tmp[row -1]; 2653 sum3 = tmp[row -2]; 2654 sum4 = tmp[row -3]; 2655 sum5 = tmp[row -4]; 2656 v2 = aa + ad[row]+1; 2657 v3 = aa + ad[row -1]+1; 2658 v4 = aa + ad[row -2]+1; 2659 v5 = aa + ad[row -3]+1; 2660 for (j=0 ; j<nz-1; j+=2){ 2661 i0 = vi[j]; 2662 i1 = vi[j+1]; 2663 tmp0 = tmps[i0]; 2664 tmp1 = tmps[i1]; 2665 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2666 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2667 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2668 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2669 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2670 } 2671 if (j==nz-1){ 2672 tmp0 = tmps[vi[j]]; 2673 sum1 -= v1[j] * tmp0; 2674 sum2 -= v2[j+1] * tmp0; 2675 sum3 -= v3[j+2] * tmp0; 2676 sum4 -= v4[j+3] * tmp0; 2677 sum5 -= v5[j+4] * tmp0; 2678 } 2679 2680 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2681 sum2 -= v2[0] * tmp0; 2682 sum3 -= v3[1] * tmp0; 2683 sum4 -= v4[2] * tmp0; 2684 sum5 -= v5[3] * tmp0; 2685 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2686 sum3 -= v3[0] * tmp0; 2687 sum4 -= v4[1] * tmp0; 2688 sum5 -= v5[2] * tmp0; 2689 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2690 sum4 -= v4[0] * tmp0; 2691 sum5 -= v5[1] * tmp0; 2692 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2693 sum5 -= v5[0] * tmp0; 2694 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2695 break; 2696 default: 2697 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2698 } 2699 } 2700 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2701 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2702 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2703 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2704 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2705 PetscFunctionReturn(0); 2706 } 2707 2708 2709 /* 2710 Makes a longer coloring[] array and calls the usual code with that 2711 */ 2712 #undef __FUNCT__ 2713 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2714 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2715 { 2716 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2717 PetscErrorCode ierr; 2718 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2719 PetscInt *colorused,i; 2720 ISColoringValue *newcolor; 2721 2722 PetscFunctionBegin; 2723 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2724 /* loop over inodes, marking a color for each column*/ 2725 row = 0; 2726 for (i=0; i<m; i++){ 2727 for (j=0; j<ns[i]; j++) { 2728 newcolor[row++] = coloring[i] + j*ncolors; 2729 } 2730 } 2731 2732 /* eliminate unneeded colors */ 2733 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2734 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2735 for (i=0; i<n; i++) { 2736 colorused[newcolor[i]] = 1; 2737 } 2738 2739 for (i=1; i<5*ncolors; i++) { 2740 colorused[i] += colorused[i-1]; 2741 } 2742 ncolors = colorused[5*ncolors-1]; 2743 for (i=0; i<n; i++) { 2744 newcolor[i] = colorused[newcolor[i]]-1; 2745 } 2746 ierr = PetscFree(colorused);CHKERRQ(ierr); 2747 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2748 ierr = PetscFree(coloring);CHKERRQ(ierr); 2749 PetscFunctionReturn(0); 2750 } 2751 2752 #include "../src/mat/blockinvert.h" 2753 2754 #undef __FUNCT__ 2755 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2756 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2757 { 2758 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2759 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2760 MatScalar *ibdiag,*bdiag,work[25]; 2761 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2762 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2763 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2764 PetscErrorCode ierr; 2765 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2766 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2767 2768 PetscFunctionBegin; 2769 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2770 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2771 if (its > 1) { 2772 /* switch to non-inode version */ 2773 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2774 PetscFunctionReturn(0); 2775 } 2776 2777 if (!a->inode.ibdiagvalid) { 2778 if (!a->inode.ibdiag) { 2779 /* calculate space needed for diagonal blocks */ 2780 for (i=0; i<m; i++) { 2781 cnt += sizes[i]*sizes[i]; 2782 } 2783 a->inode.bdiagsize = cnt; 2784 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2785 } 2786 2787 /* copy over the diagonal blocks and invert them */ 2788 ibdiag = a->inode.ibdiag; 2789 bdiag = a->inode.bdiag; 2790 cnt = 0; 2791 for (i=0, row = 0; i<m; i++) { 2792 for (j=0; j<sizes[i]; j++) { 2793 for (k=0; k<sizes[i]; k++) { 2794 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2795 } 2796 } 2797 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2798 2799 switch(sizes[i]) { 2800 case 1: 2801 /* Create matrix data structure */ 2802 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2803 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2804 break; 2805 case 2: 2806 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2807 break; 2808 case 3: 2809 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2810 break; 2811 case 4: 2812 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2813 break; 2814 case 5: 2815 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2816 break; 2817 default: 2818 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2819 } 2820 cnt += sizes[i]*sizes[i]; 2821 row += sizes[i]; 2822 } 2823 a->inode.ibdiagvalid = PETSC_TRUE; 2824 } 2825 ibdiag = a->inode.ibdiag; 2826 bdiag = a->inode.bdiag; 2827 2828 2829 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2830 if (flag & SOR_ZERO_INITIAL_GUESS) { 2831 PetscScalar *b; 2832 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2833 if (xx != bb) { 2834 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2835 } else { 2836 b = x; 2837 } 2838 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2839 2840 for (i=0, row=0; i<m; i++) { 2841 sz = diag[row] - ii[row]; 2842 v1 = a->a + ii[row]; 2843 idx = a->j + ii[row]; 2844 2845 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2846 switch (sizes[i]){ 2847 case 1: 2848 2849 sum1 = b[row]; 2850 for(n = 0; n<sz-1; n+=2) { 2851 i1 = idx[0]; 2852 i2 = idx[1]; 2853 idx += 2; 2854 tmp0 = x[i1]; 2855 tmp1 = x[i2]; 2856 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2857 } 2858 2859 if (n == sz-1){ 2860 tmp0 = x[*idx]; 2861 sum1 -= *v1 * tmp0; 2862 } 2863 x[row++] = sum1*(*ibdiag++); 2864 break; 2865 case 2: 2866 v2 = a->a + ii[row+1]; 2867 sum1 = b[row]; 2868 sum2 = b[row+1]; 2869 for(n = 0; n<sz-1; n+=2) { 2870 i1 = idx[0]; 2871 i2 = idx[1]; 2872 idx += 2; 2873 tmp0 = x[i1]; 2874 tmp1 = x[i2]; 2875 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2876 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2877 } 2878 2879 if (n == sz-1){ 2880 tmp0 = x[*idx]; 2881 sum1 -= v1[0] * tmp0; 2882 sum2 -= v2[0] * tmp0; 2883 } 2884 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2885 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2886 ibdiag += 4; 2887 break; 2888 case 3: 2889 v2 = a->a + ii[row+1]; 2890 v3 = a->a + ii[row+2]; 2891 sum1 = b[row]; 2892 sum2 = b[row+1]; 2893 sum3 = b[row+2]; 2894 for(n = 0; n<sz-1; n+=2) { 2895 i1 = idx[0]; 2896 i2 = idx[1]; 2897 idx += 2; 2898 tmp0 = x[i1]; 2899 tmp1 = x[i2]; 2900 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2901 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2902 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2903 } 2904 2905 if (n == sz-1){ 2906 tmp0 = x[*idx]; 2907 sum1 -= v1[0] * tmp0; 2908 sum2 -= v2[0] * tmp0; 2909 sum3 -= v3[0] * tmp0; 2910 } 2911 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2912 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2913 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2914 ibdiag += 9; 2915 break; 2916 case 4: 2917 v2 = a->a + ii[row+1]; 2918 v3 = a->a + ii[row+2]; 2919 v4 = a->a + ii[row+3]; 2920 sum1 = b[row]; 2921 sum2 = b[row+1]; 2922 sum3 = b[row+2]; 2923 sum4 = b[row+3]; 2924 for(n = 0; n<sz-1; n+=2) { 2925 i1 = idx[0]; 2926 i2 = idx[1]; 2927 idx += 2; 2928 tmp0 = x[i1]; 2929 tmp1 = x[i2]; 2930 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2931 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2932 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2933 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2934 } 2935 2936 if (n == sz-1){ 2937 tmp0 = x[*idx]; 2938 sum1 -= v1[0] * tmp0; 2939 sum2 -= v2[0] * tmp0; 2940 sum3 -= v3[0] * tmp0; 2941 sum4 -= v4[0] * tmp0; 2942 } 2943 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2944 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2945 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2946 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2947 ibdiag += 16; 2948 break; 2949 case 5: 2950 v2 = a->a + ii[row+1]; 2951 v3 = a->a + ii[row+2]; 2952 v4 = a->a + ii[row+3]; 2953 v5 = a->a + ii[row+4]; 2954 sum1 = b[row]; 2955 sum2 = b[row+1]; 2956 sum3 = b[row+2]; 2957 sum4 = b[row+3]; 2958 sum5 = b[row+4]; 2959 for(n = 0; n<sz-1; n+=2) { 2960 i1 = idx[0]; 2961 i2 = idx[1]; 2962 idx += 2; 2963 tmp0 = x[i1]; 2964 tmp1 = x[i2]; 2965 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2966 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2967 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2968 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2969 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2970 } 2971 2972 if (n == sz-1){ 2973 tmp0 = x[*idx]; 2974 sum1 -= v1[0] * tmp0; 2975 sum2 -= v2[0] * tmp0; 2976 sum3 -= v3[0] * tmp0; 2977 sum4 -= v4[0] * tmp0; 2978 sum5 -= v5[0] * tmp0; 2979 } 2980 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2981 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2982 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2983 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2984 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2985 ibdiag += 25; 2986 break; 2987 default: 2988 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2989 } 2990 } 2991 2992 xb = x; 2993 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2994 } else xb = b; 2995 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2996 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2997 cnt = 0; 2998 for (i=0, row=0; i<m; i++) { 2999 3000 switch (sizes[i]){ 3001 case 1: 3002 x[row++] *= bdiag[cnt++]; 3003 break; 3004 case 2: 3005 x1 = x[row]; x2 = x[row+1]; 3006 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3007 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3008 x[row++] = tmp1; 3009 x[row++] = tmp2; 3010 cnt += 4; 3011 break; 3012 case 3: 3013 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3014 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3015 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3016 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3017 x[row++] = tmp1; 3018 x[row++] = tmp2; 3019 x[row++] = tmp3; 3020 cnt += 9; 3021 break; 3022 case 4: 3023 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3024 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3025 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3026 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3027 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3028 x[row++] = tmp1; 3029 x[row++] = tmp2; 3030 x[row++] = tmp3; 3031 x[row++] = tmp4; 3032 cnt += 16; 3033 break; 3034 case 5: 3035 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3036 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3037 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3038 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3039 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3040 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3041 x[row++] = tmp1; 3042 x[row++] = tmp2; 3043 x[row++] = tmp3; 3044 x[row++] = tmp4; 3045 x[row++] = tmp5; 3046 cnt += 25; 3047 break; 3048 default: 3049 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3050 } 3051 } 3052 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3053 } 3054 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 3055 3056 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3057 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3058 ibdiag -= sizes[i]*sizes[i]; 3059 sz = ii[row+1] - diag[row] - 1; 3060 v1 = a->a + diag[row] + 1; 3061 idx = a->j + diag[row] + 1; 3062 3063 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3064 switch (sizes[i]){ 3065 case 1: 3066 3067 sum1 = xb[row]; 3068 for(n = 0; n<sz-1; n+=2) { 3069 i1 = idx[0]; 3070 i2 = idx[1]; 3071 idx += 2; 3072 tmp0 = x[i1]; 3073 tmp1 = x[i2]; 3074 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3075 } 3076 3077 if (n == sz-1){ 3078 tmp0 = x[*idx]; 3079 sum1 -= *v1*tmp0; 3080 } 3081 x[row--] = sum1*(*ibdiag); 3082 break; 3083 3084 case 2: 3085 3086 sum1 = xb[row]; 3087 sum2 = xb[row-1]; 3088 /* note that sum1 is associated with the second of the two rows */ 3089 v2 = a->a + diag[row-1] + 2; 3090 for(n = 0; n<sz-1; n+=2) { 3091 i1 = idx[0]; 3092 i2 = idx[1]; 3093 idx += 2; 3094 tmp0 = x[i1]; 3095 tmp1 = x[i2]; 3096 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3097 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3098 } 3099 3100 if (n == sz-1){ 3101 tmp0 = x[*idx]; 3102 sum1 -= *v1*tmp0; 3103 sum2 -= *v2*tmp0; 3104 } 3105 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3106 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3107 break; 3108 case 3: 3109 3110 sum1 = xb[row]; 3111 sum2 = xb[row-1]; 3112 sum3 = xb[row-2]; 3113 v2 = a->a + diag[row-1] + 2; 3114 v3 = a->a + diag[row-2] + 3; 3115 for(n = 0; n<sz-1; n+=2) { 3116 i1 = idx[0]; 3117 i2 = idx[1]; 3118 idx += 2; 3119 tmp0 = x[i1]; 3120 tmp1 = x[i2]; 3121 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3122 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3123 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3124 } 3125 3126 if (n == sz-1){ 3127 tmp0 = x[*idx]; 3128 sum1 -= *v1*tmp0; 3129 sum2 -= *v2*tmp0; 3130 sum3 -= *v3*tmp0; 3131 } 3132 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3133 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3134 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3135 break; 3136 case 4: 3137 3138 sum1 = xb[row]; 3139 sum2 = xb[row-1]; 3140 sum3 = xb[row-2]; 3141 sum4 = xb[row-3]; 3142 v2 = a->a + diag[row-1] + 2; 3143 v3 = a->a + diag[row-2] + 3; 3144 v4 = a->a + diag[row-3] + 4; 3145 for(n = 0; n<sz-1; n+=2) { 3146 i1 = idx[0]; 3147 i2 = idx[1]; 3148 idx += 2; 3149 tmp0 = x[i1]; 3150 tmp1 = x[i2]; 3151 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3152 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3153 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3154 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3155 } 3156 3157 if (n == sz-1){ 3158 tmp0 = x[*idx]; 3159 sum1 -= *v1*tmp0; 3160 sum2 -= *v2*tmp0; 3161 sum3 -= *v3*tmp0; 3162 sum4 -= *v4*tmp0; 3163 } 3164 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3165 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3166 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3167 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3168 break; 3169 case 5: 3170 3171 sum1 = xb[row]; 3172 sum2 = xb[row-1]; 3173 sum3 = xb[row-2]; 3174 sum4 = xb[row-3]; 3175 sum5 = xb[row-4]; 3176 v2 = a->a + diag[row-1] + 2; 3177 v3 = a->a + diag[row-2] + 3; 3178 v4 = a->a + diag[row-3] + 4; 3179 v5 = a->a + diag[row-4] + 5; 3180 for(n = 0; n<sz-1; n+=2) { 3181 i1 = idx[0]; 3182 i2 = idx[1]; 3183 idx += 2; 3184 tmp0 = x[i1]; 3185 tmp1 = x[i2]; 3186 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3187 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3188 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3189 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3190 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3191 } 3192 3193 if (n == sz-1){ 3194 tmp0 = x[*idx]; 3195 sum1 -= *v1*tmp0; 3196 sum2 -= *v2*tmp0; 3197 sum3 -= *v3*tmp0; 3198 sum4 -= *v4*tmp0; 3199 sum5 -= *v5*tmp0; 3200 } 3201 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3202 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3203 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3204 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3205 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3206 break; 3207 default: 3208 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3209 } 3210 } 3211 3212 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3213 } 3214 its--; 3215 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3216 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 3217 } 3218 if (flag & SOR_EISENSTAT) { 3219 const PetscScalar *b; 3220 MatScalar *t = a->inode.ssor_work; 3221 3222 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3223 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3224 /* 3225 Apply (U + D)^-1 where D is now the block diagonal 3226 */ 3227 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3228 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3229 ibdiag -= sizes[i]*sizes[i]; 3230 sz = ii[row+1] - diag[row] - 1; 3231 v1 = a->a + diag[row] + 1; 3232 idx = a->j + diag[row] + 1; 3233 CHKMEMQ; 3234 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3235 switch (sizes[i]){ 3236 case 1: 3237 3238 sum1 = b[row]; 3239 for(n = 0; n<sz-1; n+=2) { 3240 i1 = idx[0]; 3241 i2 = idx[1]; 3242 idx += 2; 3243 tmp0 = x[i1]; 3244 tmp1 = x[i2]; 3245 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3246 } 3247 3248 if (n == sz-1){ 3249 tmp0 = x[*idx]; 3250 sum1 -= *v1*tmp0; 3251 } 3252 x[row] = sum1*(*ibdiag);row--; 3253 break; 3254 3255 case 2: 3256 3257 sum1 = b[row]; 3258 sum2 = b[row-1]; 3259 /* note that sum1 is associated with the second of the two rows */ 3260 v2 = a->a + diag[row-1] + 2; 3261 for(n = 0; n<sz-1; n+=2) { 3262 i1 = idx[0]; 3263 i2 = idx[1]; 3264 idx += 2; 3265 tmp0 = x[i1]; 3266 tmp1 = x[i2]; 3267 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3268 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3269 } 3270 3271 if (n == sz-1){ 3272 tmp0 = x[*idx]; 3273 sum1 -= *v1*tmp0; 3274 sum2 -= *v2*tmp0; 3275 } 3276 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3277 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3278 row -= 2; 3279 break; 3280 case 3: 3281 3282 sum1 = b[row]; 3283 sum2 = b[row-1]; 3284 sum3 = b[row-2]; 3285 v2 = a->a + diag[row-1] + 2; 3286 v3 = a->a + diag[row-2] + 3; 3287 for(n = 0; n<sz-1; n+=2) { 3288 i1 = idx[0]; 3289 i2 = idx[1]; 3290 idx += 2; 3291 tmp0 = x[i1]; 3292 tmp1 = x[i2]; 3293 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3294 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3295 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3296 } 3297 3298 if (n == sz-1){ 3299 tmp0 = x[*idx]; 3300 sum1 -= *v1*tmp0; 3301 sum2 -= *v2*tmp0; 3302 sum3 -= *v3*tmp0; 3303 } 3304 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3305 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3306 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3307 row -= 3; 3308 break; 3309 case 4: 3310 3311 sum1 = b[row]; 3312 sum2 = b[row-1]; 3313 sum3 = b[row-2]; 3314 sum4 = b[row-3]; 3315 v2 = a->a + diag[row-1] + 2; 3316 v3 = a->a + diag[row-2] + 3; 3317 v4 = a->a + diag[row-3] + 4; 3318 for(n = 0; n<sz-1; n+=2) { 3319 i1 = idx[0]; 3320 i2 = idx[1]; 3321 idx += 2; 3322 tmp0 = x[i1]; 3323 tmp1 = x[i2]; 3324 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3325 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3326 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3327 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3328 } 3329 3330 if (n == sz-1){ 3331 tmp0 = x[*idx]; 3332 sum1 -= *v1*tmp0; 3333 sum2 -= *v2*tmp0; 3334 sum3 -= *v3*tmp0; 3335 sum4 -= *v4*tmp0; 3336 } 3337 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3338 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3339 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3340 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3341 row -= 4; 3342 break; 3343 case 5: 3344 3345 sum1 = b[row]; 3346 sum2 = b[row-1]; 3347 sum3 = b[row-2]; 3348 sum4 = b[row-3]; 3349 sum5 = b[row-4]; 3350 v2 = a->a + diag[row-1] + 2; 3351 v3 = a->a + diag[row-2] + 3; 3352 v4 = a->a + diag[row-3] + 4; 3353 v5 = a->a + diag[row-4] + 5; 3354 for(n = 0; n<sz-1; n+=2) { 3355 i1 = idx[0]; 3356 i2 = idx[1]; 3357 idx += 2; 3358 tmp0 = x[i1]; 3359 tmp1 = x[i2]; 3360 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3361 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3362 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3363 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3364 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3365 } 3366 3367 if (n == sz-1){ 3368 tmp0 = x[*idx]; 3369 sum1 -= *v1*tmp0; 3370 sum2 -= *v2*tmp0; 3371 sum3 -= *v3*tmp0; 3372 sum4 -= *v4*tmp0; 3373 sum5 -= *v5*tmp0; 3374 } 3375 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3376 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3377 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3378 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3379 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3380 row -= 5; 3381 break; 3382 default: 3383 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3384 } 3385 CHKMEMQ; 3386 } 3387 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3388 3389 /* 3390 t = b - D x where D is the block diagonal 3391 */ 3392 cnt = 0; 3393 for (i=0, row=0; i<m; i++) { 3394 CHKMEMQ; 3395 switch (sizes[i]){ 3396 case 1: 3397 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3398 break; 3399 case 2: 3400 x1 = x[row]; x2 = x[row+1]; 3401 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3402 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3403 t[row] = b[row] - tmp1; 3404 t[row+1] = b[row+1] - tmp2; row += 2; 3405 cnt += 4; 3406 break; 3407 case 3: 3408 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3409 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3410 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3411 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3412 t[row] = b[row] - tmp1; 3413 t[row+1] = b[row+1] - tmp2; 3414 t[row+2] = b[row+2] - tmp3; row += 3; 3415 cnt += 9; 3416 break; 3417 case 4: 3418 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3419 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3420 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3421 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3422 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3423 t[row] = b[row] - tmp1; 3424 t[row+1] = b[row+1] - tmp2; 3425 t[row+2] = b[row+2] - tmp3; 3426 t[row+3] = b[row+3] - tmp4; row += 4; 3427 cnt += 16; 3428 break; 3429 case 5: 3430 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3431 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3432 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3433 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3434 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3435 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3436 t[row] = b[row] - tmp1; 3437 t[row+1] = b[row+1] - tmp2; 3438 t[row+2] = b[row+2] - tmp3; 3439 t[row+3] = b[row+3] - tmp4; 3440 t[row+4] = b[row+4] - tmp5;row += 5; 3441 cnt += 25; 3442 break; 3443 default: 3444 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3445 } 3446 CHKMEMQ; 3447 } 3448 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3449 3450 3451 3452 /* 3453 Apply (L + D)^-1 where D is the block diagonal 3454 */ 3455 for (i=0, row=0; i<m; i++) { 3456 sz = diag[row] - ii[row]; 3457 v1 = a->a + ii[row]; 3458 idx = a->j + ii[row]; 3459 CHKMEMQ; 3460 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3461 switch (sizes[i]){ 3462 case 1: 3463 3464 sum1 = t[row]; 3465 for(n = 0; n<sz-1; n+=2) { 3466 i1 = idx[0]; 3467 i2 = idx[1]; 3468 idx += 2; 3469 tmp0 = t[i1]; 3470 tmp1 = t[i2]; 3471 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3472 } 3473 3474 if (n == sz-1){ 3475 tmp0 = t[*idx]; 3476 sum1 -= *v1 * tmp0; 3477 } 3478 x[row] += t[row] = sum1*(*ibdiag++); row++; 3479 break; 3480 case 2: 3481 v2 = a->a + ii[row+1]; 3482 sum1 = t[row]; 3483 sum2 = t[row+1]; 3484 for(n = 0; n<sz-1; n+=2) { 3485 i1 = idx[0]; 3486 i2 = idx[1]; 3487 idx += 2; 3488 tmp0 = t[i1]; 3489 tmp1 = t[i2]; 3490 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3491 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3492 } 3493 3494 if (n == sz-1){ 3495 tmp0 = t[*idx]; 3496 sum1 -= v1[0] * tmp0; 3497 sum2 -= v2[0] * tmp0; 3498 } 3499 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3500 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3501 ibdiag += 4; row += 2; 3502 break; 3503 case 3: 3504 v2 = a->a + ii[row+1]; 3505 v3 = a->a + ii[row+2]; 3506 sum1 = t[row]; 3507 sum2 = t[row+1]; 3508 sum3 = t[row+2]; 3509 for(n = 0; n<sz-1; n+=2) { 3510 i1 = idx[0]; 3511 i2 = idx[1]; 3512 idx += 2; 3513 tmp0 = t[i1]; 3514 tmp1 = t[i2]; 3515 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3516 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3517 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3518 } 3519 3520 if (n == sz-1){ 3521 tmp0 = t[*idx]; 3522 sum1 -= v1[0] * tmp0; 3523 sum2 -= v2[0] * tmp0; 3524 sum3 -= v3[0] * tmp0; 3525 } 3526 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3527 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3528 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3529 ibdiag += 9; row += 3; 3530 break; 3531 case 4: 3532 v2 = a->a + ii[row+1]; 3533 v3 = a->a + ii[row+2]; 3534 v4 = a->a + ii[row+3]; 3535 sum1 = t[row]; 3536 sum2 = t[row+1]; 3537 sum3 = t[row+2]; 3538 sum4 = t[row+3]; 3539 for(n = 0; n<sz-1; n+=2) { 3540 i1 = idx[0]; 3541 i2 = idx[1]; 3542 idx += 2; 3543 tmp0 = t[i1]; 3544 tmp1 = t[i2]; 3545 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3546 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3547 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3548 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3549 } 3550 3551 if (n == sz-1){ 3552 tmp0 = t[*idx]; 3553 sum1 -= v1[0] * tmp0; 3554 sum2 -= v2[0] * tmp0; 3555 sum3 -= v3[0] * tmp0; 3556 sum4 -= v4[0] * tmp0; 3557 } 3558 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3559 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3560 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3561 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3562 ibdiag += 16; row += 4; 3563 break; 3564 case 5: 3565 v2 = a->a + ii[row+1]; 3566 v3 = a->a + ii[row+2]; 3567 v4 = a->a + ii[row+3]; 3568 v5 = a->a + ii[row+4]; 3569 sum1 = t[row]; 3570 sum2 = t[row+1]; 3571 sum3 = t[row+2]; 3572 sum4 = t[row+3]; 3573 sum5 = t[row+4]; 3574 for(n = 0; n<sz-1; n+=2) { 3575 i1 = idx[0]; 3576 i2 = idx[1]; 3577 idx += 2; 3578 tmp0 = t[i1]; 3579 tmp1 = t[i2]; 3580 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3581 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3582 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3583 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3584 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3585 } 3586 3587 if (n == sz-1){ 3588 tmp0 = t[*idx]; 3589 sum1 -= v1[0] * tmp0; 3590 sum2 -= v2[0] * tmp0; 3591 sum3 -= v3[0] * tmp0; 3592 sum4 -= v4[0] * tmp0; 3593 sum5 -= v5[0] * tmp0; 3594 } 3595 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3596 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3597 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3598 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3599 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3600 ibdiag += 25; row += 5; 3601 break; 3602 default: 3603 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3604 } 3605 CHKMEMQ; 3606 } 3607 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3608 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3609 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3610 } 3611 PetscFunctionReturn(0); 3612 } 3613 3614 #undef __FUNCT__ 3615 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3616 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3617 { 3618 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3619 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3620 const MatScalar *bdiag = a->inode.bdiag; 3621 const PetscScalar *b; 3622 PetscErrorCode ierr; 3623 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3624 const PetscInt *sizes = a->inode.size; 3625 3626 PetscFunctionBegin; 3627 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3628 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3629 cnt = 0; 3630 for (i=0, row=0; i<m; i++) { 3631 switch (sizes[i]){ 3632 case 1: 3633 x[row] = b[row]*bdiag[cnt++];row++; 3634 break; 3635 case 2: 3636 x1 = b[row]; x2 = b[row+1]; 3637 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3638 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3639 x[row++] = tmp1; 3640 x[row++] = tmp2; 3641 cnt += 4; 3642 break; 3643 case 3: 3644 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3645 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3646 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3647 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3648 x[row++] = tmp1; 3649 x[row++] = tmp2; 3650 x[row++] = tmp3; 3651 cnt += 9; 3652 break; 3653 case 4: 3654 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3655 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3656 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3657 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3658 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3659 x[row++] = tmp1; 3660 x[row++] = tmp2; 3661 x[row++] = tmp3; 3662 x[row++] = tmp4; 3663 cnt += 16; 3664 break; 3665 case 5: 3666 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3667 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3668 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3669 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3670 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3671 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3672 x[row++] = tmp1; 3673 x[row++] = tmp2; 3674 x[row++] = tmp3; 3675 x[row++] = tmp4; 3676 x[row++] = tmp5; 3677 cnt += 25; 3678 break; 3679 default: 3680 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3681 } 3682 } 3683 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3684 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3685 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3686 PetscFunctionReturn(0); 3687 } 3688 3689 /* 3690 samestructure indicates that the matrix has not changed its nonzero structure so we 3691 do not need to recompute the inodes 3692 */ 3693 #undef __FUNCT__ 3694 #define __FUNCT__ "Mat_CheckInode" 3695 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3696 { 3697 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3698 PetscErrorCode ierr; 3699 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3700 PetscTruth flag; 3701 3702 PetscFunctionBegin; 3703 if (!a->inode.use) PetscFunctionReturn(0); 3704 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3705 3706 3707 m = A->rmap->n; 3708 if (a->inode.size) {ns = a->inode.size;} 3709 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3710 3711 i = 0; 3712 node_count = 0; 3713 idx = a->j; 3714 ii = a->i; 3715 while (i < m){ /* For each row */ 3716 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3717 /* Limits the number of elements in a node to 'a->inode.limit' */ 3718 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3719 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3720 if (nzy != nzx) break; 3721 idy += nzx; /* Same nonzero pattern */ 3722 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3723 if (!flag) break; 3724 } 3725 ns[node_count++] = blk_size; 3726 idx += blk_size*nzx; 3727 i = j; 3728 } 3729 /* If not enough inodes found,, do not use inode version of the routines */ 3730 if (!a->inode.size && m && node_count > .9*m) { 3731 ierr = PetscFree(ns);CHKERRQ(ierr); 3732 a->inode.node_count = 0; 3733 a->inode.size = PETSC_NULL; 3734 a->inode.use = PETSC_FALSE; 3735 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3736 } else { 3737 if (!A->factor) { 3738 A->ops->mult = MatMult_SeqAIJ_Inode; 3739 A->ops->sor = MatSOR_SeqAIJ_Inode; 3740 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3741 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3742 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3743 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3744 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3745 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3746 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3747 } else { 3748 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 3749 } 3750 a->inode.node_count = node_count; 3751 a->inode.size = ns; 3752 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3753 } 3754 PetscFunctionReturn(0); 3755 } 3756 3757 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3758 PetscInt __k, *__vi; \ 3759 __vi = aj + ai[row]; \ 3760 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3761 __vi = aj + adiag[row]; \ 3762 cols[nzl] = __vi[0];\ 3763 __vi = aj + adiag[row+1]+1;\ 3764 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3765 3766 3767 /* 3768 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3769 Modified from Mat_CheckInode(). 3770 3771 Input Parameters: 3772 + Mat A - ILU or LU matrix factor 3773 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3774 do not need to recompute the inodes 3775 */ 3776 #undef __FUNCT__ 3777 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3778 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3779 { 3780 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3781 PetscErrorCode ierr; 3782 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3783 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3784 PetscTruth flag; 3785 3786 PetscFunctionBegin; 3787 if (!a->inode.use) PetscFunctionReturn(0); 3788 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3789 3790 m = A->rmap->n; 3791 if (a->inode.size) {ns = a->inode.size;} 3792 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3793 3794 i = 0; 3795 node_count = 0; 3796 while (i < m){ /* For each row */ 3797 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3798 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3799 nzx = nzl1 + nzu1 + 1; 3800 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3801 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3802 3803 /* Limits the number of elements in a node to 'a->inode.limit' */ 3804 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3805 nzl2 = ai[j+1] - ai[j]; 3806 nzu2 = adiag[j] - adiag[j+1] - 1; 3807 nzy = nzl2 + nzu2 + 1; 3808 if( nzy != nzx) break; 3809 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3810 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3811 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3812 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3813 ierr = PetscFree(cols2);CHKERRQ(ierr); 3814 } 3815 ns[node_count++] = blk_size; 3816 ierr = PetscFree(cols1);CHKERRQ(ierr); 3817 i = j; 3818 } 3819 /* If not enough inodes found,, do not use inode version of the routines */ 3820 if (!a->inode.size && m && node_count > .9*m) { 3821 ierr = PetscFree(ns);CHKERRQ(ierr); 3822 a->inode.node_count = 0; 3823 a->inode.size = PETSC_NULL; 3824 a->inode.use = PETSC_FALSE; 3825 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3826 } else { 3827 A->ops->mult = 0; 3828 A->ops->sor = 0; 3829 A->ops->multadd = 0; 3830 A->ops->getrowij = 0; 3831 A->ops->restorerowij = 0; 3832 A->ops->getcolumnij = 0; 3833 A->ops->restorecolumnij = 0; 3834 A->ops->coloringpatch = 0; 3835 A->ops->multdiagonalblock = 0; 3836 A->ops->solve = MatSolve_SeqAIJ_Inode; 3837 a->inode.node_count = node_count; 3838 a->inode.size = ns; 3839 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3840 } 3841 PetscFunctionReturn(0); 3842 } 3843 3844 /* 3845 This is really ugly. if inodes are used this replaces the 3846 permutations with ones that correspond to rows/cols of the matrix 3847 rather then inode blocks 3848 */ 3849 #undef __FUNCT__ 3850 #define __FUNCT__ "MatInodeAdjustForInodes" 3851 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3852 { 3853 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3854 3855 PetscFunctionBegin; 3856 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3857 if (f) { 3858 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3859 } 3860 PetscFunctionReturn(0); 3861 } 3862 3863 EXTERN_C_BEGIN 3864 #undef __FUNCT__ 3865 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3866 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3867 { 3868 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3869 PetscErrorCode ierr; 3870 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3871 const PetscInt *ridx,*cidx; 3872 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3873 PetscInt nslim_col,*ns_col; 3874 IS ris = *rperm,cis = *cperm; 3875 3876 PetscFunctionBegin; 3877 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3878 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3879 3880 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3881 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3882 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3883 3884 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3885 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3886 3887 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3888 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3889 3890 /* Construct the permutations for rows*/ 3891 for (i=0,row = 0; i<nslim_row; ++i){ 3892 indx = ridx[i]; 3893 start_val = tns[indx]; 3894 end_val = tns[indx + 1]; 3895 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3896 } 3897 3898 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3899 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3900 3901 /* Construct permutations for columns */ 3902 for (i=0,col=0; i<nslim_col; ++i){ 3903 indx = cidx[i]; 3904 start_val = tns[indx]; 3905 end_val = tns[indx + 1]; 3906 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3907 } 3908 3909 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3910 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3911 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3912 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3913 3914 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3915 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3916 3917 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3918 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3919 ierr = ISDestroy(cis);CHKERRQ(ierr); 3920 ierr = ISDestroy(ris);CHKERRQ(ierr); 3921 ierr = PetscFree(tns);CHKERRQ(ierr); 3922 PetscFunctionReturn(0); 3923 } 3924 EXTERN_C_END 3925 3926 #undef __FUNCT__ 3927 #define __FUNCT__ "MatInodeGetInodeSizes" 3928 /*@C 3929 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3930 3931 Collective on Mat 3932 3933 Input Parameter: 3934 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3935 3936 Output Parameter: 3937 + node_count - no of inodes present in the matrix. 3938 . sizes - an array of size node_count,with sizes of each inode. 3939 - limit - the max size used to generate the inodes. 3940 3941 Level: advanced 3942 3943 Notes: This routine returns some internal storage information 3944 of the matrix, it is intended to be used by advanced users. 3945 It should be called after the matrix is assembled. 3946 The contents of the sizes[] array should not be changed. 3947 PETSC_NULL may be passed for information not requested. 3948 3949 .keywords: matrix, seqaij, get, inode 3950 3951 .seealso: MatGetInfo() 3952 @*/ 3953 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3954 { 3955 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3956 3957 PetscFunctionBegin; 3958 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3959 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3960 if (f) { 3961 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3962 } 3963 PetscFunctionReturn(0); 3964 } 3965 3966 EXTERN_C_BEGIN 3967 #undef __FUNCT__ 3968 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3969 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3970 { 3971 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3972 3973 PetscFunctionBegin; 3974 if (node_count) *node_count = a->inode.node_count; 3975 if (sizes) *sizes = a->inode.size; 3976 if (limit) *limit = a->inode.limit; 3977 PetscFunctionReturn(0); 3978 } 3979 EXTERN_C_END 3980