1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1195 FactorShiftCtx sctx; 1196 const PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,node_max,col; 1200 const PetscInt *ns; 1201 PetscInt *tmp_vec1,*tmp_vec2,*nsmap,newshift; 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) 1216 rs += PetscAbsScalar(v[j]); 1217 if (rs>sctx.shift_top) sctx.shift_top = rs; 1218 } 1219 sctx.shift_top *= 1.1; 1220 sctx.nshift_max = 5; 1221 sctx.shift_lo = 0.; 1222 sctx.shift_hi = 1.; 1223 } 1224 1225 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1226 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1227 1228 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1230 rtmp2 = rtmp1 + n; 1231 rtmp3 = rtmp2 + n; 1232 rtmp4 = rtmp3 + n; 1233 ics = ic; 1234 1235 node_max = a->inode.node_count; 1236 ns = a->inode.size; 1237 if (!ns){ 1238 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1239 } 1240 1241 /* If max inode size > 4, split it into two inodes.*/ 1242 /* also map the inode sizes according to the ordering */ 1243 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1244 for (i=0,j=0; i<node_max; ++i,++j){ 1245 if (ns[i]>3) { 1246 tmp_vec1[j] = ns[i]/2; 1247 ++j; 1248 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1249 } else { 1250 tmp_vec1[j] = ns[i]; 1251 } 1252 } 1253 /* Use the correct node_max */ 1254 node_max = j; 1255 1256 /* Now reorder the inode info based on mat re-ordering info */ 1257 /* First create a row -> inode_size_array_index map */ 1258 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1259 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1260 for (i=0,row=0; i<node_max; i++) { 1261 nodesz = tmp_vec1[i]; 1262 for (j=0; j<nodesz; j++,row++) { 1263 nsmap[row] = i; 1264 } 1265 } 1266 /* Using nsmap, create a reordered ns structure */ 1267 for (i=0,j=0; i< node_max; i++) { 1268 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1269 tmp_vec2[i] = nodesz; 1270 j += nodesz; 1271 } 1272 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1273 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1274 1275 /* Now use the correct ns */ 1276 ns = tmp_vec2; 1277 1278 do { 1279 sctx.useshift = PETSC_FALSE; 1280 /* Now loop over each block-row, and do the factorization */ 1281 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1282 nodesz = ns[inod]; 1283 1284 switch (nodesz){ 1285 case 1: 1286 /*----------*/ 1287 /* zero rtmp1 */ 1288 /* L part */ 1289 nz = bi[i+1] - bi[i]; 1290 bjtmp = bj + bi[i]; 1291 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1292 1293 /* U part */ 1294 nz = bdiag[i]-bdiag[i+1]; 1295 bjtmp = bj + bdiag[i+1]+1; 1296 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1297 1298 /* load in initial (unfactored row) */ 1299 nz = ai[r[i]+1] - ai[r[i]]; 1300 ajtmp = aj + ai[r[i]]; 1301 v = aa + ai[r[i]]; 1302 for (j=0; j<nz; j++) { 1303 rtmp1[ics[ajtmp[j]]] = v[j]; 1304 } 1305 /* ZeropivotApply() */ 1306 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1307 1308 /* elimination */ 1309 bjtmp = bj + bi[i]; 1310 row = *bjtmp++; 1311 nzL = bi[i+1] - bi[i]; 1312 for(k=0; k < nzL;k++) { 1313 pc = rtmp1 + row; 1314 if (*pc != 0.0) { 1315 pv = b->a + bdiag[row]; 1316 mul1 = *pc * (*pv); 1317 *pc = mul1; 1318 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1319 pv = b->a + bdiag[row+1]+1; 1320 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1321 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1322 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1323 } 1324 row = *bjtmp++; 1325 } 1326 1327 /* finished row so stick it into b->a */ 1328 rs = 0.0; 1329 /* L part */ 1330 pv = b->a + bi[i] ; 1331 pj = b->j + bi[i] ; 1332 nz = bi[i+1] - bi[i]; 1333 for (j=0; j<nz; j++) { 1334 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1335 } 1336 1337 /* U part */ 1338 pv = b->a + bdiag[i+1]+1; 1339 pj = b->j + bdiag[i+1]+1; 1340 nz = bdiag[i] - bdiag[i+1]-1; 1341 for (j=0; j<nz; j++) { 1342 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1343 } 1344 1345 /* Check zero pivot */ 1346 sctx.rs = rs; 1347 sctx.pv = rtmp1[i]; 1348 ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr); 1349 if(newshift == 1) break; 1350 1351 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1352 pv = b->a + bdiag[i]; 1353 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1354 break; 1355 1356 case 2: 1357 /*----------*/ 1358 /* zero rtmp1 and rtmp2 */ 1359 /* L part */ 1360 nz = bi[i+1] - bi[i]; 1361 bjtmp = bj + bi[i]; 1362 for (j=0; j<nz; j++) { 1363 col = bjtmp[j]; 1364 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1365 } 1366 1367 /* U part */ 1368 nz = bdiag[i]-bdiag[i+1]; 1369 bjtmp = bj + bdiag[i+1]+1; 1370 for (j=0; j<nz; j++) { 1371 col = bjtmp[j]; 1372 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1373 } 1374 1375 /* load in initial (unfactored row) */ 1376 nz = ai[r[i]+1] - ai[r[i]]; 1377 ajtmp = aj + ai[r[i]]; 1378 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1379 for (j=0; j<nz; j++) { 1380 col = ics[ajtmp[j]]; 1381 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1382 } 1383 /* ZeropivotApply(): shift the diagonal of the matrix */ 1384 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1385 1386 /* elimination */ 1387 bjtmp = bj + bi[i]; 1388 row = *bjtmp++; /* pivot row */ 1389 nzL = bi[i+1] - bi[i]; 1390 for(k=0; k < nzL;k++) { 1391 pc1 = rtmp1 + row; 1392 pc2 = rtmp2 + row; 1393 if (*pc1 != 0.0 || *pc2 != 0.0) { 1394 pv = b->a + bdiag[row]; 1395 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1396 *pc1 = mul1; *pc2 = mul2; 1397 1398 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1399 pv = b->a + bdiag[row+1]+1; 1400 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1401 for (j=0; j<nz; j++){ 1402 col = pj[j]; 1403 rtmp1[col] -= mul1 * pv[j]; 1404 rtmp2[col] -= mul2 * pv[j]; 1405 } 1406 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1407 } 1408 row = *bjtmp++; 1409 } 1410 1411 /* finished row i; check zero pivot, then stick row i into b->a */ 1412 rs = 0.0; 1413 /* L part */ 1414 pc1 = b->a + bi[i]; 1415 pj = b->j + bi[i] ; 1416 nz = bi[i+1] - bi[i]; 1417 for (j=0; j<nz; j++) { 1418 col = pj[j]; 1419 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1420 } 1421 /* U part */ 1422 pc1 = b->a + bdiag[i+1]+1; 1423 pj = b->j + bdiag[i+1]+1; 1424 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1425 for (j=0; j<nz; j++) { 1426 col = pj[j]; 1427 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1428 } 1429 1430 sctx.rs = rs; 1431 sctx.pv = rtmp1[i]; 1432 ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr); 1433 if(newshift == 1) break; 1434 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1435 *pc1 = 1.0/sctx.pv; 1436 1437 /* Now take care of diagonal 2x2 block. */ 1438 pc2 = rtmp2 + i; 1439 if (*pc2 != 0.0){ 1440 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1441 *pc2 = mul1; /* insert L entry */ 1442 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1443 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1444 for (j=0; j<nz; j++) { 1445 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1446 } 1447 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1448 } 1449 1450 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1451 rs = 0.0; 1452 /* L part */ 1453 pc2 = b->a + bi[i+1]; 1454 pj = b->j + bi[i+1] ; 1455 nz = bi[i+2] - bi[i+1]; 1456 for (j=0; j<nz; j++) { 1457 col = pj[j]; 1458 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1459 } 1460 /* U part */ 1461 pc2 = b->a + bdiag[i+2]+1; 1462 pj = b->j + bdiag[i+2]+1; 1463 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1464 for (j=0; j<nz; j++) { 1465 col = pj[j]; 1466 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1467 } 1468 1469 sctx.rs = rs; 1470 sctx.pv = rtmp2[i+1]; 1471 ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr); 1472 if(newshift == 1) break; 1473 pc2 = b->a + bdiag[i+1]; 1474 *pc2 = 1.0/sctx.pv; 1475 break; 1476 1477 case 3: 1478 /*----------*/ 1479 /* zero rtmp */ 1480 /* L part */ 1481 nz = bi[i+1] - bi[i]; 1482 bjtmp = bj + bi[i]; 1483 for (j=0; j<nz; j++) { 1484 col = bjtmp[j]; 1485 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1486 } 1487 1488 /* U part */ 1489 nz = bdiag[i]-bdiag[i+1]; 1490 bjtmp = bj + bdiag[i+1]+1; 1491 for (j=0; j<nz; j++) { 1492 col = bjtmp[j]; 1493 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1494 } 1495 1496 /* load in initial (unfactored row) */ 1497 nz = ai[r[i]+1] - ai[r[i]]; 1498 ajtmp = aj + ai[r[i]]; 1499 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1500 for (j=0; j<nz; j++) { 1501 col = ics[ajtmp[j]]; 1502 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1503 } 1504 /* ZeropivotApply(): shift the diagonal of the matrix */ 1505 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1506 1507 /* elimination */ 1508 bjtmp = bj + bi[i]; 1509 row = *bjtmp++; /* pivot row */ 1510 nzL = bi[i+1] - bi[i]; 1511 for(k=0; k < nzL;k++) { 1512 pc1 = rtmp1 + row; 1513 pc2 = rtmp2 + row; 1514 pc3 = rtmp3 + row; 1515 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1516 pv = b->a + bdiag[row]; 1517 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1518 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1519 1520 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1521 pv = b->a + bdiag[row+1]+1; 1522 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1523 for (j=0; j<nz; j++){ 1524 col = pj[j]; 1525 rtmp1[col] -= mul1 * pv[j]; 1526 rtmp2[col] -= mul2 * pv[j]; 1527 rtmp3[col] -= mul3 * pv[j]; 1528 } 1529 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1530 } 1531 row = *bjtmp++; 1532 } 1533 1534 /* finished row i; check zero pivot, then stick row i into b->a */ 1535 rs = 0.0; 1536 /* L part */ 1537 pc1 = b->a + bi[i]; 1538 pj = b->j + bi[i] ; 1539 nz = bi[i+1] - bi[i]; 1540 for (j=0; j<nz; j++) { 1541 col = pj[j]; 1542 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1543 } 1544 /* U part */ 1545 pc1 = b->a + bdiag[i+1]+1; 1546 pj = b->j + bdiag[i+1]+1; 1547 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1548 for (j=0; j<nz; j++) { 1549 col = pj[j]; 1550 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1551 } 1552 1553 sctx.rs = rs; 1554 sctx.pv = rtmp1[i]; 1555 ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr); 1556 if(newshift == 1) break; 1557 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1558 *pc1 = 1.0/sctx.pv; 1559 1560 /* Now take care of 1st column of diagonal 3x3 block. */ 1561 pc2 = rtmp2 + i; 1562 pc3 = rtmp3 + i; 1563 if (*pc2 != 0.0 || *pc3 != 0.0){ 1564 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1565 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1566 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1567 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1568 for (j=0; j<nz; j++) { 1569 col = pj[j]; 1570 rtmp2[col] -= mul2 * rtmp1[col]; 1571 rtmp3[col] -= mul3 * rtmp1[col]; 1572 } 1573 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1574 } 1575 1576 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1577 rs = 0.0; 1578 /* L part */ 1579 pc2 = b->a + bi[i+1]; 1580 pj = b->j + bi[i+1] ; 1581 nz = bi[i+2] - bi[i+1]; 1582 for (j=0; j<nz; j++) { 1583 col = pj[j]; 1584 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1585 } 1586 /* U part */ 1587 pc2 = b->a + bdiag[i+2]+1; 1588 pj = b->j + bdiag[i+2]+1; 1589 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1590 for (j=0; j<nz; j++) { 1591 col = pj[j]; 1592 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1593 } 1594 1595 sctx.rs = rs; 1596 sctx.pv = rtmp2[i+1]; 1597 ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr); 1598 if(newshift == 1) break; 1599 pc2 = b->a + bdiag[i+1]; 1600 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1601 1602 /* Now take care of 2nd column of diagonal 3x3 block. */ 1603 pc3 = rtmp3 + i+1; 1604 if (*pc3 != 0.0){ 1605 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1606 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1607 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1608 for (j=0; j<nz; j++) { 1609 col = pj[j]; 1610 rtmp3[col] -= mul3 * rtmp2[col]; 1611 } 1612 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1613 } 1614 1615 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1616 rs = 0.0; 1617 /* L part */ 1618 pc3 = b->a + bi[i+2]; 1619 pj = b->j + bi[i+2] ; 1620 nz = bi[i+3] - bi[i+2]; 1621 for (j=0; j<nz; j++) { 1622 col = pj[j]; 1623 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1624 } 1625 /* U part */ 1626 pc3 = b->a + bdiag[i+3]+1; 1627 pj = b->j + bdiag[i+3]+1; 1628 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1629 for (j=0; j<nz; j++) { 1630 col = pj[j]; 1631 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1632 } 1633 1634 sctx.rs = rs; 1635 sctx.pv = rtmp3[i+2]; 1636 ierr = MatPivotCheck(info,&sctx,i+2,&newshift);CHKERRQ(ierr); 1637 if(newshift == 1) break; 1638 pc3 = b->a + bdiag[i+2]; 1639 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1640 break; 1641 case 4: 1642 /*----------*/ 1643 /* zero rtmp */ 1644 /* L part */ 1645 nz = bi[i+1] - bi[i]; 1646 bjtmp = bj + bi[i]; 1647 for (j=0; j<nz; j++) { 1648 col = bjtmp[j]; 1649 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1650 } 1651 1652 /* U part */ 1653 nz = bdiag[i]-bdiag[i+1]; 1654 bjtmp = bj + bdiag[i+1]+1; 1655 for (j=0; j<nz; j++) { 1656 col = bjtmp[j]; 1657 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1658 } 1659 1660 /* load in initial (unfactored row) */ 1661 nz = ai[r[i]+1] - ai[r[i]]; 1662 ajtmp = aj + ai[r[i]]; 1663 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1664 for (j=0; j<nz; j++) { 1665 col = ics[ajtmp[j]]; 1666 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1667 } 1668 /* ZeropivotApply(): shift the diagonal of the matrix */ 1669 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1670 1671 /* elimination */ 1672 bjtmp = bj + bi[i]; 1673 row = *bjtmp++; /* pivot row */ 1674 nzL = bi[i+1] - bi[i]; 1675 for(k=0; k < nzL;k++) { 1676 pc1 = rtmp1 + row; 1677 pc2 = rtmp2 + row; 1678 pc3 = rtmp3 + row; 1679 pc4 = rtmp4 + row; 1680 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1681 pv = b->a + bdiag[row]; 1682 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1683 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1684 1685 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1686 pv = b->a + bdiag[row+1]+1; 1687 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1688 for (j=0; j<nz; j++){ 1689 col = pj[j]; 1690 rtmp1[col] -= mul1 * pv[j]; 1691 rtmp2[col] -= mul2 * pv[j]; 1692 rtmp3[col] -= mul3 * pv[j]; 1693 rtmp4[col] -= mul4 * pv[j]; 1694 } 1695 ierr = PetscLogFlops(8*nz);CHKERRQ(ierr); 1696 } 1697 row = *bjtmp++; 1698 } 1699 1700 /* finished row i; check zero pivot, then stick row i into b->a */ 1701 rs = 0.0; 1702 /* L part */ 1703 pc1 = b->a + bi[i]; 1704 pj = b->j + bi[i] ; 1705 nz = bi[i+1] - bi[i]; 1706 for (j=0; j<nz; j++) { 1707 col = pj[j]; 1708 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1709 } 1710 /* U part */ 1711 pc1 = b->a + bdiag[i+1]+1; 1712 pj = b->j + bdiag[i+1]+1; 1713 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1714 for (j=0; j<nz; j++) { 1715 col = pj[j]; 1716 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1717 } 1718 1719 sctx.rs = rs; 1720 sctx.pv = rtmp1[i]; 1721 ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr); 1722 if(newshift == 1) break; 1723 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1724 *pc1 = 1.0/sctx.pv; 1725 1726 /* Now take care of 1st column of diagonal 4x4 block. */ 1727 pc2 = rtmp2 + i; 1728 pc3 = rtmp3 + i; 1729 pc4 = rtmp4 + i; 1730 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){ 1731 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1732 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1733 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1734 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1735 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1736 for (j=0; j<nz; j++) { 1737 col = pj[j]; 1738 rtmp2[col] -= mul2 * rtmp1[col]; 1739 rtmp3[col] -= mul3 * rtmp1[col]; 1740 rtmp4[col] -= mul4 * rtmp1[col]; 1741 } 1742 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1743 } 1744 1745 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1746 rs = 0.0; 1747 /* L part */ 1748 pc2 = b->a + bi[i+1]; 1749 pj = b->j + bi[i+1] ; 1750 nz = bi[i+2] - bi[i+1]; 1751 for (j=0; j<nz; j++) { 1752 col = pj[j]; 1753 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1754 } 1755 /* U part */ 1756 pc2 = b->a + bdiag[i+2]+1; 1757 pj = b->j + bdiag[i+2]+1; 1758 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1759 for (j=0; j<nz; j++) { 1760 col = pj[j]; 1761 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1762 } 1763 1764 sctx.rs = rs; 1765 sctx.pv = rtmp2[i+1]; 1766 ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr); 1767 if(newshift == 1) break; 1768 pc2 = b->a + bdiag[i+1]; 1769 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1770 1771 /* Now take care of 2nd column of diagonal 4x4 block. */ 1772 pc3 = rtmp3 + i+1; 1773 pc4 = rtmp4 + i+1; 1774 if (*pc3 != 0.0 || *pc4 != 0.0){ 1775 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1776 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1777 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1778 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1779 for (j=0; j<nz; j++) { 1780 col = pj[j]; 1781 rtmp3[col] -= mul3 * rtmp2[col]; 1782 rtmp4[col] -= mul4 * rtmp2[col]; 1783 } 1784 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1785 } 1786 1787 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1788 rs = 0.0; 1789 /* L part */ 1790 pc3 = b->a + bi[i+2]; 1791 pj = b->j + bi[i+2] ; 1792 nz = bi[i+3] - bi[i+2]; 1793 for (j=0; j<nz; j++) { 1794 col = pj[j]; 1795 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1796 } 1797 /* U part */ 1798 pc3 = b->a + bdiag[i+3]+1; 1799 pj = b->j + bdiag[i+3]+1; 1800 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1801 for (j=0; j<nz; j++) { 1802 col = pj[j]; 1803 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1804 } 1805 1806 sctx.rs = rs; 1807 sctx.pv = rtmp3[i+2]; 1808 ierr = MatPivotCheck(info,&sctx,i+2,&newshift);CHKERRQ(ierr); 1809 if(newshift == 1) break; 1810 pc3 = b->a + bdiag[i+2]; 1811 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1812 1813 /* Now take care of 3rd column of diagonal 4x4 block. */ 1814 pc4 = rtmp4 + i+2; 1815 if (*pc4 != 0.0){ 1816 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1817 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1818 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1819 for (j=0; j<nz; j++) { 1820 col = pj[j]; 1821 rtmp4[col] -= mul4 * rtmp3[col]; 1822 } 1823 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1824 } 1825 1826 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1827 rs = 0.0; 1828 /* L part */ 1829 pc4 = b->a + bi[i+3]; 1830 pj = b->j + bi[i+3] ; 1831 nz = bi[i+4] - bi[i+3]; 1832 for (j=0; j<nz; j++) { 1833 col = pj[j]; 1834 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1835 } 1836 /* U part */ 1837 pc4 = b->a + bdiag[i+4]+1; 1838 pj = b->j + bdiag[i+4]+1; 1839 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1840 for (j=0; j<nz; j++) { 1841 col = pj[j]; 1842 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1843 } 1844 1845 sctx.rs = rs; 1846 sctx.pv = rtmp4[i+3]; 1847 ierr = MatPivotCheck(info,&sctx,i+3,&newshift);CHKERRQ(ierr); 1848 if(newshift == 1) break; 1849 pc4 = b->a + bdiag[i+3]; 1850 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1851 break; 1852 1853 default: 1854 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 1855 } 1856 i += nodesz; /* Update the row */ 1857 } 1858 1859 /* MatPivotRefine() */ 1860 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1861 /* 1862 * if no shift in this attempt & shifting & started shifting & can refine, 1863 * then try lower shift 1864 */ 1865 sctx.shift_hi = sctx.shift_fraction; 1866 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1867 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1868 sctx.useshift = PETSC_TRUE; 1869 sctx.nshift++; 1870 } 1871 } while (sctx.useshift); 1872 1873 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1874 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1875 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1876 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1877 1878 C->ops->solve = MatSolve_SeqAIJ; 1879 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1880 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1881 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1882 C->ops->matsolve = MatMatSolve_SeqAIJ; 1883 C->assembled = PETSC_TRUE; 1884 C->preallocated = PETSC_TRUE; 1885 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1886 1887 /* MatShiftView(A,info,&sctx) */ 1888 if (sctx.nshift){ 1889 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1890 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1891 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1892 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1893 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){ 1894 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1895 } 1896 } 1897 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1898 PetscFunctionReturn(0); 1899 } 1900 1901 #undef __FUNCT__ 1902 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1903 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1904 { 1905 Mat C = B; 1906 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1907 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1908 PetscErrorCode ierr; 1909 const PetscInt *r,*ic,*c,*ics; 1910 PetscInt n = A->rmap->n,*bi = b->i; 1911 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1912 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1913 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1914 PetscScalar mul1,mul2,mul3,tmp; 1915 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1916 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1917 PetscReal rs=0.0; 1918 FactorShiftCtx sctx; 1919 PetscInt newshift; 1920 1921 PetscFunctionBegin; 1922 sctx.shift_top = 0; 1923 sctx.nshift_max = 0; 1924 sctx.shift_lo = 0; 1925 sctx.shift_hi = 0; 1926 sctx.shift_fraction = 0; 1927 1928 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1929 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1930 sctx.shift_top = 0; 1931 for (i=0; i<n; i++) { 1932 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1933 rs = 0.0; 1934 ajtmp = aj + ai[i]; 1935 rtmp1 = aa + ai[i]; 1936 nz = ai[i+1] - ai[i]; 1937 for (j=0; j<nz; j++){ 1938 if (*ajtmp != i){ 1939 rs += PetscAbsScalar(*rtmp1++); 1940 } else { 1941 rs -= PetscRealPart(*rtmp1++); 1942 } 1943 ajtmp++; 1944 } 1945 if (rs>sctx.shift_top) sctx.shift_top = rs; 1946 } 1947 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1948 sctx.shift_top *= 1.1; 1949 sctx.nshift_max = 5; 1950 sctx.shift_lo = 0.; 1951 sctx.shift_hi = 1.; 1952 } 1953 sctx.shift_amount = 0; 1954 sctx.nshift = 0; 1955 1956 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1957 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1958 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1959 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1960 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1961 ics = ic ; 1962 rtmp22 = rtmp11 + n; 1963 rtmp33 = rtmp22 + n; 1964 1965 node_max = a->inode.node_count; 1966 ns = a->inode.size; 1967 if (!ns){ 1968 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1969 } 1970 1971 /* If max inode size > 3, split it into two inodes.*/ 1972 /* also map the inode sizes according to the ordering */ 1973 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1974 for (i=0,j=0; i<node_max; ++i,++j){ 1975 if (ns[i]>3) { 1976 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1977 ++j; 1978 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1979 } else { 1980 tmp_vec1[j] = ns[i]; 1981 } 1982 } 1983 /* Use the correct node_max */ 1984 node_max = j; 1985 1986 /* Now reorder the inode info based on mat re-ordering info */ 1987 /* First create a row -> inode_size_array_index map */ 1988 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1989 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1990 for (i=0,row=0; i<node_max; i++) { 1991 nodesz = tmp_vec1[i]; 1992 for (j=0; j<nodesz; j++,row++) { 1993 nsmap[row] = i; 1994 } 1995 } 1996 /* Using nsmap, create a reordered ns structure */ 1997 for (i=0,j=0; i< node_max; i++) { 1998 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1999 tmp_vec2[i] = nodesz; 2000 j += nodesz; 2001 } 2002 ierr = PetscFree(nsmap);CHKERRQ(ierr); 2003 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 2004 /* Now use the correct ns */ 2005 ns = tmp_vec2; 2006 2007 do { 2008 sctx.useshift = PETSC_FALSE; 2009 /* Now loop over each block-row, and do the factorization */ 2010 for (i=0,row=0; i<node_max; i++) { 2011 nodesz = ns[i]; 2012 nz = bi[row+1] - bi[row]; 2013 bjtmp = bj + bi[row]; 2014 2015 switch (nodesz){ 2016 case 1: 2017 for (j=0; j<nz; j++){ 2018 idx = bjtmp[j]; 2019 rtmp11[idx] = 0.0; 2020 } 2021 2022 /* load in initial (unfactored row) */ 2023 idx = r[row]; 2024 nz_tmp = ai[idx+1] - ai[idx]; 2025 ajtmp = aj + ai[idx]; 2026 v1 = aa + ai[idx]; 2027 2028 for (j=0; j<nz_tmp; j++) { 2029 idx = ics[ajtmp[j]]; 2030 rtmp11[idx] = v1[j]; 2031 } 2032 rtmp11[ics[r[row]]] += sctx.shift_amount; 2033 2034 prow = *bjtmp++ ; 2035 while (prow < row) { 2036 pc1 = rtmp11 + prow; 2037 if (*pc1 != 0.0){ 2038 pv = ba + bd[prow]; 2039 pj = nbj + bd[prow]; 2040 mul1 = *pc1 * *pv++; 2041 *pc1 = mul1; 2042 nz_tmp = bi[prow+1] - bd[prow] - 1; 2043 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2044 for (j=0; j<nz_tmp; j++) { 2045 tmp = pv[j]; 2046 idx = pj[j]; 2047 rtmp11[idx] -= mul1 * tmp; 2048 } 2049 } 2050 prow = *bjtmp++ ; 2051 } 2052 pj = bj + bi[row]; 2053 pc1 = ba + bi[row]; 2054 2055 sctx.pv = rtmp11[row]; 2056 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2057 rs = 0.0; 2058 for (j=0; j<nz; j++) { 2059 idx = pj[j]; 2060 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2061 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2062 } 2063 sctx.rs = rs; 2064 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2065 if (newshift == 1) goto endofwhile; 2066 break; 2067 2068 case 2: 2069 for (j=0; j<nz; j++) { 2070 idx = bjtmp[j]; 2071 rtmp11[idx] = 0.0; 2072 rtmp22[idx] = 0.0; 2073 } 2074 2075 /* load in initial (unfactored row) */ 2076 idx = r[row]; 2077 nz_tmp = ai[idx+1] - ai[idx]; 2078 ajtmp = aj + ai[idx]; 2079 v1 = aa + ai[idx]; 2080 v2 = aa + ai[idx+1]; 2081 for (j=0; j<nz_tmp; j++) { 2082 idx = ics[ajtmp[j]]; 2083 rtmp11[idx] = v1[j]; 2084 rtmp22[idx] = v2[j]; 2085 } 2086 rtmp11[ics[r[row]]] += sctx.shift_amount; 2087 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2088 2089 prow = *bjtmp++ ; 2090 while (prow < row) { 2091 pc1 = rtmp11 + prow; 2092 pc2 = rtmp22 + prow; 2093 if (*pc1 != 0.0 || *pc2 != 0.0){ 2094 pv = ba + bd[prow]; 2095 pj = nbj + bd[prow]; 2096 mul1 = *pc1 * *pv; 2097 mul2 = *pc2 * *pv; 2098 ++pv; 2099 *pc1 = mul1; 2100 *pc2 = mul2; 2101 2102 nz_tmp = bi[prow+1] - bd[prow] - 1; 2103 for (j=0; j<nz_tmp; j++) { 2104 tmp = pv[j]; 2105 idx = pj[j]; 2106 rtmp11[idx] -= mul1 * tmp; 2107 rtmp22[idx] -= mul2 * tmp; 2108 } 2109 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2110 } 2111 prow = *bjtmp++ ; 2112 } 2113 2114 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2115 pc1 = rtmp11 + prow; 2116 pc2 = rtmp22 + prow; 2117 2118 sctx.pv = *pc1; 2119 pj = bj + bi[prow]; 2120 rs = 0.0; 2121 for (j=0; j<nz; j++){ 2122 idx = pj[j]; 2123 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2124 } 2125 sctx.rs = rs; 2126 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2127 if (newshift == 1) goto endofwhile; 2128 2129 if (*pc2 != 0.0){ 2130 pj = nbj + bd[prow]; 2131 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2132 *pc2 = mul2; 2133 nz_tmp = bi[prow+1] - bd[prow] - 1; 2134 for (j=0; j<nz_tmp; j++) { 2135 idx = pj[j] ; 2136 tmp = rtmp11[idx]; 2137 rtmp22[idx] -= mul2 * tmp; 2138 } 2139 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2140 } 2141 2142 pj = bj + bi[row]; 2143 pc1 = ba + bi[row]; 2144 pc2 = ba + bi[row+1]; 2145 2146 sctx.pv = rtmp22[row+1]; 2147 rs = 0.0; 2148 rtmp11[row] = 1.0/rtmp11[row]; 2149 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2150 /* copy row entries from dense representation to sparse */ 2151 for (j=0; j<nz; j++) { 2152 idx = pj[j]; 2153 pc1[j] = rtmp11[idx]; 2154 pc2[j] = rtmp22[idx]; 2155 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2156 } 2157 sctx.rs = rs; 2158 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2159 if (newshift == 1) goto endofwhile; 2160 break; 2161 2162 case 3: 2163 for (j=0; j<nz; j++) { 2164 idx = bjtmp[j]; 2165 rtmp11[idx] = 0.0; 2166 rtmp22[idx] = 0.0; 2167 rtmp33[idx] = 0.0; 2168 } 2169 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2170 idx = r[row]; 2171 nz_tmp = ai[idx+1] - ai[idx]; 2172 ajtmp = aj + ai[idx]; 2173 v1 = aa + ai[idx]; 2174 v2 = aa + ai[idx+1]; 2175 v3 = aa + ai[idx+2]; 2176 for (j=0; j<nz_tmp; j++) { 2177 idx = ics[ajtmp[j]]; 2178 rtmp11[idx] = v1[j]; 2179 rtmp22[idx] = v2[j]; 2180 rtmp33[idx] = v3[j]; 2181 } 2182 rtmp11[ics[r[row]]] += sctx.shift_amount; 2183 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2184 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2185 2186 /* loop over all pivot row blocks above this row block */ 2187 prow = *bjtmp++ ; 2188 while (prow < row) { 2189 pc1 = rtmp11 + prow; 2190 pc2 = rtmp22 + prow; 2191 pc3 = rtmp33 + prow; 2192 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 2193 pv = ba + bd[prow]; 2194 pj = nbj + bd[prow]; 2195 mul1 = *pc1 * *pv; 2196 mul2 = *pc2 * *pv; 2197 mul3 = *pc3 * *pv; 2198 ++pv; 2199 *pc1 = mul1; 2200 *pc2 = mul2; 2201 *pc3 = mul3; 2202 2203 nz_tmp = bi[prow+1] - bd[prow] - 1; 2204 /* update this row based on pivot row */ 2205 for (j=0; j<nz_tmp; j++) { 2206 tmp = pv[j]; 2207 idx = pj[j]; 2208 rtmp11[idx] -= mul1 * tmp; 2209 rtmp22[idx] -= mul2 * tmp; 2210 rtmp33[idx] -= mul3 * tmp; 2211 } 2212 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 2213 } 2214 prow = *bjtmp++ ; 2215 } 2216 2217 /* Now take care of diagonal 3x3 block in this set of rows */ 2218 /* note: prow = row here */ 2219 pc1 = rtmp11 + prow; 2220 pc2 = rtmp22 + prow; 2221 pc3 = rtmp33 + prow; 2222 2223 sctx.pv = *pc1; 2224 pj = bj + bi[prow]; 2225 rs = 0.0; 2226 for (j=0; j<nz; j++){ 2227 idx = pj[j]; 2228 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2229 } 2230 sctx.rs = rs; 2231 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2232 if (newshift == 1) goto endofwhile; 2233 2234 if (*pc2 != 0.0 || *pc3 != 0.0){ 2235 mul2 = (*pc2)/(*pc1); 2236 mul3 = (*pc3)/(*pc1); 2237 *pc2 = mul2; 2238 *pc3 = mul3; 2239 nz_tmp = bi[prow+1] - bd[prow] - 1; 2240 pj = nbj + bd[prow]; 2241 for (j=0; j<nz_tmp; j++) { 2242 idx = pj[j] ; 2243 tmp = rtmp11[idx]; 2244 rtmp22[idx] -= mul2 * tmp; 2245 rtmp33[idx] -= mul3 * tmp; 2246 } 2247 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2248 } 2249 ++prow; 2250 2251 pc2 = rtmp22 + prow; 2252 pc3 = rtmp33 + prow; 2253 sctx.pv = *pc2; 2254 pj = bj + bi[prow]; 2255 rs = 0.0; 2256 for (j=0; j<nz; j++){ 2257 idx = pj[j]; 2258 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2259 } 2260 sctx.rs = rs; 2261 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2262 if (newshift == 1) goto endofwhile; 2263 2264 if (*pc3 != 0.0){ 2265 mul3 = (*pc3)/(*pc2); 2266 *pc3 = mul3; 2267 pj = nbj + bd[prow]; 2268 nz_tmp = bi[prow+1] - bd[prow] - 1; 2269 for (j=0; j<nz_tmp; j++) { 2270 idx = pj[j] ; 2271 tmp = rtmp22[idx]; 2272 rtmp33[idx] -= mul3 * tmp; 2273 } 2274 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2275 } 2276 2277 pj = bj + bi[row]; 2278 pc1 = ba + bi[row]; 2279 pc2 = ba + bi[row+1]; 2280 pc3 = ba + bi[row+2]; 2281 2282 sctx.pv = rtmp33[row+2]; 2283 rs = 0.0; 2284 rtmp11[row] = 1.0/rtmp11[row]; 2285 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2286 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2287 /* copy row entries from dense representation to sparse */ 2288 for (j=0; j<nz; j++) { 2289 idx = pj[j]; 2290 pc1[j] = rtmp11[idx]; 2291 pc2[j] = rtmp22[idx]; 2292 pc3[j] = rtmp33[idx]; 2293 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2294 } 2295 2296 sctx.rs = rs; 2297 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 2298 if (newshift == 1) goto endofwhile; 2299 break; 2300 2301 default: 2302 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 2303 } 2304 row += nodesz; /* Update the row */ 2305 } 2306 endofwhile:; 2307 } while (sctx.useshift); 2308 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2309 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2310 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2311 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2312 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2313 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2314 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2315 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2316 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2317 C->assembled = PETSC_TRUE; 2318 C->preallocated = PETSC_TRUE; 2319 if (sctx.nshift) { 2320 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2321 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2322 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2323 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2324 } 2325 } 2326 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2327 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2328 PetscFunctionReturn(0); 2329 } 2330 2331 2332 /* ----------------------------------------------------------- */ 2333 #undef __FUNCT__ 2334 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2335 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2336 { 2337 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2338 IS iscol = a->col,isrow = a->row; 2339 PetscErrorCode ierr; 2340 const PetscInt *r,*c,*rout,*cout; 2341 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2342 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2343 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2344 PetscScalar sum1,sum2,sum3,sum4,sum5; 2345 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2346 const PetscScalar *b; 2347 2348 PetscFunctionBegin; 2349 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 2350 node_max = a->inode.node_count; 2351 ns = a->inode.size; /* Node Size array */ 2352 2353 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2354 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2355 tmp = a->solve_work; 2356 2357 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2358 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2359 2360 /* forward solve the lower triangular */ 2361 tmps = tmp ; 2362 aa = a_a ; 2363 aj = a_j ; 2364 ad = a->diag; 2365 2366 for (i = 0,row = 0; i< node_max; ++i){ 2367 nsz = ns[i]; 2368 aii = ai[row]; 2369 v1 = aa + aii; 2370 vi = aj + aii; 2371 nz = ai[row+1]- ai[row]; 2372 2373 if (i < node_max-1) { 2374 /* Prefetch the indices for the next block */ 2375 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2376 /* Prefetch the data for the next block */ 2377 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2378 } 2379 2380 switch (nsz){ /* Each loop in 'case' is unrolled */ 2381 case 1 : 2382 sum1 = b[r[row]]; 2383 for(j=0; j<nz-1; j+=2){ 2384 i0 = vi[j]; 2385 i1 = vi[j+1]; 2386 tmp0 = tmps[i0]; 2387 tmp1 = tmps[i1]; 2388 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2389 } 2390 if(j == nz-1){ 2391 tmp0 = tmps[vi[j]]; 2392 sum1 -= v1[j]*tmp0; 2393 } 2394 tmp[row++]=sum1; 2395 break; 2396 case 2: 2397 sum1 = b[r[row]]; 2398 sum2 = b[r[row+1]]; 2399 v2 = aa + ai[row+1]; 2400 2401 for(j=0; j<nz-1; j+=2){ 2402 i0 = vi[j]; 2403 i1 = vi[j+1]; 2404 tmp0 = tmps[i0]; 2405 tmp1 = tmps[i1]; 2406 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2407 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2408 } 2409 if(j == nz-1){ 2410 tmp0 = tmps[vi[j]]; 2411 sum1 -= v1[j] *tmp0; 2412 sum2 -= v2[j] *tmp0; 2413 } 2414 sum2 -= v2[nz] * sum1; 2415 tmp[row ++]=sum1; 2416 tmp[row ++]=sum2; 2417 break; 2418 case 3: 2419 sum1 = b[r[row]]; 2420 sum2 = b[r[row+1]]; 2421 sum3 = b[r[row+2]]; 2422 v2 = aa + ai[row+1]; 2423 v3 = aa + ai[row+2]; 2424 2425 for (j=0; j<nz-1; j+=2){ 2426 i0 = vi[j]; 2427 i1 = vi[j+1]; 2428 tmp0 = tmps[i0]; 2429 tmp1 = tmps[i1]; 2430 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2431 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2432 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2433 } 2434 if (j == nz-1){ 2435 tmp0 = tmps[vi[j]]; 2436 sum1 -= v1[j] *tmp0; 2437 sum2 -= v2[j] *tmp0; 2438 sum3 -= v3[j] *tmp0; 2439 } 2440 sum2 -= v2[nz] * sum1; 2441 sum3 -= v3[nz] * sum1; 2442 sum3 -= v3[nz+1] * sum2; 2443 tmp[row ++]=sum1; 2444 tmp[row ++]=sum2; 2445 tmp[row ++]=sum3; 2446 break; 2447 2448 case 4: 2449 sum1 = b[r[row]]; 2450 sum2 = b[r[row+1]]; 2451 sum3 = b[r[row+2]]; 2452 sum4 = b[r[row+3]]; 2453 v2 = aa + ai[row+1]; 2454 v3 = aa + ai[row+2]; 2455 v4 = aa + ai[row+3]; 2456 2457 for (j=0; j<nz-1; j+=2){ 2458 i0 = vi[j]; 2459 i1 = vi[j+1]; 2460 tmp0 = tmps[i0]; 2461 tmp1 = tmps[i1]; 2462 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2463 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2464 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2465 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2466 } 2467 if (j == nz-1){ 2468 tmp0 = tmps[vi[j]]; 2469 sum1 -= v1[j] *tmp0; 2470 sum2 -= v2[j] *tmp0; 2471 sum3 -= v3[j] *tmp0; 2472 sum4 -= v4[j] *tmp0; 2473 } 2474 sum2 -= v2[nz] * sum1; 2475 sum3 -= v3[nz] * sum1; 2476 sum4 -= v4[nz] * sum1; 2477 sum3 -= v3[nz+1] * sum2; 2478 sum4 -= v4[nz+1] * sum2; 2479 sum4 -= v4[nz+2] * sum3; 2480 2481 tmp[row ++]=sum1; 2482 tmp[row ++]=sum2; 2483 tmp[row ++]=sum3; 2484 tmp[row ++]=sum4; 2485 break; 2486 case 5: 2487 sum1 = b[r[row]]; 2488 sum2 = b[r[row+1]]; 2489 sum3 = b[r[row+2]]; 2490 sum4 = b[r[row+3]]; 2491 sum5 = b[r[row+4]]; 2492 v2 = aa + ai[row+1]; 2493 v3 = aa + ai[row+2]; 2494 v4 = aa + ai[row+3]; 2495 v5 = aa + ai[row+4]; 2496 2497 for (j=0; j<nz-1; j+=2){ 2498 i0 = vi[j]; 2499 i1 = vi[j+1]; 2500 tmp0 = tmps[i0]; 2501 tmp1 = tmps[i1]; 2502 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2503 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2504 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2505 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2506 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2507 } 2508 if (j == nz-1){ 2509 tmp0 = tmps[vi[j]]; 2510 sum1 -= v1[j] *tmp0; 2511 sum2 -= v2[j] *tmp0; 2512 sum3 -= v3[j] *tmp0; 2513 sum4 -= v4[j] *tmp0; 2514 sum5 -= v5[j] *tmp0; 2515 } 2516 2517 sum2 -= v2[nz] * sum1; 2518 sum3 -= v3[nz] * sum1; 2519 sum4 -= v4[nz] * sum1; 2520 sum5 -= v5[nz] * sum1; 2521 sum3 -= v3[nz+1] * sum2; 2522 sum4 -= v4[nz+1] * sum2; 2523 sum5 -= v5[nz+1] * sum2; 2524 sum4 -= v4[nz+2] * sum3; 2525 sum5 -= v5[nz+2] * sum3; 2526 sum5 -= v5[nz+3] * sum4; 2527 2528 tmp[row ++]=sum1; 2529 tmp[row ++]=sum2; 2530 tmp[row ++]=sum3; 2531 tmp[row ++]=sum4; 2532 tmp[row ++]=sum5; 2533 break; 2534 default: 2535 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2536 } 2537 } 2538 /* backward solve the upper triangular */ 2539 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2540 nsz = ns[i]; 2541 aii = ad[row+1] + 1; 2542 v1 = aa + aii; 2543 vi = aj + aii; 2544 nz = ad[row]- ad[row+1] - 1; 2545 2546 if (i > 0) { 2547 /* Prefetch the indices for the next block */ 2548 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2549 /* Prefetch the data for the next block */ 2550 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2551 } 2552 2553 switch (nsz){ /* Each loop in 'case' is unrolled */ 2554 case 1 : 2555 sum1 = tmp[row]; 2556 2557 for(j=0 ; j<nz-1; j+=2){ 2558 i0 = vi[j]; 2559 i1 = vi[j+1]; 2560 tmp0 = tmps[i0]; 2561 tmp1 = tmps[i1]; 2562 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2563 } 2564 if (j == nz-1){ 2565 tmp0 = tmps[vi[j]]; 2566 sum1 -= v1[j]*tmp0; 2567 } 2568 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2569 break; 2570 case 2 : 2571 sum1 = tmp[row]; 2572 sum2 = tmp[row-1]; 2573 v2 = aa + ad[row] + 1; 2574 for (j=0 ; j<nz-1; j+=2){ 2575 i0 = vi[j]; 2576 i1 = vi[j+1]; 2577 tmp0 = tmps[i0]; 2578 tmp1 = tmps[i1]; 2579 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2580 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2581 } 2582 if (j == nz-1){ 2583 tmp0 = tmps[vi[j]]; 2584 sum1 -= v1[j]* tmp0; 2585 sum2 -= v2[j+1]* tmp0; 2586 } 2587 2588 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2589 sum2 -= v2[0] * tmp0; 2590 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2591 break; 2592 case 3 : 2593 sum1 = tmp[row]; 2594 sum2 = tmp[row -1]; 2595 sum3 = tmp[row -2]; 2596 v2 = aa + ad[row] + 1; 2597 v3 = aa + ad[row -1] + 1; 2598 for (j=0 ; j<nz-1; j+=2){ 2599 i0 = vi[j]; 2600 i1 = vi[j+1]; 2601 tmp0 = tmps[i0]; 2602 tmp1 = tmps[i1]; 2603 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2604 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2605 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2606 } 2607 if (j== nz-1){ 2608 tmp0 = tmps[vi[j]]; 2609 sum1 -= v1[j] * tmp0; 2610 sum2 -= v2[j+1] * tmp0; 2611 sum3 -= v3[j+2] * tmp0; 2612 } 2613 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2614 sum2 -= v2[0]* tmp0; 2615 sum3 -= v3[1] * tmp0; 2616 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2617 sum3 -= v3[0]* tmp0; 2618 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2619 2620 break; 2621 case 4 : 2622 sum1 = tmp[row]; 2623 sum2 = tmp[row -1]; 2624 sum3 = tmp[row -2]; 2625 sum4 = tmp[row -3]; 2626 v2 = aa + ad[row]+1; 2627 v3 = aa + ad[row -1]+1; 2628 v4 = aa + ad[row -2]+1; 2629 2630 for (j=0 ; j<nz-1; j+=2){ 2631 i0 = vi[j]; 2632 i1 = vi[j+1]; 2633 tmp0 = tmps[i0]; 2634 tmp1 = tmps[i1]; 2635 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2636 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2637 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2638 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2639 } 2640 if (j== nz-1){ 2641 tmp0 = tmps[vi[j]]; 2642 sum1 -= v1[j] * tmp0; 2643 sum2 -= v2[j+1] * tmp0; 2644 sum3 -= v3[j+2] * tmp0; 2645 sum4 -= v4[j+3] * tmp0; 2646 } 2647 2648 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2649 sum2 -= v2[0] * tmp0; 2650 sum3 -= v3[1] * tmp0; 2651 sum4 -= v4[2] * tmp0; 2652 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2653 sum3 -= v3[0] * tmp0; 2654 sum4 -= v4[1] * tmp0; 2655 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2656 sum4 -= v4[0] * tmp0; 2657 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2658 break; 2659 case 5 : 2660 sum1 = tmp[row]; 2661 sum2 = tmp[row -1]; 2662 sum3 = tmp[row -2]; 2663 sum4 = tmp[row -3]; 2664 sum5 = tmp[row -4]; 2665 v2 = aa + ad[row]+1; 2666 v3 = aa + ad[row -1]+1; 2667 v4 = aa + ad[row -2]+1; 2668 v5 = aa + ad[row -3]+1; 2669 for (j=0 ; j<nz-1; j+=2){ 2670 i0 = vi[j]; 2671 i1 = vi[j+1]; 2672 tmp0 = tmps[i0]; 2673 tmp1 = tmps[i1]; 2674 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2675 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2676 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2677 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2678 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2679 } 2680 if (j==nz-1){ 2681 tmp0 = tmps[vi[j]]; 2682 sum1 -= v1[j] * tmp0; 2683 sum2 -= v2[j+1] * tmp0; 2684 sum3 -= v3[j+2] * tmp0; 2685 sum4 -= v4[j+3] * tmp0; 2686 sum5 -= v5[j+4] * tmp0; 2687 } 2688 2689 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2690 sum2 -= v2[0] * tmp0; 2691 sum3 -= v3[1] * tmp0; 2692 sum4 -= v4[2] * tmp0; 2693 sum5 -= v5[3] * tmp0; 2694 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2695 sum3 -= v3[0] * tmp0; 2696 sum4 -= v4[1] * tmp0; 2697 sum5 -= v5[2] * tmp0; 2698 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2699 sum4 -= v4[0] * tmp0; 2700 sum5 -= v5[1] * tmp0; 2701 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2702 sum5 -= v5[0] * tmp0; 2703 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2704 break; 2705 default: 2706 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2707 } 2708 } 2709 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2710 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2711 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2712 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2713 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2714 PetscFunctionReturn(0); 2715 } 2716 2717 2718 /* 2719 Makes a longer coloring[] array and calls the usual code with that 2720 */ 2721 #undef __FUNCT__ 2722 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2723 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2724 { 2725 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2726 PetscErrorCode ierr; 2727 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2728 PetscInt *colorused,i; 2729 ISColoringValue *newcolor; 2730 2731 PetscFunctionBegin; 2732 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2733 /* loop over inodes, marking a color for each column*/ 2734 row = 0; 2735 for (i=0; i<m; i++){ 2736 for (j=0; j<ns[i]; j++) { 2737 newcolor[row++] = coloring[i] + j*ncolors; 2738 } 2739 } 2740 2741 /* eliminate unneeded colors */ 2742 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2743 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2744 for (i=0; i<n; i++) { 2745 colorused[newcolor[i]] = 1; 2746 } 2747 2748 for (i=1; i<5*ncolors; i++) { 2749 colorused[i] += colorused[i-1]; 2750 } 2751 ncolors = colorused[5*ncolors-1]; 2752 for (i=0; i<n; i++) { 2753 newcolor[i] = colorused[newcolor[i]]-1; 2754 } 2755 ierr = PetscFree(colorused);CHKERRQ(ierr); 2756 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2757 ierr = PetscFree(coloring);CHKERRQ(ierr); 2758 PetscFunctionReturn(0); 2759 } 2760 2761 #include "../src/mat/blockinvert.h" 2762 2763 #undef __FUNCT__ 2764 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2765 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2766 { 2767 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2768 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2769 MatScalar *ibdiag,*bdiag,work[25]; 2770 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2771 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2772 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2773 PetscErrorCode ierr; 2774 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2775 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2776 2777 PetscFunctionBegin; 2778 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2779 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2780 if (its > 1) { 2781 /* switch to non-inode version */ 2782 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2783 PetscFunctionReturn(0); 2784 } 2785 2786 if (!a->inode.ibdiagvalid) { 2787 if (!a->inode.ibdiag) { 2788 /* calculate space needed for diagonal blocks */ 2789 for (i=0; i<m; i++) { 2790 cnt += sizes[i]*sizes[i]; 2791 } 2792 a->inode.bdiagsize = cnt; 2793 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2794 } 2795 2796 /* copy over the diagonal blocks and invert them */ 2797 ibdiag = a->inode.ibdiag; 2798 bdiag = a->inode.bdiag; 2799 cnt = 0; 2800 for (i=0, row = 0; i<m; i++) { 2801 for (j=0; j<sizes[i]; j++) { 2802 for (k=0; k<sizes[i]; k++) { 2803 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2804 } 2805 } 2806 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2807 2808 switch(sizes[i]) { 2809 case 1: 2810 /* Create matrix data structure */ 2811 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2812 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2813 break; 2814 case 2: 2815 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2816 break; 2817 case 3: 2818 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2819 break; 2820 case 4: 2821 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2822 break; 2823 case 5: 2824 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2825 break; 2826 default: 2827 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2828 } 2829 cnt += sizes[i]*sizes[i]; 2830 row += sizes[i]; 2831 } 2832 a->inode.ibdiagvalid = PETSC_TRUE; 2833 } 2834 ibdiag = a->inode.ibdiag; 2835 bdiag = a->inode.bdiag; 2836 2837 2838 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2839 if (flag & SOR_ZERO_INITIAL_GUESS) { 2840 PetscScalar *b; 2841 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2842 if (xx != bb) { 2843 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2844 } else { 2845 b = x; 2846 } 2847 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2848 2849 for (i=0, row=0; i<m; i++) { 2850 sz = diag[row] - ii[row]; 2851 v1 = a->a + ii[row]; 2852 idx = a->j + ii[row]; 2853 2854 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2855 switch (sizes[i]){ 2856 case 1: 2857 2858 sum1 = b[row]; 2859 for(n = 0; n<sz-1; n+=2) { 2860 i1 = idx[0]; 2861 i2 = idx[1]; 2862 idx += 2; 2863 tmp0 = x[i1]; 2864 tmp1 = x[i2]; 2865 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2866 } 2867 2868 if (n == sz-1){ 2869 tmp0 = x[*idx]; 2870 sum1 -= *v1 * tmp0; 2871 } 2872 x[row++] = sum1*(*ibdiag++); 2873 break; 2874 case 2: 2875 v2 = a->a + ii[row+1]; 2876 sum1 = b[row]; 2877 sum2 = b[row+1]; 2878 for(n = 0; n<sz-1; n+=2) { 2879 i1 = idx[0]; 2880 i2 = idx[1]; 2881 idx += 2; 2882 tmp0 = x[i1]; 2883 tmp1 = x[i2]; 2884 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2885 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2886 } 2887 2888 if (n == sz-1){ 2889 tmp0 = x[*idx]; 2890 sum1 -= v1[0] * tmp0; 2891 sum2 -= v2[0] * tmp0; 2892 } 2893 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2894 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2895 ibdiag += 4; 2896 break; 2897 case 3: 2898 v2 = a->a + ii[row+1]; 2899 v3 = a->a + ii[row+2]; 2900 sum1 = b[row]; 2901 sum2 = b[row+1]; 2902 sum3 = b[row+2]; 2903 for(n = 0; n<sz-1; n+=2) { 2904 i1 = idx[0]; 2905 i2 = idx[1]; 2906 idx += 2; 2907 tmp0 = x[i1]; 2908 tmp1 = x[i2]; 2909 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2910 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2911 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2912 } 2913 2914 if (n == sz-1){ 2915 tmp0 = x[*idx]; 2916 sum1 -= v1[0] * tmp0; 2917 sum2 -= v2[0] * tmp0; 2918 sum3 -= v3[0] * tmp0; 2919 } 2920 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2921 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2922 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2923 ibdiag += 9; 2924 break; 2925 case 4: 2926 v2 = a->a + ii[row+1]; 2927 v3 = a->a + ii[row+2]; 2928 v4 = a->a + ii[row+3]; 2929 sum1 = b[row]; 2930 sum2 = b[row+1]; 2931 sum3 = b[row+2]; 2932 sum4 = b[row+3]; 2933 for(n = 0; n<sz-1; n+=2) { 2934 i1 = idx[0]; 2935 i2 = idx[1]; 2936 idx += 2; 2937 tmp0 = x[i1]; 2938 tmp1 = x[i2]; 2939 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2940 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2941 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2942 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2943 } 2944 2945 if (n == sz-1){ 2946 tmp0 = x[*idx]; 2947 sum1 -= v1[0] * tmp0; 2948 sum2 -= v2[0] * tmp0; 2949 sum3 -= v3[0] * tmp0; 2950 sum4 -= v4[0] * tmp0; 2951 } 2952 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2953 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2954 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2955 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2956 ibdiag += 16; 2957 break; 2958 case 5: 2959 v2 = a->a + ii[row+1]; 2960 v3 = a->a + ii[row+2]; 2961 v4 = a->a + ii[row+3]; 2962 v5 = a->a + ii[row+4]; 2963 sum1 = b[row]; 2964 sum2 = b[row+1]; 2965 sum3 = b[row+2]; 2966 sum4 = b[row+3]; 2967 sum5 = b[row+4]; 2968 for(n = 0; n<sz-1; n+=2) { 2969 i1 = idx[0]; 2970 i2 = idx[1]; 2971 idx += 2; 2972 tmp0 = x[i1]; 2973 tmp1 = x[i2]; 2974 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2975 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2976 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2977 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2978 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2979 } 2980 2981 if (n == sz-1){ 2982 tmp0 = x[*idx]; 2983 sum1 -= v1[0] * tmp0; 2984 sum2 -= v2[0] * tmp0; 2985 sum3 -= v3[0] * tmp0; 2986 sum4 -= v4[0] * tmp0; 2987 sum5 -= v5[0] * tmp0; 2988 } 2989 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2990 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2991 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2992 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2993 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2994 ibdiag += 25; 2995 break; 2996 default: 2997 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2998 } 2999 } 3000 3001 xb = x; 3002 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3003 } else xb = b; 3004 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 3005 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 3006 cnt = 0; 3007 for (i=0, row=0; i<m; i++) { 3008 3009 switch (sizes[i]){ 3010 case 1: 3011 x[row++] *= bdiag[cnt++]; 3012 break; 3013 case 2: 3014 x1 = x[row]; x2 = x[row+1]; 3015 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3016 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3017 x[row++] = tmp1; 3018 x[row++] = tmp2; 3019 cnt += 4; 3020 break; 3021 case 3: 3022 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3023 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3024 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3025 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3026 x[row++] = tmp1; 3027 x[row++] = tmp2; 3028 x[row++] = tmp3; 3029 cnt += 9; 3030 break; 3031 case 4: 3032 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3033 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3034 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3035 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3036 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3037 x[row++] = tmp1; 3038 x[row++] = tmp2; 3039 x[row++] = tmp3; 3040 x[row++] = tmp4; 3041 cnt += 16; 3042 break; 3043 case 5: 3044 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3045 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3046 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3047 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3048 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3049 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3050 x[row++] = tmp1; 3051 x[row++] = tmp2; 3052 x[row++] = tmp3; 3053 x[row++] = tmp4; 3054 x[row++] = tmp5; 3055 cnt += 25; 3056 break; 3057 default: 3058 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3059 } 3060 } 3061 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3062 } 3063 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 3064 3065 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3066 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3067 ibdiag -= sizes[i]*sizes[i]; 3068 sz = ii[row+1] - diag[row] - 1; 3069 v1 = a->a + diag[row] + 1; 3070 idx = a->j + diag[row] + 1; 3071 3072 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3073 switch (sizes[i]){ 3074 case 1: 3075 3076 sum1 = xb[row]; 3077 for(n = 0; n<sz-1; n+=2) { 3078 i1 = idx[0]; 3079 i2 = idx[1]; 3080 idx += 2; 3081 tmp0 = x[i1]; 3082 tmp1 = x[i2]; 3083 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3084 } 3085 3086 if (n == sz-1){ 3087 tmp0 = x[*idx]; 3088 sum1 -= *v1*tmp0; 3089 } 3090 x[row--] = sum1*(*ibdiag); 3091 break; 3092 3093 case 2: 3094 3095 sum1 = xb[row]; 3096 sum2 = xb[row-1]; 3097 /* note that sum1 is associated with the second of the two rows */ 3098 v2 = a->a + diag[row-1] + 2; 3099 for(n = 0; n<sz-1; n+=2) { 3100 i1 = idx[0]; 3101 i2 = idx[1]; 3102 idx += 2; 3103 tmp0 = x[i1]; 3104 tmp1 = x[i2]; 3105 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3106 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3107 } 3108 3109 if (n == sz-1){ 3110 tmp0 = x[*idx]; 3111 sum1 -= *v1*tmp0; 3112 sum2 -= *v2*tmp0; 3113 } 3114 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3115 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3116 break; 3117 case 3: 3118 3119 sum1 = xb[row]; 3120 sum2 = xb[row-1]; 3121 sum3 = xb[row-2]; 3122 v2 = a->a + diag[row-1] + 2; 3123 v3 = a->a + diag[row-2] + 3; 3124 for(n = 0; n<sz-1; n+=2) { 3125 i1 = idx[0]; 3126 i2 = idx[1]; 3127 idx += 2; 3128 tmp0 = x[i1]; 3129 tmp1 = x[i2]; 3130 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3131 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3132 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3133 } 3134 3135 if (n == sz-1){ 3136 tmp0 = x[*idx]; 3137 sum1 -= *v1*tmp0; 3138 sum2 -= *v2*tmp0; 3139 sum3 -= *v3*tmp0; 3140 } 3141 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3142 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3143 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3144 break; 3145 case 4: 3146 3147 sum1 = xb[row]; 3148 sum2 = xb[row-1]; 3149 sum3 = xb[row-2]; 3150 sum4 = xb[row-3]; 3151 v2 = a->a + diag[row-1] + 2; 3152 v3 = a->a + diag[row-2] + 3; 3153 v4 = a->a + diag[row-3] + 4; 3154 for(n = 0; n<sz-1; n+=2) { 3155 i1 = idx[0]; 3156 i2 = idx[1]; 3157 idx += 2; 3158 tmp0 = x[i1]; 3159 tmp1 = x[i2]; 3160 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3161 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3162 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3163 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3164 } 3165 3166 if (n == sz-1){ 3167 tmp0 = x[*idx]; 3168 sum1 -= *v1*tmp0; 3169 sum2 -= *v2*tmp0; 3170 sum3 -= *v3*tmp0; 3171 sum4 -= *v4*tmp0; 3172 } 3173 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3174 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3175 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3176 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3177 break; 3178 case 5: 3179 3180 sum1 = xb[row]; 3181 sum2 = xb[row-1]; 3182 sum3 = xb[row-2]; 3183 sum4 = xb[row-3]; 3184 sum5 = xb[row-4]; 3185 v2 = a->a + diag[row-1] + 2; 3186 v3 = a->a + diag[row-2] + 3; 3187 v4 = a->a + diag[row-3] + 4; 3188 v5 = a->a + diag[row-4] + 5; 3189 for(n = 0; n<sz-1; n+=2) { 3190 i1 = idx[0]; 3191 i2 = idx[1]; 3192 idx += 2; 3193 tmp0 = x[i1]; 3194 tmp1 = x[i2]; 3195 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3196 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3197 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3198 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3199 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3200 } 3201 3202 if (n == sz-1){ 3203 tmp0 = x[*idx]; 3204 sum1 -= *v1*tmp0; 3205 sum2 -= *v2*tmp0; 3206 sum3 -= *v3*tmp0; 3207 sum4 -= *v4*tmp0; 3208 sum5 -= *v5*tmp0; 3209 } 3210 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3211 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3212 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3213 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3214 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3215 break; 3216 default: 3217 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3218 } 3219 } 3220 3221 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3222 } 3223 its--; 3224 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3225 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 3226 } 3227 if (flag & SOR_EISENSTAT) { 3228 const PetscScalar *b; 3229 MatScalar *t = a->inode.ssor_work; 3230 3231 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3232 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3233 /* 3234 Apply (U + D)^-1 where D is now the block diagonal 3235 */ 3236 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3237 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3238 ibdiag -= sizes[i]*sizes[i]; 3239 sz = ii[row+1] - diag[row] - 1; 3240 v1 = a->a + diag[row] + 1; 3241 idx = a->j + diag[row] + 1; 3242 CHKMEMQ; 3243 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3244 switch (sizes[i]){ 3245 case 1: 3246 3247 sum1 = b[row]; 3248 for(n = 0; n<sz-1; n+=2) { 3249 i1 = idx[0]; 3250 i2 = idx[1]; 3251 idx += 2; 3252 tmp0 = x[i1]; 3253 tmp1 = x[i2]; 3254 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3255 } 3256 3257 if (n == sz-1){ 3258 tmp0 = x[*idx]; 3259 sum1 -= *v1*tmp0; 3260 } 3261 x[row] = sum1*(*ibdiag);row--; 3262 break; 3263 3264 case 2: 3265 3266 sum1 = b[row]; 3267 sum2 = b[row-1]; 3268 /* note that sum1 is associated with the second of the two rows */ 3269 v2 = a->a + diag[row-1] + 2; 3270 for(n = 0; n<sz-1; n+=2) { 3271 i1 = idx[0]; 3272 i2 = idx[1]; 3273 idx += 2; 3274 tmp0 = x[i1]; 3275 tmp1 = x[i2]; 3276 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3277 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3278 } 3279 3280 if (n == sz-1){ 3281 tmp0 = x[*idx]; 3282 sum1 -= *v1*tmp0; 3283 sum2 -= *v2*tmp0; 3284 } 3285 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3286 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3287 row -= 2; 3288 break; 3289 case 3: 3290 3291 sum1 = b[row]; 3292 sum2 = b[row-1]; 3293 sum3 = b[row-2]; 3294 v2 = a->a + diag[row-1] + 2; 3295 v3 = a->a + diag[row-2] + 3; 3296 for(n = 0; n<sz-1; n+=2) { 3297 i1 = idx[0]; 3298 i2 = idx[1]; 3299 idx += 2; 3300 tmp0 = x[i1]; 3301 tmp1 = x[i2]; 3302 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3303 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3304 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3305 } 3306 3307 if (n == sz-1){ 3308 tmp0 = x[*idx]; 3309 sum1 -= *v1*tmp0; 3310 sum2 -= *v2*tmp0; 3311 sum3 -= *v3*tmp0; 3312 } 3313 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3314 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3315 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3316 row -= 3; 3317 break; 3318 case 4: 3319 3320 sum1 = b[row]; 3321 sum2 = b[row-1]; 3322 sum3 = b[row-2]; 3323 sum4 = b[row-3]; 3324 v2 = a->a + diag[row-1] + 2; 3325 v3 = a->a + diag[row-2] + 3; 3326 v4 = a->a + diag[row-3] + 4; 3327 for(n = 0; n<sz-1; n+=2) { 3328 i1 = idx[0]; 3329 i2 = idx[1]; 3330 idx += 2; 3331 tmp0 = x[i1]; 3332 tmp1 = x[i2]; 3333 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3334 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3335 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3336 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3337 } 3338 3339 if (n == sz-1){ 3340 tmp0 = x[*idx]; 3341 sum1 -= *v1*tmp0; 3342 sum2 -= *v2*tmp0; 3343 sum3 -= *v3*tmp0; 3344 sum4 -= *v4*tmp0; 3345 } 3346 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3347 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3348 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3349 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3350 row -= 4; 3351 break; 3352 case 5: 3353 3354 sum1 = b[row]; 3355 sum2 = b[row-1]; 3356 sum3 = b[row-2]; 3357 sum4 = b[row-3]; 3358 sum5 = b[row-4]; 3359 v2 = a->a + diag[row-1] + 2; 3360 v3 = a->a + diag[row-2] + 3; 3361 v4 = a->a + diag[row-3] + 4; 3362 v5 = a->a + diag[row-4] + 5; 3363 for(n = 0; n<sz-1; n+=2) { 3364 i1 = idx[0]; 3365 i2 = idx[1]; 3366 idx += 2; 3367 tmp0 = x[i1]; 3368 tmp1 = x[i2]; 3369 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3370 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3371 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3372 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3373 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3374 } 3375 3376 if (n == sz-1){ 3377 tmp0 = x[*idx]; 3378 sum1 -= *v1*tmp0; 3379 sum2 -= *v2*tmp0; 3380 sum3 -= *v3*tmp0; 3381 sum4 -= *v4*tmp0; 3382 sum5 -= *v5*tmp0; 3383 } 3384 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3385 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3386 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3387 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3388 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3389 row -= 5; 3390 break; 3391 default: 3392 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3393 } 3394 CHKMEMQ; 3395 } 3396 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3397 3398 /* 3399 t = b - D x where D is the block diagonal 3400 */ 3401 cnt = 0; 3402 for (i=0, row=0; i<m; i++) { 3403 CHKMEMQ; 3404 switch (sizes[i]){ 3405 case 1: 3406 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3407 break; 3408 case 2: 3409 x1 = x[row]; x2 = x[row+1]; 3410 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3411 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3412 t[row] = b[row] - tmp1; 3413 t[row+1] = b[row+1] - tmp2; row += 2; 3414 cnt += 4; 3415 break; 3416 case 3: 3417 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3418 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3419 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3420 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3421 t[row] = b[row] - tmp1; 3422 t[row+1] = b[row+1] - tmp2; 3423 t[row+2] = b[row+2] - tmp3; row += 3; 3424 cnt += 9; 3425 break; 3426 case 4: 3427 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3428 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3429 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3430 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3431 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3432 t[row] = b[row] - tmp1; 3433 t[row+1] = b[row+1] - tmp2; 3434 t[row+2] = b[row+2] - tmp3; 3435 t[row+3] = b[row+3] - tmp4; row += 4; 3436 cnt += 16; 3437 break; 3438 case 5: 3439 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3440 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3441 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3442 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3443 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3444 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3445 t[row] = b[row] - tmp1; 3446 t[row+1] = b[row+1] - tmp2; 3447 t[row+2] = b[row+2] - tmp3; 3448 t[row+3] = b[row+3] - tmp4; 3449 t[row+4] = b[row+4] - tmp5;row += 5; 3450 cnt += 25; 3451 break; 3452 default: 3453 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3454 } 3455 CHKMEMQ; 3456 } 3457 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3458 3459 3460 3461 /* 3462 Apply (L + D)^-1 where D is the block diagonal 3463 */ 3464 for (i=0, row=0; i<m; i++) { 3465 sz = diag[row] - ii[row]; 3466 v1 = a->a + ii[row]; 3467 idx = a->j + ii[row]; 3468 CHKMEMQ; 3469 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3470 switch (sizes[i]){ 3471 case 1: 3472 3473 sum1 = t[row]; 3474 for(n = 0; n<sz-1; n+=2) { 3475 i1 = idx[0]; 3476 i2 = idx[1]; 3477 idx += 2; 3478 tmp0 = t[i1]; 3479 tmp1 = t[i2]; 3480 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3481 } 3482 3483 if (n == sz-1){ 3484 tmp0 = t[*idx]; 3485 sum1 -= *v1 * tmp0; 3486 } 3487 x[row] += t[row] = sum1*(*ibdiag++); row++; 3488 break; 3489 case 2: 3490 v2 = a->a + ii[row+1]; 3491 sum1 = t[row]; 3492 sum2 = t[row+1]; 3493 for(n = 0; n<sz-1; n+=2) { 3494 i1 = idx[0]; 3495 i2 = idx[1]; 3496 idx += 2; 3497 tmp0 = t[i1]; 3498 tmp1 = t[i2]; 3499 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3500 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3501 } 3502 3503 if (n == sz-1){ 3504 tmp0 = t[*idx]; 3505 sum1 -= v1[0] * tmp0; 3506 sum2 -= v2[0] * tmp0; 3507 } 3508 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3509 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3510 ibdiag += 4; row += 2; 3511 break; 3512 case 3: 3513 v2 = a->a + ii[row+1]; 3514 v3 = a->a + ii[row+2]; 3515 sum1 = t[row]; 3516 sum2 = t[row+1]; 3517 sum3 = t[row+2]; 3518 for(n = 0; n<sz-1; n+=2) { 3519 i1 = idx[0]; 3520 i2 = idx[1]; 3521 idx += 2; 3522 tmp0 = t[i1]; 3523 tmp1 = t[i2]; 3524 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3525 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3526 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3527 } 3528 3529 if (n == sz-1){ 3530 tmp0 = t[*idx]; 3531 sum1 -= v1[0] * tmp0; 3532 sum2 -= v2[0] * tmp0; 3533 sum3 -= v3[0] * tmp0; 3534 } 3535 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3536 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3537 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3538 ibdiag += 9; row += 3; 3539 break; 3540 case 4: 3541 v2 = a->a + ii[row+1]; 3542 v3 = a->a + ii[row+2]; 3543 v4 = a->a + ii[row+3]; 3544 sum1 = t[row]; 3545 sum2 = t[row+1]; 3546 sum3 = t[row+2]; 3547 sum4 = t[row+3]; 3548 for(n = 0; n<sz-1; n+=2) { 3549 i1 = idx[0]; 3550 i2 = idx[1]; 3551 idx += 2; 3552 tmp0 = t[i1]; 3553 tmp1 = t[i2]; 3554 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3555 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3556 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3557 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3558 } 3559 3560 if (n == sz-1){ 3561 tmp0 = t[*idx]; 3562 sum1 -= v1[0] * tmp0; 3563 sum2 -= v2[0] * tmp0; 3564 sum3 -= v3[0] * tmp0; 3565 sum4 -= v4[0] * tmp0; 3566 } 3567 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3568 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3569 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3570 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3571 ibdiag += 16; row += 4; 3572 break; 3573 case 5: 3574 v2 = a->a + ii[row+1]; 3575 v3 = a->a + ii[row+2]; 3576 v4 = a->a + ii[row+3]; 3577 v5 = a->a + ii[row+4]; 3578 sum1 = t[row]; 3579 sum2 = t[row+1]; 3580 sum3 = t[row+2]; 3581 sum4 = t[row+3]; 3582 sum5 = t[row+4]; 3583 for(n = 0; n<sz-1; n+=2) { 3584 i1 = idx[0]; 3585 i2 = idx[1]; 3586 idx += 2; 3587 tmp0 = t[i1]; 3588 tmp1 = t[i2]; 3589 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3590 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3591 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3592 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3593 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3594 } 3595 3596 if (n == sz-1){ 3597 tmp0 = t[*idx]; 3598 sum1 -= v1[0] * tmp0; 3599 sum2 -= v2[0] * tmp0; 3600 sum3 -= v3[0] * tmp0; 3601 sum4 -= v4[0] * tmp0; 3602 sum5 -= v5[0] * tmp0; 3603 } 3604 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3605 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3606 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3607 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3608 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3609 ibdiag += 25; row += 5; 3610 break; 3611 default: 3612 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3613 } 3614 CHKMEMQ; 3615 } 3616 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3617 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3618 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3619 } 3620 PetscFunctionReturn(0); 3621 } 3622 3623 #undef __FUNCT__ 3624 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3625 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3626 { 3627 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3628 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3629 const MatScalar *bdiag = a->inode.bdiag; 3630 const PetscScalar *b; 3631 PetscErrorCode ierr; 3632 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3633 const PetscInt *sizes = a->inode.size; 3634 3635 PetscFunctionBegin; 3636 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3637 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3638 cnt = 0; 3639 for (i=0, row=0; i<m; i++) { 3640 switch (sizes[i]){ 3641 case 1: 3642 x[row] = b[row]*bdiag[cnt++];row++; 3643 break; 3644 case 2: 3645 x1 = b[row]; x2 = b[row+1]; 3646 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3647 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3648 x[row++] = tmp1; 3649 x[row++] = tmp2; 3650 cnt += 4; 3651 break; 3652 case 3: 3653 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3654 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3655 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3656 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3657 x[row++] = tmp1; 3658 x[row++] = tmp2; 3659 x[row++] = tmp3; 3660 cnt += 9; 3661 break; 3662 case 4: 3663 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3664 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3665 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3666 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3667 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3668 x[row++] = tmp1; 3669 x[row++] = tmp2; 3670 x[row++] = tmp3; 3671 x[row++] = tmp4; 3672 cnt += 16; 3673 break; 3674 case 5: 3675 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3676 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3677 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3678 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3679 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3680 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3681 x[row++] = tmp1; 3682 x[row++] = tmp2; 3683 x[row++] = tmp3; 3684 x[row++] = tmp4; 3685 x[row++] = tmp5; 3686 cnt += 25; 3687 break; 3688 default: 3689 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3690 } 3691 } 3692 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3693 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3694 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3695 PetscFunctionReturn(0); 3696 } 3697 3698 /* 3699 samestructure indicates that the matrix has not changed its nonzero structure so we 3700 do not need to recompute the inodes 3701 */ 3702 #undef __FUNCT__ 3703 #define __FUNCT__ "Mat_CheckInode" 3704 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3705 { 3706 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3707 PetscErrorCode ierr; 3708 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3709 PetscTruth flag; 3710 3711 PetscFunctionBegin; 3712 if (!a->inode.use) PetscFunctionReturn(0); 3713 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3714 3715 3716 m = A->rmap->n; 3717 if (a->inode.size) {ns = a->inode.size;} 3718 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3719 3720 i = 0; 3721 node_count = 0; 3722 idx = a->j; 3723 ii = a->i; 3724 while (i < m){ /* For each row */ 3725 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3726 /* Limits the number of elements in a node to 'a->inode.limit' */ 3727 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3728 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3729 if (nzy != nzx) break; 3730 idy += nzx; /* Same nonzero pattern */ 3731 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3732 if (!flag) break; 3733 } 3734 ns[node_count++] = blk_size; 3735 idx += blk_size*nzx; 3736 i = j; 3737 } 3738 /* If not enough inodes found,, do not use inode version of the routines */ 3739 if (!a->inode.size && m && node_count > .9*m) { 3740 ierr = PetscFree(ns);CHKERRQ(ierr); 3741 a->inode.node_count = 0; 3742 a->inode.size = PETSC_NULL; 3743 a->inode.use = PETSC_FALSE; 3744 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3745 } else { 3746 if (!A->factortype) { 3747 A->ops->mult = MatMult_SeqAIJ_Inode; 3748 A->ops->sor = MatSOR_SeqAIJ_Inode; 3749 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3750 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3751 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3752 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3753 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3754 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3755 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3756 } else { 3757 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 3758 } 3759 a->inode.node_count = node_count; 3760 a->inode.size = ns; 3761 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3762 } 3763 PetscFunctionReturn(0); 3764 } 3765 3766 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3767 PetscInt __k, *__vi; \ 3768 __vi = aj + ai[row]; \ 3769 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3770 __vi = aj + adiag[row]; \ 3771 cols[nzl] = __vi[0];\ 3772 __vi = aj + adiag[row+1]+1;\ 3773 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3774 3775 3776 /* 3777 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3778 Modified from Mat_CheckInode(). 3779 3780 Input Parameters: 3781 + Mat A - ILU or LU matrix factor 3782 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3783 do not need to recompute the inodes 3784 */ 3785 #undef __FUNCT__ 3786 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3787 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3788 { 3789 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3790 PetscErrorCode ierr; 3791 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3792 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3793 PetscTruth flag; 3794 3795 PetscFunctionBegin; 3796 if (!a->inode.use) PetscFunctionReturn(0); 3797 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3798 3799 m = A->rmap->n; 3800 if (a->inode.size) {ns = a->inode.size;} 3801 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3802 3803 i = 0; 3804 node_count = 0; 3805 while (i < m){ /* For each row */ 3806 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3807 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3808 nzx = nzl1 + nzu1 + 1; 3809 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3810 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3811 3812 /* Limits the number of elements in a node to 'a->inode.limit' */ 3813 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3814 nzl2 = ai[j+1] - ai[j]; 3815 nzu2 = adiag[j] - adiag[j+1] - 1; 3816 nzy = nzl2 + nzu2 + 1; 3817 if( nzy != nzx) break; 3818 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3819 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3820 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3821 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3822 ierr = PetscFree(cols2);CHKERRQ(ierr); 3823 } 3824 ns[node_count++] = blk_size; 3825 ierr = PetscFree(cols1);CHKERRQ(ierr); 3826 i = j; 3827 } 3828 /* If not enough inodes found,, do not use inode version of the routines */ 3829 if (!a->inode.size && m && node_count > .9*m) { 3830 ierr = PetscFree(ns);CHKERRQ(ierr); 3831 a->inode.node_count = 0; 3832 a->inode.size = PETSC_NULL; 3833 a->inode.use = PETSC_FALSE; 3834 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3835 } else { 3836 A->ops->mult = 0; 3837 A->ops->sor = 0; 3838 A->ops->multadd = 0; 3839 A->ops->getrowij = 0; 3840 A->ops->restorerowij = 0; 3841 A->ops->getcolumnij = 0; 3842 A->ops->restorecolumnij = 0; 3843 A->ops->coloringpatch = 0; 3844 A->ops->multdiagonalblock = 0; 3845 A->ops->solve = MatSolve_SeqAIJ_Inode; 3846 a->inode.node_count = node_count; 3847 a->inode.size = ns; 3848 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3849 } 3850 PetscFunctionReturn(0); 3851 } 3852 3853 /* 3854 This is really ugly. if inodes are used this replaces the 3855 permutations with ones that correspond to rows/cols of the matrix 3856 rather then inode blocks 3857 */ 3858 #undef __FUNCT__ 3859 #define __FUNCT__ "MatInodeAdjustForInodes" 3860 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3861 { 3862 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3863 3864 PetscFunctionBegin; 3865 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3866 if (f) { 3867 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3868 } 3869 PetscFunctionReturn(0); 3870 } 3871 3872 EXTERN_C_BEGIN 3873 #undef __FUNCT__ 3874 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3875 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3876 { 3877 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3878 PetscErrorCode ierr; 3879 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3880 const PetscInt *ridx,*cidx; 3881 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3882 PetscInt nslim_col,*ns_col; 3883 IS ris = *rperm,cis = *cperm; 3884 3885 PetscFunctionBegin; 3886 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3887 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3888 3889 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3890 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3891 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3892 3893 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3894 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3895 3896 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3897 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3898 3899 /* Construct the permutations for rows*/ 3900 for (i=0,row = 0; i<nslim_row; ++i){ 3901 indx = ridx[i]; 3902 start_val = tns[indx]; 3903 end_val = tns[indx + 1]; 3904 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3905 } 3906 3907 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3908 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3909 3910 /* Construct permutations for columns */ 3911 for (i=0,col=0; i<nslim_col; ++i){ 3912 indx = cidx[i]; 3913 start_val = tns[indx]; 3914 end_val = tns[indx + 1]; 3915 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3916 } 3917 3918 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3919 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3920 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3921 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3922 3923 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3924 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3925 3926 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3927 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3928 ierr = ISDestroy(cis);CHKERRQ(ierr); 3929 ierr = ISDestroy(ris);CHKERRQ(ierr); 3930 ierr = PetscFree(tns);CHKERRQ(ierr); 3931 PetscFunctionReturn(0); 3932 } 3933 EXTERN_C_END 3934 3935 #undef __FUNCT__ 3936 #define __FUNCT__ "MatInodeGetInodeSizes" 3937 /*@C 3938 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3939 3940 Collective on Mat 3941 3942 Input Parameter: 3943 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3944 3945 Output Parameter: 3946 + node_count - no of inodes present in the matrix. 3947 . sizes - an array of size node_count,with sizes of each inode. 3948 - limit - the max size used to generate the inodes. 3949 3950 Level: advanced 3951 3952 Notes: This routine returns some internal storage information 3953 of the matrix, it is intended to be used by advanced users. 3954 It should be called after the matrix is assembled. 3955 The contents of the sizes[] array should not be changed. 3956 PETSC_NULL may be passed for information not requested. 3957 3958 .keywords: matrix, seqaij, get, inode 3959 3960 .seealso: MatGetInfo() 3961 @*/ 3962 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3963 { 3964 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3965 3966 PetscFunctionBegin; 3967 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3968 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3969 if (f) { 3970 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3971 } 3972 PetscFunctionReturn(0); 3973 } 3974 3975 EXTERN_C_BEGIN 3976 #undef __FUNCT__ 3977 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3978 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3979 { 3980 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3981 3982 PetscFunctionBegin; 3983 if (node_count) *node_count = a->inode.node_count; 3984 if (sizes) *sizes = a->inode.size; 3985 if (limit) *limit = a->inode.limit; 3986 PetscFunctionReturn(0); 3987 } 3988 EXTERN_C_END 3989