1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap.n; 19 m = A->rmap.n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap.n; 72 n = A->cmap.n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap.n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 243 if (symmetric) { 244 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 245 } else { 246 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 247 } 248 PetscFunctionReturn(0); 249 } 250 251 #undef __FUNCT__ 252 #define __FUNCT__ "MatRestoreRowIJ_Inode" 253 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 254 { 255 PetscErrorCode ierr; 256 257 PetscFunctionBegin; 258 if (!ia) PetscFunctionReturn(0); 259 ierr = PetscFree(*ia);CHKERRQ(ierr); 260 ierr = PetscFree(*ja);CHKERRQ(ierr); 261 PetscFunctionReturn(0); 262 } 263 264 /* ----------------------------------------------------------- */ 265 266 #undef __FUNCT__ 267 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 268 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 269 { 270 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 271 PetscErrorCode ierr; 272 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 273 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 274 275 PetscFunctionBegin; 276 nslim_row = a->inode.node_count; 277 n = A->cmap.n; 278 279 /* Create The column_inode for this matrix */ 280 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 281 282 /* allocate space for reformated column_inode structure */ 283 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 284 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 285 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 286 287 for (i1=0,col=0; i1<nslim_col; ++i1){ 288 nsz = ns_col[i1]; 289 for (i2=0; i2<nsz; ++i2,++col) 290 tvc[col] = i1; 291 } 292 /* allocate space for column pointers */ 293 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 294 *iia = ia; 295 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 296 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 297 298 /* determine the number of columns in each row */ 299 ia[0] = oshift; 300 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 301 j = aj + ai[row] + ishift; 302 col = *j++ + ishift; 303 i2 = tvc[col]; 304 nz = ai[row+1] - ai[row]; 305 while (nz-- > 0) { /* off-diagonal elemets */ 306 /* ia[i1+1]++; */ 307 ia[i2+1]++; 308 i2++; 309 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 310 if (nz > 0) i2 = tvc[col]; 311 } 312 } 313 314 /* shift ia[i] to point to next col */ 315 for (i1=1; i1<nslim_col+1; i1++) { 316 col = ia[i1-1]; 317 ia[i1] += col; 318 work[i1-1] = col - oshift; 319 } 320 321 /* allocate space for column pointers */ 322 nz = ia[nslim_col] + (!ishift); 323 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 324 *jja = ja; 325 326 /* loop over matrix putting into ja */ 327 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 328 j = aj + ai[row] + ishift; 329 i2 = 0; /* Col inode index */ 330 col = *j++ + ishift; 331 i2 = tvc[col]; 332 nz = ai[row+1] - ai[row]; 333 while (nz-- > 0) { 334 /* ja[work[i1]++] = i2 + oshift; */ 335 ja[work[i2]++] = i1 + oshift; 336 i2++; 337 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 338 if (nz > 0) i2 = tvc[col]; 339 } 340 } 341 ierr = PetscFree(ns_col);CHKERRQ(ierr); 342 ierr = PetscFree(work);CHKERRQ(ierr); 343 ierr = PetscFree(tns);CHKERRQ(ierr); 344 ierr = PetscFree(tvc);CHKERRQ(ierr); 345 PetscFunctionReturn(0); 346 } 347 348 #undef __FUNCT__ 349 #define __FUNCT__ "MatGetColumnIJ_Inode" 350 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 351 { 352 PetscErrorCode ierr; 353 354 PetscFunctionBegin; 355 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 356 if (!ia) PetscFunctionReturn(0); 357 358 if (symmetric) { 359 /* Since the indices are symmetric it does'nt matter */ 360 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 361 } else { 362 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 363 } 364 PetscFunctionReturn(0); 365 } 366 367 #undef __FUNCT__ 368 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 369 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 370 { 371 PetscErrorCode ierr; 372 373 PetscFunctionBegin; 374 if (!ia) PetscFunctionReturn(0); 375 ierr = PetscFree(*ia);CHKERRQ(ierr); 376 ierr = PetscFree(*ja);CHKERRQ(ierr); 377 PetscFunctionReturn(0); 378 } 379 380 /* ----------------------------------------------------------- */ 381 382 #undef __FUNCT__ 383 #define __FUNCT__ "MatMult_Inode" 384 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 385 { 386 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 387 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 388 PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y; 389 PetscErrorCode ierr; 390 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 391 392 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 393 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 394 #endif 395 396 PetscFunctionBegin; 397 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 398 node_max = a->inode.node_count; 399 ns = a->inode.size; /* Node Size array */ 400 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 401 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 402 idx = a->j; 403 v1 = a->a; 404 ii = a->i; 405 406 for (i = 0,row = 0; i< node_max; ++i){ 407 nsz = ns[i]; 408 n = ii[1] - ii[0]; 409 ii += nsz; 410 sz = n; /* No of non zeros in this row */ 411 /* Switch on the size of Node */ 412 switch (nsz){ /* Each loop in 'case' is unrolled */ 413 case 1 : 414 sum1 = 0; 415 416 for(n = 0; n< sz-1; n+=2) { 417 i1 = idx[0]; /* The instructions are ordered to */ 418 i2 = idx[1]; /* make the compiler's job easy */ 419 idx += 2; 420 tmp0 = x[i1]; 421 tmp1 = x[i2]; 422 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 423 } 424 425 if (n == sz-1){ /* Take care of the last nonzero */ 426 tmp0 = x[*idx++]; 427 sum1 += *v1++ * tmp0; 428 } 429 y[row++]=sum1; 430 break; 431 case 2: 432 sum1 = 0; 433 sum2 = 0; 434 v2 = v1 + n; 435 436 for (n = 0; n< sz-1; n+=2) { 437 i1 = idx[0]; 438 i2 = idx[1]; 439 idx += 2; 440 tmp0 = x[i1]; 441 tmp1 = x[i2]; 442 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 443 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 444 } 445 if (n == sz-1){ 446 tmp0 = x[*idx++]; 447 sum1 += *v1++ * tmp0; 448 sum2 += *v2++ * tmp0; 449 } 450 y[row++]=sum1; 451 y[row++]=sum2; 452 v1 =v2; /* Since the next block to be processed starts there*/ 453 idx +=sz; 454 break; 455 case 3: 456 sum1 = 0; 457 sum2 = 0; 458 sum3 = 0; 459 v2 = v1 + n; 460 v3 = v2 + n; 461 462 for (n = 0; n< sz-1; n+=2) { 463 i1 = idx[0]; 464 i2 = idx[1]; 465 idx += 2; 466 tmp0 = x[i1]; 467 tmp1 = x[i2]; 468 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 469 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 470 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 471 } 472 if (n == sz-1){ 473 tmp0 = x[*idx++]; 474 sum1 += *v1++ * tmp0; 475 sum2 += *v2++ * tmp0; 476 sum3 += *v3++ * tmp0; 477 } 478 y[row++]=sum1; 479 y[row++]=sum2; 480 y[row++]=sum3; 481 v1 =v3; /* Since the next block to be processed starts there*/ 482 idx +=2*sz; 483 break; 484 case 4: 485 sum1 = 0; 486 sum2 = 0; 487 sum3 = 0; 488 sum4 = 0; 489 v2 = v1 + n; 490 v3 = v2 + n; 491 v4 = v3 + n; 492 493 for (n = 0; n< sz-1; n+=2) { 494 i1 = idx[0]; 495 i2 = idx[1]; 496 idx += 2; 497 tmp0 = x[i1]; 498 tmp1 = x[i2]; 499 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 500 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 501 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 502 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 503 } 504 if (n == sz-1){ 505 tmp0 = x[*idx++]; 506 sum1 += *v1++ * tmp0; 507 sum2 += *v2++ * tmp0; 508 sum3 += *v3++ * tmp0; 509 sum4 += *v4++ * tmp0; 510 } 511 y[row++]=sum1; 512 y[row++]=sum2; 513 y[row++]=sum3; 514 y[row++]=sum4; 515 v1 =v4; /* Since the next block to be processed starts there*/ 516 idx +=3*sz; 517 break; 518 case 5: 519 sum1 = 0; 520 sum2 = 0; 521 sum3 = 0; 522 sum4 = 0; 523 sum5 = 0; 524 v2 = v1 + n; 525 v3 = v2 + n; 526 v4 = v3 + n; 527 v5 = v4 + n; 528 529 for (n = 0; n<sz-1; n+=2) { 530 i1 = idx[0]; 531 i2 = idx[1]; 532 idx += 2; 533 tmp0 = x[i1]; 534 tmp1 = x[i2]; 535 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 536 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 537 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 538 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 539 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 540 } 541 if (n == sz-1){ 542 tmp0 = x[*idx++]; 543 sum1 += *v1++ * tmp0; 544 sum2 += *v2++ * tmp0; 545 sum3 += *v3++ * tmp0; 546 sum4 += *v4++ * tmp0; 547 sum5 += *v5++ * tmp0; 548 } 549 y[row++]=sum1; 550 y[row++]=sum2; 551 y[row++]=sum3; 552 y[row++]=sum4; 553 y[row++]=sum5; 554 v1 =v5; /* Since the next block to be processed starts there */ 555 idx +=4*sz; 556 break; 557 default : 558 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 559 } 560 } 561 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 562 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 563 ierr = PetscLogFlops(2*a->nz - A->rmap.n);CHKERRQ(ierr); 564 PetscFunctionReturn(0); 565 } 566 /* ----------------------------------------------------------- */ 567 /* Almost same code as the MatMult_Inode() */ 568 #undef __FUNCT__ 569 #define __FUNCT__ "MatMultAdd_Inode" 570 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 571 { 572 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 573 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 574 PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt; 575 PetscErrorCode ierr; 576 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 577 578 PetscFunctionBegin; 579 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 580 node_max = a->inode.node_count; 581 ns = a->inode.size; /* Node Size array */ 582 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 584 if (zz != yy) { 585 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 586 } else { 587 z = y; 588 } 589 zt = z; 590 591 idx = a->j; 592 v1 = a->a; 593 ii = a->i; 594 595 for (i = 0,row = 0; i< node_max; ++i){ 596 nsz = ns[i]; 597 n = ii[1] - ii[0]; 598 ii += nsz; 599 sz = n; /* No of non zeros in this row */ 600 /* Switch on the size of Node */ 601 switch (nsz){ /* Each loop in 'case' is unrolled */ 602 case 1 : 603 sum1 = *zt++; 604 605 for(n = 0; n< sz-1; n+=2) { 606 i1 = idx[0]; /* The instructions are ordered to */ 607 i2 = idx[1]; /* make the compiler's job easy */ 608 idx += 2; 609 tmp0 = x[i1]; 610 tmp1 = x[i2]; 611 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 612 } 613 614 if(n == sz-1){ /* Take care of the last nonzero */ 615 tmp0 = x[*idx++]; 616 sum1 += *v1++ * tmp0; 617 } 618 y[row++]=sum1; 619 break; 620 case 2: 621 sum1 = *zt++; 622 sum2 = *zt++; 623 v2 = v1 + n; 624 625 for(n = 0; n< sz-1; n+=2) { 626 i1 = idx[0]; 627 i2 = idx[1]; 628 idx += 2; 629 tmp0 = x[i1]; 630 tmp1 = x[i2]; 631 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 632 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 633 } 634 if(n == sz-1){ 635 tmp0 = x[*idx++]; 636 sum1 += *v1++ * tmp0; 637 sum2 += *v2++ * tmp0; 638 } 639 y[row++]=sum1; 640 y[row++]=sum2; 641 v1 =v2; /* Since the next block to be processed starts there*/ 642 idx +=sz; 643 break; 644 case 3: 645 sum1 = *zt++; 646 sum2 = *zt++; 647 sum3 = *zt++; 648 v2 = v1 + n; 649 v3 = v2 + n; 650 651 for (n = 0; n< sz-1; n+=2) { 652 i1 = idx[0]; 653 i2 = idx[1]; 654 idx += 2; 655 tmp0 = x[i1]; 656 tmp1 = x[i2]; 657 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 658 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 659 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 660 } 661 if (n == sz-1){ 662 tmp0 = x[*idx++]; 663 sum1 += *v1++ * tmp0; 664 sum2 += *v2++ * tmp0; 665 sum3 += *v3++ * tmp0; 666 } 667 y[row++]=sum1; 668 y[row++]=sum2; 669 y[row++]=sum3; 670 v1 =v3; /* Since the next block to be processed starts there*/ 671 idx +=2*sz; 672 break; 673 case 4: 674 sum1 = *zt++; 675 sum2 = *zt++; 676 sum3 = *zt++; 677 sum4 = *zt++; 678 v2 = v1 + n; 679 v3 = v2 + n; 680 v4 = v3 + n; 681 682 for (n = 0; n< sz-1; n+=2) { 683 i1 = idx[0]; 684 i2 = idx[1]; 685 idx += 2; 686 tmp0 = x[i1]; 687 tmp1 = x[i2]; 688 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 689 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 690 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 691 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 692 } 693 if (n == sz-1){ 694 tmp0 = x[*idx++]; 695 sum1 += *v1++ * tmp0; 696 sum2 += *v2++ * tmp0; 697 sum3 += *v3++ * tmp0; 698 sum4 += *v4++ * tmp0; 699 } 700 y[row++]=sum1; 701 y[row++]=sum2; 702 y[row++]=sum3; 703 y[row++]=sum4; 704 v1 =v4; /* Since the next block to be processed starts there*/ 705 idx +=3*sz; 706 break; 707 case 5: 708 sum1 = *zt++; 709 sum2 = *zt++; 710 sum3 = *zt++; 711 sum4 = *zt++; 712 sum5 = *zt++; 713 v2 = v1 + n; 714 v3 = v2 + n; 715 v4 = v3 + n; 716 v5 = v4 + n; 717 718 for (n = 0; n<sz-1; n+=2) { 719 i1 = idx[0]; 720 i2 = idx[1]; 721 idx += 2; 722 tmp0 = x[i1]; 723 tmp1 = x[i2]; 724 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 725 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 726 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 727 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 728 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 729 } 730 if(n == sz-1){ 731 tmp0 = x[*idx++]; 732 sum1 += *v1++ * tmp0; 733 sum2 += *v2++ * tmp0; 734 sum3 += *v3++ * tmp0; 735 sum4 += *v4++ * tmp0; 736 sum5 += *v5++ * tmp0; 737 } 738 y[row++]=sum1; 739 y[row++]=sum2; 740 y[row++]=sum3; 741 y[row++]=sum4; 742 y[row++]=sum5; 743 v1 =v5; /* Since the next block to be processed starts there */ 744 idx +=4*sz; 745 break; 746 default : 747 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 748 } 749 } 750 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 751 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 752 if (zz != yy) { 753 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 754 } 755 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 756 PetscFunctionReturn(0); 757 } 758 759 /* ----------------------------------------------------------- */ 760 #undef __FUNCT__ 761 #define __FUNCT__ "MatSolve_Inode" 762 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 763 { 764 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 765 IS iscol = a->col,isrow = a->row; 766 PetscErrorCode ierr; 767 PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j; 768 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout; 769 PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1; 770 PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5; 771 772 PetscFunctionBegin; 773 if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix"); 774 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 775 node_max = a->inode.node_count; 776 ns = a->inode.size; /* Node Size array */ 777 778 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 779 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 780 tmp = a->solve_work; 781 782 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 783 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 784 785 /* forward solve the lower triangular */ 786 tmps = tmp ; 787 aa = a_a ; 788 aj = a_j ; 789 ad = a->diag; 790 791 for (i = 0,row = 0; i< node_max; ++i){ 792 nsz = ns[i]; 793 aii = ai[row]; 794 v1 = aa + aii; 795 vi = aj + aii; 796 nz = ad[row]- aii; 797 798 switch (nsz){ /* Each loop in 'case' is unrolled */ 799 case 1 : 800 sum1 = b[*r++]; 801 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 802 for(j=0; j<nz-1; j+=2){ 803 i0 = vi[0]; 804 i1 = vi[1]; 805 vi +=2; 806 tmp0 = tmps[i0]; 807 tmp1 = tmps[i1]; 808 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 809 } 810 if(j == nz-1){ 811 tmp0 = tmps[*vi++]; 812 sum1 -= *v1++ *tmp0; 813 } 814 tmp[row ++]=sum1; 815 break; 816 case 2: 817 sum1 = b[*r++]; 818 sum2 = b[*r++]; 819 v2 = aa + ai[row+1]; 820 821 for(j=0; j<nz-1; j+=2){ 822 i0 = vi[0]; 823 i1 = vi[1]; 824 vi +=2; 825 tmp0 = tmps[i0]; 826 tmp1 = tmps[i1]; 827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 828 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 829 } 830 if(j == nz-1){ 831 tmp0 = tmps[*vi++]; 832 sum1 -= *v1++ *tmp0; 833 sum2 -= *v2++ *tmp0; 834 } 835 sum2 -= *v2++ * sum1; 836 tmp[row ++]=sum1; 837 tmp[row ++]=sum2; 838 break; 839 case 3: 840 sum1 = b[*r++]; 841 sum2 = b[*r++]; 842 sum3 = b[*r++]; 843 v2 = aa + ai[row+1]; 844 v3 = aa + ai[row+2]; 845 846 for (j=0; j<nz-1; j+=2){ 847 i0 = vi[0]; 848 i1 = vi[1]; 849 vi +=2; 850 tmp0 = tmps[i0]; 851 tmp1 = tmps[i1]; 852 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 853 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 854 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 855 } 856 if (j == nz-1){ 857 tmp0 = tmps[*vi++]; 858 sum1 -= *v1++ *tmp0; 859 sum2 -= *v2++ *tmp0; 860 sum3 -= *v3++ *tmp0; 861 } 862 sum2 -= *v2++ * sum1; 863 sum3 -= *v3++ * sum1; 864 sum3 -= *v3++ * sum2; 865 tmp[row ++]=sum1; 866 tmp[row ++]=sum2; 867 tmp[row ++]=sum3; 868 break; 869 870 case 4: 871 sum1 = b[*r++]; 872 sum2 = b[*r++]; 873 sum3 = b[*r++]; 874 sum4 = b[*r++]; 875 v2 = aa + ai[row+1]; 876 v3 = aa + ai[row+2]; 877 v4 = aa + ai[row+3]; 878 879 for (j=0; j<nz-1; j+=2){ 880 i0 = vi[0]; 881 i1 = vi[1]; 882 vi +=2; 883 tmp0 = tmps[i0]; 884 tmp1 = tmps[i1]; 885 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 886 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 887 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 888 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 889 } 890 if (j == nz-1){ 891 tmp0 = tmps[*vi++]; 892 sum1 -= *v1++ *tmp0; 893 sum2 -= *v2++ *tmp0; 894 sum3 -= *v3++ *tmp0; 895 sum4 -= *v4++ *tmp0; 896 } 897 sum2 -= *v2++ * sum1; 898 sum3 -= *v3++ * sum1; 899 sum4 -= *v4++ * sum1; 900 sum3 -= *v3++ * sum2; 901 sum4 -= *v4++ * sum2; 902 sum4 -= *v4++ * sum3; 903 904 tmp[row ++]=sum1; 905 tmp[row ++]=sum2; 906 tmp[row ++]=sum3; 907 tmp[row ++]=sum4; 908 break; 909 case 5: 910 sum1 = b[*r++]; 911 sum2 = b[*r++]; 912 sum3 = b[*r++]; 913 sum4 = b[*r++]; 914 sum5 = b[*r++]; 915 v2 = aa + ai[row+1]; 916 v3 = aa + ai[row+2]; 917 v4 = aa + ai[row+3]; 918 v5 = aa + ai[row+4]; 919 920 for (j=0; j<nz-1; j+=2){ 921 i0 = vi[0]; 922 i1 = vi[1]; 923 vi +=2; 924 tmp0 = tmps[i0]; 925 tmp1 = tmps[i1]; 926 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 927 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 928 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 929 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 930 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 931 } 932 if (j == nz-1){ 933 tmp0 = tmps[*vi++]; 934 sum1 -= *v1++ *tmp0; 935 sum2 -= *v2++ *tmp0; 936 sum3 -= *v3++ *tmp0; 937 sum4 -= *v4++ *tmp0; 938 sum5 -= *v5++ *tmp0; 939 } 940 941 sum2 -= *v2++ * sum1; 942 sum3 -= *v3++ * sum1; 943 sum4 -= *v4++ * sum1; 944 sum5 -= *v5++ * sum1; 945 sum3 -= *v3++ * sum2; 946 sum4 -= *v4++ * sum2; 947 sum5 -= *v5++ * sum2; 948 sum4 -= *v4++ * sum3; 949 sum5 -= *v5++ * sum3; 950 sum5 -= *v5++ * sum4; 951 952 tmp[row ++]=sum1; 953 tmp[row ++]=sum2; 954 tmp[row ++]=sum3; 955 tmp[row ++]=sum4; 956 tmp[row ++]=sum5; 957 break; 958 default: 959 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 960 } 961 } 962 /* backward solve the upper triangular */ 963 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 964 nsz = ns[i]; 965 aii = ai[row+1] -1; 966 v1 = aa + aii; 967 vi = aj + aii; 968 nz = aii- ad[row]; 969 switch (nsz){ /* Each loop in 'case' is unrolled */ 970 case 1 : 971 sum1 = tmp[row]; 972 973 for(j=nz ; j>1; j-=2){ 974 vi -=2; 975 i0 = vi[2]; 976 i1 = vi[1]; 977 tmp0 = tmps[i0]; 978 tmp1 = tmps[i1]; 979 v1 -= 2; 980 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 981 } 982 if (j==1){ 983 tmp0 = tmps[*vi--]; 984 sum1 -= *v1-- * tmp0; 985 } 986 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 987 break; 988 case 2 : 989 sum1 = tmp[row]; 990 sum2 = tmp[row -1]; 991 v2 = aa + ai[row]-1; 992 for (j=nz ; j>1; j-=2){ 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 v2 -= 2; 1000 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1001 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1002 } 1003 if (j==1){ 1004 tmp0 = tmps[*vi--]; 1005 sum1 -= *v1-- * tmp0; 1006 sum2 -= *v2-- * tmp0; 1007 } 1008 1009 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1010 sum2 -= *v2-- * tmp0; 1011 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1012 break; 1013 case 3 : 1014 sum1 = tmp[row]; 1015 sum2 = tmp[row -1]; 1016 sum3 = tmp[row -2]; 1017 v2 = aa + ai[row]-1; 1018 v3 = aa + ai[row -1]-1; 1019 for (j=nz ; j>1; j-=2){ 1020 vi -=2; 1021 i0 = vi[2]; 1022 i1 = vi[1]; 1023 tmp0 = tmps[i0]; 1024 tmp1 = tmps[i1]; 1025 v1 -= 2; 1026 v2 -= 2; 1027 v3 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1031 } 1032 if (j==1){ 1033 tmp0 = tmps[*vi--]; 1034 sum1 -= *v1-- * tmp0; 1035 sum2 -= *v2-- * tmp0; 1036 sum3 -= *v3-- * tmp0; 1037 } 1038 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1039 sum2 -= *v2-- * tmp0; 1040 sum3 -= *v3-- * tmp0; 1041 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1042 sum3 -= *v3-- * tmp0; 1043 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1044 1045 break; 1046 case 4 : 1047 sum1 = tmp[row]; 1048 sum2 = tmp[row -1]; 1049 sum3 = tmp[row -2]; 1050 sum4 = tmp[row -3]; 1051 v2 = aa + ai[row]-1; 1052 v3 = aa + ai[row -1]-1; 1053 v4 = aa + ai[row -2]-1; 1054 1055 for (j=nz ; j>1; j-=2){ 1056 vi -=2; 1057 i0 = vi[2]; 1058 i1 = vi[1]; 1059 tmp0 = tmps[i0]; 1060 tmp1 = tmps[i1]; 1061 v1 -= 2; 1062 v2 -= 2; 1063 v3 -= 2; 1064 v4 -= 2; 1065 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1066 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1067 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1068 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1069 } 1070 if (j==1){ 1071 tmp0 = tmps[*vi--]; 1072 sum1 -= *v1-- * tmp0; 1073 sum2 -= *v2-- * tmp0; 1074 sum3 -= *v3-- * tmp0; 1075 sum4 -= *v4-- * tmp0; 1076 } 1077 1078 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1079 sum2 -= *v2-- * tmp0; 1080 sum3 -= *v3-- * tmp0; 1081 sum4 -= *v4-- * tmp0; 1082 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1083 sum3 -= *v3-- * tmp0; 1084 sum4 -= *v4-- * tmp0; 1085 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1086 sum4 -= *v4-- * tmp0; 1087 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1088 break; 1089 case 5 : 1090 sum1 = tmp[row]; 1091 sum2 = tmp[row -1]; 1092 sum3 = tmp[row -2]; 1093 sum4 = tmp[row -3]; 1094 sum5 = tmp[row -4]; 1095 v2 = aa + ai[row]-1; 1096 v3 = aa + ai[row -1]-1; 1097 v4 = aa + ai[row -2]-1; 1098 v5 = aa + ai[row -3]-1; 1099 for (j=nz ; j>1; j-=2){ 1100 vi -= 2; 1101 i0 = vi[2]; 1102 i1 = vi[1]; 1103 tmp0 = tmps[i0]; 1104 tmp1 = tmps[i1]; 1105 v1 -= 2; 1106 v2 -= 2; 1107 v3 -= 2; 1108 v4 -= 2; 1109 v5 -= 2; 1110 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1111 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1112 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1113 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1114 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1115 } 1116 if (j==1){ 1117 tmp0 = tmps[*vi--]; 1118 sum1 -= *v1-- * tmp0; 1119 sum2 -= *v2-- * tmp0; 1120 sum3 -= *v3-- * tmp0; 1121 sum4 -= *v4-- * tmp0; 1122 sum5 -= *v5-- * tmp0; 1123 } 1124 1125 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1126 sum2 -= *v2-- * tmp0; 1127 sum3 -= *v3-- * tmp0; 1128 sum4 -= *v4-- * tmp0; 1129 sum5 -= *v5-- * tmp0; 1130 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1131 sum3 -= *v3-- * tmp0; 1132 sum4 -= *v4-- * tmp0; 1133 sum5 -= *v5-- * tmp0; 1134 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1135 sum4 -= *v4-- * tmp0; 1136 sum5 -= *v5-- * tmp0; 1137 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1138 sum5 -= *v5-- * tmp0; 1139 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1140 break; 1141 default: 1142 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1143 } 1144 } 1145 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1146 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1147 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1148 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1149 ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr); 1150 PetscFunctionReturn(0); 1151 } 1152 1153 #undef __FUNCT__ 1154 #define __FUNCT__ "MatLUFactorNumeric_Inode" 1155 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B) 1156 { 1157 Mat C = *B; 1158 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1159 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1160 PetscErrorCode ierr; 1161 PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i; 1162 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1163 PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1164 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1165 PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3; 1166 PetscScalar tmp,*ba = b->a,*aa = a->a,*pv; 1167 PetscReal rs=0.0; 1168 LUShift_Ctx sctx; 1169 PetscInt newshift; 1170 1171 PetscFunctionBegin; 1172 sctx.shift_top = 0; 1173 sctx.nshift_max = 0; 1174 sctx.shift_lo = 0; 1175 sctx.shift_hi = 0; 1176 1177 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1178 if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0; 1179 if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1180 sctx.shift_top = 0; 1181 for (i=0; i<n; i++) { 1182 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1183 rs = 0.0; 1184 ajtmp = aj + ai[i]; 1185 rtmp1 = aa + ai[i]; 1186 nz = ai[i+1] - ai[i]; 1187 for (j=0; j<nz; j++){ 1188 if (*ajtmp != i){ 1189 rs += PetscAbsScalar(*rtmp1++); 1190 } else { 1191 rs -= PetscRealPart(*rtmp1++); 1192 } 1193 ajtmp++; 1194 } 1195 if (rs>sctx.shift_top) sctx.shift_top = rs; 1196 } 1197 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1198 sctx.shift_top *= 1.1; 1199 sctx.nshift_max = 5; 1200 sctx.shift_lo = 0.; 1201 sctx.shift_hi = 1.; 1202 } 1203 sctx.shift_amount = 0; 1204 sctx.nshift = 0; 1205 1206 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1207 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1208 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1209 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1210 ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1211 ics = ic ; 1212 rtmp2 = rtmp1 + n; 1213 rtmp3 = rtmp2 + n; 1214 1215 node_max = a->inode.node_count; 1216 ns = a->inode.size ; 1217 if (!ns){ 1218 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1219 } 1220 1221 /* If max inode size > 3, split it into two inodes.*/ 1222 /* also map the inode sizes according to the ordering */ 1223 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1224 for (i=0,j=0; i<node_max; ++i,++j){ 1225 if (ns[i]>3) { 1226 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1227 ++j; 1228 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1229 } else { 1230 tmp_vec1[j] = ns[i]; 1231 } 1232 } 1233 /* Use the correct node_max */ 1234 node_max = j; 1235 1236 /* Now reorder the inode info based on mat re-ordering info */ 1237 /* First create a row -> inode_size_array_index map */ 1238 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1239 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1240 for (i=0,row=0; i<node_max; i++) { 1241 nodesz = tmp_vec1[i]; 1242 for (j=0; j<nodesz; j++,row++) { 1243 nsmap[row] = i; 1244 } 1245 } 1246 /* Using nsmap, create a reordered ns structure */ 1247 for (i=0,j=0; i< node_max; i++) { 1248 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1249 tmp_vec2[i] = nodesz; 1250 j += nodesz; 1251 } 1252 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1253 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1254 /* Now use the correct ns */ 1255 ns = tmp_vec2; 1256 1257 do { 1258 sctx.lushift = PETSC_FALSE; 1259 /* Now loop over each block-row, and do the factorization */ 1260 for (i=0,row=0; i<node_max; i++) { 1261 nodesz = ns[i]; 1262 nz = bi[row+1] - bi[row]; 1263 bjtmp = bj + bi[row]; 1264 1265 switch (nodesz){ 1266 case 1: 1267 for (j=0; j<nz; j++){ 1268 idx = bjtmp[j]; 1269 rtmp1[idx] = 0.0; 1270 } 1271 1272 /* load in initial (unfactored row) */ 1273 idx = r[row]; 1274 nz_tmp = ai[idx+1] - ai[idx]; 1275 ajtmp = aj + ai[idx]; 1276 v1 = aa + ai[idx]; 1277 1278 for (j=0; j<nz_tmp; j++) { 1279 idx = ics[ajtmp[j]]; 1280 rtmp1[idx] = v1[j]; 1281 } 1282 rtmp1[ics[r[row]]] += sctx.shift_amount; 1283 1284 prow = *bjtmp++ ; 1285 while (prow < row) { 1286 pc1 = rtmp1 + prow; 1287 if (*pc1 != 0.0){ 1288 pv = ba + bd[prow]; 1289 pj = nbj + bd[prow]; 1290 mul1 = *pc1 * *pv++; 1291 *pc1 = mul1; 1292 nz_tmp = bi[prow+1] - bd[prow] - 1; 1293 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1294 for (j=0; j<nz_tmp; j++) { 1295 tmp = pv[j]; 1296 idx = pj[j]; 1297 rtmp1[idx] -= mul1 * tmp; 1298 } 1299 } 1300 prow = *bjtmp++ ; 1301 } 1302 pj = bj + bi[row]; 1303 pc1 = ba + bi[row]; 1304 1305 sctx.pv = rtmp1[row]; 1306 rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */ 1307 rs = 0.0; 1308 for (j=0; j<nz; j++) { 1309 idx = pj[j]; 1310 pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */ 1311 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1312 } 1313 sctx.rs = rs; 1314 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1315 if (newshift == 1) goto endofwhile; 1316 break; 1317 1318 case 2: 1319 for (j=0; j<nz; j++) { 1320 idx = bjtmp[j]; 1321 rtmp1[idx] = 0.0; 1322 rtmp2[idx] = 0.0; 1323 } 1324 1325 /* load in initial (unfactored row) */ 1326 idx = r[row]; 1327 nz_tmp = ai[idx+1] - ai[idx]; 1328 ajtmp = aj + ai[idx]; 1329 v1 = aa + ai[idx]; 1330 v2 = aa + ai[idx+1]; 1331 for (j=0; j<nz_tmp; j++) { 1332 idx = ics[ajtmp[j]]; 1333 rtmp1[idx] = v1[j]; 1334 rtmp2[idx] = v2[j]; 1335 } 1336 rtmp1[ics[r[row]]] += sctx.shift_amount; 1337 rtmp2[ics[r[row+1]]] += sctx.shift_amount; 1338 1339 prow = *bjtmp++ ; 1340 while (prow < row) { 1341 pc1 = rtmp1 + prow; 1342 pc2 = rtmp2 + prow; 1343 if (*pc1 != 0.0 || *pc2 != 0.0){ 1344 pv = ba + bd[prow]; 1345 pj = nbj + bd[prow]; 1346 mul1 = *pc1 * *pv; 1347 mul2 = *pc2 * *pv; 1348 ++pv; 1349 *pc1 = mul1; 1350 *pc2 = mul2; 1351 1352 nz_tmp = bi[prow+1] - bd[prow] - 1; 1353 for (j=0; j<nz_tmp; j++) { 1354 tmp = pv[j]; 1355 idx = pj[j]; 1356 rtmp1[idx] -= mul1 * tmp; 1357 rtmp2[idx] -= mul2 * tmp; 1358 } 1359 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1360 } 1361 prow = *bjtmp++ ; 1362 } 1363 1364 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1365 pc1 = rtmp1 + prow; 1366 pc2 = rtmp2 + prow; 1367 1368 sctx.pv = *pc1; 1369 pj = bj + bi[prow]; 1370 rs = 0.0; 1371 for (j=0; j<nz; j++){ 1372 idx = pj[j]; 1373 if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]); 1374 } 1375 sctx.rs = rs; 1376 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1377 if (newshift == 1) goto endofwhile; 1378 1379 if (*pc2 != 0.0){ 1380 pj = nbj + bd[prow]; 1381 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1382 *pc2 = mul2; 1383 nz_tmp = bi[prow+1] - bd[prow] - 1; 1384 for (j=0; j<nz_tmp; j++) { 1385 idx = pj[j] ; 1386 tmp = rtmp1[idx]; 1387 rtmp2[idx] -= mul2 * tmp; 1388 } 1389 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1390 } 1391 1392 pj = bj + bi[row]; 1393 pc1 = ba + bi[row]; 1394 pc2 = ba + bi[row+1]; 1395 1396 sctx.pv = rtmp2[row+1]; 1397 rs = 0.0; 1398 rtmp1[row] = 1.0/rtmp1[row]; 1399 rtmp2[row+1] = 1.0/rtmp2[row+1]; 1400 /* copy row entries from dense representation to sparse */ 1401 for (j=0; j<nz; j++) { 1402 idx = pj[j]; 1403 pc1[j] = rtmp1[idx]; 1404 pc2[j] = rtmp2[idx]; 1405 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1406 } 1407 sctx.rs = rs; 1408 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1409 if (newshift == 1) goto endofwhile; 1410 break; 1411 1412 case 3: 1413 for (j=0; j<nz; j++) { 1414 idx = bjtmp[j]; 1415 rtmp1[idx] = 0.0; 1416 rtmp2[idx] = 0.0; 1417 rtmp3[idx] = 0.0; 1418 } 1419 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1420 idx = r[row]; 1421 nz_tmp = ai[idx+1] - ai[idx]; 1422 ajtmp = aj + ai[idx]; 1423 v1 = aa + ai[idx]; 1424 v2 = aa + ai[idx+1]; 1425 v3 = aa + ai[idx+2]; 1426 for (j=0; j<nz_tmp; j++) { 1427 idx = ics[ajtmp[j]]; 1428 rtmp1[idx] = v1[j]; 1429 rtmp2[idx] = v2[j]; 1430 rtmp3[idx] = v3[j]; 1431 } 1432 rtmp1[ics[r[row]]] += sctx.shift_amount; 1433 rtmp2[ics[r[row+1]]] += sctx.shift_amount; 1434 rtmp3[ics[r[row+2]]] += sctx.shift_amount; 1435 1436 /* loop over all pivot row blocks above this row block */ 1437 prow = *bjtmp++ ; 1438 while (prow < row) { 1439 pc1 = rtmp1 + prow; 1440 pc2 = rtmp2 + prow; 1441 pc3 = rtmp3 + prow; 1442 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1443 pv = ba + bd[prow]; 1444 pj = nbj + bd[prow]; 1445 mul1 = *pc1 * *pv; 1446 mul2 = *pc2 * *pv; 1447 mul3 = *pc3 * *pv; 1448 ++pv; 1449 *pc1 = mul1; 1450 *pc2 = mul2; 1451 *pc3 = mul3; 1452 1453 nz_tmp = bi[prow+1] - bd[prow] - 1; 1454 /* update this row based on pivot row */ 1455 for (j=0; j<nz_tmp; j++) { 1456 tmp = pv[j]; 1457 idx = pj[j]; 1458 rtmp1[idx] -= mul1 * tmp; 1459 rtmp2[idx] -= mul2 * tmp; 1460 rtmp3[idx] -= mul3 * tmp; 1461 } 1462 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1463 } 1464 prow = *bjtmp++ ; 1465 } 1466 1467 /* Now take care of diagonal 3x3 block in this set of rows */ 1468 /* note: prow = row here */ 1469 pc1 = rtmp1 + prow; 1470 pc2 = rtmp2 + prow; 1471 pc3 = rtmp3 + prow; 1472 1473 sctx.pv = *pc1; 1474 pj = bj + bi[prow]; 1475 rs = 0.0; 1476 for (j=0; j<nz; j++){ 1477 idx = pj[j]; 1478 if (idx != row) rs += PetscAbsScalar(rtmp1[idx]); 1479 } 1480 sctx.rs = rs; 1481 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1482 if (newshift == 1) goto endofwhile; 1483 1484 if (*pc2 != 0.0 || *pc3 != 0.0){ 1485 mul2 = (*pc2)/(*pc1); 1486 mul3 = (*pc3)/(*pc1); 1487 *pc2 = mul2; 1488 *pc3 = mul3; 1489 nz_tmp = bi[prow+1] - bd[prow] - 1; 1490 pj = nbj + bd[prow]; 1491 for (j=0; j<nz_tmp; j++) { 1492 idx = pj[j] ; 1493 tmp = rtmp1[idx]; 1494 rtmp2[idx] -= mul2 * tmp; 1495 rtmp3[idx] -= mul3 * tmp; 1496 } 1497 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1498 } 1499 ++prow; 1500 1501 pc2 = rtmp2 + prow; 1502 pc3 = rtmp3 + prow; 1503 sctx.pv = *pc2; 1504 pj = bj + bi[prow]; 1505 rs = 0.0; 1506 for (j=0; j<nz; j++){ 1507 idx = pj[j]; 1508 if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]); 1509 } 1510 sctx.rs = rs; 1511 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1512 if (newshift == 1) goto endofwhile; 1513 1514 if (*pc3 != 0.0){ 1515 mul3 = (*pc3)/(*pc2); 1516 *pc3 = mul3; 1517 pj = nbj + bd[prow]; 1518 nz_tmp = bi[prow+1] - bd[prow] - 1; 1519 for (j=0; j<nz_tmp; j++) { 1520 idx = pj[j] ; 1521 tmp = rtmp2[idx]; 1522 rtmp3[idx] -= mul3 * tmp; 1523 } 1524 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1525 } 1526 1527 pj = bj + bi[row]; 1528 pc1 = ba + bi[row]; 1529 pc2 = ba + bi[row+1]; 1530 pc3 = ba + bi[row+2]; 1531 1532 sctx.pv = rtmp3[row+2]; 1533 rs = 0.0; 1534 rtmp1[row] = 1.0/rtmp1[row]; 1535 rtmp2[row+1] = 1.0/rtmp2[row+1]; 1536 rtmp3[row+2] = 1.0/rtmp3[row+2]; 1537 /* copy row entries from dense representation to sparse */ 1538 for (j=0; j<nz; j++) { 1539 idx = pj[j]; 1540 pc1[j] = rtmp1[idx]; 1541 pc2[j] = rtmp2[idx]; 1542 pc3[j] = rtmp3[idx]; 1543 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1544 } 1545 1546 sctx.rs = rs; 1547 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1548 if (newshift == 1) goto endofwhile; 1549 break; 1550 1551 default: 1552 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1553 } 1554 row += nodesz; /* Update the row */ 1555 } 1556 endofwhile:; 1557 } while (sctx.lushift); 1558 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1559 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1560 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1561 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1562 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1563 C->factor = FACTOR_LU; 1564 C->assembled = PETSC_TRUE; 1565 if (sctx.nshift) { 1566 if (info->shiftnz) { 1567 ierr = PetscInfo2(0,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1568 } else if (info->shiftpd) { 1569 ierr = PetscInfo4(0,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1570 } 1571 } 1572 ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr); 1573 PetscFunctionReturn(0); 1574 } 1575 1576 /* 1577 Makes a longer coloring[] array and calls the usual code with that 1578 */ 1579 #undef __FUNCT__ 1580 #define __FUNCT__ "MatColoringPatch_Inode" 1581 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1582 { 1583 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1584 PetscErrorCode ierr; 1585 PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1586 PetscInt *colorused,i; 1587 ISColoringValue *newcolor; 1588 1589 PetscFunctionBegin; 1590 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1591 /* loop over inodes, marking a color for each column*/ 1592 row = 0; 1593 for (i=0; i<m; i++){ 1594 for (j=0; j<ns[i]; j++) { 1595 newcolor[row++] = coloring[i] + j*ncolors; 1596 } 1597 } 1598 1599 /* eliminate unneeded colors */ 1600 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1601 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1602 for (i=0; i<n; i++) { 1603 colorused[newcolor[i]] = 1; 1604 } 1605 1606 for (i=1; i<5*ncolors; i++) { 1607 colorused[i] += colorused[i-1]; 1608 } 1609 ncolors = colorused[5*ncolors-1]; 1610 for (i=0; i<n; i++) { 1611 newcolor[i] = colorused[newcolor[i]]; 1612 } 1613 ierr = PetscFree(colorused);CHKERRQ(ierr); 1614 ierr = ISColoringCreate(mat->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1615 ierr = PetscFree(coloring);CHKERRQ(ierr); 1616 PetscFunctionReturn(0); 1617 } 1618 1619 /* 1620 samestructure indicates that the matrix has not changed its nonzero structure so we 1621 do not need to recompute the inodes 1622 */ 1623 #undef __FUNCT__ 1624 #define __FUNCT__ "Mat_CheckInode" 1625 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 1626 { 1627 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1628 PetscErrorCode ierr; 1629 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 1630 PetscTruth flag; 1631 1632 PetscFunctionBegin; 1633 if (!a->inode.use) PetscFunctionReturn(0); 1634 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 1635 1636 1637 m = A->rmap.n; 1638 if (a->inode.size) {ns = a->inode.size;} 1639 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 1640 1641 i = 0; 1642 node_count = 0; 1643 idx = a->j; 1644 ii = a->i; 1645 while (i < m){ /* For each row */ 1646 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 1647 /* Limits the number of elements in a node to 'a->inode.limit' */ 1648 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 1649 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 1650 if (nzy != nzx) break; 1651 idy += nzx; /* Same nonzero pattern */ 1652 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 1653 if (!flag) break; 1654 } 1655 ns[node_count++] = blk_size; 1656 idx += blk_size*nzx; 1657 i = j; 1658 } 1659 /* If not enough inodes found,, do not use inode version of the routines */ 1660 if (!a->inode.size && m && node_count > 0.9*m) { 1661 ierr = PetscFree(ns);CHKERRQ(ierr); 1662 a->inode.node_count = 0; 1663 a->inode.size = PETSC_NULL; 1664 a->inode.use = PETSC_FALSE; 1665 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 1666 } else { 1667 A->ops->mult = MatMult_Inode; 1668 A->ops->multadd = MatMultAdd_Inode; 1669 A->ops->solve = MatSolve_Inode; 1670 A->ops->lufactornumeric = MatLUFactorNumeric_Inode; 1671 A->ops->getrowij = MatGetRowIJ_Inode; 1672 A->ops->restorerowij = MatRestoreRowIJ_Inode; 1673 A->ops->getcolumnij = MatGetColumnIJ_Inode; 1674 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 1675 A->ops->coloringpatch = MatColoringPatch_Inode; 1676 a->inode.node_count = node_count; 1677 a->inode.size = ns; 1678 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 1679 } 1680 PetscFunctionReturn(0); 1681 } 1682 1683 /* 1684 This is really ugly. if inodes are used this replaces the 1685 permutations with ones that correspond to rows/cols of the matrix 1686 rather then inode blocks 1687 */ 1688 #undef __FUNCT__ 1689 #define __FUNCT__ "MatInodeAdjustForInodes" 1690 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 1691 { 1692 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 1693 1694 PetscFunctionBegin; 1695 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 1696 if (f) { 1697 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 1698 } 1699 PetscFunctionReturn(0); 1700 } 1701 1702 EXTERN_C_BEGIN 1703 #undef __FUNCT__ 1704 #define __FUNCT__ "MatAdjustForInodes_Inode" 1705 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 1706 { 1707 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 1708 PetscErrorCode ierr; 1709 PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count; 1710 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 1711 PetscInt nslim_col,*ns_col; 1712 IS ris = *rperm,cis = *cperm; 1713 1714 PetscFunctionBegin; 1715 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 1716 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 1717 1718 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 1719 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 1720 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 1721 permc = permr + m; 1722 1723 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 1724 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 1725 1726 /* Form the inode structure for the rows of permuted matric using inv perm*/ 1727 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 1728 1729 /* Construct the permutations for rows*/ 1730 for (i=0,row = 0; i<nslim_row; ++i){ 1731 indx = ridx[i]; 1732 start_val = tns[indx]; 1733 end_val = tns[indx + 1]; 1734 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 1735 } 1736 1737 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 1738 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 1739 1740 /* Construct permutations for columns */ 1741 for (i=0,col=0; i<nslim_col; ++i){ 1742 indx = cidx[i]; 1743 start_val = tns[indx]; 1744 end_val = tns[indx + 1]; 1745 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 1746 } 1747 1748 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 1749 ISSetPermutation(*rperm); 1750 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 1751 ISSetPermutation(*cperm); 1752 1753 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 1754 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 1755 1756 ierr = PetscFree(ns_col);CHKERRQ(ierr); 1757 ierr = PetscFree(permr);CHKERRQ(ierr); 1758 ierr = ISDestroy(cis);CHKERRQ(ierr); 1759 ierr = ISDestroy(ris);CHKERRQ(ierr); 1760 ierr = PetscFree(tns);CHKERRQ(ierr); 1761 PetscFunctionReturn(0); 1762 } 1763 EXTERN_C_END 1764 1765 #undef __FUNCT__ 1766 #define __FUNCT__ "MatInodeGetInodeSizes" 1767 /*@C 1768 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 1769 1770 Collective on Mat 1771 1772 Input Parameter: 1773 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 1774 1775 Output Parameter: 1776 + node_count - no of inodes present in the matrix. 1777 . sizes - an array of size node_count,with sizes of each inode. 1778 - limit - the max size used to generate the inodes. 1779 1780 Level: advanced 1781 1782 Notes: This routine returns some internal storage information 1783 of the matrix, it is intended to be used by advanced users. 1784 It should be called after the matrix is assembled. 1785 The contents of the sizes[] array should not be changed. 1786 PETSC_NULL may be passed for information not requested. 1787 1788 .keywords: matrix, seqaij, get, inode 1789 1790 .seealso: MatGetInfo() 1791 @*/ 1792 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 1793 { 1794 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 1795 1796 PetscFunctionBegin; 1797 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 1798 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 1799 if (f) { 1800 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 1801 } 1802 PetscFunctionReturn(0); 1803 } 1804 1805 EXTERN_C_BEGIN 1806 #undef __FUNCT__ 1807 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 1808 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 1809 { 1810 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1811 1812 PetscFunctionBegin; 1813 if (node_count) *node_count = a->inode.node_count; 1814 if (sizes) *sizes = a->inode.size; 1815 if (limit) *limit = a->inode.limit; 1816 PetscFunctionReturn(0); 1817 } 1818 EXTERN_C_END 1819