1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap.n; 19 m = A->rmap.n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap.n; 72 n = A->cmap.n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap.n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap.n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_Inode" 397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 sz = n; /* No of non zeros in this row */ 427 /* Switch on the size of Node */ 428 switch (nsz){ /* Each loop in 'case' is unrolled */ 429 case 1 : 430 sum1 = 0; 431 432 for(n = 0; n< sz-1; n+=2) { 433 i1 = idx[0]; /* The instructions are ordered to */ 434 i2 = idx[1]; /* make the compiler's job easy */ 435 idx += 2; 436 tmp0 = x[i1]; 437 tmp1 = x[i2]; 438 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 439 } 440 441 if (n == sz-1){ /* Take care of the last nonzero */ 442 tmp0 = x[*idx++]; 443 sum1 += *v1++ * tmp0; 444 } 445 y[row++]=sum1; 446 break; 447 case 2: 448 sum1 = 0; 449 sum2 = 0; 450 v2 = v1 + n; 451 452 for (n = 0; n< sz-1; n+=2) { 453 i1 = idx[0]; 454 i2 = idx[1]; 455 idx += 2; 456 tmp0 = x[i1]; 457 tmp1 = x[i2]; 458 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 459 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 460 } 461 if (n == sz-1){ 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 } 466 y[row++]=sum1; 467 y[row++]=sum2; 468 v1 =v2; /* Since the next block to be processed starts there*/ 469 idx +=sz; 470 break; 471 case 3: 472 sum1 = 0; 473 sum2 = 0; 474 sum3 = 0; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 478 for (n = 0; n< sz-1; n+=2) { 479 i1 = idx[0]; 480 i2 = idx[1]; 481 idx += 2; 482 tmp0 = x[i1]; 483 tmp1 = x[i2]; 484 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 485 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 487 } 488 if (n == sz-1){ 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 } 494 y[row++]=sum1; 495 y[row++]=sum2; 496 y[row++]=sum3; 497 v1 =v3; /* Since the next block to be processed starts there*/ 498 idx +=2*sz; 499 break; 500 case 4: 501 sum1 = 0; 502 sum2 = 0; 503 sum3 = 0; 504 sum4 = 0; 505 v2 = v1 + n; 506 v3 = v2 + n; 507 v4 = v3 + n; 508 509 for (n = 0; n< sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 } 520 if (n == sz-1){ 521 tmp0 = x[*idx++]; 522 sum1 += *v1++ * tmp0; 523 sum2 += *v2++ * tmp0; 524 sum3 += *v3++ * tmp0; 525 sum4 += *v4++ * tmp0; 526 } 527 y[row++]=sum1; 528 y[row++]=sum2; 529 y[row++]=sum3; 530 y[row++]=sum4; 531 v1 =v4; /* Since the next block to be processed starts there*/ 532 idx +=3*sz; 533 break; 534 case 5: 535 sum1 = 0; 536 sum2 = 0; 537 sum3 = 0; 538 sum4 = 0; 539 sum5 = 0; 540 v2 = v1 + n; 541 v3 = v2 + n; 542 v4 = v3 + n; 543 v5 = v4 + n; 544 545 for (n = 0; n<sz-1; n+=2) { 546 i1 = idx[0]; 547 i2 = idx[1]; 548 idx += 2; 549 tmp0 = x[i1]; 550 tmp1 = x[i2]; 551 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 552 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 553 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 554 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 555 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 556 } 557 if (n == sz-1){ 558 tmp0 = x[*idx++]; 559 sum1 += *v1++ * tmp0; 560 sum2 += *v2++ * tmp0; 561 sum3 += *v3++ * tmp0; 562 sum4 += *v4++ * tmp0; 563 sum5 += *v5++ * tmp0; 564 } 565 y[row++]=sum1; 566 y[row++]=sum2; 567 y[row++]=sum3; 568 y[row++]=sum4; 569 y[row++]=sum5; 570 v1 =v5; /* Since the next block to be processed starts there */ 571 idx +=4*sz; 572 break; 573 default : 574 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 575 } 576 } 577 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 578 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2*a->nz - nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 /* ----------------------------------------------------------- */ 583 /* Almost same code as the MatMult_Inode() */ 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultAdd_Inode" 586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 590 MatScalar *v1,*v2,*v3,*v4,*v5; 591 PetscScalar *x,*y,*z,*zt; 592 PetscErrorCode ierr; 593 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 594 595 PetscFunctionBegin; 596 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 597 node_max = a->inode.node_count; 598 ns = a->inode.size; /* Node Size array */ 599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 601 if (zz != yy) { 602 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 603 } else { 604 z = y; 605 } 606 zt = z; 607 608 idx = a->j; 609 v1 = a->a; 610 ii = a->i; 611 612 for (i = 0,row = 0; i< node_max; ++i){ 613 nsz = ns[i]; 614 n = ii[1] - ii[0]; 615 ii += nsz; 616 sz = n; /* No of non zeros in this row */ 617 /* Switch on the size of Node */ 618 switch (nsz){ /* Each loop in 'case' is unrolled */ 619 case 1 : 620 sum1 = *zt++; 621 622 for(n = 0; n< sz-1; n+=2) { 623 i1 = idx[0]; /* The instructions are ordered to */ 624 i2 = idx[1]; /* make the compiler's job easy */ 625 idx += 2; 626 tmp0 = x[i1]; 627 tmp1 = x[i2]; 628 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 629 } 630 631 if(n == sz-1){ /* Take care of the last nonzero */ 632 tmp0 = x[*idx++]; 633 sum1 += *v1++ * tmp0; 634 } 635 y[row++]=sum1; 636 break; 637 case 2: 638 sum1 = *zt++; 639 sum2 = *zt++; 640 v2 = v1 + n; 641 642 for(n = 0; n< sz-1; n+=2) { 643 i1 = idx[0]; 644 i2 = idx[1]; 645 idx += 2; 646 tmp0 = x[i1]; 647 tmp1 = x[i2]; 648 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 649 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 650 } 651 if(n == sz-1){ 652 tmp0 = x[*idx++]; 653 sum1 += *v1++ * tmp0; 654 sum2 += *v2++ * tmp0; 655 } 656 y[row++]=sum1; 657 y[row++]=sum2; 658 v1 =v2; /* Since the next block to be processed starts there*/ 659 idx +=sz; 660 break; 661 case 3: 662 sum1 = *zt++; 663 sum2 = *zt++; 664 sum3 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 677 } 678 if (n == sz-1){ 679 tmp0 = x[*idx++]; 680 sum1 += *v1++ * tmp0; 681 sum2 += *v2++ * tmp0; 682 sum3 += *v3++ * tmp0; 683 } 684 y[row++]=sum1; 685 y[row++]=sum2; 686 y[row++]=sum3; 687 v1 =v3; /* Since the next block to be processed starts there*/ 688 idx +=2*sz; 689 break; 690 case 4: 691 sum1 = *zt++; 692 sum2 = *zt++; 693 sum3 = *zt++; 694 sum4 = *zt++; 695 v2 = v1 + n; 696 v3 = v2 + n; 697 v4 = v3 + n; 698 699 for (n = 0; n< sz-1; n+=2) { 700 i1 = idx[0]; 701 i2 = idx[1]; 702 idx += 2; 703 tmp0 = x[i1]; 704 tmp1 = x[i2]; 705 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 706 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 707 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 708 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 709 } 710 if (n == sz-1){ 711 tmp0 = x[*idx++]; 712 sum1 += *v1++ * tmp0; 713 sum2 += *v2++ * tmp0; 714 sum3 += *v3++ * tmp0; 715 sum4 += *v4++ * tmp0; 716 } 717 y[row++]=sum1; 718 y[row++]=sum2; 719 y[row++]=sum3; 720 y[row++]=sum4; 721 v1 =v4; /* Since the next block to be processed starts there*/ 722 idx +=3*sz; 723 break; 724 case 5: 725 sum1 = *zt++; 726 sum2 = *zt++; 727 sum3 = *zt++; 728 sum4 = *zt++; 729 sum5 = *zt++; 730 v2 = v1 + n; 731 v3 = v2 + n; 732 v4 = v3 + n; 733 v5 = v4 + n; 734 735 for (n = 0; n<sz-1; n+=2) { 736 i1 = idx[0]; 737 i2 = idx[1]; 738 idx += 2; 739 tmp0 = x[i1]; 740 tmp1 = x[i2]; 741 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 742 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 743 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 744 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 745 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 746 } 747 if(n == sz-1){ 748 tmp0 = x[*idx++]; 749 sum1 += *v1++ * tmp0; 750 sum2 += *v2++ * tmp0; 751 sum3 += *v3++ * tmp0; 752 sum4 += *v4++ * tmp0; 753 sum5 += *v5++ * tmp0; 754 } 755 y[row++]=sum1; 756 y[row++]=sum2; 757 y[row++]=sum3; 758 y[row++]=sum4; 759 y[row++]=sum5; 760 v1 =v5; /* Since the next block to be processed starts there */ 761 idx +=4*sz; 762 break; 763 default : 764 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 765 } 766 } 767 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 768 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 769 if (zz != yy) { 770 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 771 } 772 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 /* ----------------------------------------------------------- */ 777 #undef __FUNCT__ 778 #define __FUNCT__ "MatSolve_Inode" 779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 780 { 781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 782 IS iscol = a->col,isrow = a->row; 783 PetscErrorCode ierr; 784 PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j; 785 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout; 786 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 787 PetscScalar sum1,sum2,sum3,sum4,sum5; 788 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 789 const PetscScalar *b; 790 791 PetscFunctionBegin; 792 if (A->factor != MAT_FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix"); 793 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp ; 806 aa = a_a ; 807 aj = a_j ; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i){ 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 817 switch (nsz){ /* Each loop in 'case' is unrolled */ 818 case 1 : 819 sum1 = b[*r++]; 820 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 821 for(j=0; j<nz-1; j+=2){ 822 i0 = vi[0]; 823 i1 = vi[1]; 824 vi +=2; 825 tmp0 = tmps[i0]; 826 tmp1 = tmps[i1]; 827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 828 } 829 if(j == nz-1){ 830 tmp0 = tmps[*vi++]; 831 sum1 -= *v1++ *tmp0; 832 } 833 tmp[row ++]=sum1; 834 break; 835 case 2: 836 sum1 = b[*r++]; 837 sum2 = b[*r++]; 838 v2 = aa + ai[row+1]; 839 840 for(j=0; j<nz-1; j+=2){ 841 i0 = vi[0]; 842 i1 = vi[1]; 843 vi +=2; 844 tmp0 = tmps[i0]; 845 tmp1 = tmps[i1]; 846 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 847 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 848 } 849 if(j == nz-1){ 850 tmp0 = tmps[*vi++]; 851 sum1 -= *v1++ *tmp0; 852 sum2 -= *v2++ *tmp0; 853 } 854 sum2 -= *v2++ * sum1; 855 tmp[row ++]=sum1; 856 tmp[row ++]=sum2; 857 break; 858 case 3: 859 sum1 = b[*r++]; 860 sum2 = b[*r++]; 861 sum3 = b[*r++]; 862 v2 = aa + ai[row+1]; 863 v3 = aa + ai[row+2]; 864 865 for (j=0; j<nz-1; j+=2){ 866 i0 = vi[0]; 867 i1 = vi[1]; 868 vi +=2; 869 tmp0 = tmps[i0]; 870 tmp1 = tmps[i1]; 871 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 872 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 873 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 874 } 875 if (j == nz-1){ 876 tmp0 = tmps[*vi++]; 877 sum1 -= *v1++ *tmp0; 878 sum2 -= *v2++ *tmp0; 879 sum3 -= *v3++ *tmp0; 880 } 881 sum2 -= *v2++ * sum1; 882 sum3 -= *v3++ * sum1; 883 sum3 -= *v3++ * sum2; 884 tmp[row ++]=sum1; 885 tmp[row ++]=sum2; 886 tmp[row ++]=sum3; 887 break; 888 889 case 4: 890 sum1 = b[*r++]; 891 sum2 = b[*r++]; 892 sum3 = b[*r++]; 893 sum4 = b[*r++]; 894 v2 = aa + ai[row+1]; 895 v3 = aa + ai[row+2]; 896 v4 = aa + ai[row+3]; 897 898 for (j=0; j<nz-1; j+=2){ 899 i0 = vi[0]; 900 i1 = vi[1]; 901 vi +=2; 902 tmp0 = tmps[i0]; 903 tmp1 = tmps[i1]; 904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 908 } 909 if (j == nz-1){ 910 tmp0 = tmps[*vi++]; 911 sum1 -= *v1++ *tmp0; 912 sum2 -= *v2++ *tmp0; 913 sum3 -= *v3++ *tmp0; 914 sum4 -= *v4++ *tmp0; 915 } 916 sum2 -= *v2++ * sum1; 917 sum3 -= *v3++ * sum1; 918 sum4 -= *v4++ * sum1; 919 sum3 -= *v3++ * sum2; 920 sum4 -= *v4++ * sum2; 921 sum4 -= *v4++ * sum3; 922 923 tmp[row ++]=sum1; 924 tmp[row ++]=sum2; 925 tmp[row ++]=sum3; 926 tmp[row ++]=sum4; 927 break; 928 case 5: 929 sum1 = b[*r++]; 930 sum2 = b[*r++]; 931 sum3 = b[*r++]; 932 sum4 = b[*r++]; 933 sum5 = b[*r++]; 934 v2 = aa + ai[row+1]; 935 v3 = aa + ai[row+2]; 936 v4 = aa + ai[row+3]; 937 v5 = aa + ai[row+4]; 938 939 for (j=0; j<nz-1; j+=2){ 940 i0 = vi[0]; 941 i1 = vi[1]; 942 vi +=2; 943 tmp0 = tmps[i0]; 944 tmp1 = tmps[i1]; 945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 950 } 951 if (j == nz-1){ 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ *tmp0; 954 sum2 -= *v2++ *tmp0; 955 sum3 -= *v3++ *tmp0; 956 sum4 -= *v4++ *tmp0; 957 sum5 -= *v5++ *tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row ++]=sum1; 972 tmp[row ++]=sum2; 973 tmp[row ++]=sum3; 974 tmp[row ++]=sum4; 975 tmp[row ++]=sum5; 976 break; 977 default: 978 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 983 nsz = ns[i]; 984 aii = ai[row+1] -1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii- ad[row]; 988 switch (nsz){ /* Each loop in 'case' is unrolled */ 989 case 1 : 990 sum1 = tmp[row]; 991 992 for(j=nz ; j>1; j-=2){ 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j==1){ 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1006 break; 1007 case 2 : 1008 sum1 = tmp[row]; 1009 sum2 = tmp[row -1]; 1010 v2 = aa + ai[row]-1; 1011 for (j=nz ; j>1; j-=2){ 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1020 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1021 } 1022 if (j==1){ 1023 tmp0 = tmps[*vi--]; 1024 sum1 -= *v1-- * tmp0; 1025 sum2 -= *v2-- * tmp0; 1026 } 1027 1028 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1031 break; 1032 case 3 : 1033 sum1 = tmp[row]; 1034 sum2 = tmp[row -1]; 1035 sum3 = tmp[row -2]; 1036 v2 = aa + ai[row]-1; 1037 v3 = aa + ai[row -1]-1; 1038 for (j=nz ; j>1; j-=2){ 1039 vi -=2; 1040 i0 = vi[2]; 1041 i1 = vi[1]; 1042 tmp0 = tmps[i0]; 1043 tmp1 = tmps[i1]; 1044 v1 -= 2; 1045 v2 -= 2; 1046 v3 -= 2; 1047 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1048 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1049 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1050 } 1051 if (j==1){ 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 } 1057 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1061 sum3 -= *v3-- * tmp0; 1062 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1063 1064 break; 1065 case 4 : 1066 sum1 = tmp[row]; 1067 sum2 = tmp[row -1]; 1068 sum3 = tmp[row -2]; 1069 sum4 = tmp[row -3]; 1070 v2 = aa + ai[row]-1; 1071 v3 = aa + ai[row -1]-1; 1072 v4 = aa + ai[row -2]-1; 1073 1074 for (j=nz ; j>1; j-=2){ 1075 vi -=2; 1076 i0 = vi[2]; 1077 i1 = vi[1]; 1078 tmp0 = tmps[i0]; 1079 tmp1 = tmps[i1]; 1080 v1 -= 2; 1081 v2 -= 2; 1082 v3 -= 2; 1083 v4 -= 2; 1084 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1085 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1086 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1087 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1088 } 1089 if (j==1){ 1090 tmp0 = tmps[*vi--]; 1091 sum1 -= *v1-- * tmp0; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 } 1096 1097 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1105 sum4 -= *v4-- * tmp0; 1106 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1107 break; 1108 case 5 : 1109 sum1 = tmp[row]; 1110 sum2 = tmp[row -1]; 1111 sum3 = tmp[row -2]; 1112 sum4 = tmp[row -3]; 1113 sum5 = tmp[row -4]; 1114 v2 = aa + ai[row]-1; 1115 v3 = aa + ai[row -1]-1; 1116 v4 = aa + ai[row -2]-1; 1117 v5 = aa + ai[row -3]-1; 1118 for (j=nz ; j>1; j-=2){ 1119 vi -= 2; 1120 i0 = vi[2]; 1121 i1 = vi[1]; 1122 tmp0 = tmps[i0]; 1123 tmp1 = tmps[i1]; 1124 v1 -= 2; 1125 v2 -= 2; 1126 v3 -= 2; 1127 v4 -= 2; 1128 v5 -= 2; 1129 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1130 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1131 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1132 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1133 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1134 } 1135 if (j==1){ 1136 tmp0 = tmps[*vi--]; 1137 sum1 -= *v1-- * tmp0; 1138 sum2 -= *v2-- * tmp0; 1139 sum3 -= *v3-- * tmp0; 1140 sum4 -= *v4-- * tmp0; 1141 sum5 -= *v5-- * tmp0; 1142 } 1143 1144 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1157 sum5 -= *v5-- * tmp0; 1158 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1159 break; 1160 default: 1161 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1162 } 1163 } 1164 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1165 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1166 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatLUFactorNumeric_Inode" 1174 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B) 1175 { 1176 Mat C = *B; 1177 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1178 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1179 PetscErrorCode ierr; 1180 PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i; 1181 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1182 PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1183 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1184 PetscScalar mul1,mul2,mul3,tmp; 1185 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1186 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1187 PetscReal rs=0.0; 1188 LUShift_Ctx sctx; 1189 PetscInt newshift; 1190 1191 PetscFunctionBegin; 1192 sctx.shift_top = 0; 1193 sctx.nshift_max = 0; 1194 sctx.shift_lo = 0; 1195 sctx.shift_hi = 0; 1196 1197 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1198 if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0; 1199 if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1200 sctx.shift_top = 0; 1201 for (i=0; i<n; i++) { 1202 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1203 rs = 0.0; 1204 ajtmp = aj + ai[i]; 1205 rtmp1 = aa + ai[i]; 1206 nz = ai[i+1] - ai[i]; 1207 for (j=0; j<nz; j++){ 1208 if (*ajtmp != i){ 1209 rs += PetscAbsScalar(*rtmp1++); 1210 } else { 1211 rs -= PetscRealPart(*rtmp1++); 1212 } 1213 ajtmp++; 1214 } 1215 if (rs>sctx.shift_top) sctx.shift_top = rs; 1216 } 1217 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1218 sctx.shift_top *= 1.1; 1219 sctx.nshift_max = 5; 1220 sctx.shift_lo = 0.; 1221 sctx.shift_hi = 1.; 1222 } 1223 sctx.shift_amount = 0; 1224 sctx.nshift = 0; 1225 1226 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1227 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1228 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1229 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1230 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1231 ics = ic ; 1232 rtmp22 = rtmp11 + n; 1233 rtmp33 = rtmp22 + n; 1234 1235 node_max = a->inode.node_count; 1236 ns = a->inode.size; 1237 if (!ns){ 1238 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1239 } 1240 1241 /* If max inode size > 3, split it into two inodes.*/ 1242 /* also map the inode sizes according to the ordering */ 1243 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1244 for (i=0,j=0; i<node_max; ++i,++j){ 1245 if (ns[i]>3) { 1246 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1247 ++j; 1248 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1249 } else { 1250 tmp_vec1[j] = ns[i]; 1251 } 1252 } 1253 /* Use the correct node_max */ 1254 node_max = j; 1255 1256 /* Now reorder the inode info based on mat re-ordering info */ 1257 /* First create a row -> inode_size_array_index map */ 1258 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1259 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1260 for (i=0,row=0; i<node_max; i++) { 1261 nodesz = tmp_vec1[i]; 1262 for (j=0; j<nodesz; j++,row++) { 1263 nsmap[row] = i; 1264 } 1265 } 1266 /* Using nsmap, create a reordered ns structure */ 1267 for (i=0,j=0; i< node_max; i++) { 1268 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1269 tmp_vec2[i] = nodesz; 1270 j += nodesz; 1271 } 1272 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1273 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1274 /* Now use the correct ns */ 1275 ns = tmp_vec2; 1276 1277 do { 1278 sctx.lushift = PETSC_FALSE; 1279 /* Now loop over each block-row, and do the factorization */ 1280 for (i=0,row=0; i<node_max; i++) { 1281 nodesz = ns[i]; 1282 nz = bi[row+1] - bi[row]; 1283 bjtmp = bj + bi[row]; 1284 1285 switch (nodesz){ 1286 case 1: 1287 for (j=0; j<nz; j++){ 1288 idx = bjtmp[j]; 1289 rtmp11[idx] = 0.0; 1290 } 1291 1292 /* load in initial (unfactored row) */ 1293 idx = r[row]; 1294 nz_tmp = ai[idx+1] - ai[idx]; 1295 ajtmp = aj + ai[idx]; 1296 v1 = aa + ai[idx]; 1297 1298 for (j=0; j<nz_tmp; j++) { 1299 idx = ics[ajtmp[j]]; 1300 rtmp11[idx] = v1[j]; 1301 } 1302 rtmp11[ics[r[row]]] += sctx.shift_amount; 1303 1304 prow = *bjtmp++ ; 1305 while (prow < row) { 1306 pc1 = rtmp11 + prow; 1307 if (*pc1 != 0.0){ 1308 pv = ba + bd[prow]; 1309 pj = nbj + bd[prow]; 1310 mul1 = *pc1 * *pv++; 1311 *pc1 = mul1; 1312 nz_tmp = bi[prow+1] - bd[prow] - 1; 1313 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1314 for (j=0; j<nz_tmp; j++) { 1315 tmp = pv[j]; 1316 idx = pj[j]; 1317 rtmp11[idx] -= mul1 * tmp; 1318 } 1319 } 1320 prow = *bjtmp++ ; 1321 } 1322 pj = bj + bi[row]; 1323 pc1 = ba + bi[row]; 1324 1325 sctx.pv = rtmp11[row]; 1326 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1327 rs = 0.0; 1328 for (j=0; j<nz; j++) { 1329 idx = pj[j]; 1330 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1331 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1332 } 1333 sctx.rs = rs; 1334 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1335 if (newshift == 1) goto endofwhile; 1336 break; 1337 1338 case 2: 1339 for (j=0; j<nz; j++) { 1340 idx = bjtmp[j]; 1341 rtmp11[idx] = 0.0; 1342 rtmp22[idx] = 0.0; 1343 } 1344 1345 /* load in initial (unfactored row) */ 1346 idx = r[row]; 1347 nz_tmp = ai[idx+1] - ai[idx]; 1348 ajtmp = aj + ai[idx]; 1349 v1 = aa + ai[idx]; 1350 v2 = aa + ai[idx+1]; 1351 for (j=0; j<nz_tmp; j++) { 1352 idx = ics[ajtmp[j]]; 1353 rtmp11[idx] = v1[j]; 1354 rtmp22[idx] = v2[j]; 1355 } 1356 rtmp11[ics[r[row]]] += sctx.shift_amount; 1357 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1358 1359 prow = *bjtmp++ ; 1360 while (prow < row) { 1361 pc1 = rtmp11 + prow; 1362 pc2 = rtmp22 + prow; 1363 if (*pc1 != 0.0 || *pc2 != 0.0){ 1364 pv = ba + bd[prow]; 1365 pj = nbj + bd[prow]; 1366 mul1 = *pc1 * *pv; 1367 mul2 = *pc2 * *pv; 1368 ++pv; 1369 *pc1 = mul1; 1370 *pc2 = mul2; 1371 1372 nz_tmp = bi[prow+1] - bd[prow] - 1; 1373 for (j=0; j<nz_tmp; j++) { 1374 tmp = pv[j]; 1375 idx = pj[j]; 1376 rtmp11[idx] -= mul1 * tmp; 1377 rtmp22[idx] -= mul2 * tmp; 1378 } 1379 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1380 } 1381 prow = *bjtmp++ ; 1382 } 1383 1384 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1385 pc1 = rtmp11 + prow; 1386 pc2 = rtmp22 + prow; 1387 1388 sctx.pv = *pc1; 1389 pj = bj + bi[prow]; 1390 rs = 0.0; 1391 for (j=0; j<nz; j++){ 1392 idx = pj[j]; 1393 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1394 } 1395 sctx.rs = rs; 1396 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1397 if (newshift == 1) goto endofwhile; 1398 1399 if (*pc2 != 0.0){ 1400 pj = nbj + bd[prow]; 1401 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1402 *pc2 = mul2; 1403 nz_tmp = bi[prow+1] - bd[prow] - 1; 1404 for (j=0; j<nz_tmp; j++) { 1405 idx = pj[j] ; 1406 tmp = rtmp11[idx]; 1407 rtmp22[idx] -= mul2 * tmp; 1408 } 1409 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1410 } 1411 1412 pj = bj + bi[row]; 1413 pc1 = ba + bi[row]; 1414 pc2 = ba + bi[row+1]; 1415 1416 sctx.pv = rtmp22[row+1]; 1417 rs = 0.0; 1418 rtmp11[row] = 1.0/rtmp11[row]; 1419 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1420 /* copy row entries from dense representation to sparse */ 1421 for (j=0; j<nz; j++) { 1422 idx = pj[j]; 1423 pc1[j] = rtmp11[idx]; 1424 pc2[j] = rtmp22[idx]; 1425 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1426 } 1427 sctx.rs = rs; 1428 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1429 if (newshift == 1) goto endofwhile; 1430 break; 1431 1432 case 3: 1433 for (j=0; j<nz; j++) { 1434 idx = bjtmp[j]; 1435 rtmp11[idx] = 0.0; 1436 rtmp22[idx] = 0.0; 1437 rtmp33[idx] = 0.0; 1438 } 1439 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1440 idx = r[row]; 1441 nz_tmp = ai[idx+1] - ai[idx]; 1442 ajtmp = aj + ai[idx]; 1443 v1 = aa + ai[idx]; 1444 v2 = aa + ai[idx+1]; 1445 v3 = aa + ai[idx+2]; 1446 for (j=0; j<nz_tmp; j++) { 1447 idx = ics[ajtmp[j]]; 1448 rtmp11[idx] = v1[j]; 1449 rtmp22[idx] = v2[j]; 1450 rtmp33[idx] = v3[j]; 1451 } 1452 rtmp11[ics[r[row]]] += sctx.shift_amount; 1453 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1454 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1455 1456 /* loop over all pivot row blocks above this row block */ 1457 prow = *bjtmp++ ; 1458 while (prow < row) { 1459 pc1 = rtmp11 + prow; 1460 pc2 = rtmp22 + prow; 1461 pc3 = rtmp33 + prow; 1462 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1463 pv = ba + bd[prow]; 1464 pj = nbj + bd[prow]; 1465 mul1 = *pc1 * *pv; 1466 mul2 = *pc2 * *pv; 1467 mul3 = *pc3 * *pv; 1468 ++pv; 1469 *pc1 = mul1; 1470 *pc2 = mul2; 1471 *pc3 = mul3; 1472 1473 nz_tmp = bi[prow+1] - bd[prow] - 1; 1474 /* update this row based on pivot row */ 1475 for (j=0; j<nz_tmp; j++) { 1476 tmp = pv[j]; 1477 idx = pj[j]; 1478 rtmp11[idx] -= mul1 * tmp; 1479 rtmp22[idx] -= mul2 * tmp; 1480 rtmp33[idx] -= mul3 * tmp; 1481 } 1482 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1483 } 1484 prow = *bjtmp++ ; 1485 } 1486 1487 /* Now take care of diagonal 3x3 block in this set of rows */ 1488 /* note: prow = row here */ 1489 pc1 = rtmp11 + prow; 1490 pc2 = rtmp22 + prow; 1491 pc3 = rtmp33 + prow; 1492 1493 sctx.pv = *pc1; 1494 pj = bj + bi[prow]; 1495 rs = 0.0; 1496 for (j=0; j<nz; j++){ 1497 idx = pj[j]; 1498 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 1499 } 1500 sctx.rs = rs; 1501 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1502 if (newshift == 1) goto endofwhile; 1503 1504 if (*pc2 != 0.0 || *pc3 != 0.0){ 1505 mul2 = (*pc2)/(*pc1); 1506 mul3 = (*pc3)/(*pc1); 1507 *pc2 = mul2; 1508 *pc3 = mul3; 1509 nz_tmp = bi[prow+1] - bd[prow] - 1; 1510 pj = nbj + bd[prow]; 1511 for (j=0; j<nz_tmp; j++) { 1512 idx = pj[j] ; 1513 tmp = rtmp11[idx]; 1514 rtmp22[idx] -= mul2 * tmp; 1515 rtmp33[idx] -= mul3 * tmp; 1516 } 1517 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1518 } 1519 ++prow; 1520 1521 pc2 = rtmp22 + prow; 1522 pc3 = rtmp33 + prow; 1523 sctx.pv = *pc2; 1524 pj = bj + bi[prow]; 1525 rs = 0.0; 1526 for (j=0; j<nz; j++){ 1527 idx = pj[j]; 1528 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 1529 } 1530 sctx.rs = rs; 1531 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1532 if (newshift == 1) goto endofwhile; 1533 1534 if (*pc3 != 0.0){ 1535 mul3 = (*pc3)/(*pc2); 1536 *pc3 = mul3; 1537 pj = nbj + bd[prow]; 1538 nz_tmp = bi[prow+1] - bd[prow] - 1; 1539 for (j=0; j<nz_tmp; j++) { 1540 idx = pj[j] ; 1541 tmp = rtmp22[idx]; 1542 rtmp33[idx] -= mul3 * tmp; 1543 } 1544 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1545 } 1546 1547 pj = bj + bi[row]; 1548 pc1 = ba + bi[row]; 1549 pc2 = ba + bi[row+1]; 1550 pc3 = ba + bi[row+2]; 1551 1552 sctx.pv = rtmp33[row+2]; 1553 rs = 0.0; 1554 rtmp11[row] = 1.0/rtmp11[row]; 1555 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1556 rtmp33[row+2] = 1.0/rtmp33[row+2]; 1557 /* copy row entries from dense representation to sparse */ 1558 for (j=0; j<nz; j++) { 1559 idx = pj[j]; 1560 pc1[j] = rtmp11[idx]; 1561 pc2[j] = rtmp22[idx]; 1562 pc3[j] = rtmp33[idx]; 1563 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1564 } 1565 1566 sctx.rs = rs; 1567 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1568 if (newshift == 1) goto endofwhile; 1569 break; 1570 1571 default: 1572 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1573 } 1574 row += nodesz; /* Update the row */ 1575 } 1576 endofwhile:; 1577 } while (sctx.lushift); 1578 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 1579 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1580 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1581 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1582 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1583 C->factor = MAT_FACTOR_LU; 1584 C->assembled = PETSC_TRUE; 1585 C->preallocated = PETSC_TRUE; 1586 if (sctx.nshift) { 1587 if (info->shiftnz) { 1588 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1589 } else if (info->shiftpd) { 1590 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1591 } 1592 } 1593 ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr); 1594 PetscFunctionReturn(0); 1595 } 1596 1597 /* 1598 Makes a longer coloring[] array and calls the usual code with that 1599 */ 1600 #undef __FUNCT__ 1601 #define __FUNCT__ "MatColoringPatch_Inode" 1602 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1603 { 1604 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1605 PetscErrorCode ierr; 1606 PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1607 PetscInt *colorused,i; 1608 ISColoringValue *newcolor; 1609 1610 PetscFunctionBegin; 1611 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1612 /* loop over inodes, marking a color for each column*/ 1613 row = 0; 1614 for (i=0; i<m; i++){ 1615 for (j=0; j<ns[i]; j++) { 1616 newcolor[row++] = coloring[i] + j*ncolors; 1617 } 1618 } 1619 1620 /* eliminate unneeded colors */ 1621 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1622 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1623 for (i=0; i<n; i++) { 1624 colorused[newcolor[i]] = 1; 1625 } 1626 1627 for (i=1; i<5*ncolors; i++) { 1628 colorused[i] += colorused[i-1]; 1629 } 1630 ncolors = colorused[5*ncolors-1]; 1631 for (i=0; i<n; i++) { 1632 newcolor[i] = colorused[newcolor[i]]-1; 1633 } 1634 ierr = PetscFree(colorused);CHKERRQ(ierr); 1635 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1636 ierr = PetscFree(coloring);CHKERRQ(ierr); 1637 PetscFunctionReturn(0); 1638 } 1639 1640 #include "src/inline/ilu.h" 1641 1642 #undef __FUNCT__ 1643 #define __FUNCT__ "MatRelax_Inode" 1644 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1645 { 1646 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1647 PetscScalar *x,*xs,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1648 MatScalar *ibdiag,*bdiag; 1649 PetscScalar *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1650 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1651 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1652 PetscErrorCode ierr; 1653 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1654 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1655 1656 PetscFunctionBegin; 1657 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1658 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1659 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode"); 1660 1661 if (!a->inode.ibdiagvalid) { 1662 if (!a->inode.ibdiag) { 1663 /* calculate space needed for diagonal blocks */ 1664 for (i=0; i<m; i++) { 1665 cnt += sizes[i]*sizes[i]; 1666 } 1667 a->inode.bdiagsize = cnt; 1668 ierr = PetscMalloc2(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag);CHKERRQ(ierr); 1669 } 1670 1671 /* copy over the diagonal blocks and invert them */ 1672 ibdiag = a->inode.ibdiag; 1673 bdiag = a->inode.bdiag; 1674 cnt = 0; 1675 for (i=0, row = 0; i<m; i++) { 1676 for (j=0; j<sizes[i]; j++) { 1677 for (k=0; k<sizes[i]; k++) { 1678 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1679 } 1680 } 1681 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1682 1683 switch(sizes[i]) { 1684 case 1: 1685 /* Create matrix data structure */ 1686 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1687 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1688 break; 1689 case 2: 1690 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1691 break; 1692 case 3: 1693 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1694 break; 1695 case 4: 1696 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1697 break; 1698 case 5: 1699 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1700 break; 1701 default: 1702 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1703 } 1704 cnt += sizes[i]*sizes[i]; 1705 row += sizes[i]; 1706 } 1707 a->inode.ibdiagvalid = PETSC_TRUE; 1708 } 1709 ibdiag = a->inode.ibdiag; 1710 bdiag = a->inode.bdiag; 1711 1712 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1713 if (xx != bb) { 1714 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1715 } else { 1716 b = x; 1717 } 1718 1719 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1720 xs = x; 1721 if (flag & SOR_ZERO_INITIAL_GUESS) { 1722 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1723 1724 for (i=0, row=0; i<m; i++) { 1725 sz = diag[row] - ii[row]; 1726 v1 = a->a + ii[row]; 1727 idx = a->j + ii[row]; 1728 1729 /* see comments for MatMult_Inode() for how this is coded */ 1730 switch (sizes[i]){ 1731 case 1: 1732 1733 sum1 = b[row]; 1734 for(n = 0; n<sz-1; n+=2) { 1735 i1 = idx[0]; 1736 i2 = idx[1]; 1737 idx += 2; 1738 tmp0 = x[i1]; 1739 tmp1 = x[i2]; 1740 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1741 } 1742 1743 if (n == sz-1){ 1744 tmp0 = x[*idx]; 1745 sum1 -= *v1 * tmp0; 1746 } 1747 x[row++] = sum1*(*ibdiag++); 1748 break; 1749 case 2: 1750 v2 = a->a + ii[row+1]; 1751 sum1 = b[row]; 1752 sum2 = b[row+1]; 1753 for(n = 0; n<sz-1; n+=2) { 1754 i1 = idx[0]; 1755 i2 = idx[1]; 1756 idx += 2; 1757 tmp0 = x[i1]; 1758 tmp1 = x[i2]; 1759 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1760 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1761 } 1762 1763 if (n == sz-1){ 1764 tmp0 = x[*idx]; 1765 sum1 -= v1[0] * tmp0; 1766 sum2 -= v2[0] * tmp0; 1767 } 1768 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1769 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1770 ibdiag += 4; 1771 break; 1772 case 3: 1773 v2 = a->a + ii[row+1]; 1774 v3 = a->a + ii[row+2]; 1775 sum1 = b[row]; 1776 sum2 = b[row+1]; 1777 sum3 = b[row+2]; 1778 for(n = 0; n<sz-1; n+=2) { 1779 i1 = idx[0]; 1780 i2 = idx[1]; 1781 idx += 2; 1782 tmp0 = x[i1]; 1783 tmp1 = x[i2]; 1784 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1785 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1786 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1787 } 1788 1789 if (n == sz-1){ 1790 tmp0 = x[*idx]; 1791 sum1 -= v1[0] * tmp0; 1792 sum2 -= v2[0] * tmp0; 1793 sum3 -= v3[0] * tmp0; 1794 } 1795 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1796 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1797 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1798 ibdiag += 9; 1799 break; 1800 case 4: 1801 v2 = a->a + ii[row+1]; 1802 v3 = a->a + ii[row+2]; 1803 v4 = a->a + ii[row+3]; 1804 sum1 = b[row]; 1805 sum2 = b[row+1]; 1806 sum3 = b[row+2]; 1807 sum4 = b[row+3]; 1808 for(n = 0; n<sz-1; n+=2) { 1809 i1 = idx[0]; 1810 i2 = idx[1]; 1811 idx += 2; 1812 tmp0 = x[i1]; 1813 tmp1 = x[i2]; 1814 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1815 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1816 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1817 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1818 } 1819 1820 if (n == sz-1){ 1821 tmp0 = x[*idx]; 1822 sum1 -= v1[0] * tmp0; 1823 sum2 -= v2[0] * tmp0; 1824 sum3 -= v3[0] * tmp0; 1825 sum4 -= v4[0] * tmp0; 1826 } 1827 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1828 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1829 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1830 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1831 ibdiag += 16; 1832 break; 1833 case 5: 1834 v2 = a->a + ii[row+1]; 1835 v3 = a->a + ii[row+2]; 1836 v4 = a->a + ii[row+3]; 1837 v5 = a->a + ii[row+4]; 1838 sum1 = b[row]; 1839 sum2 = b[row+1]; 1840 sum3 = b[row+2]; 1841 sum4 = b[row+3]; 1842 sum5 = b[row+4]; 1843 for(n = 0; n<sz-1; n+=2) { 1844 i1 = idx[0]; 1845 i2 = idx[1]; 1846 idx += 2; 1847 tmp0 = x[i1]; 1848 tmp1 = x[i2]; 1849 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1850 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1851 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1852 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1853 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1854 } 1855 1856 if (n == sz-1){ 1857 tmp0 = x[*idx]; 1858 sum1 -= v1[0] * tmp0; 1859 sum2 -= v2[0] * tmp0; 1860 sum3 -= v3[0] * tmp0; 1861 sum4 -= v4[0] * tmp0; 1862 sum5 -= v5[0] * tmp0; 1863 } 1864 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1865 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1866 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1867 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1868 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1869 ibdiag += 25; 1870 break; 1871 default: 1872 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1873 } 1874 } 1875 1876 xb = x; 1877 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1878 } else xb = b; 1879 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1880 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1881 cnt = 0; 1882 for (i=0, row=0; i<m; i++) { 1883 1884 switch (sizes[i]){ 1885 case 1: 1886 x[row++] *= bdiag[cnt++]; 1887 break; 1888 case 2: 1889 x1 = x[row]; x2 = x[row+1]; 1890 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1891 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1892 x[row++] = tmp1; 1893 x[row++] = tmp2; 1894 cnt += 4; 1895 break; 1896 case 3: 1897 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1898 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1899 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1900 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1901 x[row++] = tmp1; 1902 x[row++] = tmp2; 1903 x[row++] = tmp3; 1904 cnt += 9; 1905 break; 1906 case 4: 1907 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1908 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1909 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1910 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1911 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1912 x[row++] = tmp1; 1913 x[row++] = tmp2; 1914 x[row++] = tmp3; 1915 x[row++] = tmp4; 1916 cnt += 16; 1917 break; 1918 case 5: 1919 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1920 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1921 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1922 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1923 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1924 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1925 x[row++] = tmp1; 1926 x[row++] = tmp2; 1927 x[row++] = tmp3; 1928 x[row++] = tmp4; 1929 x[row++] = tmp5; 1930 cnt += 25; 1931 break; 1932 default: 1933 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1934 } 1935 } 1936 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1937 } 1938 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1939 1940 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1941 for (i=m-1, row=A->rmap.n-1; i>=0; i--) { 1942 ibdiag -= sizes[i]*sizes[i]; 1943 sz = ii[row+1] - diag[row] - 1; 1944 v1 = a->a + diag[row] + 1; 1945 idx = a->j + diag[row] + 1; 1946 1947 /* see comments for MatMult_Inode() for how this is coded */ 1948 switch (sizes[i]){ 1949 case 1: 1950 1951 sum1 = xb[row]; 1952 for(n = 0; n<sz-1; n+=2) { 1953 i1 = idx[0]; 1954 i2 = idx[1]; 1955 idx += 2; 1956 tmp0 = x[i1]; 1957 tmp1 = x[i2]; 1958 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1959 } 1960 1961 if (n == sz-1){ 1962 tmp0 = x[*idx]; 1963 sum1 -= *v1*tmp0; 1964 } 1965 x[row--] = sum1*(*ibdiag); 1966 break; 1967 1968 case 2: 1969 1970 sum1 = xb[row]; 1971 sum2 = xb[row-1]; 1972 /* note that sum1 is associated with the second of the two rows */ 1973 v2 = a->a + diag[row-1] + 2; 1974 for(n = 0; n<sz-1; n+=2) { 1975 i1 = idx[0]; 1976 i2 = idx[1]; 1977 idx += 2; 1978 tmp0 = x[i1]; 1979 tmp1 = x[i2]; 1980 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1981 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1982 } 1983 1984 if (n == sz-1){ 1985 tmp0 = x[*idx]; 1986 sum1 -= *v1*tmp0; 1987 sum2 -= *v2*tmp0; 1988 } 1989 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1990 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1991 break; 1992 case 3: 1993 1994 sum1 = xb[row]; 1995 sum2 = xb[row-1]; 1996 sum3 = xb[row-2]; 1997 v2 = a->a + diag[row-1] + 2; 1998 v3 = a->a + diag[row-2] + 3; 1999 for(n = 0; n<sz-1; n+=2) { 2000 i1 = idx[0]; 2001 i2 = idx[1]; 2002 idx += 2; 2003 tmp0 = x[i1]; 2004 tmp1 = x[i2]; 2005 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2006 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2007 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2008 } 2009 2010 if (n == sz-1){ 2011 tmp0 = x[*idx]; 2012 sum1 -= *v1*tmp0; 2013 sum2 -= *v2*tmp0; 2014 sum3 -= *v3*tmp0; 2015 } 2016 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2017 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2018 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2019 break; 2020 case 4: 2021 2022 sum1 = xb[row]; 2023 sum2 = xb[row-1]; 2024 sum3 = xb[row-2]; 2025 sum4 = xb[row-3]; 2026 v2 = a->a + diag[row-1] + 2; 2027 v3 = a->a + diag[row-2] + 3; 2028 v4 = a->a + diag[row-3] + 4; 2029 for(n = 0; n<sz-1; n+=2) { 2030 i1 = idx[0]; 2031 i2 = idx[1]; 2032 idx += 2; 2033 tmp0 = x[i1]; 2034 tmp1 = x[i2]; 2035 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2036 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2037 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2038 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2039 } 2040 2041 if (n == sz-1){ 2042 tmp0 = x[*idx]; 2043 sum1 -= *v1*tmp0; 2044 sum2 -= *v2*tmp0; 2045 sum3 -= *v3*tmp0; 2046 sum4 -= *v4*tmp0; 2047 } 2048 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2049 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2050 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2051 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2052 break; 2053 case 5: 2054 2055 sum1 = xb[row]; 2056 sum2 = xb[row-1]; 2057 sum3 = xb[row-2]; 2058 sum4 = xb[row-3]; 2059 sum5 = xb[row-4]; 2060 v2 = a->a + diag[row-1] + 2; 2061 v3 = a->a + diag[row-2] + 3; 2062 v4 = a->a + diag[row-3] + 4; 2063 v5 = a->a + diag[row-4] + 5; 2064 for(n = 0; n<sz-1; n+=2) { 2065 i1 = idx[0]; 2066 i2 = idx[1]; 2067 idx += 2; 2068 tmp0 = x[i1]; 2069 tmp1 = x[i2]; 2070 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2071 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2072 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2073 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2074 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2075 } 2076 2077 if (n == sz-1){ 2078 tmp0 = x[*idx]; 2079 sum1 -= *v1*tmp0; 2080 sum2 -= *v2*tmp0; 2081 sum3 -= *v3*tmp0; 2082 sum4 -= *v4*tmp0; 2083 sum5 -= *v5*tmp0; 2084 } 2085 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2086 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2087 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2088 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2089 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2090 break; 2091 default: 2092 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2093 } 2094 } 2095 2096 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2097 } 2098 its--; 2099 } 2100 if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode"); 2101 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2102 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2103 PetscFunctionReturn(0); 2104 } 2105 2106 2107 /* 2108 samestructure indicates that the matrix has not changed its nonzero structure so we 2109 do not need to recompute the inodes 2110 */ 2111 #undef __FUNCT__ 2112 #define __FUNCT__ "Mat_CheckInode" 2113 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2114 { 2115 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2116 PetscErrorCode ierr; 2117 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2118 PetscTruth flag; 2119 2120 PetscFunctionBegin; 2121 if (!a->inode.use) PetscFunctionReturn(0); 2122 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2123 2124 2125 m = A->rmap.n; 2126 if (a->inode.size) {ns = a->inode.size;} 2127 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2128 2129 i = 0; 2130 node_count = 0; 2131 idx = a->j; 2132 ii = a->i; 2133 while (i < m){ /* For each row */ 2134 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2135 /* Limits the number of elements in a node to 'a->inode.limit' */ 2136 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2137 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2138 if (nzy != nzx) break; 2139 idy += nzx; /* Same nonzero pattern */ 2140 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2141 if (!flag) break; 2142 } 2143 ns[node_count++] = blk_size; 2144 idx += blk_size*nzx; 2145 i = j; 2146 } 2147 /* If not enough inodes found,, do not use inode version of the routines */ 2148 if (!a->inode.size && m && node_count > .9*m) { 2149 ierr = PetscFree(ns);CHKERRQ(ierr); 2150 a->inode.node_count = 0; 2151 a->inode.size = PETSC_NULL; 2152 a->inode.use = PETSC_FALSE; 2153 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2154 } else { 2155 A->ops->mult = MatMult_Inode; 2156 A->ops->relax = MatRelax_Inode; 2157 A->ops->multadd = MatMultAdd_Inode; 2158 A->ops->solve = MatSolve_Inode; 2159 A->ops->lufactornumeric = MatLUFactorNumeric_Inode; 2160 A->ops->getrowij = MatGetRowIJ_Inode; 2161 A->ops->restorerowij = MatRestoreRowIJ_Inode; 2162 A->ops->getcolumnij = MatGetColumnIJ_Inode; 2163 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 2164 A->ops->coloringpatch = MatColoringPatch_Inode; 2165 a->inode.node_count = node_count; 2166 a->inode.size = ns; 2167 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2168 } 2169 PetscFunctionReturn(0); 2170 } 2171 2172 /* 2173 This is really ugly. if inodes are used this replaces the 2174 permutations with ones that correspond to rows/cols of the matrix 2175 rather then inode blocks 2176 */ 2177 #undef __FUNCT__ 2178 #define __FUNCT__ "MatInodeAdjustForInodes" 2179 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2180 { 2181 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2182 2183 PetscFunctionBegin; 2184 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2185 if (f) { 2186 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2187 } 2188 PetscFunctionReturn(0); 2189 } 2190 2191 EXTERN_C_BEGIN 2192 #undef __FUNCT__ 2193 #define __FUNCT__ "MatAdjustForInodes_Inode" 2194 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 2195 { 2196 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2197 PetscErrorCode ierr; 2198 PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count; 2199 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2200 PetscInt nslim_col,*ns_col; 2201 IS ris = *rperm,cis = *cperm; 2202 2203 PetscFunctionBegin; 2204 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2205 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2206 2207 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2208 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2209 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2210 permc = permr + m; 2211 2212 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2213 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2214 2215 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2216 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2217 2218 /* Construct the permutations for rows*/ 2219 for (i=0,row = 0; i<nslim_row; ++i){ 2220 indx = ridx[i]; 2221 start_val = tns[indx]; 2222 end_val = tns[indx + 1]; 2223 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2224 } 2225 2226 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2227 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2228 2229 /* Construct permutations for columns */ 2230 for (i=0,col=0; i<nslim_col; ++i){ 2231 indx = cidx[i]; 2232 start_val = tns[indx]; 2233 end_val = tns[indx + 1]; 2234 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2235 } 2236 2237 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2238 ISSetPermutation(*rperm); 2239 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2240 ISSetPermutation(*cperm); 2241 2242 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2243 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2244 2245 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2246 ierr = PetscFree(permr);CHKERRQ(ierr); 2247 ierr = ISDestroy(cis);CHKERRQ(ierr); 2248 ierr = ISDestroy(ris);CHKERRQ(ierr); 2249 ierr = PetscFree(tns);CHKERRQ(ierr); 2250 PetscFunctionReturn(0); 2251 } 2252 EXTERN_C_END 2253 2254 #undef __FUNCT__ 2255 #define __FUNCT__ "MatInodeGetInodeSizes" 2256 /*@C 2257 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2258 2259 Collective on Mat 2260 2261 Input Parameter: 2262 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2263 2264 Output Parameter: 2265 + node_count - no of inodes present in the matrix. 2266 . sizes - an array of size node_count,with sizes of each inode. 2267 - limit - the max size used to generate the inodes. 2268 2269 Level: advanced 2270 2271 Notes: This routine returns some internal storage information 2272 of the matrix, it is intended to be used by advanced users. 2273 It should be called after the matrix is assembled. 2274 The contents of the sizes[] array should not be changed. 2275 PETSC_NULL may be passed for information not requested. 2276 2277 .keywords: matrix, seqaij, get, inode 2278 2279 .seealso: MatGetInfo() 2280 @*/ 2281 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2282 { 2283 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2284 2285 PetscFunctionBegin; 2286 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2287 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2288 if (f) { 2289 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2290 } 2291 PetscFunctionReturn(0); 2292 } 2293 2294 EXTERN_C_BEGIN 2295 #undef __FUNCT__ 2296 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 2297 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2298 { 2299 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2300 2301 PetscFunctionBegin; 2302 if (node_count) *node_count = a->inode.node_count; 2303 if (sizes) *sizes = a->inode.size; 2304 if (limit) *limit = a->inode.limit; 2305 PetscFunctionReturn(0); 2306 } 2307 EXTERN_C_END 2308