1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_Inode" 397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 sz = n; /* No of non zeros in this row */ 427 /* Switch on the size of Node */ 428 switch (nsz){ /* Each loop in 'case' is unrolled */ 429 case 1 : 430 sum1 = 0; 431 432 for(n = 0; n< sz-1; n+=2) { 433 i1 = idx[0]; /* The instructions are ordered to */ 434 i2 = idx[1]; /* make the compiler's job easy */ 435 idx += 2; 436 tmp0 = x[i1]; 437 tmp1 = x[i2]; 438 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 439 } 440 441 if (n == sz-1){ /* Take care of the last nonzero */ 442 tmp0 = x[*idx++]; 443 sum1 += *v1++ * tmp0; 444 } 445 y[row++]=sum1; 446 break; 447 case 2: 448 sum1 = 0; 449 sum2 = 0; 450 v2 = v1 + n; 451 452 for (n = 0; n< sz-1; n+=2) { 453 i1 = idx[0]; 454 i2 = idx[1]; 455 idx += 2; 456 tmp0 = x[i1]; 457 tmp1 = x[i2]; 458 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 459 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 460 } 461 if (n == sz-1){ 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 } 466 y[row++]=sum1; 467 y[row++]=sum2; 468 v1 =v2; /* Since the next block to be processed starts there*/ 469 idx +=sz; 470 break; 471 case 3: 472 sum1 = 0; 473 sum2 = 0; 474 sum3 = 0; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 478 for (n = 0; n< sz-1; n+=2) { 479 i1 = idx[0]; 480 i2 = idx[1]; 481 idx += 2; 482 tmp0 = x[i1]; 483 tmp1 = x[i2]; 484 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 485 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 487 } 488 if (n == sz-1){ 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 } 494 y[row++]=sum1; 495 y[row++]=sum2; 496 y[row++]=sum3; 497 v1 =v3; /* Since the next block to be processed starts there*/ 498 idx +=2*sz; 499 break; 500 case 4: 501 sum1 = 0; 502 sum2 = 0; 503 sum3 = 0; 504 sum4 = 0; 505 v2 = v1 + n; 506 v3 = v2 + n; 507 v4 = v3 + n; 508 509 for (n = 0; n< sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 } 520 if (n == sz-1){ 521 tmp0 = x[*idx++]; 522 sum1 += *v1++ * tmp0; 523 sum2 += *v2++ * tmp0; 524 sum3 += *v3++ * tmp0; 525 sum4 += *v4++ * tmp0; 526 } 527 y[row++]=sum1; 528 y[row++]=sum2; 529 y[row++]=sum3; 530 y[row++]=sum4; 531 v1 =v4; /* Since the next block to be processed starts there*/ 532 idx +=3*sz; 533 break; 534 case 5: 535 sum1 = 0; 536 sum2 = 0; 537 sum3 = 0; 538 sum4 = 0; 539 sum5 = 0; 540 v2 = v1 + n; 541 v3 = v2 + n; 542 v4 = v3 + n; 543 v5 = v4 + n; 544 545 for (n = 0; n<sz-1; n+=2) { 546 i1 = idx[0]; 547 i2 = idx[1]; 548 idx += 2; 549 tmp0 = x[i1]; 550 tmp1 = x[i2]; 551 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 552 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 553 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 554 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 555 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 556 } 557 if (n == sz-1){ 558 tmp0 = x[*idx++]; 559 sum1 += *v1++ * tmp0; 560 sum2 += *v2++ * tmp0; 561 sum3 += *v3++ * tmp0; 562 sum4 += *v4++ * tmp0; 563 sum5 += *v5++ * tmp0; 564 } 565 y[row++]=sum1; 566 y[row++]=sum2; 567 y[row++]=sum3; 568 y[row++]=sum4; 569 y[row++]=sum5; 570 v1 =v5; /* Since the next block to be processed starts there */ 571 idx +=4*sz; 572 break; 573 default : 574 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 575 } 576 } 577 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 578 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2*a->nz - nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 /* ----------------------------------------------------------- */ 583 /* Almost same code as the MatMult_Inode() */ 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultAdd_Inode" 586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 590 MatScalar *v1,*v2,*v3,*v4,*v5; 591 PetscScalar *x,*y,*z,*zt; 592 PetscErrorCode ierr; 593 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 594 595 PetscFunctionBegin; 596 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 597 node_max = a->inode.node_count; 598 ns = a->inode.size; /* Node Size array */ 599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 601 if (zz != yy) { 602 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 603 } else { 604 z = y; 605 } 606 zt = z; 607 608 idx = a->j; 609 v1 = a->a; 610 ii = a->i; 611 612 for (i = 0,row = 0; i< node_max; ++i){ 613 nsz = ns[i]; 614 n = ii[1] - ii[0]; 615 ii += nsz; 616 sz = n; /* No of non zeros in this row */ 617 /* Switch on the size of Node */ 618 switch (nsz){ /* Each loop in 'case' is unrolled */ 619 case 1 : 620 sum1 = *zt++; 621 622 for(n = 0; n< sz-1; n+=2) { 623 i1 = idx[0]; /* The instructions are ordered to */ 624 i2 = idx[1]; /* make the compiler's job easy */ 625 idx += 2; 626 tmp0 = x[i1]; 627 tmp1 = x[i2]; 628 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 629 } 630 631 if(n == sz-1){ /* Take care of the last nonzero */ 632 tmp0 = x[*idx++]; 633 sum1 += *v1++ * tmp0; 634 } 635 y[row++]=sum1; 636 break; 637 case 2: 638 sum1 = *zt++; 639 sum2 = *zt++; 640 v2 = v1 + n; 641 642 for(n = 0; n< sz-1; n+=2) { 643 i1 = idx[0]; 644 i2 = idx[1]; 645 idx += 2; 646 tmp0 = x[i1]; 647 tmp1 = x[i2]; 648 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 649 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 650 } 651 if(n == sz-1){ 652 tmp0 = x[*idx++]; 653 sum1 += *v1++ * tmp0; 654 sum2 += *v2++ * tmp0; 655 } 656 y[row++]=sum1; 657 y[row++]=sum2; 658 v1 =v2; /* Since the next block to be processed starts there*/ 659 idx +=sz; 660 break; 661 case 3: 662 sum1 = *zt++; 663 sum2 = *zt++; 664 sum3 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 677 } 678 if (n == sz-1){ 679 tmp0 = x[*idx++]; 680 sum1 += *v1++ * tmp0; 681 sum2 += *v2++ * tmp0; 682 sum3 += *v3++ * tmp0; 683 } 684 y[row++]=sum1; 685 y[row++]=sum2; 686 y[row++]=sum3; 687 v1 =v3; /* Since the next block to be processed starts there*/ 688 idx +=2*sz; 689 break; 690 case 4: 691 sum1 = *zt++; 692 sum2 = *zt++; 693 sum3 = *zt++; 694 sum4 = *zt++; 695 v2 = v1 + n; 696 v3 = v2 + n; 697 v4 = v3 + n; 698 699 for (n = 0; n< sz-1; n+=2) { 700 i1 = idx[0]; 701 i2 = idx[1]; 702 idx += 2; 703 tmp0 = x[i1]; 704 tmp1 = x[i2]; 705 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 706 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 707 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 708 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 709 } 710 if (n == sz-1){ 711 tmp0 = x[*idx++]; 712 sum1 += *v1++ * tmp0; 713 sum2 += *v2++ * tmp0; 714 sum3 += *v3++ * tmp0; 715 sum4 += *v4++ * tmp0; 716 } 717 y[row++]=sum1; 718 y[row++]=sum2; 719 y[row++]=sum3; 720 y[row++]=sum4; 721 v1 =v4; /* Since the next block to be processed starts there*/ 722 idx +=3*sz; 723 break; 724 case 5: 725 sum1 = *zt++; 726 sum2 = *zt++; 727 sum3 = *zt++; 728 sum4 = *zt++; 729 sum5 = *zt++; 730 v2 = v1 + n; 731 v3 = v2 + n; 732 v4 = v3 + n; 733 v5 = v4 + n; 734 735 for (n = 0; n<sz-1; n+=2) { 736 i1 = idx[0]; 737 i2 = idx[1]; 738 idx += 2; 739 tmp0 = x[i1]; 740 tmp1 = x[i2]; 741 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 742 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 743 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 744 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 745 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 746 } 747 if(n == sz-1){ 748 tmp0 = x[*idx++]; 749 sum1 += *v1++ * tmp0; 750 sum2 += *v2++ * tmp0; 751 sum3 += *v3++ * tmp0; 752 sum4 += *v4++ * tmp0; 753 sum5 += *v5++ * tmp0; 754 } 755 y[row++]=sum1; 756 y[row++]=sum2; 757 y[row++]=sum3; 758 y[row++]=sum4; 759 y[row++]=sum5; 760 v1 =v5; /* Since the next block to be processed starts there */ 761 idx +=4*sz; 762 break; 763 default : 764 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 765 } 766 } 767 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 768 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 769 if (zz != yy) { 770 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 771 } 772 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 /* ----------------------------------------------------------- */ 777 #undef __FUNCT__ 778 #define __FUNCT__ "MatSolve_Inode" 779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 780 { 781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 782 IS iscol = a->col,isrow = a->row; 783 PetscErrorCode ierr; 784 PetscInt *r,*c,i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 785 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout; 786 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 787 PetscScalar sum1,sum2,sum3,sum4,sum5; 788 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 789 const PetscScalar *b; 790 791 PetscFunctionBegin; 792 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 793 node_max = a->inode.node_count; 794 ns = a->inode.size; /* Node Size array */ 795 796 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 797 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 798 tmp = a->solve_work; 799 800 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 801 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 802 803 /* forward solve the lower triangular */ 804 tmps = tmp ; 805 aa = a_a ; 806 aj = a_j ; 807 ad = a->diag; 808 809 for (i = 0,row = 0; i< node_max; ++i){ 810 nsz = ns[i]; 811 aii = ai[row]; 812 v1 = aa + aii; 813 vi = aj + aii; 814 nz = ad[row]- aii; 815 816 switch (nsz){ /* Each loop in 'case' is unrolled */ 817 case 1 : 818 sum1 = b[*r++]; 819 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 820 for(j=0; j<nz-1; j+=2){ 821 i0 = vi[0]; 822 i1 = vi[1]; 823 vi +=2; 824 tmp0 = tmps[i0]; 825 tmp1 = tmps[i1]; 826 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 827 } 828 if(j == nz-1){ 829 tmp0 = tmps[*vi++]; 830 sum1 -= *v1++ *tmp0; 831 } 832 tmp[row ++]=sum1; 833 break; 834 case 2: 835 sum1 = b[*r++]; 836 sum2 = b[*r++]; 837 v2 = aa + ai[row+1]; 838 839 for(j=0; j<nz-1; j+=2){ 840 i0 = vi[0]; 841 i1 = vi[1]; 842 vi +=2; 843 tmp0 = tmps[i0]; 844 tmp1 = tmps[i1]; 845 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 846 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 847 } 848 if(j == nz-1){ 849 tmp0 = tmps[*vi++]; 850 sum1 -= *v1++ *tmp0; 851 sum2 -= *v2++ *tmp0; 852 } 853 sum2 -= *v2++ * sum1; 854 tmp[row ++]=sum1; 855 tmp[row ++]=sum2; 856 break; 857 case 3: 858 sum1 = b[*r++]; 859 sum2 = b[*r++]; 860 sum3 = b[*r++]; 861 v2 = aa + ai[row+1]; 862 v3 = aa + ai[row+2]; 863 864 for (j=0; j<nz-1; j+=2){ 865 i0 = vi[0]; 866 i1 = vi[1]; 867 vi +=2; 868 tmp0 = tmps[i0]; 869 tmp1 = tmps[i1]; 870 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 871 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 872 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 873 } 874 if (j == nz-1){ 875 tmp0 = tmps[*vi++]; 876 sum1 -= *v1++ *tmp0; 877 sum2 -= *v2++ *tmp0; 878 sum3 -= *v3++ *tmp0; 879 } 880 sum2 -= *v2++ * sum1; 881 sum3 -= *v3++ * sum1; 882 sum3 -= *v3++ * sum2; 883 tmp[row ++]=sum1; 884 tmp[row ++]=sum2; 885 tmp[row ++]=sum3; 886 break; 887 888 case 4: 889 sum1 = b[*r++]; 890 sum2 = b[*r++]; 891 sum3 = b[*r++]; 892 sum4 = b[*r++]; 893 v2 = aa + ai[row+1]; 894 v3 = aa + ai[row+2]; 895 v4 = aa + ai[row+3]; 896 897 for (j=0; j<nz-1; j+=2){ 898 i0 = vi[0]; 899 i1 = vi[1]; 900 vi +=2; 901 tmp0 = tmps[i0]; 902 tmp1 = tmps[i1]; 903 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 905 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 906 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 907 } 908 if (j == nz-1){ 909 tmp0 = tmps[*vi++]; 910 sum1 -= *v1++ *tmp0; 911 sum2 -= *v2++ *tmp0; 912 sum3 -= *v3++ *tmp0; 913 sum4 -= *v4++ *tmp0; 914 } 915 sum2 -= *v2++ * sum1; 916 sum3 -= *v3++ * sum1; 917 sum4 -= *v4++ * sum1; 918 sum3 -= *v3++ * sum2; 919 sum4 -= *v4++ * sum2; 920 sum4 -= *v4++ * sum3; 921 922 tmp[row ++]=sum1; 923 tmp[row ++]=sum2; 924 tmp[row ++]=sum3; 925 tmp[row ++]=sum4; 926 break; 927 case 5: 928 sum1 = b[*r++]; 929 sum2 = b[*r++]; 930 sum3 = b[*r++]; 931 sum4 = b[*r++]; 932 sum5 = b[*r++]; 933 v2 = aa + ai[row+1]; 934 v3 = aa + ai[row+2]; 935 v4 = aa + ai[row+3]; 936 v5 = aa + ai[row+4]; 937 938 for (j=0; j<nz-1; j+=2){ 939 i0 = vi[0]; 940 i1 = vi[1]; 941 vi +=2; 942 tmp0 = tmps[i0]; 943 tmp1 = tmps[i1]; 944 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 945 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 946 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 947 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 948 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 949 } 950 if (j == nz-1){ 951 tmp0 = tmps[*vi++]; 952 sum1 -= *v1++ *tmp0; 953 sum2 -= *v2++ *tmp0; 954 sum3 -= *v3++ *tmp0; 955 sum4 -= *v4++ *tmp0; 956 sum5 -= *v5++ *tmp0; 957 } 958 959 sum2 -= *v2++ * sum1; 960 sum3 -= *v3++ * sum1; 961 sum4 -= *v4++ * sum1; 962 sum5 -= *v5++ * sum1; 963 sum3 -= *v3++ * sum2; 964 sum4 -= *v4++ * sum2; 965 sum5 -= *v5++ * sum2; 966 sum4 -= *v4++ * sum3; 967 sum5 -= *v5++ * sum3; 968 sum5 -= *v5++ * sum4; 969 970 tmp[row ++]=sum1; 971 tmp[row ++]=sum2; 972 tmp[row ++]=sum3; 973 tmp[row ++]=sum4; 974 tmp[row ++]=sum5; 975 break; 976 default: 977 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 978 } 979 } 980 /* backward solve the upper triangular */ 981 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 982 nsz = ns[i]; 983 aii = ai[row+1] -1; 984 v1 = aa + aii; 985 vi = aj + aii; 986 nz = aii- ad[row]; 987 switch (nsz){ /* Each loop in 'case' is unrolled */ 988 case 1 : 989 sum1 = tmp[row]; 990 991 for(j=nz ; j>1; j-=2){ 992 vi -=2; 993 i0 = vi[2]; 994 i1 = vi[1]; 995 tmp0 = tmps[i0]; 996 tmp1 = tmps[i1]; 997 v1 -= 2; 998 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 999 } 1000 if (j==1){ 1001 tmp0 = tmps[*vi--]; 1002 sum1 -= *v1-- * tmp0; 1003 } 1004 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1005 break; 1006 case 2 : 1007 sum1 = tmp[row]; 1008 sum2 = tmp[row -1]; 1009 v2 = aa + ai[row]-1; 1010 for (j=nz ; j>1; j-=2){ 1011 vi -=2; 1012 i0 = vi[2]; 1013 i1 = vi[1]; 1014 tmp0 = tmps[i0]; 1015 tmp1 = tmps[i1]; 1016 v1 -= 2; 1017 v2 -= 2; 1018 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1019 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1020 } 1021 if (j==1){ 1022 tmp0 = tmps[*vi--]; 1023 sum1 -= *v1-- * tmp0; 1024 sum2 -= *v2-- * tmp0; 1025 } 1026 1027 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1028 sum2 -= *v2-- * tmp0; 1029 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1030 break; 1031 case 3 : 1032 sum1 = tmp[row]; 1033 sum2 = tmp[row -1]; 1034 sum3 = tmp[row -2]; 1035 v2 = aa + ai[row]-1; 1036 v3 = aa + ai[row -1]-1; 1037 for (j=nz ; j>1; j-=2){ 1038 vi -=2; 1039 i0 = vi[2]; 1040 i1 = vi[1]; 1041 tmp0 = tmps[i0]; 1042 tmp1 = tmps[i1]; 1043 v1 -= 2; 1044 v2 -= 2; 1045 v3 -= 2; 1046 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1047 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1048 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1049 } 1050 if (j==1){ 1051 tmp0 = tmps[*vi--]; 1052 sum1 -= *v1-- * tmp0; 1053 sum2 -= *v2-- * tmp0; 1054 sum3 -= *v3-- * tmp0; 1055 } 1056 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1057 sum2 -= *v2-- * tmp0; 1058 sum3 -= *v3-- * tmp0; 1059 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1060 sum3 -= *v3-- * tmp0; 1061 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1062 1063 break; 1064 case 4 : 1065 sum1 = tmp[row]; 1066 sum2 = tmp[row -1]; 1067 sum3 = tmp[row -2]; 1068 sum4 = tmp[row -3]; 1069 v2 = aa + ai[row]-1; 1070 v3 = aa + ai[row -1]-1; 1071 v4 = aa + ai[row -2]-1; 1072 1073 for (j=nz ; j>1; j-=2){ 1074 vi -=2; 1075 i0 = vi[2]; 1076 i1 = vi[1]; 1077 tmp0 = tmps[i0]; 1078 tmp1 = tmps[i1]; 1079 v1 -= 2; 1080 v2 -= 2; 1081 v3 -= 2; 1082 v4 -= 2; 1083 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1084 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1085 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1086 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1087 } 1088 if (j==1){ 1089 tmp0 = tmps[*vi--]; 1090 sum1 -= *v1-- * tmp0; 1091 sum2 -= *v2-- * tmp0; 1092 sum3 -= *v3-- * tmp0; 1093 sum4 -= *v4-- * tmp0; 1094 } 1095 1096 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1097 sum2 -= *v2-- * tmp0; 1098 sum3 -= *v3-- * tmp0; 1099 sum4 -= *v4-- * tmp0; 1100 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1104 sum4 -= *v4-- * tmp0; 1105 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1106 break; 1107 case 5 : 1108 sum1 = tmp[row]; 1109 sum2 = tmp[row -1]; 1110 sum3 = tmp[row -2]; 1111 sum4 = tmp[row -3]; 1112 sum5 = tmp[row -4]; 1113 v2 = aa + ai[row]-1; 1114 v3 = aa + ai[row -1]-1; 1115 v4 = aa + ai[row -2]-1; 1116 v5 = aa + ai[row -3]-1; 1117 for (j=nz ; j>1; j-=2){ 1118 vi -= 2; 1119 i0 = vi[2]; 1120 i1 = vi[1]; 1121 tmp0 = tmps[i0]; 1122 tmp1 = tmps[i1]; 1123 v1 -= 2; 1124 v2 -= 2; 1125 v3 -= 2; 1126 v4 -= 2; 1127 v5 -= 2; 1128 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1129 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1130 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1131 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1132 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1133 } 1134 if (j==1){ 1135 tmp0 = tmps[*vi--]; 1136 sum1 -= *v1-- * tmp0; 1137 sum2 -= *v2-- * tmp0; 1138 sum3 -= *v3-- * tmp0; 1139 sum4 -= *v4-- * tmp0; 1140 sum5 -= *v5-- * tmp0; 1141 } 1142 1143 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1144 sum2 -= *v2-- * tmp0; 1145 sum3 -= *v3-- * tmp0; 1146 sum4 -= *v4-- * tmp0; 1147 sum5 -= *v5-- * tmp0; 1148 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1149 sum3 -= *v3-- * tmp0; 1150 sum4 -= *v4-- * tmp0; 1151 sum5 -= *v5-- * tmp0; 1152 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1153 sum4 -= *v4-- * tmp0; 1154 sum5 -= *v5-- * tmp0; 1155 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1156 sum5 -= *v5-- * tmp0; 1157 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1158 break; 1159 default: 1160 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1161 } 1162 } 1163 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1164 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1165 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1166 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1167 ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 1168 PetscFunctionReturn(0); 1169 } 1170 1171 #undef __FUNCT__ 1172 #define __FUNCT__ "MatLUFactorNumeric_Inode" 1173 PetscErrorCode MatLUFactorNumeric_Inode(Mat B,Mat A,MatFactorInfo *info) 1174 { 1175 Mat C = B; 1176 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1177 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1178 PetscErrorCode ierr; 1179 PetscInt *r,*ic,*c,n = A->rmap->n,*bi = b->i; 1180 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1181 PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1182 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1183 PetscScalar mul1,mul2,mul3,tmp; 1184 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1185 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1186 PetscReal rs=0.0; 1187 LUShift_Ctx sctx; 1188 PetscInt newshift; 1189 1190 PetscFunctionBegin; 1191 sctx.shift_top = 0; 1192 sctx.nshift_max = 0; 1193 sctx.shift_lo = 0; 1194 sctx.shift_hi = 0; 1195 1196 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1197 if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0; 1198 if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1199 sctx.shift_top = 0; 1200 for (i=0; i<n; i++) { 1201 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1202 rs = 0.0; 1203 ajtmp = aj + ai[i]; 1204 rtmp1 = aa + ai[i]; 1205 nz = ai[i+1] - ai[i]; 1206 for (j=0; j<nz; j++){ 1207 if (*ajtmp != i){ 1208 rs += PetscAbsScalar(*rtmp1++); 1209 } else { 1210 rs -= PetscRealPart(*rtmp1++); 1211 } 1212 ajtmp++; 1213 } 1214 if (rs>sctx.shift_top) sctx.shift_top = rs; 1215 } 1216 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1217 sctx.shift_top *= 1.1; 1218 sctx.nshift_max = 5; 1219 sctx.shift_lo = 0.; 1220 sctx.shift_hi = 1.; 1221 } 1222 sctx.shift_amount = 0; 1223 sctx.nshift = 0; 1224 1225 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1226 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1227 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1228 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1230 ics = ic ; 1231 rtmp22 = rtmp11 + n; 1232 rtmp33 = rtmp22 + n; 1233 1234 node_max = a->inode.node_count; 1235 ns = a->inode.size; 1236 if (!ns){ 1237 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1238 } 1239 1240 /* If max inode size > 3, split it into two inodes.*/ 1241 /* also map the inode sizes according to the ordering */ 1242 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1243 for (i=0,j=0; i<node_max; ++i,++j){ 1244 if (ns[i]>3) { 1245 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1246 ++j; 1247 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1248 } else { 1249 tmp_vec1[j] = ns[i]; 1250 } 1251 } 1252 /* Use the correct node_max */ 1253 node_max = j; 1254 1255 /* Now reorder the inode info based on mat re-ordering info */ 1256 /* First create a row -> inode_size_array_index map */ 1257 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1258 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1259 for (i=0,row=0; i<node_max; i++) { 1260 nodesz = tmp_vec1[i]; 1261 for (j=0; j<nodesz; j++,row++) { 1262 nsmap[row] = i; 1263 } 1264 } 1265 /* Using nsmap, create a reordered ns structure */ 1266 for (i=0,j=0; i< node_max; i++) { 1267 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1268 tmp_vec2[i] = nodesz; 1269 j += nodesz; 1270 } 1271 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1272 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1273 /* Now use the correct ns */ 1274 ns = tmp_vec2; 1275 1276 do { 1277 sctx.lushift = PETSC_FALSE; 1278 /* Now loop over each block-row, and do the factorization */ 1279 for (i=0,row=0; i<node_max; i++) { 1280 nodesz = ns[i]; 1281 nz = bi[row+1] - bi[row]; 1282 bjtmp = bj + bi[row]; 1283 1284 switch (nodesz){ 1285 case 1: 1286 for (j=0; j<nz; j++){ 1287 idx = bjtmp[j]; 1288 rtmp11[idx] = 0.0; 1289 } 1290 1291 /* load in initial (unfactored row) */ 1292 idx = r[row]; 1293 nz_tmp = ai[idx+1] - ai[idx]; 1294 ajtmp = aj + ai[idx]; 1295 v1 = aa + ai[idx]; 1296 1297 for (j=0; j<nz_tmp; j++) { 1298 idx = ics[ajtmp[j]]; 1299 rtmp11[idx] = v1[j]; 1300 } 1301 rtmp11[ics[r[row]]] += sctx.shift_amount; 1302 1303 prow = *bjtmp++ ; 1304 while (prow < row) { 1305 pc1 = rtmp11 + prow; 1306 if (*pc1 != 0.0){ 1307 pv = ba + bd[prow]; 1308 pj = nbj + bd[prow]; 1309 mul1 = *pc1 * *pv++; 1310 *pc1 = mul1; 1311 nz_tmp = bi[prow+1] - bd[prow] - 1; 1312 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1313 for (j=0; j<nz_tmp; j++) { 1314 tmp = pv[j]; 1315 idx = pj[j]; 1316 rtmp11[idx] -= mul1 * tmp; 1317 } 1318 } 1319 prow = *bjtmp++ ; 1320 } 1321 pj = bj + bi[row]; 1322 pc1 = ba + bi[row]; 1323 1324 sctx.pv = rtmp11[row]; 1325 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1326 rs = 0.0; 1327 for (j=0; j<nz; j++) { 1328 idx = pj[j]; 1329 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1330 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1331 } 1332 sctx.rs = rs; 1333 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1334 if (newshift == 1) goto endofwhile; 1335 break; 1336 1337 case 2: 1338 for (j=0; j<nz; j++) { 1339 idx = bjtmp[j]; 1340 rtmp11[idx] = 0.0; 1341 rtmp22[idx] = 0.0; 1342 } 1343 1344 /* load in initial (unfactored row) */ 1345 idx = r[row]; 1346 nz_tmp = ai[idx+1] - ai[idx]; 1347 ajtmp = aj + ai[idx]; 1348 v1 = aa + ai[idx]; 1349 v2 = aa + ai[idx+1]; 1350 for (j=0; j<nz_tmp; j++) { 1351 idx = ics[ajtmp[j]]; 1352 rtmp11[idx] = v1[j]; 1353 rtmp22[idx] = v2[j]; 1354 } 1355 rtmp11[ics[r[row]]] += sctx.shift_amount; 1356 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1357 1358 prow = *bjtmp++ ; 1359 while (prow < row) { 1360 pc1 = rtmp11 + prow; 1361 pc2 = rtmp22 + prow; 1362 if (*pc1 != 0.0 || *pc2 != 0.0){ 1363 pv = ba + bd[prow]; 1364 pj = nbj + bd[prow]; 1365 mul1 = *pc1 * *pv; 1366 mul2 = *pc2 * *pv; 1367 ++pv; 1368 *pc1 = mul1; 1369 *pc2 = mul2; 1370 1371 nz_tmp = bi[prow+1] - bd[prow] - 1; 1372 for (j=0; j<nz_tmp; j++) { 1373 tmp = pv[j]; 1374 idx = pj[j]; 1375 rtmp11[idx] -= mul1 * tmp; 1376 rtmp22[idx] -= mul2 * tmp; 1377 } 1378 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1379 } 1380 prow = *bjtmp++ ; 1381 } 1382 1383 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1384 pc1 = rtmp11 + prow; 1385 pc2 = rtmp22 + prow; 1386 1387 sctx.pv = *pc1; 1388 pj = bj + bi[prow]; 1389 rs = 0.0; 1390 for (j=0; j<nz; j++){ 1391 idx = pj[j]; 1392 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1393 } 1394 sctx.rs = rs; 1395 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1396 if (newshift == 1) goto endofwhile; 1397 1398 if (*pc2 != 0.0){ 1399 pj = nbj + bd[prow]; 1400 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1401 *pc2 = mul2; 1402 nz_tmp = bi[prow+1] - bd[prow] - 1; 1403 for (j=0; j<nz_tmp; j++) { 1404 idx = pj[j] ; 1405 tmp = rtmp11[idx]; 1406 rtmp22[idx] -= mul2 * tmp; 1407 } 1408 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1409 } 1410 1411 pj = bj + bi[row]; 1412 pc1 = ba + bi[row]; 1413 pc2 = ba + bi[row+1]; 1414 1415 sctx.pv = rtmp22[row+1]; 1416 rs = 0.0; 1417 rtmp11[row] = 1.0/rtmp11[row]; 1418 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1419 /* copy row entries from dense representation to sparse */ 1420 for (j=0; j<nz; j++) { 1421 idx = pj[j]; 1422 pc1[j] = rtmp11[idx]; 1423 pc2[j] = rtmp22[idx]; 1424 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1425 } 1426 sctx.rs = rs; 1427 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1428 if (newshift == 1) goto endofwhile; 1429 break; 1430 1431 case 3: 1432 for (j=0; j<nz; j++) { 1433 idx = bjtmp[j]; 1434 rtmp11[idx] = 0.0; 1435 rtmp22[idx] = 0.0; 1436 rtmp33[idx] = 0.0; 1437 } 1438 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1439 idx = r[row]; 1440 nz_tmp = ai[idx+1] - ai[idx]; 1441 ajtmp = aj + ai[idx]; 1442 v1 = aa + ai[idx]; 1443 v2 = aa + ai[idx+1]; 1444 v3 = aa + ai[idx+2]; 1445 for (j=0; j<nz_tmp; j++) { 1446 idx = ics[ajtmp[j]]; 1447 rtmp11[idx] = v1[j]; 1448 rtmp22[idx] = v2[j]; 1449 rtmp33[idx] = v3[j]; 1450 } 1451 rtmp11[ics[r[row]]] += sctx.shift_amount; 1452 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1453 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1454 1455 /* loop over all pivot row blocks above this row block */ 1456 prow = *bjtmp++ ; 1457 while (prow < row) { 1458 pc1 = rtmp11 + prow; 1459 pc2 = rtmp22 + prow; 1460 pc3 = rtmp33 + prow; 1461 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1462 pv = ba + bd[prow]; 1463 pj = nbj + bd[prow]; 1464 mul1 = *pc1 * *pv; 1465 mul2 = *pc2 * *pv; 1466 mul3 = *pc3 * *pv; 1467 ++pv; 1468 *pc1 = mul1; 1469 *pc2 = mul2; 1470 *pc3 = mul3; 1471 1472 nz_tmp = bi[prow+1] - bd[prow] - 1; 1473 /* update this row based on pivot row */ 1474 for (j=0; j<nz_tmp; j++) { 1475 tmp = pv[j]; 1476 idx = pj[j]; 1477 rtmp11[idx] -= mul1 * tmp; 1478 rtmp22[idx] -= mul2 * tmp; 1479 rtmp33[idx] -= mul3 * tmp; 1480 } 1481 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1482 } 1483 prow = *bjtmp++ ; 1484 } 1485 1486 /* Now take care of diagonal 3x3 block in this set of rows */ 1487 /* note: prow = row here */ 1488 pc1 = rtmp11 + prow; 1489 pc2 = rtmp22 + prow; 1490 pc3 = rtmp33 + prow; 1491 1492 sctx.pv = *pc1; 1493 pj = bj + bi[prow]; 1494 rs = 0.0; 1495 for (j=0; j<nz; j++){ 1496 idx = pj[j]; 1497 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 1498 } 1499 sctx.rs = rs; 1500 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1501 if (newshift == 1) goto endofwhile; 1502 1503 if (*pc2 != 0.0 || *pc3 != 0.0){ 1504 mul2 = (*pc2)/(*pc1); 1505 mul3 = (*pc3)/(*pc1); 1506 *pc2 = mul2; 1507 *pc3 = mul3; 1508 nz_tmp = bi[prow+1] - bd[prow] - 1; 1509 pj = nbj + bd[prow]; 1510 for (j=0; j<nz_tmp; j++) { 1511 idx = pj[j] ; 1512 tmp = rtmp11[idx]; 1513 rtmp22[idx] -= mul2 * tmp; 1514 rtmp33[idx] -= mul3 * tmp; 1515 } 1516 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1517 } 1518 ++prow; 1519 1520 pc2 = rtmp22 + prow; 1521 pc3 = rtmp33 + prow; 1522 sctx.pv = *pc2; 1523 pj = bj + bi[prow]; 1524 rs = 0.0; 1525 for (j=0; j<nz; j++){ 1526 idx = pj[j]; 1527 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 1528 } 1529 sctx.rs = rs; 1530 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1531 if (newshift == 1) goto endofwhile; 1532 1533 if (*pc3 != 0.0){ 1534 mul3 = (*pc3)/(*pc2); 1535 *pc3 = mul3; 1536 pj = nbj + bd[prow]; 1537 nz_tmp = bi[prow+1] - bd[prow] - 1; 1538 for (j=0; j<nz_tmp; j++) { 1539 idx = pj[j] ; 1540 tmp = rtmp22[idx]; 1541 rtmp33[idx] -= mul3 * tmp; 1542 } 1543 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1544 } 1545 1546 pj = bj + bi[row]; 1547 pc1 = ba + bi[row]; 1548 pc2 = ba + bi[row+1]; 1549 pc3 = ba + bi[row+2]; 1550 1551 sctx.pv = rtmp33[row+2]; 1552 rs = 0.0; 1553 rtmp11[row] = 1.0/rtmp11[row]; 1554 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1555 rtmp33[row+2] = 1.0/rtmp33[row+2]; 1556 /* copy row entries from dense representation to sparse */ 1557 for (j=0; j<nz; j++) { 1558 idx = pj[j]; 1559 pc1[j] = rtmp11[idx]; 1560 pc2[j] = rtmp22[idx]; 1561 pc3[j] = rtmp33[idx]; 1562 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1563 } 1564 1565 sctx.rs = rs; 1566 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1567 if (newshift == 1) goto endofwhile; 1568 break; 1569 1570 default: 1571 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1572 } 1573 row += nodesz; /* Update the row */ 1574 } 1575 endofwhile:; 1576 } while (sctx.lushift); 1577 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 1578 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1579 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1580 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1581 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1582 (B)->ops->solve = MatSolve_Inode; 1583 /* do not set solve add, since MatSolve_Inode + Add is faster */ 1584 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1585 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1586 C->assembled = PETSC_TRUE; 1587 C->preallocated = PETSC_TRUE; 1588 if (sctx.nshift) { 1589 if (info->shiftnz) { 1590 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1591 } else if (info->shiftpd) { 1592 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1593 } 1594 } 1595 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1596 PetscFunctionReturn(0); 1597 } 1598 1599 /* 1600 Makes a longer coloring[] array and calls the usual code with that 1601 */ 1602 #undef __FUNCT__ 1603 #define __FUNCT__ "MatColoringPatch_Inode" 1604 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1605 { 1606 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1607 PetscErrorCode ierr; 1608 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1609 PetscInt *colorused,i; 1610 ISColoringValue *newcolor; 1611 1612 PetscFunctionBegin; 1613 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1614 /* loop over inodes, marking a color for each column*/ 1615 row = 0; 1616 for (i=0; i<m; i++){ 1617 for (j=0; j<ns[i]; j++) { 1618 newcolor[row++] = coloring[i] + j*ncolors; 1619 } 1620 } 1621 1622 /* eliminate unneeded colors */ 1623 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1624 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1625 for (i=0; i<n; i++) { 1626 colorused[newcolor[i]] = 1; 1627 } 1628 1629 for (i=1; i<5*ncolors; i++) { 1630 colorused[i] += colorused[i-1]; 1631 } 1632 ncolors = colorused[5*ncolors-1]; 1633 for (i=0; i<n; i++) { 1634 newcolor[i] = colorused[newcolor[i]]-1; 1635 } 1636 ierr = PetscFree(colorused);CHKERRQ(ierr); 1637 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1638 ierr = PetscFree(coloring);CHKERRQ(ierr); 1639 PetscFunctionReturn(0); 1640 } 1641 1642 #include "src/inline/ilu.h" 1643 1644 #undef __FUNCT__ 1645 #define __FUNCT__ "MatRelax_Inode" 1646 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1647 { 1648 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1649 PetscScalar *x,*xs,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1650 MatScalar *ibdiag,*bdiag; 1651 PetscScalar *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1652 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1653 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1654 PetscErrorCode ierr; 1655 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1656 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1657 1658 PetscFunctionBegin; 1659 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1660 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1661 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode"); 1662 1663 if (!a->inode.ibdiagvalid) { 1664 if (!a->inode.ibdiag) { 1665 /* calculate space needed for diagonal blocks */ 1666 for (i=0; i<m; i++) { 1667 cnt += sizes[i]*sizes[i]; 1668 } 1669 a->inode.bdiagsize = cnt; 1670 ierr = PetscMalloc2(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag);CHKERRQ(ierr); 1671 } 1672 1673 /* copy over the diagonal blocks and invert them */ 1674 ibdiag = a->inode.ibdiag; 1675 bdiag = a->inode.bdiag; 1676 cnt = 0; 1677 for (i=0, row = 0; i<m; i++) { 1678 for (j=0; j<sizes[i]; j++) { 1679 for (k=0; k<sizes[i]; k++) { 1680 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1681 } 1682 } 1683 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1684 1685 switch(sizes[i]) { 1686 case 1: 1687 /* Create matrix data structure */ 1688 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1689 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1690 break; 1691 case 2: 1692 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1693 break; 1694 case 3: 1695 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1696 break; 1697 case 4: 1698 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1699 break; 1700 case 5: 1701 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1702 break; 1703 default: 1704 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1705 } 1706 cnt += sizes[i]*sizes[i]; 1707 row += sizes[i]; 1708 } 1709 a->inode.ibdiagvalid = PETSC_TRUE; 1710 } 1711 ibdiag = a->inode.ibdiag; 1712 bdiag = a->inode.bdiag; 1713 1714 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1715 if (xx != bb) { 1716 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1717 } else { 1718 b = x; 1719 } 1720 1721 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1722 xs = x; 1723 if (flag & SOR_ZERO_INITIAL_GUESS) { 1724 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1725 1726 for (i=0, row=0; i<m; i++) { 1727 sz = diag[row] - ii[row]; 1728 v1 = a->a + ii[row]; 1729 idx = a->j + ii[row]; 1730 1731 /* see comments for MatMult_Inode() for how this is coded */ 1732 switch (sizes[i]){ 1733 case 1: 1734 1735 sum1 = b[row]; 1736 for(n = 0; n<sz-1; n+=2) { 1737 i1 = idx[0]; 1738 i2 = idx[1]; 1739 idx += 2; 1740 tmp0 = x[i1]; 1741 tmp1 = x[i2]; 1742 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1743 } 1744 1745 if (n == sz-1){ 1746 tmp0 = x[*idx]; 1747 sum1 -= *v1 * tmp0; 1748 } 1749 x[row++] = sum1*(*ibdiag++); 1750 break; 1751 case 2: 1752 v2 = a->a + ii[row+1]; 1753 sum1 = b[row]; 1754 sum2 = b[row+1]; 1755 for(n = 0; n<sz-1; n+=2) { 1756 i1 = idx[0]; 1757 i2 = idx[1]; 1758 idx += 2; 1759 tmp0 = x[i1]; 1760 tmp1 = x[i2]; 1761 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1762 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1763 } 1764 1765 if (n == sz-1){ 1766 tmp0 = x[*idx]; 1767 sum1 -= v1[0] * tmp0; 1768 sum2 -= v2[0] * tmp0; 1769 } 1770 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1771 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1772 ibdiag += 4; 1773 break; 1774 case 3: 1775 v2 = a->a + ii[row+1]; 1776 v3 = a->a + ii[row+2]; 1777 sum1 = b[row]; 1778 sum2 = b[row+1]; 1779 sum3 = b[row+2]; 1780 for(n = 0; n<sz-1; n+=2) { 1781 i1 = idx[0]; 1782 i2 = idx[1]; 1783 idx += 2; 1784 tmp0 = x[i1]; 1785 tmp1 = x[i2]; 1786 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1787 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1788 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1789 } 1790 1791 if (n == sz-1){ 1792 tmp0 = x[*idx]; 1793 sum1 -= v1[0] * tmp0; 1794 sum2 -= v2[0] * tmp0; 1795 sum3 -= v3[0] * tmp0; 1796 } 1797 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1798 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1799 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1800 ibdiag += 9; 1801 break; 1802 case 4: 1803 v2 = a->a + ii[row+1]; 1804 v3 = a->a + ii[row+2]; 1805 v4 = a->a + ii[row+3]; 1806 sum1 = b[row]; 1807 sum2 = b[row+1]; 1808 sum3 = b[row+2]; 1809 sum4 = b[row+3]; 1810 for(n = 0; n<sz-1; n+=2) { 1811 i1 = idx[0]; 1812 i2 = idx[1]; 1813 idx += 2; 1814 tmp0 = x[i1]; 1815 tmp1 = x[i2]; 1816 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1817 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1818 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1819 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1820 } 1821 1822 if (n == sz-1){ 1823 tmp0 = x[*idx]; 1824 sum1 -= v1[0] * tmp0; 1825 sum2 -= v2[0] * tmp0; 1826 sum3 -= v3[0] * tmp0; 1827 sum4 -= v4[0] * tmp0; 1828 } 1829 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1830 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1831 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1832 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1833 ibdiag += 16; 1834 break; 1835 case 5: 1836 v2 = a->a + ii[row+1]; 1837 v3 = a->a + ii[row+2]; 1838 v4 = a->a + ii[row+3]; 1839 v5 = a->a + ii[row+4]; 1840 sum1 = b[row]; 1841 sum2 = b[row+1]; 1842 sum3 = b[row+2]; 1843 sum4 = b[row+3]; 1844 sum5 = b[row+4]; 1845 for(n = 0; n<sz-1; n+=2) { 1846 i1 = idx[0]; 1847 i2 = idx[1]; 1848 idx += 2; 1849 tmp0 = x[i1]; 1850 tmp1 = x[i2]; 1851 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1852 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1853 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1854 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1855 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1856 } 1857 1858 if (n == sz-1){ 1859 tmp0 = x[*idx]; 1860 sum1 -= v1[0] * tmp0; 1861 sum2 -= v2[0] * tmp0; 1862 sum3 -= v3[0] * tmp0; 1863 sum4 -= v4[0] * tmp0; 1864 sum5 -= v5[0] * tmp0; 1865 } 1866 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1867 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1868 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1869 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1870 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1871 ibdiag += 25; 1872 break; 1873 default: 1874 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1875 } 1876 } 1877 1878 xb = x; 1879 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1880 } else xb = b; 1881 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1882 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1883 cnt = 0; 1884 for (i=0, row=0; i<m; i++) { 1885 1886 switch (sizes[i]){ 1887 case 1: 1888 x[row++] *= bdiag[cnt++]; 1889 break; 1890 case 2: 1891 x1 = x[row]; x2 = x[row+1]; 1892 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1893 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1894 x[row++] = tmp1; 1895 x[row++] = tmp2; 1896 cnt += 4; 1897 break; 1898 case 3: 1899 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1900 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1901 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1902 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1903 x[row++] = tmp1; 1904 x[row++] = tmp2; 1905 x[row++] = tmp3; 1906 cnt += 9; 1907 break; 1908 case 4: 1909 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1910 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1911 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1912 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1913 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1914 x[row++] = tmp1; 1915 x[row++] = tmp2; 1916 x[row++] = tmp3; 1917 x[row++] = tmp4; 1918 cnt += 16; 1919 break; 1920 case 5: 1921 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1922 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1923 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1924 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1925 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1926 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1927 x[row++] = tmp1; 1928 x[row++] = tmp2; 1929 x[row++] = tmp3; 1930 x[row++] = tmp4; 1931 x[row++] = tmp5; 1932 cnt += 25; 1933 break; 1934 default: 1935 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1936 } 1937 } 1938 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1939 } 1940 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1941 1942 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1943 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1944 ibdiag -= sizes[i]*sizes[i]; 1945 sz = ii[row+1] - diag[row] - 1; 1946 v1 = a->a + diag[row] + 1; 1947 idx = a->j + diag[row] + 1; 1948 1949 /* see comments for MatMult_Inode() for how this is coded */ 1950 switch (sizes[i]){ 1951 case 1: 1952 1953 sum1 = xb[row]; 1954 for(n = 0; n<sz-1; n+=2) { 1955 i1 = idx[0]; 1956 i2 = idx[1]; 1957 idx += 2; 1958 tmp0 = x[i1]; 1959 tmp1 = x[i2]; 1960 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1961 } 1962 1963 if (n == sz-1){ 1964 tmp0 = x[*idx]; 1965 sum1 -= *v1*tmp0; 1966 } 1967 x[row--] = sum1*(*ibdiag); 1968 break; 1969 1970 case 2: 1971 1972 sum1 = xb[row]; 1973 sum2 = xb[row-1]; 1974 /* note that sum1 is associated with the second of the two rows */ 1975 v2 = a->a + diag[row-1] + 2; 1976 for(n = 0; n<sz-1; n+=2) { 1977 i1 = idx[0]; 1978 i2 = idx[1]; 1979 idx += 2; 1980 tmp0 = x[i1]; 1981 tmp1 = x[i2]; 1982 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1983 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1984 } 1985 1986 if (n == sz-1){ 1987 tmp0 = x[*idx]; 1988 sum1 -= *v1*tmp0; 1989 sum2 -= *v2*tmp0; 1990 } 1991 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1992 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1993 break; 1994 case 3: 1995 1996 sum1 = xb[row]; 1997 sum2 = xb[row-1]; 1998 sum3 = xb[row-2]; 1999 v2 = a->a + diag[row-1] + 2; 2000 v3 = a->a + diag[row-2] + 3; 2001 for(n = 0; n<sz-1; n+=2) { 2002 i1 = idx[0]; 2003 i2 = idx[1]; 2004 idx += 2; 2005 tmp0 = x[i1]; 2006 tmp1 = x[i2]; 2007 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2008 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2009 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2010 } 2011 2012 if (n == sz-1){ 2013 tmp0 = x[*idx]; 2014 sum1 -= *v1*tmp0; 2015 sum2 -= *v2*tmp0; 2016 sum3 -= *v3*tmp0; 2017 } 2018 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2019 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2020 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2021 break; 2022 case 4: 2023 2024 sum1 = xb[row]; 2025 sum2 = xb[row-1]; 2026 sum3 = xb[row-2]; 2027 sum4 = xb[row-3]; 2028 v2 = a->a + diag[row-1] + 2; 2029 v3 = a->a + diag[row-2] + 3; 2030 v4 = a->a + diag[row-3] + 4; 2031 for(n = 0; n<sz-1; n+=2) { 2032 i1 = idx[0]; 2033 i2 = idx[1]; 2034 idx += 2; 2035 tmp0 = x[i1]; 2036 tmp1 = x[i2]; 2037 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2038 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2039 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2040 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2041 } 2042 2043 if (n == sz-1){ 2044 tmp0 = x[*idx]; 2045 sum1 -= *v1*tmp0; 2046 sum2 -= *v2*tmp0; 2047 sum3 -= *v3*tmp0; 2048 sum4 -= *v4*tmp0; 2049 } 2050 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2051 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2052 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2053 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2054 break; 2055 case 5: 2056 2057 sum1 = xb[row]; 2058 sum2 = xb[row-1]; 2059 sum3 = xb[row-2]; 2060 sum4 = xb[row-3]; 2061 sum5 = xb[row-4]; 2062 v2 = a->a + diag[row-1] + 2; 2063 v3 = a->a + diag[row-2] + 3; 2064 v4 = a->a + diag[row-3] + 4; 2065 v5 = a->a + diag[row-4] + 5; 2066 for(n = 0; n<sz-1; n+=2) { 2067 i1 = idx[0]; 2068 i2 = idx[1]; 2069 idx += 2; 2070 tmp0 = x[i1]; 2071 tmp1 = x[i2]; 2072 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2073 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2074 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2075 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2076 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2077 } 2078 2079 if (n == sz-1){ 2080 tmp0 = x[*idx]; 2081 sum1 -= *v1*tmp0; 2082 sum2 -= *v2*tmp0; 2083 sum3 -= *v3*tmp0; 2084 sum4 -= *v4*tmp0; 2085 sum5 -= *v5*tmp0; 2086 } 2087 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2088 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2089 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2090 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2091 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2092 break; 2093 default: 2094 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2095 } 2096 } 2097 2098 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2099 } 2100 its--; 2101 } 2102 if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode"); 2103 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2104 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2105 PetscFunctionReturn(0); 2106 } 2107 2108 2109 /* 2110 samestructure indicates that the matrix has not changed its nonzero structure so we 2111 do not need to recompute the inodes 2112 */ 2113 #undef __FUNCT__ 2114 #define __FUNCT__ "Mat_CheckInode" 2115 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2116 { 2117 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2118 PetscErrorCode ierr; 2119 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2120 PetscTruth flag; 2121 2122 PetscFunctionBegin; 2123 if (!a->inode.use) PetscFunctionReturn(0); 2124 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2125 2126 2127 m = A->rmap->n; 2128 if (a->inode.size) {ns = a->inode.size;} 2129 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2130 2131 i = 0; 2132 node_count = 0; 2133 idx = a->j; 2134 ii = a->i; 2135 while (i < m){ /* For each row */ 2136 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2137 /* Limits the number of elements in a node to 'a->inode.limit' */ 2138 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2139 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2140 if (nzy != nzx) break; 2141 idy += nzx; /* Same nonzero pattern */ 2142 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2143 if (!flag) break; 2144 } 2145 ns[node_count++] = blk_size; 2146 idx += blk_size*nzx; 2147 i = j; 2148 } 2149 /* If not enough inodes found,, do not use inode version of the routines */ 2150 if (!a->inode.size && m && node_count > .9*m) { 2151 ierr = PetscFree(ns);CHKERRQ(ierr); 2152 a->inode.node_count = 0; 2153 a->inode.size = PETSC_NULL; 2154 a->inode.use = PETSC_FALSE; 2155 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2156 } else { 2157 A->ops->mult = MatMult_Inode; 2158 A->ops->relax = MatRelax_Inode; 2159 A->ops->multadd = MatMultAdd_Inode; 2160 A->ops->getrowij = MatGetRowIJ_Inode; 2161 A->ops->restorerowij = MatRestoreRowIJ_Inode; 2162 A->ops->getcolumnij = MatGetColumnIJ_Inode; 2163 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 2164 A->ops->coloringpatch = MatColoringPatch_Inode; 2165 a->inode.node_count = node_count; 2166 a->inode.size = ns; 2167 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2168 } 2169 PetscFunctionReturn(0); 2170 } 2171 2172 /* 2173 This is really ugly. if inodes are used this replaces the 2174 permutations with ones that correspond to rows/cols of the matrix 2175 rather then inode blocks 2176 */ 2177 #undef __FUNCT__ 2178 #define __FUNCT__ "MatInodeAdjustForInodes" 2179 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2180 { 2181 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2182 2183 PetscFunctionBegin; 2184 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2185 if (f) { 2186 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2187 } 2188 PetscFunctionReturn(0); 2189 } 2190 2191 EXTERN_C_BEGIN 2192 #undef __FUNCT__ 2193 #define __FUNCT__ "MatAdjustForInodes_Inode" 2194 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 2195 { 2196 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2197 PetscErrorCode ierr; 2198 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count; 2199 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2200 PetscInt nslim_col,*ns_col; 2201 IS ris = *rperm,cis = *cperm; 2202 2203 PetscFunctionBegin; 2204 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2205 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2206 2207 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2208 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2209 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2210 permc = permr + m; 2211 2212 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2213 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2214 2215 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2216 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2217 2218 /* Construct the permutations for rows*/ 2219 for (i=0,row = 0; i<nslim_row; ++i){ 2220 indx = ridx[i]; 2221 start_val = tns[indx]; 2222 end_val = tns[indx + 1]; 2223 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2224 } 2225 2226 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2227 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2228 2229 /* Construct permutations for columns */ 2230 for (i=0,col=0; i<nslim_col; ++i){ 2231 indx = cidx[i]; 2232 start_val = tns[indx]; 2233 end_val = tns[indx + 1]; 2234 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2235 } 2236 2237 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2238 ISSetPermutation(*rperm); 2239 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2240 ISSetPermutation(*cperm); 2241 2242 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2243 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2244 2245 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2246 ierr = PetscFree(permr);CHKERRQ(ierr); 2247 ierr = ISDestroy(cis);CHKERRQ(ierr); 2248 ierr = ISDestroy(ris);CHKERRQ(ierr); 2249 ierr = PetscFree(tns);CHKERRQ(ierr); 2250 PetscFunctionReturn(0); 2251 } 2252 EXTERN_C_END 2253 2254 #undef __FUNCT__ 2255 #define __FUNCT__ "MatInodeGetInodeSizes" 2256 /*@C 2257 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2258 2259 Collective on Mat 2260 2261 Input Parameter: 2262 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2263 2264 Output Parameter: 2265 + node_count - no of inodes present in the matrix. 2266 . sizes - an array of size node_count,with sizes of each inode. 2267 - limit - the max size used to generate the inodes. 2268 2269 Level: advanced 2270 2271 Notes: This routine returns some internal storage information 2272 of the matrix, it is intended to be used by advanced users. 2273 It should be called after the matrix is assembled. 2274 The contents of the sizes[] array should not be changed. 2275 PETSC_NULL may be passed for information not requested. 2276 2277 .keywords: matrix, seqaij, get, inode 2278 2279 .seealso: MatGetInfo() 2280 @*/ 2281 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2282 { 2283 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2284 2285 PetscFunctionBegin; 2286 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2287 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2288 if (f) { 2289 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2290 } 2291 PetscFunctionReturn(0); 2292 } 2293 2294 EXTERN_C_BEGIN 2295 #undef __FUNCT__ 2296 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 2297 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2298 { 2299 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2300 2301 PetscFunctionBegin; 2302 if (node_count) *node_count = a->inode.node_count; 2303 if (sizes) *sizes = a->inode.size; 2304 if (limit) *limit = a->inode.limit; 2305 PetscFunctionReturn(0); 2306 } 2307 EXTERN_C_END 2308