1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap.n; 19 m = A->rmap.n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap.n; 72 n = A->cmap.n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap.n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap.n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_Inode" 397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x,*v1,*v2,*v3,*v4,*v5; 403 PetscErrorCode ierr; 404 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 405 406 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 407 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 408 #endif 409 410 PetscFunctionBegin; 411 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 412 node_max = a->inode.node_count; 413 ns = a->inode.size; /* Node Size array */ 414 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 415 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 416 idx = a->j; 417 v1 = a->a; 418 ii = a->i; 419 420 for (i = 0,row = 0; i< node_max; ++i){ 421 nsz = ns[i]; 422 n = ii[1] - ii[0]; 423 nonzerorow += (n>0)*nsz; 424 ii += nsz; 425 sz = n; /* No of non zeros in this row */ 426 /* Switch on the size of Node */ 427 switch (nsz){ /* Each loop in 'case' is unrolled */ 428 case 1 : 429 sum1 = 0; 430 431 for(n = 0; n< sz-1; n+=2) { 432 i1 = idx[0]; /* The instructions are ordered to */ 433 i2 = idx[1]; /* make the compiler's job easy */ 434 idx += 2; 435 tmp0 = x[i1]; 436 tmp1 = x[i2]; 437 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 438 } 439 440 if (n == sz-1){ /* Take care of the last nonzero */ 441 tmp0 = x[*idx++]; 442 sum1 += *v1++ * tmp0; 443 } 444 y[row++]=sum1; 445 break; 446 case 2: 447 sum1 = 0; 448 sum2 = 0; 449 v2 = v1 + n; 450 451 for (n = 0; n< sz-1; n+=2) { 452 i1 = idx[0]; 453 i2 = idx[1]; 454 idx += 2; 455 tmp0 = x[i1]; 456 tmp1 = x[i2]; 457 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 458 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 459 } 460 if (n == sz-1){ 461 tmp0 = x[*idx++]; 462 sum1 += *v1++ * tmp0; 463 sum2 += *v2++ * tmp0; 464 } 465 y[row++]=sum1; 466 y[row++]=sum2; 467 v1 =v2; /* Since the next block to be processed starts there*/ 468 idx +=sz; 469 break; 470 case 3: 471 sum1 = 0; 472 sum2 = 0; 473 sum3 = 0; 474 v2 = v1 + n; 475 v3 = v2 + n; 476 477 for (n = 0; n< sz-1; n+=2) { 478 i1 = idx[0]; 479 i2 = idx[1]; 480 idx += 2; 481 tmp0 = x[i1]; 482 tmp1 = x[i2]; 483 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 484 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 485 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 486 } 487 if (n == sz-1){ 488 tmp0 = x[*idx++]; 489 sum1 += *v1++ * tmp0; 490 sum2 += *v2++ * tmp0; 491 sum3 += *v3++ * tmp0; 492 } 493 y[row++]=sum1; 494 y[row++]=sum2; 495 y[row++]=sum3; 496 v1 =v3; /* Since the next block to be processed starts there*/ 497 idx +=2*sz; 498 break; 499 case 4: 500 sum1 = 0; 501 sum2 = 0; 502 sum3 = 0; 503 sum4 = 0; 504 v2 = v1 + n; 505 v3 = v2 + n; 506 v4 = v3 + n; 507 508 for (n = 0; n< sz-1; n+=2) { 509 i1 = idx[0]; 510 i2 = idx[1]; 511 idx += 2; 512 tmp0 = x[i1]; 513 tmp1 = x[i2]; 514 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 515 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 516 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 517 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 518 } 519 if (n == sz-1){ 520 tmp0 = x[*idx++]; 521 sum1 += *v1++ * tmp0; 522 sum2 += *v2++ * tmp0; 523 sum3 += *v3++ * tmp0; 524 sum4 += *v4++ * tmp0; 525 } 526 y[row++]=sum1; 527 y[row++]=sum2; 528 y[row++]=sum3; 529 y[row++]=sum4; 530 v1 =v4; /* Since the next block to be processed starts there*/ 531 idx +=3*sz; 532 break; 533 case 5: 534 sum1 = 0; 535 sum2 = 0; 536 sum3 = 0; 537 sum4 = 0; 538 sum5 = 0; 539 v2 = v1 + n; 540 v3 = v2 + n; 541 v4 = v3 + n; 542 v5 = v4 + n; 543 544 for (n = 0; n<sz-1; n+=2) { 545 i1 = idx[0]; 546 i2 = idx[1]; 547 idx += 2; 548 tmp0 = x[i1]; 549 tmp1 = x[i2]; 550 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 551 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 552 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 553 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 554 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 555 } 556 if (n == sz-1){ 557 tmp0 = x[*idx++]; 558 sum1 += *v1++ * tmp0; 559 sum2 += *v2++ * tmp0; 560 sum3 += *v3++ * tmp0; 561 sum4 += *v4++ * tmp0; 562 sum5 += *v5++ * tmp0; 563 } 564 y[row++]=sum1; 565 y[row++]=sum2; 566 y[row++]=sum3; 567 y[row++]=sum4; 568 y[row++]=sum5; 569 v1 =v5; /* Since the next block to be processed starts there */ 570 idx +=4*sz; 571 break; 572 default : 573 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 574 } 575 } 576 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 577 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 578 ierr = PetscLogFlops(2*a->nz - nonzerorow);CHKERRQ(ierr); 579 PetscFunctionReturn(0); 580 } 581 /* ----------------------------------------------------------- */ 582 /* Almost same code as the MatMult_Inode() */ 583 #undef __FUNCT__ 584 #define __FUNCT__ "MatMultAdd_Inode" 585 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 586 { 587 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 588 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 589 PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt; 590 PetscErrorCode ierr; 591 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 592 593 PetscFunctionBegin; 594 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 595 node_max = a->inode.node_count; 596 ns = a->inode.size; /* Node Size array */ 597 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 598 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 599 if (zz != yy) { 600 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 601 } else { 602 z = y; 603 } 604 zt = z; 605 606 idx = a->j; 607 v1 = a->a; 608 ii = a->i; 609 610 for (i = 0,row = 0; i< node_max; ++i){ 611 nsz = ns[i]; 612 n = ii[1] - ii[0]; 613 ii += nsz; 614 sz = n; /* No of non zeros in this row */ 615 /* Switch on the size of Node */ 616 switch (nsz){ /* Each loop in 'case' is unrolled */ 617 case 1 : 618 sum1 = *zt++; 619 620 for(n = 0; n< sz-1; n+=2) { 621 i1 = idx[0]; /* The instructions are ordered to */ 622 i2 = idx[1]; /* make the compiler's job easy */ 623 idx += 2; 624 tmp0 = x[i1]; 625 tmp1 = x[i2]; 626 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 627 } 628 629 if(n == sz-1){ /* Take care of the last nonzero */ 630 tmp0 = x[*idx++]; 631 sum1 += *v1++ * tmp0; 632 } 633 y[row++]=sum1; 634 break; 635 case 2: 636 sum1 = *zt++; 637 sum2 = *zt++; 638 v2 = v1 + n; 639 640 for(n = 0; n< sz-1; n+=2) { 641 i1 = idx[0]; 642 i2 = idx[1]; 643 idx += 2; 644 tmp0 = x[i1]; 645 tmp1 = x[i2]; 646 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 647 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 648 } 649 if(n == sz-1){ 650 tmp0 = x[*idx++]; 651 sum1 += *v1++ * tmp0; 652 sum2 += *v2++ * tmp0; 653 } 654 y[row++]=sum1; 655 y[row++]=sum2; 656 v1 =v2; /* Since the next block to be processed starts there*/ 657 idx +=sz; 658 break; 659 case 3: 660 sum1 = *zt++; 661 sum2 = *zt++; 662 sum3 = *zt++; 663 v2 = v1 + n; 664 v3 = v2 + n; 665 666 for (n = 0; n< sz-1; n+=2) { 667 i1 = idx[0]; 668 i2 = idx[1]; 669 idx += 2; 670 tmp0 = x[i1]; 671 tmp1 = x[i2]; 672 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 673 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 674 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 675 } 676 if (n == sz-1){ 677 tmp0 = x[*idx++]; 678 sum1 += *v1++ * tmp0; 679 sum2 += *v2++ * tmp0; 680 sum3 += *v3++ * tmp0; 681 } 682 y[row++]=sum1; 683 y[row++]=sum2; 684 y[row++]=sum3; 685 v1 =v3; /* Since the next block to be processed starts there*/ 686 idx +=2*sz; 687 break; 688 case 4: 689 sum1 = *zt++; 690 sum2 = *zt++; 691 sum3 = *zt++; 692 sum4 = *zt++; 693 v2 = v1 + n; 694 v3 = v2 + n; 695 v4 = v3 + n; 696 697 for (n = 0; n< sz-1; n+=2) { 698 i1 = idx[0]; 699 i2 = idx[1]; 700 idx += 2; 701 tmp0 = x[i1]; 702 tmp1 = x[i2]; 703 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 704 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 705 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 706 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 707 } 708 if (n == sz-1){ 709 tmp0 = x[*idx++]; 710 sum1 += *v1++ * tmp0; 711 sum2 += *v2++ * tmp0; 712 sum3 += *v3++ * tmp0; 713 sum4 += *v4++ * tmp0; 714 } 715 y[row++]=sum1; 716 y[row++]=sum2; 717 y[row++]=sum3; 718 y[row++]=sum4; 719 v1 =v4; /* Since the next block to be processed starts there*/ 720 idx +=3*sz; 721 break; 722 case 5: 723 sum1 = *zt++; 724 sum2 = *zt++; 725 sum3 = *zt++; 726 sum4 = *zt++; 727 sum5 = *zt++; 728 v2 = v1 + n; 729 v3 = v2 + n; 730 v4 = v3 + n; 731 v5 = v4 + n; 732 733 for (n = 0; n<sz-1; n+=2) { 734 i1 = idx[0]; 735 i2 = idx[1]; 736 idx += 2; 737 tmp0 = x[i1]; 738 tmp1 = x[i2]; 739 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 740 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 741 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 742 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 743 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 744 } 745 if(n == sz-1){ 746 tmp0 = x[*idx++]; 747 sum1 += *v1++ * tmp0; 748 sum2 += *v2++ * tmp0; 749 sum3 += *v3++ * tmp0; 750 sum4 += *v4++ * tmp0; 751 sum5 += *v5++ * tmp0; 752 } 753 y[row++]=sum1; 754 y[row++]=sum2; 755 y[row++]=sum3; 756 y[row++]=sum4; 757 y[row++]=sum5; 758 v1 =v5; /* Since the next block to be processed starts there */ 759 idx +=4*sz; 760 break; 761 default : 762 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 763 } 764 } 765 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 766 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 767 if (zz != yy) { 768 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 769 } 770 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 771 PetscFunctionReturn(0); 772 } 773 774 /* ----------------------------------------------------------- */ 775 #undef __FUNCT__ 776 #define __FUNCT__ "MatSolve_Inode" 777 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 778 { 779 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 780 IS iscol = a->col,isrow = a->row; 781 PetscErrorCode ierr; 782 PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j; 783 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout; 784 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 785 PetscScalar sum1,sum2,sum3,sum4,sum5; 786 const PetscScalar *v1,*v2,*v3,*v4,*v5,*b,*a_a = a->a,*aa; 787 788 PetscFunctionBegin; 789 if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix"); 790 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 791 node_max = a->inode.node_count; 792 ns = a->inode.size; /* Node Size array */ 793 794 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 795 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 796 tmp = a->solve_work; 797 798 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 799 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 800 801 /* forward solve the lower triangular */ 802 tmps = tmp ; 803 aa = a_a ; 804 aj = a_j ; 805 ad = a->diag; 806 807 for (i = 0,row = 0; i< node_max; ++i){ 808 nsz = ns[i]; 809 aii = ai[row]; 810 v1 = aa + aii; 811 vi = aj + aii; 812 nz = ad[row]- aii; 813 814 switch (nsz){ /* Each loop in 'case' is unrolled */ 815 case 1 : 816 sum1 = b[*r++]; 817 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 818 for(j=0; j<nz-1; j+=2){ 819 i0 = vi[0]; 820 i1 = vi[1]; 821 vi +=2; 822 tmp0 = tmps[i0]; 823 tmp1 = tmps[i1]; 824 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 825 } 826 if(j == nz-1){ 827 tmp0 = tmps[*vi++]; 828 sum1 -= *v1++ *tmp0; 829 } 830 tmp[row ++]=sum1; 831 break; 832 case 2: 833 sum1 = b[*r++]; 834 sum2 = b[*r++]; 835 v2 = aa + ai[row+1]; 836 837 for(j=0; j<nz-1; j+=2){ 838 i0 = vi[0]; 839 i1 = vi[1]; 840 vi +=2; 841 tmp0 = tmps[i0]; 842 tmp1 = tmps[i1]; 843 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 844 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 845 } 846 if(j == nz-1){ 847 tmp0 = tmps[*vi++]; 848 sum1 -= *v1++ *tmp0; 849 sum2 -= *v2++ *tmp0; 850 } 851 sum2 -= *v2++ * sum1; 852 tmp[row ++]=sum1; 853 tmp[row ++]=sum2; 854 break; 855 case 3: 856 sum1 = b[*r++]; 857 sum2 = b[*r++]; 858 sum3 = b[*r++]; 859 v2 = aa + ai[row+1]; 860 v3 = aa + ai[row+2]; 861 862 for (j=0; j<nz-1; j+=2){ 863 i0 = vi[0]; 864 i1 = vi[1]; 865 vi +=2; 866 tmp0 = tmps[i0]; 867 tmp1 = tmps[i1]; 868 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 869 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 870 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 871 } 872 if (j == nz-1){ 873 tmp0 = tmps[*vi++]; 874 sum1 -= *v1++ *tmp0; 875 sum2 -= *v2++ *tmp0; 876 sum3 -= *v3++ *tmp0; 877 } 878 sum2 -= *v2++ * sum1; 879 sum3 -= *v3++ * sum1; 880 sum3 -= *v3++ * sum2; 881 tmp[row ++]=sum1; 882 tmp[row ++]=sum2; 883 tmp[row ++]=sum3; 884 break; 885 886 case 4: 887 sum1 = b[*r++]; 888 sum2 = b[*r++]; 889 sum3 = b[*r++]; 890 sum4 = b[*r++]; 891 v2 = aa + ai[row+1]; 892 v3 = aa + ai[row+2]; 893 v4 = aa + ai[row+3]; 894 895 for (j=0; j<nz-1; j+=2){ 896 i0 = vi[0]; 897 i1 = vi[1]; 898 vi +=2; 899 tmp0 = tmps[i0]; 900 tmp1 = tmps[i1]; 901 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 902 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 903 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 904 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 905 } 906 if (j == nz-1){ 907 tmp0 = tmps[*vi++]; 908 sum1 -= *v1++ *tmp0; 909 sum2 -= *v2++ *tmp0; 910 sum3 -= *v3++ *tmp0; 911 sum4 -= *v4++ *tmp0; 912 } 913 sum2 -= *v2++ * sum1; 914 sum3 -= *v3++ * sum1; 915 sum4 -= *v4++ * sum1; 916 sum3 -= *v3++ * sum2; 917 sum4 -= *v4++ * sum2; 918 sum4 -= *v4++ * sum3; 919 920 tmp[row ++]=sum1; 921 tmp[row ++]=sum2; 922 tmp[row ++]=sum3; 923 tmp[row ++]=sum4; 924 break; 925 case 5: 926 sum1 = b[*r++]; 927 sum2 = b[*r++]; 928 sum3 = b[*r++]; 929 sum4 = b[*r++]; 930 sum5 = b[*r++]; 931 v2 = aa + ai[row+1]; 932 v3 = aa + ai[row+2]; 933 v4 = aa + ai[row+3]; 934 v5 = aa + ai[row+4]; 935 936 for (j=0; j<nz-1; j+=2){ 937 i0 = vi[0]; 938 i1 = vi[1]; 939 vi +=2; 940 tmp0 = tmps[i0]; 941 tmp1 = tmps[i1]; 942 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 943 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 944 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 945 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 946 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 947 } 948 if (j == nz-1){ 949 tmp0 = tmps[*vi++]; 950 sum1 -= *v1++ *tmp0; 951 sum2 -= *v2++ *tmp0; 952 sum3 -= *v3++ *tmp0; 953 sum4 -= *v4++ *tmp0; 954 sum5 -= *v5++ *tmp0; 955 } 956 957 sum2 -= *v2++ * sum1; 958 sum3 -= *v3++ * sum1; 959 sum4 -= *v4++ * sum1; 960 sum5 -= *v5++ * sum1; 961 sum3 -= *v3++ * sum2; 962 sum4 -= *v4++ * sum2; 963 sum5 -= *v5++ * sum2; 964 sum4 -= *v4++ * sum3; 965 sum5 -= *v5++ * sum3; 966 sum5 -= *v5++ * sum4; 967 968 tmp[row ++]=sum1; 969 tmp[row ++]=sum2; 970 tmp[row ++]=sum3; 971 tmp[row ++]=sum4; 972 tmp[row ++]=sum5; 973 break; 974 default: 975 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 976 } 977 } 978 /* backward solve the upper triangular */ 979 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 980 nsz = ns[i]; 981 aii = ai[row+1] -1; 982 v1 = aa + aii; 983 vi = aj + aii; 984 nz = aii- ad[row]; 985 switch (nsz){ /* Each loop in 'case' is unrolled */ 986 case 1 : 987 sum1 = tmp[row]; 988 989 for(j=nz ; j>1; j-=2){ 990 vi -=2; 991 i0 = vi[2]; 992 i1 = vi[1]; 993 tmp0 = tmps[i0]; 994 tmp1 = tmps[i1]; 995 v1 -= 2; 996 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 997 } 998 if (j==1){ 999 tmp0 = tmps[*vi--]; 1000 sum1 -= *v1-- * tmp0; 1001 } 1002 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1003 break; 1004 case 2 : 1005 sum1 = tmp[row]; 1006 sum2 = tmp[row -1]; 1007 v2 = aa + ai[row]-1; 1008 for (j=nz ; j>1; j-=2){ 1009 vi -=2; 1010 i0 = vi[2]; 1011 i1 = vi[1]; 1012 tmp0 = tmps[i0]; 1013 tmp1 = tmps[i1]; 1014 v1 -= 2; 1015 v2 -= 2; 1016 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1017 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1018 } 1019 if (j==1){ 1020 tmp0 = tmps[*vi--]; 1021 sum1 -= *v1-- * tmp0; 1022 sum2 -= *v2-- * tmp0; 1023 } 1024 1025 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1026 sum2 -= *v2-- * tmp0; 1027 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1028 break; 1029 case 3 : 1030 sum1 = tmp[row]; 1031 sum2 = tmp[row -1]; 1032 sum3 = tmp[row -2]; 1033 v2 = aa + ai[row]-1; 1034 v3 = aa + ai[row -1]-1; 1035 for (j=nz ; j>1; j-=2){ 1036 vi -=2; 1037 i0 = vi[2]; 1038 i1 = vi[1]; 1039 tmp0 = tmps[i0]; 1040 tmp1 = tmps[i1]; 1041 v1 -= 2; 1042 v2 -= 2; 1043 v3 -= 2; 1044 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1045 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1046 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1047 } 1048 if (j==1){ 1049 tmp0 = tmps[*vi--]; 1050 sum1 -= *v1-- * tmp0; 1051 sum2 -= *v2-- * tmp0; 1052 sum3 -= *v3-- * tmp0; 1053 } 1054 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1055 sum2 -= *v2-- * tmp0; 1056 sum3 -= *v3-- * tmp0; 1057 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1058 sum3 -= *v3-- * tmp0; 1059 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1060 1061 break; 1062 case 4 : 1063 sum1 = tmp[row]; 1064 sum2 = tmp[row -1]; 1065 sum3 = tmp[row -2]; 1066 sum4 = tmp[row -3]; 1067 v2 = aa + ai[row]-1; 1068 v3 = aa + ai[row -1]-1; 1069 v4 = aa + ai[row -2]-1; 1070 1071 for (j=nz ; j>1; j-=2){ 1072 vi -=2; 1073 i0 = vi[2]; 1074 i1 = vi[1]; 1075 tmp0 = tmps[i0]; 1076 tmp1 = tmps[i1]; 1077 v1 -= 2; 1078 v2 -= 2; 1079 v3 -= 2; 1080 v4 -= 2; 1081 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1082 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1083 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1084 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1085 } 1086 if (j==1){ 1087 tmp0 = tmps[*vi--]; 1088 sum1 -= *v1-- * tmp0; 1089 sum2 -= *v2-- * tmp0; 1090 sum3 -= *v3-- * tmp0; 1091 sum4 -= *v4-- * tmp0; 1092 } 1093 1094 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1095 sum2 -= *v2-- * tmp0; 1096 sum3 -= *v3-- * tmp0; 1097 sum4 -= *v4-- * tmp0; 1098 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1102 sum4 -= *v4-- * tmp0; 1103 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1104 break; 1105 case 5 : 1106 sum1 = tmp[row]; 1107 sum2 = tmp[row -1]; 1108 sum3 = tmp[row -2]; 1109 sum4 = tmp[row -3]; 1110 sum5 = tmp[row -4]; 1111 v2 = aa + ai[row]-1; 1112 v3 = aa + ai[row -1]-1; 1113 v4 = aa + ai[row -2]-1; 1114 v5 = aa + ai[row -3]-1; 1115 for (j=nz ; j>1; j-=2){ 1116 vi -= 2; 1117 i0 = vi[2]; 1118 i1 = vi[1]; 1119 tmp0 = tmps[i0]; 1120 tmp1 = tmps[i1]; 1121 v1 -= 2; 1122 v2 -= 2; 1123 v3 -= 2; 1124 v4 -= 2; 1125 v5 -= 2; 1126 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1127 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1128 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1129 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1130 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1131 } 1132 if (j==1){ 1133 tmp0 = tmps[*vi--]; 1134 sum1 -= *v1-- * tmp0; 1135 sum2 -= *v2-- * tmp0; 1136 sum3 -= *v3-- * tmp0; 1137 sum4 -= *v4-- * tmp0; 1138 sum5 -= *v5-- * tmp0; 1139 } 1140 1141 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1142 sum2 -= *v2-- * tmp0; 1143 sum3 -= *v3-- * tmp0; 1144 sum4 -= *v4-- * tmp0; 1145 sum5 -= *v5-- * tmp0; 1146 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1154 sum5 -= *v5-- * tmp0; 1155 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1156 break; 1157 default: 1158 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1159 } 1160 } 1161 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1162 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1163 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1164 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1165 ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr); 1166 PetscFunctionReturn(0); 1167 } 1168 1169 #undef __FUNCT__ 1170 #define __FUNCT__ "MatLUFactorNumeric_Inode" 1171 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B) 1172 { 1173 Mat C = *B; 1174 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1175 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1176 PetscErrorCode ierr; 1177 PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i; 1178 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1179 PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1180 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1181 PetscScalar *pc1,*pc2,*pc3,mul1,mul2,mul3; 1182 PetscScalar tmp,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1183 const PetscScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1184 PetscReal rs=0.0; 1185 LUShift_Ctx sctx; 1186 PetscInt newshift; 1187 1188 PetscFunctionBegin; 1189 sctx.shift_top = 0; 1190 sctx.nshift_max = 0; 1191 sctx.shift_lo = 0; 1192 sctx.shift_hi = 0; 1193 1194 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1195 if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0; 1196 if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1197 sctx.shift_top = 0; 1198 for (i=0; i<n; i++) { 1199 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1200 rs = 0.0; 1201 ajtmp = aj + ai[i]; 1202 rtmp1 = aa + ai[i]; 1203 nz = ai[i+1] - ai[i]; 1204 for (j=0; j<nz; j++){ 1205 if (*ajtmp != i){ 1206 rs += PetscAbsScalar(*rtmp1++); 1207 } else { 1208 rs -= PetscRealPart(*rtmp1++); 1209 } 1210 ajtmp++; 1211 } 1212 if (rs>sctx.shift_top) sctx.shift_top = rs; 1213 } 1214 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1215 sctx.shift_top *= 1.1; 1216 sctx.nshift_max = 5; 1217 sctx.shift_lo = 0.; 1218 sctx.shift_hi = 1.; 1219 } 1220 sctx.shift_amount = 0; 1221 sctx.nshift = 0; 1222 1223 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1224 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1225 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1226 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1227 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1228 ics = ic ; 1229 rtmp22 = rtmp11 + n; 1230 rtmp33 = rtmp22 + n; 1231 1232 node_max = a->inode.node_count; 1233 ns = a->inode.size ; 1234 if (!ns){ 1235 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1236 } 1237 1238 /* If max inode size > 3, split it into two inodes.*/ 1239 /* also map the inode sizes according to the ordering */ 1240 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1241 for (i=0,j=0; i<node_max; ++i,++j){ 1242 if (ns[i]>3) { 1243 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1244 ++j; 1245 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1246 } else { 1247 tmp_vec1[j] = ns[i]; 1248 } 1249 } 1250 /* Use the correct node_max */ 1251 node_max = j; 1252 1253 /* Now reorder the inode info based on mat re-ordering info */ 1254 /* First create a row -> inode_size_array_index map */ 1255 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1256 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1257 for (i=0,row=0; i<node_max; i++) { 1258 nodesz = tmp_vec1[i]; 1259 for (j=0; j<nodesz; j++,row++) { 1260 nsmap[row] = i; 1261 } 1262 } 1263 /* Using nsmap, create a reordered ns structure */ 1264 for (i=0,j=0; i< node_max; i++) { 1265 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1266 tmp_vec2[i] = nodesz; 1267 j += nodesz; 1268 } 1269 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1270 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1271 /* Now use the correct ns */ 1272 ns = tmp_vec2; 1273 1274 do { 1275 sctx.lushift = PETSC_FALSE; 1276 /* Now loop over each block-row, and do the factorization */ 1277 for (i=0,row=0; i<node_max; i++) { 1278 nodesz = ns[i]; 1279 nz = bi[row+1] - bi[row]; 1280 bjtmp = bj + bi[row]; 1281 1282 switch (nodesz){ 1283 case 1: 1284 for (j=0; j<nz; j++){ 1285 idx = bjtmp[j]; 1286 rtmp11[idx] = 0.0; 1287 } 1288 1289 /* load in initial (unfactored row) */ 1290 idx = r[row]; 1291 nz_tmp = ai[idx+1] - ai[idx]; 1292 ajtmp = aj + ai[idx]; 1293 v1 = aa + ai[idx]; 1294 1295 for (j=0; j<nz_tmp; j++) { 1296 idx = ics[ajtmp[j]]; 1297 rtmp11[idx] = v1[j]; 1298 } 1299 rtmp11[ics[r[row]]] += sctx.shift_amount; 1300 1301 prow = *bjtmp++ ; 1302 while (prow < row) { 1303 pc1 = rtmp11 + prow; 1304 if (*pc1 != 0.0){ 1305 pv = ba + bd[prow]; 1306 pj = nbj + bd[prow]; 1307 mul1 = *pc1 * *pv++; 1308 *pc1 = mul1; 1309 nz_tmp = bi[prow+1] - bd[prow] - 1; 1310 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1311 for (j=0; j<nz_tmp; j++) { 1312 tmp = pv[j]; 1313 idx = pj[j]; 1314 rtmp11[idx] -= mul1 * tmp; 1315 } 1316 } 1317 prow = *bjtmp++ ; 1318 } 1319 pj = bj + bi[row]; 1320 pc1 = ba + bi[row]; 1321 1322 sctx.pv = rtmp11[row]; 1323 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1324 rs = 0.0; 1325 for (j=0; j<nz; j++) { 1326 idx = pj[j]; 1327 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1328 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1329 } 1330 sctx.rs = rs; 1331 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1332 if (newshift == 1) goto endofwhile; 1333 break; 1334 1335 case 2: 1336 for (j=0; j<nz; j++) { 1337 idx = bjtmp[j]; 1338 rtmp11[idx] = 0.0; 1339 rtmp22[idx] = 0.0; 1340 } 1341 1342 /* load in initial (unfactored row) */ 1343 idx = r[row]; 1344 nz_tmp = ai[idx+1] - ai[idx]; 1345 ajtmp = aj + ai[idx]; 1346 v1 = aa + ai[idx]; 1347 v2 = aa + ai[idx+1]; 1348 for (j=0; j<nz_tmp; j++) { 1349 idx = ics[ajtmp[j]]; 1350 rtmp11[idx] = v1[j]; 1351 rtmp22[idx] = v2[j]; 1352 } 1353 rtmp11[ics[r[row]]] += sctx.shift_amount; 1354 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1355 1356 prow = *bjtmp++ ; 1357 while (prow < row) { 1358 pc1 = rtmp11 + prow; 1359 pc2 = rtmp22 + prow; 1360 if (*pc1 != 0.0 || *pc2 != 0.0){ 1361 pv = ba + bd[prow]; 1362 pj = nbj + bd[prow]; 1363 mul1 = *pc1 * *pv; 1364 mul2 = *pc2 * *pv; 1365 ++pv; 1366 *pc1 = mul1; 1367 *pc2 = mul2; 1368 1369 nz_tmp = bi[prow+1] - bd[prow] - 1; 1370 for (j=0; j<nz_tmp; j++) { 1371 tmp = pv[j]; 1372 idx = pj[j]; 1373 rtmp11[idx] -= mul1 * tmp; 1374 rtmp22[idx] -= mul2 * tmp; 1375 } 1376 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1377 } 1378 prow = *bjtmp++ ; 1379 } 1380 1381 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1382 pc1 = rtmp11 + prow; 1383 pc2 = rtmp22 + prow; 1384 1385 sctx.pv = *pc1; 1386 pj = bj + bi[prow]; 1387 rs = 0.0; 1388 for (j=0; j<nz; j++){ 1389 idx = pj[j]; 1390 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1391 } 1392 sctx.rs = rs; 1393 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1394 if (newshift == 1) goto endofwhile; 1395 1396 if (*pc2 != 0.0){ 1397 pj = nbj + bd[prow]; 1398 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1399 *pc2 = mul2; 1400 nz_tmp = bi[prow+1] - bd[prow] - 1; 1401 for (j=0; j<nz_tmp; j++) { 1402 idx = pj[j] ; 1403 tmp = rtmp11[idx]; 1404 rtmp22[idx] -= mul2 * tmp; 1405 } 1406 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1407 } 1408 1409 pj = bj + bi[row]; 1410 pc1 = ba + bi[row]; 1411 pc2 = ba + bi[row+1]; 1412 1413 sctx.pv = rtmp22[row+1]; 1414 rs = 0.0; 1415 rtmp11[row] = 1.0/rtmp11[row]; 1416 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1417 /* copy row entries from dense representation to sparse */ 1418 for (j=0; j<nz; j++) { 1419 idx = pj[j]; 1420 pc1[j] = rtmp11[idx]; 1421 pc2[j] = rtmp22[idx]; 1422 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1423 } 1424 sctx.rs = rs; 1425 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1426 if (newshift == 1) goto endofwhile; 1427 break; 1428 1429 case 3: 1430 for (j=0; j<nz; j++) { 1431 idx = bjtmp[j]; 1432 rtmp11[idx] = 0.0; 1433 rtmp22[idx] = 0.0; 1434 rtmp33[idx] = 0.0; 1435 } 1436 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1437 idx = r[row]; 1438 nz_tmp = ai[idx+1] - ai[idx]; 1439 ajtmp = aj + ai[idx]; 1440 v1 = aa + ai[idx]; 1441 v2 = aa + ai[idx+1]; 1442 v3 = aa + ai[idx+2]; 1443 for (j=0; j<nz_tmp; j++) { 1444 idx = ics[ajtmp[j]]; 1445 rtmp11[idx] = v1[j]; 1446 rtmp22[idx] = v2[j]; 1447 rtmp33[idx] = v3[j]; 1448 } 1449 rtmp11[ics[r[row]]] += sctx.shift_amount; 1450 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1451 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1452 1453 /* loop over all pivot row blocks above this row block */ 1454 prow = *bjtmp++ ; 1455 while (prow < row) { 1456 pc1 = rtmp11 + prow; 1457 pc2 = rtmp22 + prow; 1458 pc3 = rtmp33 + prow; 1459 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1460 pv = ba + bd[prow]; 1461 pj = nbj + bd[prow]; 1462 mul1 = *pc1 * *pv; 1463 mul2 = *pc2 * *pv; 1464 mul3 = *pc3 * *pv; 1465 ++pv; 1466 *pc1 = mul1; 1467 *pc2 = mul2; 1468 *pc3 = mul3; 1469 1470 nz_tmp = bi[prow+1] - bd[prow] - 1; 1471 /* update this row based on pivot row */ 1472 for (j=0; j<nz_tmp; j++) { 1473 tmp = pv[j]; 1474 idx = pj[j]; 1475 rtmp11[idx] -= mul1 * tmp; 1476 rtmp22[idx] -= mul2 * tmp; 1477 rtmp33[idx] -= mul3 * tmp; 1478 } 1479 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1480 } 1481 prow = *bjtmp++ ; 1482 } 1483 1484 /* Now take care of diagonal 3x3 block in this set of rows */ 1485 /* note: prow = row here */ 1486 pc1 = rtmp11 + prow; 1487 pc2 = rtmp22 + prow; 1488 pc3 = rtmp33 + prow; 1489 1490 sctx.pv = *pc1; 1491 pj = bj + bi[prow]; 1492 rs = 0.0; 1493 for (j=0; j<nz; j++){ 1494 idx = pj[j]; 1495 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 1496 } 1497 sctx.rs = rs; 1498 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1499 if (newshift == 1) goto endofwhile; 1500 1501 if (*pc2 != 0.0 || *pc3 != 0.0){ 1502 mul2 = (*pc2)/(*pc1); 1503 mul3 = (*pc3)/(*pc1); 1504 *pc2 = mul2; 1505 *pc3 = mul3; 1506 nz_tmp = bi[prow+1] - bd[prow] - 1; 1507 pj = nbj + bd[prow]; 1508 for (j=0; j<nz_tmp; j++) { 1509 idx = pj[j] ; 1510 tmp = rtmp11[idx]; 1511 rtmp22[idx] -= mul2 * tmp; 1512 rtmp33[idx] -= mul3 * tmp; 1513 } 1514 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1515 } 1516 ++prow; 1517 1518 pc2 = rtmp22 + prow; 1519 pc3 = rtmp33 + prow; 1520 sctx.pv = *pc2; 1521 pj = bj + bi[prow]; 1522 rs = 0.0; 1523 for (j=0; j<nz; j++){ 1524 idx = pj[j]; 1525 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 1526 } 1527 sctx.rs = rs; 1528 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1529 if (newshift == 1) goto endofwhile; 1530 1531 if (*pc3 != 0.0){ 1532 mul3 = (*pc3)/(*pc2); 1533 *pc3 = mul3; 1534 pj = nbj + bd[prow]; 1535 nz_tmp = bi[prow+1] - bd[prow] - 1; 1536 for (j=0; j<nz_tmp; j++) { 1537 idx = pj[j] ; 1538 tmp = rtmp22[idx]; 1539 rtmp33[idx] -= mul3 * tmp; 1540 } 1541 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1542 } 1543 1544 pj = bj + bi[row]; 1545 pc1 = ba + bi[row]; 1546 pc2 = ba + bi[row+1]; 1547 pc3 = ba + bi[row+2]; 1548 1549 sctx.pv = rtmp33[row+2]; 1550 rs = 0.0; 1551 rtmp11[row] = 1.0/rtmp11[row]; 1552 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1553 rtmp33[row+2] = 1.0/rtmp33[row+2]; 1554 /* copy row entries from dense representation to sparse */ 1555 for (j=0; j<nz; j++) { 1556 idx = pj[j]; 1557 pc1[j] = rtmp11[idx]; 1558 pc2[j] = rtmp22[idx]; 1559 pc3[j] = rtmp33[idx]; 1560 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1561 } 1562 1563 sctx.rs = rs; 1564 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1565 if (newshift == 1) goto endofwhile; 1566 break; 1567 1568 default: 1569 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1570 } 1571 row += nodesz; /* Update the row */ 1572 } 1573 endofwhile:; 1574 } while (sctx.lushift); 1575 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 1576 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1577 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1578 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1579 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1580 C->factor = FACTOR_LU; 1581 C->assembled = PETSC_TRUE; 1582 if (sctx.nshift) { 1583 if (info->shiftnz) { 1584 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1585 } else if (info->shiftpd) { 1586 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1587 } 1588 } 1589 ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr); 1590 PetscFunctionReturn(0); 1591 } 1592 1593 /* 1594 Makes a longer coloring[] array and calls the usual code with that 1595 */ 1596 #undef __FUNCT__ 1597 #define __FUNCT__ "MatColoringPatch_Inode" 1598 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1599 { 1600 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1601 PetscErrorCode ierr; 1602 PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1603 PetscInt *colorused,i; 1604 ISColoringValue *newcolor; 1605 1606 PetscFunctionBegin; 1607 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1608 /* loop over inodes, marking a color for each column*/ 1609 row = 0; 1610 for (i=0; i<m; i++){ 1611 for (j=0; j<ns[i]; j++) { 1612 newcolor[row++] = coloring[i] + j*ncolors; 1613 } 1614 } 1615 1616 /* eliminate unneeded colors */ 1617 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1618 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1619 for (i=0; i<n; i++) { 1620 colorused[newcolor[i]] = 1; 1621 } 1622 1623 for (i=1; i<5*ncolors; i++) { 1624 colorused[i] += colorused[i-1]; 1625 } 1626 ncolors = colorused[5*ncolors-1]; 1627 for (i=0; i<n; i++) { 1628 newcolor[i] = colorused[newcolor[i]]-1; 1629 } 1630 ierr = PetscFree(colorused);CHKERRQ(ierr); 1631 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1632 ierr = PetscFree(coloring);CHKERRQ(ierr); 1633 PetscFunctionReturn(0); 1634 } 1635 1636 #include "src/inline/ilu.h" 1637 1638 #undef __FUNCT__ 1639 #define __FUNCT__ "MatRelax_Inode" 1640 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1641 { 1642 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1643 PetscScalar *x,*xs,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1644 MatScalar *ibdiag,*bdiag; 1645 PetscScalar *b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1646 const PetscScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1647 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1648 PetscErrorCode ierr; 1649 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1650 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1651 1652 PetscFunctionBegin; 1653 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1654 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1655 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode"); 1656 1657 if (!a->inode.ibdiagvalid) { 1658 if (!a->inode.ibdiag) { 1659 /* calculate space needed for diagonal blocks */ 1660 for (i=0; i<m; i++) { 1661 cnt += sizes[i]*sizes[i]; 1662 } 1663 a->inode.bdiagsize = cnt; 1664 ierr = PetscMalloc2(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag);CHKERRQ(ierr); 1665 } 1666 1667 /* copy over the diagonal blocks and invert them */ 1668 ibdiag = a->inode.ibdiag; 1669 bdiag = a->inode.bdiag; 1670 cnt = 0; 1671 for (i=0, row = 0; i<m; i++) { 1672 for (j=0; j<sizes[i]; j++) { 1673 for (k=0; k<sizes[i]; k++) { 1674 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1675 } 1676 } 1677 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1678 1679 switch(sizes[i]) { 1680 case 1: 1681 /* Create matrix data structure */ 1682 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1683 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1684 break; 1685 case 2: 1686 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1687 break; 1688 case 3: 1689 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1690 break; 1691 case 4: 1692 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1693 break; 1694 case 5: 1695 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1696 break; 1697 default: 1698 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1699 } 1700 cnt += sizes[i]*sizes[i]; 1701 row += sizes[i]; 1702 } 1703 a->inode.ibdiagvalid = PETSC_TRUE; 1704 } 1705 ibdiag = a->inode.ibdiag; 1706 bdiag = a->inode.bdiag; 1707 1708 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1709 if (xx != bb) { 1710 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1711 } else { 1712 b = x; 1713 } 1714 1715 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1716 xs = x; 1717 if (flag & SOR_ZERO_INITIAL_GUESS) { 1718 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1719 1720 for (i=0, row=0; i<m; i++) { 1721 sz = diag[row] - ii[row]; 1722 v1 = a->a + ii[row]; 1723 idx = a->j + ii[row]; 1724 1725 /* see comments for MatMult_Inode() for how this is coded */ 1726 switch (sizes[i]){ 1727 case 1: 1728 1729 sum1 = b[row]; 1730 for(n = 0; n<sz-1; n+=2) { 1731 i1 = idx[0]; 1732 i2 = idx[1]; 1733 idx += 2; 1734 tmp0 = x[i1]; 1735 tmp1 = x[i2]; 1736 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1737 } 1738 1739 if (n == sz-1){ 1740 tmp0 = x[*idx]; 1741 sum1 -= *v1 * tmp0; 1742 } 1743 x[row++] = sum1*(*ibdiag++); 1744 break; 1745 case 2: 1746 v2 = a->a + ii[row+1]; 1747 sum1 = b[row]; 1748 sum2 = b[row+1]; 1749 for(n = 0; n<sz-1; n+=2) { 1750 i1 = idx[0]; 1751 i2 = idx[1]; 1752 idx += 2; 1753 tmp0 = x[i1]; 1754 tmp1 = x[i2]; 1755 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1756 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1757 } 1758 1759 if (n == sz-1){ 1760 tmp0 = x[*idx]; 1761 sum1 -= v1[0] * tmp0; 1762 sum2 -= v2[0] * tmp0; 1763 } 1764 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1765 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1766 ibdiag += 4; 1767 break; 1768 case 3: 1769 v2 = a->a + ii[row+1]; 1770 v3 = a->a + ii[row+2]; 1771 sum1 = b[row]; 1772 sum2 = b[row+1]; 1773 sum3 = b[row+2]; 1774 for(n = 0; n<sz-1; n+=2) { 1775 i1 = idx[0]; 1776 i2 = idx[1]; 1777 idx += 2; 1778 tmp0 = x[i1]; 1779 tmp1 = x[i2]; 1780 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1781 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1782 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1783 } 1784 1785 if (n == sz-1){ 1786 tmp0 = x[*idx]; 1787 sum1 -= v1[0] * tmp0; 1788 sum2 -= v2[0] * tmp0; 1789 sum3 -= v3[0] * tmp0; 1790 } 1791 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1792 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1793 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1794 ibdiag += 9; 1795 break; 1796 case 4: 1797 v2 = a->a + ii[row+1]; 1798 v3 = a->a + ii[row+2]; 1799 v4 = a->a + ii[row+3]; 1800 sum1 = b[row]; 1801 sum2 = b[row+1]; 1802 sum3 = b[row+2]; 1803 sum4 = b[row+3]; 1804 for(n = 0; n<sz-1; n+=2) { 1805 i1 = idx[0]; 1806 i2 = idx[1]; 1807 idx += 2; 1808 tmp0 = x[i1]; 1809 tmp1 = x[i2]; 1810 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1811 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1812 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1813 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1814 } 1815 1816 if (n == sz-1){ 1817 tmp0 = x[*idx]; 1818 sum1 -= v1[0] * tmp0; 1819 sum2 -= v2[0] * tmp0; 1820 sum3 -= v3[0] * tmp0; 1821 sum4 -= v4[0] * tmp0; 1822 } 1823 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1824 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1825 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1826 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1827 ibdiag += 16; 1828 break; 1829 case 5: 1830 v2 = a->a + ii[row+1]; 1831 v3 = a->a + ii[row+2]; 1832 v4 = a->a + ii[row+3]; 1833 v5 = a->a + ii[row+4]; 1834 sum1 = b[row]; 1835 sum2 = b[row+1]; 1836 sum3 = b[row+2]; 1837 sum4 = b[row+3]; 1838 sum5 = b[row+4]; 1839 for(n = 0; n<sz-1; n+=2) { 1840 i1 = idx[0]; 1841 i2 = idx[1]; 1842 idx += 2; 1843 tmp0 = x[i1]; 1844 tmp1 = x[i2]; 1845 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1846 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1847 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1848 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1849 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1850 } 1851 1852 if (n == sz-1){ 1853 tmp0 = x[*idx]; 1854 sum1 -= v1[0] * tmp0; 1855 sum2 -= v2[0] * tmp0; 1856 sum3 -= v3[0] * tmp0; 1857 sum4 -= v4[0] * tmp0; 1858 sum5 -= v5[0] * tmp0; 1859 } 1860 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1861 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1862 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1863 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1864 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1865 ibdiag += 25; 1866 break; 1867 default: 1868 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1869 } 1870 } 1871 1872 xb = x; 1873 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1874 } else xb = b; 1875 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1876 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1877 cnt = 0; 1878 for (i=0, row=0; i<m; i++) { 1879 1880 switch (sizes[i]){ 1881 case 1: 1882 x[row++] *= bdiag[cnt++]; 1883 break; 1884 case 2: 1885 x1 = x[row]; x2 = x[row+1]; 1886 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1887 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1888 x[row++] = tmp1; 1889 x[row++] = tmp2; 1890 cnt += 4; 1891 break; 1892 case 3: 1893 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1894 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1895 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1896 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1897 x[row++] = tmp1; 1898 x[row++] = tmp2; 1899 x[row++] = tmp3; 1900 cnt += 9; 1901 break; 1902 case 4: 1903 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1904 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1905 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1906 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1907 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1908 x[row++] = tmp1; 1909 x[row++] = tmp2; 1910 x[row++] = tmp3; 1911 x[row++] = tmp4; 1912 cnt += 16; 1913 break; 1914 case 5: 1915 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1916 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1917 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1918 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1919 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1920 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1921 x[row++] = tmp1; 1922 x[row++] = tmp2; 1923 x[row++] = tmp3; 1924 x[row++] = tmp4; 1925 x[row++] = tmp5; 1926 cnt += 25; 1927 break; 1928 default: 1929 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1930 } 1931 } 1932 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1933 } 1934 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1935 1936 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1937 for (i=m-1, row=A->rmap.n-1; i>=0; i--) { 1938 ibdiag -= sizes[i]*sizes[i]; 1939 sz = ii[row+1] - diag[row] - 1; 1940 v1 = a->a + diag[row] + 1; 1941 idx = a->j + diag[row] + 1; 1942 1943 /* see comments for MatMult_Inode() for how this is coded */ 1944 switch (sizes[i]){ 1945 case 1: 1946 1947 sum1 = xb[row]; 1948 for(n = 0; n<sz-1; n+=2) { 1949 i1 = idx[0]; 1950 i2 = idx[1]; 1951 idx += 2; 1952 tmp0 = x[i1]; 1953 tmp1 = x[i2]; 1954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1955 } 1956 1957 if (n == sz-1){ 1958 tmp0 = x[*idx]; 1959 sum1 -= *v1*tmp0; 1960 } 1961 x[row--] = sum1*(*ibdiag); 1962 break; 1963 1964 case 2: 1965 1966 sum1 = xb[row]; 1967 sum2 = xb[row-1]; 1968 /* note that sum1 is associated with the second of the two rows */ 1969 v2 = a->a + diag[row-1] + 2; 1970 for(n = 0; n<sz-1; n+=2) { 1971 i1 = idx[0]; 1972 i2 = idx[1]; 1973 idx += 2; 1974 tmp0 = x[i1]; 1975 tmp1 = x[i2]; 1976 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1977 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1978 } 1979 1980 if (n == sz-1){ 1981 tmp0 = x[*idx]; 1982 sum1 -= *v1*tmp0; 1983 sum2 -= *v2*tmp0; 1984 } 1985 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1986 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1987 break; 1988 case 3: 1989 1990 sum1 = xb[row]; 1991 sum2 = xb[row-1]; 1992 sum3 = xb[row-2]; 1993 v2 = a->a + diag[row-1] + 2; 1994 v3 = a->a + diag[row-2] + 3; 1995 for(n = 0; n<sz-1; n+=2) { 1996 i1 = idx[0]; 1997 i2 = idx[1]; 1998 idx += 2; 1999 tmp0 = x[i1]; 2000 tmp1 = x[i2]; 2001 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2002 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2003 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2004 } 2005 2006 if (n == sz-1){ 2007 tmp0 = x[*idx]; 2008 sum1 -= *v1*tmp0; 2009 sum2 -= *v2*tmp0; 2010 sum3 -= *v3*tmp0; 2011 } 2012 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2013 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2014 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2015 break; 2016 case 4: 2017 2018 sum1 = xb[row]; 2019 sum2 = xb[row-1]; 2020 sum3 = xb[row-2]; 2021 sum4 = xb[row-3]; 2022 v2 = a->a + diag[row-1] + 2; 2023 v3 = a->a + diag[row-2] + 3; 2024 v4 = a->a + diag[row-3] + 4; 2025 for(n = 0; n<sz-1; n+=2) { 2026 i1 = idx[0]; 2027 i2 = idx[1]; 2028 idx += 2; 2029 tmp0 = x[i1]; 2030 tmp1 = x[i2]; 2031 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2032 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2033 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2034 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2035 } 2036 2037 if (n == sz-1){ 2038 tmp0 = x[*idx]; 2039 sum1 -= *v1*tmp0; 2040 sum2 -= *v2*tmp0; 2041 sum3 -= *v3*tmp0; 2042 sum4 -= *v4*tmp0; 2043 } 2044 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2045 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2046 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2047 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2048 break; 2049 case 5: 2050 2051 sum1 = xb[row]; 2052 sum2 = xb[row-1]; 2053 sum3 = xb[row-2]; 2054 sum4 = xb[row-3]; 2055 sum5 = xb[row-4]; 2056 v2 = a->a + diag[row-1] + 2; 2057 v3 = a->a + diag[row-2] + 3; 2058 v4 = a->a + diag[row-3] + 4; 2059 v5 = a->a + diag[row-4] + 5; 2060 for(n = 0; n<sz-1; n+=2) { 2061 i1 = idx[0]; 2062 i2 = idx[1]; 2063 idx += 2; 2064 tmp0 = x[i1]; 2065 tmp1 = x[i2]; 2066 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2067 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2068 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2069 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2070 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2071 } 2072 2073 if (n == sz-1){ 2074 tmp0 = x[*idx]; 2075 sum1 -= *v1*tmp0; 2076 sum2 -= *v2*tmp0; 2077 sum3 -= *v3*tmp0; 2078 sum4 -= *v4*tmp0; 2079 sum5 -= *v5*tmp0; 2080 } 2081 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2082 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2083 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2084 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2085 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2086 break; 2087 default: 2088 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2089 } 2090 } 2091 2092 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2093 } 2094 its--; 2095 } 2096 if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode"); 2097 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2098 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2099 PetscFunctionReturn(0); 2100 } 2101 2102 2103 /* 2104 samestructure indicates that the matrix has not changed its nonzero structure so we 2105 do not need to recompute the inodes 2106 */ 2107 #undef __FUNCT__ 2108 #define __FUNCT__ "Mat_CheckInode" 2109 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2110 { 2111 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2112 PetscErrorCode ierr; 2113 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2114 PetscTruth flag; 2115 2116 PetscFunctionBegin; 2117 if (!a->inode.use) PetscFunctionReturn(0); 2118 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2119 2120 2121 m = A->rmap.n; 2122 if (a->inode.size) {ns = a->inode.size;} 2123 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2124 2125 i = 0; 2126 node_count = 0; 2127 idx = a->j; 2128 ii = a->i; 2129 while (i < m){ /* For each row */ 2130 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2131 /* Limits the number of elements in a node to 'a->inode.limit' */ 2132 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2133 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2134 if (nzy != nzx) break; 2135 idy += nzx; /* Same nonzero pattern */ 2136 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2137 if (!flag) break; 2138 } 2139 ns[node_count++] = blk_size; 2140 idx += blk_size*nzx; 2141 i = j; 2142 } 2143 /* If not enough inodes found,, do not use inode version of the routines */ 2144 if (!a->inode.size && m && node_count > .9*m) { 2145 ierr = PetscFree(ns);CHKERRQ(ierr); 2146 a->inode.node_count = 0; 2147 a->inode.size = PETSC_NULL; 2148 a->inode.use = PETSC_FALSE; 2149 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2150 } else { 2151 A->ops->mult = MatMult_Inode; 2152 A->ops->relax = MatRelax_Inode; 2153 A->ops->multadd = MatMultAdd_Inode; 2154 A->ops->solve = MatSolve_Inode; 2155 A->ops->lufactornumeric = MatLUFactorNumeric_Inode; 2156 A->ops->getrowij = MatGetRowIJ_Inode; 2157 A->ops->restorerowij = MatRestoreRowIJ_Inode; 2158 A->ops->getcolumnij = MatGetColumnIJ_Inode; 2159 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 2160 A->ops->coloringpatch = MatColoringPatch_Inode; 2161 a->inode.node_count = node_count; 2162 a->inode.size = ns; 2163 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2164 } 2165 PetscFunctionReturn(0); 2166 } 2167 2168 /* 2169 This is really ugly. if inodes are used this replaces the 2170 permutations with ones that correspond to rows/cols of the matrix 2171 rather then inode blocks 2172 */ 2173 #undef __FUNCT__ 2174 #define __FUNCT__ "MatInodeAdjustForInodes" 2175 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2176 { 2177 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2178 2179 PetscFunctionBegin; 2180 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2181 if (f) { 2182 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2183 } 2184 PetscFunctionReturn(0); 2185 } 2186 2187 EXTERN_C_BEGIN 2188 #undef __FUNCT__ 2189 #define __FUNCT__ "MatAdjustForInodes_Inode" 2190 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 2191 { 2192 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2193 PetscErrorCode ierr; 2194 PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count; 2195 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2196 PetscInt nslim_col,*ns_col; 2197 IS ris = *rperm,cis = *cperm; 2198 2199 PetscFunctionBegin; 2200 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2201 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2202 2203 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2204 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2205 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2206 permc = permr + m; 2207 2208 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2209 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2210 2211 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2212 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2213 2214 /* Construct the permutations for rows*/ 2215 for (i=0,row = 0; i<nslim_row; ++i){ 2216 indx = ridx[i]; 2217 start_val = tns[indx]; 2218 end_val = tns[indx + 1]; 2219 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2220 } 2221 2222 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2223 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2224 2225 /* Construct permutations for columns */ 2226 for (i=0,col=0; i<nslim_col; ++i){ 2227 indx = cidx[i]; 2228 start_val = tns[indx]; 2229 end_val = tns[indx + 1]; 2230 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2231 } 2232 2233 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2234 ISSetPermutation(*rperm); 2235 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2236 ISSetPermutation(*cperm); 2237 2238 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2239 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2240 2241 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2242 ierr = PetscFree(permr);CHKERRQ(ierr); 2243 ierr = ISDestroy(cis);CHKERRQ(ierr); 2244 ierr = ISDestroy(ris);CHKERRQ(ierr); 2245 ierr = PetscFree(tns);CHKERRQ(ierr); 2246 PetscFunctionReturn(0); 2247 } 2248 EXTERN_C_END 2249 2250 #undef __FUNCT__ 2251 #define __FUNCT__ "MatInodeGetInodeSizes" 2252 /*@C 2253 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2254 2255 Collective on Mat 2256 2257 Input Parameter: 2258 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2259 2260 Output Parameter: 2261 + node_count - no of inodes present in the matrix. 2262 . sizes - an array of size node_count,with sizes of each inode. 2263 - limit - the max size used to generate the inodes. 2264 2265 Level: advanced 2266 2267 Notes: This routine returns some internal storage information 2268 of the matrix, it is intended to be used by advanced users. 2269 It should be called after the matrix is assembled. 2270 The contents of the sizes[] array should not be changed. 2271 PETSC_NULL may be passed for information not requested. 2272 2273 .keywords: matrix, seqaij, get, inode 2274 2275 .seealso: MatGetInfo() 2276 @*/ 2277 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2278 { 2279 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2280 2281 PetscFunctionBegin; 2282 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2283 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2284 if (f) { 2285 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2286 } 2287 PetscFunctionReturn(0); 2288 } 2289 2290 EXTERN_C_BEGIN 2291 #undef __FUNCT__ 2292 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 2293 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2294 { 2295 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2296 2297 PetscFunctionBegin; 2298 if (node_count) *node_count = a->inode.node_count; 2299 if (sizes) *sizes = a->inode.size; 2300 if (limit) *limit = a->inode.limit; 2301 PetscFunctionReturn(0); 2302 } 2303 EXTERN_C_END 2304