1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap.n; 19 m = A->rmap.n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap.n; 72 n = A->cmap.n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap.n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap.n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_Inode" 397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y; 402 PetscErrorCode ierr; 403 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 404 405 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 406 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 407 #endif 408 409 PetscFunctionBegin; 410 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 411 node_max = a->inode.node_count; 412 ns = a->inode.size; /* Node Size array */ 413 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 414 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 415 idx = a->j; 416 v1 = a->a; 417 ii = a->i; 418 419 for (i = 0,row = 0; i< node_max; ++i){ 420 nsz = ns[i]; 421 n = ii[1] - ii[0]; 422 nonzerorow += (n>0)*nsz; 423 ii += nsz; 424 sz = n; /* No of non zeros in this row */ 425 /* Switch on the size of Node */ 426 switch (nsz){ /* Each loop in 'case' is unrolled */ 427 case 1 : 428 sum1 = 0; 429 430 for(n = 0; n< sz-1; n+=2) { 431 i1 = idx[0]; /* The instructions are ordered to */ 432 i2 = idx[1]; /* make the compiler's job easy */ 433 idx += 2; 434 tmp0 = x[i1]; 435 tmp1 = x[i2]; 436 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 437 } 438 439 if (n == sz-1){ /* Take care of the last nonzero */ 440 tmp0 = x[*idx++]; 441 sum1 += *v1++ * tmp0; 442 } 443 y[row++]=sum1; 444 break; 445 case 2: 446 sum1 = 0; 447 sum2 = 0; 448 v2 = v1 + n; 449 450 for (n = 0; n< sz-1; n+=2) { 451 i1 = idx[0]; 452 i2 = idx[1]; 453 idx += 2; 454 tmp0 = x[i1]; 455 tmp1 = x[i2]; 456 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 457 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 458 } 459 if (n == sz-1){ 460 tmp0 = x[*idx++]; 461 sum1 += *v1++ * tmp0; 462 sum2 += *v2++ * tmp0; 463 } 464 y[row++]=sum1; 465 y[row++]=sum2; 466 v1 =v2; /* Since the next block to be processed starts there*/ 467 idx +=sz; 468 break; 469 case 3: 470 sum1 = 0; 471 sum2 = 0; 472 sum3 = 0; 473 v2 = v1 + n; 474 v3 = v2 + n; 475 476 for (n = 0; n< sz-1; n+=2) { 477 i1 = idx[0]; 478 i2 = idx[1]; 479 idx += 2; 480 tmp0 = x[i1]; 481 tmp1 = x[i2]; 482 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 483 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 484 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 485 } 486 if (n == sz-1){ 487 tmp0 = x[*idx++]; 488 sum1 += *v1++ * tmp0; 489 sum2 += *v2++ * tmp0; 490 sum3 += *v3++ * tmp0; 491 } 492 y[row++]=sum1; 493 y[row++]=sum2; 494 y[row++]=sum3; 495 v1 =v3; /* Since the next block to be processed starts there*/ 496 idx +=2*sz; 497 break; 498 case 4: 499 sum1 = 0; 500 sum2 = 0; 501 sum3 = 0; 502 sum4 = 0; 503 v2 = v1 + n; 504 v3 = v2 + n; 505 v4 = v3 + n; 506 507 for (n = 0; n< sz-1; n+=2) { 508 i1 = idx[0]; 509 i2 = idx[1]; 510 idx += 2; 511 tmp0 = x[i1]; 512 tmp1 = x[i2]; 513 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 514 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 515 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 516 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 517 } 518 if (n == sz-1){ 519 tmp0 = x[*idx++]; 520 sum1 += *v1++ * tmp0; 521 sum2 += *v2++ * tmp0; 522 sum3 += *v3++ * tmp0; 523 sum4 += *v4++ * tmp0; 524 } 525 y[row++]=sum1; 526 y[row++]=sum2; 527 y[row++]=sum3; 528 y[row++]=sum4; 529 v1 =v4; /* Since the next block to be processed starts there*/ 530 idx +=3*sz; 531 break; 532 case 5: 533 sum1 = 0; 534 sum2 = 0; 535 sum3 = 0; 536 sum4 = 0; 537 sum5 = 0; 538 v2 = v1 + n; 539 v3 = v2 + n; 540 v4 = v3 + n; 541 v5 = v4 + n; 542 543 for (n = 0; n<sz-1; n+=2) { 544 i1 = idx[0]; 545 i2 = idx[1]; 546 idx += 2; 547 tmp0 = x[i1]; 548 tmp1 = x[i2]; 549 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 550 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 551 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 552 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 553 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 554 } 555 if (n == sz-1){ 556 tmp0 = x[*idx++]; 557 sum1 += *v1++ * tmp0; 558 sum2 += *v2++ * tmp0; 559 sum3 += *v3++ * tmp0; 560 sum4 += *v4++ * tmp0; 561 sum5 += *v5++ * tmp0; 562 } 563 y[row++]=sum1; 564 y[row++]=sum2; 565 y[row++]=sum3; 566 y[row++]=sum4; 567 y[row++]=sum5; 568 v1 =v5; /* Since the next block to be processed starts there */ 569 idx +=4*sz; 570 break; 571 default : 572 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 573 } 574 } 575 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 576 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 577 ierr = PetscLogFlops(2*a->nz - nonzerorow);CHKERRQ(ierr); 578 PetscFunctionReturn(0); 579 } 580 /* ----------------------------------------------------------- */ 581 /* Almost same code as the MatMult_Inode() */ 582 #undef __FUNCT__ 583 #define __FUNCT__ "MatMultAdd_Inode" 584 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 585 { 586 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 587 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 588 PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt; 589 PetscErrorCode ierr; 590 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 591 592 PetscFunctionBegin; 593 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 594 node_max = a->inode.node_count; 595 ns = a->inode.size; /* Node Size array */ 596 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 597 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 598 if (zz != yy) { 599 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 600 } else { 601 z = y; 602 } 603 zt = z; 604 605 idx = a->j; 606 v1 = a->a; 607 ii = a->i; 608 609 for (i = 0,row = 0; i< node_max; ++i){ 610 nsz = ns[i]; 611 n = ii[1] - ii[0]; 612 ii += nsz; 613 sz = n; /* No of non zeros in this row */ 614 /* Switch on the size of Node */ 615 switch (nsz){ /* Each loop in 'case' is unrolled */ 616 case 1 : 617 sum1 = *zt++; 618 619 for(n = 0; n< sz-1; n+=2) { 620 i1 = idx[0]; /* The instructions are ordered to */ 621 i2 = idx[1]; /* make the compiler's job easy */ 622 idx += 2; 623 tmp0 = x[i1]; 624 tmp1 = x[i2]; 625 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 626 } 627 628 if(n == sz-1){ /* Take care of the last nonzero */ 629 tmp0 = x[*idx++]; 630 sum1 += *v1++ * tmp0; 631 } 632 y[row++]=sum1; 633 break; 634 case 2: 635 sum1 = *zt++; 636 sum2 = *zt++; 637 v2 = v1 + n; 638 639 for(n = 0; n< sz-1; n+=2) { 640 i1 = idx[0]; 641 i2 = idx[1]; 642 idx += 2; 643 tmp0 = x[i1]; 644 tmp1 = x[i2]; 645 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 646 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 647 } 648 if(n == sz-1){ 649 tmp0 = x[*idx++]; 650 sum1 += *v1++ * tmp0; 651 sum2 += *v2++ * tmp0; 652 } 653 y[row++]=sum1; 654 y[row++]=sum2; 655 v1 =v2; /* Since the next block to be processed starts there*/ 656 idx +=sz; 657 break; 658 case 3: 659 sum1 = *zt++; 660 sum2 = *zt++; 661 sum3 = *zt++; 662 v2 = v1 + n; 663 v3 = v2 + n; 664 665 for (n = 0; n< sz-1; n+=2) { 666 i1 = idx[0]; 667 i2 = idx[1]; 668 idx += 2; 669 tmp0 = x[i1]; 670 tmp1 = x[i2]; 671 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 672 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 673 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 674 } 675 if (n == sz-1){ 676 tmp0 = x[*idx++]; 677 sum1 += *v1++ * tmp0; 678 sum2 += *v2++ * tmp0; 679 sum3 += *v3++ * tmp0; 680 } 681 y[row++]=sum1; 682 y[row++]=sum2; 683 y[row++]=sum3; 684 v1 =v3; /* Since the next block to be processed starts there*/ 685 idx +=2*sz; 686 break; 687 case 4: 688 sum1 = *zt++; 689 sum2 = *zt++; 690 sum3 = *zt++; 691 sum4 = *zt++; 692 v2 = v1 + n; 693 v3 = v2 + n; 694 v4 = v3 + n; 695 696 for (n = 0; n< sz-1; n+=2) { 697 i1 = idx[0]; 698 i2 = idx[1]; 699 idx += 2; 700 tmp0 = x[i1]; 701 tmp1 = x[i2]; 702 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 703 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 704 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 705 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 706 } 707 if (n == sz-1){ 708 tmp0 = x[*idx++]; 709 sum1 += *v1++ * tmp0; 710 sum2 += *v2++ * tmp0; 711 sum3 += *v3++ * tmp0; 712 sum4 += *v4++ * tmp0; 713 } 714 y[row++]=sum1; 715 y[row++]=sum2; 716 y[row++]=sum3; 717 y[row++]=sum4; 718 v1 =v4; /* Since the next block to be processed starts there*/ 719 idx +=3*sz; 720 break; 721 case 5: 722 sum1 = *zt++; 723 sum2 = *zt++; 724 sum3 = *zt++; 725 sum4 = *zt++; 726 sum5 = *zt++; 727 v2 = v1 + n; 728 v3 = v2 + n; 729 v4 = v3 + n; 730 v5 = v4 + n; 731 732 for (n = 0; n<sz-1; n+=2) { 733 i1 = idx[0]; 734 i2 = idx[1]; 735 idx += 2; 736 tmp0 = x[i1]; 737 tmp1 = x[i2]; 738 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 739 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 740 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 741 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 742 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 743 } 744 if(n == sz-1){ 745 tmp0 = x[*idx++]; 746 sum1 += *v1++ * tmp0; 747 sum2 += *v2++ * tmp0; 748 sum3 += *v3++ * tmp0; 749 sum4 += *v4++ * tmp0; 750 sum5 += *v5++ * tmp0; 751 } 752 y[row++]=sum1; 753 y[row++]=sum2; 754 y[row++]=sum3; 755 y[row++]=sum4; 756 y[row++]=sum5; 757 v1 =v5; /* Since the next block to be processed starts there */ 758 idx +=4*sz; 759 break; 760 default : 761 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 762 } 763 } 764 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 765 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 766 if (zz != yy) { 767 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 768 } 769 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 770 PetscFunctionReturn(0); 771 } 772 773 /* ----------------------------------------------------------- */ 774 #undef __FUNCT__ 775 #define __FUNCT__ "MatSolve_Inode" 776 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 777 { 778 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 779 IS iscol = a->col,isrow = a->row; 780 PetscErrorCode ierr; 781 PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j; 782 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout; 783 PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1; 784 PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5; 785 786 PetscFunctionBegin; 787 if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix"); 788 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 789 node_max = a->inode.node_count; 790 ns = a->inode.size; /* Node Size array */ 791 792 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 793 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 794 tmp = a->solve_work; 795 796 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 797 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 798 799 /* forward solve the lower triangular */ 800 tmps = tmp ; 801 aa = a_a ; 802 aj = a_j ; 803 ad = a->diag; 804 805 for (i = 0,row = 0; i< node_max; ++i){ 806 nsz = ns[i]; 807 aii = ai[row]; 808 v1 = aa + aii; 809 vi = aj + aii; 810 nz = ad[row]- aii; 811 812 switch (nsz){ /* Each loop in 'case' is unrolled */ 813 case 1 : 814 sum1 = b[*r++]; 815 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 816 for(j=0; j<nz-1; j+=2){ 817 i0 = vi[0]; 818 i1 = vi[1]; 819 vi +=2; 820 tmp0 = tmps[i0]; 821 tmp1 = tmps[i1]; 822 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 823 } 824 if(j == nz-1){ 825 tmp0 = tmps[*vi++]; 826 sum1 -= *v1++ *tmp0; 827 } 828 tmp[row ++]=sum1; 829 break; 830 case 2: 831 sum1 = b[*r++]; 832 sum2 = b[*r++]; 833 v2 = aa + ai[row+1]; 834 835 for(j=0; j<nz-1; j+=2){ 836 i0 = vi[0]; 837 i1 = vi[1]; 838 vi +=2; 839 tmp0 = tmps[i0]; 840 tmp1 = tmps[i1]; 841 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 842 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 843 } 844 if(j == nz-1){ 845 tmp0 = tmps[*vi++]; 846 sum1 -= *v1++ *tmp0; 847 sum2 -= *v2++ *tmp0; 848 } 849 sum2 -= *v2++ * sum1; 850 tmp[row ++]=sum1; 851 tmp[row ++]=sum2; 852 break; 853 case 3: 854 sum1 = b[*r++]; 855 sum2 = b[*r++]; 856 sum3 = b[*r++]; 857 v2 = aa + ai[row+1]; 858 v3 = aa + ai[row+2]; 859 860 for (j=0; j<nz-1; j+=2){ 861 i0 = vi[0]; 862 i1 = vi[1]; 863 vi +=2; 864 tmp0 = tmps[i0]; 865 tmp1 = tmps[i1]; 866 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 867 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 868 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 869 } 870 if (j == nz-1){ 871 tmp0 = tmps[*vi++]; 872 sum1 -= *v1++ *tmp0; 873 sum2 -= *v2++ *tmp0; 874 sum3 -= *v3++ *tmp0; 875 } 876 sum2 -= *v2++ * sum1; 877 sum3 -= *v3++ * sum1; 878 sum3 -= *v3++ * sum2; 879 tmp[row ++]=sum1; 880 tmp[row ++]=sum2; 881 tmp[row ++]=sum3; 882 break; 883 884 case 4: 885 sum1 = b[*r++]; 886 sum2 = b[*r++]; 887 sum3 = b[*r++]; 888 sum4 = b[*r++]; 889 v2 = aa + ai[row+1]; 890 v3 = aa + ai[row+2]; 891 v4 = aa + ai[row+3]; 892 893 for (j=0; j<nz-1; j+=2){ 894 i0 = vi[0]; 895 i1 = vi[1]; 896 vi +=2; 897 tmp0 = tmps[i0]; 898 tmp1 = tmps[i1]; 899 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 900 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 901 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 902 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 903 } 904 if (j == nz-1){ 905 tmp0 = tmps[*vi++]; 906 sum1 -= *v1++ *tmp0; 907 sum2 -= *v2++ *tmp0; 908 sum3 -= *v3++ *tmp0; 909 sum4 -= *v4++ *tmp0; 910 } 911 sum2 -= *v2++ * sum1; 912 sum3 -= *v3++ * sum1; 913 sum4 -= *v4++ * sum1; 914 sum3 -= *v3++ * sum2; 915 sum4 -= *v4++ * sum2; 916 sum4 -= *v4++ * sum3; 917 918 tmp[row ++]=sum1; 919 tmp[row ++]=sum2; 920 tmp[row ++]=sum3; 921 tmp[row ++]=sum4; 922 break; 923 case 5: 924 sum1 = b[*r++]; 925 sum2 = b[*r++]; 926 sum3 = b[*r++]; 927 sum4 = b[*r++]; 928 sum5 = b[*r++]; 929 v2 = aa + ai[row+1]; 930 v3 = aa + ai[row+2]; 931 v4 = aa + ai[row+3]; 932 v5 = aa + ai[row+4]; 933 934 for (j=0; j<nz-1; j+=2){ 935 i0 = vi[0]; 936 i1 = vi[1]; 937 vi +=2; 938 tmp0 = tmps[i0]; 939 tmp1 = tmps[i1]; 940 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 941 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 942 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 943 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 944 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 945 } 946 if (j == nz-1){ 947 tmp0 = tmps[*vi++]; 948 sum1 -= *v1++ *tmp0; 949 sum2 -= *v2++ *tmp0; 950 sum3 -= *v3++ *tmp0; 951 sum4 -= *v4++ *tmp0; 952 sum5 -= *v5++ *tmp0; 953 } 954 955 sum2 -= *v2++ * sum1; 956 sum3 -= *v3++ * sum1; 957 sum4 -= *v4++ * sum1; 958 sum5 -= *v5++ * sum1; 959 sum3 -= *v3++ * sum2; 960 sum4 -= *v4++ * sum2; 961 sum5 -= *v5++ * sum2; 962 sum4 -= *v4++ * sum3; 963 sum5 -= *v5++ * sum3; 964 sum5 -= *v5++ * sum4; 965 966 tmp[row ++]=sum1; 967 tmp[row ++]=sum2; 968 tmp[row ++]=sum3; 969 tmp[row ++]=sum4; 970 tmp[row ++]=sum5; 971 break; 972 default: 973 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 974 } 975 } 976 /* backward solve the upper triangular */ 977 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 978 nsz = ns[i]; 979 aii = ai[row+1] -1; 980 v1 = aa + aii; 981 vi = aj + aii; 982 nz = aii- ad[row]; 983 switch (nsz){ /* Each loop in 'case' is unrolled */ 984 case 1 : 985 sum1 = tmp[row]; 986 987 for(j=nz ; j>1; j-=2){ 988 vi -=2; 989 i0 = vi[2]; 990 i1 = vi[1]; 991 tmp0 = tmps[i0]; 992 tmp1 = tmps[i1]; 993 v1 -= 2; 994 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 995 } 996 if (j==1){ 997 tmp0 = tmps[*vi--]; 998 sum1 -= *v1-- * tmp0; 999 } 1000 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1001 break; 1002 case 2 : 1003 sum1 = tmp[row]; 1004 sum2 = tmp[row -1]; 1005 v2 = aa + ai[row]-1; 1006 for (j=nz ; j>1; j-=2){ 1007 vi -=2; 1008 i0 = vi[2]; 1009 i1 = vi[1]; 1010 tmp0 = tmps[i0]; 1011 tmp1 = tmps[i1]; 1012 v1 -= 2; 1013 v2 -= 2; 1014 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1015 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1016 } 1017 if (j==1){ 1018 tmp0 = tmps[*vi--]; 1019 sum1 -= *v1-- * tmp0; 1020 sum2 -= *v2-- * tmp0; 1021 } 1022 1023 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1024 sum2 -= *v2-- * tmp0; 1025 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1026 break; 1027 case 3 : 1028 sum1 = tmp[row]; 1029 sum2 = tmp[row -1]; 1030 sum3 = tmp[row -2]; 1031 v2 = aa + ai[row]-1; 1032 v3 = aa + ai[row -1]-1; 1033 for (j=nz ; j>1; j-=2){ 1034 vi -=2; 1035 i0 = vi[2]; 1036 i1 = vi[1]; 1037 tmp0 = tmps[i0]; 1038 tmp1 = tmps[i1]; 1039 v1 -= 2; 1040 v2 -= 2; 1041 v3 -= 2; 1042 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1043 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1044 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1045 } 1046 if (j==1){ 1047 tmp0 = tmps[*vi--]; 1048 sum1 -= *v1-- * tmp0; 1049 sum2 -= *v2-- * tmp0; 1050 sum3 -= *v3-- * tmp0; 1051 } 1052 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1053 sum2 -= *v2-- * tmp0; 1054 sum3 -= *v3-- * tmp0; 1055 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1056 sum3 -= *v3-- * tmp0; 1057 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1058 1059 break; 1060 case 4 : 1061 sum1 = tmp[row]; 1062 sum2 = tmp[row -1]; 1063 sum3 = tmp[row -2]; 1064 sum4 = tmp[row -3]; 1065 v2 = aa + ai[row]-1; 1066 v3 = aa + ai[row -1]-1; 1067 v4 = aa + ai[row -2]-1; 1068 1069 for (j=nz ; j>1; j-=2){ 1070 vi -=2; 1071 i0 = vi[2]; 1072 i1 = vi[1]; 1073 tmp0 = tmps[i0]; 1074 tmp1 = tmps[i1]; 1075 v1 -= 2; 1076 v2 -= 2; 1077 v3 -= 2; 1078 v4 -= 2; 1079 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1080 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1081 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1082 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1083 } 1084 if (j==1){ 1085 tmp0 = tmps[*vi--]; 1086 sum1 -= *v1-- * tmp0; 1087 sum2 -= *v2-- * tmp0; 1088 sum3 -= *v3-- * tmp0; 1089 sum4 -= *v4-- * tmp0; 1090 } 1091 1092 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1093 sum2 -= *v2-- * tmp0; 1094 sum3 -= *v3-- * tmp0; 1095 sum4 -= *v4-- * tmp0; 1096 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1097 sum3 -= *v3-- * tmp0; 1098 sum4 -= *v4-- * tmp0; 1099 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1100 sum4 -= *v4-- * tmp0; 1101 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1102 break; 1103 case 5 : 1104 sum1 = tmp[row]; 1105 sum2 = tmp[row -1]; 1106 sum3 = tmp[row -2]; 1107 sum4 = tmp[row -3]; 1108 sum5 = tmp[row -4]; 1109 v2 = aa + ai[row]-1; 1110 v3 = aa + ai[row -1]-1; 1111 v4 = aa + ai[row -2]-1; 1112 v5 = aa + ai[row -3]-1; 1113 for (j=nz ; j>1; j-=2){ 1114 vi -= 2; 1115 i0 = vi[2]; 1116 i1 = vi[1]; 1117 tmp0 = tmps[i0]; 1118 tmp1 = tmps[i1]; 1119 v1 -= 2; 1120 v2 -= 2; 1121 v3 -= 2; 1122 v4 -= 2; 1123 v5 -= 2; 1124 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1125 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1126 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1127 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1128 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1129 } 1130 if (j==1){ 1131 tmp0 = tmps[*vi--]; 1132 sum1 -= *v1-- * tmp0; 1133 sum2 -= *v2-- * tmp0; 1134 sum3 -= *v3-- * tmp0; 1135 sum4 -= *v4-- * tmp0; 1136 sum5 -= *v5-- * tmp0; 1137 } 1138 1139 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1140 sum2 -= *v2-- * tmp0; 1141 sum3 -= *v3-- * tmp0; 1142 sum4 -= *v4-- * tmp0; 1143 sum5 -= *v5-- * tmp0; 1144 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1145 sum3 -= *v3-- * tmp0; 1146 sum4 -= *v4-- * tmp0; 1147 sum5 -= *v5-- * tmp0; 1148 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1152 sum5 -= *v5-- * tmp0; 1153 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1154 break; 1155 default: 1156 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1157 } 1158 } 1159 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1160 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1161 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1162 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1163 ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr); 1164 PetscFunctionReturn(0); 1165 } 1166 1167 #undef __FUNCT__ 1168 #define __FUNCT__ "MatLUFactorNumeric_Inode" 1169 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B) 1170 { 1171 Mat C = *B; 1172 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1173 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1174 PetscErrorCode ierr; 1175 PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i; 1176 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1177 PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1178 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1179 PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3; 1180 PetscScalar tmp,*ba = b->a,*aa = a->a,*pv; 1181 PetscReal rs=0.0; 1182 LUShift_Ctx sctx; 1183 PetscInt newshift; 1184 1185 PetscFunctionBegin; 1186 sctx.shift_top = 0; 1187 sctx.nshift_max = 0; 1188 sctx.shift_lo = 0; 1189 sctx.shift_hi = 0; 1190 1191 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1192 if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0; 1193 if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1194 sctx.shift_top = 0; 1195 for (i=0; i<n; i++) { 1196 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1197 rs = 0.0; 1198 ajtmp = aj + ai[i]; 1199 rtmp1 = aa + ai[i]; 1200 nz = ai[i+1] - ai[i]; 1201 for (j=0; j<nz; j++){ 1202 if (*ajtmp != i){ 1203 rs += PetscAbsScalar(*rtmp1++); 1204 } else { 1205 rs -= PetscRealPart(*rtmp1++); 1206 } 1207 ajtmp++; 1208 } 1209 if (rs>sctx.shift_top) sctx.shift_top = rs; 1210 } 1211 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1212 sctx.shift_top *= 1.1; 1213 sctx.nshift_max = 5; 1214 sctx.shift_lo = 0.; 1215 sctx.shift_hi = 1.; 1216 } 1217 sctx.shift_amount = 0; 1218 sctx.nshift = 0; 1219 1220 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1221 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1222 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1223 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1224 ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1225 ics = ic ; 1226 rtmp2 = rtmp1 + n; 1227 rtmp3 = rtmp2 + n; 1228 1229 node_max = a->inode.node_count; 1230 ns = a->inode.size ; 1231 if (!ns){ 1232 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1233 } 1234 1235 /* If max inode size > 3, split it into two inodes.*/ 1236 /* also map the inode sizes according to the ordering */ 1237 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1238 for (i=0,j=0; i<node_max; ++i,++j){ 1239 if (ns[i]>3) { 1240 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1241 ++j; 1242 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1243 } else { 1244 tmp_vec1[j] = ns[i]; 1245 } 1246 } 1247 /* Use the correct node_max */ 1248 node_max = j; 1249 1250 /* Now reorder the inode info based on mat re-ordering info */ 1251 /* First create a row -> inode_size_array_index map */ 1252 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1253 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1254 for (i=0,row=0; i<node_max; i++) { 1255 nodesz = tmp_vec1[i]; 1256 for (j=0; j<nodesz; j++,row++) { 1257 nsmap[row] = i; 1258 } 1259 } 1260 /* Using nsmap, create a reordered ns structure */ 1261 for (i=0,j=0; i< node_max; i++) { 1262 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1263 tmp_vec2[i] = nodesz; 1264 j += nodesz; 1265 } 1266 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1267 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1268 /* Now use the correct ns */ 1269 ns = tmp_vec2; 1270 1271 do { 1272 sctx.lushift = PETSC_FALSE; 1273 /* Now loop over each block-row, and do the factorization */ 1274 for (i=0,row=0; i<node_max; i++) { 1275 nodesz = ns[i]; 1276 nz = bi[row+1] - bi[row]; 1277 bjtmp = bj + bi[row]; 1278 1279 switch (nodesz){ 1280 case 1: 1281 for (j=0; j<nz; j++){ 1282 idx = bjtmp[j]; 1283 rtmp1[idx] = 0.0; 1284 } 1285 1286 /* load in initial (unfactored row) */ 1287 idx = r[row]; 1288 nz_tmp = ai[idx+1] - ai[idx]; 1289 ajtmp = aj + ai[idx]; 1290 v1 = aa + ai[idx]; 1291 1292 for (j=0; j<nz_tmp; j++) { 1293 idx = ics[ajtmp[j]]; 1294 rtmp1[idx] = v1[j]; 1295 } 1296 rtmp1[ics[r[row]]] += sctx.shift_amount; 1297 1298 prow = *bjtmp++ ; 1299 while (prow < row) { 1300 pc1 = rtmp1 + prow; 1301 if (*pc1 != 0.0){ 1302 pv = ba + bd[prow]; 1303 pj = nbj + bd[prow]; 1304 mul1 = *pc1 * *pv++; 1305 *pc1 = mul1; 1306 nz_tmp = bi[prow+1] - bd[prow] - 1; 1307 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1308 for (j=0; j<nz_tmp; j++) { 1309 tmp = pv[j]; 1310 idx = pj[j]; 1311 rtmp1[idx] -= mul1 * tmp; 1312 } 1313 } 1314 prow = *bjtmp++ ; 1315 } 1316 pj = bj + bi[row]; 1317 pc1 = ba + bi[row]; 1318 1319 sctx.pv = rtmp1[row]; 1320 rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */ 1321 rs = 0.0; 1322 for (j=0; j<nz; j++) { 1323 idx = pj[j]; 1324 pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */ 1325 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1326 } 1327 sctx.rs = rs; 1328 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1329 if (newshift == 1) goto endofwhile; 1330 break; 1331 1332 case 2: 1333 for (j=0; j<nz; j++) { 1334 idx = bjtmp[j]; 1335 rtmp1[idx] = 0.0; 1336 rtmp2[idx] = 0.0; 1337 } 1338 1339 /* load in initial (unfactored row) */ 1340 idx = r[row]; 1341 nz_tmp = ai[idx+1] - ai[idx]; 1342 ajtmp = aj + ai[idx]; 1343 v1 = aa + ai[idx]; 1344 v2 = aa + ai[idx+1]; 1345 for (j=0; j<nz_tmp; j++) { 1346 idx = ics[ajtmp[j]]; 1347 rtmp1[idx] = v1[j]; 1348 rtmp2[idx] = v2[j]; 1349 } 1350 rtmp1[ics[r[row]]] += sctx.shift_amount; 1351 rtmp2[ics[r[row+1]]] += sctx.shift_amount; 1352 1353 prow = *bjtmp++ ; 1354 while (prow < row) { 1355 pc1 = rtmp1 + prow; 1356 pc2 = rtmp2 + prow; 1357 if (*pc1 != 0.0 || *pc2 != 0.0){ 1358 pv = ba + bd[prow]; 1359 pj = nbj + bd[prow]; 1360 mul1 = *pc1 * *pv; 1361 mul2 = *pc2 * *pv; 1362 ++pv; 1363 *pc1 = mul1; 1364 *pc2 = mul2; 1365 1366 nz_tmp = bi[prow+1] - bd[prow] - 1; 1367 for (j=0; j<nz_tmp; j++) { 1368 tmp = pv[j]; 1369 idx = pj[j]; 1370 rtmp1[idx] -= mul1 * tmp; 1371 rtmp2[idx] -= mul2 * tmp; 1372 } 1373 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1374 } 1375 prow = *bjtmp++ ; 1376 } 1377 1378 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1379 pc1 = rtmp1 + prow; 1380 pc2 = rtmp2 + prow; 1381 1382 sctx.pv = *pc1; 1383 pj = bj + bi[prow]; 1384 rs = 0.0; 1385 for (j=0; j<nz; j++){ 1386 idx = pj[j]; 1387 if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]); 1388 } 1389 sctx.rs = rs; 1390 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1391 if (newshift == 1) goto endofwhile; 1392 1393 if (*pc2 != 0.0){ 1394 pj = nbj + bd[prow]; 1395 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1396 *pc2 = mul2; 1397 nz_tmp = bi[prow+1] - bd[prow] - 1; 1398 for (j=0; j<nz_tmp; j++) { 1399 idx = pj[j] ; 1400 tmp = rtmp1[idx]; 1401 rtmp2[idx] -= mul2 * tmp; 1402 } 1403 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1404 } 1405 1406 pj = bj + bi[row]; 1407 pc1 = ba + bi[row]; 1408 pc2 = ba + bi[row+1]; 1409 1410 sctx.pv = rtmp2[row+1]; 1411 rs = 0.0; 1412 rtmp1[row] = 1.0/rtmp1[row]; 1413 rtmp2[row+1] = 1.0/rtmp2[row+1]; 1414 /* copy row entries from dense representation to sparse */ 1415 for (j=0; j<nz; j++) { 1416 idx = pj[j]; 1417 pc1[j] = rtmp1[idx]; 1418 pc2[j] = rtmp2[idx]; 1419 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1420 } 1421 sctx.rs = rs; 1422 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1423 if (newshift == 1) goto endofwhile; 1424 break; 1425 1426 case 3: 1427 for (j=0; j<nz; j++) { 1428 idx = bjtmp[j]; 1429 rtmp1[idx] = 0.0; 1430 rtmp2[idx] = 0.0; 1431 rtmp3[idx] = 0.0; 1432 } 1433 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1434 idx = r[row]; 1435 nz_tmp = ai[idx+1] - ai[idx]; 1436 ajtmp = aj + ai[idx]; 1437 v1 = aa + ai[idx]; 1438 v2 = aa + ai[idx+1]; 1439 v3 = aa + ai[idx+2]; 1440 for (j=0; j<nz_tmp; j++) { 1441 idx = ics[ajtmp[j]]; 1442 rtmp1[idx] = v1[j]; 1443 rtmp2[idx] = v2[j]; 1444 rtmp3[idx] = v3[j]; 1445 } 1446 rtmp1[ics[r[row]]] += sctx.shift_amount; 1447 rtmp2[ics[r[row+1]]] += sctx.shift_amount; 1448 rtmp3[ics[r[row+2]]] += sctx.shift_amount; 1449 1450 /* loop over all pivot row blocks above this row block */ 1451 prow = *bjtmp++ ; 1452 while (prow < row) { 1453 pc1 = rtmp1 + prow; 1454 pc2 = rtmp2 + prow; 1455 pc3 = rtmp3 + prow; 1456 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1457 pv = ba + bd[prow]; 1458 pj = nbj + bd[prow]; 1459 mul1 = *pc1 * *pv; 1460 mul2 = *pc2 * *pv; 1461 mul3 = *pc3 * *pv; 1462 ++pv; 1463 *pc1 = mul1; 1464 *pc2 = mul2; 1465 *pc3 = mul3; 1466 1467 nz_tmp = bi[prow+1] - bd[prow] - 1; 1468 /* update this row based on pivot row */ 1469 for (j=0; j<nz_tmp; j++) { 1470 tmp = pv[j]; 1471 idx = pj[j]; 1472 rtmp1[idx] -= mul1 * tmp; 1473 rtmp2[idx] -= mul2 * tmp; 1474 rtmp3[idx] -= mul3 * tmp; 1475 } 1476 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1477 } 1478 prow = *bjtmp++ ; 1479 } 1480 1481 /* Now take care of diagonal 3x3 block in this set of rows */ 1482 /* note: prow = row here */ 1483 pc1 = rtmp1 + prow; 1484 pc2 = rtmp2 + prow; 1485 pc3 = rtmp3 + prow; 1486 1487 sctx.pv = *pc1; 1488 pj = bj + bi[prow]; 1489 rs = 0.0; 1490 for (j=0; j<nz; j++){ 1491 idx = pj[j]; 1492 if (idx != row) rs += PetscAbsScalar(rtmp1[idx]); 1493 } 1494 sctx.rs = rs; 1495 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1496 if (newshift == 1) goto endofwhile; 1497 1498 if (*pc2 != 0.0 || *pc3 != 0.0){ 1499 mul2 = (*pc2)/(*pc1); 1500 mul3 = (*pc3)/(*pc1); 1501 *pc2 = mul2; 1502 *pc3 = mul3; 1503 nz_tmp = bi[prow+1] - bd[prow] - 1; 1504 pj = nbj + bd[prow]; 1505 for (j=0; j<nz_tmp; j++) { 1506 idx = pj[j] ; 1507 tmp = rtmp1[idx]; 1508 rtmp2[idx] -= mul2 * tmp; 1509 rtmp3[idx] -= mul3 * tmp; 1510 } 1511 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1512 } 1513 ++prow; 1514 1515 pc2 = rtmp2 + prow; 1516 pc3 = rtmp3 + prow; 1517 sctx.pv = *pc2; 1518 pj = bj + bi[prow]; 1519 rs = 0.0; 1520 for (j=0; j<nz; j++){ 1521 idx = pj[j]; 1522 if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]); 1523 } 1524 sctx.rs = rs; 1525 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1526 if (newshift == 1) goto endofwhile; 1527 1528 if (*pc3 != 0.0){ 1529 mul3 = (*pc3)/(*pc2); 1530 *pc3 = mul3; 1531 pj = nbj + bd[prow]; 1532 nz_tmp = bi[prow+1] - bd[prow] - 1; 1533 for (j=0; j<nz_tmp; j++) { 1534 idx = pj[j] ; 1535 tmp = rtmp2[idx]; 1536 rtmp3[idx] -= mul3 * tmp; 1537 } 1538 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1539 } 1540 1541 pj = bj + bi[row]; 1542 pc1 = ba + bi[row]; 1543 pc2 = ba + bi[row+1]; 1544 pc3 = ba + bi[row+2]; 1545 1546 sctx.pv = rtmp3[row+2]; 1547 rs = 0.0; 1548 rtmp1[row] = 1.0/rtmp1[row]; 1549 rtmp2[row+1] = 1.0/rtmp2[row+1]; 1550 rtmp3[row+2] = 1.0/rtmp3[row+2]; 1551 /* copy row entries from dense representation to sparse */ 1552 for (j=0; j<nz; j++) { 1553 idx = pj[j]; 1554 pc1[j] = rtmp1[idx]; 1555 pc2[j] = rtmp2[idx]; 1556 pc3[j] = rtmp3[idx]; 1557 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1558 } 1559 1560 sctx.rs = rs; 1561 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1562 if (newshift == 1) goto endofwhile; 1563 break; 1564 1565 default: 1566 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1567 } 1568 row += nodesz; /* Update the row */ 1569 } 1570 endofwhile:; 1571 } while (sctx.lushift); 1572 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1573 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1574 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1575 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1576 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1577 C->factor = FACTOR_LU; 1578 C->assembled = PETSC_TRUE; 1579 if (sctx.nshift) { 1580 if (info->shiftnz) { 1581 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1582 } else if (info->shiftpd) { 1583 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1584 } 1585 } 1586 ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr); 1587 PetscFunctionReturn(0); 1588 } 1589 1590 /* 1591 Makes a longer coloring[] array and calls the usual code with that 1592 */ 1593 #undef __FUNCT__ 1594 #define __FUNCT__ "MatColoringPatch_Inode" 1595 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1596 { 1597 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1598 PetscErrorCode ierr; 1599 PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1600 PetscInt *colorused,i; 1601 ISColoringValue *newcolor; 1602 1603 PetscFunctionBegin; 1604 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1605 /* loop over inodes, marking a color for each column*/ 1606 row = 0; 1607 for (i=0; i<m; i++){ 1608 for (j=0; j<ns[i]; j++) { 1609 newcolor[row++] = coloring[i] + j*ncolors; 1610 } 1611 } 1612 1613 /* eliminate unneeded colors */ 1614 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1615 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1616 for (i=0; i<n; i++) { 1617 colorused[newcolor[i]] = 1; 1618 } 1619 1620 for (i=1; i<5*ncolors; i++) { 1621 colorused[i] += colorused[i-1]; 1622 } 1623 ncolors = colorused[5*ncolors-1]; 1624 for (i=0; i<n; i++) { 1625 newcolor[i] = colorused[newcolor[i]]-1; 1626 } 1627 ierr = PetscFree(colorused);CHKERRQ(ierr); 1628 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1629 ierr = PetscFree(coloring);CHKERRQ(ierr); 1630 PetscFunctionReturn(0); 1631 } 1632 1633 #include "src/inline/ilu.h" 1634 1635 #undef __FUNCT__ 1636 #define __FUNCT__ "MatRelax_Inode" 1637 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1638 { 1639 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1640 PetscScalar *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3,*v1,*v2,*v3,*v4,*v5; 1641 PetscScalar *v = a->a,*b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1642 PetscReal zeropivot = 1.0e-15; 1643 PetscErrorCode ierr; 1644 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1645 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1646 1647 PetscFunctionBegin; 1648 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1649 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1650 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode"); 1651 1652 if (!a->inode.ibdiagvalid) { 1653 if (!a->inode.ibdiag) { 1654 /* calculate space needed for diagonal blocks */ 1655 for (i=0; i<m; i++) { 1656 cnt += sizes[i]*sizes[i]; 1657 } 1658 a->inode.bdiagsize = cnt; 1659 ierr = PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);CHKERRQ(ierr); 1660 } 1661 1662 /* copy over the diagonal blocks and invert them */ 1663 ibdiag = a->inode.ibdiag; 1664 bdiag = a->inode.bdiag; 1665 cnt = 0; 1666 for (i=0, row = 0; i<m; i++) { 1667 for (j=0; j<sizes[i]; j++) { 1668 for (k=0; k<sizes[i]; k++) { 1669 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1670 } 1671 } 1672 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));CHKERRQ(ierr); 1673 1674 switch(sizes[i]) { 1675 case 1: 1676 /* Create matrix data structure */ 1677 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1678 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1679 break; 1680 case 2: 1681 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt);CHKERRQ(ierr); 1682 break; 1683 case 3: 1684 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt);CHKERRQ(ierr); 1685 break; 1686 case 4: 1687 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt);CHKERRQ(ierr); 1688 break; 1689 case 5: 1690 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt);CHKERRQ(ierr); 1691 break; 1692 default: 1693 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1694 } 1695 cnt += sizes[i]*sizes[i]; 1696 row += sizes[i]; 1697 } 1698 a->inode.ibdiagvalid = PETSC_TRUE; 1699 } 1700 ibdiag = a->inode.ibdiag; 1701 bdiag = a->inode.bdiag; 1702 1703 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1704 if (xx != bb) { 1705 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1706 } else { 1707 b = x; 1708 } 1709 1710 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1711 xs = x; 1712 if (flag & SOR_ZERO_INITIAL_GUESS) { 1713 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1714 1715 for (i=0, row=0; i<m; i++) { 1716 sz = diag[row] - ii[row]; 1717 v1 = a->a + ii[row]; 1718 idx = a->j + ii[row]; 1719 1720 /* see comments for MatMult_Inode() for how this is coded */ 1721 switch (sizes[i]){ 1722 case 1: 1723 1724 sum1 = b[row]; 1725 for(n = 0; n<sz-1; n+=2) { 1726 i1 = idx[0]; 1727 i2 = idx[1]; 1728 idx += 2; 1729 tmp0 = x[i1]; 1730 tmp1 = x[i2]; 1731 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1732 } 1733 1734 if (n == sz-1){ 1735 tmp0 = x[*idx]; 1736 sum1 -= *v1 * tmp0; 1737 } 1738 x[row++] = sum1*(*ibdiag++); 1739 break; 1740 case 2: 1741 v2 = a->a + ii[row+1]; 1742 sum1 = b[row]; 1743 sum2 = b[row+1]; 1744 for(n = 0; n<sz-1; n+=2) { 1745 i1 = idx[0]; 1746 i2 = idx[1]; 1747 idx += 2; 1748 tmp0 = x[i1]; 1749 tmp1 = x[i2]; 1750 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1751 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1752 } 1753 1754 if (n == sz-1){ 1755 tmp0 = x[*idx]; 1756 sum1 -= v1[0] * tmp0; 1757 sum2 -= v2[0] * tmp0; 1758 } 1759 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1760 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1761 ibdiag += 4; 1762 break; 1763 case 3: 1764 v2 = a->a + ii[row+1]; 1765 v3 = a->a + ii[row+2]; 1766 sum1 = b[row]; 1767 sum2 = b[row+1]; 1768 sum3 = b[row+2]; 1769 for(n = 0; n<sz-1; n+=2) { 1770 i1 = idx[0]; 1771 i2 = idx[1]; 1772 idx += 2; 1773 tmp0 = x[i1]; 1774 tmp1 = x[i2]; 1775 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1776 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1777 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1778 } 1779 1780 if (n == sz-1){ 1781 tmp0 = x[*idx]; 1782 sum1 -= v1[0] * tmp0; 1783 sum2 -= v2[0] * tmp0; 1784 sum3 -= v3[0] * tmp0; 1785 } 1786 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1787 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1788 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1789 ibdiag += 9; 1790 break; 1791 case 4: 1792 v2 = a->a + ii[row+1]; 1793 v3 = a->a + ii[row+2]; 1794 v4 = a->a + ii[row+3]; 1795 sum1 = b[row]; 1796 sum2 = b[row+1]; 1797 sum3 = b[row+2]; 1798 sum4 = b[row+3]; 1799 for(n = 0; n<sz-1; n+=2) { 1800 i1 = idx[0]; 1801 i2 = idx[1]; 1802 idx += 2; 1803 tmp0 = x[i1]; 1804 tmp1 = x[i2]; 1805 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1806 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1807 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1808 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1809 } 1810 1811 if (n == sz-1){ 1812 tmp0 = x[*idx]; 1813 sum1 -= v1[0] * tmp0; 1814 sum2 -= v2[0] * tmp0; 1815 sum3 -= v3[0] * tmp0; 1816 sum4 -= v4[0] * tmp0; 1817 } 1818 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1819 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1820 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1821 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1822 ibdiag += 16; 1823 break; 1824 case 5: 1825 v2 = a->a + ii[row+1]; 1826 v3 = a->a + ii[row+2]; 1827 v4 = a->a + ii[row+3]; 1828 v5 = a->a + ii[row+4]; 1829 sum1 = b[row]; 1830 sum2 = b[row+1]; 1831 sum3 = b[row+2]; 1832 sum4 = b[row+3]; 1833 sum5 = b[row+4]; 1834 for(n = 0; n<sz-1; n+=2) { 1835 i1 = idx[0]; 1836 i2 = idx[1]; 1837 idx += 2; 1838 tmp0 = x[i1]; 1839 tmp1 = x[i2]; 1840 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1841 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1842 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1843 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1844 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1845 } 1846 1847 if (n == sz-1){ 1848 tmp0 = x[*idx]; 1849 sum1 -= v1[0] * tmp0; 1850 sum2 -= v2[0] * tmp0; 1851 sum3 -= v3[0] * tmp0; 1852 sum4 -= v4[0] * tmp0; 1853 sum5 -= v5[0] * tmp0; 1854 } 1855 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1856 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1857 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1858 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1859 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1860 ibdiag += 25; 1861 break; 1862 default: 1863 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1864 } 1865 } 1866 1867 xb = x; 1868 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1869 } else xb = b; 1870 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1871 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1872 cnt = 0; 1873 for (i=0, row=0; i<m; i++) { 1874 1875 switch (sizes[i]){ 1876 case 1: 1877 x[row++] *= bdiag[cnt++]; 1878 break; 1879 case 2: 1880 x1 = x[row]; x2 = x[row+1]; 1881 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1882 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1883 x[row++] = tmp1; 1884 x[row++] = tmp2; 1885 cnt += 4; 1886 break; 1887 case 3: 1888 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1889 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1890 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1891 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1892 x[row++] = tmp1; 1893 x[row++] = tmp2; 1894 x[row++] = tmp3; 1895 cnt += 9; 1896 break; 1897 case 4: 1898 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1899 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1900 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1901 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1902 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1903 x[row++] = tmp1; 1904 x[row++] = tmp2; 1905 x[row++] = tmp3; 1906 x[row++] = tmp4; 1907 cnt += 16; 1908 break; 1909 case 5: 1910 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1911 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1912 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1913 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1914 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1915 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1916 x[row++] = tmp1; 1917 x[row++] = tmp2; 1918 x[row++] = tmp3; 1919 x[row++] = tmp4; 1920 x[row++] = tmp5; 1921 cnt += 25; 1922 break; 1923 default: 1924 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1925 } 1926 } 1927 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1928 } 1929 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1930 1931 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1932 for (i=m-1, row=A->rmap.n-1; i>=0; i--) { 1933 ibdiag -= sizes[i]*sizes[i]; 1934 sz = ii[row+1] - diag[row] - 1; 1935 v1 = a->a + diag[row] + 1; 1936 idx = a->j + diag[row] + 1; 1937 1938 /* see comments for MatMult_Inode() for how this is coded */ 1939 switch (sizes[i]){ 1940 case 1: 1941 1942 sum1 = xb[row]; 1943 for(n = 0; n<sz-1; n+=2) { 1944 i1 = idx[0]; 1945 i2 = idx[1]; 1946 idx += 2; 1947 tmp0 = x[i1]; 1948 tmp1 = x[i2]; 1949 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1950 } 1951 1952 if (n == sz-1){ 1953 tmp0 = x[*idx]; 1954 sum1 -= *v1*tmp0; 1955 } 1956 x[row--] = sum1*(*ibdiag); 1957 break; 1958 1959 case 2: 1960 1961 sum1 = xb[row]; 1962 sum2 = xb[row-1]; 1963 /* note that sum1 is associated with the second of the two rows */ 1964 v2 = a->a + diag[row-1] + 2; 1965 for(n = 0; n<sz-1; n+=2) { 1966 i1 = idx[0]; 1967 i2 = idx[1]; 1968 idx += 2; 1969 tmp0 = x[i1]; 1970 tmp1 = x[i2]; 1971 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1972 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1973 } 1974 1975 if (n == sz-1){ 1976 tmp0 = x[*idx]; 1977 sum1 -= *v1*tmp0; 1978 sum2 -= *v2*tmp0; 1979 } 1980 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1981 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1982 break; 1983 case 3: 1984 1985 sum1 = xb[row]; 1986 sum2 = xb[row-1]; 1987 sum3 = xb[row-2]; 1988 v2 = a->a + diag[row-1] + 2; 1989 v3 = a->a + diag[row-2] + 3; 1990 for(n = 0; n<sz-1; n+=2) { 1991 i1 = idx[0]; 1992 i2 = idx[1]; 1993 idx += 2; 1994 tmp0 = x[i1]; 1995 tmp1 = x[i2]; 1996 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1997 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1998 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1999 } 2000 2001 if (n == sz-1){ 2002 tmp0 = x[*idx]; 2003 sum1 -= *v1*tmp0; 2004 sum2 -= *v2*tmp0; 2005 sum3 -= *v3*tmp0; 2006 } 2007 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2008 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2009 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2010 break; 2011 case 4: 2012 2013 sum1 = xb[row]; 2014 sum2 = xb[row-1]; 2015 sum3 = xb[row-2]; 2016 sum4 = xb[row-3]; 2017 v2 = a->a + diag[row-1] + 2; 2018 v3 = a->a + diag[row-2] + 3; 2019 v4 = a->a + diag[row-3] + 4; 2020 for(n = 0; n<sz-1; n+=2) { 2021 i1 = idx[0]; 2022 i2 = idx[1]; 2023 idx += 2; 2024 tmp0 = x[i1]; 2025 tmp1 = x[i2]; 2026 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2027 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2028 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2029 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2030 } 2031 2032 if (n == sz-1){ 2033 tmp0 = x[*idx]; 2034 sum1 -= *v1*tmp0; 2035 sum2 -= *v2*tmp0; 2036 sum3 -= *v3*tmp0; 2037 sum4 -= *v4*tmp0; 2038 } 2039 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2040 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2041 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2042 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2043 break; 2044 case 5: 2045 2046 sum1 = xb[row]; 2047 sum2 = xb[row-1]; 2048 sum3 = xb[row-2]; 2049 sum4 = xb[row-3]; 2050 sum5 = xb[row-4]; 2051 v2 = a->a + diag[row-1] + 2; 2052 v3 = a->a + diag[row-2] + 3; 2053 v4 = a->a + diag[row-3] + 4; 2054 v5 = a->a + diag[row-4] + 5; 2055 for(n = 0; n<sz-1; n+=2) { 2056 i1 = idx[0]; 2057 i2 = idx[1]; 2058 idx += 2; 2059 tmp0 = x[i1]; 2060 tmp1 = x[i2]; 2061 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2062 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2063 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2064 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2065 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2066 } 2067 2068 if (n == sz-1){ 2069 tmp0 = x[*idx]; 2070 sum1 -= *v1*tmp0; 2071 sum2 -= *v2*tmp0; 2072 sum3 -= *v3*tmp0; 2073 sum4 -= *v4*tmp0; 2074 sum5 -= *v5*tmp0; 2075 } 2076 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2077 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2078 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2079 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2080 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2081 break; 2082 default: 2083 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2084 } 2085 } 2086 2087 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2088 } 2089 its--; 2090 } 2091 if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode"); 2092 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2093 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2094 PetscFunctionReturn(0); 2095 } 2096 2097 2098 /* 2099 samestructure indicates that the matrix has not changed its nonzero structure so we 2100 do not need to recompute the inodes 2101 */ 2102 #undef __FUNCT__ 2103 #define __FUNCT__ "Mat_CheckInode" 2104 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2105 { 2106 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2107 PetscErrorCode ierr; 2108 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2109 PetscTruth flag; 2110 2111 PetscFunctionBegin; 2112 if (!a->inode.use) PetscFunctionReturn(0); 2113 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2114 2115 2116 m = A->rmap.n; 2117 if (a->inode.size) {ns = a->inode.size;} 2118 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2119 2120 i = 0; 2121 node_count = 0; 2122 idx = a->j; 2123 ii = a->i; 2124 while (i < m){ /* For each row */ 2125 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2126 /* Limits the number of elements in a node to 'a->inode.limit' */ 2127 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2128 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2129 if (nzy != nzx) break; 2130 idy += nzx; /* Same nonzero pattern */ 2131 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2132 if (!flag) break; 2133 } 2134 ns[node_count++] = blk_size; 2135 idx += blk_size*nzx; 2136 i = j; 2137 } 2138 /* If not enough inodes found,, do not use inode version of the routines */ 2139 if (!a->inode.size && m && node_count > .9*m) { 2140 ierr = PetscFree(ns);CHKERRQ(ierr); 2141 a->inode.node_count = 0; 2142 a->inode.size = PETSC_NULL; 2143 a->inode.use = PETSC_FALSE; 2144 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2145 } else { 2146 A->ops->mult = MatMult_Inode; 2147 A->ops->relax = MatRelax_Inode; 2148 A->ops->multadd = MatMultAdd_Inode; 2149 A->ops->solve = MatSolve_Inode; 2150 A->ops->lufactornumeric = MatLUFactorNumeric_Inode; 2151 A->ops->getrowij = MatGetRowIJ_Inode; 2152 A->ops->restorerowij = MatRestoreRowIJ_Inode; 2153 A->ops->getcolumnij = MatGetColumnIJ_Inode; 2154 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 2155 A->ops->coloringpatch = MatColoringPatch_Inode; 2156 a->inode.node_count = node_count; 2157 a->inode.size = ns; 2158 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2159 } 2160 PetscFunctionReturn(0); 2161 } 2162 2163 /* 2164 This is really ugly. if inodes are used this replaces the 2165 permutations with ones that correspond to rows/cols of the matrix 2166 rather then inode blocks 2167 */ 2168 #undef __FUNCT__ 2169 #define __FUNCT__ "MatInodeAdjustForInodes" 2170 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2171 { 2172 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2173 2174 PetscFunctionBegin; 2175 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2176 if (f) { 2177 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2178 } 2179 PetscFunctionReturn(0); 2180 } 2181 2182 EXTERN_C_BEGIN 2183 #undef __FUNCT__ 2184 #define __FUNCT__ "MatAdjustForInodes_Inode" 2185 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 2186 { 2187 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2188 PetscErrorCode ierr; 2189 PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count; 2190 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2191 PetscInt nslim_col,*ns_col; 2192 IS ris = *rperm,cis = *cperm; 2193 2194 PetscFunctionBegin; 2195 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2196 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2197 2198 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2199 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2200 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2201 permc = permr + m; 2202 2203 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2204 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2205 2206 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2207 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2208 2209 /* Construct the permutations for rows*/ 2210 for (i=0,row = 0; i<nslim_row; ++i){ 2211 indx = ridx[i]; 2212 start_val = tns[indx]; 2213 end_val = tns[indx + 1]; 2214 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2215 } 2216 2217 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2218 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2219 2220 /* Construct permutations for columns */ 2221 for (i=0,col=0; i<nslim_col; ++i){ 2222 indx = cidx[i]; 2223 start_val = tns[indx]; 2224 end_val = tns[indx + 1]; 2225 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2226 } 2227 2228 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2229 ISSetPermutation(*rperm); 2230 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2231 ISSetPermutation(*cperm); 2232 2233 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2234 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2235 2236 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2237 ierr = PetscFree(permr);CHKERRQ(ierr); 2238 ierr = ISDestroy(cis);CHKERRQ(ierr); 2239 ierr = ISDestroy(ris);CHKERRQ(ierr); 2240 ierr = PetscFree(tns);CHKERRQ(ierr); 2241 PetscFunctionReturn(0); 2242 } 2243 EXTERN_C_END 2244 2245 #undef __FUNCT__ 2246 #define __FUNCT__ "MatInodeGetInodeSizes" 2247 /*@C 2248 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2249 2250 Collective on Mat 2251 2252 Input Parameter: 2253 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2254 2255 Output Parameter: 2256 + node_count - no of inodes present in the matrix. 2257 . sizes - an array of size node_count,with sizes of each inode. 2258 - limit - the max size used to generate the inodes. 2259 2260 Level: advanced 2261 2262 Notes: This routine returns some internal storage information 2263 of the matrix, it is intended to be used by advanced users. 2264 It should be called after the matrix is assembled. 2265 The contents of the sizes[] array should not be changed. 2266 PETSC_NULL may be passed for information not requested. 2267 2268 .keywords: matrix, seqaij, get, inode 2269 2270 .seealso: MatGetInfo() 2271 @*/ 2272 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2273 { 2274 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2275 2276 PetscFunctionBegin; 2277 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2278 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2279 if (f) { 2280 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2281 } 2282 PetscFunctionReturn(0); 2283 } 2284 2285 EXTERN_C_BEGIN 2286 #undef __FUNCT__ 2287 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 2288 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2289 { 2290 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2291 2292 PetscFunctionBegin; 2293 if (node_count) *node_count = a->inode.node_count; 2294 if (sizes) *sizes = a->inode.size; 2295 if (limit) *limit = a->inode.limit; 2296 PetscFunctionReturn(0); 2297 } 2298 EXTERN_C_END 2299