1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_Inode" 397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 sz = n; /* No of non zeros in this row */ 427 /* Switch on the size of Node */ 428 switch (nsz){ /* Each loop in 'case' is unrolled */ 429 case 1 : 430 sum1 = 0; 431 432 for(n = 0; n< sz-1; n+=2) { 433 i1 = idx[0]; /* The instructions are ordered to */ 434 i2 = idx[1]; /* make the compiler's job easy */ 435 idx += 2; 436 tmp0 = x[i1]; 437 tmp1 = x[i2]; 438 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 439 } 440 441 if (n == sz-1){ /* Take care of the last nonzero */ 442 tmp0 = x[*idx++]; 443 sum1 += *v1++ * tmp0; 444 } 445 y[row++]=sum1; 446 break; 447 case 2: 448 sum1 = 0; 449 sum2 = 0; 450 v2 = v1 + n; 451 452 for (n = 0; n< sz-1; n+=2) { 453 i1 = idx[0]; 454 i2 = idx[1]; 455 idx += 2; 456 tmp0 = x[i1]; 457 tmp1 = x[i2]; 458 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 459 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 460 } 461 if (n == sz-1){ 462 tmp0 = x[*idx++]; 463 sum1 += *v1++ * tmp0; 464 sum2 += *v2++ * tmp0; 465 } 466 y[row++]=sum1; 467 y[row++]=sum2; 468 v1 =v2; /* Since the next block to be processed starts there*/ 469 idx +=sz; 470 break; 471 case 3: 472 sum1 = 0; 473 sum2 = 0; 474 sum3 = 0; 475 v2 = v1 + n; 476 v3 = v2 + n; 477 478 for (n = 0; n< sz-1; n+=2) { 479 i1 = idx[0]; 480 i2 = idx[1]; 481 idx += 2; 482 tmp0 = x[i1]; 483 tmp1 = x[i2]; 484 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 485 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 486 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 487 } 488 if (n == sz-1){ 489 tmp0 = x[*idx++]; 490 sum1 += *v1++ * tmp0; 491 sum2 += *v2++ * tmp0; 492 sum3 += *v3++ * tmp0; 493 } 494 y[row++]=sum1; 495 y[row++]=sum2; 496 y[row++]=sum3; 497 v1 =v3; /* Since the next block to be processed starts there*/ 498 idx +=2*sz; 499 break; 500 case 4: 501 sum1 = 0; 502 sum2 = 0; 503 sum3 = 0; 504 sum4 = 0; 505 v2 = v1 + n; 506 v3 = v2 + n; 507 v4 = v3 + n; 508 509 for (n = 0; n< sz-1; n+=2) { 510 i1 = idx[0]; 511 i2 = idx[1]; 512 idx += 2; 513 tmp0 = x[i1]; 514 tmp1 = x[i2]; 515 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 516 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 517 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 518 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 519 } 520 if (n == sz-1){ 521 tmp0 = x[*idx++]; 522 sum1 += *v1++ * tmp0; 523 sum2 += *v2++ * tmp0; 524 sum3 += *v3++ * tmp0; 525 sum4 += *v4++ * tmp0; 526 } 527 y[row++]=sum1; 528 y[row++]=sum2; 529 y[row++]=sum3; 530 y[row++]=sum4; 531 v1 =v4; /* Since the next block to be processed starts there*/ 532 idx +=3*sz; 533 break; 534 case 5: 535 sum1 = 0; 536 sum2 = 0; 537 sum3 = 0; 538 sum4 = 0; 539 sum5 = 0; 540 v2 = v1 + n; 541 v3 = v2 + n; 542 v4 = v3 + n; 543 v5 = v4 + n; 544 545 for (n = 0; n<sz-1; n+=2) { 546 i1 = idx[0]; 547 i2 = idx[1]; 548 idx += 2; 549 tmp0 = x[i1]; 550 tmp1 = x[i2]; 551 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 552 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 553 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 554 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 555 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 556 } 557 if (n == sz-1){ 558 tmp0 = x[*idx++]; 559 sum1 += *v1++ * tmp0; 560 sum2 += *v2++ * tmp0; 561 sum3 += *v3++ * tmp0; 562 sum4 += *v4++ * tmp0; 563 sum5 += *v5++ * tmp0; 564 } 565 y[row++]=sum1; 566 y[row++]=sum2; 567 y[row++]=sum3; 568 y[row++]=sum4; 569 y[row++]=sum5; 570 v1 =v5; /* Since the next block to be processed starts there */ 571 idx +=4*sz; 572 break; 573 default : 574 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 575 } 576 } 577 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 578 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 /* ----------------------------------------------------------- */ 583 /* Almost same code as the MatMult_Inode() */ 584 #undef __FUNCT__ 585 #define __FUNCT__ "MatMultAdd_Inode" 586 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 587 { 588 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 589 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 590 MatScalar *v1,*v2,*v3,*v4,*v5; 591 PetscScalar *x,*y,*z,*zt; 592 PetscErrorCode ierr; 593 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 594 595 PetscFunctionBegin; 596 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 597 node_max = a->inode.node_count; 598 ns = a->inode.size; /* Node Size array */ 599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 601 if (zz != yy) { 602 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 603 } else { 604 z = y; 605 } 606 zt = z; 607 608 idx = a->j; 609 v1 = a->a; 610 ii = a->i; 611 612 for (i = 0,row = 0; i< node_max; ++i){ 613 nsz = ns[i]; 614 n = ii[1] - ii[0]; 615 ii += nsz; 616 sz = n; /* No of non zeros in this row */ 617 /* Switch on the size of Node */ 618 switch (nsz){ /* Each loop in 'case' is unrolled */ 619 case 1 : 620 sum1 = *zt++; 621 622 for(n = 0; n< sz-1; n+=2) { 623 i1 = idx[0]; /* The instructions are ordered to */ 624 i2 = idx[1]; /* make the compiler's job easy */ 625 idx += 2; 626 tmp0 = x[i1]; 627 tmp1 = x[i2]; 628 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 629 } 630 631 if(n == sz-1){ /* Take care of the last nonzero */ 632 tmp0 = x[*idx++]; 633 sum1 += *v1++ * tmp0; 634 } 635 y[row++]=sum1; 636 break; 637 case 2: 638 sum1 = *zt++; 639 sum2 = *zt++; 640 v2 = v1 + n; 641 642 for(n = 0; n< sz-1; n+=2) { 643 i1 = idx[0]; 644 i2 = idx[1]; 645 idx += 2; 646 tmp0 = x[i1]; 647 tmp1 = x[i2]; 648 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 649 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 650 } 651 if(n == sz-1){ 652 tmp0 = x[*idx++]; 653 sum1 += *v1++ * tmp0; 654 sum2 += *v2++ * tmp0; 655 } 656 y[row++]=sum1; 657 y[row++]=sum2; 658 v1 =v2; /* Since the next block to be processed starts there*/ 659 idx +=sz; 660 break; 661 case 3: 662 sum1 = *zt++; 663 sum2 = *zt++; 664 sum3 = *zt++; 665 v2 = v1 + n; 666 v3 = v2 + n; 667 668 for (n = 0; n< sz-1; n+=2) { 669 i1 = idx[0]; 670 i2 = idx[1]; 671 idx += 2; 672 tmp0 = x[i1]; 673 tmp1 = x[i2]; 674 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 675 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 676 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 677 } 678 if (n == sz-1){ 679 tmp0 = x[*idx++]; 680 sum1 += *v1++ * tmp0; 681 sum2 += *v2++ * tmp0; 682 sum3 += *v3++ * tmp0; 683 } 684 y[row++]=sum1; 685 y[row++]=sum2; 686 y[row++]=sum3; 687 v1 =v3; /* Since the next block to be processed starts there*/ 688 idx +=2*sz; 689 break; 690 case 4: 691 sum1 = *zt++; 692 sum2 = *zt++; 693 sum3 = *zt++; 694 sum4 = *zt++; 695 v2 = v1 + n; 696 v3 = v2 + n; 697 v4 = v3 + n; 698 699 for (n = 0; n< sz-1; n+=2) { 700 i1 = idx[0]; 701 i2 = idx[1]; 702 idx += 2; 703 tmp0 = x[i1]; 704 tmp1 = x[i2]; 705 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 706 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 707 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 708 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 709 } 710 if (n == sz-1){ 711 tmp0 = x[*idx++]; 712 sum1 += *v1++ * tmp0; 713 sum2 += *v2++ * tmp0; 714 sum3 += *v3++ * tmp0; 715 sum4 += *v4++ * tmp0; 716 } 717 y[row++]=sum1; 718 y[row++]=sum2; 719 y[row++]=sum3; 720 y[row++]=sum4; 721 v1 =v4; /* Since the next block to be processed starts there*/ 722 idx +=3*sz; 723 break; 724 case 5: 725 sum1 = *zt++; 726 sum2 = *zt++; 727 sum3 = *zt++; 728 sum4 = *zt++; 729 sum5 = *zt++; 730 v2 = v1 + n; 731 v3 = v2 + n; 732 v4 = v3 + n; 733 v5 = v4 + n; 734 735 for (n = 0; n<sz-1; n+=2) { 736 i1 = idx[0]; 737 i2 = idx[1]; 738 idx += 2; 739 tmp0 = x[i1]; 740 tmp1 = x[i2]; 741 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 742 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 743 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 744 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 745 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 746 } 747 if(n == sz-1){ 748 tmp0 = x[*idx++]; 749 sum1 += *v1++ * tmp0; 750 sum2 += *v2++ * tmp0; 751 sum3 += *v3++ * tmp0; 752 sum4 += *v4++ * tmp0; 753 sum5 += *v5++ * tmp0; 754 } 755 y[row++]=sum1; 756 y[row++]=sum2; 757 y[row++]=sum3; 758 y[row++]=sum4; 759 y[row++]=sum5; 760 v1 =v5; /* Since the next block to be processed starts there */ 761 idx +=4*sz; 762 break; 763 default : 764 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 765 } 766 } 767 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 768 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 769 if (zz != yy) { 770 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 771 } 772 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 773 PetscFunctionReturn(0); 774 } 775 776 /* ----------------------------------------------------------- */ 777 #undef __FUNCT__ 778 #define __FUNCT__ "MatSolve_Inode" 779 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 780 { 781 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 782 IS iscol = a->col,isrow = a->row; 783 PetscErrorCode ierr; 784 const PetscInt *r,*c,*rout,*cout; 785 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 786 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 787 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 788 PetscScalar sum1,sum2,sum3,sum4,sum5; 789 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 790 const PetscScalar *b; 791 792 PetscFunctionBegin; 793 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 794 node_max = a->inode.node_count; 795 ns = a->inode.size; /* Node Size array */ 796 797 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 799 tmp = a->solve_work; 800 801 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 802 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 803 804 /* forward solve the lower triangular */ 805 tmps = tmp ; 806 aa = a_a ; 807 aj = a_j ; 808 ad = a->diag; 809 810 for (i = 0,row = 0; i< node_max; ++i){ 811 nsz = ns[i]; 812 aii = ai[row]; 813 v1 = aa + aii; 814 vi = aj + aii; 815 nz = ad[row]- aii; 816 817 switch (nsz){ /* Each loop in 'case' is unrolled */ 818 case 1 : 819 sum1 = b[*r++]; 820 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 821 for(j=0; j<nz-1; j+=2){ 822 i0 = vi[0]; 823 i1 = vi[1]; 824 vi +=2; 825 tmp0 = tmps[i0]; 826 tmp1 = tmps[i1]; 827 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 828 } 829 if(j == nz-1){ 830 tmp0 = tmps[*vi++]; 831 sum1 -= *v1++ *tmp0; 832 } 833 tmp[row ++]=sum1; 834 break; 835 case 2: 836 sum1 = b[*r++]; 837 sum2 = b[*r++]; 838 v2 = aa + ai[row+1]; 839 840 for(j=0; j<nz-1; j+=2){ 841 i0 = vi[0]; 842 i1 = vi[1]; 843 vi +=2; 844 tmp0 = tmps[i0]; 845 tmp1 = tmps[i1]; 846 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 847 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 848 } 849 if(j == nz-1){ 850 tmp0 = tmps[*vi++]; 851 sum1 -= *v1++ *tmp0; 852 sum2 -= *v2++ *tmp0; 853 } 854 sum2 -= *v2++ * sum1; 855 tmp[row ++]=sum1; 856 tmp[row ++]=sum2; 857 break; 858 case 3: 859 sum1 = b[*r++]; 860 sum2 = b[*r++]; 861 sum3 = b[*r++]; 862 v2 = aa + ai[row+1]; 863 v3 = aa + ai[row+2]; 864 865 for (j=0; j<nz-1; j+=2){ 866 i0 = vi[0]; 867 i1 = vi[1]; 868 vi +=2; 869 tmp0 = tmps[i0]; 870 tmp1 = tmps[i1]; 871 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 872 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 873 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 874 } 875 if (j == nz-1){ 876 tmp0 = tmps[*vi++]; 877 sum1 -= *v1++ *tmp0; 878 sum2 -= *v2++ *tmp0; 879 sum3 -= *v3++ *tmp0; 880 } 881 sum2 -= *v2++ * sum1; 882 sum3 -= *v3++ * sum1; 883 sum3 -= *v3++ * sum2; 884 tmp[row ++]=sum1; 885 tmp[row ++]=sum2; 886 tmp[row ++]=sum3; 887 break; 888 889 case 4: 890 sum1 = b[*r++]; 891 sum2 = b[*r++]; 892 sum3 = b[*r++]; 893 sum4 = b[*r++]; 894 v2 = aa + ai[row+1]; 895 v3 = aa + ai[row+2]; 896 v4 = aa + ai[row+3]; 897 898 for (j=0; j<nz-1; j+=2){ 899 i0 = vi[0]; 900 i1 = vi[1]; 901 vi +=2; 902 tmp0 = tmps[i0]; 903 tmp1 = tmps[i1]; 904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 906 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 907 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 908 } 909 if (j == nz-1){ 910 tmp0 = tmps[*vi++]; 911 sum1 -= *v1++ *tmp0; 912 sum2 -= *v2++ *tmp0; 913 sum3 -= *v3++ *tmp0; 914 sum4 -= *v4++ *tmp0; 915 } 916 sum2 -= *v2++ * sum1; 917 sum3 -= *v3++ * sum1; 918 sum4 -= *v4++ * sum1; 919 sum3 -= *v3++ * sum2; 920 sum4 -= *v4++ * sum2; 921 sum4 -= *v4++ * sum3; 922 923 tmp[row ++]=sum1; 924 tmp[row ++]=sum2; 925 tmp[row ++]=sum3; 926 tmp[row ++]=sum4; 927 break; 928 case 5: 929 sum1 = b[*r++]; 930 sum2 = b[*r++]; 931 sum3 = b[*r++]; 932 sum4 = b[*r++]; 933 sum5 = b[*r++]; 934 v2 = aa + ai[row+1]; 935 v3 = aa + ai[row+2]; 936 v4 = aa + ai[row+3]; 937 v5 = aa + ai[row+4]; 938 939 for (j=0; j<nz-1; j+=2){ 940 i0 = vi[0]; 941 i1 = vi[1]; 942 vi +=2; 943 tmp0 = tmps[i0]; 944 tmp1 = tmps[i1]; 945 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 946 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 947 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 948 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 949 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 950 } 951 if (j == nz-1){ 952 tmp0 = tmps[*vi++]; 953 sum1 -= *v1++ *tmp0; 954 sum2 -= *v2++ *tmp0; 955 sum3 -= *v3++ *tmp0; 956 sum4 -= *v4++ *tmp0; 957 sum5 -= *v5++ *tmp0; 958 } 959 960 sum2 -= *v2++ * sum1; 961 sum3 -= *v3++ * sum1; 962 sum4 -= *v4++ * sum1; 963 sum5 -= *v5++ * sum1; 964 sum3 -= *v3++ * sum2; 965 sum4 -= *v4++ * sum2; 966 sum5 -= *v5++ * sum2; 967 sum4 -= *v4++ * sum3; 968 sum5 -= *v5++ * sum3; 969 sum5 -= *v5++ * sum4; 970 971 tmp[row ++]=sum1; 972 tmp[row ++]=sum2; 973 tmp[row ++]=sum3; 974 tmp[row ++]=sum4; 975 tmp[row ++]=sum5; 976 break; 977 default: 978 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 979 } 980 } 981 /* backward solve the upper triangular */ 982 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 983 nsz = ns[i]; 984 aii = ai[row+1] -1; 985 v1 = aa + aii; 986 vi = aj + aii; 987 nz = aii- ad[row]; 988 switch (nsz){ /* Each loop in 'case' is unrolled */ 989 case 1 : 990 sum1 = tmp[row]; 991 992 for(j=nz ; j>1; j-=2){ 993 vi -=2; 994 i0 = vi[2]; 995 i1 = vi[1]; 996 tmp0 = tmps[i0]; 997 tmp1 = tmps[i1]; 998 v1 -= 2; 999 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1000 } 1001 if (j==1){ 1002 tmp0 = tmps[*vi--]; 1003 sum1 -= *v1-- * tmp0; 1004 } 1005 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1006 break; 1007 case 2 : 1008 sum1 = tmp[row]; 1009 sum2 = tmp[row -1]; 1010 v2 = aa + ai[row]-1; 1011 for (j=nz ; j>1; j-=2){ 1012 vi -=2; 1013 i0 = vi[2]; 1014 i1 = vi[1]; 1015 tmp0 = tmps[i0]; 1016 tmp1 = tmps[i1]; 1017 v1 -= 2; 1018 v2 -= 2; 1019 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1020 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1021 } 1022 if (j==1){ 1023 tmp0 = tmps[*vi--]; 1024 sum1 -= *v1-- * tmp0; 1025 sum2 -= *v2-- * tmp0; 1026 } 1027 1028 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1029 sum2 -= *v2-- * tmp0; 1030 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1031 break; 1032 case 3 : 1033 sum1 = tmp[row]; 1034 sum2 = tmp[row -1]; 1035 sum3 = tmp[row -2]; 1036 v2 = aa + ai[row]-1; 1037 v3 = aa + ai[row -1]-1; 1038 for (j=nz ; j>1; j-=2){ 1039 vi -=2; 1040 i0 = vi[2]; 1041 i1 = vi[1]; 1042 tmp0 = tmps[i0]; 1043 tmp1 = tmps[i1]; 1044 v1 -= 2; 1045 v2 -= 2; 1046 v3 -= 2; 1047 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1048 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1049 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1050 } 1051 if (j==1){ 1052 tmp0 = tmps[*vi--]; 1053 sum1 -= *v1-- * tmp0; 1054 sum2 -= *v2-- * tmp0; 1055 sum3 -= *v3-- * tmp0; 1056 } 1057 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1058 sum2 -= *v2-- * tmp0; 1059 sum3 -= *v3-- * tmp0; 1060 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1061 sum3 -= *v3-- * tmp0; 1062 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1063 1064 break; 1065 case 4 : 1066 sum1 = tmp[row]; 1067 sum2 = tmp[row -1]; 1068 sum3 = tmp[row -2]; 1069 sum4 = tmp[row -3]; 1070 v2 = aa + ai[row]-1; 1071 v3 = aa + ai[row -1]-1; 1072 v4 = aa + ai[row -2]-1; 1073 1074 for (j=nz ; j>1; j-=2){ 1075 vi -=2; 1076 i0 = vi[2]; 1077 i1 = vi[1]; 1078 tmp0 = tmps[i0]; 1079 tmp1 = tmps[i1]; 1080 v1 -= 2; 1081 v2 -= 2; 1082 v3 -= 2; 1083 v4 -= 2; 1084 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1085 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1086 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1087 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1088 } 1089 if (j==1){ 1090 tmp0 = tmps[*vi--]; 1091 sum1 -= *v1-- * tmp0; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 } 1096 1097 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1098 sum2 -= *v2-- * tmp0; 1099 sum3 -= *v3-- * tmp0; 1100 sum4 -= *v4-- * tmp0; 1101 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1105 sum4 -= *v4-- * tmp0; 1106 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1107 break; 1108 case 5 : 1109 sum1 = tmp[row]; 1110 sum2 = tmp[row -1]; 1111 sum3 = tmp[row -2]; 1112 sum4 = tmp[row -3]; 1113 sum5 = tmp[row -4]; 1114 v2 = aa + ai[row]-1; 1115 v3 = aa + ai[row -1]-1; 1116 v4 = aa + ai[row -2]-1; 1117 v5 = aa + ai[row -3]-1; 1118 for (j=nz ; j>1; j-=2){ 1119 vi -= 2; 1120 i0 = vi[2]; 1121 i1 = vi[1]; 1122 tmp0 = tmps[i0]; 1123 tmp1 = tmps[i1]; 1124 v1 -= 2; 1125 v2 -= 2; 1126 v3 -= 2; 1127 v4 -= 2; 1128 v5 -= 2; 1129 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1130 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1131 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1132 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1133 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1134 } 1135 if (j==1){ 1136 tmp0 = tmps[*vi--]; 1137 sum1 -= *v1-- * tmp0; 1138 sum2 -= *v2-- * tmp0; 1139 sum3 -= *v3-- * tmp0; 1140 sum4 -= *v4-- * tmp0; 1141 sum5 -= *v5-- * tmp0; 1142 } 1143 1144 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1145 sum2 -= *v2-- * tmp0; 1146 sum3 -= *v3-- * tmp0; 1147 sum4 -= *v4-- * tmp0; 1148 sum5 -= *v5-- * tmp0; 1149 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1150 sum3 -= *v3-- * tmp0; 1151 sum4 -= *v4-- * tmp0; 1152 sum5 -= *v5-- * tmp0; 1153 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1154 sum4 -= *v4-- * tmp0; 1155 sum5 -= *v5-- * tmp0; 1156 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1157 sum5 -= *v5-- * tmp0; 1158 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1159 break; 1160 default: 1161 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1162 } 1163 } 1164 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1165 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1166 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1168 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1169 PetscFunctionReturn(0); 1170 } 1171 1172 #undef __FUNCT__ 1173 #define __FUNCT__ "MatLUFactorNumeric_Inode" 1174 PetscErrorCode MatLUFactorNumeric_Inode(Mat B,Mat A,const MatFactorInfo *info) 1175 { 1176 Mat C = B; 1177 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1178 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1179 PetscErrorCode ierr; 1180 const PetscInt *r,*ic,*c,*ics; 1181 PetscInt n = A->rmap->n,*bi = b->i; 1182 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1183 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1184 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1185 PetscScalar mul1,mul2,mul3,tmp; 1186 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1187 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1188 PetscReal rs=0.0; 1189 LUShift_Ctx sctx; 1190 PetscInt newshift; 1191 1192 PetscFunctionBegin; 1193 sctx.shift_top = 0; 1194 sctx.nshift_max = 0; 1195 sctx.shift_lo = 0; 1196 sctx.shift_hi = 0; 1197 sctx.shift_fraction = 0; 1198 1199 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1200 if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1201 sctx.shift_top = 0; 1202 for (i=0; i<n; i++) { 1203 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1204 rs = 0.0; 1205 ajtmp = aj + ai[i]; 1206 rtmp1 = aa + ai[i]; 1207 nz = ai[i+1] - ai[i]; 1208 for (j=0; j<nz; j++){ 1209 if (*ajtmp != i){ 1210 rs += PetscAbsScalar(*rtmp1++); 1211 } else { 1212 rs -= PetscRealPart(*rtmp1++); 1213 } 1214 ajtmp++; 1215 } 1216 if (rs>sctx.shift_top) sctx.shift_top = rs; 1217 } 1218 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1219 sctx.shift_top *= 1.1; 1220 sctx.nshift_max = 5; 1221 sctx.shift_lo = 0.; 1222 sctx.shift_hi = 1.; 1223 } 1224 sctx.shift_amount = 0; 1225 sctx.nshift = 0; 1226 1227 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1228 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1229 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1230 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1231 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1232 ics = ic ; 1233 rtmp22 = rtmp11 + n; 1234 rtmp33 = rtmp22 + n; 1235 1236 node_max = a->inode.node_count; 1237 ns = a->inode.size; 1238 if (!ns){ 1239 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1240 } 1241 1242 /* If max inode size > 3, split it into two inodes.*/ 1243 /* also map the inode sizes according to the ordering */ 1244 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1245 for (i=0,j=0; i<node_max; ++i,++j){ 1246 if (ns[i]>3) { 1247 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1248 ++j; 1249 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1250 } else { 1251 tmp_vec1[j] = ns[i]; 1252 } 1253 } 1254 /* Use the correct node_max */ 1255 node_max = j; 1256 1257 /* Now reorder the inode info based on mat re-ordering info */ 1258 /* First create a row -> inode_size_array_index map */ 1259 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1260 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1261 for (i=0,row=0; i<node_max; i++) { 1262 nodesz = tmp_vec1[i]; 1263 for (j=0; j<nodesz; j++,row++) { 1264 nsmap[row] = i; 1265 } 1266 } 1267 /* Using nsmap, create a reordered ns structure */ 1268 for (i=0,j=0; i< node_max; i++) { 1269 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1270 tmp_vec2[i] = nodesz; 1271 j += nodesz; 1272 } 1273 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1274 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1275 /* Now use the correct ns */ 1276 ns = tmp_vec2; 1277 1278 do { 1279 sctx.lushift = PETSC_FALSE; 1280 /* Now loop over each block-row, and do the factorization */ 1281 for (i=0,row=0; i<node_max; i++) { 1282 nodesz = ns[i]; 1283 nz = bi[row+1] - bi[row]; 1284 bjtmp = bj + bi[row]; 1285 1286 switch (nodesz){ 1287 case 1: 1288 for (j=0; j<nz; j++){ 1289 idx = bjtmp[j]; 1290 rtmp11[idx] = 0.0; 1291 } 1292 1293 /* load in initial (unfactored row) */ 1294 idx = r[row]; 1295 nz_tmp = ai[idx+1] - ai[idx]; 1296 ajtmp = aj + ai[idx]; 1297 v1 = aa + ai[idx]; 1298 1299 for (j=0; j<nz_tmp; j++) { 1300 idx = ics[ajtmp[j]]; 1301 rtmp11[idx] = v1[j]; 1302 } 1303 rtmp11[ics[r[row]]] += sctx.shift_amount; 1304 1305 prow = *bjtmp++ ; 1306 while (prow < row) { 1307 pc1 = rtmp11 + prow; 1308 if (*pc1 != 0.0){ 1309 pv = ba + bd[prow]; 1310 pj = nbj + bd[prow]; 1311 mul1 = *pc1 * *pv++; 1312 *pc1 = mul1; 1313 nz_tmp = bi[prow+1] - bd[prow] - 1; 1314 ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr); 1315 for (j=0; j<nz_tmp; j++) { 1316 tmp = pv[j]; 1317 idx = pj[j]; 1318 rtmp11[idx] -= mul1 * tmp; 1319 } 1320 } 1321 prow = *bjtmp++ ; 1322 } 1323 pj = bj + bi[row]; 1324 pc1 = ba + bi[row]; 1325 1326 sctx.pv = rtmp11[row]; 1327 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1328 rs = 0.0; 1329 for (j=0; j<nz; j++) { 1330 idx = pj[j]; 1331 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1332 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1333 } 1334 sctx.rs = rs; 1335 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1336 if (newshift == 1) goto endofwhile; 1337 break; 1338 1339 case 2: 1340 for (j=0; j<nz; j++) { 1341 idx = bjtmp[j]; 1342 rtmp11[idx] = 0.0; 1343 rtmp22[idx] = 0.0; 1344 } 1345 1346 /* load in initial (unfactored row) */ 1347 idx = r[row]; 1348 nz_tmp = ai[idx+1] - ai[idx]; 1349 ajtmp = aj + ai[idx]; 1350 v1 = aa + ai[idx]; 1351 v2 = aa + ai[idx+1]; 1352 for (j=0; j<nz_tmp; j++) { 1353 idx = ics[ajtmp[j]]; 1354 rtmp11[idx] = v1[j]; 1355 rtmp22[idx] = v2[j]; 1356 } 1357 rtmp11[ics[r[row]]] += sctx.shift_amount; 1358 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1359 1360 prow = *bjtmp++ ; 1361 while (prow < row) { 1362 pc1 = rtmp11 + prow; 1363 pc2 = rtmp22 + prow; 1364 if (*pc1 != 0.0 || *pc2 != 0.0){ 1365 pv = ba + bd[prow]; 1366 pj = nbj + bd[prow]; 1367 mul1 = *pc1 * *pv; 1368 mul2 = *pc2 * *pv; 1369 ++pv; 1370 *pc1 = mul1; 1371 *pc2 = mul2; 1372 1373 nz_tmp = bi[prow+1] - bd[prow] - 1; 1374 for (j=0; j<nz_tmp; j++) { 1375 tmp = pv[j]; 1376 idx = pj[j]; 1377 rtmp11[idx] -= mul1 * tmp; 1378 rtmp22[idx] -= mul2 * tmp; 1379 } 1380 ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr); 1381 } 1382 prow = *bjtmp++ ; 1383 } 1384 1385 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1386 pc1 = rtmp11 + prow; 1387 pc2 = rtmp22 + prow; 1388 1389 sctx.pv = *pc1; 1390 pj = bj + bi[prow]; 1391 rs = 0.0; 1392 for (j=0; j<nz; j++){ 1393 idx = pj[j]; 1394 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1395 } 1396 sctx.rs = rs; 1397 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1398 if (newshift == 1) goto endofwhile; 1399 1400 if (*pc2 != 0.0){ 1401 pj = nbj + bd[prow]; 1402 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1403 *pc2 = mul2; 1404 nz_tmp = bi[prow+1] - bd[prow] - 1; 1405 for (j=0; j<nz_tmp; j++) { 1406 idx = pj[j] ; 1407 tmp = rtmp11[idx]; 1408 rtmp22[idx] -= mul2 * tmp; 1409 } 1410 ierr = PetscLogFlops(2.0*nz_tmp);CHKERRQ(ierr); 1411 } 1412 1413 pj = bj + bi[row]; 1414 pc1 = ba + bi[row]; 1415 pc2 = ba + bi[row+1]; 1416 1417 sctx.pv = rtmp22[row+1]; 1418 rs = 0.0; 1419 rtmp11[row] = 1.0/rtmp11[row]; 1420 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1421 /* copy row entries from dense representation to sparse */ 1422 for (j=0; j<nz; j++) { 1423 idx = pj[j]; 1424 pc1[j] = rtmp11[idx]; 1425 pc2[j] = rtmp22[idx]; 1426 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1427 } 1428 sctx.rs = rs; 1429 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1430 if (newshift == 1) goto endofwhile; 1431 break; 1432 1433 case 3: 1434 for (j=0; j<nz; j++) { 1435 idx = bjtmp[j]; 1436 rtmp11[idx] = 0.0; 1437 rtmp22[idx] = 0.0; 1438 rtmp33[idx] = 0.0; 1439 } 1440 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1441 idx = r[row]; 1442 nz_tmp = ai[idx+1] - ai[idx]; 1443 ajtmp = aj + ai[idx]; 1444 v1 = aa + ai[idx]; 1445 v2 = aa + ai[idx+1]; 1446 v3 = aa + ai[idx+2]; 1447 for (j=0; j<nz_tmp; j++) { 1448 idx = ics[ajtmp[j]]; 1449 rtmp11[idx] = v1[j]; 1450 rtmp22[idx] = v2[j]; 1451 rtmp33[idx] = v3[j]; 1452 } 1453 rtmp11[ics[r[row]]] += sctx.shift_amount; 1454 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1455 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1456 1457 /* loop over all pivot row blocks above this row block */ 1458 prow = *bjtmp++ ; 1459 while (prow < row) { 1460 pc1 = rtmp11 + prow; 1461 pc2 = rtmp22 + prow; 1462 pc3 = rtmp33 + prow; 1463 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1464 pv = ba + bd[prow]; 1465 pj = nbj + bd[prow]; 1466 mul1 = *pc1 * *pv; 1467 mul2 = *pc2 * *pv; 1468 mul3 = *pc3 * *pv; 1469 ++pv; 1470 *pc1 = mul1; 1471 *pc2 = mul2; 1472 *pc3 = mul3; 1473 1474 nz_tmp = bi[prow+1] - bd[prow] - 1; 1475 /* update this row based on pivot row */ 1476 for (j=0; j<nz_tmp; j++) { 1477 tmp = pv[j]; 1478 idx = pj[j]; 1479 rtmp11[idx] -= mul1 * tmp; 1480 rtmp22[idx] -= mul2 * tmp; 1481 rtmp33[idx] -= mul3 * tmp; 1482 } 1483 ierr = PetscLogFlops(6.0*nz_tmp);CHKERRQ(ierr); 1484 } 1485 prow = *bjtmp++ ; 1486 } 1487 1488 /* Now take care of diagonal 3x3 block in this set of rows */ 1489 /* note: prow = row here */ 1490 pc1 = rtmp11 + prow; 1491 pc2 = rtmp22 + prow; 1492 pc3 = rtmp33 + prow; 1493 1494 sctx.pv = *pc1; 1495 pj = bj + bi[prow]; 1496 rs = 0.0; 1497 for (j=0; j<nz; j++){ 1498 idx = pj[j]; 1499 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 1500 } 1501 sctx.rs = rs; 1502 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1503 if (newshift == 1) goto endofwhile; 1504 1505 if (*pc2 != 0.0 || *pc3 != 0.0){ 1506 mul2 = (*pc2)/(*pc1); 1507 mul3 = (*pc3)/(*pc1); 1508 *pc2 = mul2; 1509 *pc3 = mul3; 1510 nz_tmp = bi[prow+1] - bd[prow] - 1; 1511 pj = nbj + bd[prow]; 1512 for (j=0; j<nz_tmp; j++) { 1513 idx = pj[j] ; 1514 tmp = rtmp11[idx]; 1515 rtmp22[idx] -= mul2 * tmp; 1516 rtmp33[idx] -= mul3 * tmp; 1517 } 1518 ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr); 1519 } 1520 ++prow; 1521 1522 pc2 = rtmp22 + prow; 1523 pc3 = rtmp33 + prow; 1524 sctx.pv = *pc2; 1525 pj = bj + bi[prow]; 1526 rs = 0.0; 1527 for (j=0; j<nz; j++){ 1528 idx = pj[j]; 1529 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 1530 } 1531 sctx.rs = rs; 1532 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1533 if (newshift == 1) goto endofwhile; 1534 1535 if (*pc3 != 0.0){ 1536 mul3 = (*pc3)/(*pc2); 1537 *pc3 = mul3; 1538 pj = nbj + bd[prow]; 1539 nz_tmp = bi[prow+1] - bd[prow] - 1; 1540 for (j=0; j<nz_tmp; j++) { 1541 idx = pj[j] ; 1542 tmp = rtmp22[idx]; 1543 rtmp33[idx] -= mul3 * tmp; 1544 } 1545 ierr = PetscLogFlops(4.0*nz_tmp);CHKERRQ(ierr); 1546 } 1547 1548 pj = bj + bi[row]; 1549 pc1 = ba + bi[row]; 1550 pc2 = ba + bi[row+1]; 1551 pc3 = ba + bi[row+2]; 1552 1553 sctx.pv = rtmp33[row+2]; 1554 rs = 0.0; 1555 rtmp11[row] = 1.0/rtmp11[row]; 1556 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1557 rtmp33[row+2] = 1.0/rtmp33[row+2]; 1558 /* copy row entries from dense representation to sparse */ 1559 for (j=0; j<nz; j++) { 1560 idx = pj[j]; 1561 pc1[j] = rtmp11[idx]; 1562 pc2[j] = rtmp22[idx]; 1563 pc3[j] = rtmp33[idx]; 1564 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1565 } 1566 1567 sctx.rs = rs; 1568 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1569 if (newshift == 1) goto endofwhile; 1570 break; 1571 1572 default: 1573 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1574 } 1575 row += nodesz; /* Update the row */ 1576 } 1577 endofwhile:; 1578 } while (sctx.lushift); 1579 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 1580 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1581 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1582 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1583 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1584 (B)->ops->solve = MatSolve_Inode; 1585 /* do not set solve add, since MatSolve_Inode + Add is faster */ 1586 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1587 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1588 C->assembled = PETSC_TRUE; 1589 C->preallocated = PETSC_TRUE; 1590 if (sctx.nshift) { 1591 if (info->shiftpd) { 1592 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1593 } else if (info->shiftnz) { 1594 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1595 } 1596 } 1597 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1598 PetscFunctionReturn(0); 1599 } 1600 1601 /* 1602 Makes a longer coloring[] array and calls the usual code with that 1603 */ 1604 #undef __FUNCT__ 1605 #define __FUNCT__ "MatColoringPatch_Inode" 1606 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1607 { 1608 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1609 PetscErrorCode ierr; 1610 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1611 PetscInt *colorused,i; 1612 ISColoringValue *newcolor; 1613 1614 PetscFunctionBegin; 1615 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1616 /* loop over inodes, marking a color for each column*/ 1617 row = 0; 1618 for (i=0; i<m; i++){ 1619 for (j=0; j<ns[i]; j++) { 1620 newcolor[row++] = coloring[i] + j*ncolors; 1621 } 1622 } 1623 1624 /* eliminate unneeded colors */ 1625 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1626 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1627 for (i=0; i<n; i++) { 1628 colorused[newcolor[i]] = 1; 1629 } 1630 1631 for (i=1; i<5*ncolors; i++) { 1632 colorused[i] += colorused[i-1]; 1633 } 1634 ncolors = colorused[5*ncolors-1]; 1635 for (i=0; i<n; i++) { 1636 newcolor[i] = colorused[newcolor[i]]-1; 1637 } 1638 ierr = PetscFree(colorused);CHKERRQ(ierr); 1639 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1640 ierr = PetscFree(coloring);CHKERRQ(ierr); 1641 PetscFunctionReturn(0); 1642 } 1643 1644 #include "../src/mat/blockinvert.h" 1645 1646 #undef __FUNCT__ 1647 #define __FUNCT__ "MatRelax_Inode" 1648 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1649 { 1650 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1651 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 1652 MatScalar *ibdiag,*bdiag; 1653 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1654 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 1655 PetscReal zeropivot = 1.0e-15, shift = 0.0; 1656 PetscErrorCode ierr; 1657 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1658 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1659 1660 PetscFunctionBegin; 1661 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1662 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1663 if (its > 1) { 1664 /* switch to non-inode version */ 1665 ierr = MatRelax_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 1666 PetscFunctionReturn(0); 1667 } 1668 1669 if (!a->inode.ibdiagvalid) { 1670 if (!a->inode.ibdiag) { 1671 /* calculate space needed for diagonal blocks */ 1672 for (i=0; i<m; i++) { 1673 cnt += sizes[i]*sizes[i]; 1674 } 1675 a->inode.bdiagsize = cnt; 1676 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 1677 } 1678 1679 /* copy over the diagonal blocks and invert them */ 1680 ibdiag = a->inode.ibdiag; 1681 bdiag = a->inode.bdiag; 1682 cnt = 0; 1683 for (i=0, row = 0; i<m; i++) { 1684 for (j=0; j<sizes[i]; j++) { 1685 for (k=0; k<sizes[i]; k++) { 1686 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1687 } 1688 } 1689 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 1690 1691 switch(sizes[i]) { 1692 case 1: 1693 /* Create matrix data structure */ 1694 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1695 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1696 break; 1697 case 2: 1698 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 1699 break; 1700 case 3: 1701 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 1702 break; 1703 case 4: 1704 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 1705 break; 1706 case 5: 1707 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,shift);CHKERRQ(ierr); 1708 break; 1709 default: 1710 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1711 } 1712 cnt += sizes[i]*sizes[i]; 1713 row += sizes[i]; 1714 } 1715 a->inode.ibdiagvalid = PETSC_TRUE; 1716 } 1717 ibdiag = a->inode.ibdiag; 1718 bdiag = a->inode.bdiag; 1719 1720 1721 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1722 if (flag & SOR_ZERO_INITIAL_GUESS) { 1723 PetscScalar *b; 1724 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1725 if (xx != bb) { 1726 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1727 } else { 1728 b = x; 1729 } 1730 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1731 1732 for (i=0, row=0; i<m; i++) { 1733 sz = diag[row] - ii[row]; 1734 v1 = a->a + ii[row]; 1735 idx = a->j + ii[row]; 1736 1737 /* see comments for MatMult_Inode() for how this is coded */ 1738 switch (sizes[i]){ 1739 case 1: 1740 1741 sum1 = b[row]; 1742 for(n = 0; n<sz-1; n+=2) { 1743 i1 = idx[0]; 1744 i2 = idx[1]; 1745 idx += 2; 1746 tmp0 = x[i1]; 1747 tmp1 = x[i2]; 1748 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1749 } 1750 1751 if (n == sz-1){ 1752 tmp0 = x[*idx]; 1753 sum1 -= *v1 * tmp0; 1754 } 1755 x[row++] = sum1*(*ibdiag++); 1756 break; 1757 case 2: 1758 v2 = a->a + ii[row+1]; 1759 sum1 = b[row]; 1760 sum2 = b[row+1]; 1761 for(n = 0; n<sz-1; n+=2) { 1762 i1 = idx[0]; 1763 i2 = idx[1]; 1764 idx += 2; 1765 tmp0 = x[i1]; 1766 tmp1 = x[i2]; 1767 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1768 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1769 } 1770 1771 if (n == sz-1){ 1772 tmp0 = x[*idx]; 1773 sum1 -= v1[0] * tmp0; 1774 sum2 -= v2[0] * tmp0; 1775 } 1776 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1777 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1778 ibdiag += 4; 1779 break; 1780 case 3: 1781 v2 = a->a + ii[row+1]; 1782 v3 = a->a + ii[row+2]; 1783 sum1 = b[row]; 1784 sum2 = b[row+1]; 1785 sum3 = b[row+2]; 1786 for(n = 0; n<sz-1; n+=2) { 1787 i1 = idx[0]; 1788 i2 = idx[1]; 1789 idx += 2; 1790 tmp0 = x[i1]; 1791 tmp1 = x[i2]; 1792 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1793 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1794 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1795 } 1796 1797 if (n == sz-1){ 1798 tmp0 = x[*idx]; 1799 sum1 -= v1[0] * tmp0; 1800 sum2 -= v2[0] * tmp0; 1801 sum3 -= v3[0] * tmp0; 1802 } 1803 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1804 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1805 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1806 ibdiag += 9; 1807 break; 1808 case 4: 1809 v2 = a->a + ii[row+1]; 1810 v3 = a->a + ii[row+2]; 1811 v4 = a->a + ii[row+3]; 1812 sum1 = b[row]; 1813 sum2 = b[row+1]; 1814 sum3 = b[row+2]; 1815 sum4 = b[row+3]; 1816 for(n = 0; n<sz-1; n+=2) { 1817 i1 = idx[0]; 1818 i2 = idx[1]; 1819 idx += 2; 1820 tmp0 = x[i1]; 1821 tmp1 = x[i2]; 1822 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1823 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1824 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1825 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1826 } 1827 1828 if (n == sz-1){ 1829 tmp0 = x[*idx]; 1830 sum1 -= v1[0] * tmp0; 1831 sum2 -= v2[0] * tmp0; 1832 sum3 -= v3[0] * tmp0; 1833 sum4 -= v4[0] * tmp0; 1834 } 1835 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1836 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1837 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1838 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1839 ibdiag += 16; 1840 break; 1841 case 5: 1842 v2 = a->a + ii[row+1]; 1843 v3 = a->a + ii[row+2]; 1844 v4 = a->a + ii[row+3]; 1845 v5 = a->a + ii[row+4]; 1846 sum1 = b[row]; 1847 sum2 = b[row+1]; 1848 sum3 = b[row+2]; 1849 sum4 = b[row+3]; 1850 sum5 = b[row+4]; 1851 for(n = 0; n<sz-1; n+=2) { 1852 i1 = idx[0]; 1853 i2 = idx[1]; 1854 idx += 2; 1855 tmp0 = x[i1]; 1856 tmp1 = x[i2]; 1857 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1858 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1859 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1860 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1861 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1862 } 1863 1864 if (n == sz-1){ 1865 tmp0 = x[*idx]; 1866 sum1 -= v1[0] * tmp0; 1867 sum2 -= v2[0] * tmp0; 1868 sum3 -= v3[0] * tmp0; 1869 sum4 -= v4[0] * tmp0; 1870 sum5 -= v5[0] * tmp0; 1871 } 1872 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1873 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1874 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1875 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1876 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1877 ibdiag += 25; 1878 break; 1879 default: 1880 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1881 } 1882 } 1883 1884 xb = x; 1885 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1886 } else xb = b; 1887 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1888 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1889 cnt = 0; 1890 for (i=0, row=0; i<m; i++) { 1891 1892 switch (sizes[i]){ 1893 case 1: 1894 x[row++] *= bdiag[cnt++]; 1895 break; 1896 case 2: 1897 x1 = x[row]; x2 = x[row+1]; 1898 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1899 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1900 x[row++] = tmp1; 1901 x[row++] = tmp2; 1902 cnt += 4; 1903 break; 1904 case 3: 1905 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1906 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1907 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1908 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1909 x[row++] = tmp1; 1910 x[row++] = tmp2; 1911 x[row++] = tmp3; 1912 cnt += 9; 1913 break; 1914 case 4: 1915 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1916 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1917 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1918 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1919 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1920 x[row++] = tmp1; 1921 x[row++] = tmp2; 1922 x[row++] = tmp3; 1923 x[row++] = tmp4; 1924 cnt += 16; 1925 break; 1926 case 5: 1927 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1928 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1929 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1930 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1931 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1932 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1933 x[row++] = tmp1; 1934 x[row++] = tmp2; 1935 x[row++] = tmp3; 1936 x[row++] = tmp4; 1937 x[row++] = tmp5; 1938 cnt += 25; 1939 break; 1940 default: 1941 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1942 } 1943 } 1944 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1945 } 1946 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1947 1948 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1949 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 1950 ibdiag -= sizes[i]*sizes[i]; 1951 sz = ii[row+1] - diag[row] - 1; 1952 v1 = a->a + diag[row] + 1; 1953 idx = a->j + diag[row] + 1; 1954 1955 /* see comments for MatMult_Inode() for how this is coded */ 1956 switch (sizes[i]){ 1957 case 1: 1958 1959 sum1 = xb[row]; 1960 for(n = 0; n<sz-1; n+=2) { 1961 i1 = idx[0]; 1962 i2 = idx[1]; 1963 idx += 2; 1964 tmp0 = x[i1]; 1965 tmp1 = x[i2]; 1966 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1967 } 1968 1969 if (n == sz-1){ 1970 tmp0 = x[*idx]; 1971 sum1 -= *v1*tmp0; 1972 } 1973 x[row--] = sum1*(*ibdiag); 1974 break; 1975 1976 case 2: 1977 1978 sum1 = xb[row]; 1979 sum2 = xb[row-1]; 1980 /* note that sum1 is associated with the second of the two rows */ 1981 v2 = a->a + diag[row-1] + 2; 1982 for(n = 0; n<sz-1; n+=2) { 1983 i1 = idx[0]; 1984 i2 = idx[1]; 1985 idx += 2; 1986 tmp0 = x[i1]; 1987 tmp1 = x[i2]; 1988 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1989 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1990 } 1991 1992 if (n == sz-1){ 1993 tmp0 = x[*idx]; 1994 sum1 -= *v1*tmp0; 1995 sum2 -= *v2*tmp0; 1996 } 1997 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1998 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1999 break; 2000 case 3: 2001 2002 sum1 = xb[row]; 2003 sum2 = xb[row-1]; 2004 sum3 = xb[row-2]; 2005 v2 = a->a + diag[row-1] + 2; 2006 v3 = a->a + diag[row-2] + 3; 2007 for(n = 0; n<sz-1; n+=2) { 2008 i1 = idx[0]; 2009 i2 = idx[1]; 2010 idx += 2; 2011 tmp0 = x[i1]; 2012 tmp1 = x[i2]; 2013 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2014 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2015 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2016 } 2017 2018 if (n == sz-1){ 2019 tmp0 = x[*idx]; 2020 sum1 -= *v1*tmp0; 2021 sum2 -= *v2*tmp0; 2022 sum3 -= *v3*tmp0; 2023 } 2024 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2025 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2026 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2027 break; 2028 case 4: 2029 2030 sum1 = xb[row]; 2031 sum2 = xb[row-1]; 2032 sum3 = xb[row-2]; 2033 sum4 = xb[row-3]; 2034 v2 = a->a + diag[row-1] + 2; 2035 v3 = a->a + diag[row-2] + 3; 2036 v4 = a->a + diag[row-3] + 4; 2037 for(n = 0; n<sz-1; n+=2) { 2038 i1 = idx[0]; 2039 i2 = idx[1]; 2040 idx += 2; 2041 tmp0 = x[i1]; 2042 tmp1 = x[i2]; 2043 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2044 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2045 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2046 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2047 } 2048 2049 if (n == sz-1){ 2050 tmp0 = x[*idx]; 2051 sum1 -= *v1*tmp0; 2052 sum2 -= *v2*tmp0; 2053 sum3 -= *v3*tmp0; 2054 sum4 -= *v4*tmp0; 2055 } 2056 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2057 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2058 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2059 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2060 break; 2061 case 5: 2062 2063 sum1 = xb[row]; 2064 sum2 = xb[row-1]; 2065 sum3 = xb[row-2]; 2066 sum4 = xb[row-3]; 2067 sum5 = xb[row-4]; 2068 v2 = a->a + diag[row-1] + 2; 2069 v3 = a->a + diag[row-2] + 3; 2070 v4 = a->a + diag[row-3] + 4; 2071 v5 = a->a + diag[row-4] + 5; 2072 for(n = 0; n<sz-1; n+=2) { 2073 i1 = idx[0]; 2074 i2 = idx[1]; 2075 idx += 2; 2076 tmp0 = x[i1]; 2077 tmp1 = x[i2]; 2078 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2079 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2080 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2081 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2082 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2083 } 2084 2085 if (n == sz-1){ 2086 tmp0 = x[*idx]; 2087 sum1 -= *v1*tmp0; 2088 sum2 -= *v2*tmp0; 2089 sum3 -= *v3*tmp0; 2090 sum4 -= *v4*tmp0; 2091 sum5 -= *v5*tmp0; 2092 } 2093 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2094 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2095 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2096 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2097 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2098 break; 2099 default: 2100 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2101 } 2102 } 2103 2104 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2105 } 2106 its--; 2107 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2108 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2109 } 2110 if (flag & SOR_EISENSTAT) { 2111 const PetscScalar *b; 2112 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2113 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2114 MatScalar *t = a->inode.ssor_work; 2115 /* 2116 Apply (U + D)^-1 where D is now the block diagonal 2117 */ 2118 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2119 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2120 ibdiag -= sizes[i]*sizes[i]; 2121 sz = ii[row+1] - diag[row] - 1; 2122 v1 = a->a + diag[row] + 1; 2123 idx = a->j + diag[row] + 1; 2124 CHKMEMQ; 2125 /* see comments for MatMult_Inode() for how this is coded */ 2126 switch (sizes[i]){ 2127 case 1: 2128 2129 sum1 = b[row]; 2130 for(n = 0; n<sz-1; n+=2) { 2131 i1 = idx[0]; 2132 i2 = idx[1]; 2133 idx += 2; 2134 tmp0 = x[i1]; 2135 tmp1 = x[i2]; 2136 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2137 } 2138 2139 if (n == sz-1){ 2140 tmp0 = x[*idx]; 2141 sum1 -= *v1*tmp0; 2142 } 2143 x[row] = sum1*(*ibdiag);row--; 2144 break; 2145 2146 case 2: 2147 2148 sum1 = b[row]; 2149 sum2 = b[row-1]; 2150 /* note that sum1 is associated with the second of the two rows */ 2151 v2 = a->a + diag[row-1] + 2; 2152 for(n = 0; n<sz-1; n+=2) { 2153 i1 = idx[0]; 2154 i2 = idx[1]; 2155 idx += 2; 2156 tmp0 = x[i1]; 2157 tmp1 = x[i2]; 2158 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2159 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2160 } 2161 2162 if (n == sz-1){ 2163 tmp0 = x[*idx]; 2164 sum1 -= *v1*tmp0; 2165 sum2 -= *v2*tmp0; 2166 } 2167 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2168 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2169 row -= 2; 2170 break; 2171 case 3: 2172 2173 sum1 = b[row]; 2174 sum2 = b[row-1]; 2175 sum3 = b[row-2]; 2176 v2 = a->a + diag[row-1] + 2; 2177 v3 = a->a + diag[row-2] + 3; 2178 for(n = 0; n<sz-1; n+=2) { 2179 i1 = idx[0]; 2180 i2 = idx[1]; 2181 idx += 2; 2182 tmp0 = x[i1]; 2183 tmp1 = x[i2]; 2184 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2185 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2186 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2187 } 2188 2189 if (n == sz-1){ 2190 tmp0 = x[*idx]; 2191 sum1 -= *v1*tmp0; 2192 sum2 -= *v2*tmp0; 2193 sum3 -= *v3*tmp0; 2194 } 2195 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2196 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2197 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2198 row -= 3; 2199 break; 2200 case 4: 2201 2202 sum1 = b[row]; 2203 sum2 = b[row-1]; 2204 sum3 = b[row-2]; 2205 sum4 = b[row-3]; 2206 v2 = a->a + diag[row-1] + 2; 2207 v3 = a->a + diag[row-2] + 3; 2208 v4 = a->a + diag[row-3] + 4; 2209 for(n = 0; n<sz-1; n+=2) { 2210 i1 = idx[0]; 2211 i2 = idx[1]; 2212 idx += 2; 2213 tmp0 = x[i1]; 2214 tmp1 = x[i2]; 2215 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2216 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2217 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2218 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2219 } 2220 2221 if (n == sz-1){ 2222 tmp0 = x[*idx]; 2223 sum1 -= *v1*tmp0; 2224 sum2 -= *v2*tmp0; 2225 sum3 -= *v3*tmp0; 2226 sum4 -= *v4*tmp0; 2227 } 2228 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2229 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2230 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2231 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2232 row -= 4; 2233 break; 2234 case 5: 2235 2236 sum1 = b[row]; 2237 sum2 = b[row-1]; 2238 sum3 = b[row-2]; 2239 sum4 = b[row-3]; 2240 sum5 = b[row-4]; 2241 v2 = a->a + diag[row-1] + 2; 2242 v3 = a->a + diag[row-2] + 3; 2243 v4 = a->a + diag[row-3] + 4; 2244 v5 = a->a + diag[row-4] + 5; 2245 for(n = 0; n<sz-1; n+=2) { 2246 i1 = idx[0]; 2247 i2 = idx[1]; 2248 idx += 2; 2249 tmp0 = x[i1]; 2250 tmp1 = x[i2]; 2251 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2252 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2253 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2254 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2255 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2256 } 2257 2258 if (n == sz-1){ 2259 tmp0 = x[*idx]; 2260 sum1 -= *v1*tmp0; 2261 sum2 -= *v2*tmp0; 2262 sum3 -= *v3*tmp0; 2263 sum4 -= *v4*tmp0; 2264 sum5 -= *v5*tmp0; 2265 } 2266 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2267 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2268 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2269 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2270 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2271 row -= 5; 2272 break; 2273 default: 2274 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2275 } 2276 CHKMEMQ; 2277 } 2278 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2279 2280 /* 2281 t = b - D x where D is the block diagonal 2282 */ 2283 cnt = 0; 2284 for (i=0, row=0; i<m; i++) { 2285 CHKMEMQ; 2286 switch (sizes[i]){ 2287 case 1: 2288 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 2289 break; 2290 case 2: 2291 x1 = x[row]; x2 = x[row+1]; 2292 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2293 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2294 t[row] = b[row] - tmp1; 2295 t[row+1] = b[row+1] - tmp2; row += 2; 2296 cnt += 4; 2297 break; 2298 case 3: 2299 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2300 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2301 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2302 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2303 t[row] = b[row] - tmp1; 2304 t[row+1] = b[row+1] - tmp2; 2305 t[row+2] = b[row+2] - tmp3; row += 3; 2306 cnt += 9; 2307 break; 2308 case 4: 2309 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2310 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2311 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2312 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2313 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2314 t[row] = b[row] - tmp1; 2315 t[row+1] = b[row+1] - tmp2; 2316 t[row+2] = b[row+2] - tmp3; 2317 t[row+3] = b[row+3] - tmp4; row += 4; 2318 cnt += 16; 2319 break; 2320 case 5: 2321 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2322 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2323 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2324 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2325 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2326 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2327 t[row] = b[row] - tmp1; 2328 t[row+1] = b[row+1] - tmp2; 2329 t[row+2] = b[row+2] - tmp3; 2330 t[row+3] = b[row+3] - tmp4; 2331 t[row+4] = b[row+4] - tmp5;row += 5; 2332 cnt += 25; 2333 break; 2334 default: 2335 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2336 } 2337 CHKMEMQ; 2338 } 2339 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2340 2341 2342 2343 /* 2344 Apply (L + D)^-1 where D is the block diagonal 2345 */ 2346 for (i=0, row=0; i<m; i++) { 2347 sz = diag[row] - ii[row]; 2348 v1 = a->a + ii[row]; 2349 idx = a->j + ii[row]; 2350 CHKMEMQ; 2351 /* see comments for MatMult_Inode() for how this is coded */ 2352 switch (sizes[i]){ 2353 case 1: 2354 2355 sum1 = t[row]; 2356 for(n = 0; n<sz-1; n+=2) { 2357 i1 = idx[0]; 2358 i2 = idx[1]; 2359 idx += 2; 2360 tmp0 = t[i1]; 2361 tmp1 = t[i2]; 2362 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2363 } 2364 2365 if (n == sz-1){ 2366 tmp0 = t[*idx]; 2367 sum1 -= *v1 * tmp0; 2368 } 2369 x[row] += t[row] = sum1*(*ibdiag++); row++; 2370 break; 2371 case 2: 2372 v2 = a->a + ii[row+1]; 2373 sum1 = t[row]; 2374 sum2 = t[row+1]; 2375 for(n = 0; n<sz-1; n+=2) { 2376 i1 = idx[0]; 2377 i2 = idx[1]; 2378 idx += 2; 2379 tmp0 = t[i1]; 2380 tmp1 = t[i2]; 2381 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2382 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2383 } 2384 2385 if (n == sz-1){ 2386 tmp0 = t[*idx]; 2387 sum1 -= v1[0] * tmp0; 2388 sum2 -= v2[0] * tmp0; 2389 } 2390 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2391 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2392 ibdiag += 4; row += 2; 2393 break; 2394 case 3: 2395 v2 = a->a + ii[row+1]; 2396 v3 = a->a + ii[row+2]; 2397 sum1 = t[row]; 2398 sum2 = t[row+1]; 2399 sum3 = t[row+2]; 2400 for(n = 0; n<sz-1; n+=2) { 2401 i1 = idx[0]; 2402 i2 = idx[1]; 2403 idx += 2; 2404 tmp0 = t[i1]; 2405 tmp1 = t[i2]; 2406 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2407 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2408 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2409 } 2410 2411 if (n == sz-1){ 2412 tmp0 = t[*idx]; 2413 sum1 -= v1[0] * tmp0; 2414 sum2 -= v2[0] * tmp0; 2415 sum3 -= v3[0] * tmp0; 2416 } 2417 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2418 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2419 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2420 ibdiag += 9; row += 3; 2421 break; 2422 case 4: 2423 v2 = a->a + ii[row+1]; 2424 v3 = a->a + ii[row+2]; 2425 v4 = a->a + ii[row+3]; 2426 sum1 = t[row]; 2427 sum2 = t[row+1]; 2428 sum3 = t[row+2]; 2429 sum4 = t[row+3]; 2430 for(n = 0; n<sz-1; n+=2) { 2431 i1 = idx[0]; 2432 i2 = idx[1]; 2433 idx += 2; 2434 tmp0 = t[i1]; 2435 tmp1 = t[i2]; 2436 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2437 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2438 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2439 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2440 } 2441 2442 if (n == sz-1){ 2443 tmp0 = t[*idx]; 2444 sum1 -= v1[0] * tmp0; 2445 sum2 -= v2[0] * tmp0; 2446 sum3 -= v3[0] * tmp0; 2447 sum4 -= v4[0] * tmp0; 2448 } 2449 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2450 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2451 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2452 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2453 ibdiag += 16; row += 4; 2454 break; 2455 case 5: 2456 v2 = a->a + ii[row+1]; 2457 v3 = a->a + ii[row+2]; 2458 v4 = a->a + ii[row+3]; 2459 v5 = a->a + ii[row+4]; 2460 sum1 = t[row]; 2461 sum2 = t[row+1]; 2462 sum3 = t[row+2]; 2463 sum4 = t[row+3]; 2464 sum5 = t[row+4]; 2465 for(n = 0; n<sz-1; n+=2) { 2466 i1 = idx[0]; 2467 i2 = idx[1]; 2468 idx += 2; 2469 tmp0 = t[i1]; 2470 tmp1 = t[i2]; 2471 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2472 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2473 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2474 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2475 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2476 } 2477 2478 if (n == sz-1){ 2479 tmp0 = t[*idx]; 2480 sum1 -= v1[0] * tmp0; 2481 sum2 -= v2[0] * tmp0; 2482 sum3 -= v3[0] * tmp0; 2483 sum4 -= v4[0] * tmp0; 2484 sum5 -= v5[0] * tmp0; 2485 } 2486 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2487 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2488 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2489 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2490 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2491 ibdiag += 25; row += 5; 2492 break; 2493 default: 2494 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2495 } 2496 CHKMEMQ; 2497 } 2498 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2499 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2500 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2501 } 2502 PetscFunctionReturn(0); 2503 } 2504 2505 #undef __FUNCT__ 2506 #define __FUNCT__ "MatMultDiagonalBlock_Inode" 2507 PetscErrorCode MatMultDiagonalBlock_Inode(Mat A,Vec bb,Vec xx) 2508 { 2509 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2510 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 2511 const MatScalar *bdiag = a->inode.bdiag; 2512 const PetscScalar *b; 2513 PetscErrorCode ierr; 2514 PetscInt m = a->inode.node_count,cnt = 0,i,row; 2515 const PetscInt *sizes = a->inode.size; 2516 2517 PetscFunctionBegin; 2518 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2519 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2520 cnt = 0; 2521 for (i=0, row=0; i<m; i++) { 2522 switch (sizes[i]){ 2523 case 1: 2524 x[row] = b[row]*bdiag[cnt++];row++; 2525 break; 2526 case 2: 2527 x1 = b[row]; x2 = b[row+1]; 2528 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2529 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2530 x[row++] = tmp1; 2531 x[row++] = tmp2; 2532 cnt += 4; 2533 break; 2534 case 3: 2535 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 2536 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2537 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2538 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2539 x[row++] = tmp1; 2540 x[row++] = tmp2; 2541 x[row++] = tmp3; 2542 cnt += 9; 2543 break; 2544 case 4: 2545 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 2546 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2547 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2548 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2549 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2550 x[row++] = tmp1; 2551 x[row++] = tmp2; 2552 x[row++] = tmp3; 2553 x[row++] = tmp4; 2554 cnt += 16; 2555 break; 2556 case 5: 2557 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 2558 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2559 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2560 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2561 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2562 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2563 x[row++] = tmp1; 2564 x[row++] = tmp2; 2565 x[row++] = tmp3; 2566 x[row++] = tmp4; 2567 x[row++] = tmp5; 2568 cnt += 25; 2569 break; 2570 default: 2571 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2572 } 2573 } 2574 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 2575 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2576 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2577 PetscFunctionReturn(0); 2578 } 2579 2580 extern PetscErrorCode MatMultDiagonalBlock_Inode(Mat,Vec,Vec); 2581 /* 2582 samestructure indicates that the matrix has not changed its nonzero structure so we 2583 do not need to recompute the inodes 2584 */ 2585 #undef __FUNCT__ 2586 #define __FUNCT__ "Mat_CheckInode" 2587 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2588 { 2589 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2590 PetscErrorCode ierr; 2591 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2592 PetscTruth flag; 2593 2594 PetscFunctionBegin; 2595 if (!a->inode.use) PetscFunctionReturn(0); 2596 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2597 2598 2599 m = A->rmap->n; 2600 if (a->inode.size) {ns = a->inode.size;} 2601 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2602 2603 i = 0; 2604 node_count = 0; 2605 idx = a->j; 2606 ii = a->i; 2607 while (i < m){ /* For each row */ 2608 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2609 /* Limits the number of elements in a node to 'a->inode.limit' */ 2610 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2611 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2612 if (nzy != nzx) break; 2613 idy += nzx; /* Same nonzero pattern */ 2614 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2615 if (!flag) break; 2616 } 2617 ns[node_count++] = blk_size; 2618 idx += blk_size*nzx; 2619 i = j; 2620 } 2621 /* If not enough inodes found,, do not use inode version of the routines */ 2622 if (!a->inode.size && m && node_count > .9*m) { 2623 ierr = PetscFree(ns);CHKERRQ(ierr); 2624 a->inode.node_count = 0; 2625 a->inode.size = PETSC_NULL; 2626 a->inode.use = PETSC_FALSE; 2627 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2628 } else { 2629 A->ops->mult = MatMult_Inode; 2630 A->ops->relax = MatRelax_Inode; 2631 A->ops->multadd = MatMultAdd_Inode; 2632 A->ops->getrowij = MatGetRowIJ_Inode; 2633 A->ops->restorerowij = MatRestoreRowIJ_Inode; 2634 A->ops->getcolumnij = MatGetColumnIJ_Inode; 2635 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 2636 A->ops->coloringpatch = MatColoringPatch_Inode; 2637 A->ops->multdiagonalblock = MatMultDiagonalBlock_Inode; 2638 a->inode.node_count = node_count; 2639 a->inode.size = ns; 2640 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2641 } 2642 PetscFunctionReturn(0); 2643 } 2644 2645 /* 2646 This is really ugly. if inodes are used this replaces the 2647 permutations with ones that correspond to rows/cols of the matrix 2648 rather then inode blocks 2649 */ 2650 #undef __FUNCT__ 2651 #define __FUNCT__ "MatInodeAdjustForInodes" 2652 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2653 { 2654 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2655 2656 PetscFunctionBegin; 2657 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2658 if (f) { 2659 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2660 } 2661 PetscFunctionReturn(0); 2662 } 2663 2664 EXTERN_C_BEGIN 2665 #undef __FUNCT__ 2666 #define __FUNCT__ "MatAdjustForInodes_Inode" 2667 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 2668 { 2669 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2670 PetscErrorCode ierr; 2671 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 2672 const PetscInt *ridx,*cidx; 2673 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2674 PetscInt nslim_col,*ns_col; 2675 IS ris = *rperm,cis = *cperm; 2676 2677 PetscFunctionBegin; 2678 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2679 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2680 2681 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2682 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2683 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2684 permc = permr + m; 2685 2686 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2687 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2688 2689 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2690 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2691 2692 /* Construct the permutations for rows*/ 2693 for (i=0,row = 0; i<nslim_row; ++i){ 2694 indx = ridx[i]; 2695 start_val = tns[indx]; 2696 end_val = tns[indx + 1]; 2697 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2698 } 2699 2700 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2701 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2702 2703 /* Construct permutations for columns */ 2704 for (i=0,col=0; i<nslim_col; ++i){ 2705 indx = cidx[i]; 2706 start_val = tns[indx]; 2707 end_val = tns[indx + 1]; 2708 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2709 } 2710 2711 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2712 ISSetPermutation(*rperm); 2713 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2714 ISSetPermutation(*cperm); 2715 2716 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2717 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2718 2719 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2720 ierr = PetscFree(permr);CHKERRQ(ierr); 2721 ierr = ISDestroy(cis);CHKERRQ(ierr); 2722 ierr = ISDestroy(ris);CHKERRQ(ierr); 2723 ierr = PetscFree(tns);CHKERRQ(ierr); 2724 PetscFunctionReturn(0); 2725 } 2726 EXTERN_C_END 2727 2728 #undef __FUNCT__ 2729 #define __FUNCT__ "MatInodeGetInodeSizes" 2730 /*@C 2731 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2732 2733 Collective on Mat 2734 2735 Input Parameter: 2736 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2737 2738 Output Parameter: 2739 + node_count - no of inodes present in the matrix. 2740 . sizes - an array of size node_count,with sizes of each inode. 2741 - limit - the max size used to generate the inodes. 2742 2743 Level: advanced 2744 2745 Notes: This routine returns some internal storage information 2746 of the matrix, it is intended to be used by advanced users. 2747 It should be called after the matrix is assembled. 2748 The contents of the sizes[] array should not be changed. 2749 PETSC_NULL may be passed for information not requested. 2750 2751 .keywords: matrix, seqaij, get, inode 2752 2753 .seealso: MatGetInfo() 2754 @*/ 2755 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2756 { 2757 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2758 2759 PetscFunctionBegin; 2760 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2761 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2762 if (f) { 2763 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2764 } 2765 PetscFunctionReturn(0); 2766 } 2767 2768 EXTERN_C_BEGIN 2769 #undef __FUNCT__ 2770 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 2771 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2772 { 2773 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2774 2775 PetscFunctionBegin; 2776 if (node_count) *node_count = a->inode.node_count; 2777 if (sizes) *sizes = a->inode.size; 2778 if (limit) *limit = a->inode.limit; 2779 PetscFunctionReturn(0); 2780 } 2781 EXTERN_C_END 2782