1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap.n; 19 m = A->rmap.n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap.n; 72 n = A->cmap.n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap.n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap.n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_Inode" 397 static PetscErrorCode MatMult_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y; 402 PetscErrorCode ierr; 403 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 404 405 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 406 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 407 #endif 408 409 PetscFunctionBegin; 410 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 411 node_max = a->inode.node_count; 412 ns = a->inode.size; /* Node Size array */ 413 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 414 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 415 idx = a->j; 416 v1 = a->a; 417 ii = a->i; 418 419 for (i = 0,row = 0; i< node_max; ++i){ 420 nsz = ns[i]; 421 n = ii[1] - ii[0]; 422 ii += nsz; 423 sz = n; /* No of non zeros in this row */ 424 /* Switch on the size of Node */ 425 switch (nsz){ /* Each loop in 'case' is unrolled */ 426 case 1 : 427 sum1 = 0; 428 429 for(n = 0; n< sz-1; n+=2) { 430 i1 = idx[0]; /* The instructions are ordered to */ 431 i2 = idx[1]; /* make the compiler's job easy */ 432 idx += 2; 433 tmp0 = x[i1]; 434 tmp1 = x[i2]; 435 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 436 } 437 438 if (n == sz-1){ /* Take care of the last nonzero */ 439 tmp0 = x[*idx++]; 440 sum1 += *v1++ * tmp0; 441 } 442 y[row++]=sum1; 443 break; 444 case 2: 445 sum1 = 0; 446 sum2 = 0; 447 v2 = v1 + n; 448 449 for (n = 0; n< sz-1; n+=2) { 450 i1 = idx[0]; 451 i2 = idx[1]; 452 idx += 2; 453 tmp0 = x[i1]; 454 tmp1 = x[i2]; 455 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 456 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 457 } 458 if (n == sz-1){ 459 tmp0 = x[*idx++]; 460 sum1 += *v1++ * tmp0; 461 sum2 += *v2++ * tmp0; 462 } 463 y[row++]=sum1; 464 y[row++]=sum2; 465 v1 =v2; /* Since the next block to be processed starts there*/ 466 idx +=sz; 467 break; 468 case 3: 469 sum1 = 0; 470 sum2 = 0; 471 sum3 = 0; 472 v2 = v1 + n; 473 v3 = v2 + n; 474 475 for (n = 0; n< sz-1; n+=2) { 476 i1 = idx[0]; 477 i2 = idx[1]; 478 idx += 2; 479 tmp0 = x[i1]; 480 tmp1 = x[i2]; 481 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 482 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 483 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 484 } 485 if (n == sz-1){ 486 tmp0 = x[*idx++]; 487 sum1 += *v1++ * tmp0; 488 sum2 += *v2++ * tmp0; 489 sum3 += *v3++ * tmp0; 490 } 491 y[row++]=sum1; 492 y[row++]=sum2; 493 y[row++]=sum3; 494 v1 =v3; /* Since the next block to be processed starts there*/ 495 idx +=2*sz; 496 break; 497 case 4: 498 sum1 = 0; 499 sum2 = 0; 500 sum3 = 0; 501 sum4 = 0; 502 v2 = v1 + n; 503 v3 = v2 + n; 504 v4 = v3 + n; 505 506 for (n = 0; n< sz-1; n+=2) { 507 i1 = idx[0]; 508 i2 = idx[1]; 509 idx += 2; 510 tmp0 = x[i1]; 511 tmp1 = x[i2]; 512 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 513 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 514 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 515 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 516 } 517 if (n == sz-1){ 518 tmp0 = x[*idx++]; 519 sum1 += *v1++ * tmp0; 520 sum2 += *v2++ * tmp0; 521 sum3 += *v3++ * tmp0; 522 sum4 += *v4++ * tmp0; 523 } 524 y[row++]=sum1; 525 y[row++]=sum2; 526 y[row++]=sum3; 527 y[row++]=sum4; 528 v1 =v4; /* Since the next block to be processed starts there*/ 529 idx +=3*sz; 530 break; 531 case 5: 532 sum1 = 0; 533 sum2 = 0; 534 sum3 = 0; 535 sum4 = 0; 536 sum5 = 0; 537 v2 = v1 + n; 538 v3 = v2 + n; 539 v4 = v3 + n; 540 v5 = v4 + n; 541 542 for (n = 0; n<sz-1; n+=2) { 543 i1 = idx[0]; 544 i2 = idx[1]; 545 idx += 2; 546 tmp0 = x[i1]; 547 tmp1 = x[i2]; 548 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 549 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 550 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 551 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 552 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 553 } 554 if (n == sz-1){ 555 tmp0 = x[*idx++]; 556 sum1 += *v1++ * tmp0; 557 sum2 += *v2++ * tmp0; 558 sum3 += *v3++ * tmp0; 559 sum4 += *v4++ * tmp0; 560 sum5 += *v5++ * tmp0; 561 } 562 y[row++]=sum1; 563 y[row++]=sum2; 564 y[row++]=sum3; 565 y[row++]=sum4; 566 y[row++]=sum5; 567 v1 =v5; /* Since the next block to be processed starts there */ 568 idx +=4*sz; 569 break; 570 default : 571 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 572 } 573 } 574 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 575 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 576 ierr = PetscLogFlops(2*a->nz - A->rmap.n);CHKERRQ(ierr); 577 PetscFunctionReturn(0); 578 } 579 /* ----------------------------------------------------------- */ 580 /* Almost same code as the MatMult_Inode() */ 581 #undef __FUNCT__ 582 #define __FUNCT__ "MatMultAdd_Inode" 583 static PetscErrorCode MatMultAdd_Inode(Mat A,Vec xx,Vec zz,Vec yy) 584 { 585 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 586 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 587 PetscScalar *v1,*v2,*v3,*v4,*v5,*x,*y,*z,*zt; 588 PetscErrorCode ierr; 589 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 590 591 PetscFunctionBegin; 592 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 593 node_max = a->inode.node_count; 594 ns = a->inode.size; /* Node Size array */ 595 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 596 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 597 if (zz != yy) { 598 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 599 } else { 600 z = y; 601 } 602 zt = z; 603 604 idx = a->j; 605 v1 = a->a; 606 ii = a->i; 607 608 for (i = 0,row = 0; i< node_max; ++i){ 609 nsz = ns[i]; 610 n = ii[1] - ii[0]; 611 ii += nsz; 612 sz = n; /* No of non zeros in this row */ 613 /* Switch on the size of Node */ 614 switch (nsz){ /* Each loop in 'case' is unrolled */ 615 case 1 : 616 sum1 = *zt++; 617 618 for(n = 0; n< sz-1; n+=2) { 619 i1 = idx[0]; /* The instructions are ordered to */ 620 i2 = idx[1]; /* make the compiler's job easy */ 621 idx += 2; 622 tmp0 = x[i1]; 623 tmp1 = x[i2]; 624 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 625 } 626 627 if(n == sz-1){ /* Take care of the last nonzero */ 628 tmp0 = x[*idx++]; 629 sum1 += *v1++ * tmp0; 630 } 631 y[row++]=sum1; 632 break; 633 case 2: 634 sum1 = *zt++; 635 sum2 = *zt++; 636 v2 = v1 + n; 637 638 for(n = 0; n< sz-1; n+=2) { 639 i1 = idx[0]; 640 i2 = idx[1]; 641 idx += 2; 642 tmp0 = x[i1]; 643 tmp1 = x[i2]; 644 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 645 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 646 } 647 if(n == sz-1){ 648 tmp0 = x[*idx++]; 649 sum1 += *v1++ * tmp0; 650 sum2 += *v2++ * tmp0; 651 } 652 y[row++]=sum1; 653 y[row++]=sum2; 654 v1 =v2; /* Since the next block to be processed starts there*/ 655 idx +=sz; 656 break; 657 case 3: 658 sum1 = *zt++; 659 sum2 = *zt++; 660 sum3 = *zt++; 661 v2 = v1 + n; 662 v3 = v2 + n; 663 664 for (n = 0; n< sz-1; n+=2) { 665 i1 = idx[0]; 666 i2 = idx[1]; 667 idx += 2; 668 tmp0 = x[i1]; 669 tmp1 = x[i2]; 670 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 671 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 672 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 673 } 674 if (n == sz-1){ 675 tmp0 = x[*idx++]; 676 sum1 += *v1++ * tmp0; 677 sum2 += *v2++ * tmp0; 678 sum3 += *v3++ * tmp0; 679 } 680 y[row++]=sum1; 681 y[row++]=sum2; 682 y[row++]=sum3; 683 v1 =v3; /* Since the next block to be processed starts there*/ 684 idx +=2*sz; 685 break; 686 case 4: 687 sum1 = *zt++; 688 sum2 = *zt++; 689 sum3 = *zt++; 690 sum4 = *zt++; 691 v2 = v1 + n; 692 v3 = v2 + n; 693 v4 = v3 + n; 694 695 for (n = 0; n< sz-1; n+=2) { 696 i1 = idx[0]; 697 i2 = idx[1]; 698 idx += 2; 699 tmp0 = x[i1]; 700 tmp1 = x[i2]; 701 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 702 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 703 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 704 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 705 } 706 if (n == sz-1){ 707 tmp0 = x[*idx++]; 708 sum1 += *v1++ * tmp0; 709 sum2 += *v2++ * tmp0; 710 sum3 += *v3++ * tmp0; 711 sum4 += *v4++ * tmp0; 712 } 713 y[row++]=sum1; 714 y[row++]=sum2; 715 y[row++]=sum3; 716 y[row++]=sum4; 717 v1 =v4; /* Since the next block to be processed starts there*/ 718 idx +=3*sz; 719 break; 720 case 5: 721 sum1 = *zt++; 722 sum2 = *zt++; 723 sum3 = *zt++; 724 sum4 = *zt++; 725 sum5 = *zt++; 726 v2 = v1 + n; 727 v3 = v2 + n; 728 v4 = v3 + n; 729 v5 = v4 + n; 730 731 for (n = 0; n<sz-1; n+=2) { 732 i1 = idx[0]; 733 i2 = idx[1]; 734 idx += 2; 735 tmp0 = x[i1]; 736 tmp1 = x[i2]; 737 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 738 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 739 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 740 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 741 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 742 } 743 if(n == sz-1){ 744 tmp0 = x[*idx++]; 745 sum1 += *v1++ * tmp0; 746 sum2 += *v2++ * tmp0; 747 sum3 += *v3++ * tmp0; 748 sum4 += *v4++ * tmp0; 749 sum5 += *v5++ * tmp0; 750 } 751 y[row++]=sum1; 752 y[row++]=sum2; 753 y[row++]=sum3; 754 y[row++]=sum4; 755 y[row++]=sum5; 756 v1 =v5; /* Since the next block to be processed starts there */ 757 idx +=4*sz; 758 break; 759 default : 760 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 761 } 762 } 763 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 764 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 765 if (zz != yy) { 766 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 767 } 768 ierr = PetscLogFlops(2*a->nz);CHKERRQ(ierr); 769 PetscFunctionReturn(0); 770 } 771 772 /* ----------------------------------------------------------- */ 773 #undef __FUNCT__ 774 #define __FUNCT__ "MatSolve_Inode" 775 PetscErrorCode MatSolve_Inode(Mat A,Vec bb,Vec xx) 776 { 777 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 778 IS iscol = a->col,isrow = a->row; 779 PetscErrorCode ierr; 780 PetscInt *r,*c,i,j,n = A->rmap.n,*ai = a->i,nz,*a_j = a->j; 781 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1,*rout,*cout; 782 PetscScalar *x,*b,*a_a = a->a,*tmp,*tmps,*aa,tmp0,tmp1; 783 PetscScalar sum1,sum2,sum3,sum4,sum5,*v1,*v2,*v3,*v4,*v5; 784 785 PetscFunctionBegin; 786 if (A->factor!=FACTOR_LU) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unfactored matrix"); 787 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 788 node_max = a->inode.node_count; 789 ns = a->inode.size; /* Node Size array */ 790 791 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 792 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 793 tmp = a->solve_work; 794 795 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 796 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 797 798 /* forward solve the lower triangular */ 799 tmps = tmp ; 800 aa = a_a ; 801 aj = a_j ; 802 ad = a->diag; 803 804 for (i = 0,row = 0; i< node_max; ++i){ 805 nsz = ns[i]; 806 aii = ai[row]; 807 v1 = aa + aii; 808 vi = aj + aii; 809 nz = ad[row]- aii; 810 811 switch (nsz){ /* Each loop in 'case' is unrolled */ 812 case 1 : 813 sum1 = b[*r++]; 814 /* while (nz--) sum1 -= *v1++ *tmps[*vi++];*/ 815 for(j=0; j<nz-1; j+=2){ 816 i0 = vi[0]; 817 i1 = vi[1]; 818 vi +=2; 819 tmp0 = tmps[i0]; 820 tmp1 = tmps[i1]; 821 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 822 } 823 if(j == nz-1){ 824 tmp0 = tmps[*vi++]; 825 sum1 -= *v1++ *tmp0; 826 } 827 tmp[row ++]=sum1; 828 break; 829 case 2: 830 sum1 = b[*r++]; 831 sum2 = b[*r++]; 832 v2 = aa + ai[row+1]; 833 834 for(j=0; j<nz-1; j+=2){ 835 i0 = vi[0]; 836 i1 = vi[1]; 837 vi +=2; 838 tmp0 = tmps[i0]; 839 tmp1 = tmps[i1]; 840 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 841 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 842 } 843 if(j == nz-1){ 844 tmp0 = tmps[*vi++]; 845 sum1 -= *v1++ *tmp0; 846 sum2 -= *v2++ *tmp0; 847 } 848 sum2 -= *v2++ * sum1; 849 tmp[row ++]=sum1; 850 tmp[row ++]=sum2; 851 break; 852 case 3: 853 sum1 = b[*r++]; 854 sum2 = b[*r++]; 855 sum3 = b[*r++]; 856 v2 = aa + ai[row+1]; 857 v3 = aa + ai[row+2]; 858 859 for (j=0; j<nz-1; j+=2){ 860 i0 = vi[0]; 861 i1 = vi[1]; 862 vi +=2; 863 tmp0 = tmps[i0]; 864 tmp1 = tmps[i1]; 865 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 866 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 867 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 868 } 869 if (j == nz-1){ 870 tmp0 = tmps[*vi++]; 871 sum1 -= *v1++ *tmp0; 872 sum2 -= *v2++ *tmp0; 873 sum3 -= *v3++ *tmp0; 874 } 875 sum2 -= *v2++ * sum1; 876 sum3 -= *v3++ * sum1; 877 sum3 -= *v3++ * sum2; 878 tmp[row ++]=sum1; 879 tmp[row ++]=sum2; 880 tmp[row ++]=sum3; 881 break; 882 883 case 4: 884 sum1 = b[*r++]; 885 sum2 = b[*r++]; 886 sum3 = b[*r++]; 887 sum4 = b[*r++]; 888 v2 = aa + ai[row+1]; 889 v3 = aa + ai[row+2]; 890 v4 = aa + ai[row+3]; 891 892 for (j=0; j<nz-1; j+=2){ 893 i0 = vi[0]; 894 i1 = vi[1]; 895 vi +=2; 896 tmp0 = tmps[i0]; 897 tmp1 = tmps[i1]; 898 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 899 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 900 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 901 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 902 } 903 if (j == nz-1){ 904 tmp0 = tmps[*vi++]; 905 sum1 -= *v1++ *tmp0; 906 sum2 -= *v2++ *tmp0; 907 sum3 -= *v3++ *tmp0; 908 sum4 -= *v4++ *tmp0; 909 } 910 sum2 -= *v2++ * sum1; 911 sum3 -= *v3++ * sum1; 912 sum4 -= *v4++ * sum1; 913 sum3 -= *v3++ * sum2; 914 sum4 -= *v4++ * sum2; 915 sum4 -= *v4++ * sum3; 916 917 tmp[row ++]=sum1; 918 tmp[row ++]=sum2; 919 tmp[row ++]=sum3; 920 tmp[row ++]=sum4; 921 break; 922 case 5: 923 sum1 = b[*r++]; 924 sum2 = b[*r++]; 925 sum3 = b[*r++]; 926 sum4 = b[*r++]; 927 sum5 = b[*r++]; 928 v2 = aa + ai[row+1]; 929 v3 = aa + ai[row+2]; 930 v4 = aa + ai[row+3]; 931 v5 = aa + ai[row+4]; 932 933 for (j=0; j<nz-1; j+=2){ 934 i0 = vi[0]; 935 i1 = vi[1]; 936 vi +=2; 937 tmp0 = tmps[i0]; 938 tmp1 = tmps[i1]; 939 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 940 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 941 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 942 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 943 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 944 } 945 if (j == nz-1){ 946 tmp0 = tmps[*vi++]; 947 sum1 -= *v1++ *tmp0; 948 sum2 -= *v2++ *tmp0; 949 sum3 -= *v3++ *tmp0; 950 sum4 -= *v4++ *tmp0; 951 sum5 -= *v5++ *tmp0; 952 } 953 954 sum2 -= *v2++ * sum1; 955 sum3 -= *v3++ * sum1; 956 sum4 -= *v4++ * sum1; 957 sum5 -= *v5++ * sum1; 958 sum3 -= *v3++ * sum2; 959 sum4 -= *v4++ * sum2; 960 sum5 -= *v5++ * sum2; 961 sum4 -= *v4++ * sum3; 962 sum5 -= *v5++ * sum3; 963 sum5 -= *v5++ * sum4; 964 965 tmp[row ++]=sum1; 966 tmp[row ++]=sum2; 967 tmp[row ++]=sum3; 968 tmp[row ++]=sum4; 969 tmp[row ++]=sum5; 970 break; 971 default: 972 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 973 } 974 } 975 /* backward solve the upper triangular */ 976 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 977 nsz = ns[i]; 978 aii = ai[row+1] -1; 979 v1 = aa + aii; 980 vi = aj + aii; 981 nz = aii- ad[row]; 982 switch (nsz){ /* Each loop in 'case' is unrolled */ 983 case 1 : 984 sum1 = tmp[row]; 985 986 for(j=nz ; j>1; j-=2){ 987 vi -=2; 988 i0 = vi[2]; 989 i1 = vi[1]; 990 tmp0 = tmps[i0]; 991 tmp1 = tmps[i1]; 992 v1 -= 2; 993 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 994 } 995 if (j==1){ 996 tmp0 = tmps[*vi--]; 997 sum1 -= *v1-- * tmp0; 998 } 999 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1000 break; 1001 case 2 : 1002 sum1 = tmp[row]; 1003 sum2 = tmp[row -1]; 1004 v2 = aa + ai[row]-1; 1005 for (j=nz ; j>1; j-=2){ 1006 vi -=2; 1007 i0 = vi[2]; 1008 i1 = vi[1]; 1009 tmp0 = tmps[i0]; 1010 tmp1 = tmps[i1]; 1011 v1 -= 2; 1012 v2 -= 2; 1013 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1014 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1015 } 1016 if (j==1){ 1017 tmp0 = tmps[*vi--]; 1018 sum1 -= *v1-- * tmp0; 1019 sum2 -= *v2-- * tmp0; 1020 } 1021 1022 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1023 sum2 -= *v2-- * tmp0; 1024 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1025 break; 1026 case 3 : 1027 sum1 = tmp[row]; 1028 sum2 = tmp[row -1]; 1029 sum3 = tmp[row -2]; 1030 v2 = aa + ai[row]-1; 1031 v3 = aa + ai[row -1]-1; 1032 for (j=nz ; j>1; j-=2){ 1033 vi -=2; 1034 i0 = vi[2]; 1035 i1 = vi[1]; 1036 tmp0 = tmps[i0]; 1037 tmp1 = tmps[i1]; 1038 v1 -= 2; 1039 v2 -= 2; 1040 v3 -= 2; 1041 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1042 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1043 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1044 } 1045 if (j==1){ 1046 tmp0 = tmps[*vi--]; 1047 sum1 -= *v1-- * tmp0; 1048 sum2 -= *v2-- * tmp0; 1049 sum3 -= *v3-- * tmp0; 1050 } 1051 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1052 sum2 -= *v2-- * tmp0; 1053 sum3 -= *v3-- * tmp0; 1054 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1055 sum3 -= *v3-- * tmp0; 1056 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1057 1058 break; 1059 case 4 : 1060 sum1 = tmp[row]; 1061 sum2 = tmp[row -1]; 1062 sum3 = tmp[row -2]; 1063 sum4 = tmp[row -3]; 1064 v2 = aa + ai[row]-1; 1065 v3 = aa + ai[row -1]-1; 1066 v4 = aa + ai[row -2]-1; 1067 1068 for (j=nz ; j>1; j-=2){ 1069 vi -=2; 1070 i0 = vi[2]; 1071 i1 = vi[1]; 1072 tmp0 = tmps[i0]; 1073 tmp1 = tmps[i1]; 1074 v1 -= 2; 1075 v2 -= 2; 1076 v3 -= 2; 1077 v4 -= 2; 1078 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1079 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1080 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1081 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1082 } 1083 if (j==1){ 1084 tmp0 = tmps[*vi--]; 1085 sum1 -= *v1-- * tmp0; 1086 sum2 -= *v2-- * tmp0; 1087 sum3 -= *v3-- * tmp0; 1088 sum4 -= *v4-- * tmp0; 1089 } 1090 1091 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1092 sum2 -= *v2-- * tmp0; 1093 sum3 -= *v3-- * tmp0; 1094 sum4 -= *v4-- * tmp0; 1095 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1096 sum3 -= *v3-- * tmp0; 1097 sum4 -= *v4-- * tmp0; 1098 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1099 sum4 -= *v4-- * tmp0; 1100 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1101 break; 1102 case 5 : 1103 sum1 = tmp[row]; 1104 sum2 = tmp[row -1]; 1105 sum3 = tmp[row -2]; 1106 sum4 = tmp[row -3]; 1107 sum5 = tmp[row -4]; 1108 v2 = aa + ai[row]-1; 1109 v3 = aa + ai[row -1]-1; 1110 v4 = aa + ai[row -2]-1; 1111 v5 = aa + ai[row -3]-1; 1112 for (j=nz ; j>1; j-=2){ 1113 vi -= 2; 1114 i0 = vi[2]; 1115 i1 = vi[1]; 1116 tmp0 = tmps[i0]; 1117 tmp1 = tmps[i1]; 1118 v1 -= 2; 1119 v2 -= 2; 1120 v3 -= 2; 1121 v4 -= 2; 1122 v5 -= 2; 1123 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1124 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1125 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1126 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1127 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1128 } 1129 if (j==1){ 1130 tmp0 = tmps[*vi--]; 1131 sum1 -= *v1-- * tmp0; 1132 sum2 -= *v2-- * tmp0; 1133 sum3 -= *v3-- * tmp0; 1134 sum4 -= *v4-- * tmp0; 1135 sum5 -= *v5-- * tmp0; 1136 } 1137 1138 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1139 sum2 -= *v2-- * tmp0; 1140 sum3 -= *v3-- * tmp0; 1141 sum4 -= *v4-- * tmp0; 1142 sum5 -= *v5-- * tmp0; 1143 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1144 sum3 -= *v3-- * tmp0; 1145 sum4 -= *v4-- * tmp0; 1146 sum5 -= *v5-- * tmp0; 1147 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1151 sum5 -= *v5-- * tmp0; 1152 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1153 break; 1154 default: 1155 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1156 } 1157 } 1158 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1159 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1160 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1161 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1162 ierr = PetscLogFlops(2*a->nz - A->cmap.n);CHKERRQ(ierr); 1163 PetscFunctionReturn(0); 1164 } 1165 1166 #undef __FUNCT__ 1167 #define __FUNCT__ "MatLUFactorNumeric_Inode" 1168 PetscErrorCode MatLUFactorNumeric_Inode(Mat A,MatFactorInfo *info,Mat *B) 1169 { 1170 Mat C = *B; 1171 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1172 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1173 PetscErrorCode ierr; 1174 PetscInt *r,*ic,*c,n = A->rmap.n,*bi = b->i; 1175 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1176 PetscInt *ics,i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1177 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1178 PetscScalar *rtmp1,*rtmp2,*rtmp3,*v1,*v2,*v3,*pc1,*pc2,*pc3,mul1,mul2,mul3; 1179 PetscScalar tmp,*ba = b->a,*aa = a->a,*pv; 1180 PetscReal rs=0.0; 1181 LUShift_Ctx sctx; 1182 PetscInt newshift; 1183 1184 PetscFunctionBegin; 1185 sctx.shift_top = 0; 1186 sctx.nshift_max = 0; 1187 sctx.shift_lo = 0; 1188 sctx.shift_hi = 0; 1189 1190 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1191 if (info->shiftpd && info->shiftnz) info->shiftnz = 0.0; 1192 if (info->shiftpd) { /* set sctx.shift_top=max{rs} */ 1193 sctx.shift_top = 0; 1194 for (i=0; i<n; i++) { 1195 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1196 rs = 0.0; 1197 ajtmp = aj + ai[i]; 1198 rtmp1 = aa + ai[i]; 1199 nz = ai[i+1] - ai[i]; 1200 for (j=0; j<nz; j++){ 1201 if (*ajtmp != i){ 1202 rs += PetscAbsScalar(*rtmp1++); 1203 } else { 1204 rs -= PetscRealPart(*rtmp1++); 1205 } 1206 ajtmp++; 1207 } 1208 if (rs>sctx.shift_top) sctx.shift_top = rs; 1209 } 1210 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1211 sctx.shift_top *= 1.1; 1212 sctx.nshift_max = 5; 1213 sctx.shift_lo = 0.; 1214 sctx.shift_hi = 1.; 1215 } 1216 sctx.shift_amount = 0; 1217 sctx.nshift = 0; 1218 1219 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1220 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1221 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1222 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1223 ierr = PetscMemzero(rtmp1,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1224 ics = ic ; 1225 rtmp2 = rtmp1 + n; 1226 rtmp3 = rtmp2 + n; 1227 1228 node_max = a->inode.node_count; 1229 ns = a->inode.size ; 1230 if (!ns){ 1231 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1232 } 1233 1234 /* If max inode size > 3, split it into two inodes.*/ 1235 /* also map the inode sizes according to the ordering */ 1236 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1237 for (i=0,j=0; i<node_max; ++i,++j){ 1238 if (ns[i]>3) { 1239 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1240 ++j; 1241 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1242 } else { 1243 tmp_vec1[j] = ns[i]; 1244 } 1245 } 1246 /* Use the correct node_max */ 1247 node_max = j; 1248 1249 /* Now reorder the inode info based on mat re-ordering info */ 1250 /* First create a row -> inode_size_array_index map */ 1251 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1252 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1253 for (i=0,row=0; i<node_max; i++) { 1254 nodesz = tmp_vec1[i]; 1255 for (j=0; j<nodesz; j++,row++) { 1256 nsmap[row] = i; 1257 } 1258 } 1259 /* Using nsmap, create a reordered ns structure */ 1260 for (i=0,j=0; i< node_max; i++) { 1261 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1262 tmp_vec2[i] = nodesz; 1263 j += nodesz; 1264 } 1265 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1266 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1267 /* Now use the correct ns */ 1268 ns = tmp_vec2; 1269 1270 do { 1271 sctx.lushift = PETSC_FALSE; 1272 /* Now loop over each block-row, and do the factorization */ 1273 for (i=0,row=0; i<node_max; i++) { 1274 nodesz = ns[i]; 1275 nz = bi[row+1] - bi[row]; 1276 bjtmp = bj + bi[row]; 1277 1278 switch (nodesz){ 1279 case 1: 1280 for (j=0; j<nz; j++){ 1281 idx = bjtmp[j]; 1282 rtmp1[idx] = 0.0; 1283 } 1284 1285 /* load in initial (unfactored row) */ 1286 idx = r[row]; 1287 nz_tmp = ai[idx+1] - ai[idx]; 1288 ajtmp = aj + ai[idx]; 1289 v1 = aa + ai[idx]; 1290 1291 for (j=0; j<nz_tmp; j++) { 1292 idx = ics[ajtmp[j]]; 1293 rtmp1[idx] = v1[j]; 1294 } 1295 rtmp1[ics[r[row]]] += sctx.shift_amount; 1296 1297 prow = *bjtmp++ ; 1298 while (prow < row) { 1299 pc1 = rtmp1 + prow; 1300 if (*pc1 != 0.0){ 1301 pv = ba + bd[prow]; 1302 pj = nbj + bd[prow]; 1303 mul1 = *pc1 * *pv++; 1304 *pc1 = mul1; 1305 nz_tmp = bi[prow+1] - bd[prow] - 1; 1306 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1307 for (j=0; j<nz_tmp; j++) { 1308 tmp = pv[j]; 1309 idx = pj[j]; 1310 rtmp1[idx] -= mul1 * tmp; 1311 } 1312 } 1313 prow = *bjtmp++ ; 1314 } 1315 pj = bj + bi[row]; 1316 pc1 = ba + bi[row]; 1317 1318 sctx.pv = rtmp1[row]; 1319 rtmp1[row] = 1.0/rtmp1[row]; /* invert diag */ 1320 rs = 0.0; 1321 for (j=0; j<nz; j++) { 1322 idx = pj[j]; 1323 pc1[j] = rtmp1[idx]; /* rtmp1 -> ba */ 1324 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1325 } 1326 sctx.rs = rs; 1327 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1328 if (newshift == 1) goto endofwhile; 1329 break; 1330 1331 case 2: 1332 for (j=0; j<nz; j++) { 1333 idx = bjtmp[j]; 1334 rtmp1[idx] = 0.0; 1335 rtmp2[idx] = 0.0; 1336 } 1337 1338 /* load in initial (unfactored row) */ 1339 idx = r[row]; 1340 nz_tmp = ai[idx+1] - ai[idx]; 1341 ajtmp = aj + ai[idx]; 1342 v1 = aa + ai[idx]; 1343 v2 = aa + ai[idx+1]; 1344 for (j=0; j<nz_tmp; j++) { 1345 idx = ics[ajtmp[j]]; 1346 rtmp1[idx] = v1[j]; 1347 rtmp2[idx] = v2[j]; 1348 } 1349 rtmp1[ics[r[row]]] += sctx.shift_amount; 1350 rtmp2[ics[r[row+1]]] += sctx.shift_amount; 1351 1352 prow = *bjtmp++ ; 1353 while (prow < row) { 1354 pc1 = rtmp1 + prow; 1355 pc2 = rtmp2 + prow; 1356 if (*pc1 != 0.0 || *pc2 != 0.0){ 1357 pv = ba + bd[prow]; 1358 pj = nbj + bd[prow]; 1359 mul1 = *pc1 * *pv; 1360 mul2 = *pc2 * *pv; 1361 ++pv; 1362 *pc1 = mul1; 1363 *pc2 = mul2; 1364 1365 nz_tmp = bi[prow+1] - bd[prow] - 1; 1366 for (j=0; j<nz_tmp; j++) { 1367 tmp = pv[j]; 1368 idx = pj[j]; 1369 rtmp1[idx] -= mul1 * tmp; 1370 rtmp2[idx] -= mul2 * tmp; 1371 } 1372 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1373 } 1374 prow = *bjtmp++ ; 1375 } 1376 1377 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1378 pc1 = rtmp1 + prow; 1379 pc2 = rtmp2 + prow; 1380 1381 sctx.pv = *pc1; 1382 pj = bj + bi[prow]; 1383 rs = 0.0; 1384 for (j=0; j<nz; j++){ 1385 idx = pj[j]; 1386 if (idx != prow) rs += PetscAbsScalar(rtmp1[idx]); 1387 } 1388 sctx.rs = rs; 1389 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1390 if (newshift == 1) goto endofwhile; 1391 1392 if (*pc2 != 0.0){ 1393 pj = nbj + bd[prow]; 1394 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1395 *pc2 = mul2; 1396 nz_tmp = bi[prow+1] - bd[prow] - 1; 1397 for (j=0; j<nz_tmp; j++) { 1398 idx = pj[j] ; 1399 tmp = rtmp1[idx]; 1400 rtmp2[idx] -= mul2 * tmp; 1401 } 1402 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1403 } 1404 1405 pj = bj + bi[row]; 1406 pc1 = ba + bi[row]; 1407 pc2 = ba + bi[row+1]; 1408 1409 sctx.pv = rtmp2[row+1]; 1410 rs = 0.0; 1411 rtmp1[row] = 1.0/rtmp1[row]; 1412 rtmp2[row+1] = 1.0/rtmp2[row+1]; 1413 /* copy row entries from dense representation to sparse */ 1414 for (j=0; j<nz; j++) { 1415 idx = pj[j]; 1416 pc1[j] = rtmp1[idx]; 1417 pc2[j] = rtmp2[idx]; 1418 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1419 } 1420 sctx.rs = rs; 1421 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1422 if (newshift == 1) goto endofwhile; 1423 break; 1424 1425 case 3: 1426 for (j=0; j<nz; j++) { 1427 idx = bjtmp[j]; 1428 rtmp1[idx] = 0.0; 1429 rtmp2[idx] = 0.0; 1430 rtmp3[idx] = 0.0; 1431 } 1432 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1433 idx = r[row]; 1434 nz_tmp = ai[idx+1] - ai[idx]; 1435 ajtmp = aj + ai[idx]; 1436 v1 = aa + ai[idx]; 1437 v2 = aa + ai[idx+1]; 1438 v3 = aa + ai[idx+2]; 1439 for (j=0; j<nz_tmp; j++) { 1440 idx = ics[ajtmp[j]]; 1441 rtmp1[idx] = v1[j]; 1442 rtmp2[idx] = v2[j]; 1443 rtmp3[idx] = v3[j]; 1444 } 1445 rtmp1[ics[r[row]]] += sctx.shift_amount; 1446 rtmp2[ics[r[row+1]]] += sctx.shift_amount; 1447 rtmp3[ics[r[row+2]]] += sctx.shift_amount; 1448 1449 /* loop over all pivot row blocks above this row block */ 1450 prow = *bjtmp++ ; 1451 while (prow < row) { 1452 pc1 = rtmp1 + prow; 1453 pc2 = rtmp2 + prow; 1454 pc3 = rtmp3 + prow; 1455 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1456 pv = ba + bd[prow]; 1457 pj = nbj + bd[prow]; 1458 mul1 = *pc1 * *pv; 1459 mul2 = *pc2 * *pv; 1460 mul3 = *pc3 * *pv; 1461 ++pv; 1462 *pc1 = mul1; 1463 *pc2 = mul2; 1464 *pc3 = mul3; 1465 1466 nz_tmp = bi[prow+1] - bd[prow] - 1; 1467 /* update this row based on pivot row */ 1468 for (j=0; j<nz_tmp; j++) { 1469 tmp = pv[j]; 1470 idx = pj[j]; 1471 rtmp1[idx] -= mul1 * tmp; 1472 rtmp2[idx] -= mul2 * tmp; 1473 rtmp3[idx] -= mul3 * tmp; 1474 } 1475 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 1476 } 1477 prow = *bjtmp++ ; 1478 } 1479 1480 /* Now take care of diagonal 3x3 block in this set of rows */ 1481 /* note: prow = row here */ 1482 pc1 = rtmp1 + prow; 1483 pc2 = rtmp2 + prow; 1484 pc3 = rtmp3 + prow; 1485 1486 sctx.pv = *pc1; 1487 pj = bj + bi[prow]; 1488 rs = 0.0; 1489 for (j=0; j<nz; j++){ 1490 idx = pj[j]; 1491 if (idx != row) rs += PetscAbsScalar(rtmp1[idx]); 1492 } 1493 sctx.rs = rs; 1494 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1495 if (newshift == 1) goto endofwhile; 1496 1497 if (*pc2 != 0.0 || *pc3 != 0.0){ 1498 mul2 = (*pc2)/(*pc1); 1499 mul3 = (*pc3)/(*pc1); 1500 *pc2 = mul2; 1501 *pc3 = mul3; 1502 nz_tmp = bi[prow+1] - bd[prow] - 1; 1503 pj = nbj + bd[prow]; 1504 for (j=0; j<nz_tmp; j++) { 1505 idx = pj[j] ; 1506 tmp = rtmp1[idx]; 1507 rtmp2[idx] -= mul2 * tmp; 1508 rtmp3[idx] -= mul3 * tmp; 1509 } 1510 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1511 } 1512 ++prow; 1513 1514 pc2 = rtmp2 + prow; 1515 pc3 = rtmp3 + prow; 1516 sctx.pv = *pc2; 1517 pj = bj + bi[prow]; 1518 rs = 0.0; 1519 for (j=0; j<nz; j++){ 1520 idx = pj[j]; 1521 if (idx != prow) rs += PetscAbsScalar(rtmp2[idx]); 1522 } 1523 sctx.rs = rs; 1524 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1525 if (newshift == 1) goto endofwhile; 1526 1527 if (*pc3 != 0.0){ 1528 mul3 = (*pc3)/(*pc2); 1529 *pc3 = mul3; 1530 pj = nbj + bd[prow]; 1531 nz_tmp = bi[prow+1] - bd[prow] - 1; 1532 for (j=0; j<nz_tmp; j++) { 1533 idx = pj[j] ; 1534 tmp = rtmp2[idx]; 1535 rtmp3[idx] -= mul3 * tmp; 1536 } 1537 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1538 } 1539 1540 pj = bj + bi[row]; 1541 pc1 = ba + bi[row]; 1542 pc2 = ba + bi[row+1]; 1543 pc3 = ba + bi[row+2]; 1544 1545 sctx.pv = rtmp3[row+2]; 1546 rs = 0.0; 1547 rtmp1[row] = 1.0/rtmp1[row]; 1548 rtmp2[row+1] = 1.0/rtmp2[row+1]; 1549 rtmp3[row+2] = 1.0/rtmp3[row+2]; 1550 /* copy row entries from dense representation to sparse */ 1551 for (j=0; j<nz; j++) { 1552 idx = pj[j]; 1553 pc1[j] = rtmp1[idx]; 1554 pc2[j] = rtmp2[idx]; 1555 pc3[j] = rtmp3[idx]; 1556 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 1557 } 1558 1559 sctx.rs = rs; 1560 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 1561 if (newshift == 1) goto endofwhile; 1562 break; 1563 1564 default: 1565 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1566 } 1567 row += nodesz; /* Update the row */ 1568 } 1569 endofwhile:; 1570 } while (sctx.lushift); 1571 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1572 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1573 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1574 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1575 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 1576 C->factor = FACTOR_LU; 1577 C->assembled = PETSC_TRUE; 1578 if (sctx.nshift) { 1579 if (info->shiftnz) { 1580 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1581 } else if (info->shiftpd) { 1582 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,info->shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1583 } 1584 } 1585 ierr = PetscLogFlops(C->cmap.n);CHKERRQ(ierr); 1586 PetscFunctionReturn(0); 1587 } 1588 1589 /* 1590 Makes a longer coloring[] array and calls the usual code with that 1591 */ 1592 #undef __FUNCT__ 1593 #define __FUNCT__ "MatColoringPatch_Inode" 1594 PetscErrorCode MatColoringPatch_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 1595 { 1596 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 1597 PetscErrorCode ierr; 1598 PetscInt n = mat->cmap.n,m = a->inode.node_count,j,*ns = a->inode.size,row; 1599 PetscInt *colorused,i; 1600 ISColoringValue *newcolor; 1601 1602 PetscFunctionBegin; 1603 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 1604 /* loop over inodes, marking a color for each column*/ 1605 row = 0; 1606 for (i=0; i<m; i++){ 1607 for (j=0; j<ns[i]; j++) { 1608 newcolor[row++] = coloring[i] + j*ncolors; 1609 } 1610 } 1611 1612 /* eliminate unneeded colors */ 1613 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 1614 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 1615 for (i=0; i<n; i++) { 1616 colorused[newcolor[i]] = 1; 1617 } 1618 1619 for (i=1; i<5*ncolors; i++) { 1620 colorused[i] += colorused[i-1]; 1621 } 1622 ncolors = colorused[5*ncolors-1]; 1623 for (i=0; i<n; i++) { 1624 newcolor[i] = colorused[newcolor[i]]-1; 1625 } 1626 ierr = PetscFree(colorused);CHKERRQ(ierr); 1627 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 1628 ierr = PetscFree(coloring);CHKERRQ(ierr); 1629 PetscFunctionReturn(0); 1630 } 1631 1632 #include "src/inline/ilu.h" 1633 1634 #undef __FUNCT__ 1635 #define __FUNCT__ "MatRelax_Inode" 1636 PetscErrorCode MatRelax_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 1637 { 1638 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 1639 PetscScalar *x,*xs,*ibdiag,*bdiag,sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3,*v1,*v2,*v3,*v4,*v5; 1640 PetscScalar *v = a->a,*b,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 1641 PetscReal zeropivot = 1.0e-15; 1642 PetscErrorCode ierr; 1643 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 1644 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k; 1645 1646 PetscFunctionBegin; 1647 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 1648 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 1649 if (flag & SOR_EISENSTAT) SETERRQ(PETSC_ERR_SUP,"No support for Eisenstat trick; use -mat_no_inode"); 1650 1651 if (!a->inode.ibdiagvalid) { 1652 if (!a->inode.ibdiag) { 1653 /* calculate space needed for diagonal blocks */ 1654 for (i=0; i<m; i++) { 1655 cnt += sizes[i]*sizes[i]; 1656 } 1657 a->inode.bdiagsize = cnt; 1658 ierr = PetscMalloc2(cnt,PetscScalar,&a->inode.ibdiag,cnt,PetscScalar,&a->inode.bdiag);CHKERRQ(ierr); 1659 } 1660 1661 /* copy over the diagonal blocks and invert them */ 1662 ibdiag = a->inode.ibdiag; 1663 bdiag = a->inode.bdiag; 1664 cnt = 0; 1665 for (i=0, row = 0; i<m; i++) { 1666 for (j=0; j<sizes[i]; j++) { 1667 for (k=0; k<sizes[i]; k++) { 1668 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 1669 } 1670 } 1671 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(PetscScalar));CHKERRQ(ierr); 1672 1673 switch(sizes[i]) { 1674 case 1: 1675 /* Create matrix data structure */ 1676 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 1677 ibdiag[cnt] = 1.0/ibdiag[cnt]; 1678 break; 1679 case 2: 1680 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt);CHKERRQ(ierr); 1681 break; 1682 case 3: 1683 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt);CHKERRQ(ierr); 1684 break; 1685 case 4: 1686 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt);CHKERRQ(ierr); 1687 break; 1688 case 5: 1689 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt);CHKERRQ(ierr); 1690 break; 1691 default: 1692 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1693 } 1694 cnt += sizes[i]*sizes[i]; 1695 row += sizes[i]; 1696 } 1697 a->inode.ibdiagvalid = PETSC_TRUE; 1698 } 1699 ibdiag = a->inode.ibdiag; 1700 bdiag = a->inode.bdiag; 1701 1702 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1703 if (xx != bb) { 1704 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1705 } else { 1706 b = x; 1707 } 1708 1709 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 1710 xs = x; 1711 if (flag & SOR_ZERO_INITIAL_GUESS) { 1712 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 1713 1714 for (i=0, row=0; i<m; i++) { 1715 sz = diag[row] - ii[row]; 1716 v1 = a->a + ii[row]; 1717 idx = a->j + ii[row]; 1718 1719 /* see comments for MatMult_Inode() for how this is coded */ 1720 switch (sizes[i]){ 1721 case 1: 1722 1723 sum1 = b[row]; 1724 for(n = 0; n<sz-1; n+=2) { 1725 i1 = idx[0]; 1726 i2 = idx[1]; 1727 idx += 2; 1728 tmp0 = x[i1]; 1729 tmp1 = x[i2]; 1730 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1731 } 1732 1733 if (n == sz-1){ 1734 tmp0 = x[*idx]; 1735 sum1 -= *v1 * tmp0; 1736 } 1737 x[row++] = sum1*(*ibdiag++); 1738 break; 1739 case 2: 1740 v2 = a->a + ii[row+1]; 1741 sum1 = b[row]; 1742 sum2 = b[row+1]; 1743 for(n = 0; n<sz-1; n+=2) { 1744 i1 = idx[0]; 1745 i2 = idx[1]; 1746 idx += 2; 1747 tmp0 = x[i1]; 1748 tmp1 = x[i2]; 1749 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1750 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1751 } 1752 1753 if (n == sz-1){ 1754 tmp0 = x[*idx]; 1755 sum1 -= v1[0] * tmp0; 1756 sum2 -= v2[0] * tmp0; 1757 } 1758 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 1759 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 1760 ibdiag += 4; 1761 break; 1762 case 3: 1763 v2 = a->a + ii[row+1]; 1764 v3 = a->a + ii[row+2]; 1765 sum1 = b[row]; 1766 sum2 = b[row+1]; 1767 sum3 = b[row+2]; 1768 for(n = 0; n<sz-1; n+=2) { 1769 i1 = idx[0]; 1770 i2 = idx[1]; 1771 idx += 2; 1772 tmp0 = x[i1]; 1773 tmp1 = x[i2]; 1774 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1775 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1776 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1777 } 1778 1779 if (n == sz-1){ 1780 tmp0 = x[*idx]; 1781 sum1 -= v1[0] * tmp0; 1782 sum2 -= v2[0] * tmp0; 1783 sum3 -= v3[0] * tmp0; 1784 } 1785 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 1786 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 1787 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 1788 ibdiag += 9; 1789 break; 1790 case 4: 1791 v2 = a->a + ii[row+1]; 1792 v3 = a->a + ii[row+2]; 1793 v4 = a->a + ii[row+3]; 1794 sum1 = b[row]; 1795 sum2 = b[row+1]; 1796 sum3 = b[row+2]; 1797 sum4 = b[row+3]; 1798 for(n = 0; n<sz-1; n+=2) { 1799 i1 = idx[0]; 1800 i2 = idx[1]; 1801 idx += 2; 1802 tmp0 = x[i1]; 1803 tmp1 = x[i2]; 1804 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1805 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1806 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1807 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1808 } 1809 1810 if (n == sz-1){ 1811 tmp0 = x[*idx]; 1812 sum1 -= v1[0] * tmp0; 1813 sum2 -= v2[0] * tmp0; 1814 sum3 -= v3[0] * tmp0; 1815 sum4 -= v4[0] * tmp0; 1816 } 1817 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 1818 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 1819 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 1820 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 1821 ibdiag += 16; 1822 break; 1823 case 5: 1824 v2 = a->a + ii[row+1]; 1825 v3 = a->a + ii[row+2]; 1826 v4 = a->a + ii[row+3]; 1827 v5 = a->a + ii[row+4]; 1828 sum1 = b[row]; 1829 sum2 = b[row+1]; 1830 sum3 = b[row+2]; 1831 sum4 = b[row+3]; 1832 sum5 = b[row+4]; 1833 for(n = 0; n<sz-1; n+=2) { 1834 i1 = idx[0]; 1835 i2 = idx[1]; 1836 idx += 2; 1837 tmp0 = x[i1]; 1838 tmp1 = x[i2]; 1839 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1840 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1841 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1842 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 1843 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 1844 } 1845 1846 if (n == sz-1){ 1847 tmp0 = x[*idx]; 1848 sum1 -= v1[0] * tmp0; 1849 sum2 -= v2[0] * tmp0; 1850 sum3 -= v3[0] * tmp0; 1851 sum4 -= v4[0] * tmp0; 1852 sum5 -= v5[0] * tmp0; 1853 } 1854 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 1855 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 1856 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 1857 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 1858 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 1859 ibdiag += 25; 1860 break; 1861 default: 1862 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1863 } 1864 } 1865 1866 xb = x; 1867 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 1868 } else xb = b; 1869 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 1870 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 1871 cnt = 0; 1872 for (i=0, row=0; i<m; i++) { 1873 1874 switch (sizes[i]){ 1875 case 1: 1876 x[row++] *= bdiag[cnt++]; 1877 break; 1878 case 2: 1879 x1 = x[row]; x2 = x[row+1]; 1880 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 1881 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 1882 x[row++] = tmp1; 1883 x[row++] = tmp2; 1884 cnt += 4; 1885 break; 1886 case 3: 1887 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 1888 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 1889 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 1890 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 1891 x[row++] = tmp1; 1892 x[row++] = tmp2; 1893 x[row++] = tmp3; 1894 cnt += 9; 1895 break; 1896 case 4: 1897 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 1898 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 1899 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 1900 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 1901 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 1902 x[row++] = tmp1; 1903 x[row++] = tmp2; 1904 x[row++] = tmp3; 1905 x[row++] = tmp4; 1906 cnt += 16; 1907 break; 1908 case 5: 1909 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 1910 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 1911 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 1912 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 1913 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 1914 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 1915 x[row++] = tmp1; 1916 x[row++] = tmp2; 1917 x[row++] = tmp3; 1918 x[row++] = tmp4; 1919 x[row++] = tmp5; 1920 cnt += 25; 1921 break; 1922 default: 1923 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 1924 } 1925 } 1926 ierr = PetscLogFlops(m);CHKERRQ(ierr); 1927 } 1928 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 1929 1930 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 1931 for (i=m-1, row=A->rmap.n-1; i>=0; i--) { 1932 ibdiag -= sizes[i]*sizes[i]; 1933 sz = ii[row+1] - diag[row] - 1; 1934 v1 = a->a + diag[row] + 1; 1935 idx = a->j + diag[row] + 1; 1936 1937 /* see comments for MatMult_Inode() for how this is coded */ 1938 switch (sizes[i]){ 1939 case 1: 1940 1941 sum1 = xb[row]; 1942 for(n = 0; n<sz-1; n+=2) { 1943 i1 = idx[0]; 1944 i2 = idx[1]; 1945 idx += 2; 1946 tmp0 = x[i1]; 1947 tmp1 = x[i2]; 1948 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1949 } 1950 1951 if (n == sz-1){ 1952 tmp0 = x[*idx]; 1953 sum1 -= *v1*tmp0; 1954 } 1955 x[row--] = sum1*(*ibdiag); 1956 break; 1957 1958 case 2: 1959 1960 sum1 = xb[row]; 1961 sum2 = xb[row-1]; 1962 /* note that sum1 is associated with the second of the two rows */ 1963 v2 = a->a + diag[row-1] + 2; 1964 for(n = 0; n<sz-1; n+=2) { 1965 i1 = idx[0]; 1966 i2 = idx[1]; 1967 idx += 2; 1968 tmp0 = x[i1]; 1969 tmp1 = x[i2]; 1970 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1971 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1972 } 1973 1974 if (n == sz-1){ 1975 tmp0 = x[*idx]; 1976 sum1 -= *v1*tmp0; 1977 sum2 -= *v2*tmp0; 1978 } 1979 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 1980 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 1981 break; 1982 case 3: 1983 1984 sum1 = xb[row]; 1985 sum2 = xb[row-1]; 1986 sum3 = xb[row-2]; 1987 v2 = a->a + diag[row-1] + 2; 1988 v3 = a->a + diag[row-2] + 3; 1989 for(n = 0; n<sz-1; n+=2) { 1990 i1 = idx[0]; 1991 i2 = idx[1]; 1992 idx += 2; 1993 tmp0 = x[i1]; 1994 tmp1 = x[i2]; 1995 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 1996 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 1997 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 1998 } 1999 2000 if (n == sz-1){ 2001 tmp0 = x[*idx]; 2002 sum1 -= *v1*tmp0; 2003 sum2 -= *v2*tmp0; 2004 sum3 -= *v3*tmp0; 2005 } 2006 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2007 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2008 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2009 break; 2010 case 4: 2011 2012 sum1 = xb[row]; 2013 sum2 = xb[row-1]; 2014 sum3 = xb[row-2]; 2015 sum4 = xb[row-3]; 2016 v2 = a->a + diag[row-1] + 2; 2017 v3 = a->a + diag[row-2] + 3; 2018 v4 = a->a + diag[row-3] + 4; 2019 for(n = 0; n<sz-1; n+=2) { 2020 i1 = idx[0]; 2021 i2 = idx[1]; 2022 idx += 2; 2023 tmp0 = x[i1]; 2024 tmp1 = x[i2]; 2025 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2026 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2027 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2028 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2029 } 2030 2031 if (n == sz-1){ 2032 tmp0 = x[*idx]; 2033 sum1 -= *v1*tmp0; 2034 sum2 -= *v2*tmp0; 2035 sum3 -= *v3*tmp0; 2036 sum4 -= *v4*tmp0; 2037 } 2038 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2039 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2040 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2041 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2042 break; 2043 case 5: 2044 2045 sum1 = xb[row]; 2046 sum2 = xb[row-1]; 2047 sum3 = xb[row-2]; 2048 sum4 = xb[row-3]; 2049 sum5 = xb[row-4]; 2050 v2 = a->a + diag[row-1] + 2; 2051 v3 = a->a + diag[row-2] + 3; 2052 v4 = a->a + diag[row-3] + 4; 2053 v5 = a->a + diag[row-4] + 5; 2054 for(n = 0; n<sz-1; n+=2) { 2055 i1 = idx[0]; 2056 i2 = idx[1]; 2057 idx += 2; 2058 tmp0 = x[i1]; 2059 tmp1 = x[i2]; 2060 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2061 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2062 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2063 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2064 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2065 } 2066 2067 if (n == sz-1){ 2068 tmp0 = x[*idx]; 2069 sum1 -= *v1*tmp0; 2070 sum2 -= *v2*tmp0; 2071 sum3 -= *v3*tmp0; 2072 sum4 -= *v4*tmp0; 2073 sum5 -= *v5*tmp0; 2074 } 2075 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 2076 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 2077 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 2078 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 2079 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 2080 break; 2081 default: 2082 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2083 } 2084 } 2085 2086 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2087 } 2088 its--; 2089 } 2090 if (its) SETERRQ(PETSC_ERR_SUP,"Currently no support for multiply SOR sweeps using inode version of AIJ matrix format;\n run with the option -mat_no_inode"); 2091 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2092 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 2093 PetscFunctionReturn(0); 2094 } 2095 2096 2097 /* 2098 samestructure indicates that the matrix has not changed its nonzero structure so we 2099 do not need to recompute the inodes 2100 */ 2101 #undef __FUNCT__ 2102 #define __FUNCT__ "Mat_CheckInode" 2103 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 2104 { 2105 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2106 PetscErrorCode ierr; 2107 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 2108 PetscTruth flag; 2109 2110 PetscFunctionBegin; 2111 if (!a->inode.use) PetscFunctionReturn(0); 2112 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 2113 2114 2115 m = A->rmap.n; 2116 if (a->inode.size) {ns = a->inode.size;} 2117 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 2118 2119 i = 0; 2120 node_count = 0; 2121 idx = a->j; 2122 ii = a->i; 2123 while (i < m){ /* For each row */ 2124 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 2125 /* Limits the number of elements in a node to 'a->inode.limit' */ 2126 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 2127 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 2128 if (nzy != nzx) break; 2129 idy += nzx; /* Same nonzero pattern */ 2130 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 2131 if (!flag) break; 2132 } 2133 ns[node_count++] = blk_size; 2134 idx += blk_size*nzx; 2135 i = j; 2136 } 2137 /* If not enough inodes found,, do not use inode version of the routines */ 2138 if (!a->inode.size && m && node_count > .9*m) { 2139 ierr = PetscFree(ns);CHKERRQ(ierr); 2140 a->inode.node_count = 0; 2141 a->inode.size = PETSC_NULL; 2142 a->inode.use = PETSC_FALSE; 2143 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 2144 } else { 2145 A->ops->mult = MatMult_Inode; 2146 A->ops->relax = MatRelax_Inode; 2147 A->ops->multadd = MatMultAdd_Inode; 2148 A->ops->solve = MatSolve_Inode; 2149 A->ops->lufactornumeric = MatLUFactorNumeric_Inode; 2150 A->ops->getrowij = MatGetRowIJ_Inode; 2151 A->ops->restorerowij = MatRestoreRowIJ_Inode; 2152 A->ops->getcolumnij = MatGetColumnIJ_Inode; 2153 A->ops->restorecolumnij = MatRestoreColumnIJ_Inode; 2154 A->ops->coloringpatch = MatColoringPatch_Inode; 2155 a->inode.node_count = node_count; 2156 a->inode.size = ns; 2157 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 2158 } 2159 PetscFunctionReturn(0); 2160 } 2161 2162 /* 2163 This is really ugly. if inodes are used this replaces the 2164 permutations with ones that correspond to rows/cols of the matrix 2165 rather then inode blocks 2166 */ 2167 #undef __FUNCT__ 2168 #define __FUNCT__ "MatInodeAdjustForInodes" 2169 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 2170 { 2171 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 2172 2173 PetscFunctionBegin; 2174 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 2175 if (f) { 2176 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 2177 } 2178 PetscFunctionReturn(0); 2179 } 2180 2181 EXTERN_C_BEGIN 2182 #undef __FUNCT__ 2183 #define __FUNCT__ "MatAdjustForInodes_Inode" 2184 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_Inode(Mat A,IS *rperm,IS *cperm) 2185 { 2186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 2187 PetscErrorCode ierr; 2188 PetscInt m = A->rmap.n,n = A->cmap.n,i,j,*ridx,*cidx,nslim_row = a->inode.node_count; 2189 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 2190 PetscInt nslim_col,*ns_col; 2191 IS ris = *rperm,cis = *cperm; 2192 2193 PetscFunctionBegin; 2194 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 2195 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 2196 2197 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 2198 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 2199 ierr = PetscMalloc((m+n+1)*sizeof(PetscInt),&permr);CHKERRQ(ierr); 2200 permc = permr + m; 2201 2202 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 2203 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 2204 2205 /* Form the inode structure for the rows of permuted matric using inv perm*/ 2206 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 2207 2208 /* Construct the permutations for rows*/ 2209 for (i=0,row = 0; i<nslim_row; ++i){ 2210 indx = ridx[i]; 2211 start_val = tns[indx]; 2212 end_val = tns[indx + 1]; 2213 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 2214 } 2215 2216 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 2217 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 2218 2219 /* Construct permutations for columns */ 2220 for (i=0,col=0; i<nslim_col; ++i){ 2221 indx = cidx[i]; 2222 start_val = tns[indx]; 2223 end_val = tns[indx + 1]; 2224 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 2225 } 2226 2227 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 2228 ISSetPermutation(*rperm); 2229 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 2230 ISSetPermutation(*cperm); 2231 2232 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 2233 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 2234 2235 ierr = PetscFree(ns_col);CHKERRQ(ierr); 2236 ierr = PetscFree(permr);CHKERRQ(ierr); 2237 ierr = ISDestroy(cis);CHKERRQ(ierr); 2238 ierr = ISDestroy(ris);CHKERRQ(ierr); 2239 ierr = PetscFree(tns);CHKERRQ(ierr); 2240 PetscFunctionReturn(0); 2241 } 2242 EXTERN_C_END 2243 2244 #undef __FUNCT__ 2245 #define __FUNCT__ "MatInodeGetInodeSizes" 2246 /*@C 2247 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 2248 2249 Collective on Mat 2250 2251 Input Parameter: 2252 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 2253 2254 Output Parameter: 2255 + node_count - no of inodes present in the matrix. 2256 . sizes - an array of size node_count,with sizes of each inode. 2257 - limit - the max size used to generate the inodes. 2258 2259 Level: advanced 2260 2261 Notes: This routine returns some internal storage information 2262 of the matrix, it is intended to be used by advanced users. 2263 It should be called after the matrix is assembled. 2264 The contents of the sizes[] array should not be changed. 2265 PETSC_NULL may be passed for information not requested. 2266 2267 .keywords: matrix, seqaij, get, inode 2268 2269 .seealso: MatGetInfo() 2270 @*/ 2271 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2272 { 2273 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 2274 2275 PetscFunctionBegin; 2276 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 2277 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 2278 if (f) { 2279 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 2280 } 2281 PetscFunctionReturn(0); 2282 } 2283 2284 EXTERN_C_BEGIN 2285 #undef __FUNCT__ 2286 #define __FUNCT__ "MatInodeGetInodeSizes_Inode" 2287 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 2288 { 2289 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2290 2291 PetscFunctionBegin; 2292 if (node_count) *node_count = a->inode.node_count; 2293 if (sizes) *sizes = a->inode.size; 2294 if (limit) *limit = a->inode.limit; 2295 PetscFunctionReturn(0); 2296 } 2297 EXTERN_C_END 2298