1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1195 FactorShiftCtx sctx; 1196 const PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,node_max,col; 1200 const PetscInt *ns; 1201 PetscInt *tmp_vec1,*tmp_vec2,*nsmap,newshift = 0; 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) 1216 rs += PetscAbsScalar(v[j]); 1217 if (rs>sctx.shift_top) sctx.shift_top = rs; 1218 } 1219 sctx.shift_top *= 1.1; 1220 sctx.nshift_max = 5; 1221 sctx.shift_lo = 0.; 1222 sctx.shift_hi = 1.; 1223 } 1224 1225 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1226 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1227 1228 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1230 rtmp2 = rtmp1 + n; 1231 rtmp3 = rtmp2 + n; 1232 rtmp4 = rtmp3 + n; 1233 ics = ic; 1234 1235 node_max = a->inode.node_count; 1236 ns = a->inode.size; 1237 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1238 1239 /* If max inode size > 4, split it into two inodes.*/ 1240 /* also map the inode sizes according to the ordering */ 1241 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1242 for (i=0,j=0; i<node_max; ++i,++j){ 1243 if (ns[i]>3) { 1244 tmp_vec1[j] = ns[i]/2; 1245 ++j; 1246 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1247 } else { 1248 tmp_vec1[j] = ns[i]; 1249 } 1250 } 1251 /* Use the correct node_max */ 1252 node_max = j; 1253 1254 /* Now reorder the inode info based on mat re-ordering info */ 1255 /* First create a row -> inode_size_array_index map */ 1256 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1257 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1258 for (i=0,row=0; i<node_max; i++) { 1259 nodesz = tmp_vec1[i]; 1260 for (j=0; j<nodesz; j++,row++) { 1261 nsmap[row] = i; 1262 } 1263 } 1264 /* Using nsmap, create a reordered ns structure */ 1265 for (i=0,j=0; i< node_max; i++) { 1266 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1267 tmp_vec2[i] = nodesz; 1268 j += nodesz; 1269 } 1270 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1271 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1272 1273 /* Now use the correct ns */ 1274 ns = tmp_vec2; 1275 1276 do { 1277 sctx.useshift = PETSC_FALSE; 1278 /* Now loop over each block-row, and do the factorization */ 1279 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1280 nodesz = ns[inod]; 1281 1282 switch (nodesz){ 1283 case 1: 1284 /*----------*/ 1285 /* zero rtmp1 */ 1286 /* L part */ 1287 nz = bi[i+1] - bi[i]; 1288 bjtmp = bj + bi[i]; 1289 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1290 1291 /* U part */ 1292 nz = bdiag[i]-bdiag[i+1]; 1293 bjtmp = bj + bdiag[i+1]+1; 1294 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1295 1296 /* load in initial (unfactored row) */ 1297 nz = ai[r[i]+1] - ai[r[i]]; 1298 ajtmp = aj + ai[r[i]]; 1299 v = aa + ai[r[i]]; 1300 for (j=0; j<nz; j++) { 1301 rtmp1[ics[ajtmp[j]]] = v[j]; 1302 } 1303 /* ZeropivotApply() */ 1304 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1305 1306 /* elimination */ 1307 bjtmp = bj + bi[i]; 1308 row = *bjtmp++; 1309 nzL = bi[i+1] - bi[i]; 1310 for(k=0; k < nzL;k++) { 1311 pc = rtmp1 + row; 1312 if (*pc != 0.0) { 1313 pv = b->a + bdiag[row]; 1314 mul1 = *pc * (*pv); 1315 *pc = mul1; 1316 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1317 pv = b->a + bdiag[row+1]+1; 1318 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1319 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1320 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1321 } 1322 row = *bjtmp++; 1323 } 1324 1325 /* finished row so stick it into b->a */ 1326 rs = 0.0; 1327 /* L part */ 1328 pv = b->a + bi[i] ; 1329 pj = b->j + bi[i] ; 1330 nz = bi[i+1] - bi[i]; 1331 for (j=0; j<nz; j++) { 1332 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1333 } 1334 1335 /* U part */ 1336 pv = b->a + bdiag[i+1]+1; 1337 pj = b->j + bdiag[i+1]+1; 1338 nz = bdiag[i] - bdiag[i+1]-1; 1339 for (j=0; j<nz; j++) { 1340 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1341 } 1342 1343 /* Check zero pivot */ 1344 sctx.rs = rs; 1345 sctx.pv = rtmp1[i]; 1346 ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr); 1347 if(newshift == 1) break; 1348 1349 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1350 pv = b->a + bdiag[i]; 1351 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1352 break; 1353 1354 case 2: 1355 /*----------*/ 1356 /* zero rtmp1 and rtmp2 */ 1357 /* L part */ 1358 nz = bi[i+1] - bi[i]; 1359 bjtmp = bj + bi[i]; 1360 for (j=0; j<nz; j++) { 1361 col = bjtmp[j]; 1362 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1363 } 1364 1365 /* U part */ 1366 nz = bdiag[i]-bdiag[i+1]; 1367 bjtmp = bj + bdiag[i+1]+1; 1368 for (j=0; j<nz; j++) { 1369 col = bjtmp[j]; 1370 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1371 } 1372 1373 /* load in initial (unfactored row) */ 1374 nz = ai[r[i]+1] - ai[r[i]]; 1375 ajtmp = aj + ai[r[i]]; 1376 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1377 for (j=0; j<nz; j++) { 1378 col = ics[ajtmp[j]]; 1379 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1380 } 1381 /* ZeropivotApply(): shift the diagonal of the matrix */ 1382 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1383 1384 /* elimination */ 1385 bjtmp = bj + bi[i]; 1386 row = *bjtmp++; /* pivot row */ 1387 nzL = bi[i+1] - bi[i]; 1388 for(k=0; k < nzL;k++) { 1389 pc1 = rtmp1 + row; 1390 pc2 = rtmp2 + row; 1391 if (*pc1 != 0.0 || *pc2 != 0.0) { 1392 pv = b->a + bdiag[row]; 1393 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1394 *pc1 = mul1; *pc2 = mul2; 1395 1396 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1397 pv = b->a + bdiag[row+1]+1; 1398 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1399 for (j=0; j<nz; j++){ 1400 col = pj[j]; 1401 rtmp1[col] -= mul1 * pv[j]; 1402 rtmp2[col] -= mul2 * pv[j]; 1403 } 1404 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1405 } 1406 row = *bjtmp++; 1407 } 1408 1409 /* finished row i; check zero pivot, then stick row i into b->a */ 1410 rs = 0.0; 1411 /* L part */ 1412 pc1 = b->a + bi[i]; 1413 pj = b->j + bi[i] ; 1414 nz = bi[i+1] - bi[i]; 1415 for (j=0; j<nz; j++) { 1416 col = pj[j]; 1417 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1418 } 1419 /* U part */ 1420 pc1 = b->a + bdiag[i+1]+1; 1421 pj = b->j + bdiag[i+1]+1; 1422 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1423 for (j=0; j<nz; j++) { 1424 col = pj[j]; 1425 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1426 } 1427 1428 sctx.rs = rs; 1429 sctx.pv = rtmp1[i]; 1430 ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr); 1431 if(newshift == 1) break; 1432 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1433 *pc1 = 1.0/sctx.pv; 1434 1435 /* Now take care of diagonal 2x2 block. */ 1436 pc2 = rtmp2 + i; 1437 if (*pc2 != 0.0){ 1438 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1439 *pc2 = mul1; /* insert L entry */ 1440 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1441 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1442 for (j=0; j<nz; j++) { 1443 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1444 } 1445 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1446 } 1447 1448 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1449 rs = 0.0; 1450 /* L part */ 1451 pc2 = b->a + bi[i+1]; 1452 pj = b->j + bi[i+1] ; 1453 nz = bi[i+2] - bi[i+1]; 1454 for (j=0; j<nz; j++) { 1455 col = pj[j]; 1456 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1457 } 1458 /* U part */ 1459 pc2 = b->a + bdiag[i+2]+1; 1460 pj = b->j + bdiag[i+2]+1; 1461 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1462 for (j=0; j<nz; j++) { 1463 col = pj[j]; 1464 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1465 } 1466 1467 sctx.rs = rs; 1468 sctx.pv = rtmp2[i+1]; 1469 ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr); 1470 if(newshift == 1) break; 1471 pc2 = b->a + bdiag[i+1]; 1472 *pc2 = 1.0/sctx.pv; 1473 break; 1474 1475 case 3: 1476 /*----------*/ 1477 /* zero rtmp */ 1478 /* L part */ 1479 nz = bi[i+1] - bi[i]; 1480 bjtmp = bj + bi[i]; 1481 for (j=0; j<nz; j++) { 1482 col = bjtmp[j]; 1483 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1484 } 1485 1486 /* U part */ 1487 nz = bdiag[i]-bdiag[i+1]; 1488 bjtmp = bj + bdiag[i+1]+1; 1489 for (j=0; j<nz; j++) { 1490 col = bjtmp[j]; 1491 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1492 } 1493 1494 /* load in initial (unfactored row) */ 1495 nz = ai[r[i]+1] - ai[r[i]]; 1496 ajtmp = aj + ai[r[i]]; 1497 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1498 for (j=0; j<nz; j++) { 1499 col = ics[ajtmp[j]]; 1500 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1501 } 1502 /* ZeropivotApply(): shift the diagonal of the matrix */ 1503 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1504 1505 /* elimination */ 1506 bjtmp = bj + bi[i]; 1507 row = *bjtmp++; /* pivot row */ 1508 nzL = bi[i+1] - bi[i]; 1509 for(k=0; k < nzL;k++) { 1510 pc1 = rtmp1 + row; 1511 pc2 = rtmp2 + row; 1512 pc3 = rtmp3 + row; 1513 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1514 pv = b->a + bdiag[row]; 1515 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1516 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1517 1518 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1519 pv = b->a + bdiag[row+1]+1; 1520 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1521 for (j=0; j<nz; j++){ 1522 col = pj[j]; 1523 rtmp1[col] -= mul1 * pv[j]; 1524 rtmp2[col] -= mul2 * pv[j]; 1525 rtmp3[col] -= mul3 * pv[j]; 1526 } 1527 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1528 } 1529 row = *bjtmp++; 1530 } 1531 1532 /* finished row i; check zero pivot, then stick row i into b->a */ 1533 rs = 0.0; 1534 /* L part */ 1535 pc1 = b->a + bi[i]; 1536 pj = b->j + bi[i] ; 1537 nz = bi[i+1] - bi[i]; 1538 for (j=0; j<nz; j++) { 1539 col = pj[j]; 1540 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1541 } 1542 /* U part */ 1543 pc1 = b->a + bdiag[i+1]+1; 1544 pj = b->j + bdiag[i+1]+1; 1545 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1546 for (j=0; j<nz; j++) { 1547 col = pj[j]; 1548 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1549 } 1550 1551 sctx.rs = rs; 1552 sctx.pv = rtmp1[i]; 1553 ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr); 1554 if(newshift == 1) break; 1555 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1556 *pc1 = 1.0/sctx.pv; 1557 1558 /* Now take care of 1st column of diagonal 3x3 block. */ 1559 pc2 = rtmp2 + i; 1560 pc3 = rtmp3 + i; 1561 if (*pc2 != 0.0 || *pc3 != 0.0){ 1562 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1563 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1564 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1565 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1566 for (j=0; j<nz; j++) { 1567 col = pj[j]; 1568 rtmp2[col] -= mul2 * rtmp1[col]; 1569 rtmp3[col] -= mul3 * rtmp1[col]; 1570 } 1571 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1572 } 1573 1574 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1575 rs = 0.0; 1576 /* L part */ 1577 pc2 = b->a + bi[i+1]; 1578 pj = b->j + bi[i+1] ; 1579 nz = bi[i+2] - bi[i+1]; 1580 for (j=0; j<nz; j++) { 1581 col = pj[j]; 1582 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1583 } 1584 /* U part */ 1585 pc2 = b->a + bdiag[i+2]+1; 1586 pj = b->j + bdiag[i+2]+1; 1587 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1588 for (j=0; j<nz; j++) { 1589 col = pj[j]; 1590 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1591 } 1592 1593 sctx.rs = rs; 1594 sctx.pv = rtmp2[i+1]; 1595 ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr); 1596 if(newshift == 1) break; 1597 pc2 = b->a + bdiag[i+1]; 1598 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1599 1600 /* Now take care of 2nd column of diagonal 3x3 block. */ 1601 pc3 = rtmp3 + i+1; 1602 if (*pc3 != 0.0){ 1603 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1604 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1605 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1606 for (j=0; j<nz; j++) { 1607 col = pj[j]; 1608 rtmp3[col] -= mul3 * rtmp2[col]; 1609 } 1610 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1611 } 1612 1613 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1614 rs = 0.0; 1615 /* L part */ 1616 pc3 = b->a + bi[i+2]; 1617 pj = b->j + bi[i+2] ; 1618 nz = bi[i+3] - bi[i+2]; 1619 for (j=0; j<nz; j++) { 1620 col = pj[j]; 1621 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1622 } 1623 /* U part */ 1624 pc3 = b->a + bdiag[i+3]+1; 1625 pj = b->j + bdiag[i+3]+1; 1626 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1627 for (j=0; j<nz; j++) { 1628 col = pj[j]; 1629 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1630 } 1631 1632 sctx.rs = rs; 1633 sctx.pv = rtmp3[i+2]; 1634 ierr = MatPivotCheck(info,&sctx,i+2,&newshift);CHKERRQ(ierr); 1635 if(newshift == 1) break; 1636 pc3 = b->a + bdiag[i+2]; 1637 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1638 break; 1639 case 4: 1640 /*----------*/ 1641 /* zero rtmp */ 1642 /* L part */ 1643 nz = bi[i+1] - bi[i]; 1644 bjtmp = bj + bi[i]; 1645 for (j=0; j<nz; j++) { 1646 col = bjtmp[j]; 1647 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1648 } 1649 1650 /* U part */ 1651 nz = bdiag[i]-bdiag[i+1]; 1652 bjtmp = bj + bdiag[i+1]+1; 1653 for (j=0; j<nz; j++) { 1654 col = bjtmp[j]; 1655 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1656 } 1657 1658 /* load in initial (unfactored row) */ 1659 nz = ai[r[i]+1] - ai[r[i]]; 1660 ajtmp = aj + ai[r[i]]; 1661 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1662 for (j=0; j<nz; j++) { 1663 col = ics[ajtmp[j]]; 1664 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1665 } 1666 /* ZeropivotApply(): shift the diagonal of the matrix */ 1667 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1668 1669 /* elimination */ 1670 bjtmp = bj + bi[i]; 1671 row = *bjtmp++; /* pivot row */ 1672 nzL = bi[i+1] - bi[i]; 1673 for(k=0; k < nzL;k++) { 1674 pc1 = rtmp1 + row; 1675 pc2 = rtmp2 + row; 1676 pc3 = rtmp3 + row; 1677 pc4 = rtmp4 + row; 1678 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1679 pv = b->a + bdiag[row]; 1680 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1681 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1682 1683 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1684 pv = b->a + bdiag[row+1]+1; 1685 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1686 for (j=0; j<nz; j++){ 1687 col = pj[j]; 1688 rtmp1[col] -= mul1 * pv[j]; 1689 rtmp2[col] -= mul2 * pv[j]; 1690 rtmp3[col] -= mul3 * pv[j]; 1691 rtmp4[col] -= mul4 * pv[j]; 1692 } 1693 ierr = PetscLogFlops(8*nz);CHKERRQ(ierr); 1694 } 1695 row = *bjtmp++; 1696 } 1697 1698 /* finished row i; check zero pivot, then stick row i into b->a */ 1699 rs = 0.0; 1700 /* L part */ 1701 pc1 = b->a + bi[i]; 1702 pj = b->j + bi[i] ; 1703 nz = bi[i+1] - bi[i]; 1704 for (j=0; j<nz; j++) { 1705 col = pj[j]; 1706 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1707 } 1708 /* U part */ 1709 pc1 = b->a + bdiag[i+1]+1; 1710 pj = b->j + bdiag[i+1]+1; 1711 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1712 for (j=0; j<nz; j++) { 1713 col = pj[j]; 1714 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1715 } 1716 1717 sctx.rs = rs; 1718 sctx.pv = rtmp1[i]; 1719 ierr = MatPivotCheck(info,&sctx,i,&newshift);CHKERRQ(ierr); 1720 if(newshift == 1) break; 1721 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1722 *pc1 = 1.0/sctx.pv; 1723 1724 /* Now take care of 1st column of diagonal 4x4 block. */ 1725 pc2 = rtmp2 + i; 1726 pc3 = rtmp3 + i; 1727 pc4 = rtmp4 + i; 1728 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){ 1729 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1730 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1731 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1732 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1733 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1734 for (j=0; j<nz; j++) { 1735 col = pj[j]; 1736 rtmp2[col] -= mul2 * rtmp1[col]; 1737 rtmp3[col] -= mul3 * rtmp1[col]; 1738 rtmp4[col] -= mul4 * rtmp1[col]; 1739 } 1740 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1741 } 1742 1743 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1744 rs = 0.0; 1745 /* L part */ 1746 pc2 = b->a + bi[i+1]; 1747 pj = b->j + bi[i+1] ; 1748 nz = bi[i+2] - bi[i+1]; 1749 for (j=0; j<nz; j++) { 1750 col = pj[j]; 1751 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1752 } 1753 /* U part */ 1754 pc2 = b->a + bdiag[i+2]+1; 1755 pj = b->j + bdiag[i+2]+1; 1756 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1757 for (j=0; j<nz; j++) { 1758 col = pj[j]; 1759 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1760 } 1761 1762 sctx.rs = rs; 1763 sctx.pv = rtmp2[i+1]; 1764 ierr = MatPivotCheck(info,&sctx,i+1,&newshift);CHKERRQ(ierr); 1765 if(newshift == 1) break; 1766 pc2 = b->a + bdiag[i+1]; 1767 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1768 1769 /* Now take care of 2nd column of diagonal 4x4 block. */ 1770 pc3 = rtmp3 + i+1; 1771 pc4 = rtmp4 + i+1; 1772 if (*pc3 != 0.0 || *pc4 != 0.0){ 1773 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1774 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1775 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1776 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1777 for (j=0; j<nz; j++) { 1778 col = pj[j]; 1779 rtmp3[col] -= mul3 * rtmp2[col]; 1780 rtmp4[col] -= mul4 * rtmp2[col]; 1781 } 1782 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1783 } 1784 1785 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1786 rs = 0.0; 1787 /* L part */ 1788 pc3 = b->a + bi[i+2]; 1789 pj = b->j + bi[i+2] ; 1790 nz = bi[i+3] - bi[i+2]; 1791 for (j=0; j<nz; j++) { 1792 col = pj[j]; 1793 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1794 } 1795 /* U part */ 1796 pc3 = b->a + bdiag[i+3]+1; 1797 pj = b->j + bdiag[i+3]+1; 1798 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1799 for (j=0; j<nz; j++) { 1800 col = pj[j]; 1801 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1802 } 1803 1804 sctx.rs = rs; 1805 sctx.pv = rtmp3[i+2]; 1806 ierr = MatPivotCheck(info,&sctx,i+2,&newshift);CHKERRQ(ierr); 1807 if(newshift == 1) break; 1808 pc3 = b->a + bdiag[i+2]; 1809 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1810 1811 /* Now take care of 3rd column of diagonal 4x4 block. */ 1812 pc4 = rtmp4 + i+2; 1813 if (*pc4 != 0.0){ 1814 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1815 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1816 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1817 for (j=0; j<nz; j++) { 1818 col = pj[j]; 1819 rtmp4[col] -= mul4 * rtmp3[col]; 1820 } 1821 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1822 } 1823 1824 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1825 rs = 0.0; 1826 /* L part */ 1827 pc4 = b->a + bi[i+3]; 1828 pj = b->j + bi[i+3] ; 1829 nz = bi[i+4] - bi[i+3]; 1830 for (j=0; j<nz; j++) { 1831 col = pj[j]; 1832 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1833 } 1834 /* U part */ 1835 pc4 = b->a + bdiag[i+4]+1; 1836 pj = b->j + bdiag[i+4]+1; 1837 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1838 for (j=0; j<nz; j++) { 1839 col = pj[j]; 1840 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1841 } 1842 1843 sctx.rs = rs; 1844 sctx.pv = rtmp4[i+3]; 1845 ierr = MatPivotCheck(info,&sctx,i+3,&newshift);CHKERRQ(ierr); 1846 if(newshift == 1) break; 1847 pc4 = b->a + bdiag[i+3]; 1848 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1849 break; 1850 1851 default: 1852 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1853 } 1854 i += nodesz; /* Update the row */ 1855 } 1856 1857 /* MatPivotRefine() */ 1858 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1859 /* 1860 * if no shift in this attempt & shifting & started shifting & can refine, 1861 * then try lower shift 1862 */ 1863 sctx.shift_hi = sctx.shift_fraction; 1864 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1865 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1866 sctx.useshift = PETSC_TRUE; 1867 sctx.nshift++; 1868 } 1869 } while (sctx.useshift); 1870 1871 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1872 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1873 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1874 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1875 1876 C->ops->solve = MatSolve_SeqAIJ; 1877 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1878 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1879 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1880 C->ops->matsolve = MatMatSolve_SeqAIJ; 1881 C->assembled = PETSC_TRUE; 1882 C->preallocated = PETSC_TRUE; 1883 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1884 1885 /* MatShiftView(A,info,&sctx) */ 1886 if (sctx.nshift){ 1887 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1888 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1889 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1890 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1891 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){ 1892 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1893 } 1894 } 1895 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1896 PetscFunctionReturn(0); 1897 } 1898 1899 #undef __FUNCT__ 1900 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1901 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1902 { 1903 Mat C = B; 1904 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1905 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1906 PetscErrorCode ierr; 1907 const PetscInt *r,*ic,*c,*ics; 1908 PetscInt n = A->rmap->n,*bi = b->i; 1909 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1910 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1911 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1912 PetscScalar mul1,mul2,mul3,tmp; 1913 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1914 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1915 PetscReal rs=0.0; 1916 FactorShiftCtx sctx; 1917 PetscInt newshift; 1918 1919 PetscFunctionBegin; 1920 sctx.shift_top = 0; 1921 sctx.nshift_max = 0; 1922 sctx.shift_lo = 0; 1923 sctx.shift_hi = 0; 1924 sctx.shift_fraction = 0; 1925 1926 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1927 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1928 sctx.shift_top = 0; 1929 for (i=0; i<n; i++) { 1930 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1931 rs = 0.0; 1932 ajtmp = aj + ai[i]; 1933 rtmp1 = aa + ai[i]; 1934 nz = ai[i+1] - ai[i]; 1935 for (j=0; j<nz; j++){ 1936 if (*ajtmp != i){ 1937 rs += PetscAbsScalar(*rtmp1++); 1938 } else { 1939 rs -= PetscRealPart(*rtmp1++); 1940 } 1941 ajtmp++; 1942 } 1943 if (rs>sctx.shift_top) sctx.shift_top = rs; 1944 } 1945 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1946 sctx.shift_top *= 1.1; 1947 sctx.nshift_max = 5; 1948 sctx.shift_lo = 0.; 1949 sctx.shift_hi = 1.; 1950 } 1951 sctx.shift_amount = 0; 1952 sctx.nshift = 0; 1953 1954 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1955 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1956 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1957 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1958 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1959 ics = ic ; 1960 rtmp22 = rtmp11 + n; 1961 rtmp33 = rtmp22 + n; 1962 1963 node_max = a->inode.node_count; 1964 ns = a->inode.size; 1965 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1966 1967 /* If max inode size > 3, split it into two inodes.*/ 1968 /* also map the inode sizes according to the ordering */ 1969 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1970 for (i=0,j=0; i<node_max; ++i,++j){ 1971 if (ns[i]>3) { 1972 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1973 ++j; 1974 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1975 } else { 1976 tmp_vec1[j] = ns[i]; 1977 } 1978 } 1979 /* Use the correct node_max */ 1980 node_max = j; 1981 1982 /* Now reorder the inode info based on mat re-ordering info */ 1983 /* First create a row -> inode_size_array_index map */ 1984 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1985 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1986 for (i=0,row=0; i<node_max; i++) { 1987 nodesz = tmp_vec1[i]; 1988 for (j=0; j<nodesz; j++,row++) { 1989 nsmap[row] = i; 1990 } 1991 } 1992 /* Using nsmap, create a reordered ns structure */ 1993 for (i=0,j=0; i< node_max; i++) { 1994 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1995 tmp_vec2[i] = nodesz; 1996 j += nodesz; 1997 } 1998 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1999 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 2000 /* Now use the correct ns */ 2001 ns = tmp_vec2; 2002 2003 do { 2004 sctx.useshift = PETSC_FALSE; 2005 /* Now loop over each block-row, and do the factorization */ 2006 for (i=0,row=0; i<node_max; i++) { 2007 nodesz = ns[i]; 2008 nz = bi[row+1] - bi[row]; 2009 bjtmp = bj + bi[row]; 2010 2011 switch (nodesz){ 2012 case 1: 2013 for (j=0; j<nz; j++){ 2014 idx = bjtmp[j]; 2015 rtmp11[idx] = 0.0; 2016 } 2017 2018 /* load in initial (unfactored row) */ 2019 idx = r[row]; 2020 nz_tmp = ai[idx+1] - ai[idx]; 2021 ajtmp = aj + ai[idx]; 2022 v1 = aa + ai[idx]; 2023 2024 for (j=0; j<nz_tmp; j++) { 2025 idx = ics[ajtmp[j]]; 2026 rtmp11[idx] = v1[j]; 2027 } 2028 rtmp11[ics[r[row]]] += sctx.shift_amount; 2029 2030 prow = *bjtmp++ ; 2031 while (prow < row) { 2032 pc1 = rtmp11 + prow; 2033 if (*pc1 != 0.0){ 2034 pv = ba + bd[prow]; 2035 pj = nbj + bd[prow]; 2036 mul1 = *pc1 * *pv++; 2037 *pc1 = mul1; 2038 nz_tmp = bi[prow+1] - bd[prow] - 1; 2039 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2040 for (j=0; j<nz_tmp; j++) { 2041 tmp = pv[j]; 2042 idx = pj[j]; 2043 rtmp11[idx] -= mul1 * tmp; 2044 } 2045 } 2046 prow = *bjtmp++ ; 2047 } 2048 pj = bj + bi[row]; 2049 pc1 = ba + bi[row]; 2050 2051 sctx.pv = rtmp11[row]; 2052 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2053 rs = 0.0; 2054 for (j=0; j<nz; j++) { 2055 idx = pj[j]; 2056 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2057 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2058 } 2059 sctx.rs = rs; 2060 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2061 if (newshift == 1) goto endofwhile; 2062 break; 2063 2064 case 2: 2065 for (j=0; j<nz; j++) { 2066 idx = bjtmp[j]; 2067 rtmp11[idx] = 0.0; 2068 rtmp22[idx] = 0.0; 2069 } 2070 2071 /* load in initial (unfactored row) */ 2072 idx = r[row]; 2073 nz_tmp = ai[idx+1] - ai[idx]; 2074 ajtmp = aj + ai[idx]; 2075 v1 = aa + ai[idx]; 2076 v2 = aa + ai[idx+1]; 2077 for (j=0; j<nz_tmp; j++) { 2078 idx = ics[ajtmp[j]]; 2079 rtmp11[idx] = v1[j]; 2080 rtmp22[idx] = v2[j]; 2081 } 2082 rtmp11[ics[r[row]]] += sctx.shift_amount; 2083 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2084 2085 prow = *bjtmp++ ; 2086 while (prow < row) { 2087 pc1 = rtmp11 + prow; 2088 pc2 = rtmp22 + prow; 2089 if (*pc1 != 0.0 || *pc2 != 0.0){ 2090 pv = ba + bd[prow]; 2091 pj = nbj + bd[prow]; 2092 mul1 = *pc1 * *pv; 2093 mul2 = *pc2 * *pv; 2094 ++pv; 2095 *pc1 = mul1; 2096 *pc2 = mul2; 2097 2098 nz_tmp = bi[prow+1] - bd[prow] - 1; 2099 for (j=0; j<nz_tmp; j++) { 2100 tmp = pv[j]; 2101 idx = pj[j]; 2102 rtmp11[idx] -= mul1 * tmp; 2103 rtmp22[idx] -= mul2 * tmp; 2104 } 2105 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2106 } 2107 prow = *bjtmp++ ; 2108 } 2109 2110 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2111 pc1 = rtmp11 + prow; 2112 pc2 = rtmp22 + prow; 2113 2114 sctx.pv = *pc1; 2115 pj = bj + bi[prow]; 2116 rs = 0.0; 2117 for (j=0; j<nz; j++){ 2118 idx = pj[j]; 2119 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2120 } 2121 sctx.rs = rs; 2122 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2123 if (newshift == 1) goto endofwhile; 2124 2125 if (*pc2 != 0.0){ 2126 pj = nbj + bd[prow]; 2127 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2128 *pc2 = mul2; 2129 nz_tmp = bi[prow+1] - bd[prow] - 1; 2130 for (j=0; j<nz_tmp; j++) { 2131 idx = pj[j] ; 2132 tmp = rtmp11[idx]; 2133 rtmp22[idx] -= mul2 * tmp; 2134 } 2135 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2136 } 2137 2138 pj = bj + bi[row]; 2139 pc1 = ba + bi[row]; 2140 pc2 = ba + bi[row+1]; 2141 2142 sctx.pv = rtmp22[row+1]; 2143 rs = 0.0; 2144 rtmp11[row] = 1.0/rtmp11[row]; 2145 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2146 /* copy row entries from dense representation to sparse */ 2147 for (j=0; j<nz; j++) { 2148 idx = pj[j]; 2149 pc1[j] = rtmp11[idx]; 2150 pc2[j] = rtmp22[idx]; 2151 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2152 } 2153 sctx.rs = rs; 2154 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2155 if (newshift == 1) goto endofwhile; 2156 break; 2157 2158 case 3: 2159 for (j=0; j<nz; j++) { 2160 idx = bjtmp[j]; 2161 rtmp11[idx] = 0.0; 2162 rtmp22[idx] = 0.0; 2163 rtmp33[idx] = 0.0; 2164 } 2165 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2166 idx = r[row]; 2167 nz_tmp = ai[idx+1] - ai[idx]; 2168 ajtmp = aj + ai[idx]; 2169 v1 = aa + ai[idx]; 2170 v2 = aa + ai[idx+1]; 2171 v3 = aa + ai[idx+2]; 2172 for (j=0; j<nz_tmp; j++) { 2173 idx = ics[ajtmp[j]]; 2174 rtmp11[idx] = v1[j]; 2175 rtmp22[idx] = v2[j]; 2176 rtmp33[idx] = v3[j]; 2177 } 2178 rtmp11[ics[r[row]]] += sctx.shift_amount; 2179 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2180 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2181 2182 /* loop over all pivot row blocks above this row block */ 2183 prow = *bjtmp++ ; 2184 while (prow < row) { 2185 pc1 = rtmp11 + prow; 2186 pc2 = rtmp22 + prow; 2187 pc3 = rtmp33 + prow; 2188 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 2189 pv = ba + bd[prow]; 2190 pj = nbj + bd[prow]; 2191 mul1 = *pc1 * *pv; 2192 mul2 = *pc2 * *pv; 2193 mul3 = *pc3 * *pv; 2194 ++pv; 2195 *pc1 = mul1; 2196 *pc2 = mul2; 2197 *pc3 = mul3; 2198 2199 nz_tmp = bi[prow+1] - bd[prow] - 1; 2200 /* update this row based on pivot row */ 2201 for (j=0; j<nz_tmp; j++) { 2202 tmp = pv[j]; 2203 idx = pj[j]; 2204 rtmp11[idx] -= mul1 * tmp; 2205 rtmp22[idx] -= mul2 * tmp; 2206 rtmp33[idx] -= mul3 * tmp; 2207 } 2208 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 2209 } 2210 prow = *bjtmp++ ; 2211 } 2212 2213 /* Now take care of diagonal 3x3 block in this set of rows */ 2214 /* note: prow = row here */ 2215 pc1 = rtmp11 + prow; 2216 pc2 = rtmp22 + prow; 2217 pc3 = rtmp33 + prow; 2218 2219 sctx.pv = *pc1; 2220 pj = bj + bi[prow]; 2221 rs = 0.0; 2222 for (j=0; j<nz; j++){ 2223 idx = pj[j]; 2224 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2225 } 2226 sctx.rs = rs; 2227 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2228 if (newshift == 1) goto endofwhile; 2229 2230 if (*pc2 != 0.0 || *pc3 != 0.0){ 2231 mul2 = (*pc2)/(*pc1); 2232 mul3 = (*pc3)/(*pc1); 2233 *pc2 = mul2; 2234 *pc3 = mul3; 2235 nz_tmp = bi[prow+1] - bd[prow] - 1; 2236 pj = nbj + bd[prow]; 2237 for (j=0; j<nz_tmp; j++) { 2238 idx = pj[j] ; 2239 tmp = rtmp11[idx]; 2240 rtmp22[idx] -= mul2 * tmp; 2241 rtmp33[idx] -= mul3 * tmp; 2242 } 2243 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2244 } 2245 ++prow; 2246 2247 pc2 = rtmp22 + prow; 2248 pc3 = rtmp33 + prow; 2249 sctx.pv = *pc2; 2250 pj = bj + bi[prow]; 2251 rs = 0.0; 2252 for (j=0; j<nz; j++){ 2253 idx = pj[j]; 2254 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2255 } 2256 sctx.rs = rs; 2257 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2258 if (newshift == 1) goto endofwhile; 2259 2260 if (*pc3 != 0.0){ 2261 mul3 = (*pc3)/(*pc2); 2262 *pc3 = mul3; 2263 pj = nbj + bd[prow]; 2264 nz_tmp = bi[prow+1] - bd[prow] - 1; 2265 for (j=0; j<nz_tmp; j++) { 2266 idx = pj[j] ; 2267 tmp = rtmp22[idx]; 2268 rtmp33[idx] -= mul3 * tmp; 2269 } 2270 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2271 } 2272 2273 pj = bj + bi[row]; 2274 pc1 = ba + bi[row]; 2275 pc2 = ba + bi[row+1]; 2276 pc3 = ba + bi[row+2]; 2277 2278 sctx.pv = rtmp33[row+2]; 2279 rs = 0.0; 2280 rtmp11[row] = 1.0/rtmp11[row]; 2281 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2282 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2283 /* copy row entries from dense representation to sparse */ 2284 for (j=0; j<nz; j++) { 2285 idx = pj[j]; 2286 pc1[j] = rtmp11[idx]; 2287 pc2[j] = rtmp22[idx]; 2288 pc3[j] = rtmp33[idx]; 2289 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2290 } 2291 2292 sctx.rs = rs; 2293 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 2294 if (newshift == 1) goto endofwhile; 2295 break; 2296 2297 default: 2298 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2299 } 2300 row += nodesz; /* Update the row */ 2301 } 2302 endofwhile:; 2303 } while (sctx.useshift); 2304 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2305 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2306 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2307 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2308 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2309 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2310 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2311 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2312 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2313 C->assembled = PETSC_TRUE; 2314 C->preallocated = PETSC_TRUE; 2315 if (sctx.nshift) { 2316 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2317 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2318 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2319 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2320 } 2321 } 2322 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2323 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2324 PetscFunctionReturn(0); 2325 } 2326 2327 2328 /* ----------------------------------------------------------- */ 2329 #undef __FUNCT__ 2330 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2331 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2332 { 2333 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2334 IS iscol = a->col,isrow = a->row; 2335 PetscErrorCode ierr; 2336 const PetscInt *r,*c,*rout,*cout; 2337 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2338 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2339 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2340 PetscScalar sum1,sum2,sum3,sum4,sum5; 2341 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2342 const PetscScalar *b; 2343 2344 PetscFunctionBegin; 2345 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2346 node_max = a->inode.node_count; 2347 ns = a->inode.size; /* Node Size array */ 2348 2349 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2350 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2351 tmp = a->solve_work; 2352 2353 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2354 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2355 2356 /* forward solve the lower triangular */ 2357 tmps = tmp ; 2358 aa = a_a ; 2359 aj = a_j ; 2360 ad = a->diag; 2361 2362 for (i = 0,row = 0; i< node_max; ++i){ 2363 nsz = ns[i]; 2364 aii = ai[row]; 2365 v1 = aa + aii; 2366 vi = aj + aii; 2367 nz = ai[row+1]- ai[row]; 2368 2369 if (i < node_max-1) { 2370 /* Prefetch the indices for the next block */ 2371 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2372 /* Prefetch the data for the next block */ 2373 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2374 } 2375 2376 switch (nsz){ /* Each loop in 'case' is unrolled */ 2377 case 1 : 2378 sum1 = b[r[row]]; 2379 for(j=0; j<nz-1; j+=2){ 2380 i0 = vi[j]; 2381 i1 = vi[j+1]; 2382 tmp0 = tmps[i0]; 2383 tmp1 = tmps[i1]; 2384 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2385 } 2386 if(j == nz-1){ 2387 tmp0 = tmps[vi[j]]; 2388 sum1 -= v1[j]*tmp0; 2389 } 2390 tmp[row++]=sum1; 2391 break; 2392 case 2: 2393 sum1 = b[r[row]]; 2394 sum2 = b[r[row+1]]; 2395 v2 = aa + ai[row+1]; 2396 2397 for(j=0; j<nz-1; j+=2){ 2398 i0 = vi[j]; 2399 i1 = vi[j+1]; 2400 tmp0 = tmps[i0]; 2401 tmp1 = tmps[i1]; 2402 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2403 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2404 } 2405 if(j == nz-1){ 2406 tmp0 = tmps[vi[j]]; 2407 sum1 -= v1[j] *tmp0; 2408 sum2 -= v2[j] *tmp0; 2409 } 2410 sum2 -= v2[nz] * sum1; 2411 tmp[row ++]=sum1; 2412 tmp[row ++]=sum2; 2413 break; 2414 case 3: 2415 sum1 = b[r[row]]; 2416 sum2 = b[r[row+1]]; 2417 sum3 = b[r[row+2]]; 2418 v2 = aa + ai[row+1]; 2419 v3 = aa + ai[row+2]; 2420 2421 for (j=0; j<nz-1; j+=2){ 2422 i0 = vi[j]; 2423 i1 = vi[j+1]; 2424 tmp0 = tmps[i0]; 2425 tmp1 = tmps[i1]; 2426 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2427 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2428 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2429 } 2430 if (j == nz-1){ 2431 tmp0 = tmps[vi[j]]; 2432 sum1 -= v1[j] *tmp0; 2433 sum2 -= v2[j] *tmp0; 2434 sum3 -= v3[j] *tmp0; 2435 } 2436 sum2 -= v2[nz] * sum1; 2437 sum3 -= v3[nz] * sum1; 2438 sum3 -= v3[nz+1] * sum2; 2439 tmp[row ++]=sum1; 2440 tmp[row ++]=sum2; 2441 tmp[row ++]=sum3; 2442 break; 2443 2444 case 4: 2445 sum1 = b[r[row]]; 2446 sum2 = b[r[row+1]]; 2447 sum3 = b[r[row+2]]; 2448 sum4 = b[r[row+3]]; 2449 v2 = aa + ai[row+1]; 2450 v3 = aa + ai[row+2]; 2451 v4 = aa + ai[row+3]; 2452 2453 for (j=0; j<nz-1; j+=2){ 2454 i0 = vi[j]; 2455 i1 = vi[j+1]; 2456 tmp0 = tmps[i0]; 2457 tmp1 = tmps[i1]; 2458 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2459 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2460 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2461 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2462 } 2463 if (j == nz-1){ 2464 tmp0 = tmps[vi[j]]; 2465 sum1 -= v1[j] *tmp0; 2466 sum2 -= v2[j] *tmp0; 2467 sum3 -= v3[j] *tmp0; 2468 sum4 -= v4[j] *tmp0; 2469 } 2470 sum2 -= v2[nz] * sum1; 2471 sum3 -= v3[nz] * sum1; 2472 sum4 -= v4[nz] * sum1; 2473 sum3 -= v3[nz+1] * sum2; 2474 sum4 -= v4[nz+1] * sum2; 2475 sum4 -= v4[nz+2] * sum3; 2476 2477 tmp[row ++]=sum1; 2478 tmp[row ++]=sum2; 2479 tmp[row ++]=sum3; 2480 tmp[row ++]=sum4; 2481 break; 2482 case 5: 2483 sum1 = b[r[row]]; 2484 sum2 = b[r[row+1]]; 2485 sum3 = b[r[row+2]]; 2486 sum4 = b[r[row+3]]; 2487 sum5 = b[r[row+4]]; 2488 v2 = aa + ai[row+1]; 2489 v3 = aa + ai[row+2]; 2490 v4 = aa + ai[row+3]; 2491 v5 = aa + ai[row+4]; 2492 2493 for (j=0; j<nz-1; j+=2){ 2494 i0 = vi[j]; 2495 i1 = vi[j+1]; 2496 tmp0 = tmps[i0]; 2497 tmp1 = tmps[i1]; 2498 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2499 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2500 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2501 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2502 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2503 } 2504 if (j == nz-1){ 2505 tmp0 = tmps[vi[j]]; 2506 sum1 -= v1[j] *tmp0; 2507 sum2 -= v2[j] *tmp0; 2508 sum3 -= v3[j] *tmp0; 2509 sum4 -= v4[j] *tmp0; 2510 sum5 -= v5[j] *tmp0; 2511 } 2512 2513 sum2 -= v2[nz] * sum1; 2514 sum3 -= v3[nz] * sum1; 2515 sum4 -= v4[nz] * sum1; 2516 sum5 -= v5[nz] * sum1; 2517 sum3 -= v3[nz+1] * sum2; 2518 sum4 -= v4[nz+1] * sum2; 2519 sum5 -= v5[nz+1] * sum2; 2520 sum4 -= v4[nz+2] * sum3; 2521 sum5 -= v5[nz+2] * sum3; 2522 sum5 -= v5[nz+3] * sum4; 2523 2524 tmp[row ++]=sum1; 2525 tmp[row ++]=sum2; 2526 tmp[row ++]=sum3; 2527 tmp[row ++]=sum4; 2528 tmp[row ++]=sum5; 2529 break; 2530 default: 2531 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2532 } 2533 } 2534 /* backward solve the upper triangular */ 2535 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2536 nsz = ns[i]; 2537 aii = ad[row+1] + 1; 2538 v1 = aa + aii; 2539 vi = aj + aii; 2540 nz = ad[row]- ad[row+1] - 1; 2541 2542 if (i > 0) { 2543 /* Prefetch the indices for the next block */ 2544 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2545 /* Prefetch the data for the next block */ 2546 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2547 } 2548 2549 switch (nsz){ /* Each loop in 'case' is unrolled */ 2550 case 1 : 2551 sum1 = tmp[row]; 2552 2553 for(j=0 ; j<nz-1; j+=2){ 2554 i0 = vi[j]; 2555 i1 = vi[j+1]; 2556 tmp0 = tmps[i0]; 2557 tmp1 = tmps[i1]; 2558 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2559 } 2560 if (j == nz-1){ 2561 tmp0 = tmps[vi[j]]; 2562 sum1 -= v1[j]*tmp0; 2563 } 2564 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2565 break; 2566 case 2 : 2567 sum1 = tmp[row]; 2568 sum2 = tmp[row-1]; 2569 v2 = aa + ad[row] + 1; 2570 for (j=0 ; j<nz-1; j+=2){ 2571 i0 = vi[j]; 2572 i1 = vi[j+1]; 2573 tmp0 = tmps[i0]; 2574 tmp1 = tmps[i1]; 2575 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2576 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2577 } 2578 if (j == nz-1){ 2579 tmp0 = tmps[vi[j]]; 2580 sum1 -= v1[j]* tmp0; 2581 sum2 -= v2[j+1]* tmp0; 2582 } 2583 2584 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2585 sum2 -= v2[0] * tmp0; 2586 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2587 break; 2588 case 3 : 2589 sum1 = tmp[row]; 2590 sum2 = tmp[row -1]; 2591 sum3 = tmp[row -2]; 2592 v2 = aa + ad[row] + 1; 2593 v3 = aa + ad[row -1] + 1; 2594 for (j=0 ; j<nz-1; j+=2){ 2595 i0 = vi[j]; 2596 i1 = vi[j+1]; 2597 tmp0 = tmps[i0]; 2598 tmp1 = tmps[i1]; 2599 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2600 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2601 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2602 } 2603 if (j== nz-1){ 2604 tmp0 = tmps[vi[j]]; 2605 sum1 -= v1[j] * tmp0; 2606 sum2 -= v2[j+1] * tmp0; 2607 sum3 -= v3[j+2] * tmp0; 2608 } 2609 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2610 sum2 -= v2[0]* tmp0; 2611 sum3 -= v3[1] * tmp0; 2612 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2613 sum3 -= v3[0]* tmp0; 2614 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2615 2616 break; 2617 case 4 : 2618 sum1 = tmp[row]; 2619 sum2 = tmp[row -1]; 2620 sum3 = tmp[row -2]; 2621 sum4 = tmp[row -3]; 2622 v2 = aa + ad[row]+1; 2623 v3 = aa + ad[row -1]+1; 2624 v4 = aa + ad[row -2]+1; 2625 2626 for (j=0 ; j<nz-1; j+=2){ 2627 i0 = vi[j]; 2628 i1 = vi[j+1]; 2629 tmp0 = tmps[i0]; 2630 tmp1 = tmps[i1]; 2631 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2632 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2633 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2634 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2635 } 2636 if (j== nz-1){ 2637 tmp0 = tmps[vi[j]]; 2638 sum1 -= v1[j] * tmp0; 2639 sum2 -= v2[j+1] * tmp0; 2640 sum3 -= v3[j+2] * tmp0; 2641 sum4 -= v4[j+3] * tmp0; 2642 } 2643 2644 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2645 sum2 -= v2[0] * tmp0; 2646 sum3 -= v3[1] * tmp0; 2647 sum4 -= v4[2] * tmp0; 2648 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2649 sum3 -= v3[0] * tmp0; 2650 sum4 -= v4[1] * tmp0; 2651 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2652 sum4 -= v4[0] * tmp0; 2653 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2654 break; 2655 case 5 : 2656 sum1 = tmp[row]; 2657 sum2 = tmp[row -1]; 2658 sum3 = tmp[row -2]; 2659 sum4 = tmp[row -3]; 2660 sum5 = tmp[row -4]; 2661 v2 = aa + ad[row]+1; 2662 v3 = aa + ad[row -1]+1; 2663 v4 = aa + ad[row -2]+1; 2664 v5 = aa + ad[row -3]+1; 2665 for (j=0 ; j<nz-1; j+=2){ 2666 i0 = vi[j]; 2667 i1 = vi[j+1]; 2668 tmp0 = tmps[i0]; 2669 tmp1 = tmps[i1]; 2670 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2671 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2672 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2673 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2674 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2675 } 2676 if (j==nz-1){ 2677 tmp0 = tmps[vi[j]]; 2678 sum1 -= v1[j] * tmp0; 2679 sum2 -= v2[j+1] * tmp0; 2680 sum3 -= v3[j+2] * tmp0; 2681 sum4 -= v4[j+3] * tmp0; 2682 sum5 -= v5[j+4] * tmp0; 2683 } 2684 2685 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2686 sum2 -= v2[0] * tmp0; 2687 sum3 -= v3[1] * tmp0; 2688 sum4 -= v4[2] * tmp0; 2689 sum5 -= v5[3] * tmp0; 2690 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2691 sum3 -= v3[0] * tmp0; 2692 sum4 -= v4[1] * tmp0; 2693 sum5 -= v5[2] * tmp0; 2694 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2695 sum4 -= v4[0] * tmp0; 2696 sum5 -= v5[1] * tmp0; 2697 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2698 sum5 -= v5[0] * tmp0; 2699 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2700 break; 2701 default: 2702 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2703 } 2704 } 2705 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2706 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2707 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2708 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2709 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2710 PetscFunctionReturn(0); 2711 } 2712 2713 2714 /* 2715 Makes a longer coloring[] array and calls the usual code with that 2716 */ 2717 #undef __FUNCT__ 2718 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2719 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2720 { 2721 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2722 PetscErrorCode ierr; 2723 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2724 PetscInt *colorused,i; 2725 ISColoringValue *newcolor; 2726 2727 PetscFunctionBegin; 2728 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2729 /* loop over inodes, marking a color for each column*/ 2730 row = 0; 2731 for (i=0; i<m; i++){ 2732 for (j=0; j<ns[i]; j++) { 2733 newcolor[row++] = coloring[i] + j*ncolors; 2734 } 2735 } 2736 2737 /* eliminate unneeded colors */ 2738 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2739 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2740 for (i=0; i<n; i++) { 2741 colorused[newcolor[i]] = 1; 2742 } 2743 2744 for (i=1; i<5*ncolors; i++) { 2745 colorused[i] += colorused[i-1]; 2746 } 2747 ncolors = colorused[5*ncolors-1]; 2748 for (i=0; i<n; i++) { 2749 newcolor[i] = colorused[newcolor[i]]-1; 2750 } 2751 ierr = PetscFree(colorused);CHKERRQ(ierr); 2752 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2753 ierr = PetscFree(coloring);CHKERRQ(ierr); 2754 PetscFunctionReturn(0); 2755 } 2756 2757 #include "../src/mat/blockinvert.h" 2758 2759 #undef __FUNCT__ 2760 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2761 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2762 { 2763 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2764 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2765 MatScalar *ibdiag,*bdiag,work[25]; 2766 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2767 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2768 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2769 PetscErrorCode ierr; 2770 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2771 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2772 2773 PetscFunctionBegin; 2774 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2775 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2776 if (its > 1) { 2777 /* switch to non-inode version */ 2778 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2779 PetscFunctionReturn(0); 2780 } 2781 2782 if (!a->inode.ibdiagvalid) { 2783 if (!a->inode.ibdiag) { 2784 /* calculate space needed for diagonal blocks */ 2785 for (i=0; i<m; i++) { 2786 cnt += sizes[i]*sizes[i]; 2787 } 2788 a->inode.bdiagsize = cnt; 2789 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2790 } 2791 2792 /* copy over the diagonal blocks and invert them */ 2793 ibdiag = a->inode.ibdiag; 2794 bdiag = a->inode.bdiag; 2795 cnt = 0; 2796 for (i=0, row = 0; i<m; i++) { 2797 for (j=0; j<sizes[i]; j++) { 2798 for (k=0; k<sizes[i]; k++) { 2799 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2800 } 2801 } 2802 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2803 2804 switch(sizes[i]) { 2805 case 1: 2806 /* Create matrix data structure */ 2807 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2808 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2809 break; 2810 case 2: 2811 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2812 break; 2813 case 3: 2814 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2815 break; 2816 case 4: 2817 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2818 break; 2819 case 5: 2820 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2821 break; 2822 default: 2823 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2824 } 2825 cnt += sizes[i]*sizes[i]; 2826 row += sizes[i]; 2827 } 2828 a->inode.ibdiagvalid = PETSC_TRUE; 2829 } 2830 ibdiag = a->inode.ibdiag; 2831 bdiag = a->inode.bdiag; 2832 2833 2834 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2835 if (flag & SOR_ZERO_INITIAL_GUESS) { 2836 PetscScalar *b; 2837 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2838 if (xx != bb) { 2839 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2840 } else { 2841 b = x; 2842 } 2843 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2844 2845 for (i=0, row=0; i<m; i++) { 2846 sz = diag[row] - ii[row]; 2847 v1 = a->a + ii[row]; 2848 idx = a->j + ii[row]; 2849 2850 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2851 switch (sizes[i]){ 2852 case 1: 2853 2854 sum1 = b[row]; 2855 for(n = 0; n<sz-1; n+=2) { 2856 i1 = idx[0]; 2857 i2 = idx[1]; 2858 idx += 2; 2859 tmp0 = x[i1]; 2860 tmp1 = x[i2]; 2861 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2862 } 2863 2864 if (n == sz-1){ 2865 tmp0 = x[*idx]; 2866 sum1 -= *v1 * tmp0; 2867 } 2868 x[row++] = sum1*(*ibdiag++); 2869 break; 2870 case 2: 2871 v2 = a->a + ii[row+1]; 2872 sum1 = b[row]; 2873 sum2 = b[row+1]; 2874 for(n = 0; n<sz-1; n+=2) { 2875 i1 = idx[0]; 2876 i2 = idx[1]; 2877 idx += 2; 2878 tmp0 = x[i1]; 2879 tmp1 = x[i2]; 2880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2882 } 2883 2884 if (n == sz-1){ 2885 tmp0 = x[*idx]; 2886 sum1 -= v1[0] * tmp0; 2887 sum2 -= v2[0] * tmp0; 2888 } 2889 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2890 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2891 ibdiag += 4; 2892 break; 2893 case 3: 2894 v2 = a->a + ii[row+1]; 2895 v3 = a->a + ii[row+2]; 2896 sum1 = b[row]; 2897 sum2 = b[row+1]; 2898 sum3 = b[row+2]; 2899 for(n = 0; n<sz-1; n+=2) { 2900 i1 = idx[0]; 2901 i2 = idx[1]; 2902 idx += 2; 2903 tmp0 = x[i1]; 2904 tmp1 = x[i2]; 2905 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2906 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2907 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2908 } 2909 2910 if (n == sz-1){ 2911 tmp0 = x[*idx]; 2912 sum1 -= v1[0] * tmp0; 2913 sum2 -= v2[0] * tmp0; 2914 sum3 -= v3[0] * tmp0; 2915 } 2916 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2917 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2918 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2919 ibdiag += 9; 2920 break; 2921 case 4: 2922 v2 = a->a + ii[row+1]; 2923 v3 = a->a + ii[row+2]; 2924 v4 = a->a + ii[row+3]; 2925 sum1 = b[row]; 2926 sum2 = b[row+1]; 2927 sum3 = b[row+2]; 2928 sum4 = b[row+3]; 2929 for(n = 0; n<sz-1; n+=2) { 2930 i1 = idx[0]; 2931 i2 = idx[1]; 2932 idx += 2; 2933 tmp0 = x[i1]; 2934 tmp1 = x[i2]; 2935 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2936 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2937 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2938 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2939 } 2940 2941 if (n == sz-1){ 2942 tmp0 = x[*idx]; 2943 sum1 -= v1[0] * tmp0; 2944 sum2 -= v2[0] * tmp0; 2945 sum3 -= v3[0] * tmp0; 2946 sum4 -= v4[0] * tmp0; 2947 } 2948 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2949 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2950 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2951 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2952 ibdiag += 16; 2953 break; 2954 case 5: 2955 v2 = a->a + ii[row+1]; 2956 v3 = a->a + ii[row+2]; 2957 v4 = a->a + ii[row+3]; 2958 v5 = a->a + ii[row+4]; 2959 sum1 = b[row]; 2960 sum2 = b[row+1]; 2961 sum3 = b[row+2]; 2962 sum4 = b[row+3]; 2963 sum5 = b[row+4]; 2964 for(n = 0; n<sz-1; n+=2) { 2965 i1 = idx[0]; 2966 i2 = idx[1]; 2967 idx += 2; 2968 tmp0 = x[i1]; 2969 tmp1 = x[i2]; 2970 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2971 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2972 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2973 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2974 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2975 } 2976 2977 if (n == sz-1){ 2978 tmp0 = x[*idx]; 2979 sum1 -= v1[0] * tmp0; 2980 sum2 -= v2[0] * tmp0; 2981 sum3 -= v3[0] * tmp0; 2982 sum4 -= v4[0] * tmp0; 2983 sum5 -= v5[0] * tmp0; 2984 } 2985 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2986 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2987 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2988 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2989 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2990 ibdiag += 25; 2991 break; 2992 default: 2993 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2994 } 2995 } 2996 2997 xb = x; 2998 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2999 } else xb = b; 3000 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 3001 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 3002 cnt = 0; 3003 for (i=0, row=0; i<m; i++) { 3004 3005 switch (sizes[i]){ 3006 case 1: 3007 x[row++] *= bdiag[cnt++]; 3008 break; 3009 case 2: 3010 x1 = x[row]; x2 = x[row+1]; 3011 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3012 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3013 x[row++] = tmp1; 3014 x[row++] = tmp2; 3015 cnt += 4; 3016 break; 3017 case 3: 3018 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3019 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3020 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3021 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3022 x[row++] = tmp1; 3023 x[row++] = tmp2; 3024 x[row++] = tmp3; 3025 cnt += 9; 3026 break; 3027 case 4: 3028 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3029 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3030 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3031 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3032 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3033 x[row++] = tmp1; 3034 x[row++] = tmp2; 3035 x[row++] = tmp3; 3036 x[row++] = tmp4; 3037 cnt += 16; 3038 break; 3039 case 5: 3040 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3041 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3042 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3043 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3044 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3045 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3046 x[row++] = tmp1; 3047 x[row++] = tmp2; 3048 x[row++] = tmp3; 3049 x[row++] = tmp4; 3050 x[row++] = tmp5; 3051 cnt += 25; 3052 break; 3053 default: 3054 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3055 } 3056 } 3057 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3058 } 3059 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 3060 3061 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3062 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3063 ibdiag -= sizes[i]*sizes[i]; 3064 sz = ii[row+1] - diag[row] - 1; 3065 v1 = a->a + diag[row] + 1; 3066 idx = a->j + diag[row] + 1; 3067 3068 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3069 switch (sizes[i]){ 3070 case 1: 3071 3072 sum1 = xb[row]; 3073 for(n = 0; n<sz-1; n+=2) { 3074 i1 = idx[0]; 3075 i2 = idx[1]; 3076 idx += 2; 3077 tmp0 = x[i1]; 3078 tmp1 = x[i2]; 3079 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3080 } 3081 3082 if (n == sz-1){ 3083 tmp0 = x[*idx]; 3084 sum1 -= *v1*tmp0; 3085 } 3086 x[row--] = sum1*(*ibdiag); 3087 break; 3088 3089 case 2: 3090 3091 sum1 = xb[row]; 3092 sum2 = xb[row-1]; 3093 /* note that sum1 is associated with the second of the two rows */ 3094 v2 = a->a + diag[row-1] + 2; 3095 for(n = 0; n<sz-1; n+=2) { 3096 i1 = idx[0]; 3097 i2 = idx[1]; 3098 idx += 2; 3099 tmp0 = x[i1]; 3100 tmp1 = x[i2]; 3101 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3102 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3103 } 3104 3105 if (n == sz-1){ 3106 tmp0 = x[*idx]; 3107 sum1 -= *v1*tmp0; 3108 sum2 -= *v2*tmp0; 3109 } 3110 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3111 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3112 break; 3113 case 3: 3114 3115 sum1 = xb[row]; 3116 sum2 = xb[row-1]; 3117 sum3 = xb[row-2]; 3118 v2 = a->a + diag[row-1] + 2; 3119 v3 = a->a + diag[row-2] + 3; 3120 for(n = 0; n<sz-1; n+=2) { 3121 i1 = idx[0]; 3122 i2 = idx[1]; 3123 idx += 2; 3124 tmp0 = x[i1]; 3125 tmp1 = x[i2]; 3126 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3127 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3128 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3129 } 3130 3131 if (n == sz-1){ 3132 tmp0 = x[*idx]; 3133 sum1 -= *v1*tmp0; 3134 sum2 -= *v2*tmp0; 3135 sum3 -= *v3*tmp0; 3136 } 3137 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3138 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3139 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3140 break; 3141 case 4: 3142 3143 sum1 = xb[row]; 3144 sum2 = xb[row-1]; 3145 sum3 = xb[row-2]; 3146 sum4 = xb[row-3]; 3147 v2 = a->a + diag[row-1] + 2; 3148 v3 = a->a + diag[row-2] + 3; 3149 v4 = a->a + diag[row-3] + 4; 3150 for(n = 0; n<sz-1; n+=2) { 3151 i1 = idx[0]; 3152 i2 = idx[1]; 3153 idx += 2; 3154 tmp0 = x[i1]; 3155 tmp1 = x[i2]; 3156 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3157 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3158 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3159 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3160 } 3161 3162 if (n == sz-1){ 3163 tmp0 = x[*idx]; 3164 sum1 -= *v1*tmp0; 3165 sum2 -= *v2*tmp0; 3166 sum3 -= *v3*tmp0; 3167 sum4 -= *v4*tmp0; 3168 } 3169 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3170 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3171 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3172 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3173 break; 3174 case 5: 3175 3176 sum1 = xb[row]; 3177 sum2 = xb[row-1]; 3178 sum3 = xb[row-2]; 3179 sum4 = xb[row-3]; 3180 sum5 = xb[row-4]; 3181 v2 = a->a + diag[row-1] + 2; 3182 v3 = a->a + diag[row-2] + 3; 3183 v4 = a->a + diag[row-3] + 4; 3184 v5 = a->a + diag[row-4] + 5; 3185 for(n = 0; n<sz-1; n+=2) { 3186 i1 = idx[0]; 3187 i2 = idx[1]; 3188 idx += 2; 3189 tmp0 = x[i1]; 3190 tmp1 = x[i2]; 3191 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3192 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3193 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3194 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3195 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3196 } 3197 3198 if (n == sz-1){ 3199 tmp0 = x[*idx]; 3200 sum1 -= *v1*tmp0; 3201 sum2 -= *v2*tmp0; 3202 sum3 -= *v3*tmp0; 3203 sum4 -= *v4*tmp0; 3204 sum5 -= *v5*tmp0; 3205 } 3206 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3207 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3208 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3209 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3210 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3211 break; 3212 default: 3213 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3214 } 3215 } 3216 3217 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3218 } 3219 its--; 3220 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3221 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 3222 } 3223 if (flag & SOR_EISENSTAT) { 3224 const PetscScalar *b; 3225 MatScalar *t = a->inode.ssor_work; 3226 3227 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3228 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3229 /* 3230 Apply (U + D)^-1 where D is now the block diagonal 3231 */ 3232 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3233 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3234 ibdiag -= sizes[i]*sizes[i]; 3235 sz = ii[row+1] - diag[row] - 1; 3236 v1 = a->a + diag[row] + 1; 3237 idx = a->j + diag[row] + 1; 3238 CHKMEMQ; 3239 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3240 switch (sizes[i]){ 3241 case 1: 3242 3243 sum1 = b[row]; 3244 for(n = 0; n<sz-1; n+=2) { 3245 i1 = idx[0]; 3246 i2 = idx[1]; 3247 idx += 2; 3248 tmp0 = x[i1]; 3249 tmp1 = x[i2]; 3250 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3251 } 3252 3253 if (n == sz-1){ 3254 tmp0 = x[*idx]; 3255 sum1 -= *v1*tmp0; 3256 } 3257 x[row] = sum1*(*ibdiag);row--; 3258 break; 3259 3260 case 2: 3261 3262 sum1 = b[row]; 3263 sum2 = b[row-1]; 3264 /* note that sum1 is associated with the second of the two rows */ 3265 v2 = a->a + diag[row-1] + 2; 3266 for(n = 0; n<sz-1; n+=2) { 3267 i1 = idx[0]; 3268 i2 = idx[1]; 3269 idx += 2; 3270 tmp0 = x[i1]; 3271 tmp1 = x[i2]; 3272 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3273 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3274 } 3275 3276 if (n == sz-1){ 3277 tmp0 = x[*idx]; 3278 sum1 -= *v1*tmp0; 3279 sum2 -= *v2*tmp0; 3280 } 3281 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3282 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3283 row -= 2; 3284 break; 3285 case 3: 3286 3287 sum1 = b[row]; 3288 sum2 = b[row-1]; 3289 sum3 = b[row-2]; 3290 v2 = a->a + diag[row-1] + 2; 3291 v3 = a->a + diag[row-2] + 3; 3292 for(n = 0; n<sz-1; n+=2) { 3293 i1 = idx[0]; 3294 i2 = idx[1]; 3295 idx += 2; 3296 tmp0 = x[i1]; 3297 tmp1 = x[i2]; 3298 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3299 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3300 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3301 } 3302 3303 if (n == sz-1){ 3304 tmp0 = x[*idx]; 3305 sum1 -= *v1*tmp0; 3306 sum2 -= *v2*tmp0; 3307 sum3 -= *v3*tmp0; 3308 } 3309 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3310 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3311 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3312 row -= 3; 3313 break; 3314 case 4: 3315 3316 sum1 = b[row]; 3317 sum2 = b[row-1]; 3318 sum3 = b[row-2]; 3319 sum4 = b[row-3]; 3320 v2 = a->a + diag[row-1] + 2; 3321 v3 = a->a + diag[row-2] + 3; 3322 v4 = a->a + diag[row-3] + 4; 3323 for(n = 0; n<sz-1; n+=2) { 3324 i1 = idx[0]; 3325 i2 = idx[1]; 3326 idx += 2; 3327 tmp0 = x[i1]; 3328 tmp1 = x[i2]; 3329 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3330 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3331 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3332 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3333 } 3334 3335 if (n == sz-1){ 3336 tmp0 = x[*idx]; 3337 sum1 -= *v1*tmp0; 3338 sum2 -= *v2*tmp0; 3339 sum3 -= *v3*tmp0; 3340 sum4 -= *v4*tmp0; 3341 } 3342 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3343 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3344 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3345 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3346 row -= 4; 3347 break; 3348 case 5: 3349 3350 sum1 = b[row]; 3351 sum2 = b[row-1]; 3352 sum3 = b[row-2]; 3353 sum4 = b[row-3]; 3354 sum5 = b[row-4]; 3355 v2 = a->a + diag[row-1] + 2; 3356 v3 = a->a + diag[row-2] + 3; 3357 v4 = a->a + diag[row-3] + 4; 3358 v5 = a->a + diag[row-4] + 5; 3359 for(n = 0; n<sz-1; n+=2) { 3360 i1 = idx[0]; 3361 i2 = idx[1]; 3362 idx += 2; 3363 tmp0 = x[i1]; 3364 tmp1 = x[i2]; 3365 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3366 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3367 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3368 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3369 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3370 } 3371 3372 if (n == sz-1){ 3373 tmp0 = x[*idx]; 3374 sum1 -= *v1*tmp0; 3375 sum2 -= *v2*tmp0; 3376 sum3 -= *v3*tmp0; 3377 sum4 -= *v4*tmp0; 3378 sum5 -= *v5*tmp0; 3379 } 3380 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3381 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3382 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3383 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3384 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3385 row -= 5; 3386 break; 3387 default: 3388 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3389 } 3390 CHKMEMQ; 3391 } 3392 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3393 3394 /* 3395 t = b - D x where D is the block diagonal 3396 */ 3397 cnt = 0; 3398 for (i=0, row=0; i<m; i++) { 3399 CHKMEMQ; 3400 switch (sizes[i]){ 3401 case 1: 3402 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3403 break; 3404 case 2: 3405 x1 = x[row]; x2 = x[row+1]; 3406 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3407 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3408 t[row] = b[row] - tmp1; 3409 t[row+1] = b[row+1] - tmp2; row += 2; 3410 cnt += 4; 3411 break; 3412 case 3: 3413 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3414 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3415 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3416 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3417 t[row] = b[row] - tmp1; 3418 t[row+1] = b[row+1] - tmp2; 3419 t[row+2] = b[row+2] - tmp3; row += 3; 3420 cnt += 9; 3421 break; 3422 case 4: 3423 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3424 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3425 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3426 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3427 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3428 t[row] = b[row] - tmp1; 3429 t[row+1] = b[row+1] - tmp2; 3430 t[row+2] = b[row+2] - tmp3; 3431 t[row+3] = b[row+3] - tmp4; row += 4; 3432 cnt += 16; 3433 break; 3434 case 5: 3435 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3436 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3437 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3438 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3439 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3440 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3441 t[row] = b[row] - tmp1; 3442 t[row+1] = b[row+1] - tmp2; 3443 t[row+2] = b[row+2] - tmp3; 3444 t[row+3] = b[row+3] - tmp4; 3445 t[row+4] = b[row+4] - tmp5;row += 5; 3446 cnt += 25; 3447 break; 3448 default: 3449 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3450 } 3451 CHKMEMQ; 3452 } 3453 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3454 3455 3456 3457 /* 3458 Apply (L + D)^-1 where D is the block diagonal 3459 */ 3460 for (i=0, row=0; i<m; i++) { 3461 sz = diag[row] - ii[row]; 3462 v1 = a->a + ii[row]; 3463 idx = a->j + ii[row]; 3464 CHKMEMQ; 3465 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3466 switch (sizes[i]){ 3467 case 1: 3468 3469 sum1 = t[row]; 3470 for(n = 0; n<sz-1; n+=2) { 3471 i1 = idx[0]; 3472 i2 = idx[1]; 3473 idx += 2; 3474 tmp0 = t[i1]; 3475 tmp1 = t[i2]; 3476 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3477 } 3478 3479 if (n == sz-1){ 3480 tmp0 = t[*idx]; 3481 sum1 -= *v1 * tmp0; 3482 } 3483 x[row] += t[row] = sum1*(*ibdiag++); row++; 3484 break; 3485 case 2: 3486 v2 = a->a + ii[row+1]; 3487 sum1 = t[row]; 3488 sum2 = t[row+1]; 3489 for(n = 0; n<sz-1; n+=2) { 3490 i1 = idx[0]; 3491 i2 = idx[1]; 3492 idx += 2; 3493 tmp0 = t[i1]; 3494 tmp1 = t[i2]; 3495 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3496 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3497 } 3498 3499 if (n == sz-1){ 3500 tmp0 = t[*idx]; 3501 sum1 -= v1[0] * tmp0; 3502 sum2 -= v2[0] * tmp0; 3503 } 3504 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3505 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3506 ibdiag += 4; row += 2; 3507 break; 3508 case 3: 3509 v2 = a->a + ii[row+1]; 3510 v3 = a->a + ii[row+2]; 3511 sum1 = t[row]; 3512 sum2 = t[row+1]; 3513 sum3 = t[row+2]; 3514 for(n = 0; n<sz-1; n+=2) { 3515 i1 = idx[0]; 3516 i2 = idx[1]; 3517 idx += 2; 3518 tmp0 = t[i1]; 3519 tmp1 = t[i2]; 3520 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3521 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3522 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3523 } 3524 3525 if (n == sz-1){ 3526 tmp0 = t[*idx]; 3527 sum1 -= v1[0] * tmp0; 3528 sum2 -= v2[0] * tmp0; 3529 sum3 -= v3[0] * tmp0; 3530 } 3531 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3532 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3533 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3534 ibdiag += 9; row += 3; 3535 break; 3536 case 4: 3537 v2 = a->a + ii[row+1]; 3538 v3 = a->a + ii[row+2]; 3539 v4 = a->a + ii[row+3]; 3540 sum1 = t[row]; 3541 sum2 = t[row+1]; 3542 sum3 = t[row+2]; 3543 sum4 = t[row+3]; 3544 for(n = 0; n<sz-1; n+=2) { 3545 i1 = idx[0]; 3546 i2 = idx[1]; 3547 idx += 2; 3548 tmp0 = t[i1]; 3549 tmp1 = t[i2]; 3550 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3551 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3552 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3553 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3554 } 3555 3556 if (n == sz-1){ 3557 tmp0 = t[*idx]; 3558 sum1 -= v1[0] * tmp0; 3559 sum2 -= v2[0] * tmp0; 3560 sum3 -= v3[0] * tmp0; 3561 sum4 -= v4[0] * tmp0; 3562 } 3563 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3564 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3565 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3566 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3567 ibdiag += 16; row += 4; 3568 break; 3569 case 5: 3570 v2 = a->a + ii[row+1]; 3571 v3 = a->a + ii[row+2]; 3572 v4 = a->a + ii[row+3]; 3573 v5 = a->a + ii[row+4]; 3574 sum1 = t[row]; 3575 sum2 = t[row+1]; 3576 sum3 = t[row+2]; 3577 sum4 = t[row+3]; 3578 sum5 = t[row+4]; 3579 for(n = 0; n<sz-1; n+=2) { 3580 i1 = idx[0]; 3581 i2 = idx[1]; 3582 idx += 2; 3583 tmp0 = t[i1]; 3584 tmp1 = t[i2]; 3585 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3586 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3587 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3588 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3589 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3590 } 3591 3592 if (n == sz-1){ 3593 tmp0 = t[*idx]; 3594 sum1 -= v1[0] * tmp0; 3595 sum2 -= v2[0] * tmp0; 3596 sum3 -= v3[0] * tmp0; 3597 sum4 -= v4[0] * tmp0; 3598 sum5 -= v5[0] * tmp0; 3599 } 3600 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3601 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3602 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3603 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3604 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3605 ibdiag += 25; row += 5; 3606 break; 3607 default: 3608 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3609 } 3610 CHKMEMQ; 3611 } 3612 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3613 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3614 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3615 } 3616 PetscFunctionReturn(0); 3617 } 3618 3619 #undef __FUNCT__ 3620 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3621 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3622 { 3623 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3624 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3625 const MatScalar *bdiag = a->inode.bdiag; 3626 const PetscScalar *b; 3627 PetscErrorCode ierr; 3628 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3629 const PetscInt *sizes = a->inode.size; 3630 3631 PetscFunctionBegin; 3632 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3633 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3634 cnt = 0; 3635 for (i=0, row=0; i<m; i++) { 3636 switch (sizes[i]){ 3637 case 1: 3638 x[row] = b[row]*bdiag[cnt++];row++; 3639 break; 3640 case 2: 3641 x1 = b[row]; x2 = b[row+1]; 3642 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3643 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3644 x[row++] = tmp1; 3645 x[row++] = tmp2; 3646 cnt += 4; 3647 break; 3648 case 3: 3649 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3650 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3651 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3652 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3653 x[row++] = tmp1; 3654 x[row++] = tmp2; 3655 x[row++] = tmp3; 3656 cnt += 9; 3657 break; 3658 case 4: 3659 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3660 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3661 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3662 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3663 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3664 x[row++] = tmp1; 3665 x[row++] = tmp2; 3666 x[row++] = tmp3; 3667 x[row++] = tmp4; 3668 cnt += 16; 3669 break; 3670 case 5: 3671 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3672 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3673 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3674 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3675 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3676 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3677 x[row++] = tmp1; 3678 x[row++] = tmp2; 3679 x[row++] = tmp3; 3680 x[row++] = tmp4; 3681 x[row++] = tmp5; 3682 cnt += 25; 3683 break; 3684 default: 3685 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3686 } 3687 } 3688 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3689 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3690 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3691 PetscFunctionReturn(0); 3692 } 3693 3694 /* 3695 samestructure indicates that the matrix has not changed its nonzero structure so we 3696 do not need to recompute the inodes 3697 */ 3698 #undef __FUNCT__ 3699 #define __FUNCT__ "Mat_CheckInode" 3700 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3701 { 3702 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3703 PetscErrorCode ierr; 3704 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3705 PetscTruth flag; 3706 3707 PetscFunctionBegin; 3708 if (!a->inode.use) PetscFunctionReturn(0); 3709 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3710 3711 3712 m = A->rmap->n; 3713 if (a->inode.size) {ns = a->inode.size;} 3714 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3715 3716 i = 0; 3717 node_count = 0; 3718 idx = a->j; 3719 ii = a->i; 3720 while (i < m){ /* For each row */ 3721 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3722 /* Limits the number of elements in a node to 'a->inode.limit' */ 3723 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3724 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3725 if (nzy != nzx) break; 3726 idy += nzx; /* Same nonzero pattern */ 3727 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3728 if (!flag) break; 3729 } 3730 ns[node_count++] = blk_size; 3731 idx += blk_size*nzx; 3732 i = j; 3733 } 3734 /* If not enough inodes found,, do not use inode version of the routines */ 3735 if (!a->inode.size && m && node_count > .9*m) { 3736 ierr = PetscFree(ns);CHKERRQ(ierr); 3737 a->inode.node_count = 0; 3738 a->inode.size = PETSC_NULL; 3739 a->inode.use = PETSC_FALSE; 3740 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3741 } else { 3742 if (!A->factortype) { 3743 A->ops->mult = MatMult_SeqAIJ_Inode; 3744 A->ops->sor = MatSOR_SeqAIJ_Inode; 3745 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3746 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3747 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3748 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3749 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3750 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3751 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3752 } else { 3753 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 3754 } 3755 a->inode.node_count = node_count; 3756 a->inode.size = ns; 3757 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3758 } 3759 PetscFunctionReturn(0); 3760 } 3761 3762 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3763 PetscInt __k, *__vi; \ 3764 __vi = aj + ai[row]; \ 3765 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3766 __vi = aj + adiag[row]; \ 3767 cols[nzl] = __vi[0];\ 3768 __vi = aj + adiag[row+1]+1;\ 3769 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3770 3771 3772 /* 3773 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3774 Modified from Mat_CheckInode(). 3775 3776 Input Parameters: 3777 + Mat A - ILU or LU matrix factor 3778 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3779 do not need to recompute the inodes 3780 */ 3781 #undef __FUNCT__ 3782 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3783 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3784 { 3785 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3786 PetscErrorCode ierr; 3787 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3788 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3789 PetscTruth flag; 3790 3791 PetscFunctionBegin; 3792 if (!a->inode.use) PetscFunctionReturn(0); 3793 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3794 3795 m = A->rmap->n; 3796 if (a->inode.size) {ns = a->inode.size;} 3797 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3798 3799 i = 0; 3800 node_count = 0; 3801 while (i < m){ /* For each row */ 3802 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3803 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3804 nzx = nzl1 + nzu1 + 1; 3805 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3806 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3807 3808 /* Limits the number of elements in a node to 'a->inode.limit' */ 3809 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3810 nzl2 = ai[j+1] - ai[j]; 3811 nzu2 = adiag[j] - adiag[j+1] - 1; 3812 nzy = nzl2 + nzu2 + 1; 3813 if( nzy != nzx) break; 3814 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3815 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3816 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3817 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3818 ierr = PetscFree(cols2);CHKERRQ(ierr); 3819 } 3820 ns[node_count++] = blk_size; 3821 ierr = PetscFree(cols1);CHKERRQ(ierr); 3822 i = j; 3823 } 3824 /* If not enough inodes found,, do not use inode version of the routines */ 3825 if (!a->inode.size && m && node_count > .9*m) { 3826 ierr = PetscFree(ns);CHKERRQ(ierr); 3827 a->inode.node_count = 0; 3828 a->inode.size = PETSC_NULL; 3829 a->inode.use = PETSC_FALSE; 3830 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3831 } else { 3832 A->ops->mult = 0; 3833 A->ops->sor = 0; 3834 A->ops->multadd = 0; 3835 A->ops->getrowij = 0; 3836 A->ops->restorerowij = 0; 3837 A->ops->getcolumnij = 0; 3838 A->ops->restorecolumnij = 0; 3839 A->ops->coloringpatch = 0; 3840 A->ops->multdiagonalblock = 0; 3841 A->ops->solve = MatSolve_SeqAIJ_Inode; 3842 a->inode.node_count = node_count; 3843 a->inode.size = ns; 3844 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3845 } 3846 PetscFunctionReturn(0); 3847 } 3848 3849 /* 3850 This is really ugly. if inodes are used this replaces the 3851 permutations with ones that correspond to rows/cols of the matrix 3852 rather then inode blocks 3853 */ 3854 #undef __FUNCT__ 3855 #define __FUNCT__ "MatInodeAdjustForInodes" 3856 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3857 { 3858 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3859 3860 PetscFunctionBegin; 3861 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3862 if (f) { 3863 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3864 } 3865 PetscFunctionReturn(0); 3866 } 3867 3868 EXTERN_C_BEGIN 3869 #undef __FUNCT__ 3870 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3871 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3872 { 3873 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3874 PetscErrorCode ierr; 3875 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3876 const PetscInt *ridx,*cidx; 3877 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3878 PetscInt nslim_col,*ns_col; 3879 IS ris = *rperm,cis = *cperm; 3880 3881 PetscFunctionBegin; 3882 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3883 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3884 3885 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3886 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3887 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3888 3889 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3890 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3891 3892 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3893 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3894 3895 /* Construct the permutations for rows*/ 3896 for (i=0,row = 0; i<nslim_row; ++i){ 3897 indx = ridx[i]; 3898 start_val = tns[indx]; 3899 end_val = tns[indx + 1]; 3900 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3901 } 3902 3903 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3904 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3905 3906 /* Construct permutations for columns */ 3907 for (i=0,col=0; i<nslim_col; ++i){ 3908 indx = cidx[i]; 3909 start_val = tns[indx]; 3910 end_val = tns[indx + 1]; 3911 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3912 } 3913 3914 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3915 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3916 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3917 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3918 3919 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3920 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3921 3922 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3923 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3924 ierr = ISDestroy(cis);CHKERRQ(ierr); 3925 ierr = ISDestroy(ris);CHKERRQ(ierr); 3926 ierr = PetscFree(tns);CHKERRQ(ierr); 3927 PetscFunctionReturn(0); 3928 } 3929 EXTERN_C_END 3930 3931 #undef __FUNCT__ 3932 #define __FUNCT__ "MatInodeGetInodeSizes" 3933 /*@C 3934 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3935 3936 Collective on Mat 3937 3938 Input Parameter: 3939 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3940 3941 Output Parameter: 3942 + node_count - no of inodes present in the matrix. 3943 . sizes - an array of size node_count,with sizes of each inode. 3944 - limit - the max size used to generate the inodes. 3945 3946 Level: advanced 3947 3948 Notes: This routine returns some internal storage information 3949 of the matrix, it is intended to be used by advanced users. 3950 It should be called after the matrix is assembled. 3951 The contents of the sizes[] array should not be changed. 3952 PETSC_NULL may be passed for information not requested. 3953 3954 .keywords: matrix, seqaij, get, inode 3955 3956 .seealso: MatGetInfo() 3957 @*/ 3958 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3959 { 3960 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3961 3962 PetscFunctionBegin; 3963 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3964 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3965 if (f) { 3966 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3967 } 3968 PetscFunctionReturn(0); 3969 } 3970 3971 EXTERN_C_BEGIN 3972 #undef __FUNCT__ 3973 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3974 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3975 { 3976 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3977 3978 PetscFunctionBegin; 3979 if (node_count) *node_count = a->inode.node_count; 3980 if (sizes) *sizes = a->inode.size; 3981 if (limit) *limit = a->inode.limit; 3982 PetscFunctionReturn(0); 3983 } 3984 EXTERN_C_END 3985