1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1195 FactorShiftCtx sctx; 1196 const PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,node_max,col; 1200 const PetscInt *ns; 1201 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1202 1203 PetscFunctionBegin; 1204 /* MatPivotSetUp(): initialize shift context sctx */ 1205 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1206 1207 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1208 ddiag = a->diag; 1209 sctx.shift_top = info->zeropivot; 1210 for (i=0; i<n; i++) { 1211 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1212 d = (aa)[ddiag[i]]; 1213 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1214 v = aa+ai[i]; 1215 nz = ai[i+1] - ai[i]; 1216 for (j=0; j<nz; j++) 1217 rs += PetscAbsScalar(v[j]); 1218 if (rs>sctx.shift_top) sctx.shift_top = rs; 1219 } 1220 sctx.shift_top *= 1.1; 1221 sctx.nshift_max = 5; 1222 sctx.shift_lo = 0.; 1223 sctx.shift_hi = 1.; 1224 } 1225 1226 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1227 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1228 1229 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1230 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1231 rtmp2 = rtmp1 + n; 1232 rtmp3 = rtmp2 + n; 1233 rtmp4 = rtmp3 + n; 1234 ics = ic; 1235 1236 node_max = a->inode.node_count; 1237 ns = a->inode.size; 1238 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1239 1240 /* If max inode size > 4, split it into two inodes.*/ 1241 /* also map the inode sizes according to the ordering */ 1242 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1243 for (i=0,j=0; i<node_max; ++i,++j){ 1244 if (ns[i] > 4) { 1245 tmp_vec1[j] = 4; 1246 ++j; 1247 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1248 } else { 1249 tmp_vec1[j] = ns[i]; 1250 } 1251 } 1252 /* Use the correct node_max */ 1253 node_max = j; 1254 1255 /* Now reorder the inode info based on mat re-ordering info */ 1256 /* First create a row -> inode_size_array_index map */ 1257 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1258 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1259 for (i=0,row=0; i<node_max; i++) { 1260 nodesz = tmp_vec1[i]; 1261 for (j=0; j<nodesz; j++,row++) { 1262 nsmap[row] = i; 1263 } 1264 } 1265 /* Using nsmap, create a reordered ns structure */ 1266 for (i=0,j=0; i< node_max; i++) { 1267 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1268 tmp_vec2[i] = nodesz; 1269 j += nodesz; 1270 } 1271 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1272 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1273 1274 /* Now use the correct ns */ 1275 ns = tmp_vec2; 1276 1277 do { 1278 sctx.newshift = PETSC_FALSE; 1279 /* Now loop over each block-row, and do the factorization */ 1280 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1281 nodesz = ns[inod]; 1282 1283 switch (nodesz){ 1284 case 1: 1285 /*----------*/ 1286 /* zero rtmp1 */ 1287 /* L part */ 1288 nz = bi[i+1] - bi[i]; 1289 bjtmp = bj + bi[i]; 1290 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1291 1292 /* U part */ 1293 nz = bdiag[i]-bdiag[i+1]; 1294 bjtmp = bj + bdiag[i+1]+1; 1295 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1296 1297 /* load in initial (unfactored row) */ 1298 nz = ai[r[i]+1] - ai[r[i]]; 1299 ajtmp = aj + ai[r[i]]; 1300 v = aa + ai[r[i]]; 1301 for (j=0; j<nz; j++) { 1302 rtmp1[ics[ajtmp[j]]] = v[j]; 1303 } 1304 /* ZeropivotApply() */ 1305 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1306 1307 /* elimination */ 1308 bjtmp = bj + bi[i]; 1309 row = *bjtmp++; 1310 nzL = bi[i+1] - bi[i]; 1311 for(k=0; k < nzL;k++) { 1312 pc = rtmp1 + row; 1313 if (*pc != 0.0) { 1314 pv = b->a + bdiag[row]; 1315 mul1 = *pc * (*pv); 1316 *pc = mul1; 1317 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1318 pv = b->a + bdiag[row+1]+1; 1319 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1320 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1321 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1322 } 1323 row = *bjtmp++; 1324 } 1325 1326 /* finished row so stick it into b->a */ 1327 rs = 0.0; 1328 /* L part */ 1329 pv = b->a + bi[i] ; 1330 pj = b->j + bi[i] ; 1331 nz = bi[i+1] - bi[i]; 1332 for (j=0; j<nz; j++) { 1333 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1334 } 1335 1336 /* U part */ 1337 pv = b->a + bdiag[i+1]+1; 1338 pj = b->j + bdiag[i+1]+1; 1339 nz = bdiag[i] - bdiag[i+1]-1; 1340 for (j=0; j<nz; j++) { 1341 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* Check zero pivot */ 1345 sctx.rs = rs; 1346 sctx.pv = rtmp1[i]; 1347 ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr); 1348 if(sctx.newshift) break; 1349 1350 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1351 pv = b->a + bdiag[i]; 1352 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1353 break; 1354 1355 case 2: 1356 /*----------*/ 1357 /* zero rtmp1 and rtmp2 */ 1358 /* L part */ 1359 nz = bi[i+1] - bi[i]; 1360 bjtmp = bj + bi[i]; 1361 for (j=0; j<nz; j++) { 1362 col = bjtmp[j]; 1363 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1364 } 1365 1366 /* U part */ 1367 nz = bdiag[i]-bdiag[i+1]; 1368 bjtmp = bj + bdiag[i+1]+1; 1369 for (j=0; j<nz; j++) { 1370 col = bjtmp[j]; 1371 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1372 } 1373 1374 /* load in initial (unfactored row) */ 1375 nz = ai[r[i]+1] - ai[r[i]]; 1376 ajtmp = aj + ai[r[i]]; 1377 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1378 for (j=0; j<nz; j++) { 1379 col = ics[ajtmp[j]]; 1380 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1381 } 1382 /* ZeropivotApply(): shift the diagonal of the matrix */ 1383 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1384 1385 /* elimination */ 1386 bjtmp = bj + bi[i]; 1387 row = *bjtmp++; /* pivot row */ 1388 nzL = bi[i+1] - bi[i]; 1389 for(k=0; k < nzL;k++) { 1390 pc1 = rtmp1 + row; 1391 pc2 = rtmp2 + row; 1392 if (*pc1 != 0.0 || *pc2 != 0.0) { 1393 pv = b->a + bdiag[row]; 1394 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1395 *pc1 = mul1; *pc2 = mul2; 1396 1397 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1398 pv = b->a + bdiag[row+1]+1; 1399 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1400 for (j=0; j<nz; j++){ 1401 col = pj[j]; 1402 rtmp1[col] -= mul1 * pv[j]; 1403 rtmp2[col] -= mul2 * pv[j]; 1404 } 1405 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1406 } 1407 row = *bjtmp++; 1408 } 1409 1410 /* finished row i; check zero pivot, then stick row i into b->a */ 1411 rs = 0.0; 1412 /* L part */ 1413 pc1 = b->a + bi[i]; 1414 pj = b->j + bi[i] ; 1415 nz = bi[i+1] - bi[i]; 1416 for (j=0; j<nz; j++) { 1417 col = pj[j]; 1418 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1419 } 1420 /* U part */ 1421 pc1 = b->a + bdiag[i+1]+1; 1422 pj = b->j + bdiag[i+1]+1; 1423 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1424 for (j=0; j<nz; j++) { 1425 col = pj[j]; 1426 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1427 } 1428 1429 sctx.rs = rs; 1430 sctx.pv = rtmp1[i]; 1431 ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr); 1432 if(sctx.newshift) break; 1433 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1434 *pc1 = 1.0/sctx.pv; 1435 1436 /* Now take care of diagonal 2x2 block. */ 1437 pc2 = rtmp2 + i; 1438 if (*pc2 != 0.0){ 1439 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1440 *pc2 = mul1; /* insert L entry */ 1441 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1442 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1443 for (j=0; j<nz; j++) { 1444 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1445 } 1446 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1447 } 1448 1449 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1450 rs = 0.0; 1451 /* L part */ 1452 pc2 = b->a + bi[i+1]; 1453 pj = b->j + bi[i+1] ; 1454 nz = bi[i+2] - bi[i+1]; 1455 for (j=0; j<nz; j++) { 1456 col = pj[j]; 1457 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1458 } 1459 /* U part */ 1460 pc2 = b->a + bdiag[i+2]+1; 1461 pj = b->j + bdiag[i+2]+1; 1462 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1463 for (j=0; j<nz; j++) { 1464 col = pj[j]; 1465 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1466 } 1467 1468 sctx.rs = rs; 1469 sctx.pv = rtmp2[i+1]; 1470 ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr); 1471 if(sctx.newshift) break; 1472 pc2 = b->a + bdiag[i+1]; 1473 *pc2 = 1.0/sctx.pv; 1474 break; 1475 1476 case 3: 1477 /*----------*/ 1478 /* zero rtmp */ 1479 /* L part */ 1480 nz = bi[i+1] - bi[i]; 1481 bjtmp = bj + bi[i]; 1482 for (j=0; j<nz; j++) { 1483 col = bjtmp[j]; 1484 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1485 } 1486 1487 /* U part */ 1488 nz = bdiag[i]-bdiag[i+1]; 1489 bjtmp = bj + bdiag[i+1]+1; 1490 for (j=0; j<nz; j++) { 1491 col = bjtmp[j]; 1492 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1493 } 1494 1495 /* load in initial (unfactored row) */ 1496 nz = ai[r[i]+1] - ai[r[i]]; 1497 ajtmp = aj + ai[r[i]]; 1498 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1499 for (j=0; j<nz; j++) { 1500 col = ics[ajtmp[j]]; 1501 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1502 } 1503 /* ZeropivotApply(): shift the diagonal of the matrix */ 1504 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1505 1506 /* elimination */ 1507 bjtmp = bj + bi[i]; 1508 row = *bjtmp++; /* pivot row */ 1509 nzL = bi[i+1] - bi[i]; 1510 for(k=0; k < nzL;k++) { 1511 pc1 = rtmp1 + row; 1512 pc2 = rtmp2 + row; 1513 pc3 = rtmp3 + row; 1514 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1515 pv = b->a + bdiag[row]; 1516 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1517 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1518 1519 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1520 pv = b->a + bdiag[row+1]+1; 1521 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1522 for (j=0; j<nz; j++){ 1523 col = pj[j]; 1524 rtmp1[col] -= mul1 * pv[j]; 1525 rtmp2[col] -= mul2 * pv[j]; 1526 rtmp3[col] -= mul3 * pv[j]; 1527 } 1528 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1529 } 1530 row = *bjtmp++; 1531 } 1532 1533 /* finished row i; check zero pivot, then stick row i into b->a */ 1534 rs = 0.0; 1535 /* L part */ 1536 pc1 = b->a + bi[i]; 1537 pj = b->j + bi[i] ; 1538 nz = bi[i+1] - bi[i]; 1539 for (j=0; j<nz; j++) { 1540 col = pj[j]; 1541 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1542 } 1543 /* U part */ 1544 pc1 = b->a + bdiag[i+1]+1; 1545 pj = b->j + bdiag[i+1]+1; 1546 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1547 for (j=0; j<nz; j++) { 1548 col = pj[j]; 1549 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1550 } 1551 1552 sctx.rs = rs; 1553 sctx.pv = rtmp1[i]; 1554 ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr); 1555 if(sctx.newshift) break; 1556 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1557 *pc1 = 1.0/sctx.pv; 1558 1559 /* Now take care of 1st column of diagonal 3x3 block. */ 1560 pc2 = rtmp2 + i; 1561 pc3 = rtmp3 + i; 1562 if (*pc2 != 0.0 || *pc3 != 0.0){ 1563 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1564 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1565 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1566 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1567 for (j=0; j<nz; j++) { 1568 col = pj[j]; 1569 rtmp2[col] -= mul2 * rtmp1[col]; 1570 rtmp3[col] -= mul3 * rtmp1[col]; 1571 } 1572 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1573 } 1574 1575 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1576 rs = 0.0; 1577 /* L part */ 1578 pc2 = b->a + bi[i+1]; 1579 pj = b->j + bi[i+1] ; 1580 nz = bi[i+2] - bi[i+1]; 1581 for (j=0; j<nz; j++) { 1582 col = pj[j]; 1583 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1584 } 1585 /* U part */ 1586 pc2 = b->a + bdiag[i+2]+1; 1587 pj = b->j + bdiag[i+2]+1; 1588 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1589 for (j=0; j<nz; j++) { 1590 col = pj[j]; 1591 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1592 } 1593 1594 sctx.rs = rs; 1595 sctx.pv = rtmp2[i+1]; 1596 ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr); 1597 if(sctx.newshift) break; 1598 pc2 = b->a + bdiag[i+1]; 1599 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1600 1601 /* Now take care of 2nd column of diagonal 3x3 block. */ 1602 pc3 = rtmp3 + i+1; 1603 if (*pc3 != 0.0){ 1604 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1605 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1606 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1607 for (j=0; j<nz; j++) { 1608 col = pj[j]; 1609 rtmp3[col] -= mul3 * rtmp2[col]; 1610 } 1611 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1612 } 1613 1614 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1615 rs = 0.0; 1616 /* L part */ 1617 pc3 = b->a + bi[i+2]; 1618 pj = b->j + bi[i+2] ; 1619 nz = bi[i+3] - bi[i+2]; 1620 for (j=0; j<nz; j++) { 1621 col = pj[j]; 1622 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1623 } 1624 /* U part */ 1625 pc3 = b->a + bdiag[i+3]+1; 1626 pj = b->j + bdiag[i+3]+1; 1627 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1628 for (j=0; j<nz; j++) { 1629 col = pj[j]; 1630 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1631 } 1632 1633 sctx.rs = rs; 1634 sctx.pv = rtmp3[i+2]; 1635 ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr); 1636 if(sctx.newshift) break; 1637 pc3 = b->a + bdiag[i+2]; 1638 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1639 break; 1640 case 4: 1641 /*----------*/ 1642 /* zero rtmp */ 1643 /* L part */ 1644 nz = bi[i+1] - bi[i]; 1645 bjtmp = bj + bi[i]; 1646 for (j=0; j<nz; j++) { 1647 col = bjtmp[j]; 1648 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1649 } 1650 1651 /* U part */ 1652 nz = bdiag[i]-bdiag[i+1]; 1653 bjtmp = bj + bdiag[i+1]+1; 1654 for (j=0; j<nz; j++) { 1655 col = bjtmp[j]; 1656 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1657 } 1658 1659 /* load in initial (unfactored row) */ 1660 nz = ai[r[i]+1] - ai[r[i]]; 1661 ajtmp = aj + ai[r[i]]; 1662 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1663 for (j=0; j<nz; j++) { 1664 col = ics[ajtmp[j]]; 1665 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1666 } 1667 /* ZeropivotApply(): shift the diagonal of the matrix */ 1668 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1669 1670 /* elimination */ 1671 bjtmp = bj + bi[i]; 1672 row = *bjtmp++; /* pivot row */ 1673 nzL = bi[i+1] - bi[i]; 1674 for(k=0; k < nzL;k++) { 1675 pc1 = rtmp1 + row; 1676 pc2 = rtmp2 + row; 1677 pc3 = rtmp3 + row; 1678 pc4 = rtmp4 + row; 1679 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1680 pv = b->a + bdiag[row]; 1681 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1682 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1683 1684 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1685 pv = b->a + bdiag[row+1]+1; 1686 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1687 for (j=0; j<nz; j++){ 1688 col = pj[j]; 1689 rtmp1[col] -= mul1 * pv[j]; 1690 rtmp2[col] -= mul2 * pv[j]; 1691 rtmp3[col] -= mul3 * pv[j]; 1692 rtmp4[col] -= mul4 * pv[j]; 1693 } 1694 ierr = PetscLogFlops(8*nz);CHKERRQ(ierr); 1695 } 1696 row = *bjtmp++; 1697 } 1698 1699 /* finished row i; check zero pivot, then stick row i into b->a */ 1700 rs = 0.0; 1701 /* L part */ 1702 pc1 = b->a + bi[i]; 1703 pj = b->j + bi[i] ; 1704 nz = bi[i+1] - bi[i]; 1705 for (j=0; j<nz; j++) { 1706 col = pj[j]; 1707 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1708 } 1709 /* U part */ 1710 pc1 = b->a + bdiag[i+1]+1; 1711 pj = b->j + bdiag[i+1]+1; 1712 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1713 for (j=0; j<nz; j++) { 1714 col = pj[j]; 1715 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1716 } 1717 1718 sctx.rs = rs; 1719 sctx.pv = rtmp1[i]; 1720 ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr); 1721 if(sctx.newshift) break; 1722 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1723 *pc1 = 1.0/sctx.pv; 1724 1725 /* Now take care of 1st column of diagonal 4x4 block. */ 1726 pc2 = rtmp2 + i; 1727 pc3 = rtmp3 + i; 1728 pc4 = rtmp4 + i; 1729 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){ 1730 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1731 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1732 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1733 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1734 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1735 for (j=0; j<nz; j++) { 1736 col = pj[j]; 1737 rtmp2[col] -= mul2 * rtmp1[col]; 1738 rtmp3[col] -= mul3 * rtmp1[col]; 1739 rtmp4[col] -= mul4 * rtmp1[col]; 1740 } 1741 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1742 } 1743 1744 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1745 rs = 0.0; 1746 /* L part */ 1747 pc2 = b->a + bi[i+1]; 1748 pj = b->j + bi[i+1] ; 1749 nz = bi[i+2] - bi[i+1]; 1750 for (j=0; j<nz; j++) { 1751 col = pj[j]; 1752 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1753 } 1754 /* U part */ 1755 pc2 = b->a + bdiag[i+2]+1; 1756 pj = b->j + bdiag[i+2]+1; 1757 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1758 for (j=0; j<nz; j++) { 1759 col = pj[j]; 1760 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1761 } 1762 1763 sctx.rs = rs; 1764 sctx.pv = rtmp2[i+1]; 1765 ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr); 1766 if(sctx.newshift) break; 1767 pc2 = b->a + bdiag[i+1]; 1768 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1769 1770 /* Now take care of 2nd column of diagonal 4x4 block. */ 1771 pc3 = rtmp3 + i+1; 1772 pc4 = rtmp4 + i+1; 1773 if (*pc3 != 0.0 || *pc4 != 0.0){ 1774 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1775 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1776 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1777 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1778 for (j=0; j<nz; j++) { 1779 col = pj[j]; 1780 rtmp3[col] -= mul3 * rtmp2[col]; 1781 rtmp4[col] -= mul4 * rtmp2[col]; 1782 } 1783 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1784 } 1785 1786 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1787 rs = 0.0; 1788 /* L part */ 1789 pc3 = b->a + bi[i+2]; 1790 pj = b->j + bi[i+2] ; 1791 nz = bi[i+3] - bi[i+2]; 1792 for (j=0; j<nz; j++) { 1793 col = pj[j]; 1794 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1795 } 1796 /* U part */ 1797 pc3 = b->a + bdiag[i+3]+1; 1798 pj = b->j + bdiag[i+3]+1; 1799 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1800 for (j=0; j<nz; j++) { 1801 col = pj[j]; 1802 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1803 } 1804 1805 sctx.rs = rs; 1806 sctx.pv = rtmp3[i+2]; 1807 ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr); 1808 if(sctx.newshift) break; 1809 pc3 = b->a + bdiag[i+2]; 1810 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1811 1812 /* Now take care of 3rd column of diagonal 4x4 block. */ 1813 pc4 = rtmp4 + i+2; 1814 if (*pc4 != 0.0){ 1815 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1816 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1817 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1818 for (j=0; j<nz; j++) { 1819 col = pj[j]; 1820 rtmp4[col] -= mul4 * rtmp3[col]; 1821 } 1822 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1823 } 1824 1825 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1826 rs = 0.0; 1827 /* L part */ 1828 pc4 = b->a + bi[i+3]; 1829 pj = b->j + bi[i+3] ; 1830 nz = bi[i+4] - bi[i+3]; 1831 for (j=0; j<nz; j++) { 1832 col = pj[j]; 1833 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1834 } 1835 /* U part */ 1836 pc4 = b->a + bdiag[i+4]+1; 1837 pj = b->j + bdiag[i+4]+1; 1838 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1839 for (j=0; j<nz; j++) { 1840 col = pj[j]; 1841 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1842 } 1843 1844 sctx.rs = rs; 1845 sctx.pv = rtmp4[i+3]; 1846 ierr = MatPivotCheck(info,&sctx,i+3);CHKERRQ(ierr); 1847 if(sctx.newshift) break; 1848 pc4 = b->a + bdiag[i+3]; 1849 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1850 break; 1851 1852 default: 1853 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1854 } 1855 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1856 i += nodesz; /* Update the row */ 1857 } 1858 1859 /* MatPivotRefine() */ 1860 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1861 /* 1862 * if no shift in this attempt & shifting & started shifting & can refine, 1863 * then try lower shift 1864 */ 1865 sctx.shift_hi = sctx.shift_fraction; 1866 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1867 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1868 sctx.newshift = PETSC_TRUE; 1869 sctx.nshift++; 1870 } 1871 } while (sctx.newshift); 1872 1873 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1874 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1875 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1876 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1877 1878 C->ops->solve = MatSolve_SeqAIJ; 1879 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1880 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1881 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1882 C->ops->matsolve = MatMatSolve_SeqAIJ; 1883 C->assembled = PETSC_TRUE; 1884 C->preallocated = PETSC_TRUE; 1885 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1886 1887 /* MatShiftView(A,info,&sctx) */ 1888 if (sctx.nshift){ 1889 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1890 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1891 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1892 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1893 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){ 1894 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1895 } 1896 } 1897 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1898 PetscFunctionReturn(0); 1899 } 1900 1901 #undef __FUNCT__ 1902 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1903 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1904 { 1905 Mat C = B; 1906 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1907 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1908 PetscErrorCode ierr; 1909 const PetscInt *r,*ic,*c,*ics; 1910 PetscInt n = A->rmap->n,*bi = b->i; 1911 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1912 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1913 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1914 PetscScalar mul1,mul2,mul3,tmp; 1915 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1916 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1917 PetscReal rs=0.0; 1918 FactorShiftCtx sctx; 1919 1920 PetscFunctionBegin; 1921 sctx.shift_top = 0; 1922 sctx.nshift_max = 0; 1923 sctx.shift_lo = 0; 1924 sctx.shift_hi = 0; 1925 sctx.shift_fraction = 0; 1926 1927 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1928 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1929 sctx.shift_top = 0; 1930 for (i=0; i<n; i++) { 1931 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1932 rs = 0.0; 1933 ajtmp = aj + ai[i]; 1934 rtmp1 = aa + ai[i]; 1935 nz = ai[i+1] - ai[i]; 1936 for (j=0; j<nz; j++){ 1937 if (*ajtmp != i){ 1938 rs += PetscAbsScalar(*rtmp1++); 1939 } else { 1940 rs -= PetscRealPart(*rtmp1++); 1941 } 1942 ajtmp++; 1943 } 1944 if (rs>sctx.shift_top) sctx.shift_top = rs; 1945 } 1946 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1947 sctx.shift_top *= 1.1; 1948 sctx.nshift_max = 5; 1949 sctx.shift_lo = 0.; 1950 sctx.shift_hi = 1.; 1951 } 1952 sctx.shift_amount = 0; 1953 sctx.nshift = 0; 1954 1955 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1956 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1957 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1958 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1959 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1960 ics = ic ; 1961 rtmp22 = rtmp11 + n; 1962 rtmp33 = rtmp22 + n; 1963 1964 node_max = a->inode.node_count; 1965 ns = a->inode.size; 1966 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1967 1968 /* If max inode size > 3, split it into two inodes.*/ 1969 /* also map the inode sizes according to the ordering */ 1970 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1971 for (i=0,j=0; i<node_max; ++i,++j){ 1972 if (ns[i]>3) { 1973 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1974 ++j; 1975 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1976 } else { 1977 tmp_vec1[j] = ns[i]; 1978 } 1979 } 1980 /* Use the correct node_max */ 1981 node_max = j; 1982 1983 /* Now reorder the inode info based on mat re-ordering info */ 1984 /* First create a row -> inode_size_array_index map */ 1985 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1986 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1987 for (i=0,row=0; i<node_max; i++) { 1988 nodesz = tmp_vec1[i]; 1989 for (j=0; j<nodesz; j++,row++) { 1990 nsmap[row] = i; 1991 } 1992 } 1993 /* Using nsmap, create a reordered ns structure */ 1994 for (i=0,j=0; i< node_max; i++) { 1995 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1996 tmp_vec2[i] = nodesz; 1997 j += nodesz; 1998 } 1999 ierr = PetscFree(nsmap);CHKERRQ(ierr); 2000 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 2001 /* Now use the correct ns */ 2002 ns = tmp_vec2; 2003 2004 do { 2005 sctx.newshift = PETSC_FALSE; 2006 /* Now loop over each block-row, and do the factorization */ 2007 for (i=0,row=0; i<node_max; i++) { 2008 nodesz = ns[i]; 2009 nz = bi[row+1] - bi[row]; 2010 bjtmp = bj + bi[row]; 2011 2012 switch (nodesz){ 2013 case 1: 2014 for (j=0; j<nz; j++){ 2015 idx = bjtmp[j]; 2016 rtmp11[idx] = 0.0; 2017 } 2018 2019 /* load in initial (unfactored row) */ 2020 idx = r[row]; 2021 nz_tmp = ai[idx+1] - ai[idx]; 2022 ajtmp = aj + ai[idx]; 2023 v1 = aa + ai[idx]; 2024 2025 for (j=0; j<nz_tmp; j++) { 2026 idx = ics[ajtmp[j]]; 2027 rtmp11[idx] = v1[j]; 2028 } 2029 rtmp11[ics[r[row]]] += sctx.shift_amount; 2030 2031 prow = *bjtmp++ ; 2032 while (prow < row) { 2033 pc1 = rtmp11 + prow; 2034 if (*pc1 != 0.0){ 2035 pv = ba + bd[prow]; 2036 pj = nbj + bd[prow]; 2037 mul1 = *pc1 * *pv++; 2038 *pc1 = mul1; 2039 nz_tmp = bi[prow+1] - bd[prow] - 1; 2040 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2041 for (j=0; j<nz_tmp; j++) { 2042 tmp = pv[j]; 2043 idx = pj[j]; 2044 rtmp11[idx] -= mul1 * tmp; 2045 } 2046 } 2047 prow = *bjtmp++ ; 2048 } 2049 pj = bj + bi[row]; 2050 pc1 = ba + bi[row]; 2051 2052 sctx.pv = rtmp11[row]; 2053 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2054 rs = 0.0; 2055 for (j=0; j<nz; j++) { 2056 idx = pj[j]; 2057 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2058 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2059 } 2060 sctx.rs = rs; 2061 ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr); 2062 if (sctx.newshift) goto endofwhile; 2063 break; 2064 2065 case 2: 2066 for (j=0; j<nz; j++) { 2067 idx = bjtmp[j]; 2068 rtmp11[idx] = 0.0; 2069 rtmp22[idx] = 0.0; 2070 } 2071 2072 /* load in initial (unfactored row) */ 2073 idx = r[row]; 2074 nz_tmp = ai[idx+1] - ai[idx]; 2075 ajtmp = aj + ai[idx]; 2076 v1 = aa + ai[idx]; 2077 v2 = aa + ai[idx+1]; 2078 for (j=0; j<nz_tmp; j++) { 2079 idx = ics[ajtmp[j]]; 2080 rtmp11[idx] = v1[j]; 2081 rtmp22[idx] = v2[j]; 2082 } 2083 rtmp11[ics[r[row]]] += sctx.shift_amount; 2084 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2085 2086 prow = *bjtmp++ ; 2087 while (prow < row) { 2088 pc1 = rtmp11 + prow; 2089 pc2 = rtmp22 + prow; 2090 if (*pc1 != 0.0 || *pc2 != 0.0){ 2091 pv = ba + bd[prow]; 2092 pj = nbj + bd[prow]; 2093 mul1 = *pc1 * *pv; 2094 mul2 = *pc2 * *pv; 2095 ++pv; 2096 *pc1 = mul1; 2097 *pc2 = mul2; 2098 2099 nz_tmp = bi[prow+1] - bd[prow] - 1; 2100 for (j=0; j<nz_tmp; j++) { 2101 tmp = pv[j]; 2102 idx = pj[j]; 2103 rtmp11[idx] -= mul1 * tmp; 2104 rtmp22[idx] -= mul2 * tmp; 2105 } 2106 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2107 } 2108 prow = *bjtmp++ ; 2109 } 2110 2111 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2112 pc1 = rtmp11 + prow; 2113 pc2 = rtmp22 + prow; 2114 2115 sctx.pv = *pc1; 2116 pj = bj + bi[prow]; 2117 rs = 0.0; 2118 for (j=0; j<nz; j++){ 2119 idx = pj[j]; 2120 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2121 } 2122 sctx.rs = rs; 2123 ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr); 2124 if (sctx.newshift) goto endofwhile; 2125 2126 if (*pc2 != 0.0){ 2127 pj = nbj + bd[prow]; 2128 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2129 *pc2 = mul2; 2130 nz_tmp = bi[prow+1] - bd[prow] - 1; 2131 for (j=0; j<nz_tmp; j++) { 2132 idx = pj[j] ; 2133 tmp = rtmp11[idx]; 2134 rtmp22[idx] -= mul2 * tmp; 2135 } 2136 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2137 } 2138 2139 pj = bj + bi[row]; 2140 pc1 = ba + bi[row]; 2141 pc2 = ba + bi[row+1]; 2142 2143 sctx.pv = rtmp22[row+1]; 2144 rs = 0.0; 2145 rtmp11[row] = 1.0/rtmp11[row]; 2146 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2147 /* copy row entries from dense representation to sparse */ 2148 for (j=0; j<nz; j++) { 2149 idx = pj[j]; 2150 pc1[j] = rtmp11[idx]; 2151 pc2[j] = rtmp22[idx]; 2152 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2153 } 2154 sctx.rs = rs; 2155 ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr); 2156 if (sctx.newshift) goto endofwhile; 2157 break; 2158 2159 case 3: 2160 for (j=0; j<nz; j++) { 2161 idx = bjtmp[j]; 2162 rtmp11[idx] = 0.0; 2163 rtmp22[idx] = 0.0; 2164 rtmp33[idx] = 0.0; 2165 } 2166 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2167 idx = r[row]; 2168 nz_tmp = ai[idx+1] - ai[idx]; 2169 ajtmp = aj + ai[idx]; 2170 v1 = aa + ai[idx]; 2171 v2 = aa + ai[idx+1]; 2172 v3 = aa + ai[idx+2]; 2173 for (j=0; j<nz_tmp; j++) { 2174 idx = ics[ajtmp[j]]; 2175 rtmp11[idx] = v1[j]; 2176 rtmp22[idx] = v2[j]; 2177 rtmp33[idx] = v3[j]; 2178 } 2179 rtmp11[ics[r[row]]] += sctx.shift_amount; 2180 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2181 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2182 2183 /* loop over all pivot row blocks above this row block */ 2184 prow = *bjtmp++ ; 2185 while (prow < row) { 2186 pc1 = rtmp11 + prow; 2187 pc2 = rtmp22 + prow; 2188 pc3 = rtmp33 + prow; 2189 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 2190 pv = ba + bd[prow]; 2191 pj = nbj + bd[prow]; 2192 mul1 = *pc1 * *pv; 2193 mul2 = *pc2 * *pv; 2194 mul3 = *pc3 * *pv; 2195 ++pv; 2196 *pc1 = mul1; 2197 *pc2 = mul2; 2198 *pc3 = mul3; 2199 2200 nz_tmp = bi[prow+1] - bd[prow] - 1; 2201 /* update this row based on pivot row */ 2202 for (j=0; j<nz_tmp; j++) { 2203 tmp = pv[j]; 2204 idx = pj[j]; 2205 rtmp11[idx] -= mul1 * tmp; 2206 rtmp22[idx] -= mul2 * tmp; 2207 rtmp33[idx] -= mul3 * tmp; 2208 } 2209 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 2210 } 2211 prow = *bjtmp++ ; 2212 } 2213 2214 /* Now take care of diagonal 3x3 block in this set of rows */ 2215 /* note: prow = row here */ 2216 pc1 = rtmp11 + prow; 2217 pc2 = rtmp22 + prow; 2218 pc3 = rtmp33 + prow; 2219 2220 sctx.pv = *pc1; 2221 pj = bj + bi[prow]; 2222 rs = 0.0; 2223 for (j=0; j<nz; j++){ 2224 idx = pj[j]; 2225 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2226 } 2227 sctx.rs = rs; 2228 ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr); 2229 if (sctx.newshift) goto endofwhile; 2230 2231 if (*pc2 != 0.0 || *pc3 != 0.0){ 2232 mul2 = (*pc2)/(*pc1); 2233 mul3 = (*pc3)/(*pc1); 2234 *pc2 = mul2; 2235 *pc3 = mul3; 2236 nz_tmp = bi[prow+1] - bd[prow] - 1; 2237 pj = nbj + bd[prow]; 2238 for (j=0; j<nz_tmp; j++) { 2239 idx = pj[j] ; 2240 tmp = rtmp11[idx]; 2241 rtmp22[idx] -= mul2 * tmp; 2242 rtmp33[idx] -= mul3 * tmp; 2243 } 2244 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2245 } 2246 ++prow; 2247 2248 pc2 = rtmp22 + prow; 2249 pc3 = rtmp33 + prow; 2250 sctx.pv = *pc2; 2251 pj = bj + bi[prow]; 2252 rs = 0.0; 2253 for (j=0; j<nz; j++){ 2254 idx = pj[j]; 2255 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2256 } 2257 sctx.rs = rs; 2258 ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr); 2259 if (sctx.newshift) goto endofwhile; 2260 2261 if (*pc3 != 0.0){ 2262 mul3 = (*pc3)/(*pc2); 2263 *pc3 = mul3; 2264 pj = nbj + bd[prow]; 2265 nz_tmp = bi[prow+1] - bd[prow] - 1; 2266 for (j=0; j<nz_tmp; j++) { 2267 idx = pj[j] ; 2268 tmp = rtmp22[idx]; 2269 rtmp33[idx] -= mul3 * tmp; 2270 } 2271 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2272 } 2273 2274 pj = bj + bi[row]; 2275 pc1 = ba + bi[row]; 2276 pc2 = ba + bi[row+1]; 2277 pc3 = ba + bi[row+2]; 2278 2279 sctx.pv = rtmp33[row+2]; 2280 rs = 0.0; 2281 rtmp11[row] = 1.0/rtmp11[row]; 2282 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2283 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2284 /* copy row entries from dense representation to sparse */ 2285 for (j=0; j<nz; j++) { 2286 idx = pj[j]; 2287 pc1[j] = rtmp11[idx]; 2288 pc2[j] = rtmp22[idx]; 2289 pc3[j] = rtmp33[idx]; 2290 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2291 } 2292 2293 sctx.rs = rs; 2294 ierr = MatPivotCheck(info,&sctx,row+2);CHKERRQ(ierr); 2295 if (sctx.newshift) goto endofwhile; 2296 break; 2297 2298 default: 2299 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2300 } 2301 row += nodesz; /* Update the row */ 2302 } 2303 endofwhile:; 2304 } while (sctx.newshift); 2305 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2306 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2307 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2308 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2309 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2310 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2311 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2312 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2313 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2314 C->assembled = PETSC_TRUE; 2315 C->preallocated = PETSC_TRUE; 2316 if (sctx.nshift) { 2317 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2318 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2319 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2320 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2321 } 2322 } 2323 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2324 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2325 PetscFunctionReturn(0); 2326 } 2327 2328 2329 /* ----------------------------------------------------------- */ 2330 #undef __FUNCT__ 2331 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2332 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2333 { 2334 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2335 IS iscol = a->col,isrow = a->row; 2336 PetscErrorCode ierr; 2337 const PetscInt *r,*c,*rout,*cout; 2338 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2339 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2340 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2341 PetscScalar sum1,sum2,sum3,sum4,sum5; 2342 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2343 const PetscScalar *b; 2344 2345 PetscFunctionBegin; 2346 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2347 node_max = a->inode.node_count; 2348 ns = a->inode.size; /* Node Size array */ 2349 2350 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2351 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2352 tmp = a->solve_work; 2353 2354 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2355 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2356 2357 /* forward solve the lower triangular */ 2358 tmps = tmp ; 2359 aa = a_a ; 2360 aj = a_j ; 2361 ad = a->diag; 2362 2363 for (i = 0,row = 0; i< node_max; ++i){ 2364 nsz = ns[i]; 2365 aii = ai[row]; 2366 v1 = aa + aii; 2367 vi = aj + aii; 2368 nz = ai[row+1]- ai[row]; 2369 2370 if (i < node_max-1) { 2371 /* Prefetch the indices for the next block */ 2372 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2373 /* Prefetch the data for the next block */ 2374 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2375 } 2376 2377 switch (nsz){ /* Each loop in 'case' is unrolled */ 2378 case 1 : 2379 sum1 = b[r[row]]; 2380 for(j=0; j<nz-1; j+=2){ 2381 i0 = vi[j]; 2382 i1 = vi[j+1]; 2383 tmp0 = tmps[i0]; 2384 tmp1 = tmps[i1]; 2385 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2386 } 2387 if(j == nz-1){ 2388 tmp0 = tmps[vi[j]]; 2389 sum1 -= v1[j]*tmp0; 2390 } 2391 tmp[row++]=sum1; 2392 break; 2393 case 2: 2394 sum1 = b[r[row]]; 2395 sum2 = b[r[row+1]]; 2396 v2 = aa + ai[row+1]; 2397 2398 for(j=0; j<nz-1; j+=2){ 2399 i0 = vi[j]; 2400 i1 = vi[j+1]; 2401 tmp0 = tmps[i0]; 2402 tmp1 = tmps[i1]; 2403 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2404 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2405 } 2406 if(j == nz-1){ 2407 tmp0 = tmps[vi[j]]; 2408 sum1 -= v1[j] *tmp0; 2409 sum2 -= v2[j] *tmp0; 2410 } 2411 sum2 -= v2[nz] * sum1; 2412 tmp[row ++]=sum1; 2413 tmp[row ++]=sum2; 2414 break; 2415 case 3: 2416 sum1 = b[r[row]]; 2417 sum2 = b[r[row+1]]; 2418 sum3 = b[r[row+2]]; 2419 v2 = aa + ai[row+1]; 2420 v3 = aa + ai[row+2]; 2421 2422 for (j=0; j<nz-1; j+=2){ 2423 i0 = vi[j]; 2424 i1 = vi[j+1]; 2425 tmp0 = tmps[i0]; 2426 tmp1 = tmps[i1]; 2427 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2428 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2429 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2430 } 2431 if (j == nz-1){ 2432 tmp0 = tmps[vi[j]]; 2433 sum1 -= v1[j] *tmp0; 2434 sum2 -= v2[j] *tmp0; 2435 sum3 -= v3[j] *tmp0; 2436 } 2437 sum2 -= v2[nz] * sum1; 2438 sum3 -= v3[nz] * sum1; 2439 sum3 -= v3[nz+1] * sum2; 2440 tmp[row ++]=sum1; 2441 tmp[row ++]=sum2; 2442 tmp[row ++]=sum3; 2443 break; 2444 2445 case 4: 2446 sum1 = b[r[row]]; 2447 sum2 = b[r[row+1]]; 2448 sum3 = b[r[row+2]]; 2449 sum4 = b[r[row+3]]; 2450 v2 = aa + ai[row+1]; 2451 v3 = aa + ai[row+2]; 2452 v4 = aa + ai[row+3]; 2453 2454 for (j=0; j<nz-1; j+=2){ 2455 i0 = vi[j]; 2456 i1 = vi[j+1]; 2457 tmp0 = tmps[i0]; 2458 tmp1 = tmps[i1]; 2459 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2460 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2461 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2462 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2463 } 2464 if (j == nz-1){ 2465 tmp0 = tmps[vi[j]]; 2466 sum1 -= v1[j] *tmp0; 2467 sum2 -= v2[j] *tmp0; 2468 sum3 -= v3[j] *tmp0; 2469 sum4 -= v4[j] *tmp0; 2470 } 2471 sum2 -= v2[nz] * sum1; 2472 sum3 -= v3[nz] * sum1; 2473 sum4 -= v4[nz] * sum1; 2474 sum3 -= v3[nz+1] * sum2; 2475 sum4 -= v4[nz+1] * sum2; 2476 sum4 -= v4[nz+2] * sum3; 2477 2478 tmp[row ++]=sum1; 2479 tmp[row ++]=sum2; 2480 tmp[row ++]=sum3; 2481 tmp[row ++]=sum4; 2482 break; 2483 case 5: 2484 sum1 = b[r[row]]; 2485 sum2 = b[r[row+1]]; 2486 sum3 = b[r[row+2]]; 2487 sum4 = b[r[row+3]]; 2488 sum5 = b[r[row+4]]; 2489 v2 = aa + ai[row+1]; 2490 v3 = aa + ai[row+2]; 2491 v4 = aa + ai[row+3]; 2492 v5 = aa + ai[row+4]; 2493 2494 for (j=0; j<nz-1; j+=2){ 2495 i0 = vi[j]; 2496 i1 = vi[j+1]; 2497 tmp0 = tmps[i0]; 2498 tmp1 = tmps[i1]; 2499 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2500 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2501 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2502 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2503 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2504 } 2505 if (j == nz-1){ 2506 tmp0 = tmps[vi[j]]; 2507 sum1 -= v1[j] *tmp0; 2508 sum2 -= v2[j] *tmp0; 2509 sum3 -= v3[j] *tmp0; 2510 sum4 -= v4[j] *tmp0; 2511 sum5 -= v5[j] *tmp0; 2512 } 2513 2514 sum2 -= v2[nz] * sum1; 2515 sum3 -= v3[nz] * sum1; 2516 sum4 -= v4[nz] * sum1; 2517 sum5 -= v5[nz] * sum1; 2518 sum3 -= v3[nz+1] * sum2; 2519 sum4 -= v4[nz+1] * sum2; 2520 sum5 -= v5[nz+1] * sum2; 2521 sum4 -= v4[nz+2] * sum3; 2522 sum5 -= v5[nz+2] * sum3; 2523 sum5 -= v5[nz+3] * sum4; 2524 2525 tmp[row ++]=sum1; 2526 tmp[row ++]=sum2; 2527 tmp[row ++]=sum3; 2528 tmp[row ++]=sum4; 2529 tmp[row ++]=sum5; 2530 break; 2531 default: 2532 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2533 } 2534 } 2535 /* backward solve the upper triangular */ 2536 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2537 nsz = ns[i]; 2538 aii = ad[row+1] + 1; 2539 v1 = aa + aii; 2540 vi = aj + aii; 2541 nz = ad[row]- ad[row+1] - 1; 2542 2543 if (i > 0) { 2544 /* Prefetch the indices for the next block */ 2545 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2546 /* Prefetch the data for the next block */ 2547 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2548 } 2549 2550 switch (nsz){ /* Each loop in 'case' is unrolled */ 2551 case 1 : 2552 sum1 = tmp[row]; 2553 2554 for(j=0 ; j<nz-1; j+=2){ 2555 i0 = vi[j]; 2556 i1 = vi[j+1]; 2557 tmp0 = tmps[i0]; 2558 tmp1 = tmps[i1]; 2559 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2560 } 2561 if (j == nz-1){ 2562 tmp0 = tmps[vi[j]]; 2563 sum1 -= v1[j]*tmp0; 2564 } 2565 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2566 break; 2567 case 2 : 2568 sum1 = tmp[row]; 2569 sum2 = tmp[row-1]; 2570 v2 = aa + ad[row] + 1; 2571 for (j=0 ; j<nz-1; j+=2){ 2572 i0 = vi[j]; 2573 i1 = vi[j+1]; 2574 tmp0 = tmps[i0]; 2575 tmp1 = tmps[i1]; 2576 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2577 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2578 } 2579 if (j == nz-1){ 2580 tmp0 = tmps[vi[j]]; 2581 sum1 -= v1[j]* tmp0; 2582 sum2 -= v2[j+1]* tmp0; 2583 } 2584 2585 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2586 sum2 -= v2[0] * tmp0; 2587 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2588 break; 2589 case 3 : 2590 sum1 = tmp[row]; 2591 sum2 = tmp[row -1]; 2592 sum3 = tmp[row -2]; 2593 v2 = aa + ad[row] + 1; 2594 v3 = aa + ad[row -1] + 1; 2595 for (j=0 ; j<nz-1; j+=2){ 2596 i0 = vi[j]; 2597 i1 = vi[j+1]; 2598 tmp0 = tmps[i0]; 2599 tmp1 = tmps[i1]; 2600 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2601 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2602 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2603 } 2604 if (j== nz-1){ 2605 tmp0 = tmps[vi[j]]; 2606 sum1 -= v1[j] * tmp0; 2607 sum2 -= v2[j+1] * tmp0; 2608 sum3 -= v3[j+2] * tmp0; 2609 } 2610 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2611 sum2 -= v2[0]* tmp0; 2612 sum3 -= v3[1] * tmp0; 2613 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2614 sum3 -= v3[0]* tmp0; 2615 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2616 2617 break; 2618 case 4 : 2619 sum1 = tmp[row]; 2620 sum2 = tmp[row -1]; 2621 sum3 = tmp[row -2]; 2622 sum4 = tmp[row -3]; 2623 v2 = aa + ad[row]+1; 2624 v3 = aa + ad[row -1]+1; 2625 v4 = aa + ad[row -2]+1; 2626 2627 for (j=0 ; j<nz-1; j+=2){ 2628 i0 = vi[j]; 2629 i1 = vi[j+1]; 2630 tmp0 = tmps[i0]; 2631 tmp1 = tmps[i1]; 2632 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2633 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2634 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2635 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2636 } 2637 if (j== nz-1){ 2638 tmp0 = tmps[vi[j]]; 2639 sum1 -= v1[j] * tmp0; 2640 sum2 -= v2[j+1] * tmp0; 2641 sum3 -= v3[j+2] * tmp0; 2642 sum4 -= v4[j+3] * tmp0; 2643 } 2644 2645 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2646 sum2 -= v2[0] * tmp0; 2647 sum3 -= v3[1] * tmp0; 2648 sum4 -= v4[2] * tmp0; 2649 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2650 sum3 -= v3[0] * tmp0; 2651 sum4 -= v4[1] * tmp0; 2652 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2653 sum4 -= v4[0] * tmp0; 2654 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2655 break; 2656 case 5 : 2657 sum1 = tmp[row]; 2658 sum2 = tmp[row -1]; 2659 sum3 = tmp[row -2]; 2660 sum4 = tmp[row -3]; 2661 sum5 = tmp[row -4]; 2662 v2 = aa + ad[row]+1; 2663 v3 = aa + ad[row -1]+1; 2664 v4 = aa + ad[row -2]+1; 2665 v5 = aa + ad[row -3]+1; 2666 for (j=0 ; j<nz-1; j+=2){ 2667 i0 = vi[j]; 2668 i1 = vi[j+1]; 2669 tmp0 = tmps[i0]; 2670 tmp1 = tmps[i1]; 2671 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2672 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2673 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2674 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2675 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2676 } 2677 if (j==nz-1){ 2678 tmp0 = tmps[vi[j]]; 2679 sum1 -= v1[j] * tmp0; 2680 sum2 -= v2[j+1] * tmp0; 2681 sum3 -= v3[j+2] * tmp0; 2682 sum4 -= v4[j+3] * tmp0; 2683 sum5 -= v5[j+4] * tmp0; 2684 } 2685 2686 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2687 sum2 -= v2[0] * tmp0; 2688 sum3 -= v3[1] * tmp0; 2689 sum4 -= v4[2] * tmp0; 2690 sum5 -= v5[3] * tmp0; 2691 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2692 sum3 -= v3[0] * tmp0; 2693 sum4 -= v4[1] * tmp0; 2694 sum5 -= v5[2] * tmp0; 2695 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2696 sum4 -= v4[0] * tmp0; 2697 sum5 -= v5[1] * tmp0; 2698 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2699 sum5 -= v5[0] * tmp0; 2700 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2701 break; 2702 default: 2703 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2704 } 2705 } 2706 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2707 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2708 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2709 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2710 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2711 PetscFunctionReturn(0); 2712 } 2713 2714 2715 /* 2716 Makes a longer coloring[] array and calls the usual code with that 2717 */ 2718 #undef __FUNCT__ 2719 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2720 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2721 { 2722 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2723 PetscErrorCode ierr; 2724 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2725 PetscInt *colorused,i; 2726 ISColoringValue *newcolor; 2727 2728 PetscFunctionBegin; 2729 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2730 /* loop over inodes, marking a color for each column*/ 2731 row = 0; 2732 for (i=0; i<m; i++){ 2733 for (j=0; j<ns[i]; j++) { 2734 newcolor[row++] = coloring[i] + j*ncolors; 2735 } 2736 } 2737 2738 /* eliminate unneeded colors */ 2739 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2740 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2741 for (i=0; i<n; i++) { 2742 colorused[newcolor[i]] = 1; 2743 } 2744 2745 for (i=1; i<5*ncolors; i++) { 2746 colorused[i] += colorused[i-1]; 2747 } 2748 ncolors = colorused[5*ncolors-1]; 2749 for (i=0; i<n; i++) { 2750 newcolor[i] = colorused[newcolor[i]]-1; 2751 } 2752 ierr = PetscFree(colorused);CHKERRQ(ierr); 2753 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2754 ierr = PetscFree(coloring);CHKERRQ(ierr); 2755 PetscFunctionReturn(0); 2756 } 2757 2758 #include "../src/mat/blockinvert.h" 2759 2760 #undef __FUNCT__ 2761 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2762 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2763 { 2764 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2765 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2766 MatScalar *ibdiag,*bdiag,work[25]; 2767 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2768 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2769 const PetscScalar *xb; 2770 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2771 PetscErrorCode ierr; 2772 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2773 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2774 2775 PetscFunctionBegin; 2776 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2777 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2778 if (its > 1) { 2779 /* switch to non-inode version */ 2780 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2781 PetscFunctionReturn(0); 2782 } 2783 2784 if (!a->inode.ibdiagvalid) { 2785 if (!a->inode.ibdiag) { 2786 /* calculate space needed for diagonal blocks */ 2787 for (i=0; i<m; i++) { 2788 cnt += sizes[i]*sizes[i]; 2789 } 2790 a->inode.bdiagsize = cnt; 2791 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2792 } 2793 2794 /* copy over the diagonal blocks and invert them */ 2795 ibdiag = a->inode.ibdiag; 2796 bdiag = a->inode.bdiag; 2797 cnt = 0; 2798 for (i=0, row = 0; i<m; i++) { 2799 for (j=0; j<sizes[i]; j++) { 2800 for (k=0; k<sizes[i]; k++) { 2801 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2802 } 2803 } 2804 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2805 2806 switch(sizes[i]) { 2807 case 1: 2808 /* Create matrix data structure */ 2809 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2810 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2811 break; 2812 case 2: 2813 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2814 break; 2815 case 3: 2816 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2817 break; 2818 case 4: 2819 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2820 break; 2821 case 5: 2822 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2823 break; 2824 default: 2825 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2826 } 2827 cnt += sizes[i]*sizes[i]; 2828 row += sizes[i]; 2829 } 2830 a->inode.ibdiagvalid = PETSC_TRUE; 2831 } 2832 ibdiag = a->inode.ibdiag; 2833 bdiag = a->inode.bdiag; 2834 2835 2836 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2837 if (flag & SOR_ZERO_INITIAL_GUESS) { 2838 const PetscScalar *b; 2839 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2840 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2841 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2842 2843 for (i=0, row=0; i<m; i++) { 2844 sz = diag[row] - ii[row]; 2845 v1 = a->a + ii[row]; 2846 idx = a->j + ii[row]; 2847 2848 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2849 switch (sizes[i]){ 2850 case 1: 2851 2852 sum1 = b[row]; 2853 for(n = 0; n<sz-1; n+=2) { 2854 i1 = idx[0]; 2855 i2 = idx[1]; 2856 idx += 2; 2857 tmp0 = x[i1]; 2858 tmp1 = x[i2]; 2859 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2860 } 2861 2862 if (n == sz-1){ 2863 tmp0 = x[*idx]; 2864 sum1 -= *v1 * tmp0; 2865 } 2866 x[row++] = sum1*(*ibdiag++); 2867 break; 2868 case 2: 2869 v2 = a->a + ii[row+1]; 2870 sum1 = b[row]; 2871 sum2 = b[row+1]; 2872 for(n = 0; n<sz-1; n+=2) { 2873 i1 = idx[0]; 2874 i2 = idx[1]; 2875 idx += 2; 2876 tmp0 = x[i1]; 2877 tmp1 = x[i2]; 2878 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2879 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2880 } 2881 2882 if (n == sz-1){ 2883 tmp0 = x[*idx]; 2884 sum1 -= v1[0] * tmp0; 2885 sum2 -= v2[0] * tmp0; 2886 } 2887 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2888 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2889 ibdiag += 4; 2890 break; 2891 case 3: 2892 v2 = a->a + ii[row+1]; 2893 v3 = a->a + ii[row+2]; 2894 sum1 = b[row]; 2895 sum2 = b[row+1]; 2896 sum3 = b[row+2]; 2897 for(n = 0; n<sz-1; n+=2) { 2898 i1 = idx[0]; 2899 i2 = idx[1]; 2900 idx += 2; 2901 tmp0 = x[i1]; 2902 tmp1 = x[i2]; 2903 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2904 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2905 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2906 } 2907 2908 if (n == sz-1){ 2909 tmp0 = x[*idx]; 2910 sum1 -= v1[0] * tmp0; 2911 sum2 -= v2[0] * tmp0; 2912 sum3 -= v3[0] * tmp0; 2913 } 2914 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2915 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2916 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2917 ibdiag += 9; 2918 break; 2919 case 4: 2920 v2 = a->a + ii[row+1]; 2921 v3 = a->a + ii[row+2]; 2922 v4 = a->a + ii[row+3]; 2923 sum1 = b[row]; 2924 sum2 = b[row+1]; 2925 sum3 = b[row+2]; 2926 sum4 = b[row+3]; 2927 for(n = 0; n<sz-1; n+=2) { 2928 i1 = idx[0]; 2929 i2 = idx[1]; 2930 idx += 2; 2931 tmp0 = x[i1]; 2932 tmp1 = x[i2]; 2933 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2934 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2935 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2936 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2937 } 2938 2939 if (n == sz-1){ 2940 tmp0 = x[*idx]; 2941 sum1 -= v1[0] * tmp0; 2942 sum2 -= v2[0] * tmp0; 2943 sum3 -= v3[0] * tmp0; 2944 sum4 -= v4[0] * tmp0; 2945 } 2946 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2947 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2948 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2949 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2950 ibdiag += 16; 2951 break; 2952 case 5: 2953 v2 = a->a + ii[row+1]; 2954 v3 = a->a + ii[row+2]; 2955 v4 = a->a + ii[row+3]; 2956 v5 = a->a + ii[row+4]; 2957 sum1 = b[row]; 2958 sum2 = b[row+1]; 2959 sum3 = b[row+2]; 2960 sum4 = b[row+3]; 2961 sum5 = b[row+4]; 2962 for(n = 0; n<sz-1; n+=2) { 2963 i1 = idx[0]; 2964 i2 = idx[1]; 2965 idx += 2; 2966 tmp0 = x[i1]; 2967 tmp1 = x[i2]; 2968 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2969 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2970 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2971 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2972 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2973 } 2974 2975 if (n == sz-1){ 2976 tmp0 = x[*idx]; 2977 sum1 -= v1[0] * tmp0; 2978 sum2 -= v2[0] * tmp0; 2979 sum3 -= v3[0] * tmp0; 2980 sum4 -= v4[0] * tmp0; 2981 sum5 -= v5[0] * tmp0; 2982 } 2983 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2984 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2985 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2986 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2987 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2988 ibdiag += 25; 2989 break; 2990 default: 2991 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2992 } 2993 } 2994 2995 xb = x; 2996 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2997 } else xb = b; 2998 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2999 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 3000 cnt = 0; 3001 for (i=0, row=0; i<m; i++) { 3002 3003 switch (sizes[i]){ 3004 case 1: 3005 x[row++] *= bdiag[cnt++]; 3006 break; 3007 case 2: 3008 x1 = x[row]; x2 = x[row+1]; 3009 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3010 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3011 x[row++] = tmp1; 3012 x[row++] = tmp2; 3013 cnt += 4; 3014 break; 3015 case 3: 3016 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3017 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3018 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3019 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3020 x[row++] = tmp1; 3021 x[row++] = tmp2; 3022 x[row++] = tmp3; 3023 cnt += 9; 3024 break; 3025 case 4: 3026 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3027 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3028 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3029 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3030 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3031 x[row++] = tmp1; 3032 x[row++] = tmp2; 3033 x[row++] = tmp3; 3034 x[row++] = tmp4; 3035 cnt += 16; 3036 break; 3037 case 5: 3038 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3039 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3040 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3041 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3042 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3043 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3044 x[row++] = tmp1; 3045 x[row++] = tmp2; 3046 x[row++] = tmp3; 3047 x[row++] = tmp4; 3048 x[row++] = tmp5; 3049 cnt += 25; 3050 break; 3051 default: 3052 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3053 } 3054 } 3055 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3056 } 3057 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 3058 3059 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3060 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3061 ibdiag -= sizes[i]*sizes[i]; 3062 sz = ii[row+1] - diag[row] - 1; 3063 v1 = a->a + diag[row] + 1; 3064 idx = a->j + diag[row] + 1; 3065 3066 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3067 switch (sizes[i]){ 3068 case 1: 3069 3070 sum1 = xb[row]; 3071 for(n = 0; n<sz-1; n+=2) { 3072 i1 = idx[0]; 3073 i2 = idx[1]; 3074 idx += 2; 3075 tmp0 = x[i1]; 3076 tmp1 = x[i2]; 3077 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3078 } 3079 3080 if (n == sz-1){ 3081 tmp0 = x[*idx]; 3082 sum1 -= *v1*tmp0; 3083 } 3084 x[row--] = sum1*(*ibdiag); 3085 break; 3086 3087 case 2: 3088 3089 sum1 = xb[row]; 3090 sum2 = xb[row-1]; 3091 /* note that sum1 is associated with the second of the two rows */ 3092 v2 = a->a + diag[row-1] + 2; 3093 for(n = 0; n<sz-1; n+=2) { 3094 i1 = idx[0]; 3095 i2 = idx[1]; 3096 idx += 2; 3097 tmp0 = x[i1]; 3098 tmp1 = x[i2]; 3099 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3100 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3101 } 3102 3103 if (n == sz-1){ 3104 tmp0 = x[*idx]; 3105 sum1 -= *v1*tmp0; 3106 sum2 -= *v2*tmp0; 3107 } 3108 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3109 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3110 break; 3111 case 3: 3112 3113 sum1 = xb[row]; 3114 sum2 = xb[row-1]; 3115 sum3 = xb[row-2]; 3116 v2 = a->a + diag[row-1] + 2; 3117 v3 = a->a + diag[row-2] + 3; 3118 for(n = 0; n<sz-1; n+=2) { 3119 i1 = idx[0]; 3120 i2 = idx[1]; 3121 idx += 2; 3122 tmp0 = x[i1]; 3123 tmp1 = x[i2]; 3124 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3125 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3126 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3127 } 3128 3129 if (n == sz-1){ 3130 tmp0 = x[*idx]; 3131 sum1 -= *v1*tmp0; 3132 sum2 -= *v2*tmp0; 3133 sum3 -= *v3*tmp0; 3134 } 3135 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3136 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3137 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3138 break; 3139 case 4: 3140 3141 sum1 = xb[row]; 3142 sum2 = xb[row-1]; 3143 sum3 = xb[row-2]; 3144 sum4 = xb[row-3]; 3145 v2 = a->a + diag[row-1] + 2; 3146 v3 = a->a + diag[row-2] + 3; 3147 v4 = a->a + diag[row-3] + 4; 3148 for(n = 0; n<sz-1; n+=2) { 3149 i1 = idx[0]; 3150 i2 = idx[1]; 3151 idx += 2; 3152 tmp0 = x[i1]; 3153 tmp1 = x[i2]; 3154 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3155 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3156 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3157 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3158 } 3159 3160 if (n == sz-1){ 3161 tmp0 = x[*idx]; 3162 sum1 -= *v1*tmp0; 3163 sum2 -= *v2*tmp0; 3164 sum3 -= *v3*tmp0; 3165 sum4 -= *v4*tmp0; 3166 } 3167 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3168 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3169 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3170 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3171 break; 3172 case 5: 3173 3174 sum1 = xb[row]; 3175 sum2 = xb[row-1]; 3176 sum3 = xb[row-2]; 3177 sum4 = xb[row-3]; 3178 sum5 = xb[row-4]; 3179 v2 = a->a + diag[row-1] + 2; 3180 v3 = a->a + diag[row-2] + 3; 3181 v4 = a->a + diag[row-3] + 4; 3182 v5 = a->a + diag[row-4] + 5; 3183 for(n = 0; n<sz-1; n+=2) { 3184 i1 = idx[0]; 3185 i2 = idx[1]; 3186 idx += 2; 3187 tmp0 = x[i1]; 3188 tmp1 = x[i2]; 3189 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3190 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3191 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3192 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3193 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3194 } 3195 3196 if (n == sz-1){ 3197 tmp0 = x[*idx]; 3198 sum1 -= *v1*tmp0; 3199 sum2 -= *v2*tmp0; 3200 sum3 -= *v3*tmp0; 3201 sum4 -= *v4*tmp0; 3202 sum5 -= *v5*tmp0; 3203 } 3204 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3205 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3206 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3207 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3208 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3209 break; 3210 default: 3211 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3212 } 3213 } 3214 3215 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3216 } 3217 its--; 3218 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3219 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3220 } 3221 if (flag & SOR_EISENSTAT) { 3222 const PetscScalar *b; 3223 MatScalar *t = a->inode.ssor_work; 3224 3225 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3226 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3227 /* 3228 Apply (U + D)^-1 where D is now the block diagonal 3229 */ 3230 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3231 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3232 ibdiag -= sizes[i]*sizes[i]; 3233 sz = ii[row+1] - diag[row] - 1; 3234 v1 = a->a + diag[row] + 1; 3235 idx = a->j + diag[row] + 1; 3236 CHKMEMQ; 3237 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3238 switch (sizes[i]){ 3239 case 1: 3240 3241 sum1 = b[row]; 3242 for(n = 0; n<sz-1; n+=2) { 3243 i1 = idx[0]; 3244 i2 = idx[1]; 3245 idx += 2; 3246 tmp0 = x[i1]; 3247 tmp1 = x[i2]; 3248 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3249 } 3250 3251 if (n == sz-1){ 3252 tmp0 = x[*idx]; 3253 sum1 -= *v1*tmp0; 3254 } 3255 x[row] = sum1*(*ibdiag);row--; 3256 break; 3257 3258 case 2: 3259 3260 sum1 = b[row]; 3261 sum2 = b[row-1]; 3262 /* note that sum1 is associated with the second of the two rows */ 3263 v2 = a->a + diag[row-1] + 2; 3264 for(n = 0; n<sz-1; n+=2) { 3265 i1 = idx[0]; 3266 i2 = idx[1]; 3267 idx += 2; 3268 tmp0 = x[i1]; 3269 tmp1 = x[i2]; 3270 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3271 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3272 } 3273 3274 if (n == sz-1){ 3275 tmp0 = x[*idx]; 3276 sum1 -= *v1*tmp0; 3277 sum2 -= *v2*tmp0; 3278 } 3279 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3280 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3281 row -= 2; 3282 break; 3283 case 3: 3284 3285 sum1 = b[row]; 3286 sum2 = b[row-1]; 3287 sum3 = b[row-2]; 3288 v2 = a->a + diag[row-1] + 2; 3289 v3 = a->a + diag[row-2] + 3; 3290 for(n = 0; n<sz-1; n+=2) { 3291 i1 = idx[0]; 3292 i2 = idx[1]; 3293 idx += 2; 3294 tmp0 = x[i1]; 3295 tmp1 = x[i2]; 3296 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3297 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3298 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3299 } 3300 3301 if (n == sz-1){ 3302 tmp0 = x[*idx]; 3303 sum1 -= *v1*tmp0; 3304 sum2 -= *v2*tmp0; 3305 sum3 -= *v3*tmp0; 3306 } 3307 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3308 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3309 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3310 row -= 3; 3311 break; 3312 case 4: 3313 3314 sum1 = b[row]; 3315 sum2 = b[row-1]; 3316 sum3 = b[row-2]; 3317 sum4 = b[row-3]; 3318 v2 = a->a + diag[row-1] + 2; 3319 v3 = a->a + diag[row-2] + 3; 3320 v4 = a->a + diag[row-3] + 4; 3321 for(n = 0; n<sz-1; n+=2) { 3322 i1 = idx[0]; 3323 i2 = idx[1]; 3324 idx += 2; 3325 tmp0 = x[i1]; 3326 tmp1 = x[i2]; 3327 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3328 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3329 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3330 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3331 } 3332 3333 if (n == sz-1){ 3334 tmp0 = x[*idx]; 3335 sum1 -= *v1*tmp0; 3336 sum2 -= *v2*tmp0; 3337 sum3 -= *v3*tmp0; 3338 sum4 -= *v4*tmp0; 3339 } 3340 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3341 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3342 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3343 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3344 row -= 4; 3345 break; 3346 case 5: 3347 3348 sum1 = b[row]; 3349 sum2 = b[row-1]; 3350 sum3 = b[row-2]; 3351 sum4 = b[row-3]; 3352 sum5 = b[row-4]; 3353 v2 = a->a + diag[row-1] + 2; 3354 v3 = a->a + diag[row-2] + 3; 3355 v4 = a->a + diag[row-3] + 4; 3356 v5 = a->a + diag[row-4] + 5; 3357 for(n = 0; n<sz-1; n+=2) { 3358 i1 = idx[0]; 3359 i2 = idx[1]; 3360 idx += 2; 3361 tmp0 = x[i1]; 3362 tmp1 = x[i2]; 3363 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3364 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3365 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3366 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3367 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3368 } 3369 3370 if (n == sz-1){ 3371 tmp0 = x[*idx]; 3372 sum1 -= *v1*tmp0; 3373 sum2 -= *v2*tmp0; 3374 sum3 -= *v3*tmp0; 3375 sum4 -= *v4*tmp0; 3376 sum5 -= *v5*tmp0; 3377 } 3378 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3379 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3380 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3381 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3382 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3383 row -= 5; 3384 break; 3385 default: 3386 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3387 } 3388 CHKMEMQ; 3389 } 3390 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3391 3392 /* 3393 t = b - D x where D is the block diagonal 3394 */ 3395 cnt = 0; 3396 for (i=0, row=0; i<m; i++) { 3397 CHKMEMQ; 3398 switch (sizes[i]){ 3399 case 1: 3400 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3401 break; 3402 case 2: 3403 x1 = x[row]; x2 = x[row+1]; 3404 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3405 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3406 t[row] = b[row] - tmp1; 3407 t[row+1] = b[row+1] - tmp2; row += 2; 3408 cnt += 4; 3409 break; 3410 case 3: 3411 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3412 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3413 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3414 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3415 t[row] = b[row] - tmp1; 3416 t[row+1] = b[row+1] - tmp2; 3417 t[row+2] = b[row+2] - tmp3; row += 3; 3418 cnt += 9; 3419 break; 3420 case 4: 3421 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3422 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3423 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3424 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3425 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3426 t[row] = b[row] - tmp1; 3427 t[row+1] = b[row+1] - tmp2; 3428 t[row+2] = b[row+2] - tmp3; 3429 t[row+3] = b[row+3] - tmp4; row += 4; 3430 cnt += 16; 3431 break; 3432 case 5: 3433 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3434 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3435 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3436 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3437 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3438 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3439 t[row] = b[row] - tmp1; 3440 t[row+1] = b[row+1] - tmp2; 3441 t[row+2] = b[row+2] - tmp3; 3442 t[row+3] = b[row+3] - tmp4; 3443 t[row+4] = b[row+4] - tmp5;row += 5; 3444 cnt += 25; 3445 break; 3446 default: 3447 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3448 } 3449 CHKMEMQ; 3450 } 3451 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3452 3453 3454 3455 /* 3456 Apply (L + D)^-1 where D is the block diagonal 3457 */ 3458 for (i=0, row=0; i<m; i++) { 3459 sz = diag[row] - ii[row]; 3460 v1 = a->a + ii[row]; 3461 idx = a->j + ii[row]; 3462 CHKMEMQ; 3463 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3464 switch (sizes[i]){ 3465 case 1: 3466 3467 sum1 = t[row]; 3468 for(n = 0; n<sz-1; n+=2) { 3469 i1 = idx[0]; 3470 i2 = idx[1]; 3471 idx += 2; 3472 tmp0 = t[i1]; 3473 tmp1 = t[i2]; 3474 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3475 } 3476 3477 if (n == sz-1){ 3478 tmp0 = t[*idx]; 3479 sum1 -= *v1 * tmp0; 3480 } 3481 x[row] += t[row] = sum1*(*ibdiag++); row++; 3482 break; 3483 case 2: 3484 v2 = a->a + ii[row+1]; 3485 sum1 = t[row]; 3486 sum2 = t[row+1]; 3487 for(n = 0; n<sz-1; n+=2) { 3488 i1 = idx[0]; 3489 i2 = idx[1]; 3490 idx += 2; 3491 tmp0 = t[i1]; 3492 tmp1 = t[i2]; 3493 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3494 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3495 } 3496 3497 if (n == sz-1){ 3498 tmp0 = t[*idx]; 3499 sum1 -= v1[0] * tmp0; 3500 sum2 -= v2[0] * tmp0; 3501 } 3502 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3503 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3504 ibdiag += 4; row += 2; 3505 break; 3506 case 3: 3507 v2 = a->a + ii[row+1]; 3508 v3 = a->a + ii[row+2]; 3509 sum1 = t[row]; 3510 sum2 = t[row+1]; 3511 sum3 = t[row+2]; 3512 for(n = 0; n<sz-1; n+=2) { 3513 i1 = idx[0]; 3514 i2 = idx[1]; 3515 idx += 2; 3516 tmp0 = t[i1]; 3517 tmp1 = t[i2]; 3518 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3519 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3520 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3521 } 3522 3523 if (n == sz-1){ 3524 tmp0 = t[*idx]; 3525 sum1 -= v1[0] * tmp0; 3526 sum2 -= v2[0] * tmp0; 3527 sum3 -= v3[0] * tmp0; 3528 } 3529 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3530 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3531 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3532 ibdiag += 9; row += 3; 3533 break; 3534 case 4: 3535 v2 = a->a + ii[row+1]; 3536 v3 = a->a + ii[row+2]; 3537 v4 = a->a + ii[row+3]; 3538 sum1 = t[row]; 3539 sum2 = t[row+1]; 3540 sum3 = t[row+2]; 3541 sum4 = t[row+3]; 3542 for(n = 0; n<sz-1; n+=2) { 3543 i1 = idx[0]; 3544 i2 = idx[1]; 3545 idx += 2; 3546 tmp0 = t[i1]; 3547 tmp1 = t[i2]; 3548 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3549 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3550 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3551 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3552 } 3553 3554 if (n == sz-1){ 3555 tmp0 = t[*idx]; 3556 sum1 -= v1[0] * tmp0; 3557 sum2 -= v2[0] * tmp0; 3558 sum3 -= v3[0] * tmp0; 3559 sum4 -= v4[0] * tmp0; 3560 } 3561 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3562 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3563 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3564 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3565 ibdiag += 16; row += 4; 3566 break; 3567 case 5: 3568 v2 = a->a + ii[row+1]; 3569 v3 = a->a + ii[row+2]; 3570 v4 = a->a + ii[row+3]; 3571 v5 = a->a + ii[row+4]; 3572 sum1 = t[row]; 3573 sum2 = t[row+1]; 3574 sum3 = t[row+2]; 3575 sum4 = t[row+3]; 3576 sum5 = t[row+4]; 3577 for(n = 0; n<sz-1; n+=2) { 3578 i1 = idx[0]; 3579 i2 = idx[1]; 3580 idx += 2; 3581 tmp0 = t[i1]; 3582 tmp1 = t[i2]; 3583 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3584 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3585 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3586 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3587 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3588 } 3589 3590 if (n == sz-1){ 3591 tmp0 = t[*idx]; 3592 sum1 -= v1[0] * tmp0; 3593 sum2 -= v2[0] * tmp0; 3594 sum3 -= v3[0] * tmp0; 3595 sum4 -= v4[0] * tmp0; 3596 sum5 -= v5[0] * tmp0; 3597 } 3598 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3599 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3600 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3601 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3602 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3603 ibdiag += 25; row += 5; 3604 break; 3605 default: 3606 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3607 } 3608 CHKMEMQ; 3609 } 3610 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3611 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3612 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3613 } 3614 PetscFunctionReturn(0); 3615 } 3616 3617 #undef __FUNCT__ 3618 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3619 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3620 { 3621 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3622 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3623 const MatScalar *bdiag = a->inode.bdiag; 3624 const PetscScalar *b; 3625 PetscErrorCode ierr; 3626 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3627 const PetscInt *sizes = a->inode.size; 3628 3629 PetscFunctionBegin; 3630 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3631 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3632 cnt = 0; 3633 for (i=0, row=0; i<m; i++) { 3634 switch (sizes[i]){ 3635 case 1: 3636 x[row] = b[row]*bdiag[cnt++];row++; 3637 break; 3638 case 2: 3639 x1 = b[row]; x2 = b[row+1]; 3640 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3641 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3642 x[row++] = tmp1; 3643 x[row++] = tmp2; 3644 cnt += 4; 3645 break; 3646 case 3: 3647 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3648 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3649 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3650 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3651 x[row++] = tmp1; 3652 x[row++] = tmp2; 3653 x[row++] = tmp3; 3654 cnt += 9; 3655 break; 3656 case 4: 3657 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3658 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3659 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3660 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3661 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3662 x[row++] = tmp1; 3663 x[row++] = tmp2; 3664 x[row++] = tmp3; 3665 x[row++] = tmp4; 3666 cnt += 16; 3667 break; 3668 case 5: 3669 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3670 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3671 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3672 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3673 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3674 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3675 x[row++] = tmp1; 3676 x[row++] = tmp2; 3677 x[row++] = tmp3; 3678 x[row++] = tmp4; 3679 x[row++] = tmp5; 3680 cnt += 25; 3681 break; 3682 default: 3683 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3684 } 3685 } 3686 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3687 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3688 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3689 PetscFunctionReturn(0); 3690 } 3691 3692 /* 3693 samestructure indicates that the matrix has not changed its nonzero structure so we 3694 do not need to recompute the inodes 3695 */ 3696 #undef __FUNCT__ 3697 #define __FUNCT__ "Mat_CheckInode" 3698 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3699 { 3700 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3701 PetscErrorCode ierr; 3702 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3703 PetscTruth flag; 3704 3705 PetscFunctionBegin; 3706 if (!a->inode.use) PetscFunctionReturn(0); 3707 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3708 3709 3710 m = A->rmap->n; 3711 if (a->inode.size) {ns = a->inode.size;} 3712 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3713 3714 i = 0; 3715 node_count = 0; 3716 idx = a->j; 3717 ii = a->i; 3718 while (i < m){ /* For each row */ 3719 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3720 /* Limits the number of elements in a node to 'a->inode.limit' */ 3721 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3722 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3723 if (nzy != nzx) break; 3724 idy += nzx; /* Same nonzero pattern */ 3725 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3726 if (!flag) break; 3727 } 3728 ns[node_count++] = blk_size; 3729 idx += blk_size*nzx; 3730 i = j; 3731 } 3732 /* If not enough inodes found,, do not use inode version of the routines */ 3733 if (!m || node_count > .9*m) { 3734 ierr = PetscFree(ns);CHKERRQ(ierr); 3735 a->inode.node_count = 0; 3736 a->inode.size = PETSC_NULL; 3737 a->inode.use = PETSC_FALSE; 3738 A->ops->mult = MatMult_SeqAIJ; 3739 A->ops->sor = MatSOR_SeqAIJ; 3740 A->ops->multadd = MatMultAdd_SeqAIJ; 3741 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 3742 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 3743 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 3744 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 3745 A->ops->coloringpatch = 0; 3746 A->ops->multdiagonalblock = 0; 3747 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3748 } else { 3749 if (!A->factortype) { 3750 A->ops->mult = MatMult_SeqAIJ_Inode; 3751 A->ops->sor = MatSOR_SeqAIJ_Inode; 3752 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3753 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3754 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3755 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3756 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3757 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3758 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3759 } else { 3760 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 3761 } 3762 a->inode.node_count = node_count; 3763 a->inode.size = ns; 3764 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3765 } 3766 PetscFunctionReturn(0); 3767 } 3768 3769 #undef __FUNCT__ 3770 #define __FUNCT__ "MatGetRow_FactoredLU" 3771 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row) 3772 { 3773 PetscInt k, *vi; 3774 PetscFunctionBegin; 3775 vi = aj + ai[row]; 3776 for(k=0;k<nzl;k++) cols[k] = vi[k]; 3777 vi = aj + adiag[row]; 3778 cols[nzl] = vi[0]; 3779 vi = aj + adiag[row+1]+1; 3780 for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k]; 3781 PetscFunctionReturn(0); 3782 } 3783 /* 3784 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3785 Modified from Mat_CheckInode(). 3786 3787 Input Parameters: 3788 + Mat A - ILU or LU matrix factor 3789 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 3790 do not need to recompute the inodes 3791 */ 3792 #undef __FUNCT__ 3793 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3794 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3795 { 3796 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3797 PetscErrorCode ierr; 3798 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3799 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3800 PetscTruth flag; 3801 3802 PetscFunctionBegin; 3803 if (!a->inode.use) PetscFunctionReturn(0); 3804 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3805 3806 m = A->rmap->n; 3807 if (a->inode.size) {ns = a->inode.size;} 3808 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3809 3810 i = 0; 3811 node_count = 0; 3812 while (i < m){ /* For each row */ 3813 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3814 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3815 nzx = nzl1 + nzu1 + 1; 3816 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3817 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3818 3819 /* Limits the number of elements in a node to 'a->inode.limit' */ 3820 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3821 nzl2 = ai[j+1] - ai[j]; 3822 nzu2 = adiag[j] - adiag[j+1] - 1; 3823 nzy = nzl2 + nzu2 + 1; 3824 if( nzy != nzx) break; 3825 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3826 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 3827 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3828 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3829 ierr = PetscFree(cols2);CHKERRQ(ierr); 3830 } 3831 ns[node_count++] = blk_size; 3832 ierr = PetscFree(cols1);CHKERRQ(ierr); 3833 i = j; 3834 } 3835 /* If not enough inodes found,, do not use inode version of the routines */ 3836 if (!m || node_count > .9*m) { 3837 ierr = PetscFree(ns);CHKERRQ(ierr); 3838 a->inode.node_count = 0; 3839 a->inode.size = PETSC_NULL; 3840 a->inode.use = PETSC_FALSE; 3841 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3842 } else { 3843 A->ops->mult = 0; 3844 A->ops->sor = 0; 3845 A->ops->multadd = 0; 3846 A->ops->getrowij = 0; 3847 A->ops->restorerowij = 0; 3848 A->ops->getcolumnij = 0; 3849 A->ops->restorecolumnij = 0; 3850 A->ops->coloringpatch = 0; 3851 A->ops->multdiagonalblock = 0; 3852 A->ops->solve = MatSolve_SeqAIJ_Inode; 3853 a->inode.node_count = node_count; 3854 a->inode.size = ns; 3855 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3856 } 3857 PetscFunctionReturn(0); 3858 } 3859 3860 /* 3861 This is really ugly. if inodes are used this replaces the 3862 permutations with ones that correspond to rows/cols of the matrix 3863 rather then inode blocks 3864 */ 3865 #undef __FUNCT__ 3866 #define __FUNCT__ "MatInodeAdjustForInodes" 3867 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3868 { 3869 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3870 3871 PetscFunctionBegin; 3872 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3873 if (f) { 3874 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3875 } 3876 PetscFunctionReturn(0); 3877 } 3878 3879 EXTERN_C_BEGIN 3880 #undef __FUNCT__ 3881 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3882 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3883 { 3884 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3885 PetscErrorCode ierr; 3886 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3887 const PetscInt *ridx,*cidx; 3888 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3889 PetscInt nslim_col,*ns_col; 3890 IS ris = *rperm,cis = *cperm; 3891 3892 PetscFunctionBegin; 3893 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3894 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3895 3896 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3897 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3898 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3899 3900 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3901 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3902 3903 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3904 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3905 3906 /* Construct the permutations for rows*/ 3907 for (i=0,row = 0; i<nslim_row; ++i){ 3908 indx = ridx[i]; 3909 start_val = tns[indx]; 3910 end_val = tns[indx + 1]; 3911 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3912 } 3913 3914 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3915 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3916 3917 /* Construct permutations for columns */ 3918 for (i=0,col=0; i<nslim_col; ++i){ 3919 indx = cidx[i]; 3920 start_val = tns[indx]; 3921 end_val = tns[indx + 1]; 3922 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3923 } 3924 3925 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3926 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3927 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3928 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3929 3930 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3931 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3932 3933 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3934 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3935 ierr = ISDestroy(cis);CHKERRQ(ierr); 3936 ierr = ISDestroy(ris);CHKERRQ(ierr); 3937 ierr = PetscFree(tns);CHKERRQ(ierr); 3938 PetscFunctionReturn(0); 3939 } 3940 EXTERN_C_END 3941 3942 #undef __FUNCT__ 3943 #define __FUNCT__ "MatInodeGetInodeSizes" 3944 /*@C 3945 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3946 3947 Not Collective 3948 3949 Input Parameter: 3950 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3951 3952 Output Parameter: 3953 + node_count - no of inodes present in the matrix. 3954 . sizes - an array of size node_count,with sizes of each inode. 3955 - limit - the max size used to generate the inodes. 3956 3957 Level: advanced 3958 3959 Notes: This routine returns some internal storage information 3960 of the matrix, it is intended to be used by advanced users. 3961 It should be called after the matrix is assembled. 3962 The contents of the sizes[] array should not be changed. 3963 PETSC_NULL may be passed for information not requested. 3964 3965 .keywords: matrix, seqaij, get, inode 3966 3967 .seealso: MatGetInfo() 3968 @*/ 3969 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3970 { 3971 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3972 3973 PetscFunctionBegin; 3974 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3975 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3976 if (f) { 3977 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3978 } 3979 PetscFunctionReturn(0); 3980 } 3981 3982 EXTERN_C_BEGIN 3983 #undef __FUNCT__ 3984 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3985 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3986 { 3987 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3988 3989 PetscFunctionBegin; 3990 if (node_count) *node_count = a->inode.node_count; 3991 if (sizes) *sizes = a->inode.size; 3992 if (limit) *limit = a->inode.limit; 3993 PetscFunctionReturn(0); 3994 } 3995 EXTERN_C_END 3996