1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1195 FactorShiftCtx sctx; 1196 const PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,node_max,col; 1200 const PetscInt *ns; 1201 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1202 1203 PetscFunctionBegin; 1204 /* MatPivotSetUp(): initialize shift context sctx */ 1205 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1206 1207 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1208 ddiag = a->diag; 1209 sctx.shift_top = info->zeropivot; 1210 for (i=0; i<n; i++) { 1211 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1212 d = (aa)[ddiag[i]]; 1213 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1214 v = aa+ai[i]; 1215 nz = ai[i+1] - ai[i]; 1216 for (j=0; j<nz; j++) 1217 rs += PetscAbsScalar(v[j]); 1218 if (rs>sctx.shift_top) sctx.shift_top = rs; 1219 } 1220 sctx.shift_top *= 1.1; 1221 sctx.nshift_max = 5; 1222 sctx.shift_lo = 0.; 1223 sctx.shift_hi = 1.; 1224 } 1225 1226 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1227 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1228 1229 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1230 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1231 rtmp2 = rtmp1 + n; 1232 rtmp3 = rtmp2 + n; 1233 rtmp4 = rtmp3 + n; 1234 ics = ic; 1235 1236 node_max = a->inode.node_count; 1237 ns = a->inode.size; 1238 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1239 1240 /* If max inode size > 4, split it into two inodes.*/ 1241 /* also map the inode sizes according to the ordering */ 1242 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1243 for (i=0,j=0; i<node_max; ++i,++j){ 1244 if (ns[i] > 4) { 1245 tmp_vec1[j] = 4; 1246 ++j; 1247 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1248 } else { 1249 tmp_vec1[j] = ns[i]; 1250 } 1251 } 1252 /* Use the correct node_max */ 1253 node_max = j; 1254 1255 /* Now reorder the inode info based on mat re-ordering info */ 1256 /* First create a row -> inode_size_array_index map */ 1257 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1258 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1259 for (i=0,row=0; i<node_max; i++) { 1260 nodesz = tmp_vec1[i]; 1261 for (j=0; j<nodesz; j++,row++) { 1262 nsmap[row] = i; 1263 } 1264 } 1265 /* Using nsmap, create a reordered ns structure */ 1266 for (i=0,j=0; i< node_max; i++) { 1267 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1268 tmp_vec2[i] = nodesz; 1269 j += nodesz; 1270 } 1271 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1272 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1273 1274 /* Now use the correct ns */ 1275 ns = tmp_vec2; 1276 1277 do { 1278 sctx.newshift = PETSC_FALSE; 1279 /* Now loop over each block-row, and do the factorization */ 1280 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1281 nodesz = ns[inod]; 1282 1283 switch (nodesz){ 1284 case 1: 1285 /*----------*/ 1286 /* zero rtmp1 */ 1287 /* L part */ 1288 nz = bi[i+1] - bi[i]; 1289 bjtmp = bj + bi[i]; 1290 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1291 1292 /* U part */ 1293 nz = bdiag[i]-bdiag[i+1]; 1294 bjtmp = bj + bdiag[i+1]+1; 1295 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1296 1297 /* load in initial (unfactored row) */ 1298 nz = ai[r[i]+1] - ai[r[i]]; 1299 ajtmp = aj + ai[r[i]]; 1300 v = aa + ai[r[i]]; 1301 for (j=0; j<nz; j++) { 1302 rtmp1[ics[ajtmp[j]]] = v[j]; 1303 } 1304 /* ZeropivotApply() */ 1305 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1306 1307 /* elimination */ 1308 bjtmp = bj + bi[i]; 1309 row = *bjtmp++; 1310 nzL = bi[i+1] - bi[i]; 1311 for(k=0; k < nzL;k++) { 1312 pc = rtmp1 + row; 1313 if (*pc != 0.0) { 1314 pv = b->a + bdiag[row]; 1315 mul1 = *pc * (*pv); 1316 *pc = mul1; 1317 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1318 pv = b->a + bdiag[row+1]+1; 1319 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1320 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1321 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1322 } 1323 row = *bjtmp++; 1324 } 1325 1326 /* finished row so stick it into b->a */ 1327 rs = 0.0; 1328 /* L part */ 1329 pv = b->a + bi[i] ; 1330 pj = b->j + bi[i] ; 1331 nz = bi[i+1] - bi[i]; 1332 for (j=0; j<nz; j++) { 1333 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1334 } 1335 1336 /* U part */ 1337 pv = b->a + bdiag[i+1]+1; 1338 pj = b->j + bdiag[i+1]+1; 1339 nz = bdiag[i] - bdiag[i+1]-1; 1340 for (j=0; j<nz; j++) { 1341 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* Check zero pivot */ 1345 sctx.rs = rs; 1346 sctx.pv = rtmp1[i]; 1347 ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr); 1348 if(sctx.newshift) break; 1349 1350 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1351 pv = b->a + bdiag[i]; 1352 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1353 break; 1354 1355 case 2: 1356 /*----------*/ 1357 /* zero rtmp1 and rtmp2 */ 1358 /* L part */ 1359 nz = bi[i+1] - bi[i]; 1360 bjtmp = bj + bi[i]; 1361 for (j=0; j<nz; j++) { 1362 col = bjtmp[j]; 1363 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1364 } 1365 1366 /* U part */ 1367 nz = bdiag[i]-bdiag[i+1]; 1368 bjtmp = bj + bdiag[i+1]+1; 1369 for (j=0; j<nz; j++) { 1370 col = bjtmp[j]; 1371 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1372 } 1373 1374 /* load in initial (unfactored row) */ 1375 nz = ai[r[i]+1] - ai[r[i]]; 1376 ajtmp = aj + ai[r[i]]; 1377 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1378 for (j=0; j<nz; j++) { 1379 col = ics[ajtmp[j]]; 1380 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1381 } 1382 /* ZeropivotApply(): shift the diagonal of the matrix */ 1383 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1384 1385 /* elimination */ 1386 bjtmp = bj + bi[i]; 1387 row = *bjtmp++; /* pivot row */ 1388 nzL = bi[i+1] - bi[i]; 1389 for(k=0; k < nzL;k++) { 1390 pc1 = rtmp1 + row; 1391 pc2 = rtmp2 + row; 1392 if (*pc1 != 0.0 || *pc2 != 0.0) { 1393 pv = b->a + bdiag[row]; 1394 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1395 *pc1 = mul1; *pc2 = mul2; 1396 1397 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1398 pv = b->a + bdiag[row+1]+1; 1399 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1400 for (j=0; j<nz; j++){ 1401 col = pj[j]; 1402 rtmp1[col] -= mul1 * pv[j]; 1403 rtmp2[col] -= mul2 * pv[j]; 1404 } 1405 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1406 } 1407 row = *bjtmp++; 1408 } 1409 1410 /* finished row i; check zero pivot, then stick row i into b->a */ 1411 rs = 0.0; 1412 /* L part */ 1413 pc1 = b->a + bi[i]; 1414 pj = b->j + bi[i] ; 1415 nz = bi[i+1] - bi[i]; 1416 for (j=0; j<nz; j++) { 1417 col = pj[j]; 1418 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1419 } 1420 /* U part */ 1421 pc1 = b->a + bdiag[i+1]+1; 1422 pj = b->j + bdiag[i+1]+1; 1423 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1424 for (j=0; j<nz; j++) { 1425 col = pj[j]; 1426 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1427 } 1428 1429 sctx.rs = rs; 1430 sctx.pv = rtmp1[i]; 1431 ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr); 1432 if(sctx.newshift) break; 1433 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1434 *pc1 = 1.0/sctx.pv; 1435 1436 /* Now take care of diagonal 2x2 block. */ 1437 pc2 = rtmp2 + i; 1438 if (*pc2 != 0.0){ 1439 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1440 *pc2 = mul1; /* insert L entry */ 1441 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1442 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1443 for (j=0; j<nz; j++) { 1444 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1445 } 1446 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1447 } 1448 1449 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1450 rs = 0.0; 1451 /* L part */ 1452 pc2 = b->a + bi[i+1]; 1453 pj = b->j + bi[i+1] ; 1454 nz = bi[i+2] - bi[i+1]; 1455 for (j=0; j<nz; j++) { 1456 col = pj[j]; 1457 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1458 } 1459 /* U part */ 1460 pc2 = b->a + bdiag[i+2]+1; 1461 pj = b->j + bdiag[i+2]+1; 1462 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1463 for (j=0; j<nz; j++) { 1464 col = pj[j]; 1465 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1466 } 1467 1468 sctx.rs = rs; 1469 sctx.pv = rtmp2[i+1]; 1470 ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr); 1471 if(sctx.newshift) break; 1472 pc2 = b->a + bdiag[i+1]; 1473 *pc2 = 1.0/sctx.pv; 1474 break; 1475 1476 case 3: 1477 /*----------*/ 1478 /* zero rtmp */ 1479 /* L part */ 1480 nz = bi[i+1] - bi[i]; 1481 bjtmp = bj + bi[i]; 1482 for (j=0; j<nz; j++) { 1483 col = bjtmp[j]; 1484 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1485 } 1486 1487 /* U part */ 1488 nz = bdiag[i]-bdiag[i+1]; 1489 bjtmp = bj + bdiag[i+1]+1; 1490 for (j=0; j<nz; j++) { 1491 col = bjtmp[j]; 1492 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1493 } 1494 1495 /* load in initial (unfactored row) */ 1496 nz = ai[r[i]+1] - ai[r[i]]; 1497 ajtmp = aj + ai[r[i]]; 1498 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1499 for (j=0; j<nz; j++) { 1500 col = ics[ajtmp[j]]; 1501 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1502 } 1503 /* ZeropivotApply(): shift the diagonal of the matrix */ 1504 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1505 1506 /* elimination */ 1507 bjtmp = bj + bi[i]; 1508 row = *bjtmp++; /* pivot row */ 1509 nzL = bi[i+1] - bi[i]; 1510 for(k=0; k < nzL;k++) { 1511 pc1 = rtmp1 + row; 1512 pc2 = rtmp2 + row; 1513 pc3 = rtmp3 + row; 1514 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1515 pv = b->a + bdiag[row]; 1516 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1517 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1518 1519 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1520 pv = b->a + bdiag[row+1]+1; 1521 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1522 for (j=0; j<nz; j++){ 1523 col = pj[j]; 1524 rtmp1[col] -= mul1 * pv[j]; 1525 rtmp2[col] -= mul2 * pv[j]; 1526 rtmp3[col] -= mul3 * pv[j]; 1527 } 1528 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1529 } 1530 row = *bjtmp++; 1531 } 1532 1533 /* finished row i; check zero pivot, then stick row i into b->a */ 1534 rs = 0.0; 1535 /* L part */ 1536 pc1 = b->a + bi[i]; 1537 pj = b->j + bi[i] ; 1538 nz = bi[i+1] - bi[i]; 1539 for (j=0; j<nz; j++) { 1540 col = pj[j]; 1541 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1542 } 1543 /* U part */ 1544 pc1 = b->a + bdiag[i+1]+1; 1545 pj = b->j + bdiag[i+1]+1; 1546 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1547 for (j=0; j<nz; j++) { 1548 col = pj[j]; 1549 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1550 } 1551 1552 sctx.rs = rs; 1553 sctx.pv = rtmp1[i]; 1554 ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr); 1555 if(sctx.newshift) break; 1556 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1557 *pc1 = 1.0/sctx.pv; 1558 1559 /* Now take care of 1st column of diagonal 3x3 block. */ 1560 pc2 = rtmp2 + i; 1561 pc3 = rtmp3 + i; 1562 if (*pc2 != 0.0 || *pc3 != 0.0){ 1563 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1564 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1565 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1566 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1567 for (j=0; j<nz; j++) { 1568 col = pj[j]; 1569 rtmp2[col] -= mul2 * rtmp1[col]; 1570 rtmp3[col] -= mul3 * rtmp1[col]; 1571 } 1572 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1573 } 1574 1575 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1576 rs = 0.0; 1577 /* L part */ 1578 pc2 = b->a + bi[i+1]; 1579 pj = b->j + bi[i+1] ; 1580 nz = bi[i+2] - bi[i+1]; 1581 for (j=0; j<nz; j++) { 1582 col = pj[j]; 1583 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1584 } 1585 /* U part */ 1586 pc2 = b->a + bdiag[i+2]+1; 1587 pj = b->j + bdiag[i+2]+1; 1588 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1589 for (j=0; j<nz; j++) { 1590 col = pj[j]; 1591 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1592 } 1593 1594 sctx.rs = rs; 1595 sctx.pv = rtmp2[i+1]; 1596 ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr); 1597 if(sctx.newshift) break; 1598 pc2 = b->a + bdiag[i+1]; 1599 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1600 1601 /* Now take care of 2nd column of diagonal 3x3 block. */ 1602 pc3 = rtmp3 + i+1; 1603 if (*pc3 != 0.0){ 1604 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1605 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1606 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1607 for (j=0; j<nz; j++) { 1608 col = pj[j]; 1609 rtmp3[col] -= mul3 * rtmp2[col]; 1610 } 1611 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1612 } 1613 1614 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1615 rs = 0.0; 1616 /* L part */ 1617 pc3 = b->a + bi[i+2]; 1618 pj = b->j + bi[i+2] ; 1619 nz = bi[i+3] - bi[i+2]; 1620 for (j=0; j<nz; j++) { 1621 col = pj[j]; 1622 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1623 } 1624 /* U part */ 1625 pc3 = b->a + bdiag[i+3]+1; 1626 pj = b->j + bdiag[i+3]+1; 1627 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1628 for (j=0; j<nz; j++) { 1629 col = pj[j]; 1630 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1631 } 1632 1633 sctx.rs = rs; 1634 sctx.pv = rtmp3[i+2]; 1635 ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr); 1636 if(sctx.newshift) break; 1637 pc3 = b->a + bdiag[i+2]; 1638 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1639 break; 1640 case 4: 1641 /*----------*/ 1642 /* zero rtmp */ 1643 /* L part */ 1644 nz = bi[i+1] - bi[i]; 1645 bjtmp = bj + bi[i]; 1646 for (j=0; j<nz; j++) { 1647 col = bjtmp[j]; 1648 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1649 } 1650 1651 /* U part */ 1652 nz = bdiag[i]-bdiag[i+1]; 1653 bjtmp = bj + bdiag[i+1]+1; 1654 for (j=0; j<nz; j++) { 1655 col = bjtmp[j]; 1656 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1657 } 1658 1659 /* load in initial (unfactored row) */ 1660 nz = ai[r[i]+1] - ai[r[i]]; 1661 ajtmp = aj + ai[r[i]]; 1662 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1663 for (j=0; j<nz; j++) { 1664 col = ics[ajtmp[j]]; 1665 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1666 } 1667 /* ZeropivotApply(): shift the diagonal of the matrix */ 1668 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1669 1670 /* elimination */ 1671 bjtmp = bj + bi[i]; 1672 row = *bjtmp++; /* pivot row */ 1673 nzL = bi[i+1] - bi[i]; 1674 for(k=0; k < nzL;k++) { 1675 pc1 = rtmp1 + row; 1676 pc2 = rtmp2 + row; 1677 pc3 = rtmp3 + row; 1678 pc4 = rtmp4 + row; 1679 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1680 pv = b->a + bdiag[row]; 1681 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1682 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1683 1684 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1685 pv = b->a + bdiag[row+1]+1; 1686 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1687 for (j=0; j<nz; j++){ 1688 col = pj[j]; 1689 rtmp1[col] -= mul1 * pv[j]; 1690 rtmp2[col] -= mul2 * pv[j]; 1691 rtmp3[col] -= mul3 * pv[j]; 1692 rtmp4[col] -= mul4 * pv[j]; 1693 } 1694 ierr = PetscLogFlops(8*nz);CHKERRQ(ierr); 1695 } 1696 row = *bjtmp++; 1697 } 1698 1699 /* finished row i; check zero pivot, then stick row i into b->a */ 1700 rs = 0.0; 1701 /* L part */ 1702 pc1 = b->a + bi[i]; 1703 pj = b->j + bi[i] ; 1704 nz = bi[i+1] - bi[i]; 1705 for (j=0; j<nz; j++) { 1706 col = pj[j]; 1707 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1708 } 1709 /* U part */ 1710 pc1 = b->a + bdiag[i+1]+1; 1711 pj = b->j + bdiag[i+1]+1; 1712 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1713 for (j=0; j<nz; j++) { 1714 col = pj[j]; 1715 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1716 } 1717 1718 sctx.rs = rs; 1719 sctx.pv = rtmp1[i]; 1720 ierr = MatPivotCheck(info,&sctx,i);CHKERRQ(ierr); 1721 if(sctx.newshift) break; 1722 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1723 *pc1 = 1.0/sctx.pv; 1724 1725 /* Now take care of 1st column of diagonal 4x4 block. */ 1726 pc2 = rtmp2 + i; 1727 pc3 = rtmp3 + i; 1728 pc4 = rtmp4 + i; 1729 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){ 1730 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1731 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1732 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1733 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1734 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1735 for (j=0; j<nz; j++) { 1736 col = pj[j]; 1737 rtmp2[col] -= mul2 * rtmp1[col]; 1738 rtmp3[col] -= mul3 * rtmp1[col]; 1739 rtmp4[col] -= mul4 * rtmp1[col]; 1740 } 1741 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1742 } 1743 1744 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1745 rs = 0.0; 1746 /* L part */ 1747 pc2 = b->a + bi[i+1]; 1748 pj = b->j + bi[i+1] ; 1749 nz = bi[i+2] - bi[i+1]; 1750 for (j=0; j<nz; j++) { 1751 col = pj[j]; 1752 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1753 } 1754 /* U part */ 1755 pc2 = b->a + bdiag[i+2]+1; 1756 pj = b->j + bdiag[i+2]+1; 1757 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1758 for (j=0; j<nz; j++) { 1759 col = pj[j]; 1760 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1761 } 1762 1763 sctx.rs = rs; 1764 sctx.pv = rtmp2[i+1]; 1765 ierr = MatPivotCheck(info,&sctx,i+1);CHKERRQ(ierr); 1766 if(sctx.newshift) break; 1767 pc2 = b->a + bdiag[i+1]; 1768 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1769 1770 /* Now take care of 2nd column of diagonal 4x4 block. */ 1771 pc3 = rtmp3 + i+1; 1772 pc4 = rtmp4 + i+1; 1773 if (*pc3 != 0.0 || *pc4 != 0.0){ 1774 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1775 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1776 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1777 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1778 for (j=0; j<nz; j++) { 1779 col = pj[j]; 1780 rtmp3[col] -= mul3 * rtmp2[col]; 1781 rtmp4[col] -= mul4 * rtmp2[col]; 1782 } 1783 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1784 } 1785 1786 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1787 rs = 0.0; 1788 /* L part */ 1789 pc3 = b->a + bi[i+2]; 1790 pj = b->j + bi[i+2] ; 1791 nz = bi[i+3] - bi[i+2]; 1792 for (j=0; j<nz; j++) { 1793 col = pj[j]; 1794 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1795 } 1796 /* U part */ 1797 pc3 = b->a + bdiag[i+3]+1; 1798 pj = b->j + bdiag[i+3]+1; 1799 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1800 for (j=0; j<nz; j++) { 1801 col = pj[j]; 1802 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1803 } 1804 1805 sctx.rs = rs; 1806 sctx.pv = rtmp3[i+2]; 1807 ierr = MatPivotCheck(info,&sctx,i+2);CHKERRQ(ierr); 1808 if(sctx.newshift) break; 1809 pc3 = b->a + bdiag[i+2]; 1810 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1811 1812 /* Now take care of 3rd column of diagonal 4x4 block. */ 1813 pc4 = rtmp4 + i+2; 1814 if (*pc4 != 0.0){ 1815 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1816 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1817 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1818 for (j=0; j<nz; j++) { 1819 col = pj[j]; 1820 rtmp4[col] -= mul4 * rtmp3[col]; 1821 } 1822 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1823 } 1824 1825 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1826 rs = 0.0; 1827 /* L part */ 1828 pc4 = b->a + bi[i+3]; 1829 pj = b->j + bi[i+3] ; 1830 nz = bi[i+4] - bi[i+3]; 1831 for (j=0; j<nz; j++) { 1832 col = pj[j]; 1833 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1834 } 1835 /* U part */ 1836 pc4 = b->a + bdiag[i+4]+1; 1837 pj = b->j + bdiag[i+4]+1; 1838 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1839 for (j=0; j<nz; j++) { 1840 col = pj[j]; 1841 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1842 } 1843 1844 sctx.rs = rs; 1845 sctx.pv = rtmp4[i+3]; 1846 ierr = MatPivotCheck(info,&sctx,i+3);CHKERRQ(ierr); 1847 if(sctx.newshift) break; 1848 pc4 = b->a + bdiag[i+3]; 1849 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1850 break; 1851 1852 default: 1853 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1854 } 1855 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1856 i += nodesz; /* Update the row */ 1857 } 1858 1859 /* MatPivotRefine() */ 1860 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1861 /* 1862 * if no shift in this attempt & shifting & started shifting & can refine, 1863 * then try lower shift 1864 */ 1865 sctx.shift_hi = sctx.shift_fraction; 1866 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1867 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1868 sctx.newshift = PETSC_TRUE; 1869 sctx.nshift++; 1870 } 1871 } while (sctx.newshift); 1872 1873 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1874 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1875 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1876 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1877 1878 C->ops->solve = MatSolve_SeqAIJ; 1879 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1880 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1881 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1882 C->ops->matsolve = MatMatSolve_SeqAIJ; 1883 C->assembled = PETSC_TRUE; 1884 C->preallocated = PETSC_TRUE; 1885 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1886 1887 /* MatShiftView(A,info,&sctx) */ 1888 if (sctx.nshift){ 1889 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1890 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1891 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1892 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1893 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){ 1894 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1895 } 1896 } 1897 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1898 PetscFunctionReturn(0); 1899 } 1900 1901 #undef __FUNCT__ 1902 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1903 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1904 { 1905 Mat C = B; 1906 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1907 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1908 PetscErrorCode ierr; 1909 const PetscInt *r,*ic,*c,*ics; 1910 PetscInt n = A->rmap->n,*bi = b->i; 1911 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1912 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1913 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1914 PetscScalar mul1,mul2,mul3,tmp; 1915 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1916 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1917 PetscReal rs=0.0; 1918 FactorShiftCtx sctx; 1919 1920 PetscFunctionBegin; 1921 sctx.shift_top = 0; 1922 sctx.nshift_max = 0; 1923 sctx.shift_lo = 0; 1924 sctx.shift_hi = 0; 1925 sctx.shift_fraction = 0; 1926 1927 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1928 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1929 sctx.shift_top = 0; 1930 for (i=0; i<n; i++) { 1931 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1932 rs = 0.0; 1933 ajtmp = aj + ai[i]; 1934 rtmp1 = aa + ai[i]; 1935 nz = ai[i+1] - ai[i]; 1936 for (j=0; j<nz; j++){ 1937 if (*ajtmp != i){ 1938 rs += PetscAbsScalar(*rtmp1++); 1939 } else { 1940 rs -= PetscRealPart(*rtmp1++); 1941 } 1942 ajtmp++; 1943 } 1944 if (rs>sctx.shift_top) sctx.shift_top = rs; 1945 } 1946 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1947 sctx.shift_top *= 1.1; 1948 sctx.nshift_max = 5; 1949 sctx.shift_lo = 0.; 1950 sctx.shift_hi = 1.; 1951 } 1952 sctx.shift_amount = 0; 1953 sctx.nshift = 0; 1954 1955 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1956 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1957 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1958 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1959 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1960 ics = ic ; 1961 rtmp22 = rtmp11 + n; 1962 rtmp33 = rtmp22 + n; 1963 1964 node_max = a->inode.node_count; 1965 ns = a->inode.size; 1966 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1967 1968 /* If max inode size > 3, split it into two inodes.*/ 1969 /* also map the inode sizes according to the ordering */ 1970 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1971 for (i=0,j=0; i<node_max; ++i,++j){ 1972 if (ns[i]>3) { 1973 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1974 ++j; 1975 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1976 } else { 1977 tmp_vec1[j] = ns[i]; 1978 } 1979 } 1980 /* Use the correct node_max */ 1981 node_max = j; 1982 1983 /* Now reorder the inode info based on mat re-ordering info */ 1984 /* First create a row -> inode_size_array_index map */ 1985 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1986 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1987 for (i=0,row=0; i<node_max; i++) { 1988 nodesz = tmp_vec1[i]; 1989 for (j=0; j<nodesz; j++,row++) { 1990 nsmap[row] = i; 1991 } 1992 } 1993 /* Using nsmap, create a reordered ns structure */ 1994 for (i=0,j=0; i< node_max; i++) { 1995 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1996 tmp_vec2[i] = nodesz; 1997 j += nodesz; 1998 } 1999 ierr = PetscFree(nsmap);CHKERRQ(ierr); 2000 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 2001 /* Now use the correct ns */ 2002 ns = tmp_vec2; 2003 2004 do { 2005 sctx.newshift = PETSC_FALSE; 2006 /* Now loop over each block-row, and do the factorization */ 2007 for (i=0,row=0; i<node_max; i++) { 2008 nodesz = ns[i]; 2009 nz = bi[row+1] - bi[row]; 2010 bjtmp = bj + bi[row]; 2011 2012 switch (nodesz){ 2013 case 1: 2014 for (j=0; j<nz; j++){ 2015 idx = bjtmp[j]; 2016 rtmp11[idx] = 0.0; 2017 } 2018 2019 /* load in initial (unfactored row) */ 2020 idx = r[row]; 2021 nz_tmp = ai[idx+1] - ai[idx]; 2022 ajtmp = aj + ai[idx]; 2023 v1 = aa + ai[idx]; 2024 2025 for (j=0; j<nz_tmp; j++) { 2026 idx = ics[ajtmp[j]]; 2027 rtmp11[idx] = v1[j]; 2028 } 2029 rtmp11[ics[r[row]]] += sctx.shift_amount; 2030 2031 prow = *bjtmp++ ; 2032 while (prow < row) { 2033 pc1 = rtmp11 + prow; 2034 if (*pc1 != 0.0){ 2035 pv = ba + bd[prow]; 2036 pj = nbj + bd[prow]; 2037 mul1 = *pc1 * *pv++; 2038 *pc1 = mul1; 2039 nz_tmp = bi[prow+1] - bd[prow] - 1; 2040 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2041 for (j=0; j<nz_tmp; j++) { 2042 tmp = pv[j]; 2043 idx = pj[j]; 2044 rtmp11[idx] -= mul1 * tmp; 2045 } 2046 } 2047 prow = *bjtmp++ ; 2048 } 2049 pj = bj + bi[row]; 2050 pc1 = ba + bi[row]; 2051 2052 sctx.pv = rtmp11[row]; 2053 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2054 rs = 0.0; 2055 for (j=0; j<nz; j++) { 2056 idx = pj[j]; 2057 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2058 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2059 } 2060 sctx.rs = rs; 2061 ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr); 2062 if (sctx.newshift) goto endofwhile; 2063 break; 2064 2065 case 2: 2066 for (j=0; j<nz; j++) { 2067 idx = bjtmp[j]; 2068 rtmp11[idx] = 0.0; 2069 rtmp22[idx] = 0.0; 2070 } 2071 2072 /* load in initial (unfactored row) */ 2073 idx = r[row]; 2074 nz_tmp = ai[idx+1] - ai[idx]; 2075 ajtmp = aj + ai[idx]; 2076 v1 = aa + ai[idx]; 2077 v2 = aa + ai[idx+1]; 2078 for (j=0; j<nz_tmp; j++) { 2079 idx = ics[ajtmp[j]]; 2080 rtmp11[idx] = v1[j]; 2081 rtmp22[idx] = v2[j]; 2082 } 2083 rtmp11[ics[r[row]]] += sctx.shift_amount; 2084 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2085 2086 prow = *bjtmp++ ; 2087 while (prow < row) { 2088 pc1 = rtmp11 + prow; 2089 pc2 = rtmp22 + prow; 2090 if (*pc1 != 0.0 || *pc2 != 0.0){ 2091 pv = ba + bd[prow]; 2092 pj = nbj + bd[prow]; 2093 mul1 = *pc1 * *pv; 2094 mul2 = *pc2 * *pv; 2095 ++pv; 2096 *pc1 = mul1; 2097 *pc2 = mul2; 2098 2099 nz_tmp = bi[prow+1] - bd[prow] - 1; 2100 for (j=0; j<nz_tmp; j++) { 2101 tmp = pv[j]; 2102 idx = pj[j]; 2103 rtmp11[idx] -= mul1 * tmp; 2104 rtmp22[idx] -= mul2 * tmp; 2105 } 2106 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2107 } 2108 prow = *bjtmp++ ; 2109 } 2110 2111 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2112 pc1 = rtmp11 + prow; 2113 pc2 = rtmp22 + prow; 2114 2115 sctx.pv = *pc1; 2116 pj = bj + bi[prow]; 2117 rs = 0.0; 2118 for (j=0; j<nz; j++){ 2119 idx = pj[j]; 2120 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2121 } 2122 sctx.rs = rs; 2123 ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr); 2124 if (sctx.newshift) goto endofwhile; 2125 2126 if (*pc2 != 0.0){ 2127 pj = nbj + bd[prow]; 2128 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2129 *pc2 = mul2; 2130 nz_tmp = bi[prow+1] - bd[prow] - 1; 2131 for (j=0; j<nz_tmp; j++) { 2132 idx = pj[j] ; 2133 tmp = rtmp11[idx]; 2134 rtmp22[idx] -= mul2 * tmp; 2135 } 2136 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2137 } 2138 2139 pj = bj + bi[row]; 2140 pc1 = ba + bi[row]; 2141 pc2 = ba + bi[row+1]; 2142 2143 sctx.pv = rtmp22[row+1]; 2144 rs = 0.0; 2145 rtmp11[row] = 1.0/rtmp11[row]; 2146 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2147 /* copy row entries from dense representation to sparse */ 2148 for (j=0; j<nz; j++) { 2149 idx = pj[j]; 2150 pc1[j] = rtmp11[idx]; 2151 pc2[j] = rtmp22[idx]; 2152 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2153 } 2154 sctx.rs = rs; 2155 ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr); 2156 if (sctx.newshift) goto endofwhile; 2157 break; 2158 2159 case 3: 2160 for (j=0; j<nz; j++) { 2161 idx = bjtmp[j]; 2162 rtmp11[idx] = 0.0; 2163 rtmp22[idx] = 0.0; 2164 rtmp33[idx] = 0.0; 2165 } 2166 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2167 idx = r[row]; 2168 nz_tmp = ai[idx+1] - ai[idx]; 2169 ajtmp = aj + ai[idx]; 2170 v1 = aa + ai[idx]; 2171 v2 = aa + ai[idx+1]; 2172 v3 = aa + ai[idx+2]; 2173 for (j=0; j<nz_tmp; j++) { 2174 idx = ics[ajtmp[j]]; 2175 rtmp11[idx] = v1[j]; 2176 rtmp22[idx] = v2[j]; 2177 rtmp33[idx] = v3[j]; 2178 } 2179 rtmp11[ics[r[row]]] += sctx.shift_amount; 2180 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2181 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2182 2183 /* loop over all pivot row blocks above this row block */ 2184 prow = *bjtmp++ ; 2185 while (prow < row) { 2186 pc1 = rtmp11 + prow; 2187 pc2 = rtmp22 + prow; 2188 pc3 = rtmp33 + prow; 2189 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 2190 pv = ba + bd[prow]; 2191 pj = nbj + bd[prow]; 2192 mul1 = *pc1 * *pv; 2193 mul2 = *pc2 * *pv; 2194 mul3 = *pc3 * *pv; 2195 ++pv; 2196 *pc1 = mul1; 2197 *pc2 = mul2; 2198 *pc3 = mul3; 2199 2200 nz_tmp = bi[prow+1] - bd[prow] - 1; 2201 /* update this row based on pivot row */ 2202 for (j=0; j<nz_tmp; j++) { 2203 tmp = pv[j]; 2204 idx = pj[j]; 2205 rtmp11[idx] -= mul1 * tmp; 2206 rtmp22[idx] -= mul2 * tmp; 2207 rtmp33[idx] -= mul3 * tmp; 2208 } 2209 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 2210 } 2211 prow = *bjtmp++ ; 2212 } 2213 2214 /* Now take care of diagonal 3x3 block in this set of rows */ 2215 /* note: prow = row here */ 2216 pc1 = rtmp11 + prow; 2217 pc2 = rtmp22 + prow; 2218 pc3 = rtmp33 + prow; 2219 2220 sctx.pv = *pc1; 2221 pj = bj + bi[prow]; 2222 rs = 0.0; 2223 for (j=0; j<nz; j++){ 2224 idx = pj[j]; 2225 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2226 } 2227 sctx.rs = rs; 2228 ierr = MatPivotCheck(info,&sctx,row);CHKERRQ(ierr); 2229 if (sctx.newshift) goto endofwhile; 2230 2231 if (*pc2 != 0.0 || *pc3 != 0.0){ 2232 mul2 = (*pc2)/(*pc1); 2233 mul3 = (*pc3)/(*pc1); 2234 *pc2 = mul2; 2235 *pc3 = mul3; 2236 nz_tmp = bi[prow+1] - bd[prow] - 1; 2237 pj = nbj + bd[prow]; 2238 for (j=0; j<nz_tmp; j++) { 2239 idx = pj[j] ; 2240 tmp = rtmp11[idx]; 2241 rtmp22[idx] -= mul2 * tmp; 2242 rtmp33[idx] -= mul3 * tmp; 2243 } 2244 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2245 } 2246 ++prow; 2247 2248 pc2 = rtmp22 + prow; 2249 pc3 = rtmp33 + prow; 2250 sctx.pv = *pc2; 2251 pj = bj + bi[prow]; 2252 rs = 0.0; 2253 for (j=0; j<nz; j++){ 2254 idx = pj[j]; 2255 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2256 } 2257 sctx.rs = rs; 2258 ierr = MatPivotCheck(info,&sctx,row+1);CHKERRQ(ierr); 2259 if (sctx.newshift) goto endofwhile; 2260 2261 if (*pc3 != 0.0){ 2262 mul3 = (*pc3)/(*pc2); 2263 *pc3 = mul3; 2264 pj = nbj + bd[prow]; 2265 nz_tmp = bi[prow+1] - bd[prow] - 1; 2266 for (j=0; j<nz_tmp; j++) { 2267 idx = pj[j] ; 2268 tmp = rtmp22[idx]; 2269 rtmp33[idx] -= mul3 * tmp; 2270 } 2271 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2272 } 2273 2274 pj = bj + bi[row]; 2275 pc1 = ba + bi[row]; 2276 pc2 = ba + bi[row+1]; 2277 pc3 = ba + bi[row+2]; 2278 2279 sctx.pv = rtmp33[row+2]; 2280 rs = 0.0; 2281 rtmp11[row] = 1.0/rtmp11[row]; 2282 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2283 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2284 /* copy row entries from dense representation to sparse */ 2285 for (j=0; j<nz; j++) { 2286 idx = pj[j]; 2287 pc1[j] = rtmp11[idx]; 2288 pc2[j] = rtmp22[idx]; 2289 pc3[j] = rtmp33[idx]; 2290 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2291 } 2292 2293 sctx.rs = rs; 2294 ierr = MatPivotCheck(info,&sctx,row+2);CHKERRQ(ierr); 2295 if (sctx.newshift) goto endofwhile; 2296 break; 2297 2298 default: 2299 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2300 } 2301 row += nodesz; /* Update the row */ 2302 } 2303 endofwhile:; 2304 } while (sctx.newshift); 2305 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2306 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2307 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2308 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2309 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2310 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2311 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2312 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2313 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2314 C->assembled = PETSC_TRUE; 2315 C->preallocated = PETSC_TRUE; 2316 if (sctx.nshift) { 2317 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2318 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2319 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2320 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2321 } 2322 } 2323 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2324 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2325 PetscFunctionReturn(0); 2326 } 2327 2328 2329 /* ----------------------------------------------------------- */ 2330 #undef __FUNCT__ 2331 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2332 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2333 { 2334 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2335 IS iscol = a->col,isrow = a->row; 2336 PetscErrorCode ierr; 2337 const PetscInt *r,*c,*rout,*cout; 2338 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2339 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2340 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2341 PetscScalar sum1,sum2,sum3,sum4,sum5; 2342 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2343 const PetscScalar *b; 2344 2345 PetscFunctionBegin; 2346 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2347 node_max = a->inode.node_count; 2348 ns = a->inode.size; /* Node Size array */ 2349 2350 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2351 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2352 tmp = a->solve_work; 2353 2354 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2355 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2356 2357 /* forward solve the lower triangular */ 2358 tmps = tmp ; 2359 aa = a_a ; 2360 aj = a_j ; 2361 ad = a->diag; 2362 2363 for (i = 0,row = 0; i< node_max; ++i){ 2364 nsz = ns[i]; 2365 aii = ai[row]; 2366 v1 = aa + aii; 2367 vi = aj + aii; 2368 nz = ai[row+1]- ai[row]; 2369 2370 if (i < node_max-1) { 2371 /* Prefetch the indices for the next block */ 2372 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2373 /* Prefetch the data for the next block */ 2374 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2375 } 2376 2377 switch (nsz){ /* Each loop in 'case' is unrolled */ 2378 case 1 : 2379 sum1 = b[r[row]]; 2380 for(j=0; j<nz-1; j+=2){ 2381 i0 = vi[j]; 2382 i1 = vi[j+1]; 2383 tmp0 = tmps[i0]; 2384 tmp1 = tmps[i1]; 2385 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2386 } 2387 if(j == nz-1){ 2388 tmp0 = tmps[vi[j]]; 2389 sum1 -= v1[j]*tmp0; 2390 } 2391 tmp[row++]=sum1; 2392 break; 2393 case 2: 2394 sum1 = b[r[row]]; 2395 sum2 = b[r[row+1]]; 2396 v2 = aa + ai[row+1]; 2397 2398 for(j=0; j<nz-1; j+=2){ 2399 i0 = vi[j]; 2400 i1 = vi[j+1]; 2401 tmp0 = tmps[i0]; 2402 tmp1 = tmps[i1]; 2403 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2404 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2405 } 2406 if(j == nz-1){ 2407 tmp0 = tmps[vi[j]]; 2408 sum1 -= v1[j] *tmp0; 2409 sum2 -= v2[j] *tmp0; 2410 } 2411 sum2 -= v2[nz] * sum1; 2412 tmp[row ++]=sum1; 2413 tmp[row ++]=sum2; 2414 break; 2415 case 3: 2416 sum1 = b[r[row]]; 2417 sum2 = b[r[row+1]]; 2418 sum3 = b[r[row+2]]; 2419 v2 = aa + ai[row+1]; 2420 v3 = aa + ai[row+2]; 2421 2422 for (j=0; j<nz-1; j+=2){ 2423 i0 = vi[j]; 2424 i1 = vi[j+1]; 2425 tmp0 = tmps[i0]; 2426 tmp1 = tmps[i1]; 2427 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2428 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2429 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2430 } 2431 if (j == nz-1){ 2432 tmp0 = tmps[vi[j]]; 2433 sum1 -= v1[j] *tmp0; 2434 sum2 -= v2[j] *tmp0; 2435 sum3 -= v3[j] *tmp0; 2436 } 2437 sum2 -= v2[nz] * sum1; 2438 sum3 -= v3[nz] * sum1; 2439 sum3 -= v3[nz+1] * sum2; 2440 tmp[row ++]=sum1; 2441 tmp[row ++]=sum2; 2442 tmp[row ++]=sum3; 2443 break; 2444 2445 case 4: 2446 sum1 = b[r[row]]; 2447 sum2 = b[r[row+1]]; 2448 sum3 = b[r[row+2]]; 2449 sum4 = b[r[row+3]]; 2450 v2 = aa + ai[row+1]; 2451 v3 = aa + ai[row+2]; 2452 v4 = aa + ai[row+3]; 2453 2454 for (j=0; j<nz-1; j+=2){ 2455 i0 = vi[j]; 2456 i1 = vi[j+1]; 2457 tmp0 = tmps[i0]; 2458 tmp1 = tmps[i1]; 2459 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2460 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2461 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2462 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2463 } 2464 if (j == nz-1){ 2465 tmp0 = tmps[vi[j]]; 2466 sum1 -= v1[j] *tmp0; 2467 sum2 -= v2[j] *tmp0; 2468 sum3 -= v3[j] *tmp0; 2469 sum4 -= v4[j] *tmp0; 2470 } 2471 sum2 -= v2[nz] * sum1; 2472 sum3 -= v3[nz] * sum1; 2473 sum4 -= v4[nz] * sum1; 2474 sum3 -= v3[nz+1] * sum2; 2475 sum4 -= v4[nz+1] * sum2; 2476 sum4 -= v4[nz+2] * sum3; 2477 2478 tmp[row ++]=sum1; 2479 tmp[row ++]=sum2; 2480 tmp[row ++]=sum3; 2481 tmp[row ++]=sum4; 2482 break; 2483 case 5: 2484 sum1 = b[r[row]]; 2485 sum2 = b[r[row+1]]; 2486 sum3 = b[r[row+2]]; 2487 sum4 = b[r[row+3]]; 2488 sum5 = b[r[row+4]]; 2489 v2 = aa + ai[row+1]; 2490 v3 = aa + ai[row+2]; 2491 v4 = aa + ai[row+3]; 2492 v5 = aa + ai[row+4]; 2493 2494 for (j=0; j<nz-1; j+=2){ 2495 i0 = vi[j]; 2496 i1 = vi[j+1]; 2497 tmp0 = tmps[i0]; 2498 tmp1 = tmps[i1]; 2499 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2500 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2501 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2502 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2503 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2504 } 2505 if (j == nz-1){ 2506 tmp0 = tmps[vi[j]]; 2507 sum1 -= v1[j] *tmp0; 2508 sum2 -= v2[j] *tmp0; 2509 sum3 -= v3[j] *tmp0; 2510 sum4 -= v4[j] *tmp0; 2511 sum5 -= v5[j] *tmp0; 2512 } 2513 2514 sum2 -= v2[nz] * sum1; 2515 sum3 -= v3[nz] * sum1; 2516 sum4 -= v4[nz] * sum1; 2517 sum5 -= v5[nz] * sum1; 2518 sum3 -= v3[nz+1] * sum2; 2519 sum4 -= v4[nz+1] * sum2; 2520 sum5 -= v5[nz+1] * sum2; 2521 sum4 -= v4[nz+2] * sum3; 2522 sum5 -= v5[nz+2] * sum3; 2523 sum5 -= v5[nz+3] * sum4; 2524 2525 tmp[row ++]=sum1; 2526 tmp[row ++]=sum2; 2527 tmp[row ++]=sum3; 2528 tmp[row ++]=sum4; 2529 tmp[row ++]=sum5; 2530 break; 2531 default: 2532 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2533 } 2534 } 2535 /* backward solve the upper triangular */ 2536 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2537 nsz = ns[i]; 2538 aii = ad[row+1] + 1; 2539 v1 = aa + aii; 2540 vi = aj + aii; 2541 nz = ad[row]- ad[row+1] - 1; 2542 2543 if (i > 0) { 2544 /* Prefetch the indices for the next block */ 2545 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2546 /* Prefetch the data for the next block */ 2547 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2548 } 2549 2550 switch (nsz){ /* Each loop in 'case' is unrolled */ 2551 case 1 : 2552 sum1 = tmp[row]; 2553 2554 for(j=0 ; j<nz-1; j+=2){ 2555 i0 = vi[j]; 2556 i1 = vi[j+1]; 2557 tmp0 = tmps[i0]; 2558 tmp1 = tmps[i1]; 2559 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2560 } 2561 if (j == nz-1){ 2562 tmp0 = tmps[vi[j]]; 2563 sum1 -= v1[j]*tmp0; 2564 } 2565 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2566 break; 2567 case 2 : 2568 sum1 = tmp[row]; 2569 sum2 = tmp[row-1]; 2570 v2 = aa + ad[row] + 1; 2571 for (j=0 ; j<nz-1; j+=2){ 2572 i0 = vi[j]; 2573 i1 = vi[j+1]; 2574 tmp0 = tmps[i0]; 2575 tmp1 = tmps[i1]; 2576 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2577 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2578 } 2579 if (j == nz-1){ 2580 tmp0 = tmps[vi[j]]; 2581 sum1 -= v1[j]* tmp0; 2582 sum2 -= v2[j+1]* tmp0; 2583 } 2584 2585 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2586 sum2 -= v2[0] * tmp0; 2587 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2588 break; 2589 case 3 : 2590 sum1 = tmp[row]; 2591 sum2 = tmp[row -1]; 2592 sum3 = tmp[row -2]; 2593 v2 = aa + ad[row] + 1; 2594 v3 = aa + ad[row -1] + 1; 2595 for (j=0 ; j<nz-1; j+=2){ 2596 i0 = vi[j]; 2597 i1 = vi[j+1]; 2598 tmp0 = tmps[i0]; 2599 tmp1 = tmps[i1]; 2600 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2601 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2602 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2603 } 2604 if (j== nz-1){ 2605 tmp0 = tmps[vi[j]]; 2606 sum1 -= v1[j] * tmp0; 2607 sum2 -= v2[j+1] * tmp0; 2608 sum3 -= v3[j+2] * tmp0; 2609 } 2610 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2611 sum2 -= v2[0]* tmp0; 2612 sum3 -= v3[1] * tmp0; 2613 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2614 sum3 -= v3[0]* tmp0; 2615 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2616 2617 break; 2618 case 4 : 2619 sum1 = tmp[row]; 2620 sum2 = tmp[row -1]; 2621 sum3 = tmp[row -2]; 2622 sum4 = tmp[row -3]; 2623 v2 = aa + ad[row]+1; 2624 v3 = aa + ad[row -1]+1; 2625 v4 = aa + ad[row -2]+1; 2626 2627 for (j=0 ; j<nz-1; j+=2){ 2628 i0 = vi[j]; 2629 i1 = vi[j+1]; 2630 tmp0 = tmps[i0]; 2631 tmp1 = tmps[i1]; 2632 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2633 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2634 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2635 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2636 } 2637 if (j== nz-1){ 2638 tmp0 = tmps[vi[j]]; 2639 sum1 -= v1[j] * tmp0; 2640 sum2 -= v2[j+1] * tmp0; 2641 sum3 -= v3[j+2] * tmp0; 2642 sum4 -= v4[j+3] * tmp0; 2643 } 2644 2645 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2646 sum2 -= v2[0] * tmp0; 2647 sum3 -= v3[1] * tmp0; 2648 sum4 -= v4[2] * tmp0; 2649 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2650 sum3 -= v3[0] * tmp0; 2651 sum4 -= v4[1] * tmp0; 2652 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2653 sum4 -= v4[0] * tmp0; 2654 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2655 break; 2656 case 5 : 2657 sum1 = tmp[row]; 2658 sum2 = tmp[row -1]; 2659 sum3 = tmp[row -2]; 2660 sum4 = tmp[row -3]; 2661 sum5 = tmp[row -4]; 2662 v2 = aa + ad[row]+1; 2663 v3 = aa + ad[row -1]+1; 2664 v4 = aa + ad[row -2]+1; 2665 v5 = aa + ad[row -3]+1; 2666 for (j=0 ; j<nz-1; j+=2){ 2667 i0 = vi[j]; 2668 i1 = vi[j+1]; 2669 tmp0 = tmps[i0]; 2670 tmp1 = tmps[i1]; 2671 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2672 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2673 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2674 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2675 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2676 } 2677 if (j==nz-1){ 2678 tmp0 = tmps[vi[j]]; 2679 sum1 -= v1[j] * tmp0; 2680 sum2 -= v2[j+1] * tmp0; 2681 sum3 -= v3[j+2] * tmp0; 2682 sum4 -= v4[j+3] * tmp0; 2683 sum5 -= v5[j+4] * tmp0; 2684 } 2685 2686 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2687 sum2 -= v2[0] * tmp0; 2688 sum3 -= v3[1] * tmp0; 2689 sum4 -= v4[2] * tmp0; 2690 sum5 -= v5[3] * tmp0; 2691 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2692 sum3 -= v3[0] * tmp0; 2693 sum4 -= v4[1] * tmp0; 2694 sum5 -= v5[2] * tmp0; 2695 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2696 sum4 -= v4[0] * tmp0; 2697 sum5 -= v5[1] * tmp0; 2698 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2699 sum5 -= v5[0] * tmp0; 2700 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2701 break; 2702 default: 2703 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2704 } 2705 } 2706 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2707 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2708 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2709 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2710 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2711 PetscFunctionReturn(0); 2712 } 2713 2714 2715 /* 2716 Makes a longer coloring[] array and calls the usual code with that 2717 */ 2718 #undef __FUNCT__ 2719 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2720 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2721 { 2722 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2723 PetscErrorCode ierr; 2724 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2725 PetscInt *colorused,i; 2726 ISColoringValue *newcolor; 2727 2728 PetscFunctionBegin; 2729 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2730 /* loop over inodes, marking a color for each column*/ 2731 row = 0; 2732 for (i=0; i<m; i++){ 2733 for (j=0; j<ns[i]; j++) { 2734 newcolor[row++] = coloring[i] + j*ncolors; 2735 } 2736 } 2737 2738 /* eliminate unneeded colors */ 2739 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2740 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2741 for (i=0; i<n; i++) { 2742 colorused[newcolor[i]] = 1; 2743 } 2744 2745 for (i=1; i<5*ncolors; i++) { 2746 colorused[i] += colorused[i-1]; 2747 } 2748 ncolors = colorused[5*ncolors-1]; 2749 for (i=0; i<n; i++) { 2750 newcolor[i] = colorused[newcolor[i]]-1; 2751 } 2752 ierr = PetscFree(colorused);CHKERRQ(ierr); 2753 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2754 ierr = PetscFree(coloring);CHKERRQ(ierr); 2755 PetscFunctionReturn(0); 2756 } 2757 2758 #include "../src/mat/blockinvert.h" 2759 2760 #undef __FUNCT__ 2761 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2762 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2763 { 2764 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2765 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2766 MatScalar *ibdiag,*bdiag,work[25]; 2767 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2768 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2769 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2770 PetscErrorCode ierr; 2771 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2772 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2773 2774 PetscFunctionBegin; 2775 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2776 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2777 if (its > 1) { 2778 /* switch to non-inode version */ 2779 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2780 PetscFunctionReturn(0); 2781 } 2782 2783 if (!a->inode.ibdiagvalid) { 2784 if (!a->inode.ibdiag) { 2785 /* calculate space needed for diagonal blocks */ 2786 for (i=0; i<m; i++) { 2787 cnt += sizes[i]*sizes[i]; 2788 } 2789 a->inode.bdiagsize = cnt; 2790 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2791 } 2792 2793 /* copy over the diagonal blocks and invert them */ 2794 ibdiag = a->inode.ibdiag; 2795 bdiag = a->inode.bdiag; 2796 cnt = 0; 2797 for (i=0, row = 0; i<m; i++) { 2798 for (j=0; j<sizes[i]; j++) { 2799 for (k=0; k<sizes[i]; k++) { 2800 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2801 } 2802 } 2803 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2804 2805 switch(sizes[i]) { 2806 case 1: 2807 /* Create matrix data structure */ 2808 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2809 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2810 break; 2811 case 2: 2812 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2813 break; 2814 case 3: 2815 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2816 break; 2817 case 4: 2818 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2819 break; 2820 case 5: 2821 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2822 break; 2823 default: 2824 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2825 } 2826 cnt += sizes[i]*sizes[i]; 2827 row += sizes[i]; 2828 } 2829 a->inode.ibdiagvalid = PETSC_TRUE; 2830 } 2831 ibdiag = a->inode.ibdiag; 2832 bdiag = a->inode.bdiag; 2833 2834 2835 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2836 if (flag & SOR_ZERO_INITIAL_GUESS) { 2837 const PetscScalar *b; 2838 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2839 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2840 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2841 2842 for (i=0, row=0; i<m; i++) { 2843 sz = diag[row] - ii[row]; 2844 v1 = a->a + ii[row]; 2845 idx = a->j + ii[row]; 2846 2847 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2848 switch (sizes[i]){ 2849 case 1: 2850 2851 sum1 = b[row]; 2852 for(n = 0; n<sz-1; n+=2) { 2853 i1 = idx[0]; 2854 i2 = idx[1]; 2855 idx += 2; 2856 tmp0 = x[i1]; 2857 tmp1 = x[i2]; 2858 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2859 } 2860 2861 if (n == sz-1){ 2862 tmp0 = x[*idx]; 2863 sum1 -= *v1 * tmp0; 2864 } 2865 x[row++] = sum1*(*ibdiag++); 2866 break; 2867 case 2: 2868 v2 = a->a + ii[row+1]; 2869 sum1 = b[row]; 2870 sum2 = b[row+1]; 2871 for(n = 0; n<sz-1; n+=2) { 2872 i1 = idx[0]; 2873 i2 = idx[1]; 2874 idx += 2; 2875 tmp0 = x[i1]; 2876 tmp1 = x[i2]; 2877 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2878 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2879 } 2880 2881 if (n == sz-1){ 2882 tmp0 = x[*idx]; 2883 sum1 -= v1[0] * tmp0; 2884 sum2 -= v2[0] * tmp0; 2885 } 2886 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2887 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2888 ibdiag += 4; 2889 break; 2890 case 3: 2891 v2 = a->a + ii[row+1]; 2892 v3 = a->a + ii[row+2]; 2893 sum1 = b[row]; 2894 sum2 = b[row+1]; 2895 sum3 = b[row+2]; 2896 for(n = 0; n<sz-1; n+=2) { 2897 i1 = idx[0]; 2898 i2 = idx[1]; 2899 idx += 2; 2900 tmp0 = x[i1]; 2901 tmp1 = x[i2]; 2902 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2903 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2904 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2905 } 2906 2907 if (n == sz-1){ 2908 tmp0 = x[*idx]; 2909 sum1 -= v1[0] * tmp0; 2910 sum2 -= v2[0] * tmp0; 2911 sum3 -= v3[0] * tmp0; 2912 } 2913 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2914 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2915 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2916 ibdiag += 9; 2917 break; 2918 case 4: 2919 v2 = a->a + ii[row+1]; 2920 v3 = a->a + ii[row+2]; 2921 v4 = a->a + ii[row+3]; 2922 sum1 = b[row]; 2923 sum2 = b[row+1]; 2924 sum3 = b[row+2]; 2925 sum4 = b[row+3]; 2926 for(n = 0; n<sz-1; n+=2) { 2927 i1 = idx[0]; 2928 i2 = idx[1]; 2929 idx += 2; 2930 tmp0 = x[i1]; 2931 tmp1 = x[i2]; 2932 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2933 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2934 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2935 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2936 } 2937 2938 if (n == sz-1){ 2939 tmp0 = x[*idx]; 2940 sum1 -= v1[0] * tmp0; 2941 sum2 -= v2[0] * tmp0; 2942 sum3 -= v3[0] * tmp0; 2943 sum4 -= v4[0] * tmp0; 2944 } 2945 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2946 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2947 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2948 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2949 ibdiag += 16; 2950 break; 2951 case 5: 2952 v2 = a->a + ii[row+1]; 2953 v3 = a->a + ii[row+2]; 2954 v4 = a->a + ii[row+3]; 2955 v5 = a->a + ii[row+4]; 2956 sum1 = b[row]; 2957 sum2 = b[row+1]; 2958 sum3 = b[row+2]; 2959 sum4 = b[row+3]; 2960 sum5 = b[row+4]; 2961 for(n = 0; n<sz-1; n+=2) { 2962 i1 = idx[0]; 2963 i2 = idx[1]; 2964 idx += 2; 2965 tmp0 = x[i1]; 2966 tmp1 = x[i2]; 2967 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2968 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2969 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2970 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2971 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2972 } 2973 2974 if (n == sz-1){ 2975 tmp0 = x[*idx]; 2976 sum1 -= v1[0] * tmp0; 2977 sum2 -= v2[0] * tmp0; 2978 sum3 -= v3[0] * tmp0; 2979 sum4 -= v4[0] * tmp0; 2980 sum5 -= v5[0] * tmp0; 2981 } 2982 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2983 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2984 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2985 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2986 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2987 ibdiag += 25; 2988 break; 2989 default: 2990 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2991 } 2992 } 2993 2994 xb = x; 2995 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2996 } else xb = b; 2997 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2998 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2999 cnt = 0; 3000 for (i=0, row=0; i<m; i++) { 3001 3002 switch (sizes[i]){ 3003 case 1: 3004 x[row++] *= bdiag[cnt++]; 3005 break; 3006 case 2: 3007 x1 = x[row]; x2 = x[row+1]; 3008 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3009 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3010 x[row++] = tmp1; 3011 x[row++] = tmp2; 3012 cnt += 4; 3013 break; 3014 case 3: 3015 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3016 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3017 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3018 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3019 x[row++] = tmp1; 3020 x[row++] = tmp2; 3021 x[row++] = tmp3; 3022 cnt += 9; 3023 break; 3024 case 4: 3025 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3026 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3027 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3028 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3029 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3030 x[row++] = tmp1; 3031 x[row++] = tmp2; 3032 x[row++] = tmp3; 3033 x[row++] = tmp4; 3034 cnt += 16; 3035 break; 3036 case 5: 3037 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3038 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3039 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3040 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3041 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3042 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3043 x[row++] = tmp1; 3044 x[row++] = tmp2; 3045 x[row++] = tmp3; 3046 x[row++] = tmp4; 3047 x[row++] = tmp5; 3048 cnt += 25; 3049 break; 3050 default: 3051 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3052 } 3053 } 3054 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3055 } 3056 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 3057 3058 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3059 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3060 ibdiag -= sizes[i]*sizes[i]; 3061 sz = ii[row+1] - diag[row] - 1; 3062 v1 = a->a + diag[row] + 1; 3063 idx = a->j + diag[row] + 1; 3064 3065 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3066 switch (sizes[i]){ 3067 case 1: 3068 3069 sum1 = xb[row]; 3070 for(n = 0; n<sz-1; n+=2) { 3071 i1 = idx[0]; 3072 i2 = idx[1]; 3073 idx += 2; 3074 tmp0 = x[i1]; 3075 tmp1 = x[i2]; 3076 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3077 } 3078 3079 if (n == sz-1){ 3080 tmp0 = x[*idx]; 3081 sum1 -= *v1*tmp0; 3082 } 3083 x[row--] = sum1*(*ibdiag); 3084 break; 3085 3086 case 2: 3087 3088 sum1 = xb[row]; 3089 sum2 = xb[row-1]; 3090 /* note that sum1 is associated with the second of the two rows */ 3091 v2 = a->a + diag[row-1] + 2; 3092 for(n = 0; n<sz-1; n+=2) { 3093 i1 = idx[0]; 3094 i2 = idx[1]; 3095 idx += 2; 3096 tmp0 = x[i1]; 3097 tmp1 = x[i2]; 3098 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3099 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3100 } 3101 3102 if (n == sz-1){ 3103 tmp0 = x[*idx]; 3104 sum1 -= *v1*tmp0; 3105 sum2 -= *v2*tmp0; 3106 } 3107 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3108 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3109 break; 3110 case 3: 3111 3112 sum1 = xb[row]; 3113 sum2 = xb[row-1]; 3114 sum3 = xb[row-2]; 3115 v2 = a->a + diag[row-1] + 2; 3116 v3 = a->a + diag[row-2] + 3; 3117 for(n = 0; n<sz-1; n+=2) { 3118 i1 = idx[0]; 3119 i2 = idx[1]; 3120 idx += 2; 3121 tmp0 = x[i1]; 3122 tmp1 = x[i2]; 3123 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3124 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3125 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3126 } 3127 3128 if (n == sz-1){ 3129 tmp0 = x[*idx]; 3130 sum1 -= *v1*tmp0; 3131 sum2 -= *v2*tmp0; 3132 sum3 -= *v3*tmp0; 3133 } 3134 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3135 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3136 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3137 break; 3138 case 4: 3139 3140 sum1 = xb[row]; 3141 sum2 = xb[row-1]; 3142 sum3 = xb[row-2]; 3143 sum4 = xb[row-3]; 3144 v2 = a->a + diag[row-1] + 2; 3145 v3 = a->a + diag[row-2] + 3; 3146 v4 = a->a + diag[row-3] + 4; 3147 for(n = 0; n<sz-1; n+=2) { 3148 i1 = idx[0]; 3149 i2 = idx[1]; 3150 idx += 2; 3151 tmp0 = x[i1]; 3152 tmp1 = x[i2]; 3153 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3154 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3155 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3156 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3157 } 3158 3159 if (n == sz-1){ 3160 tmp0 = x[*idx]; 3161 sum1 -= *v1*tmp0; 3162 sum2 -= *v2*tmp0; 3163 sum3 -= *v3*tmp0; 3164 sum4 -= *v4*tmp0; 3165 } 3166 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3167 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3168 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3169 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3170 break; 3171 case 5: 3172 3173 sum1 = xb[row]; 3174 sum2 = xb[row-1]; 3175 sum3 = xb[row-2]; 3176 sum4 = xb[row-3]; 3177 sum5 = xb[row-4]; 3178 v2 = a->a + diag[row-1] + 2; 3179 v3 = a->a + diag[row-2] + 3; 3180 v4 = a->a + diag[row-3] + 4; 3181 v5 = a->a + diag[row-4] + 5; 3182 for(n = 0; n<sz-1; n+=2) { 3183 i1 = idx[0]; 3184 i2 = idx[1]; 3185 idx += 2; 3186 tmp0 = x[i1]; 3187 tmp1 = x[i2]; 3188 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3189 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3190 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3191 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3192 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3193 } 3194 3195 if (n == sz-1){ 3196 tmp0 = x[*idx]; 3197 sum1 -= *v1*tmp0; 3198 sum2 -= *v2*tmp0; 3199 sum3 -= *v3*tmp0; 3200 sum4 -= *v4*tmp0; 3201 sum5 -= *v5*tmp0; 3202 } 3203 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3204 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3205 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3206 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3207 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3208 break; 3209 default: 3210 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3211 } 3212 } 3213 3214 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3215 } 3216 its--; 3217 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3218 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3219 } 3220 if (flag & SOR_EISENSTAT) { 3221 const PetscScalar *b; 3222 MatScalar *t = a->inode.ssor_work; 3223 3224 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3225 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3226 /* 3227 Apply (U + D)^-1 where D is now the block diagonal 3228 */ 3229 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3230 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3231 ibdiag -= sizes[i]*sizes[i]; 3232 sz = ii[row+1] - diag[row] - 1; 3233 v1 = a->a + diag[row] + 1; 3234 idx = a->j + diag[row] + 1; 3235 CHKMEMQ; 3236 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3237 switch (sizes[i]){ 3238 case 1: 3239 3240 sum1 = b[row]; 3241 for(n = 0; n<sz-1; n+=2) { 3242 i1 = idx[0]; 3243 i2 = idx[1]; 3244 idx += 2; 3245 tmp0 = x[i1]; 3246 tmp1 = x[i2]; 3247 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3248 } 3249 3250 if (n == sz-1){ 3251 tmp0 = x[*idx]; 3252 sum1 -= *v1*tmp0; 3253 } 3254 x[row] = sum1*(*ibdiag);row--; 3255 break; 3256 3257 case 2: 3258 3259 sum1 = b[row]; 3260 sum2 = b[row-1]; 3261 /* note that sum1 is associated with the second of the two rows */ 3262 v2 = a->a + diag[row-1] + 2; 3263 for(n = 0; n<sz-1; n+=2) { 3264 i1 = idx[0]; 3265 i2 = idx[1]; 3266 idx += 2; 3267 tmp0 = x[i1]; 3268 tmp1 = x[i2]; 3269 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3270 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3271 } 3272 3273 if (n == sz-1){ 3274 tmp0 = x[*idx]; 3275 sum1 -= *v1*tmp0; 3276 sum2 -= *v2*tmp0; 3277 } 3278 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3279 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3280 row -= 2; 3281 break; 3282 case 3: 3283 3284 sum1 = b[row]; 3285 sum2 = b[row-1]; 3286 sum3 = b[row-2]; 3287 v2 = a->a + diag[row-1] + 2; 3288 v3 = a->a + diag[row-2] + 3; 3289 for(n = 0; n<sz-1; n+=2) { 3290 i1 = idx[0]; 3291 i2 = idx[1]; 3292 idx += 2; 3293 tmp0 = x[i1]; 3294 tmp1 = x[i2]; 3295 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3296 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3297 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3298 } 3299 3300 if (n == sz-1){ 3301 tmp0 = x[*idx]; 3302 sum1 -= *v1*tmp0; 3303 sum2 -= *v2*tmp0; 3304 sum3 -= *v3*tmp0; 3305 } 3306 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3307 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3308 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3309 row -= 3; 3310 break; 3311 case 4: 3312 3313 sum1 = b[row]; 3314 sum2 = b[row-1]; 3315 sum3 = b[row-2]; 3316 sum4 = b[row-3]; 3317 v2 = a->a + diag[row-1] + 2; 3318 v3 = a->a + diag[row-2] + 3; 3319 v4 = a->a + diag[row-3] + 4; 3320 for(n = 0; n<sz-1; n+=2) { 3321 i1 = idx[0]; 3322 i2 = idx[1]; 3323 idx += 2; 3324 tmp0 = x[i1]; 3325 tmp1 = x[i2]; 3326 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3327 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3328 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3329 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3330 } 3331 3332 if (n == sz-1){ 3333 tmp0 = x[*idx]; 3334 sum1 -= *v1*tmp0; 3335 sum2 -= *v2*tmp0; 3336 sum3 -= *v3*tmp0; 3337 sum4 -= *v4*tmp0; 3338 } 3339 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3340 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3341 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3342 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3343 row -= 4; 3344 break; 3345 case 5: 3346 3347 sum1 = b[row]; 3348 sum2 = b[row-1]; 3349 sum3 = b[row-2]; 3350 sum4 = b[row-3]; 3351 sum5 = b[row-4]; 3352 v2 = a->a + diag[row-1] + 2; 3353 v3 = a->a + diag[row-2] + 3; 3354 v4 = a->a + diag[row-3] + 4; 3355 v5 = a->a + diag[row-4] + 5; 3356 for(n = 0; n<sz-1; n+=2) { 3357 i1 = idx[0]; 3358 i2 = idx[1]; 3359 idx += 2; 3360 tmp0 = x[i1]; 3361 tmp1 = x[i2]; 3362 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3363 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3364 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3365 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3366 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3367 } 3368 3369 if (n == sz-1){ 3370 tmp0 = x[*idx]; 3371 sum1 -= *v1*tmp0; 3372 sum2 -= *v2*tmp0; 3373 sum3 -= *v3*tmp0; 3374 sum4 -= *v4*tmp0; 3375 sum5 -= *v5*tmp0; 3376 } 3377 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3378 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3379 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3380 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3381 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3382 row -= 5; 3383 break; 3384 default: 3385 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3386 } 3387 CHKMEMQ; 3388 } 3389 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3390 3391 /* 3392 t = b - D x where D is the block diagonal 3393 */ 3394 cnt = 0; 3395 for (i=0, row=0; i<m; i++) { 3396 CHKMEMQ; 3397 switch (sizes[i]){ 3398 case 1: 3399 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3400 break; 3401 case 2: 3402 x1 = x[row]; x2 = x[row+1]; 3403 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3404 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3405 t[row] = b[row] - tmp1; 3406 t[row+1] = b[row+1] - tmp2; row += 2; 3407 cnt += 4; 3408 break; 3409 case 3: 3410 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3411 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3412 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3413 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3414 t[row] = b[row] - tmp1; 3415 t[row+1] = b[row+1] - tmp2; 3416 t[row+2] = b[row+2] - tmp3; row += 3; 3417 cnt += 9; 3418 break; 3419 case 4: 3420 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3421 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3422 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3423 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3424 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3425 t[row] = b[row] - tmp1; 3426 t[row+1] = b[row+1] - tmp2; 3427 t[row+2] = b[row+2] - tmp3; 3428 t[row+3] = b[row+3] - tmp4; row += 4; 3429 cnt += 16; 3430 break; 3431 case 5: 3432 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3433 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3434 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3435 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3436 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3437 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3438 t[row] = b[row] - tmp1; 3439 t[row+1] = b[row+1] - tmp2; 3440 t[row+2] = b[row+2] - tmp3; 3441 t[row+3] = b[row+3] - tmp4; 3442 t[row+4] = b[row+4] - tmp5;row += 5; 3443 cnt += 25; 3444 break; 3445 default: 3446 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3447 } 3448 CHKMEMQ; 3449 } 3450 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3451 3452 3453 3454 /* 3455 Apply (L + D)^-1 where D is the block diagonal 3456 */ 3457 for (i=0, row=0; i<m; i++) { 3458 sz = diag[row] - ii[row]; 3459 v1 = a->a + ii[row]; 3460 idx = a->j + ii[row]; 3461 CHKMEMQ; 3462 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3463 switch (sizes[i]){ 3464 case 1: 3465 3466 sum1 = t[row]; 3467 for(n = 0; n<sz-1; n+=2) { 3468 i1 = idx[0]; 3469 i2 = idx[1]; 3470 idx += 2; 3471 tmp0 = t[i1]; 3472 tmp1 = t[i2]; 3473 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3474 } 3475 3476 if (n == sz-1){ 3477 tmp0 = t[*idx]; 3478 sum1 -= *v1 * tmp0; 3479 } 3480 x[row] += t[row] = sum1*(*ibdiag++); row++; 3481 break; 3482 case 2: 3483 v2 = a->a + ii[row+1]; 3484 sum1 = t[row]; 3485 sum2 = t[row+1]; 3486 for(n = 0; n<sz-1; n+=2) { 3487 i1 = idx[0]; 3488 i2 = idx[1]; 3489 idx += 2; 3490 tmp0 = t[i1]; 3491 tmp1 = t[i2]; 3492 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3493 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3494 } 3495 3496 if (n == sz-1){ 3497 tmp0 = t[*idx]; 3498 sum1 -= v1[0] * tmp0; 3499 sum2 -= v2[0] * tmp0; 3500 } 3501 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3502 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3503 ibdiag += 4; row += 2; 3504 break; 3505 case 3: 3506 v2 = a->a + ii[row+1]; 3507 v3 = a->a + ii[row+2]; 3508 sum1 = t[row]; 3509 sum2 = t[row+1]; 3510 sum3 = t[row+2]; 3511 for(n = 0; n<sz-1; n+=2) { 3512 i1 = idx[0]; 3513 i2 = idx[1]; 3514 idx += 2; 3515 tmp0 = t[i1]; 3516 tmp1 = t[i2]; 3517 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3518 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3519 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3520 } 3521 3522 if (n == sz-1){ 3523 tmp0 = t[*idx]; 3524 sum1 -= v1[0] * tmp0; 3525 sum2 -= v2[0] * tmp0; 3526 sum3 -= v3[0] * tmp0; 3527 } 3528 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3529 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3530 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3531 ibdiag += 9; row += 3; 3532 break; 3533 case 4: 3534 v2 = a->a + ii[row+1]; 3535 v3 = a->a + ii[row+2]; 3536 v4 = a->a + ii[row+3]; 3537 sum1 = t[row]; 3538 sum2 = t[row+1]; 3539 sum3 = t[row+2]; 3540 sum4 = t[row+3]; 3541 for(n = 0; n<sz-1; n+=2) { 3542 i1 = idx[0]; 3543 i2 = idx[1]; 3544 idx += 2; 3545 tmp0 = t[i1]; 3546 tmp1 = t[i2]; 3547 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3548 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3549 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3550 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3551 } 3552 3553 if (n == sz-1){ 3554 tmp0 = t[*idx]; 3555 sum1 -= v1[0] * tmp0; 3556 sum2 -= v2[0] * tmp0; 3557 sum3 -= v3[0] * tmp0; 3558 sum4 -= v4[0] * tmp0; 3559 } 3560 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3561 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3562 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3563 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3564 ibdiag += 16; row += 4; 3565 break; 3566 case 5: 3567 v2 = a->a + ii[row+1]; 3568 v3 = a->a + ii[row+2]; 3569 v4 = a->a + ii[row+3]; 3570 v5 = a->a + ii[row+4]; 3571 sum1 = t[row]; 3572 sum2 = t[row+1]; 3573 sum3 = t[row+2]; 3574 sum4 = t[row+3]; 3575 sum5 = t[row+4]; 3576 for(n = 0; n<sz-1; n+=2) { 3577 i1 = idx[0]; 3578 i2 = idx[1]; 3579 idx += 2; 3580 tmp0 = t[i1]; 3581 tmp1 = t[i2]; 3582 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3583 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3584 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3585 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3586 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3587 } 3588 3589 if (n == sz-1){ 3590 tmp0 = t[*idx]; 3591 sum1 -= v1[0] * tmp0; 3592 sum2 -= v2[0] * tmp0; 3593 sum3 -= v3[0] * tmp0; 3594 sum4 -= v4[0] * tmp0; 3595 sum5 -= v5[0] * tmp0; 3596 } 3597 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3598 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3599 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3600 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3601 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3602 ibdiag += 25; row += 5; 3603 break; 3604 default: 3605 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3606 } 3607 CHKMEMQ; 3608 } 3609 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3610 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3611 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3612 } 3613 PetscFunctionReturn(0); 3614 } 3615 3616 #undef __FUNCT__ 3617 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3618 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3619 { 3620 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3621 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3622 const MatScalar *bdiag = a->inode.bdiag; 3623 const PetscScalar *b; 3624 PetscErrorCode ierr; 3625 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3626 const PetscInt *sizes = a->inode.size; 3627 3628 PetscFunctionBegin; 3629 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3630 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3631 cnt = 0; 3632 for (i=0, row=0; i<m; i++) { 3633 switch (sizes[i]){ 3634 case 1: 3635 x[row] = b[row]*bdiag[cnt++];row++; 3636 break; 3637 case 2: 3638 x1 = b[row]; x2 = b[row+1]; 3639 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3640 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3641 x[row++] = tmp1; 3642 x[row++] = tmp2; 3643 cnt += 4; 3644 break; 3645 case 3: 3646 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3647 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3648 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3649 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3650 x[row++] = tmp1; 3651 x[row++] = tmp2; 3652 x[row++] = tmp3; 3653 cnt += 9; 3654 break; 3655 case 4: 3656 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3657 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3658 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3659 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3660 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3661 x[row++] = tmp1; 3662 x[row++] = tmp2; 3663 x[row++] = tmp3; 3664 x[row++] = tmp4; 3665 cnt += 16; 3666 break; 3667 case 5: 3668 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3669 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3670 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3671 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3672 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3673 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3674 x[row++] = tmp1; 3675 x[row++] = tmp2; 3676 x[row++] = tmp3; 3677 x[row++] = tmp4; 3678 x[row++] = tmp5; 3679 cnt += 25; 3680 break; 3681 default: 3682 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3683 } 3684 } 3685 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3686 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3687 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3688 PetscFunctionReturn(0); 3689 } 3690 3691 /* 3692 samestructure indicates that the matrix has not changed its nonzero structure so we 3693 do not need to recompute the inodes 3694 */ 3695 #undef __FUNCT__ 3696 #define __FUNCT__ "Mat_CheckInode" 3697 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3698 { 3699 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3700 PetscErrorCode ierr; 3701 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3702 PetscTruth flag; 3703 3704 PetscFunctionBegin; 3705 if (!a->inode.use) PetscFunctionReturn(0); 3706 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3707 3708 3709 m = A->rmap->n; 3710 if (a->inode.size) {ns = a->inode.size;} 3711 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3712 3713 i = 0; 3714 node_count = 0; 3715 idx = a->j; 3716 ii = a->i; 3717 while (i < m){ /* For each row */ 3718 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3719 /* Limits the number of elements in a node to 'a->inode.limit' */ 3720 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3721 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3722 if (nzy != nzx) break; 3723 idy += nzx; /* Same nonzero pattern */ 3724 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3725 if (!flag) break; 3726 } 3727 ns[node_count++] = blk_size; 3728 idx += blk_size*nzx; 3729 i = j; 3730 } 3731 /* If not enough inodes found,, do not use inode version of the routines */ 3732 if (!m || node_count > .9*m) { 3733 ierr = PetscFree(ns);CHKERRQ(ierr); 3734 a->inode.node_count = 0; 3735 a->inode.size = PETSC_NULL; 3736 a->inode.use = PETSC_FALSE; 3737 A->ops->mult = MatMult_SeqAIJ; 3738 A->ops->sor = MatSOR_SeqAIJ; 3739 A->ops->multadd = MatMultAdd_SeqAIJ; 3740 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 3741 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 3742 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 3743 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 3744 A->ops->coloringpatch = 0; 3745 A->ops->multdiagonalblock = 0; 3746 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3747 } else { 3748 if (!A->factortype) { 3749 A->ops->mult = MatMult_SeqAIJ_Inode; 3750 A->ops->sor = MatSOR_SeqAIJ_Inode; 3751 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3752 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3753 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3754 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3755 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3756 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3757 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3758 } else { 3759 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 3760 } 3761 a->inode.node_count = node_count; 3762 a->inode.size = ns; 3763 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3764 } 3765 PetscFunctionReturn(0); 3766 } 3767 3768 #undef __FUNCT__ 3769 #define __FUNCT__ "MatGetRow_FactoredLU" 3770 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row) 3771 { 3772 PetscInt k, *vi; 3773 PetscFunctionBegin; 3774 vi = aj + ai[row]; 3775 for(k=0;k<nzl;k++) cols[k] = vi[k]; 3776 vi = aj + adiag[row]; 3777 cols[nzl] = vi[0]; 3778 vi = aj + adiag[row+1]+1; 3779 for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k]; 3780 PetscFunctionReturn(0); 3781 } 3782 /* 3783 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3784 Modified from Mat_CheckInode(). 3785 3786 Input Parameters: 3787 + Mat A - ILU or LU matrix factor 3788 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 3789 do not need to recompute the inodes 3790 */ 3791 #undef __FUNCT__ 3792 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3793 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3794 { 3795 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3796 PetscErrorCode ierr; 3797 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3798 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3799 PetscTruth flag; 3800 3801 PetscFunctionBegin; 3802 if (!a->inode.use) PetscFunctionReturn(0); 3803 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3804 3805 m = A->rmap->n; 3806 if (a->inode.size) {ns = a->inode.size;} 3807 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3808 3809 i = 0; 3810 node_count = 0; 3811 while (i < m){ /* For each row */ 3812 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3813 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3814 nzx = nzl1 + nzu1 + 1; 3815 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3816 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3817 3818 /* Limits the number of elements in a node to 'a->inode.limit' */ 3819 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3820 nzl2 = ai[j+1] - ai[j]; 3821 nzu2 = adiag[j] - adiag[j+1] - 1; 3822 nzy = nzl2 + nzu2 + 1; 3823 if( nzy != nzx) break; 3824 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3825 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 3826 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3827 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3828 ierr = PetscFree(cols2);CHKERRQ(ierr); 3829 } 3830 ns[node_count++] = blk_size; 3831 ierr = PetscFree(cols1);CHKERRQ(ierr); 3832 i = j; 3833 } 3834 /* If not enough inodes found,, do not use inode version of the routines */ 3835 if (!m || node_count > .9*m) { 3836 ierr = PetscFree(ns);CHKERRQ(ierr); 3837 a->inode.node_count = 0; 3838 a->inode.size = PETSC_NULL; 3839 a->inode.use = PETSC_FALSE; 3840 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3841 } else { 3842 A->ops->mult = 0; 3843 A->ops->sor = 0; 3844 A->ops->multadd = 0; 3845 A->ops->getrowij = 0; 3846 A->ops->restorerowij = 0; 3847 A->ops->getcolumnij = 0; 3848 A->ops->restorecolumnij = 0; 3849 A->ops->coloringpatch = 0; 3850 A->ops->multdiagonalblock = 0; 3851 A->ops->solve = MatSolve_SeqAIJ_Inode; 3852 a->inode.node_count = node_count; 3853 a->inode.size = ns; 3854 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3855 } 3856 PetscFunctionReturn(0); 3857 } 3858 3859 /* 3860 This is really ugly. if inodes are used this replaces the 3861 permutations with ones that correspond to rows/cols of the matrix 3862 rather then inode blocks 3863 */ 3864 #undef __FUNCT__ 3865 #define __FUNCT__ "MatInodeAdjustForInodes" 3866 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3867 { 3868 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3869 3870 PetscFunctionBegin; 3871 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3872 if (f) { 3873 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3874 } 3875 PetscFunctionReturn(0); 3876 } 3877 3878 EXTERN_C_BEGIN 3879 #undef __FUNCT__ 3880 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3881 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3882 { 3883 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3884 PetscErrorCode ierr; 3885 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3886 const PetscInt *ridx,*cidx; 3887 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3888 PetscInt nslim_col,*ns_col; 3889 IS ris = *rperm,cis = *cperm; 3890 3891 PetscFunctionBegin; 3892 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3893 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3894 3895 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3896 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3897 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3898 3899 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3900 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3901 3902 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3903 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3904 3905 /* Construct the permutations for rows*/ 3906 for (i=0,row = 0; i<nslim_row; ++i){ 3907 indx = ridx[i]; 3908 start_val = tns[indx]; 3909 end_val = tns[indx + 1]; 3910 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3911 } 3912 3913 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3914 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3915 3916 /* Construct permutations for columns */ 3917 for (i=0,col=0; i<nslim_col; ++i){ 3918 indx = cidx[i]; 3919 start_val = tns[indx]; 3920 end_val = tns[indx + 1]; 3921 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3922 } 3923 3924 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3925 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3926 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3927 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3928 3929 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3930 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3931 3932 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3933 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3934 ierr = ISDestroy(cis);CHKERRQ(ierr); 3935 ierr = ISDestroy(ris);CHKERRQ(ierr); 3936 ierr = PetscFree(tns);CHKERRQ(ierr); 3937 PetscFunctionReturn(0); 3938 } 3939 EXTERN_C_END 3940 3941 #undef __FUNCT__ 3942 #define __FUNCT__ "MatInodeGetInodeSizes" 3943 /*@C 3944 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3945 3946 Collective on Mat 3947 3948 Input Parameter: 3949 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3950 3951 Output Parameter: 3952 + node_count - no of inodes present in the matrix. 3953 . sizes - an array of size node_count,with sizes of each inode. 3954 - limit - the max size used to generate the inodes. 3955 3956 Level: advanced 3957 3958 Notes: This routine returns some internal storage information 3959 of the matrix, it is intended to be used by advanced users. 3960 It should be called after the matrix is assembled. 3961 The contents of the sizes[] array should not be changed. 3962 PETSC_NULL may be passed for information not requested. 3963 3964 .keywords: matrix, seqaij, get, inode 3965 3966 .seealso: MatGetInfo() 3967 @*/ 3968 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3969 { 3970 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3971 3972 PetscFunctionBegin; 3973 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3974 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3975 if (f) { 3976 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3977 } 3978 PetscFunctionReturn(0); 3979 } 3980 3981 EXTERN_C_BEGIN 3982 #undef __FUNCT__ 3983 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3984 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3985 { 3986 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3987 3988 PetscFunctionBegin; 3989 if (node_count) *node_count = a->inode.node_count; 3990 if (sizes) *sizes = a->inode.size; 3991 if (limit) *limit = a->inode.limit; 3992 PetscFunctionReturn(0); 3993 } 3994 EXTERN_C_END 3995