1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 24 for(; count+1 < n; count++,i++); 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for(; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 67 68 PetscFunctionBegin; 69 nslim_row = a->inode.node_count; 70 m = A->rmap->n; 71 n = A->cmap->n; 72 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 73 74 /* Use the row_inode as column_inode */ 75 nslim_col = nslim_row; 76 ns_col = ns_row; 77 78 /* allocate space for reformated inode structure */ 79 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 80 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 81 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 82 83 for (i1=0,col=0; i1<nslim_col; ++i1){ 84 nsz = ns_col[i1]; 85 for (i2=0; i2<nsz; ++i2,++col) 86 tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 i2 = 0; 101 col = *j++ + ishift; 102 i2 = tvc[col]; 103 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 104 ia[i1+1]++; 105 ia[i2+1]++; 106 i2++; /* Start col of next node */ 107 while((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 108 i2 = tvc[col]; 109 } 110 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 111 } 112 113 /* shift ia[i] to point to next row */ 114 for (i1=1; i1<nslim_row+1; i1++) { 115 row = ia[i1-1]; 116 ia[i1] += row; 117 work[i1-1] = row - oshift; 118 } 119 120 /* allocate space for column pointers */ 121 nz = ia[nslim_row] + (!ishift); 122 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 123 *jja = ja; 124 125 /* loop over lower triangular part putting into ja */ 126 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 127 j = aj + ai[row] + ishift; 128 jmax = aj + ai[row+1] + ishift; 129 i2 = 0; /* Col inode index */ 130 col = *j++ + ishift; 131 i2 = tvc[col]; 132 while (i2<i1 && j<jmax) { 133 ja[work[i2]++] = i1 + oshift; 134 ja[work[i1]++] = i2 + oshift; 135 ++i2; 136 while((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 137 i2 = tvc[col]; 138 } 139 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 140 141 } 142 ierr = PetscFree(work);CHKERRQ(ierr); 143 ierr = PetscFree(tns);CHKERRQ(ierr); 144 ierr = PetscFree(tvc);CHKERRQ(ierr); 145 PetscFunctionReturn(0); 146 } 147 148 /* 149 This builds nonsymmetric version of nonzero structure, 150 */ 151 #undef __FUNCT__ 152 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 153 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 154 { 155 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 156 PetscErrorCode ierr; 157 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 158 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 159 160 PetscFunctionBegin; 161 nslim_row = a->inode.node_count; 162 n = A->cmap->n; 163 164 /* Create The column_inode for this matrix */ 165 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 166 167 /* allocate space for reformated column_inode structure */ 168 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 169 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 170 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 171 172 for (i1=0,col=0; i1<nslim_col; ++i1){ 173 nsz = ns_col[i1]; 174 for (i2=0; i2<nsz; ++i2,++col) 175 tvc[col] = i1; 176 } 177 /* allocate space for row pointers */ 178 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 179 *iia = ia; 180 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 181 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 182 183 /* determine the number of columns in each row */ 184 ia[0] = oshift; 185 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 186 j = aj + ai[row] + ishift; 187 col = *j++ + ishift; 188 i2 = tvc[col]; 189 nz = ai[row+1] - ai[row]; 190 while (nz-- > 0) { /* off-diagonal elemets */ 191 ia[i1+1]++; 192 i2++; /* Start col of next node */ 193 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 194 if (nz > 0) i2 = tvc[col]; 195 } 196 } 197 198 /* shift ia[i] to point to next row */ 199 for (i1=1; i1<nslim_row+1; i1++) { 200 row = ia[i1-1]; 201 ia[i1] += row; 202 work[i1-1] = row - oshift; 203 } 204 205 /* allocate space for column pointers */ 206 nz = ia[nslim_row] + (!ishift); 207 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 208 *jja = ja; 209 210 /* loop over matrix putting into ja */ 211 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 212 j = aj + ai[row] + ishift; 213 i2 = 0; /* Col inode index */ 214 col = *j++ + ishift; 215 i2 = tvc[col]; 216 nz = ai[row+1] - ai[row]; 217 while (nz-- > 0) { 218 ja[work[i1]++] = i2 + oshift; 219 ++i2; 220 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 221 if (nz > 0) i2 = tvc[col]; 222 } 223 } 224 ierr = PetscFree(ns_col);CHKERRQ(ierr); 225 ierr = PetscFree(work);CHKERRQ(ierr); 226 ierr = PetscFree(tns);CHKERRQ(ierr); 227 ierr = PetscFree(tvc);CHKERRQ(ierr); 228 PetscFunctionReturn(0); 229 } 230 231 #undef __FUNCT__ 232 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 233 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done) 234 { 235 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 236 PetscErrorCode ierr; 237 238 PetscFunctionBegin; 239 *n = a->inode.node_count; 240 if (!ia) PetscFunctionReturn(0); 241 if (!blockcompressed) { 242 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 243 } else if (symmetric) { 244 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 245 } else { 246 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 247 } 248 PetscFunctionReturn(0); 249 } 250 251 #undef __FUNCT__ 252 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 253 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done) 254 { 255 PetscErrorCode ierr; 256 257 PetscFunctionBegin; 258 if (!ia) PetscFunctionReturn(0); 259 260 if (!blockcompressed) { 261 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 262 } else { 263 ierr = PetscFree(*ia);CHKERRQ(ierr); 264 ierr = PetscFree(*ja);CHKERRQ(ierr); 265 } 266 267 PetscFunctionReturn(0); 268 } 269 270 /* ----------------------------------------------------------- */ 271 272 #undef __FUNCT__ 273 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 274 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 275 { 276 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 277 PetscErrorCode ierr; 278 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 279 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 280 281 PetscFunctionBegin; 282 nslim_row = a->inode.node_count; 283 n = A->cmap->n; 284 285 /* Create The column_inode for this matrix */ 286 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 287 288 /* allocate space for reformated column_inode structure */ 289 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 290 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 291 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 292 293 for (i1=0,col=0; i1<nslim_col; ++i1){ 294 nsz = ns_col[i1]; 295 for (i2=0; i2<nsz; ++i2,++col) 296 tvc[col] = i1; 297 } 298 /* allocate space for column pointers */ 299 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 300 *iia = ia; 301 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 302 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 303 304 /* determine the number of columns in each row */ 305 ia[0] = oshift; 306 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 307 j = aj + ai[row] + ishift; 308 col = *j++ + ishift; 309 i2 = tvc[col]; 310 nz = ai[row+1] - ai[row]; 311 while (nz-- > 0) { /* off-diagonal elemets */ 312 /* ia[i1+1]++; */ 313 ia[i2+1]++; 314 i2++; 315 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 316 if (nz > 0) i2 = tvc[col]; 317 } 318 } 319 320 /* shift ia[i] to point to next col */ 321 for (i1=1; i1<nslim_col+1; i1++) { 322 col = ia[i1-1]; 323 ia[i1] += col; 324 work[i1-1] = col - oshift; 325 } 326 327 /* allocate space for column pointers */ 328 nz = ia[nslim_col] + (!ishift); 329 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 330 *jja = ja; 331 332 /* loop over matrix putting into ja */ 333 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 334 j = aj + ai[row] + ishift; 335 i2 = 0; /* Col inode index */ 336 col = *j++ + ishift; 337 i2 = tvc[col]; 338 nz = ai[row+1] - ai[row]; 339 while (nz-- > 0) { 340 /* ja[work[i1]++] = i2 + oshift; */ 341 ja[work[i2]++] = i1 + oshift; 342 i2++; 343 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 344 if (nz > 0) i2 = tvc[col]; 345 } 346 } 347 ierr = PetscFree(ns_col);CHKERRQ(ierr); 348 ierr = PetscFree(work);CHKERRQ(ierr); 349 ierr = PetscFree(tns);CHKERRQ(ierr); 350 ierr = PetscFree(tvc);CHKERRQ(ierr); 351 PetscFunctionReturn(0); 352 } 353 354 #undef __FUNCT__ 355 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 356 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done) 357 { 358 PetscErrorCode ierr; 359 360 PetscFunctionBegin; 361 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 362 if (!ia) PetscFunctionReturn(0); 363 364 if (!blockcompressed) { 365 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 366 } else if (symmetric) { 367 /* Since the indices are symmetric it does'nt matter */ 368 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 369 } else { 370 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 371 } 372 PetscFunctionReturn(0); 373 } 374 375 #undef __FUNCT__ 376 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 377 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done) 378 { 379 PetscErrorCode ierr; 380 381 PetscFunctionBegin; 382 if (!ia) PetscFunctionReturn(0); 383 if (!blockcompressed) { 384 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 385 } else { 386 ierr = PetscFree(*ia);CHKERRQ(ierr); 387 ierr = PetscFree(*ja);CHKERRQ(ierr); 388 } 389 PetscFunctionReturn(0); 390 } 391 392 /* ----------------------------------------------------------- */ 393 394 #undef __FUNCT__ 395 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 396 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 397 { 398 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 399 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 400 PetscScalar *y; 401 const PetscScalar *x; 402 const MatScalar *v1,*v2,*v3,*v4,*v5; 403 PetscErrorCode ierr; 404 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 405 406 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 407 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 408 #endif 409 410 PetscFunctionBegin; 411 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 412 node_max = a->inode.node_count; 413 ns = a->inode.size; /* Node Size array */ 414 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 415 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 416 idx = a->j; 417 v1 = a->a; 418 ii = a->i; 419 420 for (i = 0,row = 0; i< node_max; ++i){ 421 nsz = ns[i]; 422 n = ii[1] - ii[0]; 423 nonzerorow += (n>0)*nsz; 424 ii += nsz; 425 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 426 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 427 sz = n; /* No of non zeros in this row */ 428 /* Switch on the size of Node */ 429 switch (nsz){ /* Each loop in 'case' is unrolled */ 430 case 1 : 431 sum1 = 0.; 432 433 for(n = 0; n< sz-1; n+=2) { 434 i1 = idx[0]; /* The instructions are ordered to */ 435 i2 = idx[1]; /* make the compiler's job easy */ 436 idx += 2; 437 tmp0 = x[i1]; 438 tmp1 = x[i2]; 439 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 440 } 441 442 if (n == sz-1){ /* Take care of the last nonzero */ 443 tmp0 = x[*idx++]; 444 sum1 += *v1++ * tmp0; 445 } 446 y[row++]=sum1; 447 break; 448 case 2: 449 sum1 = 0.; 450 sum2 = 0.; 451 v2 = v1 + n; 452 453 for (n = 0; n< sz-1; n+=2) { 454 i1 = idx[0]; 455 i2 = idx[1]; 456 idx += 2; 457 tmp0 = x[i1]; 458 tmp1 = x[i2]; 459 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 460 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 461 } 462 if (n == sz-1){ 463 tmp0 = x[*idx++]; 464 sum1 += *v1++ * tmp0; 465 sum2 += *v2++ * tmp0; 466 } 467 y[row++]=sum1; 468 y[row++]=sum2; 469 v1 =v2; /* Since the next block to be processed starts there*/ 470 idx +=sz; 471 break; 472 case 3: 473 sum1 = 0.; 474 sum2 = 0.; 475 sum3 = 0.; 476 v2 = v1 + n; 477 v3 = v2 + n; 478 479 for (n = 0; n< sz-1; n+=2) { 480 i1 = idx[0]; 481 i2 = idx[1]; 482 idx += 2; 483 tmp0 = x[i1]; 484 tmp1 = x[i2]; 485 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 486 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 487 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 488 } 489 if (n == sz-1){ 490 tmp0 = x[*idx++]; 491 sum1 += *v1++ * tmp0; 492 sum2 += *v2++ * tmp0; 493 sum3 += *v3++ * tmp0; 494 } 495 y[row++]=sum1; 496 y[row++]=sum2; 497 y[row++]=sum3; 498 v1 =v3; /* Since the next block to be processed starts there*/ 499 idx +=2*sz; 500 break; 501 case 4: 502 sum1 = 0.; 503 sum2 = 0.; 504 sum3 = 0.; 505 sum4 = 0.; 506 v2 = v1 + n; 507 v3 = v2 + n; 508 v4 = v3 + n; 509 510 for (n = 0; n< sz-1; n+=2) { 511 i1 = idx[0]; 512 i2 = idx[1]; 513 idx += 2; 514 tmp0 = x[i1]; 515 tmp1 = x[i2]; 516 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 517 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 518 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 519 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 520 } 521 if (n == sz-1){ 522 tmp0 = x[*idx++]; 523 sum1 += *v1++ * tmp0; 524 sum2 += *v2++ * tmp0; 525 sum3 += *v3++ * tmp0; 526 sum4 += *v4++ * tmp0; 527 } 528 y[row++]=sum1; 529 y[row++]=sum2; 530 y[row++]=sum3; 531 y[row++]=sum4; 532 v1 =v4; /* Since the next block to be processed starts there*/ 533 idx +=3*sz; 534 break; 535 case 5: 536 sum1 = 0.; 537 sum2 = 0.; 538 sum3 = 0.; 539 sum4 = 0.; 540 sum5 = 0.; 541 v2 = v1 + n; 542 v3 = v2 + n; 543 v4 = v3 + n; 544 v5 = v4 + n; 545 546 for (n = 0; n<sz-1; n+=2) { 547 i1 = idx[0]; 548 i2 = idx[1]; 549 idx += 2; 550 tmp0 = x[i1]; 551 tmp1 = x[i2]; 552 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 553 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 554 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 555 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 556 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 557 } 558 if (n == sz-1){ 559 tmp0 = x[*idx++]; 560 sum1 += *v1++ * tmp0; 561 sum2 += *v2++ * tmp0; 562 sum3 += *v3++ * tmp0; 563 sum4 += *v4++ * tmp0; 564 sum5 += *v5++ * tmp0; 565 } 566 y[row++]=sum1; 567 y[row++]=sum2; 568 y[row++]=sum3; 569 y[row++]=sum4; 570 y[row++]=sum5; 571 v1 =v5; /* Since the next block to be processed starts there */ 572 idx +=4*sz; 573 break; 574 default : 575 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 576 } 577 } 578 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 579 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 580 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 581 PetscFunctionReturn(0); 582 } 583 /* ----------------------------------------------------------- */ 584 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 585 #undef __FUNCT__ 586 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 587 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 588 { 589 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 590 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 591 MatScalar *v1,*v2,*v3,*v4,*v5; 592 PetscScalar *x,*y,*z,*zt; 593 PetscErrorCode ierr; 594 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 595 596 PetscFunctionBegin; 597 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 598 node_max = a->inode.node_count; 599 ns = a->inode.size; /* Node Size array */ 600 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 601 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 602 if (zz != yy) { 603 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 604 } else { 605 z = y; 606 } 607 zt = z; 608 609 idx = a->j; 610 v1 = a->a; 611 ii = a->i; 612 613 for (i = 0,row = 0; i< node_max; ++i){ 614 nsz = ns[i]; 615 n = ii[1] - ii[0]; 616 ii += nsz; 617 sz = n; /* No of non zeros in this row */ 618 /* Switch on the size of Node */ 619 switch (nsz){ /* Each loop in 'case' is unrolled */ 620 case 1 : 621 sum1 = *zt++; 622 623 for(n = 0; n< sz-1; n+=2) { 624 i1 = idx[0]; /* The instructions are ordered to */ 625 i2 = idx[1]; /* make the compiler's job easy */ 626 idx += 2; 627 tmp0 = x[i1]; 628 tmp1 = x[i2]; 629 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 630 } 631 632 if(n == sz-1){ /* Take care of the last nonzero */ 633 tmp0 = x[*idx++]; 634 sum1 += *v1++ * tmp0; 635 } 636 y[row++]=sum1; 637 break; 638 case 2: 639 sum1 = *zt++; 640 sum2 = *zt++; 641 v2 = v1 + n; 642 643 for(n = 0; n< sz-1; n+=2) { 644 i1 = idx[0]; 645 i2 = idx[1]; 646 idx += 2; 647 tmp0 = x[i1]; 648 tmp1 = x[i2]; 649 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 650 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 651 } 652 if(n == sz-1){ 653 tmp0 = x[*idx++]; 654 sum1 += *v1++ * tmp0; 655 sum2 += *v2++ * tmp0; 656 } 657 y[row++]=sum1; 658 y[row++]=sum2; 659 v1 =v2; /* Since the next block to be processed starts there*/ 660 idx +=sz; 661 break; 662 case 3: 663 sum1 = *zt++; 664 sum2 = *zt++; 665 sum3 = *zt++; 666 v2 = v1 + n; 667 v3 = v2 + n; 668 669 for (n = 0; n< sz-1; n+=2) { 670 i1 = idx[0]; 671 i2 = idx[1]; 672 idx += 2; 673 tmp0 = x[i1]; 674 tmp1 = x[i2]; 675 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 676 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 677 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 678 } 679 if (n == sz-1){ 680 tmp0 = x[*idx++]; 681 sum1 += *v1++ * tmp0; 682 sum2 += *v2++ * tmp0; 683 sum3 += *v3++ * tmp0; 684 } 685 y[row++]=sum1; 686 y[row++]=sum2; 687 y[row++]=sum3; 688 v1 =v3; /* Since the next block to be processed starts there*/ 689 idx +=2*sz; 690 break; 691 case 4: 692 sum1 = *zt++; 693 sum2 = *zt++; 694 sum3 = *zt++; 695 sum4 = *zt++; 696 v2 = v1 + n; 697 v3 = v2 + n; 698 v4 = v3 + n; 699 700 for (n = 0; n< sz-1; n+=2) { 701 i1 = idx[0]; 702 i2 = idx[1]; 703 idx += 2; 704 tmp0 = x[i1]; 705 tmp1 = x[i2]; 706 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 707 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 708 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 709 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 710 } 711 if (n == sz-1){ 712 tmp0 = x[*idx++]; 713 sum1 += *v1++ * tmp0; 714 sum2 += *v2++ * tmp0; 715 sum3 += *v3++ * tmp0; 716 sum4 += *v4++ * tmp0; 717 } 718 y[row++]=sum1; 719 y[row++]=sum2; 720 y[row++]=sum3; 721 y[row++]=sum4; 722 v1 =v4; /* Since the next block to be processed starts there*/ 723 idx +=3*sz; 724 break; 725 case 5: 726 sum1 = *zt++; 727 sum2 = *zt++; 728 sum3 = *zt++; 729 sum4 = *zt++; 730 sum5 = *zt++; 731 v2 = v1 + n; 732 v3 = v2 + n; 733 v4 = v3 + n; 734 v5 = v4 + n; 735 736 for (n = 0; n<sz-1; n+=2) { 737 i1 = idx[0]; 738 i2 = idx[1]; 739 idx += 2; 740 tmp0 = x[i1]; 741 tmp1 = x[i2]; 742 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 743 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 744 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 745 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 746 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 747 } 748 if(n == sz-1){ 749 tmp0 = x[*idx++]; 750 sum1 += *v1++ * tmp0; 751 sum2 += *v2++ * tmp0; 752 sum3 += *v3++ * tmp0; 753 sum4 += *v4++ * tmp0; 754 sum5 += *v5++ * tmp0; 755 } 756 y[row++]=sum1; 757 y[row++]=sum2; 758 y[row++]=sum3; 759 y[row++]=sum4; 760 y[row++]=sum5; 761 v1 =v5; /* Since the next block to be processed starts there */ 762 idx +=4*sz; 763 break; 764 default : 765 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 766 } 767 } 768 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 769 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 770 if (zz != yy) { 771 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 772 } 773 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 774 PetscFunctionReturn(0); 775 } 776 777 /* ----------------------------------------------------------- */ 778 #undef __FUNCT__ 779 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 780 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 781 { 782 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 783 IS iscol = a->col,isrow = a->row; 784 PetscErrorCode ierr; 785 const PetscInt *r,*c,*rout,*cout; 786 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 787 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 788 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 789 PetscScalar sum1,sum2,sum3,sum4,sum5; 790 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 791 const PetscScalar *b; 792 793 PetscFunctionBegin; 794 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 795 node_max = a->inode.node_count; 796 ns = a->inode.size; /* Node Size array */ 797 798 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 799 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 800 tmp = a->solve_work; 801 802 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 803 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 804 805 /* forward solve the lower triangular */ 806 tmps = tmp ; 807 aa = a_a ; 808 aj = a_j ; 809 ad = a->diag; 810 811 for (i = 0,row = 0; i< node_max; ++i){ 812 nsz = ns[i]; 813 aii = ai[row]; 814 v1 = aa + aii; 815 vi = aj + aii; 816 nz = ad[row]- aii; 817 if (i < node_max-1) { 818 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 819 * but our indexing to determine it's size could. */ 820 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 821 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 822 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 823 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 824 } 825 826 switch (nsz){ /* Each loop in 'case' is unrolled */ 827 case 1 : 828 sum1 = b[*r++]; 829 for(j=0; j<nz-1; j+=2){ 830 i0 = vi[0]; 831 i1 = vi[1]; 832 vi +=2; 833 tmp0 = tmps[i0]; 834 tmp1 = tmps[i1]; 835 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 836 } 837 if(j == nz-1){ 838 tmp0 = tmps[*vi++]; 839 sum1 -= *v1++ *tmp0; 840 } 841 tmp[row ++]=sum1; 842 break; 843 case 2: 844 sum1 = b[*r++]; 845 sum2 = b[*r++]; 846 v2 = aa + ai[row+1]; 847 848 for(j=0; j<nz-1; j+=2){ 849 i0 = vi[0]; 850 i1 = vi[1]; 851 vi +=2; 852 tmp0 = tmps[i0]; 853 tmp1 = tmps[i1]; 854 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 855 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 856 } 857 if(j == nz-1){ 858 tmp0 = tmps[*vi++]; 859 sum1 -= *v1++ *tmp0; 860 sum2 -= *v2++ *tmp0; 861 } 862 sum2 -= *v2++ * sum1; 863 tmp[row ++]=sum1; 864 tmp[row ++]=sum2; 865 break; 866 case 3: 867 sum1 = b[*r++]; 868 sum2 = b[*r++]; 869 sum3 = b[*r++]; 870 v2 = aa + ai[row+1]; 871 v3 = aa + ai[row+2]; 872 873 for (j=0; j<nz-1; j+=2){ 874 i0 = vi[0]; 875 i1 = vi[1]; 876 vi +=2; 877 tmp0 = tmps[i0]; 878 tmp1 = tmps[i1]; 879 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 880 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 881 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 882 } 883 if (j == nz-1){ 884 tmp0 = tmps[*vi++]; 885 sum1 -= *v1++ *tmp0; 886 sum2 -= *v2++ *tmp0; 887 sum3 -= *v3++ *tmp0; 888 } 889 sum2 -= *v2++ * sum1; 890 sum3 -= *v3++ * sum1; 891 sum3 -= *v3++ * sum2; 892 tmp[row ++]=sum1; 893 tmp[row ++]=sum2; 894 tmp[row ++]=sum3; 895 break; 896 897 case 4: 898 sum1 = b[*r++]; 899 sum2 = b[*r++]; 900 sum3 = b[*r++]; 901 sum4 = b[*r++]; 902 v2 = aa + ai[row+1]; 903 v3 = aa + ai[row+2]; 904 v4 = aa + ai[row+3]; 905 906 for (j=0; j<nz-1; j+=2){ 907 i0 = vi[0]; 908 i1 = vi[1]; 909 vi +=2; 910 tmp0 = tmps[i0]; 911 tmp1 = tmps[i1]; 912 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 913 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 914 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 915 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 916 } 917 if (j == nz-1){ 918 tmp0 = tmps[*vi++]; 919 sum1 -= *v1++ *tmp0; 920 sum2 -= *v2++ *tmp0; 921 sum3 -= *v3++ *tmp0; 922 sum4 -= *v4++ *tmp0; 923 } 924 sum2 -= *v2++ * sum1; 925 sum3 -= *v3++ * sum1; 926 sum4 -= *v4++ * sum1; 927 sum3 -= *v3++ * sum2; 928 sum4 -= *v4++ * sum2; 929 sum4 -= *v4++ * sum3; 930 931 tmp[row ++]=sum1; 932 tmp[row ++]=sum2; 933 tmp[row ++]=sum3; 934 tmp[row ++]=sum4; 935 break; 936 case 5: 937 sum1 = b[*r++]; 938 sum2 = b[*r++]; 939 sum3 = b[*r++]; 940 sum4 = b[*r++]; 941 sum5 = b[*r++]; 942 v2 = aa + ai[row+1]; 943 v3 = aa + ai[row+2]; 944 v4 = aa + ai[row+3]; 945 v5 = aa + ai[row+4]; 946 947 for (j=0; j<nz-1; j+=2){ 948 i0 = vi[0]; 949 i1 = vi[1]; 950 vi +=2; 951 tmp0 = tmps[i0]; 952 tmp1 = tmps[i1]; 953 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 954 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 955 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 957 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 958 } 959 if (j == nz-1){ 960 tmp0 = tmps[*vi++]; 961 sum1 -= *v1++ *tmp0; 962 sum2 -= *v2++ *tmp0; 963 sum3 -= *v3++ *tmp0; 964 sum4 -= *v4++ *tmp0; 965 sum5 -= *v5++ *tmp0; 966 } 967 968 sum2 -= *v2++ * sum1; 969 sum3 -= *v3++ * sum1; 970 sum4 -= *v4++ * sum1; 971 sum5 -= *v5++ * sum1; 972 sum3 -= *v3++ * sum2; 973 sum4 -= *v4++ * sum2; 974 sum5 -= *v5++ * sum2; 975 sum4 -= *v4++ * sum3; 976 sum5 -= *v5++ * sum3; 977 sum5 -= *v5++ * sum4; 978 979 tmp[row ++]=sum1; 980 tmp[row ++]=sum2; 981 tmp[row ++]=sum3; 982 tmp[row ++]=sum4; 983 tmp[row ++]=sum5; 984 break; 985 default: 986 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 987 } 988 } 989 /* backward solve the upper triangular */ 990 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 991 nsz = ns[i]; 992 aii = ai[row+1] -1; 993 v1 = aa + aii; 994 vi = aj + aii; 995 nz = aii- ad[row]; 996 switch (nsz){ /* Each loop in 'case' is unrolled */ 997 case 1 : 998 sum1 = tmp[row]; 999 1000 for(j=nz ; j>1; j-=2){ 1001 vi -=2; 1002 i0 = vi[2]; 1003 i1 = vi[1]; 1004 tmp0 = tmps[i0]; 1005 tmp1 = tmps[i1]; 1006 v1 -= 2; 1007 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1008 } 1009 if (j==1){ 1010 tmp0 = tmps[*vi--]; 1011 sum1 -= *v1-- * tmp0; 1012 } 1013 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1014 break; 1015 case 2 : 1016 sum1 = tmp[row]; 1017 sum2 = tmp[row -1]; 1018 v2 = aa + ai[row]-1; 1019 for (j=nz ; j>1; j-=2){ 1020 vi -=2; 1021 i0 = vi[2]; 1022 i1 = vi[1]; 1023 tmp0 = tmps[i0]; 1024 tmp1 = tmps[i1]; 1025 v1 -= 2; 1026 v2 -= 2; 1027 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1028 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1029 } 1030 if (j==1){ 1031 tmp0 = tmps[*vi--]; 1032 sum1 -= *v1-- * tmp0; 1033 sum2 -= *v2-- * tmp0; 1034 } 1035 1036 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1037 sum2 -= *v2-- * tmp0; 1038 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1039 break; 1040 case 3 : 1041 sum1 = tmp[row]; 1042 sum2 = tmp[row -1]; 1043 sum3 = tmp[row -2]; 1044 v2 = aa + ai[row]-1; 1045 v3 = aa + ai[row -1]-1; 1046 for (j=nz ; j>1; j-=2){ 1047 vi -=2; 1048 i0 = vi[2]; 1049 i1 = vi[1]; 1050 tmp0 = tmps[i0]; 1051 tmp1 = tmps[i1]; 1052 v1 -= 2; 1053 v2 -= 2; 1054 v3 -= 2; 1055 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1056 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1057 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1058 } 1059 if (j==1){ 1060 tmp0 = tmps[*vi--]; 1061 sum1 -= *v1-- * tmp0; 1062 sum2 -= *v2-- * tmp0; 1063 sum3 -= *v3-- * tmp0; 1064 } 1065 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1066 sum2 -= *v2-- * tmp0; 1067 sum3 -= *v3-- * tmp0; 1068 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1069 sum3 -= *v3-- * tmp0; 1070 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1071 1072 break; 1073 case 4 : 1074 sum1 = tmp[row]; 1075 sum2 = tmp[row -1]; 1076 sum3 = tmp[row -2]; 1077 sum4 = tmp[row -3]; 1078 v2 = aa + ai[row]-1; 1079 v3 = aa + ai[row -1]-1; 1080 v4 = aa + ai[row -2]-1; 1081 1082 for (j=nz ; j>1; j-=2){ 1083 vi -=2; 1084 i0 = vi[2]; 1085 i1 = vi[1]; 1086 tmp0 = tmps[i0]; 1087 tmp1 = tmps[i1]; 1088 v1 -= 2; 1089 v2 -= 2; 1090 v3 -= 2; 1091 v4 -= 2; 1092 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1093 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1094 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1095 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1096 } 1097 if (j==1){ 1098 tmp0 = tmps[*vi--]; 1099 sum1 -= *v1-- * tmp0; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 } 1104 1105 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1106 sum2 -= *v2-- * tmp0; 1107 sum3 -= *v3-- * tmp0; 1108 sum4 -= *v4-- * tmp0; 1109 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1113 sum4 -= *v4-- * tmp0; 1114 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1115 break; 1116 case 5 : 1117 sum1 = tmp[row]; 1118 sum2 = tmp[row -1]; 1119 sum3 = tmp[row -2]; 1120 sum4 = tmp[row -3]; 1121 sum5 = tmp[row -4]; 1122 v2 = aa + ai[row]-1; 1123 v3 = aa + ai[row -1]-1; 1124 v4 = aa + ai[row -2]-1; 1125 v5 = aa + ai[row -3]-1; 1126 for (j=nz ; j>1; j-=2){ 1127 vi -= 2; 1128 i0 = vi[2]; 1129 i1 = vi[1]; 1130 tmp0 = tmps[i0]; 1131 tmp1 = tmps[i1]; 1132 v1 -= 2; 1133 v2 -= 2; 1134 v3 -= 2; 1135 v4 -= 2; 1136 v5 -= 2; 1137 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1138 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1139 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1140 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1141 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1142 } 1143 if (j==1){ 1144 tmp0 = tmps[*vi--]; 1145 sum1 -= *v1-- * tmp0; 1146 sum2 -= *v2-- * tmp0; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 } 1151 1152 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1153 sum2 -= *v2-- * tmp0; 1154 sum3 -= *v3-- * tmp0; 1155 sum4 -= *v4-- * tmp0; 1156 sum5 -= *v5-- * tmp0; 1157 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1158 sum3 -= *v3-- * tmp0; 1159 sum4 -= *v4-- * tmp0; 1160 sum5 -= *v5-- * tmp0; 1161 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1162 sum4 -= *v4-- * tmp0; 1163 sum5 -= *v5-- * tmp0; 1164 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1165 sum5 -= *v5-- * tmp0; 1166 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1167 break; 1168 default: 1169 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1170 } 1171 } 1172 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1173 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1174 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1176 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1177 PetscFunctionReturn(0); 1178 } 1179 1180 #undef __FUNCT__ 1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1183 { 1184 Mat C=B; 1185 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1186 IS isrow = b->row,isicol = b->icol; 1187 PetscErrorCode ierr; 1188 const PetscInt *r,*ic,*ics; 1189 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1190 PetscInt i,j,k,nz,nzL,row,*pj; 1191 const PetscInt *ajtmp,*bjtmp; 1192 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1193 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1194 FactorShiftCtx sctx; 1195 const PetscInt *ddiag; 1196 PetscReal rs; 1197 MatScalar d; 1198 PetscInt inod,nodesz,node_max,col; 1199 const PetscInt *ns; 1200 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1201 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) 1216 rs += PetscAbsScalar(v[j]); 1217 if (rs>sctx.shift_top) sctx.shift_top = rs; 1218 } 1219 sctx.shift_top *= 1.1; 1220 sctx.nshift_max = 5; 1221 sctx.shift_lo = 0.; 1222 sctx.shift_hi = 1.; 1223 } 1224 1225 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1226 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1227 1228 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1230 rtmp2 = rtmp1 + n; 1231 rtmp3 = rtmp2 + n; 1232 rtmp4 = rtmp3 + n; 1233 ics = ic; 1234 1235 node_max = a->inode.node_count; 1236 ns = a->inode.size; 1237 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1238 1239 /* If max inode size > 4, split it into two inodes.*/ 1240 /* also map the inode sizes according to the ordering */ 1241 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1242 for (i=0,j=0; i<node_max; ++i,++j){ 1243 if (ns[i] > 4) { 1244 tmp_vec1[j] = 4; 1245 ++j; 1246 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1247 } else { 1248 tmp_vec1[j] = ns[i]; 1249 } 1250 } 1251 /* Use the correct node_max */ 1252 node_max = j; 1253 1254 /* Now reorder the inode info based on mat re-ordering info */ 1255 /* First create a row -> inode_size_array_index map */ 1256 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1257 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1258 for (i=0,row=0; i<node_max; i++) { 1259 nodesz = tmp_vec1[i]; 1260 for (j=0; j<nodesz; j++,row++) { 1261 nsmap[row] = i; 1262 } 1263 } 1264 /* Using nsmap, create a reordered ns structure */ 1265 for (i=0,j=0; i< node_max; i++) { 1266 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1267 tmp_vec2[i] = nodesz; 1268 j += nodesz; 1269 } 1270 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1271 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1272 1273 /* Now use the correct ns */ 1274 ns = tmp_vec2; 1275 1276 do { 1277 sctx.newshift = PETSC_FALSE; 1278 /* Now loop over each block-row, and do the factorization */ 1279 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1280 nodesz = ns[inod]; 1281 1282 switch (nodesz){ 1283 case 1: 1284 /*----------*/ 1285 /* zero rtmp1 */ 1286 /* L part */ 1287 nz = bi[i+1] - bi[i]; 1288 bjtmp = bj + bi[i]; 1289 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1290 1291 /* U part */ 1292 nz = bdiag[i]-bdiag[i+1]; 1293 bjtmp = bj + bdiag[i+1]+1; 1294 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1295 1296 /* load in initial (unfactored row) */ 1297 nz = ai[r[i]+1] - ai[r[i]]; 1298 ajtmp = aj + ai[r[i]]; 1299 v = aa + ai[r[i]]; 1300 for (j=0; j<nz; j++) { 1301 rtmp1[ics[ajtmp[j]]] = v[j]; 1302 } 1303 /* ZeropivotApply() */ 1304 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1305 1306 /* elimination */ 1307 bjtmp = bj + bi[i]; 1308 row = *bjtmp++; 1309 nzL = bi[i+1] - bi[i]; 1310 for(k=0; k < nzL;k++) { 1311 pc = rtmp1 + row; 1312 if (*pc != 0.0) { 1313 pv = b->a + bdiag[row]; 1314 mul1 = *pc * (*pv); 1315 *pc = mul1; 1316 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1317 pv = b->a + bdiag[row+1]+1; 1318 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1319 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1320 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1321 } 1322 row = *bjtmp++; 1323 } 1324 1325 /* finished row so stick it into b->a */ 1326 rs = 0.0; 1327 /* L part */ 1328 pv = b->a + bi[i] ; 1329 pj = b->j + bi[i] ; 1330 nz = bi[i+1] - bi[i]; 1331 for (j=0; j<nz; j++) { 1332 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1333 } 1334 1335 /* U part */ 1336 pv = b->a + bdiag[i+1]+1; 1337 pj = b->j + bdiag[i+1]+1; 1338 nz = bdiag[i] - bdiag[i+1]-1; 1339 for (j=0; j<nz; j++) { 1340 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1341 } 1342 1343 /* Check zero pivot */ 1344 sctx.rs = rs; 1345 sctx.pv = rtmp1[i]; 1346 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1347 if(sctx.newshift) break; 1348 1349 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1350 pv = b->a + bdiag[i]; 1351 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1352 break; 1353 1354 case 2: 1355 /*----------*/ 1356 /* zero rtmp1 and rtmp2 */ 1357 /* L part */ 1358 nz = bi[i+1] - bi[i]; 1359 bjtmp = bj + bi[i]; 1360 for (j=0; j<nz; j++) { 1361 col = bjtmp[j]; 1362 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1363 } 1364 1365 /* U part */ 1366 nz = bdiag[i]-bdiag[i+1]; 1367 bjtmp = bj + bdiag[i+1]+1; 1368 for (j=0; j<nz; j++) { 1369 col = bjtmp[j]; 1370 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1371 } 1372 1373 /* load in initial (unfactored row) */ 1374 nz = ai[r[i]+1] - ai[r[i]]; 1375 ajtmp = aj + ai[r[i]]; 1376 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1377 for (j=0; j<nz; j++) { 1378 col = ics[ajtmp[j]]; 1379 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1380 } 1381 /* ZeropivotApply(): shift the diagonal of the matrix */ 1382 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1383 1384 /* elimination */ 1385 bjtmp = bj + bi[i]; 1386 row = *bjtmp++; /* pivot row */ 1387 nzL = bi[i+1] - bi[i]; 1388 for(k=0; k < nzL;k++) { 1389 pc1 = rtmp1 + row; 1390 pc2 = rtmp2 + row; 1391 if (*pc1 != 0.0 || *pc2 != 0.0) { 1392 pv = b->a + bdiag[row]; 1393 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1394 *pc1 = mul1; *pc2 = mul2; 1395 1396 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1397 pv = b->a + bdiag[row+1]+1; 1398 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1399 for (j=0; j<nz; j++){ 1400 col = pj[j]; 1401 rtmp1[col] -= mul1 * pv[j]; 1402 rtmp2[col] -= mul2 * pv[j]; 1403 } 1404 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1405 } 1406 row = *bjtmp++; 1407 } 1408 1409 /* finished row i; check zero pivot, then stick row i into b->a */ 1410 rs = 0.0; 1411 /* L part */ 1412 pc1 = b->a + bi[i]; 1413 pj = b->j + bi[i] ; 1414 nz = bi[i+1] - bi[i]; 1415 for (j=0; j<nz; j++) { 1416 col = pj[j]; 1417 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1418 } 1419 /* U part */ 1420 pc1 = b->a + bdiag[i+1]+1; 1421 pj = b->j + bdiag[i+1]+1; 1422 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1423 for (j=0; j<nz; j++) { 1424 col = pj[j]; 1425 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1426 } 1427 1428 sctx.rs = rs; 1429 sctx.pv = rtmp1[i]; 1430 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1431 if(sctx.newshift) break; 1432 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1433 *pc1 = 1.0/sctx.pv; 1434 1435 /* Now take care of diagonal 2x2 block. */ 1436 pc2 = rtmp2 + i; 1437 if (*pc2 != 0.0){ 1438 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1439 *pc2 = mul1; /* insert L entry */ 1440 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1441 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1442 for (j=0; j<nz; j++) { 1443 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1444 } 1445 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1446 } 1447 1448 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1449 rs = 0.0; 1450 /* L part */ 1451 pc2 = b->a + bi[i+1]; 1452 pj = b->j + bi[i+1] ; 1453 nz = bi[i+2] - bi[i+1]; 1454 for (j=0; j<nz; j++) { 1455 col = pj[j]; 1456 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1457 } 1458 /* U part */ 1459 pc2 = b->a + bdiag[i+2]+1; 1460 pj = b->j + bdiag[i+2]+1; 1461 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1462 for (j=0; j<nz; j++) { 1463 col = pj[j]; 1464 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1465 } 1466 1467 sctx.rs = rs; 1468 sctx.pv = rtmp2[i+1]; 1469 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1470 if(sctx.newshift) break; 1471 pc2 = b->a + bdiag[i+1]; 1472 *pc2 = 1.0/sctx.pv; 1473 break; 1474 1475 case 3: 1476 /*----------*/ 1477 /* zero rtmp */ 1478 /* L part */ 1479 nz = bi[i+1] - bi[i]; 1480 bjtmp = bj + bi[i]; 1481 for (j=0; j<nz; j++) { 1482 col = bjtmp[j]; 1483 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1484 } 1485 1486 /* U part */ 1487 nz = bdiag[i]-bdiag[i+1]; 1488 bjtmp = bj + bdiag[i+1]+1; 1489 for (j=0; j<nz; j++) { 1490 col = bjtmp[j]; 1491 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1492 } 1493 1494 /* load in initial (unfactored row) */ 1495 nz = ai[r[i]+1] - ai[r[i]]; 1496 ajtmp = aj + ai[r[i]]; 1497 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1498 for (j=0; j<nz; j++) { 1499 col = ics[ajtmp[j]]; 1500 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1501 } 1502 /* ZeropivotApply(): shift the diagonal of the matrix */ 1503 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1504 1505 /* elimination */ 1506 bjtmp = bj + bi[i]; 1507 row = *bjtmp++; /* pivot row */ 1508 nzL = bi[i+1] - bi[i]; 1509 for(k=0; k < nzL;k++) { 1510 pc1 = rtmp1 + row; 1511 pc2 = rtmp2 + row; 1512 pc3 = rtmp3 + row; 1513 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1514 pv = b->a + bdiag[row]; 1515 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1516 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1517 1518 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1519 pv = b->a + bdiag[row+1]+1; 1520 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1521 for (j=0; j<nz; j++){ 1522 col = pj[j]; 1523 rtmp1[col] -= mul1 * pv[j]; 1524 rtmp2[col] -= mul2 * pv[j]; 1525 rtmp3[col] -= mul3 * pv[j]; 1526 } 1527 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1528 } 1529 row = *bjtmp++; 1530 } 1531 1532 /* finished row i; check zero pivot, then stick row i into b->a */ 1533 rs = 0.0; 1534 /* L part */ 1535 pc1 = b->a + bi[i]; 1536 pj = b->j + bi[i] ; 1537 nz = bi[i+1] - bi[i]; 1538 for (j=0; j<nz; j++) { 1539 col = pj[j]; 1540 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1541 } 1542 /* U part */ 1543 pc1 = b->a + bdiag[i+1]+1; 1544 pj = b->j + bdiag[i+1]+1; 1545 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1546 for (j=0; j<nz; j++) { 1547 col = pj[j]; 1548 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1549 } 1550 1551 sctx.rs = rs; 1552 sctx.pv = rtmp1[i]; 1553 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1554 if(sctx.newshift) break; 1555 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1556 *pc1 = 1.0/sctx.pv; 1557 1558 /* Now take care of 1st column of diagonal 3x3 block. */ 1559 pc2 = rtmp2 + i; 1560 pc3 = rtmp3 + i; 1561 if (*pc2 != 0.0 || *pc3 != 0.0){ 1562 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1563 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1564 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1565 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1566 for (j=0; j<nz; j++) { 1567 col = pj[j]; 1568 rtmp2[col] -= mul2 * rtmp1[col]; 1569 rtmp3[col] -= mul3 * rtmp1[col]; 1570 } 1571 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1572 } 1573 1574 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1575 rs = 0.0; 1576 /* L part */ 1577 pc2 = b->a + bi[i+1]; 1578 pj = b->j + bi[i+1] ; 1579 nz = bi[i+2] - bi[i+1]; 1580 for (j=0; j<nz; j++) { 1581 col = pj[j]; 1582 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1583 } 1584 /* U part */ 1585 pc2 = b->a + bdiag[i+2]+1; 1586 pj = b->j + bdiag[i+2]+1; 1587 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1588 for (j=0; j<nz; j++) { 1589 col = pj[j]; 1590 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1591 } 1592 1593 sctx.rs = rs; 1594 sctx.pv = rtmp2[i+1]; 1595 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1596 if(sctx.newshift) break; 1597 pc2 = b->a + bdiag[i+1]; 1598 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1599 1600 /* Now take care of 2nd column of diagonal 3x3 block. */ 1601 pc3 = rtmp3 + i+1; 1602 if (*pc3 != 0.0){ 1603 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1604 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1605 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1606 for (j=0; j<nz; j++) { 1607 col = pj[j]; 1608 rtmp3[col] -= mul3 * rtmp2[col]; 1609 } 1610 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1611 } 1612 1613 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1614 rs = 0.0; 1615 /* L part */ 1616 pc3 = b->a + bi[i+2]; 1617 pj = b->j + bi[i+2] ; 1618 nz = bi[i+3] - bi[i+2]; 1619 for (j=0; j<nz; j++) { 1620 col = pj[j]; 1621 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1622 } 1623 /* U part */ 1624 pc3 = b->a + bdiag[i+3]+1; 1625 pj = b->j + bdiag[i+3]+1; 1626 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1627 for (j=0; j<nz; j++) { 1628 col = pj[j]; 1629 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1630 } 1631 1632 sctx.rs = rs; 1633 sctx.pv = rtmp3[i+2]; 1634 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1635 if(sctx.newshift) break; 1636 pc3 = b->a + bdiag[i+2]; 1637 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1638 break; 1639 case 4: 1640 /*----------*/ 1641 /* zero rtmp */ 1642 /* L part */ 1643 nz = bi[i+1] - bi[i]; 1644 bjtmp = bj + bi[i]; 1645 for (j=0; j<nz; j++) { 1646 col = bjtmp[j]; 1647 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1648 } 1649 1650 /* U part */ 1651 nz = bdiag[i]-bdiag[i+1]; 1652 bjtmp = bj + bdiag[i+1]+1; 1653 for (j=0; j<nz; j++) { 1654 col = bjtmp[j]; 1655 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1656 } 1657 1658 /* load in initial (unfactored row) */ 1659 nz = ai[r[i]+1] - ai[r[i]]; 1660 ajtmp = aj + ai[r[i]]; 1661 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1662 for (j=0; j<nz; j++) { 1663 col = ics[ajtmp[j]]; 1664 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1665 } 1666 /* ZeropivotApply(): shift the diagonal of the matrix */ 1667 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1668 1669 /* elimination */ 1670 bjtmp = bj + bi[i]; 1671 row = *bjtmp++; /* pivot row */ 1672 nzL = bi[i+1] - bi[i]; 1673 for(k=0; k < nzL;k++) { 1674 pc1 = rtmp1 + row; 1675 pc2 = rtmp2 + row; 1676 pc3 = rtmp3 + row; 1677 pc4 = rtmp4 + row; 1678 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1679 pv = b->a + bdiag[row]; 1680 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1681 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1682 1683 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1684 pv = b->a + bdiag[row+1]+1; 1685 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1686 for (j=0; j<nz; j++){ 1687 col = pj[j]; 1688 rtmp1[col] -= mul1 * pv[j]; 1689 rtmp2[col] -= mul2 * pv[j]; 1690 rtmp3[col] -= mul3 * pv[j]; 1691 rtmp4[col] -= mul4 * pv[j]; 1692 } 1693 ierr = PetscLogFlops(8*nz);CHKERRQ(ierr); 1694 } 1695 row = *bjtmp++; 1696 } 1697 1698 /* finished row i; check zero pivot, then stick row i into b->a */ 1699 rs = 0.0; 1700 /* L part */ 1701 pc1 = b->a + bi[i]; 1702 pj = b->j + bi[i] ; 1703 nz = bi[i+1] - bi[i]; 1704 for (j=0; j<nz; j++) { 1705 col = pj[j]; 1706 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1707 } 1708 /* U part */ 1709 pc1 = b->a + bdiag[i+1]+1; 1710 pj = b->j + bdiag[i+1]+1; 1711 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1712 for (j=0; j<nz; j++) { 1713 col = pj[j]; 1714 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1715 } 1716 1717 sctx.rs = rs; 1718 sctx.pv = rtmp1[i]; 1719 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1720 if(sctx.newshift) break; 1721 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1722 *pc1 = 1.0/sctx.pv; 1723 1724 /* Now take care of 1st column of diagonal 4x4 block. */ 1725 pc2 = rtmp2 + i; 1726 pc3 = rtmp3 + i; 1727 pc4 = rtmp4 + i; 1728 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){ 1729 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1730 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1731 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1732 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1733 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1734 for (j=0; j<nz; j++) { 1735 col = pj[j]; 1736 rtmp2[col] -= mul2 * rtmp1[col]; 1737 rtmp3[col] -= mul3 * rtmp1[col]; 1738 rtmp4[col] -= mul4 * rtmp1[col]; 1739 } 1740 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1741 } 1742 1743 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1744 rs = 0.0; 1745 /* L part */ 1746 pc2 = b->a + bi[i+1]; 1747 pj = b->j + bi[i+1] ; 1748 nz = bi[i+2] - bi[i+1]; 1749 for (j=0; j<nz; j++) { 1750 col = pj[j]; 1751 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1752 } 1753 /* U part */ 1754 pc2 = b->a + bdiag[i+2]+1; 1755 pj = b->j + bdiag[i+2]+1; 1756 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1757 for (j=0; j<nz; j++) { 1758 col = pj[j]; 1759 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1760 } 1761 1762 sctx.rs = rs; 1763 sctx.pv = rtmp2[i+1]; 1764 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1765 if(sctx.newshift) break; 1766 pc2 = b->a + bdiag[i+1]; 1767 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1768 1769 /* Now take care of 2nd column of diagonal 4x4 block. */ 1770 pc3 = rtmp3 + i+1; 1771 pc4 = rtmp4 + i+1; 1772 if (*pc3 != 0.0 || *pc4 != 0.0){ 1773 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1774 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1775 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1776 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1777 for (j=0; j<nz; j++) { 1778 col = pj[j]; 1779 rtmp3[col] -= mul3 * rtmp2[col]; 1780 rtmp4[col] -= mul4 * rtmp2[col]; 1781 } 1782 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1783 } 1784 1785 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1786 rs = 0.0; 1787 /* L part */ 1788 pc3 = b->a + bi[i+2]; 1789 pj = b->j + bi[i+2] ; 1790 nz = bi[i+3] - bi[i+2]; 1791 for (j=0; j<nz; j++) { 1792 col = pj[j]; 1793 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1794 } 1795 /* U part */ 1796 pc3 = b->a + bdiag[i+3]+1; 1797 pj = b->j + bdiag[i+3]+1; 1798 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1799 for (j=0; j<nz; j++) { 1800 col = pj[j]; 1801 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1802 } 1803 1804 sctx.rs = rs; 1805 sctx.pv = rtmp3[i+2]; 1806 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1807 if(sctx.newshift) break; 1808 pc3 = b->a + bdiag[i+2]; 1809 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1810 1811 /* Now take care of 3rd column of diagonal 4x4 block. */ 1812 pc4 = rtmp4 + i+2; 1813 if (*pc4 != 0.0){ 1814 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1815 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1816 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1817 for (j=0; j<nz; j++) { 1818 col = pj[j]; 1819 rtmp4[col] -= mul4 * rtmp3[col]; 1820 } 1821 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1822 } 1823 1824 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1825 rs = 0.0; 1826 /* L part */ 1827 pc4 = b->a + bi[i+3]; 1828 pj = b->j + bi[i+3] ; 1829 nz = bi[i+4] - bi[i+3]; 1830 for (j=0; j<nz; j++) { 1831 col = pj[j]; 1832 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1833 } 1834 /* U part */ 1835 pc4 = b->a + bdiag[i+4]+1; 1836 pj = b->j + bdiag[i+4]+1; 1837 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1838 for (j=0; j<nz; j++) { 1839 col = pj[j]; 1840 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1841 } 1842 1843 sctx.rs = rs; 1844 sctx.pv = rtmp4[i+3]; 1845 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1846 if(sctx.newshift) break; 1847 pc4 = b->a + bdiag[i+3]; 1848 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1849 break; 1850 1851 default: 1852 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1853 } 1854 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1855 i += nodesz; /* Update the row */ 1856 } 1857 1858 /* MatPivotRefine() */ 1859 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1860 /* 1861 * if no shift in this attempt & shifting & started shifting & can refine, 1862 * then try lower shift 1863 */ 1864 sctx.shift_hi = sctx.shift_fraction; 1865 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1866 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1867 sctx.newshift = PETSC_TRUE; 1868 sctx.nshift++; 1869 } 1870 } while (sctx.newshift); 1871 1872 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1873 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1874 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1875 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1876 1877 C->ops->solve = MatSolve_SeqAIJ; 1878 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1879 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1880 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1881 C->ops->matsolve = MatMatSolve_SeqAIJ; 1882 C->assembled = PETSC_TRUE; 1883 C->preallocated = PETSC_TRUE; 1884 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1885 1886 /* MatShiftView(A,info,&sctx) */ 1887 if (sctx.nshift){ 1888 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1889 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1890 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1891 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1892 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){ 1893 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1894 } 1895 } 1896 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1897 PetscFunctionReturn(0); 1898 } 1899 1900 #undef __FUNCT__ 1901 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1902 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1903 { 1904 Mat C = B; 1905 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1906 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1907 PetscErrorCode ierr; 1908 const PetscInt *r,*ic,*c,*ics; 1909 PetscInt n = A->rmap->n,*bi = b->i; 1910 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1911 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1912 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1913 PetscScalar mul1,mul2,mul3,tmp; 1914 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1915 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1916 PetscReal rs=0.0; 1917 FactorShiftCtx sctx; 1918 1919 PetscFunctionBegin; 1920 sctx.shift_top = 0; 1921 sctx.nshift_max = 0; 1922 sctx.shift_lo = 0; 1923 sctx.shift_hi = 0; 1924 sctx.shift_fraction = 0; 1925 1926 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1927 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1928 sctx.shift_top = 0; 1929 for (i=0; i<n; i++) { 1930 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1931 rs = 0.0; 1932 ajtmp = aj + ai[i]; 1933 rtmp1 = aa + ai[i]; 1934 nz = ai[i+1] - ai[i]; 1935 for (j=0; j<nz; j++){ 1936 if (*ajtmp != i){ 1937 rs += PetscAbsScalar(*rtmp1++); 1938 } else { 1939 rs -= PetscRealPart(*rtmp1++); 1940 } 1941 ajtmp++; 1942 } 1943 if (rs>sctx.shift_top) sctx.shift_top = rs; 1944 } 1945 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1946 sctx.shift_top *= 1.1; 1947 sctx.nshift_max = 5; 1948 sctx.shift_lo = 0.; 1949 sctx.shift_hi = 1.; 1950 } 1951 sctx.shift_amount = 0; 1952 sctx.nshift = 0; 1953 1954 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1955 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1956 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1957 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1958 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1959 ics = ic ; 1960 rtmp22 = rtmp11 + n; 1961 rtmp33 = rtmp22 + n; 1962 1963 node_max = a->inode.node_count; 1964 ns = a->inode.size; 1965 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1966 1967 /* If max inode size > 3, split it into two inodes.*/ 1968 /* also map the inode sizes according to the ordering */ 1969 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1970 for (i=0,j=0; i<node_max; ++i,++j){ 1971 if (ns[i]>3) { 1972 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1973 ++j; 1974 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1975 } else { 1976 tmp_vec1[j] = ns[i]; 1977 } 1978 } 1979 /* Use the correct node_max */ 1980 node_max = j; 1981 1982 /* Now reorder the inode info based on mat re-ordering info */ 1983 /* First create a row -> inode_size_array_index map */ 1984 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1985 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1986 for (i=0,row=0; i<node_max; i++) { 1987 nodesz = tmp_vec1[i]; 1988 for (j=0; j<nodesz; j++,row++) { 1989 nsmap[row] = i; 1990 } 1991 } 1992 /* Using nsmap, create a reordered ns structure */ 1993 for (i=0,j=0; i< node_max; i++) { 1994 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1995 tmp_vec2[i] = nodesz; 1996 j += nodesz; 1997 } 1998 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1999 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 2000 /* Now use the correct ns */ 2001 ns = tmp_vec2; 2002 2003 do { 2004 sctx.newshift = PETSC_FALSE; 2005 /* Now loop over each block-row, and do the factorization */ 2006 for (i=0,row=0; i<node_max; i++) { 2007 nodesz = ns[i]; 2008 nz = bi[row+1] - bi[row]; 2009 bjtmp = bj + bi[row]; 2010 2011 switch (nodesz){ 2012 case 1: 2013 for (j=0; j<nz; j++){ 2014 idx = bjtmp[j]; 2015 rtmp11[idx] = 0.0; 2016 } 2017 2018 /* load in initial (unfactored row) */ 2019 idx = r[row]; 2020 nz_tmp = ai[idx+1] - ai[idx]; 2021 ajtmp = aj + ai[idx]; 2022 v1 = aa + ai[idx]; 2023 2024 for (j=0; j<nz_tmp; j++) { 2025 idx = ics[ajtmp[j]]; 2026 rtmp11[idx] = v1[j]; 2027 } 2028 rtmp11[ics[r[row]]] += sctx.shift_amount; 2029 2030 prow = *bjtmp++ ; 2031 while (prow < row) { 2032 pc1 = rtmp11 + prow; 2033 if (*pc1 != 0.0){ 2034 pv = ba + bd[prow]; 2035 pj = nbj + bd[prow]; 2036 mul1 = *pc1 * *pv++; 2037 *pc1 = mul1; 2038 nz_tmp = bi[prow+1] - bd[prow] - 1; 2039 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2040 for (j=0; j<nz_tmp; j++) { 2041 tmp = pv[j]; 2042 idx = pj[j]; 2043 rtmp11[idx] -= mul1 * tmp; 2044 } 2045 } 2046 prow = *bjtmp++ ; 2047 } 2048 pj = bj + bi[row]; 2049 pc1 = ba + bi[row]; 2050 2051 sctx.pv = rtmp11[row]; 2052 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2053 rs = 0.0; 2054 for (j=0; j<nz; j++) { 2055 idx = pj[j]; 2056 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2057 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2058 } 2059 sctx.rs = rs; 2060 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2061 if (sctx.newshift) goto endofwhile; 2062 break; 2063 2064 case 2: 2065 for (j=0; j<nz; j++) { 2066 idx = bjtmp[j]; 2067 rtmp11[idx] = 0.0; 2068 rtmp22[idx] = 0.0; 2069 } 2070 2071 /* load in initial (unfactored row) */ 2072 idx = r[row]; 2073 nz_tmp = ai[idx+1] - ai[idx]; 2074 ajtmp = aj + ai[idx]; 2075 v1 = aa + ai[idx]; 2076 v2 = aa + ai[idx+1]; 2077 for (j=0; j<nz_tmp; j++) { 2078 idx = ics[ajtmp[j]]; 2079 rtmp11[idx] = v1[j]; 2080 rtmp22[idx] = v2[j]; 2081 } 2082 rtmp11[ics[r[row]]] += sctx.shift_amount; 2083 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2084 2085 prow = *bjtmp++ ; 2086 while (prow < row) { 2087 pc1 = rtmp11 + prow; 2088 pc2 = rtmp22 + prow; 2089 if (*pc1 != 0.0 || *pc2 != 0.0){ 2090 pv = ba + bd[prow]; 2091 pj = nbj + bd[prow]; 2092 mul1 = *pc1 * *pv; 2093 mul2 = *pc2 * *pv; 2094 ++pv; 2095 *pc1 = mul1; 2096 *pc2 = mul2; 2097 2098 nz_tmp = bi[prow+1] - bd[prow] - 1; 2099 for (j=0; j<nz_tmp; j++) { 2100 tmp = pv[j]; 2101 idx = pj[j]; 2102 rtmp11[idx] -= mul1 * tmp; 2103 rtmp22[idx] -= mul2 * tmp; 2104 } 2105 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2106 } 2107 prow = *bjtmp++ ; 2108 } 2109 2110 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2111 pc1 = rtmp11 + prow; 2112 pc2 = rtmp22 + prow; 2113 2114 sctx.pv = *pc1; 2115 pj = bj + bi[prow]; 2116 rs = 0.0; 2117 for (j=0; j<nz; j++){ 2118 idx = pj[j]; 2119 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2120 } 2121 sctx.rs = rs; 2122 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2123 if (sctx.newshift) goto endofwhile; 2124 2125 if (*pc2 != 0.0){ 2126 pj = nbj + bd[prow]; 2127 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2128 *pc2 = mul2; 2129 nz_tmp = bi[prow+1] - bd[prow] - 1; 2130 for (j=0; j<nz_tmp; j++) { 2131 idx = pj[j] ; 2132 tmp = rtmp11[idx]; 2133 rtmp22[idx] -= mul2 * tmp; 2134 } 2135 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2136 } 2137 2138 pj = bj + bi[row]; 2139 pc1 = ba + bi[row]; 2140 pc2 = ba + bi[row+1]; 2141 2142 sctx.pv = rtmp22[row+1]; 2143 rs = 0.0; 2144 rtmp11[row] = 1.0/rtmp11[row]; 2145 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2146 /* copy row entries from dense representation to sparse */ 2147 for (j=0; j<nz; j++) { 2148 idx = pj[j]; 2149 pc1[j] = rtmp11[idx]; 2150 pc2[j] = rtmp22[idx]; 2151 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2152 } 2153 sctx.rs = rs; 2154 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2155 if (sctx.newshift) goto endofwhile; 2156 break; 2157 2158 case 3: 2159 for (j=0; j<nz; j++) { 2160 idx = bjtmp[j]; 2161 rtmp11[idx] = 0.0; 2162 rtmp22[idx] = 0.0; 2163 rtmp33[idx] = 0.0; 2164 } 2165 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2166 idx = r[row]; 2167 nz_tmp = ai[idx+1] - ai[idx]; 2168 ajtmp = aj + ai[idx]; 2169 v1 = aa + ai[idx]; 2170 v2 = aa + ai[idx+1]; 2171 v3 = aa + ai[idx+2]; 2172 for (j=0; j<nz_tmp; j++) { 2173 idx = ics[ajtmp[j]]; 2174 rtmp11[idx] = v1[j]; 2175 rtmp22[idx] = v2[j]; 2176 rtmp33[idx] = v3[j]; 2177 } 2178 rtmp11[ics[r[row]]] += sctx.shift_amount; 2179 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2180 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2181 2182 /* loop over all pivot row blocks above this row block */ 2183 prow = *bjtmp++ ; 2184 while (prow < row) { 2185 pc1 = rtmp11 + prow; 2186 pc2 = rtmp22 + prow; 2187 pc3 = rtmp33 + prow; 2188 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 2189 pv = ba + bd[prow]; 2190 pj = nbj + bd[prow]; 2191 mul1 = *pc1 * *pv; 2192 mul2 = *pc2 * *pv; 2193 mul3 = *pc3 * *pv; 2194 ++pv; 2195 *pc1 = mul1; 2196 *pc2 = mul2; 2197 *pc3 = mul3; 2198 2199 nz_tmp = bi[prow+1] - bd[prow] - 1; 2200 /* update this row based on pivot row */ 2201 for (j=0; j<nz_tmp; j++) { 2202 tmp = pv[j]; 2203 idx = pj[j]; 2204 rtmp11[idx] -= mul1 * tmp; 2205 rtmp22[idx] -= mul2 * tmp; 2206 rtmp33[idx] -= mul3 * tmp; 2207 } 2208 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 2209 } 2210 prow = *bjtmp++ ; 2211 } 2212 2213 /* Now take care of diagonal 3x3 block in this set of rows */ 2214 /* note: prow = row here */ 2215 pc1 = rtmp11 + prow; 2216 pc2 = rtmp22 + prow; 2217 pc3 = rtmp33 + prow; 2218 2219 sctx.pv = *pc1; 2220 pj = bj + bi[prow]; 2221 rs = 0.0; 2222 for (j=0; j<nz; j++){ 2223 idx = pj[j]; 2224 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2225 } 2226 sctx.rs = rs; 2227 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2228 if (sctx.newshift) goto endofwhile; 2229 2230 if (*pc2 != 0.0 || *pc3 != 0.0){ 2231 mul2 = (*pc2)/(*pc1); 2232 mul3 = (*pc3)/(*pc1); 2233 *pc2 = mul2; 2234 *pc3 = mul3; 2235 nz_tmp = bi[prow+1] - bd[prow] - 1; 2236 pj = nbj + bd[prow]; 2237 for (j=0; j<nz_tmp; j++) { 2238 idx = pj[j] ; 2239 tmp = rtmp11[idx]; 2240 rtmp22[idx] -= mul2 * tmp; 2241 rtmp33[idx] -= mul3 * tmp; 2242 } 2243 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2244 } 2245 ++prow; 2246 2247 pc2 = rtmp22 + prow; 2248 pc3 = rtmp33 + prow; 2249 sctx.pv = *pc2; 2250 pj = bj + bi[prow]; 2251 rs = 0.0; 2252 for (j=0; j<nz; j++){ 2253 idx = pj[j]; 2254 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2255 } 2256 sctx.rs = rs; 2257 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2258 if (sctx.newshift) goto endofwhile; 2259 2260 if (*pc3 != 0.0){ 2261 mul3 = (*pc3)/(*pc2); 2262 *pc3 = mul3; 2263 pj = nbj + bd[prow]; 2264 nz_tmp = bi[prow+1] - bd[prow] - 1; 2265 for (j=0; j<nz_tmp; j++) { 2266 idx = pj[j] ; 2267 tmp = rtmp22[idx]; 2268 rtmp33[idx] -= mul3 * tmp; 2269 } 2270 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2271 } 2272 2273 pj = bj + bi[row]; 2274 pc1 = ba + bi[row]; 2275 pc2 = ba + bi[row+1]; 2276 pc3 = ba + bi[row+2]; 2277 2278 sctx.pv = rtmp33[row+2]; 2279 rs = 0.0; 2280 rtmp11[row] = 1.0/rtmp11[row]; 2281 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2282 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2283 /* copy row entries from dense representation to sparse */ 2284 for (j=0; j<nz; j++) { 2285 idx = pj[j]; 2286 pc1[j] = rtmp11[idx]; 2287 pc2[j] = rtmp22[idx]; 2288 pc3[j] = rtmp33[idx]; 2289 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2290 } 2291 2292 sctx.rs = rs; 2293 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2294 if (sctx.newshift) goto endofwhile; 2295 break; 2296 2297 default: 2298 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2299 } 2300 row += nodesz; /* Update the row */ 2301 } 2302 endofwhile:; 2303 } while (sctx.newshift); 2304 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2305 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2306 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2307 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2308 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2309 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2310 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2311 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2312 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2313 C->assembled = PETSC_TRUE; 2314 C->preallocated = PETSC_TRUE; 2315 if (sctx.nshift) { 2316 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2317 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2318 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2319 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2320 } 2321 } 2322 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2323 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2324 PetscFunctionReturn(0); 2325 } 2326 2327 2328 /* ----------------------------------------------------------- */ 2329 #undef __FUNCT__ 2330 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2331 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2332 { 2333 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2334 IS iscol = a->col,isrow = a->row; 2335 PetscErrorCode ierr; 2336 const PetscInt *r,*c,*rout,*cout; 2337 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2338 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2339 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2340 PetscScalar sum1,sum2,sum3,sum4,sum5; 2341 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2342 const PetscScalar *b; 2343 2344 PetscFunctionBegin; 2345 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2346 node_max = a->inode.node_count; 2347 ns = a->inode.size; /* Node Size array */ 2348 2349 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2350 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2351 tmp = a->solve_work; 2352 2353 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2354 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2355 2356 /* forward solve the lower triangular */ 2357 tmps = tmp ; 2358 aa = a_a ; 2359 aj = a_j ; 2360 ad = a->diag; 2361 2362 for (i = 0,row = 0; i< node_max; ++i){ 2363 nsz = ns[i]; 2364 aii = ai[row]; 2365 v1 = aa + aii; 2366 vi = aj + aii; 2367 nz = ai[row+1]- ai[row]; 2368 2369 if (i < node_max-1) { 2370 /* Prefetch the indices for the next block */ 2371 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2372 /* Prefetch the data for the next block */ 2373 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2374 } 2375 2376 switch (nsz){ /* Each loop in 'case' is unrolled */ 2377 case 1 : 2378 sum1 = b[r[row]]; 2379 for(j=0; j<nz-1; j+=2){ 2380 i0 = vi[j]; 2381 i1 = vi[j+1]; 2382 tmp0 = tmps[i0]; 2383 tmp1 = tmps[i1]; 2384 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2385 } 2386 if(j == nz-1){ 2387 tmp0 = tmps[vi[j]]; 2388 sum1 -= v1[j]*tmp0; 2389 } 2390 tmp[row++]=sum1; 2391 break; 2392 case 2: 2393 sum1 = b[r[row]]; 2394 sum2 = b[r[row+1]]; 2395 v2 = aa + ai[row+1]; 2396 2397 for(j=0; j<nz-1; j+=2){ 2398 i0 = vi[j]; 2399 i1 = vi[j+1]; 2400 tmp0 = tmps[i0]; 2401 tmp1 = tmps[i1]; 2402 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2403 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2404 } 2405 if(j == nz-1){ 2406 tmp0 = tmps[vi[j]]; 2407 sum1 -= v1[j] *tmp0; 2408 sum2 -= v2[j] *tmp0; 2409 } 2410 sum2 -= v2[nz] * sum1; 2411 tmp[row ++]=sum1; 2412 tmp[row ++]=sum2; 2413 break; 2414 case 3: 2415 sum1 = b[r[row]]; 2416 sum2 = b[r[row+1]]; 2417 sum3 = b[r[row+2]]; 2418 v2 = aa + ai[row+1]; 2419 v3 = aa + ai[row+2]; 2420 2421 for (j=0; j<nz-1; j+=2){ 2422 i0 = vi[j]; 2423 i1 = vi[j+1]; 2424 tmp0 = tmps[i0]; 2425 tmp1 = tmps[i1]; 2426 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2427 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2428 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2429 } 2430 if (j == nz-1){ 2431 tmp0 = tmps[vi[j]]; 2432 sum1 -= v1[j] *tmp0; 2433 sum2 -= v2[j] *tmp0; 2434 sum3 -= v3[j] *tmp0; 2435 } 2436 sum2 -= v2[nz] * sum1; 2437 sum3 -= v3[nz] * sum1; 2438 sum3 -= v3[nz+1] * sum2; 2439 tmp[row ++]=sum1; 2440 tmp[row ++]=sum2; 2441 tmp[row ++]=sum3; 2442 break; 2443 2444 case 4: 2445 sum1 = b[r[row]]; 2446 sum2 = b[r[row+1]]; 2447 sum3 = b[r[row+2]]; 2448 sum4 = b[r[row+3]]; 2449 v2 = aa + ai[row+1]; 2450 v3 = aa + ai[row+2]; 2451 v4 = aa + ai[row+3]; 2452 2453 for (j=0; j<nz-1; j+=2){ 2454 i0 = vi[j]; 2455 i1 = vi[j+1]; 2456 tmp0 = tmps[i0]; 2457 tmp1 = tmps[i1]; 2458 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2459 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2460 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2461 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2462 } 2463 if (j == nz-1){ 2464 tmp0 = tmps[vi[j]]; 2465 sum1 -= v1[j] *tmp0; 2466 sum2 -= v2[j] *tmp0; 2467 sum3 -= v3[j] *tmp0; 2468 sum4 -= v4[j] *tmp0; 2469 } 2470 sum2 -= v2[nz] * sum1; 2471 sum3 -= v3[nz] * sum1; 2472 sum4 -= v4[nz] * sum1; 2473 sum3 -= v3[nz+1] * sum2; 2474 sum4 -= v4[nz+1] * sum2; 2475 sum4 -= v4[nz+2] * sum3; 2476 2477 tmp[row ++]=sum1; 2478 tmp[row ++]=sum2; 2479 tmp[row ++]=sum3; 2480 tmp[row ++]=sum4; 2481 break; 2482 case 5: 2483 sum1 = b[r[row]]; 2484 sum2 = b[r[row+1]]; 2485 sum3 = b[r[row+2]]; 2486 sum4 = b[r[row+3]]; 2487 sum5 = b[r[row+4]]; 2488 v2 = aa + ai[row+1]; 2489 v3 = aa + ai[row+2]; 2490 v4 = aa + ai[row+3]; 2491 v5 = aa + ai[row+4]; 2492 2493 for (j=0; j<nz-1; j+=2){ 2494 i0 = vi[j]; 2495 i1 = vi[j+1]; 2496 tmp0 = tmps[i0]; 2497 tmp1 = tmps[i1]; 2498 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2499 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2500 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2501 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2502 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2503 } 2504 if (j == nz-1){ 2505 tmp0 = tmps[vi[j]]; 2506 sum1 -= v1[j] *tmp0; 2507 sum2 -= v2[j] *tmp0; 2508 sum3 -= v3[j] *tmp0; 2509 sum4 -= v4[j] *tmp0; 2510 sum5 -= v5[j] *tmp0; 2511 } 2512 2513 sum2 -= v2[nz] * sum1; 2514 sum3 -= v3[nz] * sum1; 2515 sum4 -= v4[nz] * sum1; 2516 sum5 -= v5[nz] * sum1; 2517 sum3 -= v3[nz+1] * sum2; 2518 sum4 -= v4[nz+1] * sum2; 2519 sum5 -= v5[nz+1] * sum2; 2520 sum4 -= v4[nz+2] * sum3; 2521 sum5 -= v5[nz+2] * sum3; 2522 sum5 -= v5[nz+3] * sum4; 2523 2524 tmp[row ++]=sum1; 2525 tmp[row ++]=sum2; 2526 tmp[row ++]=sum3; 2527 tmp[row ++]=sum4; 2528 tmp[row ++]=sum5; 2529 break; 2530 default: 2531 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2532 } 2533 } 2534 /* backward solve the upper triangular */ 2535 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2536 nsz = ns[i]; 2537 aii = ad[row+1] + 1; 2538 v1 = aa + aii; 2539 vi = aj + aii; 2540 nz = ad[row]- ad[row+1] - 1; 2541 2542 if (i > 0) { 2543 /* Prefetch the indices for the next block */ 2544 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2545 /* Prefetch the data for the next block */ 2546 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2547 } 2548 2549 switch (nsz){ /* Each loop in 'case' is unrolled */ 2550 case 1 : 2551 sum1 = tmp[row]; 2552 2553 for(j=0 ; j<nz-1; j+=2){ 2554 i0 = vi[j]; 2555 i1 = vi[j+1]; 2556 tmp0 = tmps[i0]; 2557 tmp1 = tmps[i1]; 2558 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2559 } 2560 if (j == nz-1){ 2561 tmp0 = tmps[vi[j]]; 2562 sum1 -= v1[j]*tmp0; 2563 } 2564 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2565 break; 2566 case 2 : 2567 sum1 = tmp[row]; 2568 sum2 = tmp[row-1]; 2569 v2 = aa + ad[row] + 1; 2570 for (j=0 ; j<nz-1; j+=2){ 2571 i0 = vi[j]; 2572 i1 = vi[j+1]; 2573 tmp0 = tmps[i0]; 2574 tmp1 = tmps[i1]; 2575 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2576 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2577 } 2578 if (j == nz-1){ 2579 tmp0 = tmps[vi[j]]; 2580 sum1 -= v1[j]* tmp0; 2581 sum2 -= v2[j+1]* tmp0; 2582 } 2583 2584 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2585 sum2 -= v2[0] * tmp0; 2586 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2587 break; 2588 case 3 : 2589 sum1 = tmp[row]; 2590 sum2 = tmp[row -1]; 2591 sum3 = tmp[row -2]; 2592 v2 = aa + ad[row] + 1; 2593 v3 = aa + ad[row -1] + 1; 2594 for (j=0 ; j<nz-1; j+=2){ 2595 i0 = vi[j]; 2596 i1 = vi[j+1]; 2597 tmp0 = tmps[i0]; 2598 tmp1 = tmps[i1]; 2599 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2600 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2601 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2602 } 2603 if (j== nz-1){ 2604 tmp0 = tmps[vi[j]]; 2605 sum1 -= v1[j] * tmp0; 2606 sum2 -= v2[j+1] * tmp0; 2607 sum3 -= v3[j+2] * tmp0; 2608 } 2609 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2610 sum2 -= v2[0]* tmp0; 2611 sum3 -= v3[1] * tmp0; 2612 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2613 sum3 -= v3[0]* tmp0; 2614 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2615 2616 break; 2617 case 4 : 2618 sum1 = tmp[row]; 2619 sum2 = tmp[row -1]; 2620 sum3 = tmp[row -2]; 2621 sum4 = tmp[row -3]; 2622 v2 = aa + ad[row]+1; 2623 v3 = aa + ad[row -1]+1; 2624 v4 = aa + ad[row -2]+1; 2625 2626 for (j=0 ; j<nz-1; j+=2){ 2627 i0 = vi[j]; 2628 i1 = vi[j+1]; 2629 tmp0 = tmps[i0]; 2630 tmp1 = tmps[i1]; 2631 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2632 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2633 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2634 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2635 } 2636 if (j== nz-1){ 2637 tmp0 = tmps[vi[j]]; 2638 sum1 -= v1[j] * tmp0; 2639 sum2 -= v2[j+1] * tmp0; 2640 sum3 -= v3[j+2] * tmp0; 2641 sum4 -= v4[j+3] * tmp0; 2642 } 2643 2644 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2645 sum2 -= v2[0] * tmp0; 2646 sum3 -= v3[1] * tmp0; 2647 sum4 -= v4[2] * tmp0; 2648 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2649 sum3 -= v3[0] * tmp0; 2650 sum4 -= v4[1] * tmp0; 2651 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2652 sum4 -= v4[0] * tmp0; 2653 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2654 break; 2655 case 5 : 2656 sum1 = tmp[row]; 2657 sum2 = tmp[row -1]; 2658 sum3 = tmp[row -2]; 2659 sum4 = tmp[row -3]; 2660 sum5 = tmp[row -4]; 2661 v2 = aa + ad[row]+1; 2662 v3 = aa + ad[row -1]+1; 2663 v4 = aa + ad[row -2]+1; 2664 v5 = aa + ad[row -3]+1; 2665 for (j=0 ; j<nz-1; j+=2){ 2666 i0 = vi[j]; 2667 i1 = vi[j+1]; 2668 tmp0 = tmps[i0]; 2669 tmp1 = tmps[i1]; 2670 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2671 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2672 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2673 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2674 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2675 } 2676 if (j==nz-1){ 2677 tmp0 = tmps[vi[j]]; 2678 sum1 -= v1[j] * tmp0; 2679 sum2 -= v2[j+1] * tmp0; 2680 sum3 -= v3[j+2] * tmp0; 2681 sum4 -= v4[j+3] * tmp0; 2682 sum5 -= v5[j+4] * tmp0; 2683 } 2684 2685 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2686 sum2 -= v2[0] * tmp0; 2687 sum3 -= v3[1] * tmp0; 2688 sum4 -= v4[2] * tmp0; 2689 sum5 -= v5[3] * tmp0; 2690 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2691 sum3 -= v3[0] * tmp0; 2692 sum4 -= v4[1] * tmp0; 2693 sum5 -= v5[2] * tmp0; 2694 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2695 sum4 -= v4[0] * tmp0; 2696 sum5 -= v5[1] * tmp0; 2697 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2698 sum5 -= v5[0] * tmp0; 2699 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2700 break; 2701 default: 2702 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2703 } 2704 } 2705 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2706 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2707 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2708 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2709 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2710 PetscFunctionReturn(0); 2711 } 2712 2713 2714 /* 2715 Makes a longer coloring[] array and calls the usual code with that 2716 */ 2717 #undef __FUNCT__ 2718 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2719 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2720 { 2721 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2722 PetscErrorCode ierr; 2723 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2724 PetscInt *colorused,i; 2725 ISColoringValue *newcolor; 2726 2727 PetscFunctionBegin; 2728 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2729 /* loop over inodes, marking a color for each column*/ 2730 row = 0; 2731 for (i=0; i<m; i++){ 2732 for (j=0; j<ns[i]; j++) { 2733 newcolor[row++] = coloring[i] + j*ncolors; 2734 } 2735 } 2736 2737 /* eliminate unneeded colors */ 2738 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2739 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2740 for (i=0; i<n; i++) { 2741 colorused[newcolor[i]] = 1; 2742 } 2743 2744 for (i=1; i<5*ncolors; i++) { 2745 colorused[i] += colorused[i-1]; 2746 } 2747 ncolors = colorused[5*ncolors-1]; 2748 for (i=0; i<n; i++) { 2749 newcolor[i] = colorused[newcolor[i]]-1; 2750 } 2751 ierr = PetscFree(colorused);CHKERRQ(ierr); 2752 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2753 ierr = PetscFree(coloring);CHKERRQ(ierr); 2754 PetscFunctionReturn(0); 2755 } 2756 2757 #include <../src/mat/blockinvert.h> 2758 2759 #undef __FUNCT__ 2760 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2761 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2762 { 2763 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2764 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2765 MatScalar *ibdiag,*bdiag,work[25]; 2766 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2767 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2768 const PetscScalar *xb; 2769 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2770 PetscErrorCode ierr; 2771 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2772 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2773 2774 PetscFunctionBegin; 2775 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2776 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2777 if (its > 1) { 2778 /* switch to non-inode version */ 2779 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2780 PetscFunctionReturn(0); 2781 } 2782 2783 if (!a->inode.ibdiagvalid) { 2784 if (!a->inode.ibdiag) { 2785 /* calculate space needed for diagonal blocks */ 2786 for (i=0; i<m; i++) { 2787 cnt += sizes[i]*sizes[i]; 2788 } 2789 a->inode.bdiagsize = cnt; 2790 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2791 } 2792 2793 /* copy over the diagonal blocks and invert them */ 2794 ibdiag = a->inode.ibdiag; 2795 bdiag = a->inode.bdiag; 2796 cnt = 0; 2797 for (i=0, row = 0; i<m; i++) { 2798 for (j=0; j<sizes[i]; j++) { 2799 for (k=0; k<sizes[i]; k++) { 2800 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2801 } 2802 } 2803 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2804 2805 switch(sizes[i]) { 2806 case 1: 2807 /* Create matrix data structure */ 2808 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2809 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2810 break; 2811 case 2: 2812 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2813 break; 2814 case 3: 2815 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2816 break; 2817 case 4: 2818 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2819 break; 2820 case 5: 2821 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2822 break; 2823 default: 2824 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2825 } 2826 cnt += sizes[i]*sizes[i]; 2827 row += sizes[i]; 2828 } 2829 a->inode.ibdiagvalid = PETSC_TRUE; 2830 } 2831 ibdiag = a->inode.ibdiag; 2832 bdiag = a->inode.bdiag; 2833 2834 2835 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2836 if (flag & SOR_ZERO_INITIAL_GUESS) { 2837 const PetscScalar *b; 2838 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2839 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2840 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2841 2842 for (i=0, row=0; i<m; i++) { 2843 sz = diag[row] - ii[row]; 2844 v1 = a->a + ii[row]; 2845 idx = a->j + ii[row]; 2846 2847 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2848 switch (sizes[i]){ 2849 case 1: 2850 2851 sum1 = b[row]; 2852 for(n = 0; n<sz-1; n+=2) { 2853 i1 = idx[0]; 2854 i2 = idx[1]; 2855 idx += 2; 2856 tmp0 = x[i1]; 2857 tmp1 = x[i2]; 2858 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2859 } 2860 2861 if (n == sz-1){ 2862 tmp0 = x[*idx]; 2863 sum1 -= *v1 * tmp0; 2864 } 2865 x[row++] = sum1*(*ibdiag++); 2866 break; 2867 case 2: 2868 v2 = a->a + ii[row+1]; 2869 sum1 = b[row]; 2870 sum2 = b[row+1]; 2871 for(n = 0; n<sz-1; n+=2) { 2872 i1 = idx[0]; 2873 i2 = idx[1]; 2874 idx += 2; 2875 tmp0 = x[i1]; 2876 tmp1 = x[i2]; 2877 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2878 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2879 } 2880 2881 if (n == sz-1){ 2882 tmp0 = x[*idx]; 2883 sum1 -= v1[0] * tmp0; 2884 sum2 -= v2[0] * tmp0; 2885 } 2886 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2887 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2888 ibdiag += 4; 2889 break; 2890 case 3: 2891 v2 = a->a + ii[row+1]; 2892 v3 = a->a + ii[row+2]; 2893 sum1 = b[row]; 2894 sum2 = b[row+1]; 2895 sum3 = b[row+2]; 2896 for(n = 0; n<sz-1; n+=2) { 2897 i1 = idx[0]; 2898 i2 = idx[1]; 2899 idx += 2; 2900 tmp0 = x[i1]; 2901 tmp1 = x[i2]; 2902 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2903 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2904 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2905 } 2906 2907 if (n == sz-1){ 2908 tmp0 = x[*idx]; 2909 sum1 -= v1[0] * tmp0; 2910 sum2 -= v2[0] * tmp0; 2911 sum3 -= v3[0] * tmp0; 2912 } 2913 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2914 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2915 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2916 ibdiag += 9; 2917 break; 2918 case 4: 2919 v2 = a->a + ii[row+1]; 2920 v3 = a->a + ii[row+2]; 2921 v4 = a->a + ii[row+3]; 2922 sum1 = b[row]; 2923 sum2 = b[row+1]; 2924 sum3 = b[row+2]; 2925 sum4 = b[row+3]; 2926 for(n = 0; n<sz-1; n+=2) { 2927 i1 = idx[0]; 2928 i2 = idx[1]; 2929 idx += 2; 2930 tmp0 = x[i1]; 2931 tmp1 = x[i2]; 2932 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2933 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2934 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2935 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2936 } 2937 2938 if (n == sz-1){ 2939 tmp0 = x[*idx]; 2940 sum1 -= v1[0] * tmp0; 2941 sum2 -= v2[0] * tmp0; 2942 sum3 -= v3[0] * tmp0; 2943 sum4 -= v4[0] * tmp0; 2944 } 2945 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2946 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2947 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2948 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2949 ibdiag += 16; 2950 break; 2951 case 5: 2952 v2 = a->a + ii[row+1]; 2953 v3 = a->a + ii[row+2]; 2954 v4 = a->a + ii[row+3]; 2955 v5 = a->a + ii[row+4]; 2956 sum1 = b[row]; 2957 sum2 = b[row+1]; 2958 sum3 = b[row+2]; 2959 sum4 = b[row+3]; 2960 sum5 = b[row+4]; 2961 for(n = 0; n<sz-1; n+=2) { 2962 i1 = idx[0]; 2963 i2 = idx[1]; 2964 idx += 2; 2965 tmp0 = x[i1]; 2966 tmp1 = x[i2]; 2967 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2968 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2969 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2970 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2971 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2972 } 2973 2974 if (n == sz-1){ 2975 tmp0 = x[*idx]; 2976 sum1 -= v1[0] * tmp0; 2977 sum2 -= v2[0] * tmp0; 2978 sum3 -= v3[0] * tmp0; 2979 sum4 -= v4[0] * tmp0; 2980 sum5 -= v5[0] * tmp0; 2981 } 2982 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2983 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2984 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2985 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2986 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2987 ibdiag += 25; 2988 break; 2989 default: 2990 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2991 } 2992 } 2993 2994 xb = x; 2995 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2996 } else xb = b; 2997 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2998 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2999 cnt = 0; 3000 for (i=0, row=0; i<m; i++) { 3001 3002 switch (sizes[i]){ 3003 case 1: 3004 x[row++] *= bdiag[cnt++]; 3005 break; 3006 case 2: 3007 x1 = x[row]; x2 = x[row+1]; 3008 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3009 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3010 x[row++] = tmp1; 3011 x[row++] = tmp2; 3012 cnt += 4; 3013 break; 3014 case 3: 3015 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3016 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3017 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3018 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3019 x[row++] = tmp1; 3020 x[row++] = tmp2; 3021 x[row++] = tmp3; 3022 cnt += 9; 3023 break; 3024 case 4: 3025 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3026 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3027 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3028 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3029 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3030 x[row++] = tmp1; 3031 x[row++] = tmp2; 3032 x[row++] = tmp3; 3033 x[row++] = tmp4; 3034 cnt += 16; 3035 break; 3036 case 5: 3037 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3038 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3039 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3040 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3041 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3042 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3043 x[row++] = tmp1; 3044 x[row++] = tmp2; 3045 x[row++] = tmp3; 3046 x[row++] = tmp4; 3047 x[row++] = tmp5; 3048 cnt += 25; 3049 break; 3050 default: 3051 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3052 } 3053 } 3054 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3055 } 3056 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 3057 3058 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3059 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3060 ibdiag -= sizes[i]*sizes[i]; 3061 sz = ii[row+1] - diag[row] - 1; 3062 v1 = a->a + diag[row] + 1; 3063 idx = a->j + diag[row] + 1; 3064 3065 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3066 switch (sizes[i]){ 3067 case 1: 3068 3069 sum1 = xb[row]; 3070 for(n = 0; n<sz-1; n+=2) { 3071 i1 = idx[0]; 3072 i2 = idx[1]; 3073 idx += 2; 3074 tmp0 = x[i1]; 3075 tmp1 = x[i2]; 3076 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3077 } 3078 3079 if (n == sz-1){ 3080 tmp0 = x[*idx]; 3081 sum1 -= *v1*tmp0; 3082 } 3083 x[row--] = sum1*(*ibdiag); 3084 break; 3085 3086 case 2: 3087 3088 sum1 = xb[row]; 3089 sum2 = xb[row-1]; 3090 /* note that sum1 is associated with the second of the two rows */ 3091 v2 = a->a + diag[row-1] + 2; 3092 for(n = 0; n<sz-1; n+=2) { 3093 i1 = idx[0]; 3094 i2 = idx[1]; 3095 idx += 2; 3096 tmp0 = x[i1]; 3097 tmp1 = x[i2]; 3098 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3099 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3100 } 3101 3102 if (n == sz-1){ 3103 tmp0 = x[*idx]; 3104 sum1 -= *v1*tmp0; 3105 sum2 -= *v2*tmp0; 3106 } 3107 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3108 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3109 break; 3110 case 3: 3111 3112 sum1 = xb[row]; 3113 sum2 = xb[row-1]; 3114 sum3 = xb[row-2]; 3115 v2 = a->a + diag[row-1] + 2; 3116 v3 = a->a + diag[row-2] + 3; 3117 for(n = 0; n<sz-1; n+=2) { 3118 i1 = idx[0]; 3119 i2 = idx[1]; 3120 idx += 2; 3121 tmp0 = x[i1]; 3122 tmp1 = x[i2]; 3123 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3124 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3125 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3126 } 3127 3128 if (n == sz-1){ 3129 tmp0 = x[*idx]; 3130 sum1 -= *v1*tmp0; 3131 sum2 -= *v2*tmp0; 3132 sum3 -= *v3*tmp0; 3133 } 3134 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3135 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3136 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3137 break; 3138 case 4: 3139 3140 sum1 = xb[row]; 3141 sum2 = xb[row-1]; 3142 sum3 = xb[row-2]; 3143 sum4 = xb[row-3]; 3144 v2 = a->a + diag[row-1] + 2; 3145 v3 = a->a + diag[row-2] + 3; 3146 v4 = a->a + diag[row-3] + 4; 3147 for(n = 0; n<sz-1; n+=2) { 3148 i1 = idx[0]; 3149 i2 = idx[1]; 3150 idx += 2; 3151 tmp0 = x[i1]; 3152 tmp1 = x[i2]; 3153 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3154 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3155 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3156 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3157 } 3158 3159 if (n == sz-1){ 3160 tmp0 = x[*idx]; 3161 sum1 -= *v1*tmp0; 3162 sum2 -= *v2*tmp0; 3163 sum3 -= *v3*tmp0; 3164 sum4 -= *v4*tmp0; 3165 } 3166 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3167 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3168 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3169 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3170 break; 3171 case 5: 3172 3173 sum1 = xb[row]; 3174 sum2 = xb[row-1]; 3175 sum3 = xb[row-2]; 3176 sum4 = xb[row-3]; 3177 sum5 = xb[row-4]; 3178 v2 = a->a + diag[row-1] + 2; 3179 v3 = a->a + diag[row-2] + 3; 3180 v4 = a->a + diag[row-3] + 4; 3181 v5 = a->a + diag[row-4] + 5; 3182 for(n = 0; n<sz-1; n+=2) { 3183 i1 = idx[0]; 3184 i2 = idx[1]; 3185 idx += 2; 3186 tmp0 = x[i1]; 3187 tmp1 = x[i2]; 3188 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3189 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3190 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3191 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3192 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3193 } 3194 3195 if (n == sz-1){ 3196 tmp0 = x[*idx]; 3197 sum1 -= *v1*tmp0; 3198 sum2 -= *v2*tmp0; 3199 sum3 -= *v3*tmp0; 3200 sum4 -= *v4*tmp0; 3201 sum5 -= *v5*tmp0; 3202 } 3203 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3204 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3205 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3206 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3207 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3208 break; 3209 default: 3210 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3211 } 3212 } 3213 3214 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3215 } 3216 its--; 3217 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3218 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3219 } 3220 if (flag & SOR_EISENSTAT) { 3221 const PetscScalar *b; 3222 MatScalar *t = a->inode.ssor_work; 3223 3224 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3225 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3226 /* 3227 Apply (U + D)^-1 where D is now the block diagonal 3228 */ 3229 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3230 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3231 ibdiag -= sizes[i]*sizes[i]; 3232 sz = ii[row+1] - diag[row] - 1; 3233 v1 = a->a + diag[row] + 1; 3234 idx = a->j + diag[row] + 1; 3235 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3236 switch (sizes[i]){ 3237 case 1: 3238 3239 sum1 = b[row]; 3240 for(n = 0; n<sz-1; n+=2) { 3241 i1 = idx[0]; 3242 i2 = idx[1]; 3243 idx += 2; 3244 tmp0 = x[i1]; 3245 tmp1 = x[i2]; 3246 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3247 } 3248 3249 if (n == sz-1){ 3250 tmp0 = x[*idx]; 3251 sum1 -= *v1*tmp0; 3252 } 3253 x[row] = sum1*(*ibdiag);row--; 3254 break; 3255 3256 case 2: 3257 3258 sum1 = b[row]; 3259 sum2 = b[row-1]; 3260 /* note that sum1 is associated with the second of the two rows */ 3261 v2 = a->a + diag[row-1] + 2; 3262 for(n = 0; n<sz-1; n+=2) { 3263 i1 = idx[0]; 3264 i2 = idx[1]; 3265 idx += 2; 3266 tmp0 = x[i1]; 3267 tmp1 = x[i2]; 3268 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3269 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3270 } 3271 3272 if (n == sz-1){ 3273 tmp0 = x[*idx]; 3274 sum1 -= *v1*tmp0; 3275 sum2 -= *v2*tmp0; 3276 } 3277 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3278 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3279 row -= 2; 3280 break; 3281 case 3: 3282 3283 sum1 = b[row]; 3284 sum2 = b[row-1]; 3285 sum3 = b[row-2]; 3286 v2 = a->a + diag[row-1] + 2; 3287 v3 = a->a + diag[row-2] + 3; 3288 for(n = 0; n<sz-1; n+=2) { 3289 i1 = idx[0]; 3290 i2 = idx[1]; 3291 idx += 2; 3292 tmp0 = x[i1]; 3293 tmp1 = x[i2]; 3294 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3295 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3296 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3297 } 3298 3299 if (n == sz-1){ 3300 tmp0 = x[*idx]; 3301 sum1 -= *v1*tmp0; 3302 sum2 -= *v2*tmp0; 3303 sum3 -= *v3*tmp0; 3304 } 3305 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3306 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3307 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3308 row -= 3; 3309 break; 3310 case 4: 3311 3312 sum1 = b[row]; 3313 sum2 = b[row-1]; 3314 sum3 = b[row-2]; 3315 sum4 = b[row-3]; 3316 v2 = a->a + diag[row-1] + 2; 3317 v3 = a->a + diag[row-2] + 3; 3318 v4 = a->a + diag[row-3] + 4; 3319 for(n = 0; n<sz-1; n+=2) { 3320 i1 = idx[0]; 3321 i2 = idx[1]; 3322 idx += 2; 3323 tmp0 = x[i1]; 3324 tmp1 = x[i2]; 3325 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3326 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3327 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3328 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3329 } 3330 3331 if (n == sz-1){ 3332 tmp0 = x[*idx]; 3333 sum1 -= *v1*tmp0; 3334 sum2 -= *v2*tmp0; 3335 sum3 -= *v3*tmp0; 3336 sum4 -= *v4*tmp0; 3337 } 3338 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3339 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3340 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3341 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3342 row -= 4; 3343 break; 3344 case 5: 3345 3346 sum1 = b[row]; 3347 sum2 = b[row-1]; 3348 sum3 = b[row-2]; 3349 sum4 = b[row-3]; 3350 sum5 = b[row-4]; 3351 v2 = a->a + diag[row-1] + 2; 3352 v3 = a->a + diag[row-2] + 3; 3353 v4 = a->a + diag[row-3] + 4; 3354 v5 = a->a + diag[row-4] + 5; 3355 for(n = 0; n<sz-1; n+=2) { 3356 i1 = idx[0]; 3357 i2 = idx[1]; 3358 idx += 2; 3359 tmp0 = x[i1]; 3360 tmp1 = x[i2]; 3361 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3362 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3363 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3364 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3365 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3366 } 3367 3368 if (n == sz-1){ 3369 tmp0 = x[*idx]; 3370 sum1 -= *v1*tmp0; 3371 sum2 -= *v2*tmp0; 3372 sum3 -= *v3*tmp0; 3373 sum4 -= *v4*tmp0; 3374 sum5 -= *v5*tmp0; 3375 } 3376 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3377 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3378 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3379 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3380 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3381 row -= 5; 3382 break; 3383 default: 3384 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3385 } 3386 } 3387 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3388 3389 /* 3390 t = b - D x where D is the block diagonal 3391 */ 3392 cnt = 0; 3393 for (i=0, row=0; i<m; i++) { 3394 switch (sizes[i]){ 3395 case 1: 3396 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3397 break; 3398 case 2: 3399 x1 = x[row]; x2 = x[row+1]; 3400 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3401 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3402 t[row] = b[row] - tmp1; 3403 t[row+1] = b[row+1] - tmp2; row += 2; 3404 cnt += 4; 3405 break; 3406 case 3: 3407 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3408 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3409 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3410 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3411 t[row] = b[row] - tmp1; 3412 t[row+1] = b[row+1] - tmp2; 3413 t[row+2] = b[row+2] - tmp3; row += 3; 3414 cnt += 9; 3415 break; 3416 case 4: 3417 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3418 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3419 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3420 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3421 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3422 t[row] = b[row] - tmp1; 3423 t[row+1] = b[row+1] - tmp2; 3424 t[row+2] = b[row+2] - tmp3; 3425 t[row+3] = b[row+3] - tmp4; row += 4; 3426 cnt += 16; 3427 break; 3428 case 5: 3429 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3430 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3431 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3432 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3433 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3434 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3435 t[row] = b[row] - tmp1; 3436 t[row+1] = b[row+1] - tmp2; 3437 t[row+2] = b[row+2] - tmp3; 3438 t[row+3] = b[row+3] - tmp4; 3439 t[row+4] = b[row+4] - tmp5;row += 5; 3440 cnt += 25; 3441 break; 3442 default: 3443 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3444 } 3445 } 3446 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3447 3448 3449 3450 /* 3451 Apply (L + D)^-1 where D is the block diagonal 3452 */ 3453 for (i=0, row=0; i<m; i++) { 3454 sz = diag[row] - ii[row]; 3455 v1 = a->a + ii[row]; 3456 idx = a->j + ii[row]; 3457 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3458 switch (sizes[i]){ 3459 case 1: 3460 3461 sum1 = t[row]; 3462 for(n = 0; n<sz-1; n+=2) { 3463 i1 = idx[0]; 3464 i2 = idx[1]; 3465 idx += 2; 3466 tmp0 = t[i1]; 3467 tmp1 = t[i2]; 3468 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3469 } 3470 3471 if (n == sz-1){ 3472 tmp0 = t[*idx]; 3473 sum1 -= *v1 * tmp0; 3474 } 3475 x[row] += t[row] = sum1*(*ibdiag++); row++; 3476 break; 3477 case 2: 3478 v2 = a->a + ii[row+1]; 3479 sum1 = t[row]; 3480 sum2 = t[row+1]; 3481 for(n = 0; n<sz-1; n+=2) { 3482 i1 = idx[0]; 3483 i2 = idx[1]; 3484 idx += 2; 3485 tmp0 = t[i1]; 3486 tmp1 = t[i2]; 3487 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3488 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3489 } 3490 3491 if (n == sz-1){ 3492 tmp0 = t[*idx]; 3493 sum1 -= v1[0] * tmp0; 3494 sum2 -= v2[0] * tmp0; 3495 } 3496 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3497 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3498 ibdiag += 4; row += 2; 3499 break; 3500 case 3: 3501 v2 = a->a + ii[row+1]; 3502 v3 = a->a + ii[row+2]; 3503 sum1 = t[row]; 3504 sum2 = t[row+1]; 3505 sum3 = t[row+2]; 3506 for(n = 0; n<sz-1; n+=2) { 3507 i1 = idx[0]; 3508 i2 = idx[1]; 3509 idx += 2; 3510 tmp0 = t[i1]; 3511 tmp1 = t[i2]; 3512 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3513 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3514 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3515 } 3516 3517 if (n == sz-1){ 3518 tmp0 = t[*idx]; 3519 sum1 -= v1[0] * tmp0; 3520 sum2 -= v2[0] * tmp0; 3521 sum3 -= v3[0] * tmp0; 3522 } 3523 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3524 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3525 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3526 ibdiag += 9; row += 3; 3527 break; 3528 case 4: 3529 v2 = a->a + ii[row+1]; 3530 v3 = a->a + ii[row+2]; 3531 v4 = a->a + ii[row+3]; 3532 sum1 = t[row]; 3533 sum2 = t[row+1]; 3534 sum3 = t[row+2]; 3535 sum4 = t[row+3]; 3536 for(n = 0; n<sz-1; n+=2) { 3537 i1 = idx[0]; 3538 i2 = idx[1]; 3539 idx += 2; 3540 tmp0 = t[i1]; 3541 tmp1 = t[i2]; 3542 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3543 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3544 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3545 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3546 } 3547 3548 if (n == sz-1){ 3549 tmp0 = t[*idx]; 3550 sum1 -= v1[0] * tmp0; 3551 sum2 -= v2[0] * tmp0; 3552 sum3 -= v3[0] * tmp0; 3553 sum4 -= v4[0] * tmp0; 3554 } 3555 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3556 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3557 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3558 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3559 ibdiag += 16; row += 4; 3560 break; 3561 case 5: 3562 v2 = a->a + ii[row+1]; 3563 v3 = a->a + ii[row+2]; 3564 v4 = a->a + ii[row+3]; 3565 v5 = a->a + ii[row+4]; 3566 sum1 = t[row]; 3567 sum2 = t[row+1]; 3568 sum3 = t[row+2]; 3569 sum4 = t[row+3]; 3570 sum5 = t[row+4]; 3571 for(n = 0; n<sz-1; n+=2) { 3572 i1 = idx[0]; 3573 i2 = idx[1]; 3574 idx += 2; 3575 tmp0 = t[i1]; 3576 tmp1 = t[i2]; 3577 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3578 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3579 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3580 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3581 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3582 } 3583 3584 if (n == sz-1){ 3585 tmp0 = t[*idx]; 3586 sum1 -= v1[0] * tmp0; 3587 sum2 -= v2[0] * tmp0; 3588 sum3 -= v3[0] * tmp0; 3589 sum4 -= v4[0] * tmp0; 3590 sum5 -= v5[0] * tmp0; 3591 } 3592 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3593 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3594 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3595 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3596 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3597 ibdiag += 25; row += 5; 3598 break; 3599 default: 3600 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3601 } 3602 } 3603 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3604 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3605 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3606 } 3607 PetscFunctionReturn(0); 3608 } 3609 3610 #undef __FUNCT__ 3611 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3612 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3613 { 3614 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3615 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3616 const MatScalar *bdiag = a->inode.bdiag; 3617 const PetscScalar *b; 3618 PetscErrorCode ierr; 3619 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3620 const PetscInt *sizes = a->inode.size; 3621 3622 PetscFunctionBegin; 3623 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3624 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3625 cnt = 0; 3626 for (i=0, row=0; i<m; i++) { 3627 switch (sizes[i]){ 3628 case 1: 3629 x[row] = b[row]*bdiag[cnt++];row++; 3630 break; 3631 case 2: 3632 x1 = b[row]; x2 = b[row+1]; 3633 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3634 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3635 x[row++] = tmp1; 3636 x[row++] = tmp2; 3637 cnt += 4; 3638 break; 3639 case 3: 3640 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3641 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3642 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3643 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3644 x[row++] = tmp1; 3645 x[row++] = tmp2; 3646 x[row++] = tmp3; 3647 cnt += 9; 3648 break; 3649 case 4: 3650 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3651 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3652 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3653 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3654 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3655 x[row++] = tmp1; 3656 x[row++] = tmp2; 3657 x[row++] = tmp3; 3658 x[row++] = tmp4; 3659 cnt += 16; 3660 break; 3661 case 5: 3662 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3663 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3664 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3665 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3666 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3667 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3668 x[row++] = tmp1; 3669 x[row++] = tmp2; 3670 x[row++] = tmp3; 3671 x[row++] = tmp4; 3672 x[row++] = tmp5; 3673 cnt += 25; 3674 break; 3675 default: 3676 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3677 } 3678 } 3679 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3680 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3681 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3682 PetscFunctionReturn(0); 3683 } 3684 3685 /* 3686 samestructure indicates that the matrix has not changed its nonzero structure so we 3687 do not need to recompute the inodes 3688 */ 3689 #undef __FUNCT__ 3690 #define __FUNCT__ "Mat_CheckInode" 3691 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 3692 { 3693 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3694 PetscErrorCode ierr; 3695 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3696 PetscBool flag; 3697 3698 PetscFunctionBegin; 3699 if (!a->inode.use) PetscFunctionReturn(0); 3700 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3701 3702 3703 m = A->rmap->n; 3704 if (a->inode.size) {ns = a->inode.size;} 3705 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3706 3707 i = 0; 3708 node_count = 0; 3709 idx = a->j; 3710 ii = a->i; 3711 while (i < m){ /* For each row */ 3712 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3713 /* Limits the number of elements in a node to 'a->inode.limit' */ 3714 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3715 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3716 if (nzy != nzx) break; 3717 idy += nzx; /* Same nonzero pattern */ 3718 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3719 if (!flag) break; 3720 } 3721 ns[node_count++] = blk_size; 3722 idx += blk_size*nzx; 3723 i = j; 3724 } 3725 /* If not enough inodes found,, do not use inode version of the routines */ 3726 if (!m || node_count > .8*m) { 3727 ierr = PetscFree(ns);CHKERRQ(ierr); 3728 a->inode.node_count = 0; 3729 a->inode.size = PETSC_NULL; 3730 a->inode.use = PETSC_FALSE; 3731 A->ops->mult = MatMult_SeqAIJ; 3732 A->ops->sor = MatSOR_SeqAIJ; 3733 A->ops->multadd = MatMultAdd_SeqAIJ; 3734 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 3735 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 3736 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 3737 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 3738 A->ops->coloringpatch = 0; 3739 A->ops->multdiagonalblock = 0; 3740 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3741 } else { 3742 if (!A->factortype) { 3743 A->ops->mult = MatMult_SeqAIJ_Inode; 3744 A->ops->sor = MatSOR_SeqAIJ_Inode; 3745 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3746 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3747 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3748 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3749 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3750 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3751 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3752 } else { 3753 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 3754 } 3755 a->inode.node_count = node_count; 3756 a->inode.size = ns; 3757 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3758 } 3759 a->inode.checked = PETSC_TRUE; 3760 PetscFunctionReturn(0); 3761 } 3762 3763 #undef __FUNCT__ 3764 #define __FUNCT__ "MatGetRow_FactoredLU" 3765 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row) 3766 { 3767 PetscInt k, *vi; 3768 PetscFunctionBegin; 3769 vi = aj + ai[row]; 3770 for(k=0;k<nzl;k++) cols[k] = vi[k]; 3771 vi = aj + adiag[row]; 3772 cols[nzl] = vi[0]; 3773 vi = aj + adiag[row+1]+1; 3774 for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k]; 3775 PetscFunctionReturn(0); 3776 } 3777 /* 3778 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3779 Modified from Mat_CheckInode(). 3780 3781 Input Parameters: 3782 + Mat A - ILU or LU matrix factor 3783 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 3784 do not need to recompute the inodes 3785 */ 3786 #undef __FUNCT__ 3787 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3788 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure) 3789 { 3790 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3791 PetscErrorCode ierr; 3792 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3793 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3794 PetscBool flag; 3795 3796 PetscFunctionBegin; 3797 if (!a->inode.use) PetscFunctionReturn(0); 3798 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3799 3800 m = A->rmap->n; 3801 if (a->inode.size) {ns = a->inode.size;} 3802 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3803 3804 i = 0; 3805 node_count = 0; 3806 while (i < m){ /* For each row */ 3807 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3808 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3809 nzx = nzl1 + nzu1 + 1; 3810 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3811 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3812 3813 /* Limits the number of elements in a node to 'a->inode.limit' */ 3814 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3815 nzl2 = ai[j+1] - ai[j]; 3816 nzu2 = adiag[j] - adiag[j+1] - 1; 3817 nzy = nzl2 + nzu2 + 1; 3818 if( nzy != nzx) break; 3819 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3820 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 3821 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3822 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3823 ierr = PetscFree(cols2);CHKERRQ(ierr); 3824 } 3825 ns[node_count++] = blk_size; 3826 ierr = PetscFree(cols1);CHKERRQ(ierr); 3827 i = j; 3828 } 3829 /* If not enough inodes found,, do not use inode version of the routines */ 3830 if (!m || node_count > .8*m) { 3831 ierr = PetscFree(ns);CHKERRQ(ierr); 3832 a->inode.node_count = 0; 3833 a->inode.size = PETSC_NULL; 3834 a->inode.use = PETSC_FALSE; 3835 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3836 } else { 3837 A->ops->mult = 0; 3838 A->ops->sor = 0; 3839 A->ops->multadd = 0; 3840 A->ops->getrowij = 0; 3841 A->ops->restorerowij = 0; 3842 A->ops->getcolumnij = 0; 3843 A->ops->restorecolumnij = 0; 3844 A->ops->coloringpatch = 0; 3845 A->ops->multdiagonalblock = 0; 3846 A->ops->solve = MatSolve_SeqAIJ_Inode; 3847 a->inode.node_count = node_count; 3848 a->inode.size = ns; 3849 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3850 } 3851 a->inode.checked = PETSC_TRUE; 3852 PetscFunctionReturn(0); 3853 } 3854 3855 /* 3856 This is really ugly. if inodes are used this replaces the 3857 permutations with ones that correspond to rows/cols of the matrix 3858 rather then inode blocks 3859 */ 3860 #undef __FUNCT__ 3861 #define __FUNCT__ "MatInodeAdjustForInodes" 3862 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3863 { 3864 PetscErrorCode ierr; 3865 3866 PetscFunctionBegin; 3867 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 3868 PetscFunctionReturn(0); 3869 } 3870 3871 EXTERN_C_BEGIN 3872 #undef __FUNCT__ 3873 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 3874 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3875 { 3876 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3877 PetscErrorCode ierr; 3878 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3879 const PetscInt *ridx,*cidx; 3880 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3881 PetscInt nslim_col,*ns_col; 3882 IS ris = *rperm,cis = *cperm; 3883 3884 PetscFunctionBegin; 3885 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3886 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3887 3888 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3889 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3890 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3891 3892 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3893 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3894 3895 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3896 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3897 3898 /* Construct the permutations for rows*/ 3899 for (i=0,row = 0; i<nslim_row; ++i){ 3900 indx = ridx[i]; 3901 start_val = tns[indx]; 3902 end_val = tns[indx + 1]; 3903 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3904 } 3905 3906 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3907 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3908 3909 /* Construct permutations for columns */ 3910 for (i=0,col=0; i<nslim_col; ++i){ 3911 indx = cidx[i]; 3912 start_val = tns[indx]; 3913 end_val = tns[indx + 1]; 3914 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3915 } 3916 3917 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 3918 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3919 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 3920 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3921 3922 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3923 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3924 3925 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3926 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3927 ierr = ISDestroy(&cis);CHKERRQ(ierr); 3928 ierr = ISDestroy(&ris);CHKERRQ(ierr); 3929 ierr = PetscFree(tns);CHKERRQ(ierr); 3930 PetscFunctionReturn(0); 3931 } 3932 EXTERN_C_END 3933 3934 #undef __FUNCT__ 3935 #define __FUNCT__ "MatInodeGetInodeSizes" 3936 /*@C 3937 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3938 3939 Not Collective 3940 3941 Input Parameter: 3942 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3943 3944 Output Parameter: 3945 + node_count - no of inodes present in the matrix. 3946 . sizes - an array of size node_count,with sizes of each inode. 3947 - limit - the max size used to generate the inodes. 3948 3949 Level: advanced 3950 3951 Notes: This routine returns some internal storage information 3952 of the matrix, it is intended to be used by advanced users. 3953 It should be called after the matrix is assembled. 3954 The contents of the sizes[] array should not be changed. 3955 PETSC_NULL may be passed for information not requested. 3956 3957 .keywords: matrix, seqaij, get, inode 3958 3959 .seealso: MatGetInfo() 3960 @*/ 3961 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3962 { 3963 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3964 3965 PetscFunctionBegin; 3966 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3967 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3968 if (f) { 3969 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3970 } 3971 PetscFunctionReturn(0); 3972 } 3973 3974 EXTERN_C_BEGIN 3975 #undef __FUNCT__ 3976 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3977 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3978 { 3979 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3980 3981 PetscFunctionBegin; 3982 if (node_count) *node_count = a->inode.node_count; 3983 if (sizes) *sizes = a->inode.size; 3984 if (limit) *limit = a->inode.limit; 3985 PetscFunctionReturn(0); 3986 } 3987 EXTERN_C_END 3988