1 #define PETSCMAT_DLL 2 3 /* 4 This file provides high performance routines for the Inode format (compressed sparse row) 5 by taking advantage of rows with identical nonzero structure (I-nodes). 6 */ 7 #include "../src/mat/impls/aij/seq/aij.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "Mat_CreateColInode" 11 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 12 { 13 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 14 PetscErrorCode ierr; 15 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 16 17 PetscFunctionBegin; 18 n = A->cmap->n; 19 m = A->rmap->n; 20 ns_row = a->inode.size; 21 22 min_mn = (m < n) ? m : n; 23 if (!ns) { 24 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 25 for(; count+1 < n; count++,i++); 26 if (count < n) { 27 i++; 28 } 29 *size = i; 30 PetscFunctionReturn(0); 31 } 32 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 33 34 /* Use the same row structure wherever feasible. */ 35 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 36 ns_col[i] = ns_row[i]; 37 } 38 39 /* if m < n; pad up the remainder with inode_limit */ 40 for(; count+1 < n; count++,i++) { 41 ns_col[i] = 1; 42 } 43 /* The last node is the odd ball. padd it up with the remaining rows; */ 44 if (count < n) { 45 ns_col[i] = n - count; 46 i++; 47 } else if (count > n) { 48 /* Adjust for the over estimation */ 49 ns_col[i-1] += n - count; 50 } 51 *size = i; 52 *ns = ns_col; 53 PetscFunctionReturn(0); 54 } 55 56 57 /* 58 This builds symmetric version of nonzero structure, 59 */ 60 #undef __FUNCT__ 61 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 62 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 63 { 64 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 65 PetscErrorCode ierr; 66 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 67 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 68 69 PetscFunctionBegin; 70 nslim_row = a->inode.node_count; 71 m = A->rmap->n; 72 n = A->cmap->n; 73 if (m != n) SETERRQ(PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 74 75 /* Use the row_inode as column_inode */ 76 nslim_col = nslim_row; 77 ns_col = ns_row; 78 79 /* allocate space for reformated inode structure */ 80 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 81 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 82 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 83 84 for (i1=0,col=0; i1<nslim_col; ++i1){ 85 nsz = ns_col[i1]; 86 for (i2=0; i2<nsz; ++i2,++col) 87 tvc[col] = i1; 88 } 89 /* allocate space for row pointers */ 90 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 91 *iia = ia; 92 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 93 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 94 95 /* determine the number of columns in each row */ 96 ia[0] = oshift; 97 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 98 99 j = aj + ai[row] + ishift; 100 jmax = aj + ai[row+1] + ishift; 101 i2 = 0; 102 col = *j++ + ishift; 103 i2 = tvc[col]; 104 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 105 ia[i1+1]++; 106 ia[i2+1]++; 107 i2++; /* Start col of next node */ 108 while(((col=*j+ishift)<tns[i2]) && (j<jmax)) ++j; 109 i2 = tvc[col]; 110 } 111 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 112 } 113 114 /* shift ia[i] to point to next row */ 115 for (i1=1; i1<nslim_row+1; i1++) { 116 row = ia[i1-1]; 117 ia[i1] += row; 118 work[i1-1] = row - oshift; 119 } 120 121 /* allocate space for column pointers */ 122 nz = ia[nslim_row] + (!ishift); 123 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 124 *jja = ja; 125 126 /* loop over lower triangular part putting into ja */ 127 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 128 j = aj + ai[row] + ishift; 129 jmax = aj + ai[row+1] + ishift; 130 i2 = 0; /* Col inode index */ 131 col = *j++ + ishift; 132 i2 = tvc[col]; 133 while (i2<i1 && j<jmax) { 134 ja[work[i2]++] = i1 + oshift; 135 ja[work[i1]++] = i2 + oshift; 136 ++i2; 137 while(((col=*j+ishift)< tns[i2])&&(j<jmax)) ++j; /* Skip rest col indices in this node */ 138 i2 = tvc[col]; 139 } 140 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 141 142 } 143 ierr = PetscFree(work);CHKERRQ(ierr); 144 ierr = PetscFree(tns);CHKERRQ(ierr); 145 ierr = PetscFree(tvc);CHKERRQ(ierr); 146 PetscFunctionReturn(0); 147 } 148 149 /* 150 This builds nonsymmetric version of nonzero structure, 151 */ 152 #undef __FUNCT__ 153 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 154 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 155 { 156 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 157 PetscErrorCode ierr; 158 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 159 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 160 161 PetscFunctionBegin; 162 nslim_row = a->inode.node_count; 163 n = A->cmap->n; 164 165 /* Create The column_inode for this matrix */ 166 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 167 168 /* allocate space for reformated column_inode structure */ 169 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 170 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 171 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 172 173 for (i1=0,col=0; i1<nslim_col; ++i1){ 174 nsz = ns_col[i1]; 175 for (i2=0; i2<nsz; ++i2,++col) 176 tvc[col] = i1; 177 } 178 /* allocate space for row pointers */ 179 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 180 *iia = ia; 181 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 182 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 183 184 /* determine the number of columns in each row */ 185 ia[0] = oshift; 186 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 187 j = aj + ai[row] + ishift; 188 col = *j++ + ishift; 189 i2 = tvc[col]; 190 nz = ai[row+1] - ai[row]; 191 while (nz-- > 0) { /* off-diagonal elemets */ 192 ia[i1+1]++; 193 i2++; /* Start col of next node */ 194 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 195 if (nz > 0) i2 = tvc[col]; 196 } 197 } 198 199 /* shift ia[i] to point to next row */ 200 for (i1=1; i1<nslim_row+1; i1++) { 201 row = ia[i1-1]; 202 ia[i1] += row; 203 work[i1-1] = row - oshift; 204 } 205 206 /* allocate space for column pointers */ 207 nz = ia[nslim_row] + (!ishift); 208 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 209 *jja = ja; 210 211 /* loop over matrix putting into ja */ 212 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 213 j = aj + ai[row] + ishift; 214 i2 = 0; /* Col inode index */ 215 col = *j++ + ishift; 216 i2 = tvc[col]; 217 nz = ai[row+1] - ai[row]; 218 while (nz-- > 0) { 219 ja[work[i1]++] = i2 + oshift; 220 ++i2; 221 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 222 if (nz > 0) i2 = tvc[col]; 223 } 224 } 225 ierr = PetscFree(ns_col);CHKERRQ(ierr); 226 ierr = PetscFree(work);CHKERRQ(ierr); 227 ierr = PetscFree(tns);CHKERRQ(ierr); 228 ierr = PetscFree(tvc);CHKERRQ(ierr); 229 PetscFunctionReturn(0); 230 } 231 232 #undef __FUNCT__ 233 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 234 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 235 { 236 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 237 PetscErrorCode ierr; 238 239 PetscFunctionBegin; 240 *n = a->inode.node_count; 241 if (!ia) PetscFunctionReturn(0); 242 if (!blockcompressed) { 243 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 244 } else if (symmetric) { 245 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 246 } else { 247 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 248 } 249 PetscFunctionReturn(0); 250 } 251 252 #undef __FUNCT__ 253 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 254 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 255 { 256 PetscErrorCode ierr; 257 258 PetscFunctionBegin; 259 if (!ia) PetscFunctionReturn(0); 260 261 if (!blockcompressed) { 262 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 263 } else { 264 ierr = PetscFree(*ia);CHKERRQ(ierr); 265 ierr = PetscFree(*ja);CHKERRQ(ierr); 266 } 267 268 PetscFunctionReturn(0); 269 } 270 271 /* ----------------------------------------------------------- */ 272 273 #undef __FUNCT__ 274 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 275 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 276 { 277 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 278 PetscErrorCode ierr; 279 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 280 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 281 282 PetscFunctionBegin; 283 nslim_row = a->inode.node_count; 284 n = A->cmap->n; 285 286 /* Create The column_inode for this matrix */ 287 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 288 289 /* allocate space for reformated column_inode structure */ 290 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 291 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 292 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 293 294 for (i1=0,col=0; i1<nslim_col; ++i1){ 295 nsz = ns_col[i1]; 296 for (i2=0; i2<nsz; ++i2,++col) 297 tvc[col] = i1; 298 } 299 /* allocate space for column pointers */ 300 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 301 *iia = ia; 302 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 303 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 304 305 /* determine the number of columns in each row */ 306 ia[0] = oshift; 307 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 308 j = aj + ai[row] + ishift; 309 col = *j++ + ishift; 310 i2 = tvc[col]; 311 nz = ai[row+1] - ai[row]; 312 while (nz-- > 0) { /* off-diagonal elemets */ 313 /* ia[i1+1]++; */ 314 ia[i2+1]++; 315 i2++; 316 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 317 if (nz > 0) i2 = tvc[col]; 318 } 319 } 320 321 /* shift ia[i] to point to next col */ 322 for (i1=1; i1<nslim_col+1; i1++) { 323 col = ia[i1-1]; 324 ia[i1] += col; 325 work[i1-1] = col - oshift; 326 } 327 328 /* allocate space for column pointers */ 329 nz = ia[nslim_col] + (!ishift); 330 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 331 *jja = ja; 332 333 /* loop over matrix putting into ja */ 334 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 335 j = aj + ai[row] + ishift; 336 i2 = 0; /* Col inode index */ 337 col = *j++ + ishift; 338 i2 = tvc[col]; 339 nz = ai[row+1] - ai[row]; 340 while (nz-- > 0) { 341 /* ja[work[i1]++] = i2 + oshift; */ 342 ja[work[i2]++] = i1 + oshift; 343 i2++; 344 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 345 if (nz > 0) i2 = tvc[col]; 346 } 347 } 348 ierr = PetscFree(ns_col);CHKERRQ(ierr); 349 ierr = PetscFree(work);CHKERRQ(ierr); 350 ierr = PetscFree(tns);CHKERRQ(ierr); 351 ierr = PetscFree(tvc);CHKERRQ(ierr); 352 PetscFunctionReturn(0); 353 } 354 355 #undef __FUNCT__ 356 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 357 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 358 { 359 PetscErrorCode ierr; 360 361 PetscFunctionBegin; 362 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 363 if (!ia) PetscFunctionReturn(0); 364 365 if (!blockcompressed) { 366 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 367 } else if (symmetric) { 368 /* Since the indices are symmetric it does'nt matter */ 369 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 370 } else { 371 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 372 } 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 378 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscTruth symmetric,PetscTruth blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscTruth *done) 379 { 380 PetscErrorCode ierr; 381 382 PetscFunctionBegin; 383 if (!ia) PetscFunctionReturn(0); 384 if (!blockcompressed) { 385 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 386 } else { 387 ierr = PetscFree(*ia);CHKERRQ(ierr); 388 ierr = PetscFree(*ja);CHKERRQ(ierr); 389 } 390 PetscFunctionReturn(0); 391 } 392 393 /* ----------------------------------------------------------- */ 394 395 #undef __FUNCT__ 396 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 397 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 398 { 399 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 400 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 401 PetscScalar *y; 402 const PetscScalar *x; 403 const MatScalar *v1,*v2,*v3,*v4,*v5; 404 PetscErrorCode ierr; 405 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 406 407 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 408 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 409 #endif 410 411 PetscFunctionBegin; 412 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 413 node_max = a->inode.node_count; 414 ns = a->inode.size; /* Node Size array */ 415 ierr = VecGetArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 416 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 417 idx = a->j; 418 v1 = a->a; 419 ii = a->i; 420 421 for (i = 0,row = 0; i< node_max; ++i){ 422 nsz = ns[i]; 423 n = ii[1] - ii[0]; 424 nonzerorow += (n>0)*nsz; 425 ii += nsz; 426 PetscPrefetchBlock(idx+nsz*n,n,0,0); /* Prefetch the indices for the block row after the current one */ 427 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,0); /* Prefetch the values for the block row after the current one */ 428 sz = n; /* No of non zeros in this row */ 429 /* Switch on the size of Node */ 430 switch (nsz){ /* Each loop in 'case' is unrolled */ 431 case 1 : 432 sum1 = 0.; 433 434 for(n = 0; n< sz-1; n+=2) { 435 i1 = idx[0]; /* The instructions are ordered to */ 436 i2 = idx[1]; /* make the compiler's job easy */ 437 idx += 2; 438 tmp0 = x[i1]; 439 tmp1 = x[i2]; 440 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 441 } 442 443 if (n == sz-1){ /* Take care of the last nonzero */ 444 tmp0 = x[*idx++]; 445 sum1 += *v1++ * tmp0; 446 } 447 y[row++]=sum1; 448 break; 449 case 2: 450 sum1 = 0.; 451 sum2 = 0.; 452 v2 = v1 + n; 453 454 for (n = 0; n< sz-1; n+=2) { 455 i1 = idx[0]; 456 i2 = idx[1]; 457 idx += 2; 458 tmp0 = x[i1]; 459 tmp1 = x[i2]; 460 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 461 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 462 } 463 if (n == sz-1){ 464 tmp0 = x[*idx++]; 465 sum1 += *v1++ * tmp0; 466 sum2 += *v2++ * tmp0; 467 } 468 y[row++]=sum1; 469 y[row++]=sum2; 470 v1 =v2; /* Since the next block to be processed starts there*/ 471 idx +=sz; 472 break; 473 case 3: 474 sum1 = 0.; 475 sum2 = 0.; 476 sum3 = 0.; 477 v2 = v1 + n; 478 v3 = v2 + n; 479 480 for (n = 0; n< sz-1; n+=2) { 481 i1 = idx[0]; 482 i2 = idx[1]; 483 idx += 2; 484 tmp0 = x[i1]; 485 tmp1 = x[i2]; 486 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 487 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 488 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 489 } 490 if (n == sz-1){ 491 tmp0 = x[*idx++]; 492 sum1 += *v1++ * tmp0; 493 sum2 += *v2++ * tmp0; 494 sum3 += *v3++ * tmp0; 495 } 496 y[row++]=sum1; 497 y[row++]=sum2; 498 y[row++]=sum3; 499 v1 =v3; /* Since the next block to be processed starts there*/ 500 idx +=2*sz; 501 break; 502 case 4: 503 sum1 = 0.; 504 sum2 = 0.; 505 sum3 = 0.; 506 sum4 = 0.; 507 v2 = v1 + n; 508 v3 = v2 + n; 509 v4 = v3 + n; 510 511 for (n = 0; n< sz-1; n+=2) { 512 i1 = idx[0]; 513 i2 = idx[1]; 514 idx += 2; 515 tmp0 = x[i1]; 516 tmp1 = x[i2]; 517 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 518 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 519 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 520 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 521 } 522 if (n == sz-1){ 523 tmp0 = x[*idx++]; 524 sum1 += *v1++ * tmp0; 525 sum2 += *v2++ * tmp0; 526 sum3 += *v3++ * tmp0; 527 sum4 += *v4++ * tmp0; 528 } 529 y[row++]=sum1; 530 y[row++]=sum2; 531 y[row++]=sum3; 532 y[row++]=sum4; 533 v1 =v4; /* Since the next block to be processed starts there*/ 534 idx +=3*sz; 535 break; 536 case 5: 537 sum1 = 0.; 538 sum2 = 0.; 539 sum3 = 0.; 540 sum4 = 0.; 541 sum5 = 0.; 542 v2 = v1 + n; 543 v3 = v2 + n; 544 v4 = v3 + n; 545 v5 = v4 + n; 546 547 for (n = 0; n<sz-1; n+=2) { 548 i1 = idx[0]; 549 i2 = idx[1]; 550 idx += 2; 551 tmp0 = x[i1]; 552 tmp1 = x[i2]; 553 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 554 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 555 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 556 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 557 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 558 } 559 if (n == sz-1){ 560 tmp0 = x[*idx++]; 561 sum1 += *v1++ * tmp0; 562 sum2 += *v2++ * tmp0; 563 sum3 += *v3++ * tmp0; 564 sum4 += *v4++ * tmp0; 565 sum5 += *v5++ * tmp0; 566 } 567 y[row++]=sum1; 568 y[row++]=sum2; 569 y[row++]=sum3; 570 y[row++]=sum4; 571 y[row++]=sum5; 572 v1 =v5; /* Since the next block to be processed starts there */ 573 idx +=4*sz; 574 break; 575 default : 576 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 577 } 578 } 579 ierr = VecRestoreArray(xx,(PetscScalar**)&x);CHKERRQ(ierr); 580 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 581 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 582 PetscFunctionReturn(0); 583 } 584 /* ----------------------------------------------------------- */ 585 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 586 #undef __FUNCT__ 587 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 588 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 589 { 590 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 591 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 592 MatScalar *v1,*v2,*v3,*v4,*v5; 593 PetscScalar *x,*y,*z,*zt; 594 PetscErrorCode ierr; 595 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 596 597 PetscFunctionBegin; 598 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 599 node_max = a->inode.node_count; 600 ns = a->inode.size; /* Node Size array */ 601 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 602 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 603 if (zz != yy) { 604 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 605 } else { 606 z = y; 607 } 608 zt = z; 609 610 idx = a->j; 611 v1 = a->a; 612 ii = a->i; 613 614 for (i = 0,row = 0; i< node_max; ++i){ 615 nsz = ns[i]; 616 n = ii[1] - ii[0]; 617 ii += nsz; 618 sz = n; /* No of non zeros in this row */ 619 /* Switch on the size of Node */ 620 switch (nsz){ /* Each loop in 'case' is unrolled */ 621 case 1 : 622 sum1 = *zt++; 623 624 for(n = 0; n< sz-1; n+=2) { 625 i1 = idx[0]; /* The instructions are ordered to */ 626 i2 = idx[1]; /* make the compiler's job easy */ 627 idx += 2; 628 tmp0 = x[i1]; 629 tmp1 = x[i2]; 630 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 631 } 632 633 if(n == sz-1){ /* Take care of the last nonzero */ 634 tmp0 = x[*idx++]; 635 sum1 += *v1++ * tmp0; 636 } 637 y[row++]=sum1; 638 break; 639 case 2: 640 sum1 = *zt++; 641 sum2 = *zt++; 642 v2 = v1 + n; 643 644 for(n = 0; n< sz-1; n+=2) { 645 i1 = idx[0]; 646 i2 = idx[1]; 647 idx += 2; 648 tmp0 = x[i1]; 649 tmp1 = x[i2]; 650 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 651 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 652 } 653 if(n == sz-1){ 654 tmp0 = x[*idx++]; 655 sum1 += *v1++ * tmp0; 656 sum2 += *v2++ * tmp0; 657 } 658 y[row++]=sum1; 659 y[row++]=sum2; 660 v1 =v2; /* Since the next block to be processed starts there*/ 661 idx +=sz; 662 break; 663 case 3: 664 sum1 = *zt++; 665 sum2 = *zt++; 666 sum3 = *zt++; 667 v2 = v1 + n; 668 v3 = v2 + n; 669 670 for (n = 0; n< sz-1; n+=2) { 671 i1 = idx[0]; 672 i2 = idx[1]; 673 idx += 2; 674 tmp0 = x[i1]; 675 tmp1 = x[i2]; 676 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 677 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 678 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 679 } 680 if (n == sz-1){ 681 tmp0 = x[*idx++]; 682 sum1 += *v1++ * tmp0; 683 sum2 += *v2++ * tmp0; 684 sum3 += *v3++ * tmp0; 685 } 686 y[row++]=sum1; 687 y[row++]=sum2; 688 y[row++]=sum3; 689 v1 =v3; /* Since the next block to be processed starts there*/ 690 idx +=2*sz; 691 break; 692 case 4: 693 sum1 = *zt++; 694 sum2 = *zt++; 695 sum3 = *zt++; 696 sum4 = *zt++; 697 v2 = v1 + n; 698 v3 = v2 + n; 699 v4 = v3 + n; 700 701 for (n = 0; n< sz-1; n+=2) { 702 i1 = idx[0]; 703 i2 = idx[1]; 704 idx += 2; 705 tmp0 = x[i1]; 706 tmp1 = x[i2]; 707 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 708 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 709 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 710 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 711 } 712 if (n == sz-1){ 713 tmp0 = x[*idx++]; 714 sum1 += *v1++ * tmp0; 715 sum2 += *v2++ * tmp0; 716 sum3 += *v3++ * tmp0; 717 sum4 += *v4++ * tmp0; 718 } 719 y[row++]=sum1; 720 y[row++]=sum2; 721 y[row++]=sum3; 722 y[row++]=sum4; 723 v1 =v4; /* Since the next block to be processed starts there*/ 724 idx +=3*sz; 725 break; 726 case 5: 727 sum1 = *zt++; 728 sum2 = *zt++; 729 sum3 = *zt++; 730 sum4 = *zt++; 731 sum5 = *zt++; 732 v2 = v1 + n; 733 v3 = v2 + n; 734 v4 = v3 + n; 735 v5 = v4 + n; 736 737 for (n = 0; n<sz-1; n+=2) { 738 i1 = idx[0]; 739 i2 = idx[1]; 740 idx += 2; 741 tmp0 = x[i1]; 742 tmp1 = x[i2]; 743 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 744 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 745 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 746 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 747 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 748 } 749 if(n == sz-1){ 750 tmp0 = x[*idx++]; 751 sum1 += *v1++ * tmp0; 752 sum2 += *v2++ * tmp0; 753 sum3 += *v3++ * tmp0; 754 sum4 += *v4++ * tmp0; 755 sum5 += *v5++ * tmp0; 756 } 757 y[row++]=sum1; 758 y[row++]=sum2; 759 y[row++]=sum3; 760 y[row++]=sum4; 761 y[row++]=sum5; 762 v1 =v5; /* Since the next block to be processed starts there */ 763 idx +=4*sz; 764 break; 765 default : 766 SETERRQ(PETSC_ERR_COR,"Node size not yet supported"); 767 } 768 } 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 771 if (zz != yy) { 772 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 773 } 774 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 775 PetscFunctionReturn(0); 776 } 777 778 /* ----------------------------------------------------------- */ 779 #undef __FUNCT__ 780 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 781 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 782 { 783 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 784 IS iscol = a->col,isrow = a->row; 785 PetscErrorCode ierr; 786 const PetscInt *r,*c,*rout,*cout; 787 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 788 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 789 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 790 PetscScalar sum1,sum2,sum3,sum4,sum5; 791 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 792 const PetscScalar *b; 793 794 PetscFunctionBegin; 795 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 796 node_max = a->inode.node_count; 797 ns = a->inode.size; /* Node Size array */ 798 799 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 801 tmp = a->solve_work; 802 803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 805 806 /* forward solve the lower triangular */ 807 tmps = tmp ; 808 aa = a_a ; 809 aj = a_j ; 810 ad = a->diag; 811 812 for (i = 0,row = 0; i< node_max; ++i){ 813 nsz = ns[i]; 814 aii = ai[row]; 815 v1 = aa + aii; 816 vi = aj + aii; 817 nz = ad[row]- aii; 818 if (i < node_max-1) { 819 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 820 * but our indexing to determine it's size could. */ 821 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,0); /* indices */ 822 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 823 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,0); 824 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 825 } 826 827 switch (nsz){ /* Each loop in 'case' is unrolled */ 828 case 1 : 829 sum1 = b[*r++]; 830 for(j=0; j<nz-1; j+=2){ 831 i0 = vi[0]; 832 i1 = vi[1]; 833 vi +=2; 834 tmp0 = tmps[i0]; 835 tmp1 = tmps[i1]; 836 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 837 } 838 if(j == nz-1){ 839 tmp0 = tmps[*vi++]; 840 sum1 -= *v1++ *tmp0; 841 } 842 tmp[row ++]=sum1; 843 break; 844 case 2: 845 sum1 = b[*r++]; 846 sum2 = b[*r++]; 847 v2 = aa + ai[row+1]; 848 849 for(j=0; j<nz-1; j+=2){ 850 i0 = vi[0]; 851 i1 = vi[1]; 852 vi +=2; 853 tmp0 = tmps[i0]; 854 tmp1 = tmps[i1]; 855 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 856 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 857 } 858 if(j == nz-1){ 859 tmp0 = tmps[*vi++]; 860 sum1 -= *v1++ *tmp0; 861 sum2 -= *v2++ *tmp0; 862 } 863 sum2 -= *v2++ * sum1; 864 tmp[row ++]=sum1; 865 tmp[row ++]=sum2; 866 break; 867 case 3: 868 sum1 = b[*r++]; 869 sum2 = b[*r++]; 870 sum3 = b[*r++]; 871 v2 = aa + ai[row+1]; 872 v3 = aa + ai[row+2]; 873 874 for (j=0; j<nz-1; j+=2){ 875 i0 = vi[0]; 876 i1 = vi[1]; 877 vi +=2; 878 tmp0 = tmps[i0]; 879 tmp1 = tmps[i1]; 880 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 881 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 882 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 883 } 884 if (j == nz-1){ 885 tmp0 = tmps[*vi++]; 886 sum1 -= *v1++ *tmp0; 887 sum2 -= *v2++ *tmp0; 888 sum3 -= *v3++ *tmp0; 889 } 890 sum2 -= *v2++ * sum1; 891 sum3 -= *v3++ * sum1; 892 sum3 -= *v3++ * sum2; 893 tmp[row ++]=sum1; 894 tmp[row ++]=sum2; 895 tmp[row ++]=sum3; 896 break; 897 898 case 4: 899 sum1 = b[*r++]; 900 sum2 = b[*r++]; 901 sum3 = b[*r++]; 902 sum4 = b[*r++]; 903 v2 = aa + ai[row+1]; 904 v3 = aa + ai[row+2]; 905 v4 = aa + ai[row+3]; 906 907 for (j=0; j<nz-1; j+=2){ 908 i0 = vi[0]; 909 i1 = vi[1]; 910 vi +=2; 911 tmp0 = tmps[i0]; 912 tmp1 = tmps[i1]; 913 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 914 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 915 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 916 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 917 } 918 if (j == nz-1){ 919 tmp0 = tmps[*vi++]; 920 sum1 -= *v1++ *tmp0; 921 sum2 -= *v2++ *tmp0; 922 sum3 -= *v3++ *tmp0; 923 sum4 -= *v4++ *tmp0; 924 } 925 sum2 -= *v2++ * sum1; 926 sum3 -= *v3++ * sum1; 927 sum4 -= *v4++ * sum1; 928 sum3 -= *v3++ * sum2; 929 sum4 -= *v4++ * sum2; 930 sum4 -= *v4++ * sum3; 931 932 tmp[row ++]=sum1; 933 tmp[row ++]=sum2; 934 tmp[row ++]=sum3; 935 tmp[row ++]=sum4; 936 break; 937 case 5: 938 sum1 = b[*r++]; 939 sum2 = b[*r++]; 940 sum3 = b[*r++]; 941 sum4 = b[*r++]; 942 sum5 = b[*r++]; 943 v2 = aa + ai[row+1]; 944 v3 = aa + ai[row+2]; 945 v4 = aa + ai[row+3]; 946 v5 = aa + ai[row+4]; 947 948 for (j=0; j<nz-1; j+=2){ 949 i0 = vi[0]; 950 i1 = vi[1]; 951 vi +=2; 952 tmp0 = tmps[i0]; 953 tmp1 = tmps[i1]; 954 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 955 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 956 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 957 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 958 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 959 } 960 if (j == nz-1){ 961 tmp0 = tmps[*vi++]; 962 sum1 -= *v1++ *tmp0; 963 sum2 -= *v2++ *tmp0; 964 sum3 -= *v3++ *tmp0; 965 sum4 -= *v4++ *tmp0; 966 sum5 -= *v5++ *tmp0; 967 } 968 969 sum2 -= *v2++ * sum1; 970 sum3 -= *v3++ * sum1; 971 sum4 -= *v4++ * sum1; 972 sum5 -= *v5++ * sum1; 973 sum3 -= *v3++ * sum2; 974 sum4 -= *v4++ * sum2; 975 sum5 -= *v5++ * sum2; 976 sum4 -= *v4++ * sum3; 977 sum5 -= *v5++ * sum3; 978 sum5 -= *v5++ * sum4; 979 980 tmp[row ++]=sum1; 981 tmp[row ++]=sum2; 982 tmp[row ++]=sum3; 983 tmp[row ++]=sum4; 984 tmp[row ++]=sum5; 985 break; 986 default: 987 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 988 } 989 } 990 /* backward solve the upper triangular */ 991 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 992 nsz = ns[i]; 993 aii = ai[row+1] -1; 994 v1 = aa + aii; 995 vi = aj + aii; 996 nz = aii- ad[row]; 997 switch (nsz){ /* Each loop in 'case' is unrolled */ 998 case 1 : 999 sum1 = tmp[row]; 1000 1001 for(j=nz ; j>1; j-=2){ 1002 vi -=2; 1003 i0 = vi[2]; 1004 i1 = vi[1]; 1005 tmp0 = tmps[i0]; 1006 tmp1 = tmps[i1]; 1007 v1 -= 2; 1008 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1009 } 1010 if (j==1){ 1011 tmp0 = tmps[*vi--]; 1012 sum1 -= *v1-- * tmp0; 1013 } 1014 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1015 break; 1016 case 2 : 1017 sum1 = tmp[row]; 1018 sum2 = tmp[row -1]; 1019 v2 = aa + ai[row]-1; 1020 for (j=nz ; j>1; j-=2){ 1021 vi -=2; 1022 i0 = vi[2]; 1023 i1 = vi[1]; 1024 tmp0 = tmps[i0]; 1025 tmp1 = tmps[i1]; 1026 v1 -= 2; 1027 v2 -= 2; 1028 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1029 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1030 } 1031 if (j==1){ 1032 tmp0 = tmps[*vi--]; 1033 sum1 -= *v1-- * tmp0; 1034 sum2 -= *v2-- * tmp0; 1035 } 1036 1037 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1038 sum2 -= *v2-- * tmp0; 1039 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1040 break; 1041 case 3 : 1042 sum1 = tmp[row]; 1043 sum2 = tmp[row -1]; 1044 sum3 = tmp[row -2]; 1045 v2 = aa + ai[row]-1; 1046 v3 = aa + ai[row -1]-1; 1047 for (j=nz ; j>1; j-=2){ 1048 vi -=2; 1049 i0 = vi[2]; 1050 i1 = vi[1]; 1051 tmp0 = tmps[i0]; 1052 tmp1 = tmps[i1]; 1053 v1 -= 2; 1054 v2 -= 2; 1055 v3 -= 2; 1056 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1057 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1058 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1059 } 1060 if (j==1){ 1061 tmp0 = tmps[*vi--]; 1062 sum1 -= *v1-- * tmp0; 1063 sum2 -= *v2-- * tmp0; 1064 sum3 -= *v3-- * tmp0; 1065 } 1066 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1067 sum2 -= *v2-- * tmp0; 1068 sum3 -= *v3-- * tmp0; 1069 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1070 sum3 -= *v3-- * tmp0; 1071 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1072 1073 break; 1074 case 4 : 1075 sum1 = tmp[row]; 1076 sum2 = tmp[row -1]; 1077 sum3 = tmp[row -2]; 1078 sum4 = tmp[row -3]; 1079 v2 = aa + ai[row]-1; 1080 v3 = aa + ai[row -1]-1; 1081 v4 = aa + ai[row -2]-1; 1082 1083 for (j=nz ; j>1; j-=2){ 1084 vi -=2; 1085 i0 = vi[2]; 1086 i1 = vi[1]; 1087 tmp0 = tmps[i0]; 1088 tmp1 = tmps[i1]; 1089 v1 -= 2; 1090 v2 -= 2; 1091 v3 -= 2; 1092 v4 -= 2; 1093 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1094 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1095 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1096 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1097 } 1098 if (j==1){ 1099 tmp0 = tmps[*vi--]; 1100 sum1 -= *v1-- * tmp0; 1101 sum2 -= *v2-- * tmp0; 1102 sum3 -= *v3-- * tmp0; 1103 sum4 -= *v4-- * tmp0; 1104 } 1105 1106 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1107 sum2 -= *v2-- * tmp0; 1108 sum3 -= *v3-- * tmp0; 1109 sum4 -= *v4-- * tmp0; 1110 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1111 sum3 -= *v3-- * tmp0; 1112 sum4 -= *v4-- * tmp0; 1113 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1114 sum4 -= *v4-- * tmp0; 1115 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1116 break; 1117 case 5 : 1118 sum1 = tmp[row]; 1119 sum2 = tmp[row -1]; 1120 sum3 = tmp[row -2]; 1121 sum4 = tmp[row -3]; 1122 sum5 = tmp[row -4]; 1123 v2 = aa + ai[row]-1; 1124 v3 = aa + ai[row -1]-1; 1125 v4 = aa + ai[row -2]-1; 1126 v5 = aa + ai[row -3]-1; 1127 for (j=nz ; j>1; j-=2){ 1128 vi -= 2; 1129 i0 = vi[2]; 1130 i1 = vi[1]; 1131 tmp0 = tmps[i0]; 1132 tmp1 = tmps[i1]; 1133 v1 -= 2; 1134 v2 -= 2; 1135 v3 -= 2; 1136 v4 -= 2; 1137 v5 -= 2; 1138 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1139 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1140 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1141 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1142 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1143 } 1144 if (j==1){ 1145 tmp0 = tmps[*vi--]; 1146 sum1 -= *v1-- * tmp0; 1147 sum2 -= *v2-- * tmp0; 1148 sum3 -= *v3-- * tmp0; 1149 sum4 -= *v4-- * tmp0; 1150 sum5 -= *v5-- * tmp0; 1151 } 1152 1153 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1154 sum2 -= *v2-- * tmp0; 1155 sum3 -= *v3-- * tmp0; 1156 sum4 -= *v4-- * tmp0; 1157 sum5 -= *v5-- * tmp0; 1158 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1159 sum3 -= *v3-- * tmp0; 1160 sum4 -= *v4-- * tmp0; 1161 sum5 -= *v5-- * tmp0; 1162 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1163 sum4 -= *v4-- * tmp0; 1164 sum5 -= *v5-- * tmp0; 1165 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1166 sum5 -= *v5-- * tmp0; 1167 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1168 break; 1169 default: 1170 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 1171 } 1172 } 1173 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1174 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1176 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1177 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1178 PetscFunctionReturn(0); 1179 } 1180 1181 #undef __FUNCT__ 1182 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1183 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1184 { 1185 Mat C=B; 1186 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1187 IS isrow = b->row,isicol = b->icol; 1188 PetscErrorCode ierr; 1189 const PetscInt *r,*ic,*ics; 1190 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1191 PetscInt i,j,k,nz,nzL,row,*pj; 1192 const PetscInt *ajtmp,*bjtmp; 1193 MatScalar *rtmp,*pc,*pc1,*pc2,*pc3,multiplier,multiplier1,multiplier2,multiplier3,*pv,*rtmp11,*rtmp22,*rtmp33; 1194 const MatScalar *aa=a->a,*v,*v1,*v2,*v3; 1195 FactorShiftCtx sctx; 1196 PetscInt *ddiag; 1197 PetscReal rs; 1198 MatScalar d; 1199 PetscInt inod,nodesz,k1,k2,k3,node_max,*ns,col; 1200 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1201 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) 1216 rs += PetscAbsScalar(v[j]); 1217 if (rs>sctx.shift_top) sctx.shift_top = rs; 1218 } 1219 sctx.shift_top *= 1.1; 1220 sctx.nshift_max = 5; 1221 sctx.shift_lo = 0.; 1222 sctx.shift_hi = 1.; 1223 } 1224 1225 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1226 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1227 1228 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1230 rtmp = rtmp11; 1231 rtmp22 = rtmp11 + n; 1232 rtmp33 = rtmp22 + n; 1233 ics = ic; 1234 1235 node_max = a->inode.node_count; 1236 ns = a->inode.size; 1237 if (!ns){ 1238 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1239 } 1240 1241 /* If max inode size > 3, split it into two inodes.*/ 1242 /* also map the inode sizes according to the ordering */ 1243 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1244 for (i=0,j=0; i<node_max; ++i,++j){ 1245 if (ns[i]>3) { 1246 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1247 ++j; 1248 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1249 } else { 1250 tmp_vec1[j] = ns[i]; 1251 } 1252 } 1253 /* Use the correct node_max */ 1254 node_max = j; 1255 1256 /* Now reorder the inode info based on mat re-ordering info */ 1257 /* First create a row -> inode_size_array_index map */ 1258 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1259 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1260 for (i=0,row=0; i<node_max; i++) { 1261 nodesz = tmp_vec1[i]; 1262 for (j=0; j<nodesz; j++,row++) { 1263 nsmap[row] = i; 1264 } 1265 } 1266 /* Using nsmap, create a reordered ns structure */ 1267 for (i=0,j=0; i< node_max; i++) { 1268 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1269 tmp_vec2[i] = nodesz; 1270 j += nodesz; 1271 } 1272 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1273 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1274 /* Now use the correct ns */ 1275 ns = tmp_vec2; 1276 1277 do { 1278 sctx.useshift = PETSC_FALSE; 1279 /* Now loop over each block-row, and do the factorization */ 1280 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1281 nodesz = ns[inod]; 1282 1283 switch (nodesz){ 1284 case 1: 1285 /*----------*/ 1286 /* zero rtmp */ 1287 /* L part */ 1288 nz = bi[i+1] - bi[i]; 1289 bjtmp = bj + bi[i]; 1290 for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0; 1291 1292 /* U part */ 1293 nz = bdiag[i]-bdiag[i+1]; 1294 bjtmp = bj + bdiag[i+1]+1; 1295 for (j=0; j<nz; j++) rtmp[bjtmp[j]] = 0.0; 1296 1297 /* load in initial (unfactored row) */ 1298 nz = ai[r[i]+1] - ai[r[i]]; 1299 ajtmp = aj + ai[r[i]]; 1300 v = aa + ai[r[i]]; 1301 for (j=0; j<nz; j++) { 1302 rtmp[ics[ajtmp[j]]] = v[j]; 1303 } 1304 /* ZeropivotApply() */ 1305 rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1306 1307 /* elimination */ 1308 bjtmp = bj + bi[i]; 1309 row = *bjtmp++; 1310 nzL = bi[i+1] - bi[i]; 1311 for(k=0; k < nzL;k++) { 1312 pc = rtmp + row; 1313 if (*pc != 0.0) { 1314 pv = b->a + bdiag[row]; 1315 multiplier = *pc * (*pv); 1316 *pc = multiplier; 1317 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1318 pv = b->a + bdiag[row+1]+1; 1319 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1320 for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 1321 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 1322 } 1323 row = *bjtmp++; 1324 } 1325 1326 /* finished row so stick it into b->a */ 1327 rs = 0.0; 1328 /* L part */ 1329 pv = b->a + bi[i] ; 1330 pj = b->j + bi[i] ; 1331 nz = bi[i+1] - bi[i]; 1332 for (j=0; j<nz; j++) { 1333 pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]); 1334 } 1335 1336 /* U part */ 1337 pv = b->a + bdiag[i+1]+1; 1338 pj = b->j + bdiag[i+1]+1; 1339 nz = bdiag[i] - bdiag[i+1]-1; 1340 for (j=0; j<nz; j++) { 1341 pv[j] = rtmp[pj[j]]; rs += PetscAbsScalar(pv[j]); 1342 } 1343 1344 /* MatPivotCheck() */ 1345 sctx.rs = rs; 1346 sctx.pv = rtmp[i]; 1347 if (info->shifttype == MAT_SHIFT_NONZERO){ 1348 ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr); 1349 } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){ 1350 ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr); 1351 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1352 ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr); 1353 } else { 1354 ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr); 1355 } 1356 rtmp[i] = sctx.pv; 1357 1358 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1359 pv = b->a + bdiag[i]; 1360 *pv = 1.0/rtmp[i]; 1361 1362 break; 1363 1364 case 2: 1365 /*----------*/ 1366 /* zero rtmp */ 1367 /* L part */ 1368 nz = bi[i+1] - bi[i]; 1369 bjtmp = bj + bi[i]; 1370 for (j=0; j<nz; j++) { 1371 col = bjtmp[j]; 1372 rtmp11[col] = 0.0; rtmp22[col] = 0.0; 1373 } 1374 1375 /* U part */ 1376 nz = bdiag[i]-bdiag[i+1]; 1377 bjtmp = bj + bdiag[i+1]+1; 1378 for (j=0; j<nz; j++) { 1379 col = bjtmp[j]; 1380 rtmp11[col] = 0.0; rtmp22[col] = 0.0; 1381 } 1382 1383 /* load in initial (unfactored row) */ 1384 nz = ai[r[i]+1] - ai[r[i]]; 1385 ajtmp = aj + ai[r[i]]; 1386 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1387 for (j=0; j<nz; j++) { 1388 col = ics[ajtmp[j]]; 1389 rtmp11[col] = v1[j]; rtmp22[col] = v2[j]; 1390 } 1391 /* ZeropivotApply(): shift the diagonal of the matrix */ 1392 rtmp11[i] += sctx.shift_amount; rtmp22[i+1] += sctx.shift_amount; 1393 1394 /* elimination */ 1395 bjtmp = bj + bi[i]; 1396 row = *bjtmp++; /* pivot row */ 1397 nzL = bi[i+1] - bi[i]; 1398 for(k=0; k < nzL;k++) { 1399 pc1 = rtmp11 + row; 1400 pc2 = rtmp22 + row; 1401 if (*pc1 != 0.0 || *pc2 != 0.0) { 1402 pv = b->a + bdiag[row]; 1403 multiplier1 = *pc1*(*pv); multiplier2 = *pc2*(*pv); 1404 *pc1 = multiplier1; *pc2 = multiplier2; 1405 1406 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1407 pv = b->a + bdiag[row+1]+1; 1408 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1409 for (j=0; j<nz; j++){ 1410 col = pj[j]; 1411 rtmp11[col] -= multiplier1 * pv[j]; 1412 rtmp22[col] -= multiplier2 * pv[j]; 1413 } 1414 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1415 } 1416 row = *bjtmp++; 1417 } 1418 1419 /* Now take care of diagonal 2x2 block. */ 1420 pc1 = rtmp11 + i; 1421 pc2 = rtmp22 + i; 1422 1423 if (*pc2 != 0.0){ 1424 multiplier = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1425 *pc2 = multiplier; /* insert L entry */ 1426 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1427 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1428 for (j=0; j<nz; j++) { 1429 col = pj[j]; 1430 rtmp22[col] -= multiplier * rtmp11[col]; 1431 } 1432 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1433 } 1434 1435 1436 /* finished rows so stick it into b->a */ 1437 rs = 0.0; 1438 /* L part */ 1439 k1 = k2 =0; 1440 pc1 = b->a + bi[i]; pc2 = b->a + bi[i+1]; 1441 pj = b->j + bi[i] ; 1442 nz = bi[i+1] - bi[i]; 1443 for (j=0; j<nz; j++) { 1444 col = pj[j]; 1445 if (col < i) {pc1[k1] = rtmp11[col]; k1++; rs += PetscAbsScalar(pc1[j]);} 1446 if (col < i+1){pc2[k2] = rtmp22[col]; k2++;} 1447 } 1448 1449 /* U part */ 1450 pc1 = b->a + bdiag[i+1]+1; pc2 = b->a + bdiag[i+2]+1; 1451 pj = b->j + bdiag[i+1]+1; 1452 nz = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */ 1453 k1 = k2 =0; 1454 for (j=0; j<nz; j++) { 1455 col = pj[j]; 1456 if (col > i) {pc1[k1] = rtmp11[col]; k1++; rs += PetscAbsScalar(pc1[j]);} 1457 if (col > i+1){pc2[k2] = rtmp22[col]; k2++;} 1458 } 1459 1460 #if defined(TMP) 1461 /* MatPivotCheck() */ 1462 sctx.rs = rs; 1463 sctx.pv = rtmp11[i]; 1464 if (info->shifttype == MAT_SHIFT_NONZERO){ 1465 ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr); 1466 } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){ 1467 ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr); 1468 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1469 ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr); 1470 } else { 1471 ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr); 1472 } 1473 rtmp11[i] = sctx.pv; 1474 #endif 1475 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1476 pc1 = b->a + bdiag[i]; 1477 *pc1 = 1.0/rtmp11[i]; 1478 #if defined(TMP) 1479 sctx.rs = rs; 1480 sctx.pv = rtmp22[i+1]; 1481 if (info->shifttype == MAT_SHIFT_NONZERO){ 1482 ierr = MatPivotCheck_nz(info,sctx,i+1);CHKERRQ(ierr); 1483 } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){ 1484 ierr = MatPivotCheck_pd(info,sctx,i+1);CHKERRQ(ierr); 1485 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1486 ierr = MatPivotCheck_inblocks(info,sctx,i+1);CHKERRQ(ierr); 1487 } else { 1488 ierr = MatPivotCheck_none(info,sctx,i+1);CHKERRQ(ierr); 1489 } 1490 rtmp22[i+1] = sctx.pv; 1491 #endif 1492 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1493 pc2 = b->a + bdiag[i+1]; 1494 *pc2 = 1.0/rtmp22[i+1]; 1495 1496 break; 1497 case 3: 1498 /*----------*/ 1499 /* zero rtmp */ 1500 /* L part */ 1501 nz = bi[i+1] - bi[i]; 1502 bjtmp = bj + bi[i]; 1503 for (j=0; j<nz; j++) { 1504 col = bjtmp[j]; 1505 rtmp11[col] = 0.0; rtmp22[col] = 0.0; rtmp33[col] = 0.0; 1506 } 1507 1508 /* U part */ 1509 nz = bdiag[i]-bdiag[i+1]; 1510 bjtmp = bj + bdiag[i+1]+1; 1511 for (j=0; j<nz; j++) { 1512 col = bjtmp[j]; 1513 rtmp11[col] = 0.0; rtmp22[col] = 0.0; rtmp33[col] = 0.0; 1514 } 1515 1516 /* load in initial (unfactored row) */ 1517 nz = ai[r[i]+1] - ai[r[i]]; 1518 ajtmp = aj + ai[r[i]]; 1519 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1520 for (j=0; j<nz; j++) { 1521 col = ics[ajtmp[j]]; 1522 rtmp11[col] = v1[j]; rtmp22[col] = v2[j]; rtmp33[col] = v3[j]; 1523 } 1524 /* ZeropivotApply(): shift the diagonal of the matrix */ 1525 rtmp11[i] += sctx.shift_amount; rtmp22[i+1] += sctx.shift_amount; rtmp33[i+2] += sctx.shift_amount; 1526 1527 /* elimination */ 1528 bjtmp = bj + bi[i]; 1529 row = *bjtmp++; /* pivot row */ 1530 nzL = bi[i+1] - bi[i]; 1531 for(k=0; k < nzL;k++) { 1532 pc1 = rtmp11 + row; 1533 pc2 = rtmp22 + row; 1534 pc3 = rtmp33 + row; 1535 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1536 pv = b->a + bdiag[row]; 1537 multiplier1 = *pc1*(*pv); multiplier2 = *pc2*(*pv); multiplier3 = *pc3*(*pv); 1538 *pc1 = multiplier1; *pc2 = multiplier2; *pc3 = multiplier3; 1539 1540 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1541 pv = b->a + bdiag[row+1]+1; 1542 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1543 for (j=0; j<nz; j++){ 1544 col = pj[j]; 1545 rtmp11[col] -= multiplier1 * pv[j]; 1546 rtmp22[col] -= multiplier2 * pv[j]; 1547 rtmp33[col] -= multiplier3 * pv[j]; 1548 } 1549 ierr = PetscLogFlops(6*nz);CHKERRQ(ierr); 1550 } 1551 row = *bjtmp++; 1552 } 1553 1554 /* Now take care of diagonal 3x3 block. */ 1555 pc1 = rtmp11 + i; 1556 pc2 = rtmp22 + i; 1557 pc3 = rtmp33 + i; 1558 if (*pc2 != 0.0 || *pc3 != 0.0){ 1559 multiplier2 = (*pc2)/(*pc1); 1560 multiplier3 = (*pc3)/(*pc1); 1561 1562 *pc2 = multiplier2; 1563 *pc3 = multiplier3; 1564 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1565 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1566 for (j=0; j<nz; j++) { 1567 col = pj[j]; 1568 rtmp22[col] -= multiplier2 * rtmp11[col]; 1569 rtmp33[col] -= multiplier3 * rtmp11[col]; 1570 } 1571 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1572 } 1573 1574 pc2 = rtmp22 + i+1; 1575 pc3 = rtmp33 + i+1; 1576 if (*pc3 != 0.0){ 1577 multiplier3 = (*pc3)/(*pc2); 1578 *pc3 = multiplier3; 1579 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1580 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1581 for (j=0; j<nz; j++) { 1582 col = pj[j]; 1583 rtmp33[col] -= multiplier3 * rtmp22[col]; 1584 } 1585 ierr = PetscLogFlops(2*nz);CHKERRQ(ierr); 1586 } 1587 1588 /* finished rows so stick it into b->a */ 1589 rs = 0.0; 1590 /* L part */ 1591 k1 = k2 = k3 = 0; 1592 pc1 = b->a + bi[i]; pc2 = b->a + bi[i+1]; pc3 = b->a + bi[i+2]; 1593 pj = b->j + bi[i] ; 1594 nz = bi[i+1] - bi[i]; 1595 for (j=0; j<nz; j++) { 1596 col = pj[j]; 1597 if (col < i) {pc1[k1] = rtmp11[col]; k1++; rs += PetscAbsScalar(pc1[j]);} 1598 if (col < i+1){pc2[k2] = rtmp22[col]; k2++;} 1599 if (col < i+2){pc3[k3] = rtmp33[col]; k3++;} 1600 } 1601 1602 /* U part */ 1603 pc1 = b->a + bdiag[i+1]+1; pc2 = b->a + bdiag[i+2]+1; pc3 = b->a + bdiag[i+3]+1; 1604 pj = b->j + bdiag[i+1]+1; 1605 nz = bdiag[i] - bdiag[i+1]; /* inlcude diagonal */ 1606 k1 = k2 = k3 = 0; 1607 for (j=0; j<nz; j++) { 1608 col = pj[j]; 1609 if (col > i) {pc1[k1] = rtmp11[col]; k1++; rs += PetscAbsScalar(pc1[j]);} 1610 if (col > i+1){pc2[k2] = rtmp22[col]; k2++;} 1611 if (col > i+2){pc3[k3] = rtmp33[col]; k3++;} 1612 } 1613 1614 #if defined(TMP) 1615 /* MatPivotCheck() */ 1616 sctx.rs = rs; 1617 sctx.pv = rtmp11[i]; 1618 if (info->shifttype == MAT_SHIFT_NONZERO){ 1619 ierr = MatPivotCheck_nz(info,sctx,i);CHKERRQ(ierr); 1620 } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){ 1621 ierr = MatPivotCheck_pd(info,sctx,i);CHKERRQ(ierr); 1622 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1623 ierr = MatPivotCheck_inblocks(info,sctx,i);CHKERRQ(ierr); 1624 } else { 1625 ierr = MatPivotCheck_none(info,sctx,i);CHKERRQ(ierr); 1626 } 1627 rtmp11[i] = sctx.pv; 1628 #endif 1629 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1630 pc1 = b->a + bdiag[i]; 1631 *pc1 = 1.0/rtmp11[i]; 1632 #if defined(TMP) 1633 sctx.rs = rs; 1634 sctx.pv = rtmp22[i+1]; 1635 if (info->shifttype == MAT_SHIFT_NONZERO){ 1636 ierr = MatPivotCheck_nz(info,sctx,i+1);CHKERRQ(ierr); 1637 } else if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE){ 1638 ierr = MatPivotCheck_pd(info,sctx,i+1);CHKERRQ(ierr); 1639 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1640 ierr = MatPivotCheck_inblocks(info,sctx,i+1);CHKERRQ(ierr); 1641 } else { 1642 ierr = MatPivotCheck_none(info,sctx,i+1);CHKERRQ(ierr); 1643 } 1644 rtmp22[i+1] = sctx.pv; 1645 #endif 1646 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1647 pc2 = b->a + bdiag[i+1]; 1648 *pc2 = 1.0/rtmp22[i+1]; 1649 1650 pc3 = b->a + bdiag[i+2]; 1651 *pc3 = 1.0/rtmp33[i+2]; 1652 break; 1653 1654 default: 1655 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 1656 } 1657 i += nodesz; /* Update the row */ 1658 } 1659 1660 /* MatPivotRefine() */ 1661 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE && !sctx.useshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1662 /* 1663 * if no shift in this attempt & shifting & started shifting & can refine, 1664 * then try lower shift 1665 */ 1666 sctx.shift_hi = sctx.shift_fraction; 1667 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1668 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1669 sctx.useshift = PETSC_TRUE; 1670 sctx.nshift++; 1671 } 1672 } while (sctx.useshift); 1673 1674 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 1675 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1676 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1677 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1678 1679 C->ops->solve = MatSolve_SeqAIJ_Inode; 1680 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1681 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1682 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1683 C->ops->matsolve = MatMatSolve_SeqAIJ; 1684 C->assembled = PETSC_TRUE; 1685 C->preallocated = PETSC_TRUE; 1686 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1687 1688 /* MatShiftView(A,info,&sctx) */ 1689 if (sctx.nshift){ 1690 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 1691 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1692 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 1693 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1694 } else if (info->shifttype == MAT_SHIFT_INBLOCKS){ 1695 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1696 } 1697 } 1698 PetscFunctionReturn(0); 1699 } 1700 1701 #undef __FUNCT__ 1702 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1703 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1704 { 1705 Mat C = B; 1706 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1707 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1708 PetscErrorCode ierr; 1709 const PetscInt *r,*ic,*c,*ics; 1710 PetscInt n = A->rmap->n,*bi = b->i; 1711 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1712 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1713 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1714 PetscScalar mul1,mul2,mul3,tmp; 1715 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1716 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1717 PetscReal rs=0.0; 1718 FactorShiftCtx sctx; 1719 PetscInt newshift; 1720 1721 PetscFunctionBegin; 1722 sctx.shift_top = 0; 1723 sctx.nshift_max = 0; 1724 sctx.shift_lo = 0; 1725 sctx.shift_hi = 0; 1726 sctx.shift_fraction = 0; 1727 1728 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1729 if (info->shifttype==MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1730 sctx.shift_top = 0; 1731 for (i=0; i<n; i++) { 1732 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1733 rs = 0.0; 1734 ajtmp = aj + ai[i]; 1735 rtmp1 = aa + ai[i]; 1736 nz = ai[i+1] - ai[i]; 1737 for (j=0; j<nz; j++){ 1738 if (*ajtmp != i){ 1739 rs += PetscAbsScalar(*rtmp1++); 1740 } else { 1741 rs -= PetscRealPart(*rtmp1++); 1742 } 1743 ajtmp++; 1744 } 1745 if (rs>sctx.shift_top) sctx.shift_top = rs; 1746 } 1747 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1748 sctx.shift_top *= 1.1; 1749 sctx.nshift_max = 5; 1750 sctx.shift_lo = 0.; 1751 sctx.shift_hi = 1.; 1752 } 1753 sctx.shift_amount = 0; 1754 sctx.nshift = 0; 1755 1756 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1757 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1758 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1759 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1760 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1761 ics = ic ; 1762 rtmp22 = rtmp11 + n; 1763 rtmp33 = rtmp22 + n; 1764 1765 node_max = a->inode.node_count; 1766 ns = a->inode.size; 1767 if (!ns){ 1768 SETERRQ(PETSC_ERR_PLIB,"Matrix without inode information"); 1769 } 1770 1771 /* If max inode size > 3, split it into two inodes.*/ 1772 /* also map the inode sizes according to the ordering */ 1773 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1774 for (i=0,j=0; i<node_max; ++i,++j){ 1775 if (ns[i]>3) { 1776 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1777 ++j; 1778 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1779 } else { 1780 tmp_vec1[j] = ns[i]; 1781 } 1782 } 1783 /* Use the correct node_max */ 1784 node_max = j; 1785 1786 /* Now reorder the inode info based on mat re-ordering info */ 1787 /* First create a row -> inode_size_array_index map */ 1788 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1789 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1790 for (i=0,row=0; i<node_max; i++) { 1791 nodesz = tmp_vec1[i]; 1792 for (j=0; j<nodesz; j++,row++) { 1793 nsmap[row] = i; 1794 } 1795 } 1796 /* Using nsmap, create a reordered ns structure */ 1797 for (i=0,j=0; i< node_max; i++) { 1798 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1799 tmp_vec2[i] = nodesz; 1800 j += nodesz; 1801 } 1802 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1803 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1804 /* Now use the correct ns */ 1805 ns = tmp_vec2; 1806 1807 do { 1808 sctx.useshift = PETSC_FALSE; 1809 /* Now loop over each block-row, and do the factorization */ 1810 for (i=0,row=0; i<node_max; i++) { 1811 nodesz = ns[i]; 1812 nz = bi[row+1] - bi[row]; 1813 bjtmp = bj + bi[row]; 1814 1815 switch (nodesz){ 1816 case 1: 1817 for (j=0; j<nz; j++){ 1818 idx = bjtmp[j]; 1819 rtmp11[idx] = 0.0; 1820 } 1821 1822 /* load in initial (unfactored row) */ 1823 idx = r[row]; 1824 nz_tmp = ai[idx+1] - ai[idx]; 1825 ajtmp = aj + ai[idx]; 1826 v1 = aa + ai[idx]; 1827 1828 for (j=0; j<nz_tmp; j++) { 1829 idx = ics[ajtmp[j]]; 1830 rtmp11[idx] = v1[j]; 1831 } 1832 rtmp11[ics[r[row]]] += sctx.shift_amount; 1833 1834 prow = *bjtmp++ ; 1835 while (prow < row) { 1836 pc1 = rtmp11 + prow; 1837 if (*pc1 != 0.0){ 1838 pv = ba + bd[prow]; 1839 pj = nbj + bd[prow]; 1840 mul1 = *pc1 * *pv++; 1841 *pc1 = mul1; 1842 nz_tmp = bi[prow+1] - bd[prow] - 1; 1843 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1844 for (j=0; j<nz_tmp; j++) { 1845 tmp = pv[j]; 1846 idx = pj[j]; 1847 rtmp11[idx] -= mul1 * tmp; 1848 } 1849 } 1850 prow = *bjtmp++ ; 1851 } 1852 pj = bj + bi[row]; 1853 pc1 = ba + bi[row]; 1854 1855 sctx.pv = rtmp11[row]; 1856 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 1857 rs = 0.0; 1858 for (j=0; j<nz; j++) { 1859 idx = pj[j]; 1860 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 1861 if (idx != row) rs += PetscAbsScalar(pc1[j]); 1862 } 1863 sctx.rs = rs; 1864 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1865 if (newshift == 1) goto endofwhile; 1866 break; 1867 1868 case 2: 1869 for (j=0; j<nz; j++) { 1870 idx = bjtmp[j]; 1871 rtmp11[idx] = 0.0; 1872 rtmp22[idx] = 0.0; 1873 } 1874 1875 /* load in initial (unfactored row) */ 1876 idx = r[row]; 1877 nz_tmp = ai[idx+1] - ai[idx]; 1878 ajtmp = aj + ai[idx]; 1879 v1 = aa + ai[idx]; 1880 v2 = aa + ai[idx+1]; 1881 for (j=0; j<nz_tmp; j++) { 1882 idx = ics[ajtmp[j]]; 1883 rtmp11[idx] = v1[j]; 1884 rtmp22[idx] = v2[j]; 1885 } 1886 rtmp11[ics[r[row]]] += sctx.shift_amount; 1887 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1888 1889 prow = *bjtmp++ ; 1890 while (prow < row) { 1891 pc1 = rtmp11 + prow; 1892 pc2 = rtmp22 + prow; 1893 if (*pc1 != 0.0 || *pc2 != 0.0){ 1894 pv = ba + bd[prow]; 1895 pj = nbj + bd[prow]; 1896 mul1 = *pc1 * *pv; 1897 mul2 = *pc2 * *pv; 1898 ++pv; 1899 *pc1 = mul1; 1900 *pc2 = mul2; 1901 1902 nz_tmp = bi[prow+1] - bd[prow] - 1; 1903 for (j=0; j<nz_tmp; j++) { 1904 tmp = pv[j]; 1905 idx = pj[j]; 1906 rtmp11[idx] -= mul1 * tmp; 1907 rtmp22[idx] -= mul2 * tmp; 1908 } 1909 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 1910 } 1911 prow = *bjtmp++ ; 1912 } 1913 1914 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 1915 pc1 = rtmp11 + prow; 1916 pc2 = rtmp22 + prow; 1917 1918 sctx.pv = *pc1; 1919 pj = bj + bi[prow]; 1920 rs = 0.0; 1921 for (j=0; j<nz; j++){ 1922 idx = pj[j]; 1923 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 1924 } 1925 sctx.rs = rs; 1926 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 1927 if (newshift == 1) goto endofwhile; 1928 1929 if (*pc2 != 0.0){ 1930 pj = nbj + bd[prow]; 1931 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 1932 *pc2 = mul2; 1933 nz_tmp = bi[prow+1] - bd[prow] - 1; 1934 for (j=0; j<nz_tmp; j++) { 1935 idx = pj[j] ; 1936 tmp = rtmp11[idx]; 1937 rtmp22[idx] -= mul2 * tmp; 1938 } 1939 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 1940 } 1941 1942 pj = bj + bi[row]; 1943 pc1 = ba + bi[row]; 1944 pc2 = ba + bi[row+1]; 1945 1946 sctx.pv = rtmp22[row+1]; 1947 rs = 0.0; 1948 rtmp11[row] = 1.0/rtmp11[row]; 1949 rtmp22[row+1] = 1.0/rtmp22[row+1]; 1950 /* copy row entries from dense representation to sparse */ 1951 for (j=0; j<nz; j++) { 1952 idx = pj[j]; 1953 pc1[j] = rtmp11[idx]; 1954 pc2[j] = rtmp22[idx]; 1955 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 1956 } 1957 sctx.rs = rs; 1958 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 1959 if (newshift == 1) goto endofwhile; 1960 break; 1961 1962 case 3: 1963 for (j=0; j<nz; j++) { 1964 idx = bjtmp[j]; 1965 rtmp11[idx] = 0.0; 1966 rtmp22[idx] = 0.0; 1967 rtmp33[idx] = 0.0; 1968 } 1969 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 1970 idx = r[row]; 1971 nz_tmp = ai[idx+1] - ai[idx]; 1972 ajtmp = aj + ai[idx]; 1973 v1 = aa + ai[idx]; 1974 v2 = aa + ai[idx+1]; 1975 v3 = aa + ai[idx+2]; 1976 for (j=0; j<nz_tmp; j++) { 1977 idx = ics[ajtmp[j]]; 1978 rtmp11[idx] = v1[j]; 1979 rtmp22[idx] = v2[j]; 1980 rtmp33[idx] = v3[j]; 1981 } 1982 rtmp11[ics[r[row]]] += sctx.shift_amount; 1983 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 1984 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 1985 1986 /* loop over all pivot row blocks above this row block */ 1987 prow = *bjtmp++ ; 1988 while (prow < row) { 1989 pc1 = rtmp11 + prow; 1990 pc2 = rtmp22 + prow; 1991 pc3 = rtmp33 + prow; 1992 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 1993 pv = ba + bd[prow]; 1994 pj = nbj + bd[prow]; 1995 mul1 = *pc1 * *pv; 1996 mul2 = *pc2 * *pv; 1997 mul3 = *pc3 * *pv; 1998 ++pv; 1999 *pc1 = mul1; 2000 *pc2 = mul2; 2001 *pc3 = mul3; 2002 2003 nz_tmp = bi[prow+1] - bd[prow] - 1; 2004 /* update this row based on pivot row */ 2005 for (j=0; j<nz_tmp; j++) { 2006 tmp = pv[j]; 2007 idx = pj[j]; 2008 rtmp11[idx] -= mul1 * tmp; 2009 rtmp22[idx] -= mul2 * tmp; 2010 rtmp33[idx] -= mul3 * tmp; 2011 } 2012 ierr = PetscLogFlops(6*nz_tmp);CHKERRQ(ierr); 2013 } 2014 prow = *bjtmp++ ; 2015 } 2016 2017 /* Now take care of diagonal 3x3 block in this set of rows */ 2018 /* note: prow = row here */ 2019 pc1 = rtmp11 + prow; 2020 pc2 = rtmp22 + prow; 2021 pc3 = rtmp33 + prow; 2022 2023 sctx.pv = *pc1; 2024 pj = bj + bi[prow]; 2025 rs = 0.0; 2026 for (j=0; j<nz; j++){ 2027 idx = pj[j]; 2028 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2029 } 2030 sctx.rs = rs; 2031 ierr = MatLUCheckShift_inline(info,sctx,row,newshift);CHKERRQ(ierr); 2032 if (newshift == 1) goto endofwhile; 2033 2034 if (*pc2 != 0.0 || *pc3 != 0.0){ 2035 mul2 = (*pc2)/(*pc1); 2036 mul3 = (*pc3)/(*pc1); 2037 *pc2 = mul2; 2038 *pc3 = mul3; 2039 nz_tmp = bi[prow+1] - bd[prow] - 1; 2040 pj = nbj + bd[prow]; 2041 for (j=0; j<nz_tmp; j++) { 2042 idx = pj[j] ; 2043 tmp = rtmp11[idx]; 2044 rtmp22[idx] -= mul2 * tmp; 2045 rtmp33[idx] -= mul3 * tmp; 2046 } 2047 ierr = PetscLogFlops(4*nz_tmp);CHKERRQ(ierr); 2048 } 2049 ++prow; 2050 2051 pc2 = rtmp22 + prow; 2052 pc3 = rtmp33 + prow; 2053 sctx.pv = *pc2; 2054 pj = bj + bi[prow]; 2055 rs = 0.0; 2056 for (j=0; j<nz; j++){ 2057 idx = pj[j]; 2058 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2059 } 2060 sctx.rs = rs; 2061 ierr = MatLUCheckShift_inline(info,sctx,row+1,newshift);CHKERRQ(ierr); 2062 if (newshift == 1) goto endofwhile; 2063 2064 if (*pc3 != 0.0){ 2065 mul3 = (*pc3)/(*pc2); 2066 *pc3 = mul3; 2067 pj = nbj + bd[prow]; 2068 nz_tmp = bi[prow+1] - bd[prow] - 1; 2069 for (j=0; j<nz_tmp; j++) { 2070 idx = pj[j] ; 2071 tmp = rtmp22[idx]; 2072 rtmp33[idx] -= mul3 * tmp; 2073 } 2074 ierr = PetscLogFlops(2*nz_tmp);CHKERRQ(ierr); 2075 } 2076 2077 pj = bj + bi[row]; 2078 pc1 = ba + bi[row]; 2079 pc2 = ba + bi[row+1]; 2080 pc3 = ba + bi[row+2]; 2081 2082 sctx.pv = rtmp33[row+2]; 2083 rs = 0.0; 2084 rtmp11[row] = 1.0/rtmp11[row]; 2085 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2086 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2087 /* copy row entries from dense representation to sparse */ 2088 for (j=0; j<nz; j++) { 2089 idx = pj[j]; 2090 pc1[j] = rtmp11[idx]; 2091 pc2[j] = rtmp22[idx]; 2092 pc3[j] = rtmp33[idx]; 2093 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2094 } 2095 2096 sctx.rs = rs; 2097 ierr = MatLUCheckShift_inline(info,sctx,row+2,newshift);CHKERRQ(ierr); 2098 if (newshift == 1) goto endofwhile; 2099 break; 2100 2101 default: 2102 SETERRQ(PETSC_ERR_SUP,"Node size not yet supported \n"); 2103 } 2104 row += nodesz; /* Update the row */ 2105 } 2106 endofwhile:; 2107 } while (sctx.useshift); 2108 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2109 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2110 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2111 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2112 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2113 (B)->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 2114 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2115 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2116 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2117 C->assembled = PETSC_TRUE; 2118 C->preallocated = PETSC_TRUE; 2119 if (sctx.nshift) { 2120 if (info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) { 2121 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2122 } else if (info->shifttype == MAT_SHIFT_NONZERO) { 2123 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2124 } 2125 } 2126 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2127 PetscFunctionReturn(0); 2128 } 2129 2130 2131 /* ----------------------------------------------------------- */ 2132 #undef __FUNCT__ 2133 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2134 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2135 { 2136 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2137 IS iscol = a->col,isrow = a->row; 2138 PetscErrorCode ierr; 2139 const PetscInt *r,*c,*rout,*cout; 2140 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2141 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2142 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2143 PetscScalar sum1,sum2,sum3,sum4,sum5; 2144 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2145 const PetscScalar *b; 2146 2147 PetscFunctionBegin; 2148 if (!a->inode.size) SETERRQ(PETSC_ERR_COR,"Missing Inode Structure"); 2149 node_max = a->inode.node_count; 2150 ns = a->inode.size; /* Node Size array */ 2151 2152 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2153 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2154 tmp = a->solve_work; 2155 2156 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2157 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2158 2159 /* forward solve the lower triangular */ 2160 tmps = tmp ; 2161 aa = a_a ; 2162 aj = a_j ; 2163 ad = a->diag; 2164 2165 for (i = 0,row = 0; i< node_max; ++i){ 2166 nsz = ns[i]; 2167 aii = ai[row]; 2168 v1 = aa + aii; 2169 vi = aj + aii; 2170 nz = ai[row+1]- ai[row]; 2171 2172 if (i < node_max-1) { 2173 /* Prefetch the indices for the next block */ 2174 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,0); /* indices */ 2175 /* Prefetch the data for the next block */ 2176 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,0); 2177 } 2178 2179 switch (nsz){ /* Each loop in 'case' is unrolled */ 2180 case 1 : 2181 sum1 = b[r[row]]; 2182 for(j=0; j<nz-1; j+=2){ 2183 i0 = vi[j]; 2184 i1 = vi[j+1]; 2185 tmp0 = tmps[i0]; 2186 tmp1 = tmps[i1]; 2187 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2188 } 2189 if(j == nz-1){ 2190 tmp0 = tmps[vi[j]]; 2191 sum1 -= v1[j]*tmp0; 2192 } 2193 tmp[row++]=sum1; 2194 break; 2195 case 2: 2196 sum1 = b[r[row]]; 2197 sum2 = b[r[row+1]]; 2198 v2 = aa + ai[row+1]; 2199 2200 for(j=0; j<nz-1; j+=2){ 2201 i0 = vi[j]; 2202 i1 = vi[j+1]; 2203 tmp0 = tmps[i0]; 2204 tmp1 = tmps[i1]; 2205 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2206 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2207 } 2208 if(j == nz-1){ 2209 tmp0 = tmps[vi[j]]; 2210 sum1 -= v1[j] *tmp0; 2211 sum2 -= v2[j] *tmp0; 2212 } 2213 sum2 -= v2[nz] * sum1; 2214 tmp[row ++]=sum1; 2215 tmp[row ++]=sum2; 2216 break; 2217 case 3: 2218 sum1 = b[r[row]]; 2219 sum2 = b[r[row+1]]; 2220 sum3 = b[r[row+2]]; 2221 v2 = aa + ai[row+1]; 2222 v3 = aa + ai[row+2]; 2223 2224 for (j=0; j<nz-1; j+=2){ 2225 i0 = vi[j]; 2226 i1 = vi[j+1]; 2227 tmp0 = tmps[i0]; 2228 tmp1 = tmps[i1]; 2229 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2230 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2231 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2232 } 2233 if (j == nz-1){ 2234 tmp0 = tmps[vi[j]]; 2235 sum1 -= v1[j] *tmp0; 2236 sum2 -= v2[j] *tmp0; 2237 sum3 -= v3[j] *tmp0; 2238 } 2239 sum2 -= v2[nz] * sum1; 2240 sum3 -= v3[nz] * sum1; 2241 sum3 -= v3[nz+1] * sum2; 2242 tmp[row ++]=sum1; 2243 tmp[row ++]=sum2; 2244 tmp[row ++]=sum3; 2245 break; 2246 2247 case 4: 2248 sum1 = b[r[row]]; 2249 sum2 = b[r[row+1]]; 2250 sum3 = b[r[row+2]]; 2251 sum4 = b[r[row+3]]; 2252 v2 = aa + ai[row+1]; 2253 v3 = aa + ai[row+2]; 2254 v4 = aa + ai[row+3]; 2255 2256 for (j=0; j<nz-1; j+=2){ 2257 i0 = vi[j]; 2258 i1 = vi[j+1]; 2259 tmp0 = tmps[i0]; 2260 tmp1 = tmps[i1]; 2261 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2262 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2263 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2264 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2265 } 2266 if (j == nz-1){ 2267 tmp0 = tmps[vi[j]]; 2268 sum1 -= v1[j] *tmp0; 2269 sum2 -= v2[j] *tmp0; 2270 sum3 -= v3[j] *tmp0; 2271 sum4 -= v4[j] *tmp0; 2272 } 2273 sum2 -= v2[nz] * sum1; 2274 sum3 -= v3[nz] * sum1; 2275 sum4 -= v4[nz] * sum1; 2276 sum3 -= v3[nz+1] * sum2; 2277 sum4 -= v4[nz+1] * sum2; 2278 sum4 -= v4[nz+2] * sum3; 2279 2280 tmp[row ++]=sum1; 2281 tmp[row ++]=sum2; 2282 tmp[row ++]=sum3; 2283 tmp[row ++]=sum4; 2284 break; 2285 case 5: 2286 sum1 = b[r[row]]; 2287 sum2 = b[r[row+1]]; 2288 sum3 = b[r[row+2]]; 2289 sum4 = b[r[row+3]]; 2290 sum5 = b[r[row+4]]; 2291 v2 = aa + ai[row+1]; 2292 v3 = aa + ai[row+2]; 2293 v4 = aa + ai[row+3]; 2294 v5 = aa + ai[row+4]; 2295 2296 for (j=0; j<nz-1; j+=2){ 2297 i0 = vi[j]; 2298 i1 = vi[j+1]; 2299 tmp0 = tmps[i0]; 2300 tmp1 = tmps[i1]; 2301 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2302 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2303 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2304 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2305 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2306 } 2307 if (j == nz-1){ 2308 tmp0 = tmps[vi[j]]; 2309 sum1 -= v1[j] *tmp0; 2310 sum2 -= v2[j] *tmp0; 2311 sum3 -= v3[j] *tmp0; 2312 sum4 -= v4[j] *tmp0; 2313 sum5 -= v5[j] *tmp0; 2314 } 2315 2316 sum2 -= v2[nz] * sum1; 2317 sum3 -= v3[nz] * sum1; 2318 sum4 -= v4[nz] * sum1; 2319 sum5 -= v5[nz] * sum1; 2320 sum3 -= v3[nz+1] * sum2; 2321 sum4 -= v4[nz+1] * sum2; 2322 sum5 -= v5[nz+1] * sum2; 2323 sum4 -= v4[nz+2] * sum3; 2324 sum5 -= v5[nz+2] * sum3; 2325 sum5 -= v5[nz+3] * sum4; 2326 2327 tmp[row ++]=sum1; 2328 tmp[row ++]=sum2; 2329 tmp[row ++]=sum3; 2330 tmp[row ++]=sum4; 2331 tmp[row ++]=sum5; 2332 break; 2333 default: 2334 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2335 } 2336 } 2337 /* backward solve the upper triangular */ 2338 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2339 nsz = ns[i]; 2340 aii = ad[row+1] + 1; 2341 v1 = aa + aii; 2342 vi = aj + aii; 2343 nz = ad[row]- ad[row+1] - 1; 2344 2345 if (i > 0) { 2346 /* Prefetch the indices for the next block */ 2347 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,0); /* indices */ 2348 /* Prefetch the data for the next block */ 2349 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,0); 2350 } 2351 2352 switch (nsz){ /* Each loop in 'case' is unrolled */ 2353 case 1 : 2354 sum1 = tmp[row]; 2355 2356 for(j=0 ; j<nz-1; j+=2){ 2357 i0 = vi[j]; 2358 i1 = vi[j+1]; 2359 tmp0 = tmps[i0]; 2360 tmp1 = tmps[i1]; 2361 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2362 } 2363 if (j == nz-1){ 2364 tmp0 = tmps[vi[j]]; 2365 sum1 -= v1[j]*tmp0; 2366 } 2367 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2368 break; 2369 case 2 : 2370 sum1 = tmp[row]; 2371 sum2 = tmp[row-1]; 2372 v2 = aa + ad[row] + 1; 2373 for (j=0 ; j<nz-1; j+=2){ 2374 i0 = vi[j]; 2375 i1 = vi[j+1]; 2376 tmp0 = tmps[i0]; 2377 tmp1 = tmps[i1]; 2378 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2379 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2380 } 2381 if (j == nz-1){ 2382 tmp0 = tmps[vi[j]]; 2383 sum1 -= v1[j]* tmp0; 2384 sum2 -= v2[j+1]* tmp0; 2385 } 2386 2387 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2388 sum2 -= v2[0] * tmp0; 2389 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2390 break; 2391 case 3 : 2392 sum1 = tmp[row]; 2393 sum2 = tmp[row -1]; 2394 sum3 = tmp[row -2]; 2395 v2 = aa + ad[row] + 1; 2396 v3 = aa + ad[row -1] + 1; 2397 for (j=0 ; j<nz-1; j+=2){ 2398 i0 = vi[j]; 2399 i1 = vi[j+1]; 2400 tmp0 = tmps[i0]; 2401 tmp1 = tmps[i1]; 2402 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2403 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2404 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2405 } 2406 if (j== nz-1){ 2407 tmp0 = tmps[vi[j]]; 2408 sum1 -= v1[j] * tmp0; 2409 sum2 -= v2[j+1] * tmp0; 2410 sum3 -= v3[j+2] * tmp0; 2411 } 2412 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2413 sum2 -= v2[0]* tmp0; 2414 sum3 -= v3[1] * tmp0; 2415 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2416 sum3 -= v3[0]* tmp0; 2417 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2418 2419 break; 2420 case 4 : 2421 sum1 = tmp[row]; 2422 sum2 = tmp[row -1]; 2423 sum3 = tmp[row -2]; 2424 sum4 = tmp[row -3]; 2425 v2 = aa + ad[row]+1; 2426 v3 = aa + ad[row -1]+1; 2427 v4 = aa + ad[row -2]+1; 2428 2429 for (j=0 ; j<nz-1; j+=2){ 2430 i0 = vi[j]; 2431 i1 = vi[j+1]; 2432 tmp0 = tmps[i0]; 2433 tmp1 = tmps[i1]; 2434 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2435 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2436 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2437 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2438 } 2439 if (j== nz-1){ 2440 tmp0 = tmps[vi[j]]; 2441 sum1 -= v1[j] * tmp0; 2442 sum2 -= v2[j+1] * tmp0; 2443 sum3 -= v3[j+2] * tmp0; 2444 sum4 -= v4[j+3] * tmp0; 2445 } 2446 2447 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2448 sum2 -= v2[0] * tmp0; 2449 sum3 -= v3[1] * tmp0; 2450 sum4 -= v4[2] * tmp0; 2451 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2452 sum3 -= v3[0] * tmp0; 2453 sum4 -= v4[1] * tmp0; 2454 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2455 sum4 -= v4[0] * tmp0; 2456 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2457 break; 2458 case 5 : 2459 sum1 = tmp[row]; 2460 sum2 = tmp[row -1]; 2461 sum3 = tmp[row -2]; 2462 sum4 = tmp[row -3]; 2463 sum5 = tmp[row -4]; 2464 v2 = aa + ad[row]+1; 2465 v3 = aa + ad[row -1]+1; 2466 v4 = aa + ad[row -2]+1; 2467 v5 = aa + ad[row -3]+1; 2468 for (j=0 ; j<nz-1; j+=2){ 2469 i0 = vi[j]; 2470 i1 = vi[j+1]; 2471 tmp0 = tmps[i0]; 2472 tmp1 = tmps[i1]; 2473 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2474 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2475 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2476 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2477 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2478 } 2479 if (j==nz-1){ 2480 tmp0 = tmps[vi[j]]; 2481 sum1 -= v1[j] * tmp0; 2482 sum2 -= v2[j+1] * tmp0; 2483 sum3 -= v3[j+2] * tmp0; 2484 sum4 -= v4[j+3] * tmp0; 2485 sum5 -= v5[j+4] * tmp0; 2486 } 2487 2488 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2489 sum2 -= v2[0] * tmp0; 2490 sum3 -= v3[1] * tmp0; 2491 sum4 -= v4[2] * tmp0; 2492 sum5 -= v5[3] * tmp0; 2493 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2494 sum3 -= v3[0] * tmp0; 2495 sum4 -= v4[1] * tmp0; 2496 sum5 -= v5[2] * tmp0; 2497 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2498 sum4 -= v4[0] * tmp0; 2499 sum5 -= v5[1] * tmp0; 2500 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2501 sum5 -= v5[0] * tmp0; 2502 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2503 break; 2504 default: 2505 SETERRQ(PETSC_ERR_COR,"Node size not yet supported \n"); 2506 } 2507 } 2508 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2509 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2510 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2511 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2512 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2513 PetscFunctionReturn(0); 2514 } 2515 2516 2517 /* 2518 Makes a longer coloring[] array and calls the usual code with that 2519 */ 2520 #undef __FUNCT__ 2521 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2522 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2523 { 2524 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2525 PetscErrorCode ierr; 2526 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2527 PetscInt *colorused,i; 2528 ISColoringValue *newcolor; 2529 2530 PetscFunctionBegin; 2531 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2532 /* loop over inodes, marking a color for each column*/ 2533 row = 0; 2534 for (i=0; i<m; i++){ 2535 for (j=0; j<ns[i]; j++) { 2536 newcolor[row++] = coloring[i] + j*ncolors; 2537 } 2538 } 2539 2540 /* eliminate unneeded colors */ 2541 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2542 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2543 for (i=0; i<n; i++) { 2544 colorused[newcolor[i]] = 1; 2545 } 2546 2547 for (i=1; i<5*ncolors; i++) { 2548 colorused[i] += colorused[i-1]; 2549 } 2550 ncolors = colorused[5*ncolors-1]; 2551 for (i=0; i<n; i++) { 2552 newcolor[i] = colorused[newcolor[i]]-1; 2553 } 2554 ierr = PetscFree(colorused);CHKERRQ(ierr); 2555 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2556 ierr = PetscFree(coloring);CHKERRQ(ierr); 2557 PetscFunctionReturn(0); 2558 } 2559 2560 #include "../src/mat/blockinvert.h" 2561 2562 #undef __FUNCT__ 2563 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2564 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2565 { 2566 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2567 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2568 MatScalar *ibdiag,*bdiag,work[25]; 2569 PetscScalar *x,*xb,tmp4,tmp5,x1,x2,x3,x4,x5; 2570 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2571 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2572 PetscErrorCode ierr; 2573 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2574 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2575 2576 PetscFunctionBegin; 2577 if (omega != 1.0) SETERRQ(PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2578 if (fshift != 0.0) SETERRQ(PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2579 if (its > 1) { 2580 /* switch to non-inode version */ 2581 ierr = MatSOR_SeqAIJ(A,bb,omega,flag,fshift,its,lits,xx);CHKERRQ(ierr); 2582 PetscFunctionReturn(0); 2583 } 2584 2585 if (!a->inode.ibdiagvalid) { 2586 if (!a->inode.ibdiag) { 2587 /* calculate space needed for diagonal blocks */ 2588 for (i=0; i<m; i++) { 2589 cnt += sizes[i]*sizes[i]; 2590 } 2591 a->inode.bdiagsize = cnt; 2592 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2593 } 2594 2595 /* copy over the diagonal blocks and invert them */ 2596 ibdiag = a->inode.ibdiag; 2597 bdiag = a->inode.bdiag; 2598 cnt = 0; 2599 for (i=0, row = 0; i<m; i++) { 2600 for (j=0; j<sizes[i]; j++) { 2601 for (k=0; k<sizes[i]; k++) { 2602 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2603 } 2604 } 2605 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2606 2607 switch(sizes[i]) { 2608 case 1: 2609 /* Create matrix data structure */ 2610 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2611 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2612 break; 2613 case 2: 2614 ierr = Kernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2615 break; 2616 case 3: 2617 ierr = Kernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2618 break; 2619 case 4: 2620 ierr = Kernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2621 break; 2622 case 5: 2623 ierr = Kernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2624 break; 2625 default: 2626 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2627 } 2628 cnt += sizes[i]*sizes[i]; 2629 row += sizes[i]; 2630 } 2631 a->inode.ibdiagvalid = PETSC_TRUE; 2632 } 2633 ibdiag = a->inode.ibdiag; 2634 bdiag = a->inode.bdiag; 2635 2636 2637 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2638 if (flag & SOR_ZERO_INITIAL_GUESS) { 2639 PetscScalar *b; 2640 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2641 if (xx != bb) { 2642 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2643 } else { 2644 b = x; 2645 } 2646 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2647 2648 for (i=0, row=0; i<m; i++) { 2649 sz = diag[row] - ii[row]; 2650 v1 = a->a + ii[row]; 2651 idx = a->j + ii[row]; 2652 2653 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2654 switch (sizes[i]){ 2655 case 1: 2656 2657 sum1 = b[row]; 2658 for(n = 0; n<sz-1; n+=2) { 2659 i1 = idx[0]; 2660 i2 = idx[1]; 2661 idx += 2; 2662 tmp0 = x[i1]; 2663 tmp1 = x[i2]; 2664 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2665 } 2666 2667 if (n == sz-1){ 2668 tmp0 = x[*idx]; 2669 sum1 -= *v1 * tmp0; 2670 } 2671 x[row++] = sum1*(*ibdiag++); 2672 break; 2673 case 2: 2674 v2 = a->a + ii[row+1]; 2675 sum1 = b[row]; 2676 sum2 = b[row+1]; 2677 for(n = 0; n<sz-1; n+=2) { 2678 i1 = idx[0]; 2679 i2 = idx[1]; 2680 idx += 2; 2681 tmp0 = x[i1]; 2682 tmp1 = x[i2]; 2683 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2684 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2685 } 2686 2687 if (n == sz-1){ 2688 tmp0 = x[*idx]; 2689 sum1 -= v1[0] * tmp0; 2690 sum2 -= v2[0] * tmp0; 2691 } 2692 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2693 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2694 ibdiag += 4; 2695 break; 2696 case 3: 2697 v2 = a->a + ii[row+1]; 2698 v3 = a->a + ii[row+2]; 2699 sum1 = b[row]; 2700 sum2 = b[row+1]; 2701 sum3 = b[row+2]; 2702 for(n = 0; n<sz-1; n+=2) { 2703 i1 = idx[0]; 2704 i2 = idx[1]; 2705 idx += 2; 2706 tmp0 = x[i1]; 2707 tmp1 = x[i2]; 2708 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2709 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2710 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2711 } 2712 2713 if (n == sz-1){ 2714 tmp0 = x[*idx]; 2715 sum1 -= v1[0] * tmp0; 2716 sum2 -= v2[0] * tmp0; 2717 sum3 -= v3[0] * tmp0; 2718 } 2719 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2720 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2721 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2722 ibdiag += 9; 2723 break; 2724 case 4: 2725 v2 = a->a + ii[row+1]; 2726 v3 = a->a + ii[row+2]; 2727 v4 = a->a + ii[row+3]; 2728 sum1 = b[row]; 2729 sum2 = b[row+1]; 2730 sum3 = b[row+2]; 2731 sum4 = b[row+3]; 2732 for(n = 0; n<sz-1; n+=2) { 2733 i1 = idx[0]; 2734 i2 = idx[1]; 2735 idx += 2; 2736 tmp0 = x[i1]; 2737 tmp1 = x[i2]; 2738 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2739 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2740 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2741 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2742 } 2743 2744 if (n == sz-1){ 2745 tmp0 = x[*idx]; 2746 sum1 -= v1[0] * tmp0; 2747 sum2 -= v2[0] * tmp0; 2748 sum3 -= v3[0] * tmp0; 2749 sum4 -= v4[0] * tmp0; 2750 } 2751 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2752 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2753 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2754 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2755 ibdiag += 16; 2756 break; 2757 case 5: 2758 v2 = a->a + ii[row+1]; 2759 v3 = a->a + ii[row+2]; 2760 v4 = a->a + ii[row+3]; 2761 v5 = a->a + ii[row+4]; 2762 sum1 = b[row]; 2763 sum2 = b[row+1]; 2764 sum3 = b[row+2]; 2765 sum4 = b[row+3]; 2766 sum5 = b[row+4]; 2767 for(n = 0; n<sz-1; n+=2) { 2768 i1 = idx[0]; 2769 i2 = idx[1]; 2770 idx += 2; 2771 tmp0 = x[i1]; 2772 tmp1 = x[i2]; 2773 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2774 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2775 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2776 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2777 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2778 } 2779 2780 if (n == sz-1){ 2781 tmp0 = x[*idx]; 2782 sum1 -= v1[0] * tmp0; 2783 sum2 -= v2[0] * tmp0; 2784 sum3 -= v3[0] * tmp0; 2785 sum4 -= v4[0] * tmp0; 2786 sum5 -= v5[0] * tmp0; 2787 } 2788 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2789 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2790 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2791 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2792 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2793 ibdiag += 25; 2794 break; 2795 default: 2796 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2797 } 2798 } 2799 2800 xb = x; 2801 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 2802 } else xb = b; 2803 if ((flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP) && 2804 (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP)) { 2805 cnt = 0; 2806 for (i=0, row=0; i<m; i++) { 2807 2808 switch (sizes[i]){ 2809 case 1: 2810 x[row++] *= bdiag[cnt++]; 2811 break; 2812 case 2: 2813 x1 = x[row]; x2 = x[row+1]; 2814 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 2815 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 2816 x[row++] = tmp1; 2817 x[row++] = tmp2; 2818 cnt += 4; 2819 break; 2820 case 3: 2821 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 2822 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 2823 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 2824 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 2825 x[row++] = tmp1; 2826 x[row++] = tmp2; 2827 x[row++] = tmp3; 2828 cnt += 9; 2829 break; 2830 case 4: 2831 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 2832 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 2833 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 2834 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 2835 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 2836 x[row++] = tmp1; 2837 x[row++] = tmp2; 2838 x[row++] = tmp3; 2839 x[row++] = tmp4; 2840 cnt += 16; 2841 break; 2842 case 5: 2843 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 2844 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 2845 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 2846 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 2847 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 2848 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 2849 x[row++] = tmp1; 2850 x[row++] = tmp2; 2851 x[row++] = tmp3; 2852 x[row++] = tmp4; 2853 x[row++] = tmp5; 2854 cnt += 25; 2855 break; 2856 default: 2857 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2858 } 2859 } 2860 ierr = PetscLogFlops(m);CHKERRQ(ierr); 2861 } 2862 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 2863 2864 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 2865 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 2866 ibdiag -= sizes[i]*sizes[i]; 2867 sz = ii[row+1] - diag[row] - 1; 2868 v1 = a->a + diag[row] + 1; 2869 idx = a->j + diag[row] + 1; 2870 2871 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2872 switch (sizes[i]){ 2873 case 1: 2874 2875 sum1 = xb[row]; 2876 for(n = 0; n<sz-1; n+=2) { 2877 i1 = idx[0]; 2878 i2 = idx[1]; 2879 idx += 2; 2880 tmp0 = x[i1]; 2881 tmp1 = x[i2]; 2882 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2883 } 2884 2885 if (n == sz-1){ 2886 tmp0 = x[*idx]; 2887 sum1 -= *v1*tmp0; 2888 } 2889 x[row--] = sum1*(*ibdiag); 2890 break; 2891 2892 case 2: 2893 2894 sum1 = xb[row]; 2895 sum2 = xb[row-1]; 2896 /* note that sum1 is associated with the second of the two rows */ 2897 v2 = a->a + diag[row-1] + 2; 2898 for(n = 0; n<sz-1; n+=2) { 2899 i1 = idx[0]; 2900 i2 = idx[1]; 2901 idx += 2; 2902 tmp0 = x[i1]; 2903 tmp1 = x[i2]; 2904 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2905 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2906 } 2907 2908 if (n == sz-1){ 2909 tmp0 = x[*idx]; 2910 sum1 -= *v1*tmp0; 2911 sum2 -= *v2*tmp0; 2912 } 2913 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 2914 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 2915 break; 2916 case 3: 2917 2918 sum1 = xb[row]; 2919 sum2 = xb[row-1]; 2920 sum3 = xb[row-2]; 2921 v2 = a->a + diag[row-1] + 2; 2922 v3 = a->a + diag[row-2] + 3; 2923 for(n = 0; n<sz-1; n+=2) { 2924 i1 = idx[0]; 2925 i2 = idx[1]; 2926 idx += 2; 2927 tmp0 = x[i1]; 2928 tmp1 = x[i2]; 2929 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2930 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2931 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2932 } 2933 2934 if (n == sz-1){ 2935 tmp0 = x[*idx]; 2936 sum1 -= *v1*tmp0; 2937 sum2 -= *v2*tmp0; 2938 sum3 -= *v3*tmp0; 2939 } 2940 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 2941 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 2942 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 2943 break; 2944 case 4: 2945 2946 sum1 = xb[row]; 2947 sum2 = xb[row-1]; 2948 sum3 = xb[row-2]; 2949 sum4 = xb[row-3]; 2950 v2 = a->a + diag[row-1] + 2; 2951 v3 = a->a + diag[row-2] + 3; 2952 v4 = a->a + diag[row-3] + 4; 2953 for(n = 0; n<sz-1; n+=2) { 2954 i1 = idx[0]; 2955 i2 = idx[1]; 2956 idx += 2; 2957 tmp0 = x[i1]; 2958 tmp1 = x[i2]; 2959 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2960 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2961 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2962 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2963 } 2964 2965 if (n == sz-1){ 2966 tmp0 = x[*idx]; 2967 sum1 -= *v1*tmp0; 2968 sum2 -= *v2*tmp0; 2969 sum3 -= *v3*tmp0; 2970 sum4 -= *v4*tmp0; 2971 } 2972 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 2973 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 2974 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 2975 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 2976 break; 2977 case 5: 2978 2979 sum1 = xb[row]; 2980 sum2 = xb[row-1]; 2981 sum3 = xb[row-2]; 2982 sum4 = xb[row-3]; 2983 sum5 = xb[row-4]; 2984 v2 = a->a + diag[row-1] + 2; 2985 v3 = a->a + diag[row-2] + 3; 2986 v4 = a->a + diag[row-3] + 4; 2987 v5 = a->a + diag[row-4] + 5; 2988 for(n = 0; n<sz-1; n+=2) { 2989 i1 = idx[0]; 2990 i2 = idx[1]; 2991 idx += 2; 2992 tmp0 = x[i1]; 2993 tmp1 = x[i2]; 2994 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2995 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2996 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2997 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2998 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2999 } 3000 3001 if (n == sz-1){ 3002 tmp0 = x[*idx]; 3003 sum1 -= *v1*tmp0; 3004 sum2 -= *v2*tmp0; 3005 sum3 -= *v3*tmp0; 3006 sum4 -= *v4*tmp0; 3007 sum5 -= *v5*tmp0; 3008 } 3009 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3010 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3011 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3012 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3013 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3014 break; 3015 default: 3016 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3017 } 3018 } 3019 3020 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3021 } 3022 its--; 3023 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3024 if (bb != xx) {ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);} 3025 } 3026 if (flag & SOR_EISENSTAT) { 3027 const PetscScalar *b; 3028 MatScalar *t = a->inode.ssor_work; 3029 3030 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3031 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3032 /* 3033 Apply (U + D)^-1 where D is now the block diagonal 3034 */ 3035 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3036 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3037 ibdiag -= sizes[i]*sizes[i]; 3038 sz = ii[row+1] - diag[row] - 1; 3039 v1 = a->a + diag[row] + 1; 3040 idx = a->j + diag[row] + 1; 3041 CHKMEMQ; 3042 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3043 switch (sizes[i]){ 3044 case 1: 3045 3046 sum1 = b[row]; 3047 for(n = 0; n<sz-1; n+=2) { 3048 i1 = idx[0]; 3049 i2 = idx[1]; 3050 idx += 2; 3051 tmp0 = x[i1]; 3052 tmp1 = x[i2]; 3053 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3054 } 3055 3056 if (n == sz-1){ 3057 tmp0 = x[*idx]; 3058 sum1 -= *v1*tmp0; 3059 } 3060 x[row] = sum1*(*ibdiag);row--; 3061 break; 3062 3063 case 2: 3064 3065 sum1 = b[row]; 3066 sum2 = b[row-1]; 3067 /* note that sum1 is associated with the second of the two rows */ 3068 v2 = a->a + diag[row-1] + 2; 3069 for(n = 0; n<sz-1; n+=2) { 3070 i1 = idx[0]; 3071 i2 = idx[1]; 3072 idx += 2; 3073 tmp0 = x[i1]; 3074 tmp1 = x[i2]; 3075 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3076 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3077 } 3078 3079 if (n == sz-1){ 3080 tmp0 = x[*idx]; 3081 sum1 -= *v1*tmp0; 3082 sum2 -= *v2*tmp0; 3083 } 3084 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3085 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3086 row -= 2; 3087 break; 3088 case 3: 3089 3090 sum1 = b[row]; 3091 sum2 = b[row-1]; 3092 sum3 = b[row-2]; 3093 v2 = a->a + diag[row-1] + 2; 3094 v3 = a->a + diag[row-2] + 3; 3095 for(n = 0; n<sz-1; n+=2) { 3096 i1 = idx[0]; 3097 i2 = idx[1]; 3098 idx += 2; 3099 tmp0 = x[i1]; 3100 tmp1 = x[i2]; 3101 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3102 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3103 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3104 } 3105 3106 if (n == sz-1){ 3107 tmp0 = x[*idx]; 3108 sum1 -= *v1*tmp0; 3109 sum2 -= *v2*tmp0; 3110 sum3 -= *v3*tmp0; 3111 } 3112 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3113 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3114 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3115 row -= 3; 3116 break; 3117 case 4: 3118 3119 sum1 = b[row]; 3120 sum2 = b[row-1]; 3121 sum3 = b[row-2]; 3122 sum4 = b[row-3]; 3123 v2 = a->a + diag[row-1] + 2; 3124 v3 = a->a + diag[row-2] + 3; 3125 v4 = a->a + diag[row-3] + 4; 3126 for(n = 0; n<sz-1; n+=2) { 3127 i1 = idx[0]; 3128 i2 = idx[1]; 3129 idx += 2; 3130 tmp0 = x[i1]; 3131 tmp1 = x[i2]; 3132 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3133 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3134 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3135 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3136 } 3137 3138 if (n == sz-1){ 3139 tmp0 = x[*idx]; 3140 sum1 -= *v1*tmp0; 3141 sum2 -= *v2*tmp0; 3142 sum3 -= *v3*tmp0; 3143 sum4 -= *v4*tmp0; 3144 } 3145 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3146 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3147 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3148 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3149 row -= 4; 3150 break; 3151 case 5: 3152 3153 sum1 = b[row]; 3154 sum2 = b[row-1]; 3155 sum3 = b[row-2]; 3156 sum4 = b[row-3]; 3157 sum5 = b[row-4]; 3158 v2 = a->a + diag[row-1] + 2; 3159 v3 = a->a + diag[row-2] + 3; 3160 v4 = a->a + diag[row-3] + 4; 3161 v5 = a->a + diag[row-4] + 5; 3162 for(n = 0; n<sz-1; n+=2) { 3163 i1 = idx[0]; 3164 i2 = idx[1]; 3165 idx += 2; 3166 tmp0 = x[i1]; 3167 tmp1 = x[i2]; 3168 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3169 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3170 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3171 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3172 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3173 } 3174 3175 if (n == sz-1){ 3176 tmp0 = x[*idx]; 3177 sum1 -= *v1*tmp0; 3178 sum2 -= *v2*tmp0; 3179 sum3 -= *v3*tmp0; 3180 sum4 -= *v4*tmp0; 3181 sum5 -= *v5*tmp0; 3182 } 3183 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3184 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3185 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3186 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3187 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3188 row -= 5; 3189 break; 3190 default: 3191 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3192 } 3193 CHKMEMQ; 3194 } 3195 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3196 3197 /* 3198 t = b - D x where D is the block diagonal 3199 */ 3200 cnt = 0; 3201 for (i=0, row=0; i<m; i++) { 3202 CHKMEMQ; 3203 switch (sizes[i]){ 3204 case 1: 3205 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3206 break; 3207 case 2: 3208 x1 = x[row]; x2 = x[row+1]; 3209 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3210 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3211 t[row] = b[row] - tmp1; 3212 t[row+1] = b[row+1] - tmp2; row += 2; 3213 cnt += 4; 3214 break; 3215 case 3: 3216 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3217 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3218 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3219 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3220 t[row] = b[row] - tmp1; 3221 t[row+1] = b[row+1] - tmp2; 3222 t[row+2] = b[row+2] - tmp3; row += 3; 3223 cnt += 9; 3224 break; 3225 case 4: 3226 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3227 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3228 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3229 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3230 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3231 t[row] = b[row] - tmp1; 3232 t[row+1] = b[row+1] - tmp2; 3233 t[row+2] = b[row+2] - tmp3; 3234 t[row+3] = b[row+3] - tmp4; row += 4; 3235 cnt += 16; 3236 break; 3237 case 5: 3238 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3239 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3240 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3241 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3242 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3243 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3244 t[row] = b[row] - tmp1; 3245 t[row+1] = b[row+1] - tmp2; 3246 t[row+2] = b[row+2] - tmp3; 3247 t[row+3] = b[row+3] - tmp4; 3248 t[row+4] = b[row+4] - tmp5;row += 5; 3249 cnt += 25; 3250 break; 3251 default: 3252 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3253 } 3254 CHKMEMQ; 3255 } 3256 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3257 3258 3259 3260 /* 3261 Apply (L + D)^-1 where D is the block diagonal 3262 */ 3263 for (i=0, row=0; i<m; i++) { 3264 sz = diag[row] - ii[row]; 3265 v1 = a->a + ii[row]; 3266 idx = a->j + ii[row]; 3267 CHKMEMQ; 3268 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3269 switch (sizes[i]){ 3270 case 1: 3271 3272 sum1 = t[row]; 3273 for(n = 0; n<sz-1; n+=2) { 3274 i1 = idx[0]; 3275 i2 = idx[1]; 3276 idx += 2; 3277 tmp0 = t[i1]; 3278 tmp1 = t[i2]; 3279 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3280 } 3281 3282 if (n == sz-1){ 3283 tmp0 = t[*idx]; 3284 sum1 -= *v1 * tmp0; 3285 } 3286 x[row] += t[row] = sum1*(*ibdiag++); row++; 3287 break; 3288 case 2: 3289 v2 = a->a + ii[row+1]; 3290 sum1 = t[row]; 3291 sum2 = t[row+1]; 3292 for(n = 0; n<sz-1; n+=2) { 3293 i1 = idx[0]; 3294 i2 = idx[1]; 3295 idx += 2; 3296 tmp0 = t[i1]; 3297 tmp1 = t[i2]; 3298 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3299 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3300 } 3301 3302 if (n == sz-1){ 3303 tmp0 = t[*idx]; 3304 sum1 -= v1[0] * tmp0; 3305 sum2 -= v2[0] * tmp0; 3306 } 3307 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3308 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3309 ibdiag += 4; row += 2; 3310 break; 3311 case 3: 3312 v2 = a->a + ii[row+1]; 3313 v3 = a->a + ii[row+2]; 3314 sum1 = t[row]; 3315 sum2 = t[row+1]; 3316 sum3 = t[row+2]; 3317 for(n = 0; n<sz-1; n+=2) { 3318 i1 = idx[0]; 3319 i2 = idx[1]; 3320 idx += 2; 3321 tmp0 = t[i1]; 3322 tmp1 = t[i2]; 3323 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3324 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3325 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3326 } 3327 3328 if (n == sz-1){ 3329 tmp0 = t[*idx]; 3330 sum1 -= v1[0] * tmp0; 3331 sum2 -= v2[0] * tmp0; 3332 sum3 -= v3[0] * tmp0; 3333 } 3334 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3335 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3336 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3337 ibdiag += 9; row += 3; 3338 break; 3339 case 4: 3340 v2 = a->a + ii[row+1]; 3341 v3 = a->a + ii[row+2]; 3342 v4 = a->a + ii[row+3]; 3343 sum1 = t[row]; 3344 sum2 = t[row+1]; 3345 sum3 = t[row+2]; 3346 sum4 = t[row+3]; 3347 for(n = 0; n<sz-1; n+=2) { 3348 i1 = idx[0]; 3349 i2 = idx[1]; 3350 idx += 2; 3351 tmp0 = t[i1]; 3352 tmp1 = t[i2]; 3353 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3354 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3355 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3356 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3357 } 3358 3359 if (n == sz-1){ 3360 tmp0 = t[*idx]; 3361 sum1 -= v1[0] * tmp0; 3362 sum2 -= v2[0] * tmp0; 3363 sum3 -= v3[0] * tmp0; 3364 sum4 -= v4[0] * tmp0; 3365 } 3366 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3367 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3368 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3369 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3370 ibdiag += 16; row += 4; 3371 break; 3372 case 5: 3373 v2 = a->a + ii[row+1]; 3374 v3 = a->a + ii[row+2]; 3375 v4 = a->a + ii[row+3]; 3376 v5 = a->a + ii[row+4]; 3377 sum1 = t[row]; 3378 sum2 = t[row+1]; 3379 sum3 = t[row+2]; 3380 sum4 = t[row+3]; 3381 sum5 = t[row+4]; 3382 for(n = 0; n<sz-1; n+=2) { 3383 i1 = idx[0]; 3384 i2 = idx[1]; 3385 idx += 2; 3386 tmp0 = t[i1]; 3387 tmp1 = t[i2]; 3388 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3389 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3390 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3391 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3392 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3393 } 3394 3395 if (n == sz-1){ 3396 tmp0 = t[*idx]; 3397 sum1 -= v1[0] * tmp0; 3398 sum2 -= v2[0] * tmp0; 3399 sum3 -= v3[0] * tmp0; 3400 sum4 -= v4[0] * tmp0; 3401 sum5 -= v5[0] * tmp0; 3402 } 3403 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3404 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3405 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3406 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3407 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3408 ibdiag += 25; row += 5; 3409 break; 3410 default: 3411 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3412 } 3413 CHKMEMQ; 3414 } 3415 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3416 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3417 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3418 } 3419 PetscFunctionReturn(0); 3420 } 3421 3422 #undef __FUNCT__ 3423 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3424 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3425 { 3426 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3427 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3428 const MatScalar *bdiag = a->inode.bdiag; 3429 const PetscScalar *b; 3430 PetscErrorCode ierr; 3431 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3432 const PetscInt *sizes = a->inode.size; 3433 3434 PetscFunctionBegin; 3435 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3436 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3437 cnt = 0; 3438 for (i=0, row=0; i<m; i++) { 3439 switch (sizes[i]){ 3440 case 1: 3441 x[row] = b[row]*bdiag[cnt++];row++; 3442 break; 3443 case 2: 3444 x1 = b[row]; x2 = b[row+1]; 3445 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3446 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3447 x[row++] = tmp1; 3448 x[row++] = tmp2; 3449 cnt += 4; 3450 break; 3451 case 3: 3452 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3453 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3454 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3455 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3456 x[row++] = tmp1; 3457 x[row++] = tmp2; 3458 x[row++] = tmp3; 3459 cnt += 9; 3460 break; 3461 case 4: 3462 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3463 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3464 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3465 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3466 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3467 x[row++] = tmp1; 3468 x[row++] = tmp2; 3469 x[row++] = tmp3; 3470 x[row++] = tmp4; 3471 cnt += 16; 3472 break; 3473 case 5: 3474 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3475 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3476 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3477 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3478 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3479 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3480 x[row++] = tmp1; 3481 x[row++] = tmp2; 3482 x[row++] = tmp3; 3483 x[row++] = tmp4; 3484 x[row++] = tmp5; 3485 cnt += 25; 3486 break; 3487 default: 3488 SETERRQ1(PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3489 } 3490 } 3491 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3492 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3493 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3494 PetscFunctionReturn(0); 3495 } 3496 3497 /* 3498 samestructure indicates that the matrix has not changed its nonzero structure so we 3499 do not need to recompute the inodes 3500 */ 3501 #undef __FUNCT__ 3502 #define __FUNCT__ "Mat_CheckInode" 3503 PetscErrorCode Mat_CheckInode(Mat A,PetscTruth samestructure) 3504 { 3505 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3506 PetscErrorCode ierr; 3507 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3508 PetscTruth flag; 3509 3510 PetscFunctionBegin; 3511 if (!a->inode.use) PetscFunctionReturn(0); 3512 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3513 3514 3515 m = A->rmap->n; 3516 if (a->inode.size) {ns = a->inode.size;} 3517 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3518 3519 i = 0; 3520 node_count = 0; 3521 idx = a->j; 3522 ii = a->i; 3523 while (i < m){ /* For each row */ 3524 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3525 /* Limits the number of elements in a node to 'a->inode.limit' */ 3526 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3527 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3528 if (nzy != nzx) break; 3529 idy += nzx; /* Same nonzero pattern */ 3530 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3531 if (!flag) break; 3532 } 3533 ns[node_count++] = blk_size; 3534 idx += blk_size*nzx; 3535 i = j; 3536 } 3537 /* If not enough inodes found,, do not use inode version of the routines */ 3538 if (!a->inode.size && m && node_count > .9*m) { 3539 ierr = PetscFree(ns);CHKERRQ(ierr); 3540 a->inode.node_count = 0; 3541 a->inode.size = PETSC_NULL; 3542 a->inode.use = PETSC_FALSE; 3543 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3544 } else { 3545 A->ops->mult = MatMult_SeqAIJ_Inode; 3546 A->ops->sor = MatSOR_SeqAIJ_Inode; 3547 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 3548 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 3549 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 3550 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 3551 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 3552 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 3553 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 3554 A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode_inplace; 3555 a->inode.node_count = node_count; 3556 a->inode.size = ns; 3557 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3558 } 3559 PetscFunctionReturn(0); 3560 } 3561 3562 #define MatGetRow_FactoredLU(cols,nzl,nzu,nz,ai,aj,adiag,row) { \ 3563 PetscInt __k, *__vi; \ 3564 __vi = aj + ai[row]; \ 3565 for(__k=0;__k<nzl;__k++) cols[__k] = __vi[__k]; \ 3566 __vi = aj + adiag[row]; \ 3567 cols[nzl] = __vi[0];\ 3568 __vi = aj + adiag[row+1]+1;\ 3569 for(__k=0;__k<nzu;__k++) cols[nzl+1+__k] = __vi[__k];} 3570 3571 3572 /* 3573 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 3574 Modified from Mat_CheckInode(). 3575 3576 Input Parameters: 3577 + Mat A - ILU or LU matrix factor 3578 - samestructure - TURE indicates that the matrix has not changed its nonzero structure so we 3579 do not need to recompute the inodes 3580 */ 3581 #undef __FUNCT__ 3582 #define __FUNCT__ "Mat_CheckInode_FactorLU" 3583 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscTruth samestructure) 3584 { 3585 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3586 PetscErrorCode ierr; 3587 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 3588 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 3589 PetscTruth flag; 3590 3591 PetscFunctionBegin; 3592 if (!a->inode.use) PetscFunctionReturn(0); 3593 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3594 3595 m = A->rmap->n; 3596 if (a->inode.size) {ns = a->inode.size;} 3597 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3598 3599 i = 0; 3600 node_count = 0; 3601 while (i < m){ /* For each row */ 3602 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 3603 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 3604 nzx = nzl1 + nzu1 + 1; 3605 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 3606 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 3607 3608 /* Limits the number of elements in a node to 'a->inode.limit' */ 3609 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3610 nzl2 = ai[j+1] - ai[j]; 3611 nzu2 = adiag[j] - adiag[j+1] - 1; 3612 nzy = nzl2 + nzu2 + 1; 3613 if( nzy != nzx) break; 3614 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 3615 MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j); 3616 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3617 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 3618 ierr = PetscFree(cols2);CHKERRQ(ierr); 3619 } 3620 ns[node_count++] = blk_size; 3621 ierr = PetscFree(cols1);CHKERRQ(ierr); 3622 i = j; 3623 } 3624 /* If not enough inodes found,, do not use inode version of the routines */ 3625 if (!a->inode.size && m && node_count > .9*m) { 3626 ierr = PetscFree(ns);CHKERRQ(ierr); 3627 a->inode.node_count = 0; 3628 a->inode.size = PETSC_NULL; 3629 a->inode.use = PETSC_FALSE; 3630 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 3631 } else { 3632 A->ops->mult = 0; 3633 A->ops->sor = 0; 3634 A->ops->multadd = 0; 3635 A->ops->getrowij = 0; 3636 A->ops->restorerowij = 0; 3637 A->ops->getcolumnij = 0; 3638 A->ops->restorecolumnij = 0; 3639 A->ops->coloringpatch = 0; 3640 A->ops->multdiagonalblock = 0; 3641 A->ops->lufactornumeric = MatLUFactorNumeric_SeqAIJ_Inode; /* not done yet */ 3642 a->inode.node_count = node_count; 3643 a->inode.size = ns; 3644 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 3645 } 3646 PetscFunctionReturn(0); 3647 } 3648 3649 /* 3650 This is really ugly. if inodes are used this replaces the 3651 permutations with ones that correspond to rows/cols of the matrix 3652 rather then inode blocks 3653 */ 3654 #undef __FUNCT__ 3655 #define __FUNCT__ "MatInodeAdjustForInodes" 3656 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 3657 { 3658 PetscErrorCode ierr,(*f)(Mat,IS*,IS*); 3659 3660 PetscFunctionBegin; 3661 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeAdjustForInodes_C",(void (**)(void))&f);CHKERRQ(ierr); 3662 if (f) { 3663 ierr = (*f)(A,rperm,cperm);CHKERRQ(ierr); 3664 } 3665 PetscFunctionReturn(0); 3666 } 3667 3668 EXTERN_C_BEGIN 3669 #undef __FUNCT__ 3670 #define __FUNCT__ "MatAdjustForInodes_SeqAIJ_Inode" 3671 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 3672 { 3673 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 3674 PetscErrorCode ierr; 3675 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 3676 const PetscInt *ridx,*cidx; 3677 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 3678 PetscInt nslim_col,*ns_col; 3679 IS ris = *rperm,cis = *cperm; 3680 3681 PetscFunctionBegin; 3682 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 3683 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 3684 3685 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 3686 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 3687 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 3688 3689 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 3690 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 3691 3692 /* Form the inode structure for the rows of permuted matric using inv perm*/ 3693 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 3694 3695 /* Construct the permutations for rows*/ 3696 for (i=0,row = 0; i<nslim_row; ++i){ 3697 indx = ridx[i]; 3698 start_val = tns[indx]; 3699 end_val = tns[indx + 1]; 3700 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 3701 } 3702 3703 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 3704 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 3705 3706 /* Construct permutations for columns */ 3707 for (i=0,col=0; i<nslim_col; ++i){ 3708 indx = cidx[i]; 3709 start_val = tns[indx]; 3710 end_val = tns[indx + 1]; 3711 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 3712 } 3713 3714 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,rperm);CHKERRQ(ierr); 3715 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 3716 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,cperm);CHKERRQ(ierr); 3717 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 3718 3719 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 3720 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 3721 3722 ierr = PetscFree(ns_col);CHKERRQ(ierr); 3723 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 3724 ierr = ISDestroy(cis);CHKERRQ(ierr); 3725 ierr = ISDestroy(ris);CHKERRQ(ierr); 3726 ierr = PetscFree(tns);CHKERRQ(ierr); 3727 PetscFunctionReturn(0); 3728 } 3729 EXTERN_C_END 3730 3731 #undef __FUNCT__ 3732 #define __FUNCT__ "MatInodeGetInodeSizes" 3733 /*@C 3734 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 3735 3736 Collective on Mat 3737 3738 Input Parameter: 3739 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 3740 3741 Output Parameter: 3742 + node_count - no of inodes present in the matrix. 3743 . sizes - an array of size node_count,with sizes of each inode. 3744 - limit - the max size used to generate the inodes. 3745 3746 Level: advanced 3747 3748 Notes: This routine returns some internal storage information 3749 of the matrix, it is intended to be used by advanced users. 3750 It should be called after the matrix is assembled. 3751 The contents of the sizes[] array should not be changed. 3752 PETSC_NULL may be passed for information not requested. 3753 3754 .keywords: matrix, seqaij, get, inode 3755 3756 .seealso: MatGetInfo() 3757 @*/ 3758 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3759 { 3760 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 3761 3762 PetscFunctionBegin; 3763 if (!A->assembled) SETERRQ(PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 3764 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 3765 if (f) { 3766 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 3767 } 3768 PetscFunctionReturn(0); 3769 } 3770 3771 EXTERN_C_BEGIN 3772 #undef __FUNCT__ 3773 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 3774 PetscErrorCode PETSCMAT_DLLEXPORT MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 3775 { 3776 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3777 3778 PetscFunctionBegin; 3779 if (node_count) *node_count = a->inode.node_count; 3780 if (sizes) *sizes = a->inode.size; 3781 if (limit) *limit = a->inode.limit; 3782 PetscFunctionReturn(0); 3783 } 3784 EXTERN_C_END 3785