1 2 /* 3 This file provides high performance routines for the Inode format (compressed sparse row) 4 by taking advantage of rows with identical nonzero structure (I-nodes). 5 */ 6 #include <../src/mat/impls/aij/seq/aij.h> 7 8 #undef __FUNCT__ 9 #define __FUNCT__ "Mat_CreateColInode" 10 static PetscErrorCode Mat_CreateColInode(Mat A,PetscInt* size,PetscInt ** ns) 11 { 12 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 13 PetscErrorCode ierr; 14 PetscInt i,count,m,n,min_mn,*ns_row,*ns_col; 15 16 PetscFunctionBegin; 17 n = A->cmap->n; 18 m = A->rmap->n; 19 ns_row = a->inode.size; 20 21 min_mn = (m < n) ? m : n; 22 if (!ns) { 23 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++); 24 for(; count+1 < n; count++,i++); 25 if (count < n) { 26 i++; 27 } 28 *size = i; 29 PetscFunctionReturn(0); 30 } 31 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ns_col);CHKERRQ(ierr); 32 33 /* Use the same row structure wherever feasible. */ 34 for (count=0,i=0; count<min_mn; count+=ns_row[i],i++) { 35 ns_col[i] = ns_row[i]; 36 } 37 38 /* if m < n; pad up the remainder with inode_limit */ 39 for(; count+1 < n; count++,i++) { 40 ns_col[i] = 1; 41 } 42 /* The last node is the odd ball. padd it up with the remaining rows; */ 43 if (count < n) { 44 ns_col[i] = n - count; 45 i++; 46 } else if (count > n) { 47 /* Adjust for the over estimation */ 48 ns_col[i-1] += n - count; 49 } 50 *size = i; 51 *ns = ns_col; 52 PetscFunctionReturn(0); 53 } 54 55 56 /* 57 This builds symmetric version of nonzero structure, 58 */ 59 #undef __FUNCT__ 60 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Symmetric" 61 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Symmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 62 { 63 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 64 PetscErrorCode ierr; 65 PetscInt *work,*ia,*ja,*j,nz,nslim_row,nslim_col,m,row,col,*jmax,n; 66 PetscInt *tns,*tvc,*ns_row = a->inode.size,*ns_col,nsz,i1,i2,*ai= a->i,*aj = a->j; 67 68 PetscFunctionBegin; 69 nslim_row = a->inode.node_count; 70 m = A->rmap->n; 71 n = A->cmap->n; 72 if (m != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"MatGetRowIJ_SeqAIJ_Inode_Symmetric: Matrix should be square"); 73 74 /* Use the row_inode as column_inode */ 75 nslim_col = nslim_row; 76 ns_col = ns_row; 77 78 /* allocate space for reformated inode structure */ 79 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 80 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 81 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1]+ ns_row[i1]; 82 83 for (i1=0,col=0; i1<nslim_col; ++i1){ 84 nsz = ns_col[i1]; 85 for (i2=0; i2<nsz; ++i2,++col) 86 tvc[col] = i1; 87 } 88 /* allocate space for row pointers */ 89 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 90 *iia = ia; 91 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 92 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 93 94 /* determine the number of columns in each row */ 95 ia[0] = oshift; 96 for (i1=0,row=0 ; i1<nslim_row; row+=ns_row[i1],i1++) { 97 98 j = aj + ai[row] + ishift; 99 jmax = aj + ai[row+1] + ishift; 100 i2 = 0; 101 col = *j++ + ishift; 102 i2 = tvc[col]; 103 while (i2<i1 && j<jmax) { /* 1.[-xx-d-xx--] 2.[-xx-------],off-diagonal elemets */ 104 ia[i1+1]++; 105 ia[i2+1]++; 106 i2++; /* Start col of next node */ 107 while((j<jmax) && ((col=*j+ishift)<tns[i2])) ++j; 108 i2 = tvc[col]; 109 } 110 if(i2 == i1) ia[i2+1]++; /* now the diagonal element */ 111 } 112 113 /* shift ia[i] to point to next row */ 114 for (i1=1; i1<nslim_row+1; i1++) { 115 row = ia[i1-1]; 116 ia[i1] += row; 117 work[i1-1] = row - oshift; 118 } 119 120 /* allocate space for column pointers */ 121 nz = ia[nslim_row] + (!ishift); 122 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 123 *jja = ja; 124 125 /* loop over lower triangular part putting into ja */ 126 for (i1=0,row=0; i1<nslim_row; row += ns_row[i1],i1++) { 127 j = aj + ai[row] + ishift; 128 jmax = aj + ai[row+1] + ishift; 129 i2 = 0; /* Col inode index */ 130 col = *j++ + ishift; 131 i2 = tvc[col]; 132 while (i2<i1 && j<jmax) { 133 ja[work[i2]++] = i1 + oshift; 134 ja[work[i1]++] = i2 + oshift; 135 ++i2; 136 while((j<jmax) && ((col=*j+ishift)< tns[i2])) ++j; /* Skip rest col indices in this node */ 137 i2 = tvc[col]; 138 } 139 if (i2 == i1) ja[work[i1]++] = i2 + oshift; 140 141 } 142 ierr = PetscFree(work);CHKERRQ(ierr); 143 ierr = PetscFree(tns);CHKERRQ(ierr); 144 ierr = PetscFree(tvc);CHKERRQ(ierr); 145 PetscFunctionReturn(0); 146 } 147 148 /* 149 This builds nonsymmetric version of nonzero structure, 150 */ 151 #undef __FUNCT__ 152 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric" 153 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 154 { 155 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 156 PetscErrorCode ierr; 157 PetscInt *work,*ia,*ja,*j,nz,nslim_row,n,row,col,*ns_col,nslim_col; 158 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 159 160 PetscFunctionBegin; 161 nslim_row = a->inode.node_count; 162 n = A->cmap->n; 163 164 /* Create The column_inode for this matrix */ 165 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 166 167 /* allocate space for reformated column_inode structure */ 168 ierr = PetscMalloc((nslim_col +1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 169 ierr = PetscMalloc((n +1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 170 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 171 172 for (i1=0,col=0; i1<nslim_col; ++i1){ 173 nsz = ns_col[i1]; 174 for (i2=0; i2<nsz; ++i2,++col) 175 tvc[col] = i1; 176 } 177 /* allocate space for row pointers */ 178 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 179 *iia = ia; 180 ierr = PetscMemzero(ia,(nslim_row+1)*sizeof(PetscInt));CHKERRQ(ierr); 181 ierr = PetscMalloc((nslim_row+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 182 183 /* determine the number of columns in each row */ 184 ia[0] = oshift; 185 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 186 j = aj + ai[row] + ishift; 187 col = *j++ + ishift; 188 i2 = tvc[col]; 189 nz = ai[row+1] - ai[row]; 190 while (nz-- > 0) { /* off-diagonal elemets */ 191 ia[i1+1]++; 192 i2++; /* Start col of next node */ 193 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 194 if (nz > 0) i2 = tvc[col]; 195 } 196 } 197 198 /* shift ia[i] to point to next row */ 199 for (i1=1; i1<nslim_row+1; i1++) { 200 row = ia[i1-1]; 201 ia[i1] += row; 202 work[i1-1] = row - oshift; 203 } 204 205 /* allocate space for column pointers */ 206 nz = ia[nslim_row] + (!ishift); 207 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 208 *jja = ja; 209 210 /* loop over matrix putting into ja */ 211 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 212 j = aj + ai[row] + ishift; 213 i2 = 0; /* Col inode index */ 214 col = *j++ + ishift; 215 i2 = tvc[col]; 216 nz = ai[row+1] - ai[row]; 217 while (nz-- > 0) { 218 ja[work[i1]++] = i2 + oshift; 219 ++i2; 220 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 221 if (nz > 0) i2 = tvc[col]; 222 } 223 } 224 ierr = PetscFree(ns_col);CHKERRQ(ierr); 225 ierr = PetscFree(work);CHKERRQ(ierr); 226 ierr = PetscFree(tns);CHKERRQ(ierr); 227 ierr = PetscFree(tvc);CHKERRQ(ierr); 228 PetscFunctionReturn(0); 229 } 230 231 #undef __FUNCT__ 232 #define __FUNCT__ "MatGetRowIJ_SeqAIJ_Inode" 233 static PetscErrorCode MatGetRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done) 234 { 235 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 236 PetscErrorCode ierr; 237 238 PetscFunctionBegin; 239 *n = a->inode.node_count; 240 if (!ia) PetscFunctionReturn(0); 241 if (!blockcompressed) { 242 ierr = MatGetRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 243 } else if (symmetric) { 244 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 245 } else { 246 ierr = MatGetRowIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 247 } 248 PetscFunctionReturn(0); 249 } 250 251 #undef __FUNCT__ 252 #define __FUNCT__ "MatRestoreRowIJ_SeqAIJ_Inode" 253 static PetscErrorCode MatRestoreRowIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done) 254 { 255 PetscErrorCode ierr; 256 257 PetscFunctionBegin; 258 if (!ia) PetscFunctionReturn(0); 259 260 if (!blockcompressed) { 261 ierr = MatRestoreRowIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 262 } else { 263 ierr = PetscFree(*ia);CHKERRQ(ierr); 264 ierr = PetscFree(*ja);CHKERRQ(ierr); 265 } 266 267 PetscFunctionReturn(0); 268 } 269 270 /* ----------------------------------------------------------- */ 271 272 #undef __FUNCT__ 273 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric" 274 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(Mat A,PetscInt *iia[],PetscInt *jja[],PetscInt ishift,PetscInt oshift) 275 { 276 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 277 PetscErrorCode ierr; 278 PetscInt *work,*ia,*ja,*j,nz,nslim_row, n,row,col,*ns_col,nslim_col; 279 PetscInt *tns,*tvc,*ns_row = a->inode.size,nsz,i1,i2,*ai= a->i,*aj = a->j; 280 281 PetscFunctionBegin; 282 nslim_row = a->inode.node_count; 283 n = A->cmap->n; 284 285 /* Create The column_inode for this matrix */ 286 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 287 288 /* allocate space for reformated column_inode structure */ 289 ierr = PetscMalloc((nslim_col + 1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 290 ierr = PetscMalloc((n + 1)*sizeof(PetscInt),&tvc);CHKERRQ(ierr); 291 for (i1=0,tns[0]=0; i1<nslim_col; ++i1) tns[i1+1] = tns[i1] + ns_col[i1]; 292 293 for (i1=0,col=0; i1<nslim_col; ++i1){ 294 nsz = ns_col[i1]; 295 for (i2=0; i2<nsz; ++i2,++col) 296 tvc[col] = i1; 297 } 298 /* allocate space for column pointers */ 299 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&ia);CHKERRQ(ierr); 300 *iia = ia; 301 ierr = PetscMemzero(ia,(nslim_col+1)*sizeof(PetscInt));CHKERRQ(ierr); 302 ierr = PetscMalloc((nslim_col+1)*sizeof(PetscInt),&work);CHKERRQ(ierr); 303 304 /* determine the number of columns in each row */ 305 ia[0] = oshift; 306 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 307 j = aj + ai[row] + ishift; 308 col = *j++ + ishift; 309 i2 = tvc[col]; 310 nz = ai[row+1] - ai[row]; 311 while (nz-- > 0) { /* off-diagonal elemets */ 312 /* ia[i1+1]++; */ 313 ia[i2+1]++; 314 i2++; 315 while (((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 316 if (nz > 0) i2 = tvc[col]; 317 } 318 } 319 320 /* shift ia[i] to point to next col */ 321 for (i1=1; i1<nslim_col+1; i1++) { 322 col = ia[i1-1]; 323 ia[i1] += col; 324 work[i1-1] = col - oshift; 325 } 326 327 /* allocate space for column pointers */ 328 nz = ia[nslim_col] + (!ishift); 329 ierr = PetscMalloc(nz*sizeof(PetscInt),&ja);CHKERRQ(ierr); 330 *jja = ja; 331 332 /* loop over matrix putting into ja */ 333 for (i1=0,row=0; i1<nslim_row; row+=ns_row[i1],i1++) { 334 j = aj + ai[row] + ishift; 335 i2 = 0; /* Col inode index */ 336 col = *j++ + ishift; 337 i2 = tvc[col]; 338 nz = ai[row+1] - ai[row]; 339 while (nz-- > 0) { 340 /* ja[work[i1]++] = i2 + oshift; */ 341 ja[work[i2]++] = i1 + oshift; 342 i2++; 343 while(((col = *j++ + ishift) < tns[i2]) && nz > 0) {nz--;} 344 if (nz > 0) i2 = tvc[col]; 345 } 346 } 347 ierr = PetscFree(ns_col);CHKERRQ(ierr); 348 ierr = PetscFree(work);CHKERRQ(ierr); 349 ierr = PetscFree(tns);CHKERRQ(ierr); 350 ierr = PetscFree(tvc);CHKERRQ(ierr); 351 PetscFunctionReturn(0); 352 } 353 354 #undef __FUNCT__ 355 #define __FUNCT__ "MatGetColumnIJ_SeqAIJ_Inode" 356 static PetscErrorCode MatGetColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done) 357 { 358 PetscErrorCode ierr; 359 360 PetscFunctionBegin; 361 ierr = Mat_CreateColInode(A,n,PETSC_NULL);CHKERRQ(ierr); 362 if (!ia) PetscFunctionReturn(0); 363 364 if (!blockcompressed) { 365 ierr = MatGetColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 366 } else if (symmetric) { 367 /* Since the indices are symmetric it does'nt matter */ 368 ierr = MatGetRowIJ_SeqAIJ_Inode_Symmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 369 } else { 370 ierr = MatGetColumnIJ_SeqAIJ_Inode_Nonsymmetric(A,ia,ja,0,oshift);CHKERRQ(ierr); 371 } 372 PetscFunctionReturn(0); 373 } 374 375 #undef __FUNCT__ 376 #define __FUNCT__ "MatRestoreColumnIJ_SeqAIJ_Inode" 377 static PetscErrorCode MatRestoreColumnIJ_SeqAIJ_Inode(Mat A,PetscInt oshift,PetscBool symmetric,PetscBool blockcompressed,PetscInt *n,PetscInt *ia[],PetscInt *ja[],PetscBool *done) 378 { 379 PetscErrorCode ierr; 380 381 PetscFunctionBegin; 382 if (!ia) PetscFunctionReturn(0); 383 if (!blockcompressed) { 384 ierr = MatRestoreColumnIJ_SeqAIJ(A,oshift,symmetric,blockcompressed,n,ia,ja,done);CHKERRQ(ierr);; 385 } else { 386 ierr = PetscFree(*ia);CHKERRQ(ierr); 387 ierr = PetscFree(*ja);CHKERRQ(ierr); 388 } 389 PetscFunctionReturn(0); 390 } 391 392 /* ----------------------------------------------------------- */ 393 394 #undef __FUNCT__ 395 #define __FUNCT__ "MatMult_SeqAIJ_Inode" 396 static PetscErrorCode MatMult_SeqAIJ_Inode(Mat A,Vec xx,Vec yy) 397 { 398 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 399 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 400 PetscScalar *y; 401 const PetscScalar *x; 402 const MatScalar *v1,*v2,*v3,*v4,*v5; 403 PetscErrorCode ierr; 404 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz,nonzerorow=0; 405 406 #if defined(PETSC_HAVE_PRAGMA_DISJOINT) 407 #pragma disjoint(*x,*y,*v1,*v2,*v3,*v4,*v5) 408 #endif 409 410 PetscFunctionBegin; 411 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 412 node_max = a->inode.node_count; 413 ns = a->inode.size; /* Node Size array */ 414 ierr = VecGetArrayRead(xx,&x);CHKERRQ(ierr); 415 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 416 idx = a->j; 417 v1 = a->a; 418 ii = a->i; 419 420 for (i = 0,row = 0; i< node_max; ++i){ 421 nsz = ns[i]; 422 n = ii[1] - ii[0]; 423 nonzerorow += (n>0)*nsz; 424 ii += nsz; 425 PetscPrefetchBlock(idx+nsz*n,n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the indices for the block row after the current one */ 426 PetscPrefetchBlock(v1+nsz*n,nsz*n,0,PETSC_PREFETCH_HINT_NTA); /* Prefetch the values for the block row after the current one */ 427 sz = n; /* No of non zeros in this row */ 428 /* Switch on the size of Node */ 429 switch (nsz){ /* Each loop in 'case' is unrolled */ 430 case 1 : 431 sum1 = 0.; 432 433 for(n = 0; n< sz-1; n+=2) { 434 i1 = idx[0]; /* The instructions are ordered to */ 435 i2 = idx[1]; /* make the compiler's job easy */ 436 idx += 2; 437 tmp0 = x[i1]; 438 tmp1 = x[i2]; 439 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 440 } 441 442 if (n == sz-1){ /* Take care of the last nonzero */ 443 tmp0 = x[*idx++]; 444 sum1 += *v1++ * tmp0; 445 } 446 y[row++]=sum1; 447 break; 448 case 2: 449 sum1 = 0.; 450 sum2 = 0.; 451 v2 = v1 + n; 452 453 for (n = 0; n< sz-1; n+=2) { 454 i1 = idx[0]; 455 i2 = idx[1]; 456 idx += 2; 457 tmp0 = x[i1]; 458 tmp1 = x[i2]; 459 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 460 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 461 } 462 if (n == sz-1){ 463 tmp0 = x[*idx++]; 464 sum1 += *v1++ * tmp0; 465 sum2 += *v2++ * tmp0; 466 } 467 y[row++]=sum1; 468 y[row++]=sum2; 469 v1 =v2; /* Since the next block to be processed starts there*/ 470 idx +=sz; 471 break; 472 case 3: 473 sum1 = 0.; 474 sum2 = 0.; 475 sum3 = 0.; 476 v2 = v1 + n; 477 v3 = v2 + n; 478 479 for (n = 0; n< sz-1; n+=2) { 480 i1 = idx[0]; 481 i2 = idx[1]; 482 idx += 2; 483 tmp0 = x[i1]; 484 tmp1 = x[i2]; 485 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 486 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 487 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 488 } 489 if (n == sz-1){ 490 tmp0 = x[*idx++]; 491 sum1 += *v1++ * tmp0; 492 sum2 += *v2++ * tmp0; 493 sum3 += *v3++ * tmp0; 494 } 495 y[row++]=sum1; 496 y[row++]=sum2; 497 y[row++]=sum3; 498 v1 =v3; /* Since the next block to be processed starts there*/ 499 idx +=2*sz; 500 break; 501 case 4: 502 sum1 = 0.; 503 sum2 = 0.; 504 sum3 = 0.; 505 sum4 = 0.; 506 v2 = v1 + n; 507 v3 = v2 + n; 508 v4 = v3 + n; 509 510 for (n = 0; n< sz-1; n+=2) { 511 i1 = idx[0]; 512 i2 = idx[1]; 513 idx += 2; 514 tmp0 = x[i1]; 515 tmp1 = x[i2]; 516 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 517 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 518 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 519 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 520 } 521 if (n == sz-1){ 522 tmp0 = x[*idx++]; 523 sum1 += *v1++ * tmp0; 524 sum2 += *v2++ * tmp0; 525 sum3 += *v3++ * tmp0; 526 sum4 += *v4++ * tmp0; 527 } 528 y[row++]=sum1; 529 y[row++]=sum2; 530 y[row++]=sum3; 531 y[row++]=sum4; 532 v1 =v4; /* Since the next block to be processed starts there*/ 533 idx +=3*sz; 534 break; 535 case 5: 536 sum1 = 0.; 537 sum2 = 0.; 538 sum3 = 0.; 539 sum4 = 0.; 540 sum5 = 0.; 541 v2 = v1 + n; 542 v3 = v2 + n; 543 v4 = v3 + n; 544 v5 = v4 + n; 545 546 for (n = 0; n<sz-1; n+=2) { 547 i1 = idx[0]; 548 i2 = idx[1]; 549 idx += 2; 550 tmp0 = x[i1]; 551 tmp1 = x[i2]; 552 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 553 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 554 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 555 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 556 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 557 } 558 if (n == sz-1){ 559 tmp0 = x[*idx++]; 560 sum1 += *v1++ * tmp0; 561 sum2 += *v2++ * tmp0; 562 sum3 += *v3++ * tmp0; 563 sum4 += *v4++ * tmp0; 564 sum5 += *v5++ * tmp0; 565 } 566 y[row++]=sum1; 567 y[row++]=sum2; 568 y[row++]=sum3; 569 y[row++]=sum4; 570 y[row++]=sum5; 571 v1 =v5; /* Since the next block to be processed starts there */ 572 idx +=4*sz; 573 break; 574 default : 575 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 576 } 577 } 578 ierr = VecRestoreArrayRead(xx,&x);CHKERRQ(ierr); 579 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 580 ierr = PetscLogFlops(2.0*a->nz - nonzerorow);CHKERRQ(ierr); 581 PetscFunctionReturn(0); 582 } 583 /* ----------------------------------------------------------- */ 584 /* Almost same code as the MatMult_SeqAIJ_Inode() */ 585 #undef __FUNCT__ 586 #define __FUNCT__ "MatMultAdd_SeqAIJ_Inode" 587 static PetscErrorCode MatMultAdd_SeqAIJ_Inode(Mat A,Vec xx,Vec zz,Vec yy) 588 { 589 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 590 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1; 591 MatScalar *v1,*v2,*v3,*v4,*v5; 592 PetscScalar *x,*y,*z,*zt; 593 PetscErrorCode ierr; 594 PetscInt *idx,i1,i2,n,i,row,node_max,*ns,*ii,nsz,sz; 595 596 PetscFunctionBegin; 597 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 598 node_max = a->inode.node_count; 599 ns = a->inode.size; /* Node Size array */ 600 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 601 ierr = VecGetArray(yy,&y);CHKERRQ(ierr); 602 if (zz != yy) { 603 ierr = VecGetArray(zz,&z);CHKERRQ(ierr); 604 } else { 605 z = y; 606 } 607 zt = z; 608 609 idx = a->j; 610 v1 = a->a; 611 ii = a->i; 612 613 for (i = 0,row = 0; i< node_max; ++i){ 614 nsz = ns[i]; 615 n = ii[1] - ii[0]; 616 ii += nsz; 617 sz = n; /* No of non zeros in this row */ 618 /* Switch on the size of Node */ 619 switch (nsz){ /* Each loop in 'case' is unrolled */ 620 case 1 : 621 sum1 = *zt++; 622 623 for(n = 0; n< sz-1; n+=2) { 624 i1 = idx[0]; /* The instructions are ordered to */ 625 i2 = idx[1]; /* make the compiler's job easy */ 626 idx += 2; 627 tmp0 = x[i1]; 628 tmp1 = x[i2]; 629 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 630 } 631 632 if(n == sz-1){ /* Take care of the last nonzero */ 633 tmp0 = x[*idx++]; 634 sum1 += *v1++ * tmp0; 635 } 636 y[row++]=sum1; 637 break; 638 case 2: 639 sum1 = *zt++; 640 sum2 = *zt++; 641 v2 = v1 + n; 642 643 for(n = 0; n< sz-1; n+=2) { 644 i1 = idx[0]; 645 i2 = idx[1]; 646 idx += 2; 647 tmp0 = x[i1]; 648 tmp1 = x[i2]; 649 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 650 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 651 } 652 if(n == sz-1){ 653 tmp0 = x[*idx++]; 654 sum1 += *v1++ * tmp0; 655 sum2 += *v2++ * tmp0; 656 } 657 y[row++]=sum1; 658 y[row++]=sum2; 659 v1 =v2; /* Since the next block to be processed starts there*/ 660 idx +=sz; 661 break; 662 case 3: 663 sum1 = *zt++; 664 sum2 = *zt++; 665 sum3 = *zt++; 666 v2 = v1 + n; 667 v3 = v2 + n; 668 669 for (n = 0; n< sz-1; n+=2) { 670 i1 = idx[0]; 671 i2 = idx[1]; 672 idx += 2; 673 tmp0 = x[i1]; 674 tmp1 = x[i2]; 675 sum1 += v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 676 sum2 += v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 677 sum3 += v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 678 } 679 if (n == sz-1){ 680 tmp0 = x[*idx++]; 681 sum1 += *v1++ * tmp0; 682 sum2 += *v2++ * tmp0; 683 sum3 += *v3++ * tmp0; 684 } 685 y[row++]=sum1; 686 y[row++]=sum2; 687 y[row++]=sum3; 688 v1 =v3; /* Since the next block to be processed starts there*/ 689 idx +=2*sz; 690 break; 691 case 4: 692 sum1 = *zt++; 693 sum2 = *zt++; 694 sum3 = *zt++; 695 sum4 = *zt++; 696 v2 = v1 + n; 697 v3 = v2 + n; 698 v4 = v3 + n; 699 700 for (n = 0; n< sz-1; n+=2) { 701 i1 = idx[0]; 702 i2 = idx[1]; 703 idx += 2; 704 tmp0 = x[i1]; 705 tmp1 = x[i2]; 706 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 707 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 708 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 709 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 710 } 711 if (n == sz-1){ 712 tmp0 = x[*idx++]; 713 sum1 += *v1++ * tmp0; 714 sum2 += *v2++ * tmp0; 715 sum3 += *v3++ * tmp0; 716 sum4 += *v4++ * tmp0; 717 } 718 y[row++]=sum1; 719 y[row++]=sum2; 720 y[row++]=sum3; 721 y[row++]=sum4; 722 v1 =v4; /* Since the next block to be processed starts there*/ 723 idx +=3*sz; 724 break; 725 case 5: 726 sum1 = *zt++; 727 sum2 = *zt++; 728 sum3 = *zt++; 729 sum4 = *zt++; 730 sum5 = *zt++; 731 v2 = v1 + n; 732 v3 = v2 + n; 733 v4 = v3 + n; 734 v5 = v4 + n; 735 736 for (n = 0; n<sz-1; n+=2) { 737 i1 = idx[0]; 738 i2 = idx[1]; 739 idx += 2; 740 tmp0 = x[i1]; 741 tmp1 = x[i2]; 742 sum1 += v1[0] * tmp0 + v1[1] *tmp1; v1 += 2; 743 sum2 += v2[0] * tmp0 + v2[1] *tmp1; v2 += 2; 744 sum3 += v3[0] * tmp0 + v3[1] *tmp1; v3 += 2; 745 sum4 += v4[0] * tmp0 + v4[1] *tmp1; v4 += 2; 746 sum5 += v5[0] * tmp0 + v5[1] *tmp1; v5 += 2; 747 } 748 if(n == sz-1){ 749 tmp0 = x[*idx++]; 750 sum1 += *v1++ * tmp0; 751 sum2 += *v2++ * tmp0; 752 sum3 += *v3++ * tmp0; 753 sum4 += *v4++ * tmp0; 754 sum5 += *v5++ * tmp0; 755 } 756 y[row++]=sum1; 757 y[row++]=sum2; 758 y[row++]=sum3; 759 y[row++]=sum4; 760 y[row++]=sum5; 761 v1 =v5; /* Since the next block to be processed starts there */ 762 idx +=4*sz; 763 break; 764 default : 765 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported"); 766 } 767 } 768 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 769 ierr = VecRestoreArray(yy,&y);CHKERRQ(ierr); 770 if (zz != yy) { 771 ierr = VecRestoreArray(zz,&z);CHKERRQ(ierr); 772 } 773 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 774 PetscFunctionReturn(0); 775 } 776 777 /* ----------------------------------------------------------- */ 778 #undef __FUNCT__ 779 #define __FUNCT__ "MatSolve_SeqAIJ_Inode_inplace" 780 PetscErrorCode MatSolve_SeqAIJ_Inode_inplace(Mat A,Vec bb,Vec xx) 781 { 782 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 783 IS iscol = a->col,isrow = a->row; 784 PetscErrorCode ierr; 785 const PetscInt *r,*c,*rout,*cout; 786 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 787 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 788 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 789 PetscScalar sum1,sum2,sum3,sum4,sum5; 790 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 791 const PetscScalar *b; 792 793 PetscFunctionBegin; 794 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 795 node_max = a->inode.node_count; 796 ns = a->inode.size; /* Node Size array */ 797 798 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 799 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 800 tmp = a->solve_work; 801 802 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 803 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 804 805 /* forward solve the lower triangular */ 806 tmps = tmp ; 807 aa = a_a ; 808 aj = a_j ; 809 ad = a->diag; 810 811 for (i = 0,row = 0; i< node_max; ++i){ 812 nsz = ns[i]; 813 aii = ai[row]; 814 v1 = aa + aii; 815 vi = aj + aii; 816 nz = ad[row]- aii; 817 if (i < node_max-1) { 818 /* Prefetch the block after the current one, the prefetch itself can't cause a memory error, 819 * but our indexing to determine it's size could. */ 820 PetscPrefetchBlock(aj+ai[row+nsz],ad[row+nsz]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 821 /* In my tests, it seems to be better to fetch entire rows instead of just the lower-triangular part */ 822 PetscPrefetchBlock(aa+ai[row+nsz],ad[row+nsz+ns[i+1]-1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 823 /* for (j=0; j<ns[i+1]; j++) PetscPrefetchBlock(aa+ai[row+nsz+j],ad[row+nsz+j]-ai[row+nsz+j],0,0); */ 824 } 825 826 switch (nsz){ /* Each loop in 'case' is unrolled */ 827 case 1 : 828 sum1 = b[*r++]; 829 for(j=0; j<nz-1; j+=2){ 830 i0 = vi[0]; 831 i1 = vi[1]; 832 vi +=2; 833 tmp0 = tmps[i0]; 834 tmp1 = tmps[i1]; 835 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 836 } 837 if(j == nz-1){ 838 tmp0 = tmps[*vi++]; 839 sum1 -= *v1++ *tmp0; 840 } 841 tmp[row ++]=sum1; 842 break; 843 case 2: 844 sum1 = b[*r++]; 845 sum2 = b[*r++]; 846 v2 = aa + ai[row+1]; 847 848 for(j=0; j<nz-1; j+=2){ 849 i0 = vi[0]; 850 i1 = vi[1]; 851 vi +=2; 852 tmp0 = tmps[i0]; 853 tmp1 = tmps[i1]; 854 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 855 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 856 } 857 if(j == nz-1){ 858 tmp0 = tmps[*vi++]; 859 sum1 -= *v1++ *tmp0; 860 sum2 -= *v2++ *tmp0; 861 } 862 sum2 -= *v2++ * sum1; 863 tmp[row ++]=sum1; 864 tmp[row ++]=sum2; 865 break; 866 case 3: 867 sum1 = b[*r++]; 868 sum2 = b[*r++]; 869 sum3 = b[*r++]; 870 v2 = aa + ai[row+1]; 871 v3 = aa + ai[row+2]; 872 873 for (j=0; j<nz-1; j+=2){ 874 i0 = vi[0]; 875 i1 = vi[1]; 876 vi +=2; 877 tmp0 = tmps[i0]; 878 tmp1 = tmps[i1]; 879 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 880 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 881 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 882 } 883 if (j == nz-1){ 884 tmp0 = tmps[*vi++]; 885 sum1 -= *v1++ *tmp0; 886 sum2 -= *v2++ *tmp0; 887 sum3 -= *v3++ *tmp0; 888 } 889 sum2 -= *v2++ * sum1; 890 sum3 -= *v3++ * sum1; 891 sum3 -= *v3++ * sum2; 892 tmp[row ++]=sum1; 893 tmp[row ++]=sum2; 894 tmp[row ++]=sum3; 895 break; 896 897 case 4: 898 sum1 = b[*r++]; 899 sum2 = b[*r++]; 900 sum3 = b[*r++]; 901 sum4 = b[*r++]; 902 v2 = aa + ai[row+1]; 903 v3 = aa + ai[row+2]; 904 v4 = aa + ai[row+3]; 905 906 for (j=0; j<nz-1; j+=2){ 907 i0 = vi[0]; 908 i1 = vi[1]; 909 vi +=2; 910 tmp0 = tmps[i0]; 911 tmp1 = tmps[i1]; 912 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 913 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 914 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 915 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 916 } 917 if (j == nz-1){ 918 tmp0 = tmps[*vi++]; 919 sum1 -= *v1++ *tmp0; 920 sum2 -= *v2++ *tmp0; 921 sum3 -= *v3++ *tmp0; 922 sum4 -= *v4++ *tmp0; 923 } 924 sum2 -= *v2++ * sum1; 925 sum3 -= *v3++ * sum1; 926 sum4 -= *v4++ * sum1; 927 sum3 -= *v3++ * sum2; 928 sum4 -= *v4++ * sum2; 929 sum4 -= *v4++ * sum3; 930 931 tmp[row ++]=sum1; 932 tmp[row ++]=sum2; 933 tmp[row ++]=sum3; 934 tmp[row ++]=sum4; 935 break; 936 case 5: 937 sum1 = b[*r++]; 938 sum2 = b[*r++]; 939 sum3 = b[*r++]; 940 sum4 = b[*r++]; 941 sum5 = b[*r++]; 942 v2 = aa + ai[row+1]; 943 v3 = aa + ai[row+2]; 944 v4 = aa + ai[row+3]; 945 v5 = aa + ai[row+4]; 946 947 for (j=0; j<nz-1; j+=2){ 948 i0 = vi[0]; 949 i1 = vi[1]; 950 vi +=2; 951 tmp0 = tmps[i0]; 952 tmp1 = tmps[i1]; 953 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 954 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 955 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 956 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 957 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 958 } 959 if (j == nz-1){ 960 tmp0 = tmps[*vi++]; 961 sum1 -= *v1++ *tmp0; 962 sum2 -= *v2++ *tmp0; 963 sum3 -= *v3++ *tmp0; 964 sum4 -= *v4++ *tmp0; 965 sum5 -= *v5++ *tmp0; 966 } 967 968 sum2 -= *v2++ * sum1; 969 sum3 -= *v3++ * sum1; 970 sum4 -= *v4++ * sum1; 971 sum5 -= *v5++ * sum1; 972 sum3 -= *v3++ * sum2; 973 sum4 -= *v4++ * sum2; 974 sum5 -= *v5++ * sum2; 975 sum4 -= *v4++ * sum3; 976 sum5 -= *v5++ * sum3; 977 sum5 -= *v5++ * sum4; 978 979 tmp[row ++]=sum1; 980 tmp[row ++]=sum2; 981 tmp[row ++]=sum3; 982 tmp[row ++]=sum4; 983 tmp[row ++]=sum5; 984 break; 985 default: 986 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 987 } 988 } 989 /* backward solve the upper triangular */ 990 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 991 nsz = ns[i]; 992 aii = ai[row+1] -1; 993 v1 = aa + aii; 994 vi = aj + aii; 995 nz = aii- ad[row]; 996 switch (nsz){ /* Each loop in 'case' is unrolled */ 997 case 1 : 998 sum1 = tmp[row]; 999 1000 for(j=nz ; j>1; j-=2){ 1001 vi -=2; 1002 i0 = vi[2]; 1003 i1 = vi[1]; 1004 tmp0 = tmps[i0]; 1005 tmp1 = tmps[i1]; 1006 v1 -= 2; 1007 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1008 } 1009 if (j==1){ 1010 tmp0 = tmps[*vi--]; 1011 sum1 -= *v1-- * tmp0; 1012 } 1013 x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1014 break; 1015 case 2 : 1016 sum1 = tmp[row]; 1017 sum2 = tmp[row -1]; 1018 v2 = aa + ai[row]-1; 1019 for (j=nz ; j>1; j-=2){ 1020 vi -=2; 1021 i0 = vi[2]; 1022 i1 = vi[1]; 1023 tmp0 = tmps[i0]; 1024 tmp1 = tmps[i1]; 1025 v1 -= 2; 1026 v2 -= 2; 1027 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1028 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1029 } 1030 if (j==1){ 1031 tmp0 = tmps[*vi--]; 1032 sum1 -= *v1-- * tmp0; 1033 sum2 -= *v2-- * tmp0; 1034 } 1035 1036 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1037 sum2 -= *v2-- * tmp0; 1038 x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1039 break; 1040 case 3 : 1041 sum1 = tmp[row]; 1042 sum2 = tmp[row -1]; 1043 sum3 = tmp[row -2]; 1044 v2 = aa + ai[row]-1; 1045 v3 = aa + ai[row -1]-1; 1046 for (j=nz ; j>1; j-=2){ 1047 vi -=2; 1048 i0 = vi[2]; 1049 i1 = vi[1]; 1050 tmp0 = tmps[i0]; 1051 tmp1 = tmps[i1]; 1052 v1 -= 2; 1053 v2 -= 2; 1054 v3 -= 2; 1055 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1056 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1057 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1058 } 1059 if (j==1){ 1060 tmp0 = tmps[*vi--]; 1061 sum1 -= *v1-- * tmp0; 1062 sum2 -= *v2-- * tmp0; 1063 sum3 -= *v3-- * tmp0; 1064 } 1065 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1066 sum2 -= *v2-- * tmp0; 1067 sum3 -= *v3-- * tmp0; 1068 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1069 sum3 -= *v3-- * tmp0; 1070 x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1071 1072 break; 1073 case 4 : 1074 sum1 = tmp[row]; 1075 sum2 = tmp[row -1]; 1076 sum3 = tmp[row -2]; 1077 sum4 = tmp[row -3]; 1078 v2 = aa + ai[row]-1; 1079 v3 = aa + ai[row -1]-1; 1080 v4 = aa + ai[row -2]-1; 1081 1082 for (j=nz ; j>1; j-=2){ 1083 vi -=2; 1084 i0 = vi[2]; 1085 i1 = vi[1]; 1086 tmp0 = tmps[i0]; 1087 tmp1 = tmps[i1]; 1088 v1 -= 2; 1089 v2 -= 2; 1090 v3 -= 2; 1091 v4 -= 2; 1092 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1093 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1094 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1095 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1096 } 1097 if (j==1){ 1098 tmp0 = tmps[*vi--]; 1099 sum1 -= *v1-- * tmp0; 1100 sum2 -= *v2-- * tmp0; 1101 sum3 -= *v3-- * tmp0; 1102 sum4 -= *v4-- * tmp0; 1103 } 1104 1105 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1106 sum2 -= *v2-- * tmp0; 1107 sum3 -= *v3-- * tmp0; 1108 sum4 -= *v4-- * tmp0; 1109 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1110 sum3 -= *v3-- * tmp0; 1111 sum4 -= *v4-- * tmp0; 1112 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1113 sum4 -= *v4-- * tmp0; 1114 x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1115 break; 1116 case 5 : 1117 sum1 = tmp[row]; 1118 sum2 = tmp[row -1]; 1119 sum3 = tmp[row -2]; 1120 sum4 = tmp[row -3]; 1121 sum5 = tmp[row -4]; 1122 v2 = aa + ai[row]-1; 1123 v3 = aa + ai[row -1]-1; 1124 v4 = aa + ai[row -2]-1; 1125 v5 = aa + ai[row -3]-1; 1126 for (j=nz ; j>1; j-=2){ 1127 vi -= 2; 1128 i0 = vi[2]; 1129 i1 = vi[1]; 1130 tmp0 = tmps[i0]; 1131 tmp1 = tmps[i1]; 1132 v1 -= 2; 1133 v2 -= 2; 1134 v3 -= 2; 1135 v4 -= 2; 1136 v5 -= 2; 1137 sum1 -= v1[2] * tmp0 + v1[1] * tmp1; 1138 sum2 -= v2[2] * tmp0 + v2[1] * tmp1; 1139 sum3 -= v3[2] * tmp0 + v3[1] * tmp1; 1140 sum4 -= v4[2] * tmp0 + v4[1] * tmp1; 1141 sum5 -= v5[2] * tmp0 + v5[1] * tmp1; 1142 } 1143 if (j==1){ 1144 tmp0 = tmps[*vi--]; 1145 sum1 -= *v1-- * tmp0; 1146 sum2 -= *v2-- * tmp0; 1147 sum3 -= *v3-- * tmp0; 1148 sum4 -= *v4-- * tmp0; 1149 sum5 -= *v5-- * tmp0; 1150 } 1151 1152 tmp0 = x[*c--] = tmp[row] = sum1*a_a[ad[row]]; row--; 1153 sum2 -= *v2-- * tmp0; 1154 sum3 -= *v3-- * tmp0; 1155 sum4 -= *v4-- * tmp0; 1156 sum5 -= *v5-- * tmp0; 1157 tmp0 = x[*c--] = tmp[row] = sum2*a_a[ad[row]]; row--; 1158 sum3 -= *v3-- * tmp0; 1159 sum4 -= *v4-- * tmp0; 1160 sum5 -= *v5-- * tmp0; 1161 tmp0 = x[*c--] = tmp[row] = sum3*a_a[ad[row]]; row--; 1162 sum4 -= *v4-- * tmp0; 1163 sum5 -= *v5-- * tmp0; 1164 tmp0 = x[*c--] = tmp[row] = sum4*a_a[ad[row]]; row--; 1165 sum5 -= *v5-- * tmp0; 1166 x[*c--] = tmp[row] = sum5*a_a[ad[row]]; row--; 1167 break; 1168 default: 1169 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 1170 } 1171 } 1172 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1173 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1174 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1176 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 1177 PetscFunctionReturn(0); 1178 } 1179 1180 #undef __FUNCT__ 1181 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode" 1182 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode(Mat B,Mat A,const MatFactorInfo *info) 1183 { 1184 Mat C=B; 1185 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data,*b=(Mat_SeqAIJ *)C->data; 1186 IS isrow = b->row,isicol = b->icol; 1187 PetscErrorCode ierr; 1188 const PetscInt *r,*ic,*ics; 1189 const PetscInt n=A->rmap->n,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*bdiag=b->diag; 1190 PetscInt i,j,k,nz,nzL,row,*pj; 1191 const PetscInt *ajtmp,*bjtmp; 1192 MatScalar *pc,*pc1,*pc2,*pc3,*pc4,mul1,mul2,mul3,mul4,*pv,*rtmp1,*rtmp2,*rtmp3,*rtmp4; 1193 const MatScalar *aa=a->a,*v,*v1,*v2,*v3,*v4; 1194 FactorShiftCtx sctx; 1195 const PetscInt *ddiag; 1196 PetscReal rs; 1197 MatScalar d; 1198 PetscInt inod,nodesz,node_max,col; 1199 const PetscInt *ns; 1200 PetscInt *tmp_vec1,*tmp_vec2,*nsmap; 1201 1202 PetscFunctionBegin; 1203 /* MatPivotSetUp(): initialize shift context sctx */ 1204 ierr = PetscMemzero(&sctx,sizeof(FactorShiftCtx));CHKERRQ(ierr); 1205 1206 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1207 ddiag = a->diag; 1208 sctx.shift_top = info->zeropivot; 1209 for (i=0; i<n; i++) { 1210 /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1211 d = (aa)[ddiag[i]]; 1212 rs = -PetscAbsScalar(d) - PetscRealPart(d); 1213 v = aa+ai[i]; 1214 nz = ai[i+1] - ai[i]; 1215 for (j=0; j<nz; j++) 1216 rs += PetscAbsScalar(v[j]); 1217 if (rs>sctx.shift_top) sctx.shift_top = rs; 1218 } 1219 sctx.shift_top *= 1.1; 1220 sctx.nshift_max = 5; 1221 sctx.shift_lo = 0.; 1222 sctx.shift_hi = 1.; 1223 } 1224 1225 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1226 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1227 1228 ierr = PetscMalloc((4*n+1)*sizeof(PetscScalar),&rtmp1);CHKERRQ(ierr); 1229 ierr = PetscMemzero(rtmp1,(4*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1230 rtmp2 = rtmp1 + n; 1231 rtmp3 = rtmp2 + n; 1232 rtmp4 = rtmp3 + n; 1233 ics = ic; 1234 1235 node_max = a->inode.node_count; 1236 ns = a->inode.size; 1237 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1238 1239 /* If max inode size > 4, split it into two inodes.*/ 1240 /* also map the inode sizes according to the ordering */ 1241 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1242 for (i=0,j=0; i<node_max; ++i,++j){ 1243 if (ns[i] > 4) { 1244 tmp_vec1[j] = 4; 1245 ++j; 1246 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1247 } else { 1248 tmp_vec1[j] = ns[i]; 1249 } 1250 } 1251 /* Use the correct node_max */ 1252 node_max = j; 1253 1254 /* Now reorder the inode info based on mat re-ordering info */ 1255 /* First create a row -> inode_size_array_index map */ 1256 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1257 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1258 for (i=0,row=0; i<node_max; i++) { 1259 nodesz = tmp_vec1[i]; 1260 for (j=0; j<nodesz; j++,row++) { 1261 nsmap[row] = i; 1262 } 1263 } 1264 /* Using nsmap, create a reordered ns structure */ 1265 for (i=0,j=0; i< node_max; i++) { 1266 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1267 tmp_vec2[i] = nodesz; 1268 j += nodesz; 1269 } 1270 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1271 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 1272 1273 /* Now use the correct ns */ 1274 ns = tmp_vec2; 1275 1276 do { 1277 sctx.newshift = PETSC_FALSE; 1278 /* Now loop over each block-row, and do the factorization */ 1279 for (inod=0,i=0; inod<node_max; inod++){ /* i: row index; inod: inode index */ 1280 nodesz = ns[inod]; 1281 1282 switch (nodesz){ 1283 case 1: 1284 /*----------*/ 1285 /* zero rtmp1 */ 1286 /* L part */ 1287 nz = bi[i+1] - bi[i]; 1288 bjtmp = bj + bi[i]; 1289 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1290 1291 /* U part */ 1292 nz = bdiag[i]-bdiag[i+1]; 1293 bjtmp = bj + bdiag[i+1]+1; 1294 for (j=0; j<nz; j++) rtmp1[bjtmp[j]] = 0.0; 1295 1296 /* load in initial (unfactored row) */ 1297 nz = ai[r[i]+1] - ai[r[i]]; 1298 ajtmp = aj + ai[r[i]]; 1299 v = aa + ai[r[i]]; 1300 for (j=0; j<nz; j++) { 1301 rtmp1[ics[ajtmp[j]]] = v[j]; 1302 } 1303 /* ZeropivotApply() */ 1304 rtmp1[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 1305 1306 /* elimination */ 1307 bjtmp = bj + bi[i]; 1308 row = *bjtmp++; 1309 nzL = bi[i+1] - bi[i]; 1310 for(k=0; k < nzL;k++) { 1311 pc = rtmp1 + row; 1312 if (*pc != 0.0) { 1313 pv = b->a + bdiag[row]; 1314 mul1 = *pc * (*pv); 1315 *pc = mul1; 1316 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1317 pv = b->a + bdiag[row+1]+1; 1318 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1319 for (j=0; j<nz; j++) rtmp1[pj[j]] -= mul1 * pv[j]; 1320 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1321 } 1322 row = *bjtmp++; 1323 } 1324 1325 /* finished row so stick it into b->a */ 1326 rs = 0.0; 1327 /* L part */ 1328 pv = b->a + bi[i] ; 1329 pj = b->j + bi[i] ; 1330 nz = bi[i+1] - bi[i]; 1331 for (j=0; j<nz; j++) { 1332 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1333 } 1334 1335 /* U part */ 1336 pv = b->a + bdiag[i+1]+1; 1337 pj = b->j + bdiag[i+1]+1; 1338 nz = bdiag[i] - bdiag[i+1]-1; 1339 for (j=0; j<nz; j++) { 1340 pv[j] = rtmp1[pj[j]]; rs += PetscAbsScalar(pv[j]); 1341 } 1342 1343 /* Check zero pivot */ 1344 sctx.rs = rs; 1345 sctx.pv = rtmp1[i]; 1346 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1347 if(sctx.newshift) break; 1348 1349 /* Mark diagonal and invert diagonal for simplier triangular solves */ 1350 pv = b->a + bdiag[i]; 1351 *pv = 1.0/sctx.pv; /* sctx.pv = rtmp1[i]+shiftamount if shifttype==MAT_SHIFT_INBLOCKS */ 1352 break; 1353 1354 case 2: 1355 /*----------*/ 1356 /* zero rtmp1 and rtmp2 */ 1357 /* L part */ 1358 nz = bi[i+1] - bi[i]; 1359 bjtmp = bj + bi[i]; 1360 for (j=0; j<nz; j++) { 1361 col = bjtmp[j]; 1362 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1363 } 1364 1365 /* U part */ 1366 nz = bdiag[i]-bdiag[i+1]; 1367 bjtmp = bj + bdiag[i+1]+1; 1368 for (j=0; j<nz; j++) { 1369 col = bjtmp[j]; 1370 rtmp1[col] = 0.0; rtmp2[col] = 0.0; 1371 } 1372 1373 /* load in initial (unfactored row) */ 1374 nz = ai[r[i]+1] - ai[r[i]]; 1375 ajtmp = aj + ai[r[i]]; 1376 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; 1377 for (j=0; j<nz; j++) { 1378 col = ics[ajtmp[j]]; 1379 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; 1380 } 1381 /* ZeropivotApply(): shift the diagonal of the matrix */ 1382 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; 1383 1384 /* elimination */ 1385 bjtmp = bj + bi[i]; 1386 row = *bjtmp++; /* pivot row */ 1387 nzL = bi[i+1] - bi[i]; 1388 for(k=0; k < nzL;k++) { 1389 pc1 = rtmp1 + row; 1390 pc2 = rtmp2 + row; 1391 if (*pc1 != 0.0 || *pc2 != 0.0) { 1392 pv = b->a + bdiag[row]; 1393 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); 1394 *pc1 = mul1; *pc2 = mul2; 1395 1396 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1397 pv = b->a + bdiag[row+1]+1; 1398 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1399 for (j=0; j<nz; j++){ 1400 col = pj[j]; 1401 rtmp1[col] -= mul1 * pv[j]; 1402 rtmp2[col] -= mul2 * pv[j]; 1403 } 1404 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1405 } 1406 row = *bjtmp++; 1407 } 1408 1409 /* finished row i; check zero pivot, then stick row i into b->a */ 1410 rs = 0.0; 1411 /* L part */ 1412 pc1 = b->a + bi[i]; 1413 pj = b->j + bi[i] ; 1414 nz = bi[i+1] - bi[i]; 1415 for (j=0; j<nz; j++) { 1416 col = pj[j]; 1417 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1418 } 1419 /* U part */ 1420 pc1 = b->a + bdiag[i+1]+1; 1421 pj = b->j + bdiag[i+1]+1; 1422 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1423 for (j=0; j<nz; j++) { 1424 col = pj[j]; 1425 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1426 } 1427 1428 sctx.rs = rs; 1429 sctx.pv = rtmp1[i]; 1430 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1431 if(sctx.newshift) break; 1432 pc1 = b->a + bdiag[i]; /* Mark diagonal */ 1433 *pc1 = 1.0/sctx.pv; 1434 1435 /* Now take care of diagonal 2x2 block. */ 1436 pc2 = rtmp2 + i; 1437 if (*pc2 != 0.0){ 1438 mul1 = (*pc2)*(*pc1); /* *pc1=diag[i] is inverted! */ 1439 *pc2 = mul1; /* insert L entry */ 1440 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1441 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1442 for (j=0; j<nz; j++) { 1443 col = pj[j]; rtmp2[col] -= mul1 * rtmp1[col]; 1444 } 1445 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1446 } 1447 1448 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1449 rs = 0.0; 1450 /* L part */ 1451 pc2 = b->a + bi[i+1]; 1452 pj = b->j + bi[i+1] ; 1453 nz = bi[i+2] - bi[i+1]; 1454 for (j=0; j<nz; j++) { 1455 col = pj[j]; 1456 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1457 } 1458 /* U part */ 1459 pc2 = b->a + bdiag[i+2]+1; 1460 pj = b->j + bdiag[i+2]+1; 1461 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1462 for (j=0; j<nz; j++) { 1463 col = pj[j]; 1464 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1465 } 1466 1467 sctx.rs = rs; 1468 sctx.pv = rtmp2[i+1]; 1469 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1470 if(sctx.newshift) break; 1471 pc2 = b->a + bdiag[i+1]; 1472 *pc2 = 1.0/sctx.pv; 1473 break; 1474 1475 case 3: 1476 /*----------*/ 1477 /* zero rtmp */ 1478 /* L part */ 1479 nz = bi[i+1] - bi[i]; 1480 bjtmp = bj + bi[i]; 1481 for (j=0; j<nz; j++) { 1482 col = bjtmp[j]; 1483 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1484 } 1485 1486 /* U part */ 1487 nz = bdiag[i]-bdiag[i+1]; 1488 bjtmp = bj + bdiag[i+1]+1; 1489 for (j=0; j<nz; j++) { 1490 col = bjtmp[j]; 1491 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; 1492 } 1493 1494 /* load in initial (unfactored row) */ 1495 nz = ai[r[i]+1] - ai[r[i]]; 1496 ajtmp = aj + ai[r[i]]; 1497 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; 1498 for (j=0; j<nz; j++) { 1499 col = ics[ajtmp[j]]; 1500 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; 1501 } 1502 /* ZeropivotApply(): shift the diagonal of the matrix */ 1503 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; 1504 1505 /* elimination */ 1506 bjtmp = bj + bi[i]; 1507 row = *bjtmp++; /* pivot row */ 1508 nzL = bi[i+1] - bi[i]; 1509 for(k=0; k < nzL;k++) { 1510 pc1 = rtmp1 + row; 1511 pc2 = rtmp2 + row; 1512 pc3 = rtmp3 + row; 1513 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0) { 1514 pv = b->a + bdiag[row]; 1515 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); 1516 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; 1517 1518 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1519 pv = b->a + bdiag[row+1]+1; 1520 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1521 for (j=0; j<nz; j++){ 1522 col = pj[j]; 1523 rtmp1[col] -= mul1 * pv[j]; 1524 rtmp2[col] -= mul2 * pv[j]; 1525 rtmp3[col] -= mul3 * pv[j]; 1526 } 1527 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1528 } 1529 row = *bjtmp++; 1530 } 1531 1532 /* finished row i; check zero pivot, then stick row i into b->a */ 1533 rs = 0.0; 1534 /* L part */ 1535 pc1 = b->a + bi[i]; 1536 pj = b->j + bi[i] ; 1537 nz = bi[i+1] - bi[i]; 1538 for (j=0; j<nz; j++) { 1539 col = pj[j]; 1540 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1541 } 1542 /* U part */ 1543 pc1 = b->a + bdiag[i+1]+1; 1544 pj = b->j + bdiag[i+1]+1; 1545 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1546 for (j=0; j<nz; j++) { 1547 col = pj[j]; 1548 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1549 } 1550 1551 sctx.rs = rs; 1552 sctx.pv = rtmp1[i]; 1553 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1554 if(sctx.newshift) break; 1555 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1556 *pc1 = 1.0/sctx.pv; 1557 1558 /* Now take care of 1st column of diagonal 3x3 block. */ 1559 pc2 = rtmp2 + i; 1560 pc3 = rtmp3 + i; 1561 if (*pc2 != 0.0 || *pc3 != 0.0){ 1562 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1563 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1564 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1565 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1566 for (j=0; j<nz; j++) { 1567 col = pj[j]; 1568 rtmp2[col] -= mul2 * rtmp1[col]; 1569 rtmp3[col] -= mul3 * rtmp1[col]; 1570 } 1571 ierr = PetscLogFlops(2+4*nz);CHKERRQ(ierr); 1572 } 1573 1574 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1575 rs = 0.0; 1576 /* L part */ 1577 pc2 = b->a + bi[i+1]; 1578 pj = b->j + bi[i+1] ; 1579 nz = bi[i+2] - bi[i+1]; 1580 for (j=0; j<nz; j++) { 1581 col = pj[j]; 1582 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1583 } 1584 /* U part */ 1585 pc2 = b->a + bdiag[i+2]+1; 1586 pj = b->j + bdiag[i+2]+1; 1587 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1588 for (j=0; j<nz; j++) { 1589 col = pj[j]; 1590 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1591 } 1592 1593 sctx.rs = rs; 1594 sctx.pv = rtmp2[i+1]; 1595 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1596 if(sctx.newshift) break; 1597 pc2 = b->a + bdiag[i+1]; 1598 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1599 1600 /* Now take care of 2nd column of diagonal 3x3 block. */ 1601 pc3 = rtmp3 + i+1; 1602 if (*pc3 != 0.0){ 1603 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1604 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1605 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1606 for (j=0; j<nz; j++) { 1607 col = pj[j]; 1608 rtmp3[col] -= mul3 * rtmp2[col]; 1609 } 1610 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1611 } 1612 1613 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1614 rs = 0.0; 1615 /* L part */ 1616 pc3 = b->a + bi[i+2]; 1617 pj = b->j + bi[i+2] ; 1618 nz = bi[i+3] - bi[i+2]; 1619 for (j=0; j<nz; j++) { 1620 col = pj[j]; 1621 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1622 } 1623 /* U part */ 1624 pc3 = b->a + bdiag[i+3]+1; 1625 pj = b->j + bdiag[i+3]+1; 1626 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1627 for (j=0; j<nz; j++) { 1628 col = pj[j]; 1629 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1630 } 1631 1632 sctx.rs = rs; 1633 sctx.pv = rtmp3[i+2]; 1634 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1635 if(sctx.newshift) break; 1636 pc3 = b->a + bdiag[i+2]; 1637 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1638 break; 1639 case 4: 1640 /*----------*/ 1641 /* zero rtmp */ 1642 /* L part */ 1643 nz = bi[i+1] - bi[i]; 1644 bjtmp = bj + bi[i]; 1645 for (j=0; j<nz; j++) { 1646 col = bjtmp[j]; 1647 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0;rtmp4[col] = 0.0; 1648 } 1649 1650 /* U part */ 1651 nz = bdiag[i]-bdiag[i+1]; 1652 bjtmp = bj + bdiag[i+1]+1; 1653 for (j=0; j<nz; j++) { 1654 col = bjtmp[j]; 1655 rtmp1[col] = 0.0; rtmp2[col] = 0.0; rtmp3[col] = 0.0; rtmp4[col] = 0.0; 1656 } 1657 1658 /* load in initial (unfactored row) */ 1659 nz = ai[r[i]+1] - ai[r[i]]; 1660 ajtmp = aj + ai[r[i]]; 1661 v1 = aa + ai[r[i]]; v2 = aa + ai[r[i]+1]; v3 = aa + ai[r[i]+2]; v4 = aa + ai[r[i]+3]; 1662 for (j=0; j<nz; j++) { 1663 col = ics[ajtmp[j]]; 1664 rtmp1[col] = v1[j]; rtmp2[col] = v2[j]; rtmp3[col] = v3[j]; rtmp4[col] = v4[j]; 1665 } 1666 /* ZeropivotApply(): shift the diagonal of the matrix */ 1667 rtmp1[i] += sctx.shift_amount; rtmp2[i+1] += sctx.shift_amount; rtmp3[i+2] += sctx.shift_amount; rtmp4[i+3] += sctx.shift_amount; 1668 1669 /* elimination */ 1670 bjtmp = bj + bi[i]; 1671 row = *bjtmp++; /* pivot row */ 1672 nzL = bi[i+1] - bi[i]; 1673 for(k=0; k < nzL;k++) { 1674 pc1 = rtmp1 + row; 1675 pc2 = rtmp2 + row; 1676 pc3 = rtmp3 + row; 1677 pc4 = rtmp4 + row; 1678 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0) { 1679 pv = b->a + bdiag[row]; 1680 mul1 = *pc1*(*pv); mul2 = *pc2*(*pv); mul3 = *pc3*(*pv); mul4 = *pc4*(*pv); 1681 *pc1 = mul1; *pc2 = mul2; *pc3 = mul3; *pc4 = mul4; 1682 1683 pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 1684 pv = b->a + bdiag[row+1]+1; 1685 nz = bdiag[row]-bdiag[row+1]-1; /* num of entries in U(row,:) excluding diag */ 1686 for (j=0; j<nz; j++){ 1687 col = pj[j]; 1688 rtmp1[col] -= mul1 * pv[j]; 1689 rtmp2[col] -= mul2 * pv[j]; 1690 rtmp3[col] -= mul3 * pv[j]; 1691 rtmp4[col] -= mul4 * pv[j]; 1692 } 1693 ierr = PetscLogFlops(4+8*nz);CHKERRQ(ierr); 1694 } 1695 row = *bjtmp++; 1696 } 1697 1698 /* finished row i; check zero pivot, then stick row i into b->a */ 1699 rs = 0.0; 1700 /* L part */ 1701 pc1 = b->a + bi[i]; 1702 pj = b->j + bi[i] ; 1703 nz = bi[i+1] - bi[i]; 1704 for (j=0; j<nz; j++) { 1705 col = pj[j]; 1706 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1707 } 1708 /* U part */ 1709 pc1 = b->a + bdiag[i+1]+1; 1710 pj = b->j + bdiag[i+1]+1; 1711 nz = bdiag[i] - bdiag[i+1] - 1; /* exclude diagonal */ 1712 for (j=0; j<nz; j++) { 1713 col = pj[j]; 1714 pc1[j] = rtmp1[col]; rs += PetscAbsScalar(pc1[j]); 1715 } 1716 1717 sctx.rs = rs; 1718 sctx.pv = rtmp1[i]; 1719 ierr = MatPivotCheck(A,info,&sctx,i);CHKERRQ(ierr); 1720 if(sctx.newshift) break; 1721 pc1 = b->a + bdiag[i]; /* Mark diag[i] */ 1722 *pc1 = 1.0/sctx.pv; 1723 1724 /* Now take care of 1st column of diagonal 4x4 block. */ 1725 pc2 = rtmp2 + i; 1726 pc3 = rtmp3 + i; 1727 pc4 = rtmp4 + i; 1728 if (*pc2 != 0.0 || *pc3 != 0.0 || *pc4 != 0.0){ 1729 mul2 = (*pc2)*(*pc1); *pc2 = mul2; 1730 mul3 = (*pc3)*(*pc1); *pc3 = mul3; 1731 mul4 = (*pc4)*(*pc1); *pc4 = mul4; 1732 pj = b->j + bdiag[i+1]+1; /* beginning of U(i,:) */ 1733 nz = bdiag[i]-bdiag[i+1]-1; /* num of entries in U(i,:) excluding diag */ 1734 for (j=0; j<nz; j++) { 1735 col = pj[j]; 1736 rtmp2[col] -= mul2 * rtmp1[col]; 1737 rtmp3[col] -= mul3 * rtmp1[col]; 1738 rtmp4[col] -= mul4 * rtmp1[col]; 1739 } 1740 ierr = PetscLogFlops(3+6*nz);CHKERRQ(ierr); 1741 } 1742 1743 /* finished row i+1; check zero pivot, then stick row i+1 into b->a */ 1744 rs = 0.0; 1745 /* L part */ 1746 pc2 = b->a + bi[i+1]; 1747 pj = b->j + bi[i+1] ; 1748 nz = bi[i+2] - bi[i+1]; 1749 for (j=0; j<nz; j++) { 1750 col = pj[j]; 1751 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1752 } 1753 /* U part */ 1754 pc2 = b->a + bdiag[i+2]+1; 1755 pj = b->j + bdiag[i+2]+1; 1756 nz = bdiag[i+1] - bdiag[i+2] - 1; /* exclude diagonal */ 1757 for (j=0; j<nz; j++) { 1758 col = pj[j]; 1759 pc2[j] = rtmp2[col]; rs += PetscAbsScalar(pc2[j]); 1760 } 1761 1762 sctx.rs = rs; 1763 sctx.pv = rtmp2[i+1]; 1764 ierr = MatPivotCheck(A,info,&sctx,i+1);CHKERRQ(ierr); 1765 if(sctx.newshift) break; 1766 pc2 = b->a + bdiag[i+1]; 1767 *pc2 = 1.0/sctx.pv; /* Mark diag[i+1] */ 1768 1769 /* Now take care of 2nd column of diagonal 4x4 block. */ 1770 pc3 = rtmp3 + i+1; 1771 pc4 = rtmp4 + i+1; 1772 if (*pc3 != 0.0 || *pc4 != 0.0){ 1773 mul3 = (*pc3)*(*pc2); *pc3 = mul3; 1774 mul4 = (*pc4)*(*pc2); *pc4 = mul4; 1775 pj = b->j + bdiag[i+2]+1; /* beginning of U(i+1,:) */ 1776 nz = bdiag[i+1]-bdiag[i+2]-1; /* num of entries in U(i+1,:) excluding diag */ 1777 for (j=0; j<nz; j++) { 1778 col = pj[j]; 1779 rtmp3[col] -= mul3 * rtmp2[col]; 1780 rtmp4[col] -= mul4 * rtmp2[col]; 1781 } 1782 ierr = PetscLogFlops(4*nz);CHKERRQ(ierr); 1783 } 1784 1785 /* finished i+2; check zero pivot, then stick row i+2 into b->a */ 1786 rs = 0.0; 1787 /* L part */ 1788 pc3 = b->a + bi[i+2]; 1789 pj = b->j + bi[i+2] ; 1790 nz = bi[i+3] - bi[i+2]; 1791 for (j=0; j<nz; j++) { 1792 col = pj[j]; 1793 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1794 } 1795 /* U part */ 1796 pc3 = b->a + bdiag[i+3]+1; 1797 pj = b->j + bdiag[i+3]+1; 1798 nz = bdiag[i+2] - bdiag[i+3] - 1; /* exclude diagonal */ 1799 for (j=0; j<nz; j++) { 1800 col = pj[j]; 1801 pc3[j] = rtmp3[col]; rs += PetscAbsScalar(pc3[j]); 1802 } 1803 1804 sctx.rs = rs; 1805 sctx.pv = rtmp3[i+2]; 1806 ierr = MatPivotCheck(A,info,&sctx,i+2);CHKERRQ(ierr); 1807 if(sctx.newshift) break; 1808 pc3 = b->a + bdiag[i+2]; 1809 *pc3 = 1.0/sctx.pv; /* Mark diag[i+2] */ 1810 1811 /* Now take care of 3rd column of diagonal 4x4 block. */ 1812 pc4 = rtmp4 + i+2; 1813 if (*pc4 != 0.0){ 1814 mul4 = (*pc4)*(*pc3); *pc4 = mul4; 1815 pj = b->j + bdiag[i+3]+1; /* beginning of U(i+2,:) */ 1816 nz = bdiag[i+2]-bdiag[i+3]-1; /* num of entries in U(i+2,:) excluding diag */ 1817 for (j=0; j<nz; j++) { 1818 col = pj[j]; 1819 rtmp4[col] -= mul4 * rtmp3[col]; 1820 } 1821 ierr = PetscLogFlops(1+2*nz);CHKERRQ(ierr); 1822 } 1823 1824 /* finished i+3; check zero pivot, then stick row i+3 into b->a */ 1825 rs = 0.0; 1826 /* L part */ 1827 pc4 = b->a + bi[i+3]; 1828 pj = b->j + bi[i+3] ; 1829 nz = bi[i+4] - bi[i+3]; 1830 for (j=0; j<nz; j++) { 1831 col = pj[j]; 1832 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1833 } 1834 /* U part */ 1835 pc4 = b->a + bdiag[i+4]+1; 1836 pj = b->j + bdiag[i+4]+1; 1837 nz = bdiag[i+3] - bdiag[i+4] - 1; /* exclude diagonal */ 1838 for (j=0; j<nz; j++) { 1839 col = pj[j]; 1840 pc4[j] = rtmp4[col]; rs += PetscAbsScalar(pc4[j]); 1841 } 1842 1843 sctx.rs = rs; 1844 sctx.pv = rtmp4[i+3]; 1845 ierr = MatPivotCheck(A,info,&sctx,i+3);CHKERRQ(ierr); 1846 if(sctx.newshift) break; 1847 pc4 = b->a + bdiag[i+3]; 1848 *pc4 = 1.0/sctx.pv; /* Mark diag[i+3] */ 1849 break; 1850 1851 default: 1852 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 1853 } 1854 if (sctx.newshift) break; /* break for (inod=0,i=0; inod<node_max; inod++) */ 1855 i += nodesz; /* Update the row */ 1856 } 1857 1858 /* MatPivotRefine() */ 1859 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction>0 && sctx.nshift<sctx.nshift_max){ 1860 /* 1861 * if no shift in this attempt & shifting & started shifting & can refine, 1862 * then try lower shift 1863 */ 1864 sctx.shift_hi = sctx.shift_fraction; 1865 sctx.shift_fraction = (sctx.shift_hi+sctx.shift_lo)/2.; 1866 sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 1867 sctx.newshift = PETSC_TRUE; 1868 sctx.nshift++; 1869 } 1870 } while (sctx.newshift); 1871 1872 ierr = PetscFree(rtmp1);CHKERRQ(ierr); 1873 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 1874 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 1875 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 1876 1877 C->ops->solve = MatSolve_SeqAIJ; 1878 C->ops->solveadd = MatSolveAdd_SeqAIJ; 1879 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ; 1880 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ; 1881 C->ops->matsolve = MatMatSolve_SeqAIJ; 1882 C->assembled = PETSC_TRUE; 1883 C->preallocated = PETSC_TRUE; 1884 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 1885 1886 /* MatShiftView(A,info,&sctx) */ 1887 if (sctx.nshift){ 1888 if (info->shifttype == (PetscReal) MAT_SHIFT_POSITIVE_DEFINITE) { 1889 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 1890 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 1891 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 1892 } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS){ 1893 ierr = PetscInfo2(A,"number of shift_inblocks applied %D, each shift_amount %G\n",sctx.nshift,info->shiftamount);CHKERRQ(ierr); 1894 } 1895 } 1896 ierr = Mat_CheckInode_FactorLU(C,PETSC_FALSE);CHKERRQ(ierr); 1897 PetscFunctionReturn(0); 1898 } 1899 1900 #undef __FUNCT__ 1901 #define __FUNCT__ "MatLUFactorNumeric_SeqAIJ_Inode_inplace" 1902 PetscErrorCode MatLUFactorNumeric_SeqAIJ_Inode_inplace(Mat B,Mat A,const MatFactorInfo *info) 1903 { 1904 Mat C = B; 1905 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data,*b = (Mat_SeqAIJ*)C->data; 1906 IS iscol = b->col,isrow = b->row,isicol = b->icol; 1907 PetscErrorCode ierr; 1908 const PetscInt *r,*ic,*c,*ics; 1909 PetscInt n = A->rmap->n,*bi = b->i; 1910 PetscInt *bj = b->j,*nbj=b->j +1,*ajtmp,*bjtmp,nz,nz_tmp,row,prow; 1911 PetscInt i,j,idx,*ai = a->i,*aj = a->j,*bd = b->diag,node_max,nodesz; 1912 PetscInt *ns,*tmp_vec1,*tmp_vec2,*nsmap,*pj; 1913 PetscScalar mul1,mul2,mul3,tmp; 1914 MatScalar *pc1,*pc2,*pc3,*ba = b->a,*pv,*rtmp11,*rtmp22,*rtmp33; 1915 const MatScalar *v1,*v2,*v3,*aa = a->a,*rtmp1; 1916 PetscReal rs=0.0; 1917 FactorShiftCtx sctx; 1918 1919 PetscFunctionBegin; 1920 sctx.shift_top = 0; 1921 sctx.nshift_max = 0; 1922 sctx.shift_lo = 0; 1923 sctx.shift_hi = 0; 1924 sctx.shift_fraction = 0; 1925 1926 /* if both shift schemes are chosen by user, only use info->shiftpd */ 1927 if (info->shifttype==(PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 1928 sctx.shift_top = 0; 1929 for (i=0; i<n; i++) { 1930 /* calculate rs = sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 1931 rs = 0.0; 1932 ajtmp = aj + ai[i]; 1933 rtmp1 = aa + ai[i]; 1934 nz = ai[i+1] - ai[i]; 1935 for (j=0; j<nz; j++){ 1936 if (*ajtmp != i){ 1937 rs += PetscAbsScalar(*rtmp1++); 1938 } else { 1939 rs -= PetscRealPart(*rtmp1++); 1940 } 1941 ajtmp++; 1942 } 1943 if (rs>sctx.shift_top) sctx.shift_top = rs; 1944 } 1945 if (sctx.shift_top == 0.0) sctx.shift_top += 1.e-12; 1946 sctx.shift_top *= 1.1; 1947 sctx.nshift_max = 5; 1948 sctx.shift_lo = 0.; 1949 sctx.shift_hi = 1.; 1950 } 1951 sctx.shift_amount = 0; 1952 sctx.nshift = 0; 1953 1954 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1955 ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr); 1956 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1957 ierr = PetscMalloc((3*n+1)*sizeof(PetscScalar),&rtmp11);CHKERRQ(ierr); 1958 ierr = PetscMemzero(rtmp11,(3*n+1)*sizeof(PetscScalar));CHKERRQ(ierr); 1959 ics = ic ; 1960 rtmp22 = rtmp11 + n; 1961 rtmp33 = rtmp22 + n; 1962 1963 node_max = a->inode.node_count; 1964 ns = a->inode.size; 1965 if (!ns) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Matrix without inode information"); 1966 1967 /* If max inode size > 3, split it into two inodes.*/ 1968 /* also map the inode sizes according to the ordering */ 1969 ierr = PetscMalloc((n+1)* sizeof(PetscInt),&tmp_vec1);CHKERRQ(ierr); 1970 for (i=0,j=0; i<node_max; ++i,++j){ 1971 if (ns[i]>3) { 1972 tmp_vec1[j] = ns[i]/2; /* Assuming ns[i] < =5 */ 1973 ++j; 1974 tmp_vec1[j] = ns[i] - tmp_vec1[j-1]; 1975 } else { 1976 tmp_vec1[j] = ns[i]; 1977 } 1978 } 1979 /* Use the correct node_max */ 1980 node_max = j; 1981 1982 /* Now reorder the inode info based on mat re-ordering info */ 1983 /* First create a row -> inode_size_array_index map */ 1984 ierr = PetscMalloc(n*sizeof(PetscInt)+1,&nsmap);CHKERRQ(ierr); 1985 ierr = PetscMalloc(node_max*sizeof(PetscInt)+1,&tmp_vec2);CHKERRQ(ierr); 1986 for (i=0,row=0; i<node_max; i++) { 1987 nodesz = tmp_vec1[i]; 1988 for (j=0; j<nodesz; j++,row++) { 1989 nsmap[row] = i; 1990 } 1991 } 1992 /* Using nsmap, create a reordered ns structure */ 1993 for (i=0,j=0; i< node_max; i++) { 1994 nodesz = tmp_vec1[nsmap[r[j]]]; /* here the reordered row_no is in r[] */ 1995 tmp_vec2[i] = nodesz; 1996 j += nodesz; 1997 } 1998 ierr = PetscFree(nsmap);CHKERRQ(ierr); 1999 ierr = PetscFree(tmp_vec1);CHKERRQ(ierr); 2000 /* Now use the correct ns */ 2001 ns = tmp_vec2; 2002 2003 do { 2004 sctx.newshift = PETSC_FALSE; 2005 /* Now loop over each block-row, and do the factorization */ 2006 for (i=0,row=0; i<node_max; i++) { 2007 nodesz = ns[i]; 2008 nz = bi[row+1] - bi[row]; 2009 bjtmp = bj + bi[row]; 2010 2011 switch (nodesz){ 2012 case 1: 2013 for (j=0; j<nz; j++){ 2014 idx = bjtmp[j]; 2015 rtmp11[idx] = 0.0; 2016 } 2017 2018 /* load in initial (unfactored row) */ 2019 idx = r[row]; 2020 nz_tmp = ai[idx+1] - ai[idx]; 2021 ajtmp = aj + ai[idx]; 2022 v1 = aa + ai[idx]; 2023 2024 for (j=0; j<nz_tmp; j++) { 2025 idx = ics[ajtmp[j]]; 2026 rtmp11[idx] = v1[j]; 2027 } 2028 rtmp11[ics[r[row]]] += sctx.shift_amount; 2029 2030 prow = *bjtmp++ ; 2031 while (prow < row) { 2032 pc1 = rtmp11 + prow; 2033 if (*pc1 != 0.0){ 2034 pv = ba + bd[prow]; 2035 pj = nbj + bd[prow]; 2036 mul1 = *pc1 * *pv++; 2037 *pc1 = mul1; 2038 nz_tmp = bi[prow+1] - bd[prow] - 1; 2039 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2040 for (j=0; j<nz_tmp; j++) { 2041 tmp = pv[j]; 2042 idx = pj[j]; 2043 rtmp11[idx] -= mul1 * tmp; 2044 } 2045 } 2046 prow = *bjtmp++ ; 2047 } 2048 pj = bj + bi[row]; 2049 pc1 = ba + bi[row]; 2050 2051 sctx.pv = rtmp11[row]; 2052 rtmp11[row] = 1.0/rtmp11[row]; /* invert diag */ 2053 rs = 0.0; 2054 for (j=0; j<nz; j++) { 2055 idx = pj[j]; 2056 pc1[j] = rtmp11[idx]; /* rtmp11 -> ba */ 2057 if (idx != row) rs += PetscAbsScalar(pc1[j]); 2058 } 2059 sctx.rs = rs; 2060 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2061 if (sctx.newshift) goto endofwhile; 2062 break; 2063 2064 case 2: 2065 for (j=0; j<nz; j++) { 2066 idx = bjtmp[j]; 2067 rtmp11[idx] = 0.0; 2068 rtmp22[idx] = 0.0; 2069 } 2070 2071 /* load in initial (unfactored row) */ 2072 idx = r[row]; 2073 nz_tmp = ai[idx+1] - ai[idx]; 2074 ajtmp = aj + ai[idx]; 2075 v1 = aa + ai[idx]; 2076 v2 = aa + ai[idx+1]; 2077 for (j=0; j<nz_tmp; j++) { 2078 idx = ics[ajtmp[j]]; 2079 rtmp11[idx] = v1[j]; 2080 rtmp22[idx] = v2[j]; 2081 } 2082 rtmp11[ics[r[row]]] += sctx.shift_amount; 2083 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2084 2085 prow = *bjtmp++ ; 2086 while (prow < row) { 2087 pc1 = rtmp11 + prow; 2088 pc2 = rtmp22 + prow; 2089 if (*pc1 != 0.0 || *pc2 != 0.0){ 2090 pv = ba + bd[prow]; 2091 pj = nbj + bd[prow]; 2092 mul1 = *pc1 * *pv; 2093 mul2 = *pc2 * *pv; 2094 ++pv; 2095 *pc1 = mul1; 2096 *pc2 = mul2; 2097 2098 nz_tmp = bi[prow+1] - bd[prow] - 1; 2099 for (j=0; j<nz_tmp; j++) { 2100 tmp = pv[j]; 2101 idx = pj[j]; 2102 rtmp11[idx] -= mul1 * tmp; 2103 rtmp22[idx] -= mul2 * tmp; 2104 } 2105 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2106 } 2107 prow = *bjtmp++ ; 2108 } 2109 2110 /* Now take care of diagonal 2x2 block. Note: prow = row here */ 2111 pc1 = rtmp11 + prow; 2112 pc2 = rtmp22 + prow; 2113 2114 sctx.pv = *pc1; 2115 pj = bj + bi[prow]; 2116 rs = 0.0; 2117 for (j=0; j<nz; j++){ 2118 idx = pj[j]; 2119 if (idx != prow) rs += PetscAbsScalar(rtmp11[idx]); 2120 } 2121 sctx.rs = rs; 2122 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2123 if (sctx.newshift) goto endofwhile; 2124 2125 if (*pc2 != 0.0){ 2126 pj = nbj + bd[prow]; 2127 mul2 = (*pc2)/(*pc1); /* since diag is not yet inverted.*/ 2128 *pc2 = mul2; 2129 nz_tmp = bi[prow+1] - bd[prow] - 1; 2130 for (j=0; j<nz_tmp; j++) { 2131 idx = pj[j] ; 2132 tmp = rtmp11[idx]; 2133 rtmp22[idx] -= mul2 * tmp; 2134 } 2135 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2136 } 2137 2138 pj = bj + bi[row]; 2139 pc1 = ba + bi[row]; 2140 pc2 = ba + bi[row+1]; 2141 2142 sctx.pv = rtmp22[row+1]; 2143 rs = 0.0; 2144 rtmp11[row] = 1.0/rtmp11[row]; 2145 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2146 /* copy row entries from dense representation to sparse */ 2147 for (j=0; j<nz; j++) { 2148 idx = pj[j]; 2149 pc1[j] = rtmp11[idx]; 2150 pc2[j] = rtmp22[idx]; 2151 if (idx != row+1) rs += PetscAbsScalar(pc2[j]); 2152 } 2153 sctx.rs = rs; 2154 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2155 if (sctx.newshift) goto endofwhile; 2156 break; 2157 2158 case 3: 2159 for (j=0; j<nz; j++) { 2160 idx = bjtmp[j]; 2161 rtmp11[idx] = 0.0; 2162 rtmp22[idx] = 0.0; 2163 rtmp33[idx] = 0.0; 2164 } 2165 /* copy the nonzeros for the 3 rows from sparse representation to dense in rtmp*[] */ 2166 idx = r[row]; 2167 nz_tmp = ai[idx+1] - ai[idx]; 2168 ajtmp = aj + ai[idx]; 2169 v1 = aa + ai[idx]; 2170 v2 = aa + ai[idx+1]; 2171 v3 = aa + ai[idx+2]; 2172 for (j=0; j<nz_tmp; j++) { 2173 idx = ics[ajtmp[j]]; 2174 rtmp11[idx] = v1[j]; 2175 rtmp22[idx] = v2[j]; 2176 rtmp33[idx] = v3[j]; 2177 } 2178 rtmp11[ics[r[row]]] += sctx.shift_amount; 2179 rtmp22[ics[r[row+1]]] += sctx.shift_amount; 2180 rtmp33[ics[r[row+2]]] += sctx.shift_amount; 2181 2182 /* loop over all pivot row blocks above this row block */ 2183 prow = *bjtmp++ ; 2184 while (prow < row) { 2185 pc1 = rtmp11 + prow; 2186 pc2 = rtmp22 + prow; 2187 pc3 = rtmp33 + prow; 2188 if (*pc1 != 0.0 || *pc2 != 0.0 || *pc3 !=0.0){ 2189 pv = ba + bd[prow]; 2190 pj = nbj + bd[prow]; 2191 mul1 = *pc1 * *pv; 2192 mul2 = *pc2 * *pv; 2193 mul3 = *pc3 * *pv; 2194 ++pv; 2195 *pc1 = mul1; 2196 *pc2 = mul2; 2197 *pc3 = mul3; 2198 2199 nz_tmp = bi[prow+1] - bd[prow] - 1; 2200 /* update this row based on pivot row */ 2201 for (j=0; j<nz_tmp; j++) { 2202 tmp = pv[j]; 2203 idx = pj[j]; 2204 rtmp11[idx] -= mul1 * tmp; 2205 rtmp22[idx] -= mul2 * tmp; 2206 rtmp33[idx] -= mul3 * tmp; 2207 } 2208 ierr = PetscLogFlops(3+6*nz_tmp);CHKERRQ(ierr); 2209 } 2210 prow = *bjtmp++ ; 2211 } 2212 2213 /* Now take care of diagonal 3x3 block in this set of rows */ 2214 /* note: prow = row here */ 2215 pc1 = rtmp11 + prow; 2216 pc2 = rtmp22 + prow; 2217 pc3 = rtmp33 + prow; 2218 2219 sctx.pv = *pc1; 2220 pj = bj + bi[prow]; 2221 rs = 0.0; 2222 for (j=0; j<nz; j++){ 2223 idx = pj[j]; 2224 if (idx != row) rs += PetscAbsScalar(rtmp11[idx]); 2225 } 2226 sctx.rs = rs; 2227 ierr = MatPivotCheck(A,info,&sctx,row);CHKERRQ(ierr); 2228 if (sctx.newshift) goto endofwhile; 2229 2230 if (*pc2 != 0.0 || *pc3 != 0.0){ 2231 mul2 = (*pc2)/(*pc1); 2232 mul3 = (*pc3)/(*pc1); 2233 *pc2 = mul2; 2234 *pc3 = mul3; 2235 nz_tmp = bi[prow+1] - bd[prow] - 1; 2236 pj = nbj + bd[prow]; 2237 for (j=0; j<nz_tmp; j++) { 2238 idx = pj[j] ; 2239 tmp = rtmp11[idx]; 2240 rtmp22[idx] -= mul2 * tmp; 2241 rtmp33[idx] -= mul3 * tmp; 2242 } 2243 ierr = PetscLogFlops(2+4*nz_tmp);CHKERRQ(ierr); 2244 } 2245 ++prow; 2246 2247 pc2 = rtmp22 + prow; 2248 pc3 = rtmp33 + prow; 2249 sctx.pv = *pc2; 2250 pj = bj + bi[prow]; 2251 rs = 0.0; 2252 for (j=0; j<nz; j++){ 2253 idx = pj[j]; 2254 if (idx != prow) rs += PetscAbsScalar(rtmp22[idx]); 2255 } 2256 sctx.rs = rs; 2257 ierr = MatPivotCheck(A,info,&sctx,row+1);CHKERRQ(ierr); 2258 if (sctx.newshift) goto endofwhile; 2259 2260 if (*pc3 != 0.0){ 2261 mul3 = (*pc3)/(*pc2); 2262 *pc3 = mul3; 2263 pj = nbj + bd[prow]; 2264 nz_tmp = bi[prow+1] - bd[prow] - 1; 2265 for (j=0; j<nz_tmp; j++) { 2266 idx = pj[j] ; 2267 tmp = rtmp22[idx]; 2268 rtmp33[idx] -= mul3 * tmp; 2269 } 2270 ierr = PetscLogFlops(1+2*nz_tmp);CHKERRQ(ierr); 2271 } 2272 2273 pj = bj + bi[row]; 2274 pc1 = ba + bi[row]; 2275 pc2 = ba + bi[row+1]; 2276 pc3 = ba + bi[row+2]; 2277 2278 sctx.pv = rtmp33[row+2]; 2279 rs = 0.0; 2280 rtmp11[row] = 1.0/rtmp11[row]; 2281 rtmp22[row+1] = 1.0/rtmp22[row+1]; 2282 rtmp33[row+2] = 1.0/rtmp33[row+2]; 2283 /* copy row entries from dense representation to sparse */ 2284 for (j=0; j<nz; j++) { 2285 idx = pj[j]; 2286 pc1[j] = rtmp11[idx]; 2287 pc2[j] = rtmp22[idx]; 2288 pc3[j] = rtmp33[idx]; 2289 if (idx != row+2) rs += PetscAbsScalar(pc3[j]); 2290 } 2291 2292 sctx.rs = rs; 2293 ierr = MatPivotCheck(A,info,&sctx,row+2);CHKERRQ(ierr); 2294 if (sctx.newshift) goto endofwhile; 2295 break; 2296 2297 default: 2298 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Node size not yet supported \n"); 2299 } 2300 row += nodesz; /* Update the row */ 2301 } 2302 endofwhile:; 2303 } while (sctx.newshift); 2304 ierr = PetscFree(rtmp11);CHKERRQ(ierr); 2305 ierr = PetscFree(tmp_vec2);CHKERRQ(ierr); 2306 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2307 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2308 ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr); 2309 (B)->ops->solve = MatSolve_SeqAIJ_inplace; 2310 /* do not set solve add, since MatSolve_Inode + Add is faster */ 2311 C->ops->solvetranspose = MatSolveTranspose_SeqAIJ_inplace; 2312 C->ops->solvetransposeadd = MatSolveTransposeAdd_SeqAIJ_inplace; 2313 C->assembled = PETSC_TRUE; 2314 C->preallocated = PETSC_TRUE; 2315 if (sctx.nshift) { 2316 if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 2317 ierr = PetscInfo4(A,"number of shift_pd tries %D, shift_amount %G, diagonal shifted up by %e fraction top_value %e\n",sctx.nshift,sctx.shift_amount,sctx.shift_fraction,sctx.shift_top);CHKERRQ(ierr); 2318 } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 2319 ierr = PetscInfo2(A,"number of shift_nz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 2320 } 2321 } 2322 ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 2323 ierr = Mat_CheckInode(C,PETSC_FALSE);CHKERRQ(ierr); 2324 PetscFunctionReturn(0); 2325 } 2326 2327 2328 /* ----------------------------------------------------------- */ 2329 #undef __FUNCT__ 2330 #define __FUNCT__ "MatSolve_SeqAIJ_Inode" 2331 PetscErrorCode MatSolve_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 2332 { 2333 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2334 IS iscol = a->col,isrow = a->row; 2335 PetscErrorCode ierr; 2336 const PetscInt *r,*c,*rout,*cout; 2337 PetscInt i,j,n = A->rmap->n,*ai = a->i,nz,*a_j = a->j; 2338 PetscInt node_max,*ns,row,nsz,aii,*vi,*ad,*aj,i0,i1; 2339 PetscScalar *x,*tmp,*tmps,tmp0,tmp1; 2340 PetscScalar sum1,sum2,sum3,sum4,sum5; 2341 const MatScalar *v1,*v2,*v3,*v4,*v5,*a_a = a->a,*aa; 2342 const PetscScalar *b; 2343 2344 PetscFunctionBegin; 2345 if (!a->inode.size) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Missing Inode Structure"); 2346 node_max = a->inode.node_count; 2347 ns = a->inode.size; /* Node Size array */ 2348 2349 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2350 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2351 tmp = a->solve_work; 2352 2353 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2354 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2355 2356 /* forward solve the lower triangular */ 2357 tmps = tmp ; 2358 aa = a_a ; 2359 aj = a_j ; 2360 ad = a->diag; 2361 2362 for (i = 0,row = 0; i< node_max; ++i){ 2363 nsz = ns[i]; 2364 aii = ai[row]; 2365 v1 = aa + aii; 2366 vi = aj + aii; 2367 nz = ai[row+1]- ai[row]; 2368 2369 if (i < node_max-1) { 2370 /* Prefetch the indices for the next block */ 2371 PetscPrefetchBlock(aj+ai[row+nsz],ai[row+nsz+1]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); /* indices */ 2372 /* Prefetch the data for the next block */ 2373 PetscPrefetchBlock(aa+ai[row+nsz],ai[row+nsz+ns[i+1]]-ai[row+nsz],0,PETSC_PREFETCH_HINT_NTA); 2374 } 2375 2376 switch (nsz){ /* Each loop in 'case' is unrolled */ 2377 case 1 : 2378 sum1 = b[r[row]]; 2379 for(j=0; j<nz-1; j+=2){ 2380 i0 = vi[j]; 2381 i1 = vi[j+1]; 2382 tmp0 = tmps[i0]; 2383 tmp1 = tmps[i1]; 2384 sum1 -= v1[j]*tmp0 + v1[j+1]*tmp1; 2385 } 2386 if(j == nz-1){ 2387 tmp0 = tmps[vi[j]]; 2388 sum1 -= v1[j]*tmp0; 2389 } 2390 tmp[row++]=sum1; 2391 break; 2392 case 2: 2393 sum1 = b[r[row]]; 2394 sum2 = b[r[row+1]]; 2395 v2 = aa + ai[row+1]; 2396 2397 for(j=0; j<nz-1; j+=2){ 2398 i0 = vi[j]; 2399 i1 = vi[j+1]; 2400 tmp0 = tmps[i0]; 2401 tmp1 = tmps[i1]; 2402 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2403 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2404 } 2405 if(j == nz-1){ 2406 tmp0 = tmps[vi[j]]; 2407 sum1 -= v1[j] *tmp0; 2408 sum2 -= v2[j] *tmp0; 2409 } 2410 sum2 -= v2[nz] * sum1; 2411 tmp[row ++]=sum1; 2412 tmp[row ++]=sum2; 2413 break; 2414 case 3: 2415 sum1 = b[r[row]]; 2416 sum2 = b[r[row+1]]; 2417 sum3 = b[r[row+2]]; 2418 v2 = aa + ai[row+1]; 2419 v3 = aa + ai[row+2]; 2420 2421 for (j=0; j<nz-1; j+=2){ 2422 i0 = vi[j]; 2423 i1 = vi[j+1]; 2424 tmp0 = tmps[i0]; 2425 tmp1 = tmps[i1]; 2426 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2427 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2428 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2429 } 2430 if (j == nz-1){ 2431 tmp0 = tmps[vi[j]]; 2432 sum1 -= v1[j] *tmp0; 2433 sum2 -= v2[j] *tmp0; 2434 sum3 -= v3[j] *tmp0; 2435 } 2436 sum2 -= v2[nz] * sum1; 2437 sum3 -= v3[nz] * sum1; 2438 sum3 -= v3[nz+1] * sum2; 2439 tmp[row ++]=sum1; 2440 tmp[row ++]=sum2; 2441 tmp[row ++]=sum3; 2442 break; 2443 2444 case 4: 2445 sum1 = b[r[row]]; 2446 sum2 = b[r[row+1]]; 2447 sum3 = b[r[row+2]]; 2448 sum4 = b[r[row+3]]; 2449 v2 = aa + ai[row+1]; 2450 v3 = aa + ai[row+2]; 2451 v4 = aa + ai[row+3]; 2452 2453 for (j=0; j<nz-1; j+=2){ 2454 i0 = vi[j]; 2455 i1 = vi[j+1]; 2456 tmp0 = tmps[i0]; 2457 tmp1 = tmps[i1]; 2458 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2459 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2460 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2461 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2462 } 2463 if (j == nz-1){ 2464 tmp0 = tmps[vi[j]]; 2465 sum1 -= v1[j] *tmp0; 2466 sum2 -= v2[j] *tmp0; 2467 sum3 -= v3[j] *tmp0; 2468 sum4 -= v4[j] *tmp0; 2469 } 2470 sum2 -= v2[nz] * sum1; 2471 sum3 -= v3[nz] * sum1; 2472 sum4 -= v4[nz] * sum1; 2473 sum3 -= v3[nz+1] * sum2; 2474 sum4 -= v4[nz+1] * sum2; 2475 sum4 -= v4[nz+2] * sum3; 2476 2477 tmp[row ++]=sum1; 2478 tmp[row ++]=sum2; 2479 tmp[row ++]=sum3; 2480 tmp[row ++]=sum4; 2481 break; 2482 case 5: 2483 sum1 = b[r[row]]; 2484 sum2 = b[r[row+1]]; 2485 sum3 = b[r[row+2]]; 2486 sum4 = b[r[row+3]]; 2487 sum5 = b[r[row+4]]; 2488 v2 = aa + ai[row+1]; 2489 v3 = aa + ai[row+2]; 2490 v4 = aa + ai[row+3]; 2491 v5 = aa + ai[row+4]; 2492 2493 for (j=0; j<nz-1; j+=2){ 2494 i0 = vi[j]; 2495 i1 = vi[j+1]; 2496 tmp0 = tmps[i0]; 2497 tmp1 = tmps[i1]; 2498 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2499 sum2 -= v2[j] * tmp0 + v2[j+1] * tmp1; 2500 sum3 -= v3[j] * tmp0 + v3[j+1] * tmp1; 2501 sum4 -= v4[j] * tmp0 + v4[j+1] * tmp1; 2502 sum5 -= v5[j] * tmp0 + v5[j+1] * tmp1; 2503 } 2504 if (j == nz-1){ 2505 tmp0 = tmps[vi[j]]; 2506 sum1 -= v1[j] *tmp0; 2507 sum2 -= v2[j] *tmp0; 2508 sum3 -= v3[j] *tmp0; 2509 sum4 -= v4[j] *tmp0; 2510 sum5 -= v5[j] *tmp0; 2511 } 2512 2513 sum2 -= v2[nz] * sum1; 2514 sum3 -= v3[nz] * sum1; 2515 sum4 -= v4[nz] * sum1; 2516 sum5 -= v5[nz] * sum1; 2517 sum3 -= v3[nz+1] * sum2; 2518 sum4 -= v4[nz+1] * sum2; 2519 sum5 -= v5[nz+1] * sum2; 2520 sum4 -= v4[nz+2] * sum3; 2521 sum5 -= v5[nz+2] * sum3; 2522 sum5 -= v5[nz+3] * sum4; 2523 2524 tmp[row ++]=sum1; 2525 tmp[row ++]=sum2; 2526 tmp[row ++]=sum3; 2527 tmp[row ++]=sum4; 2528 tmp[row ++]=sum5; 2529 break; 2530 default: 2531 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2532 } 2533 } 2534 /* backward solve the upper triangular */ 2535 for (i=node_max -1 ,row = n-1 ; i>=0; i--){ 2536 nsz = ns[i]; 2537 aii = ad[row+1] + 1; 2538 v1 = aa + aii; 2539 vi = aj + aii; 2540 nz = ad[row]- ad[row+1] - 1; 2541 2542 if (i > 0) { 2543 /* Prefetch the indices for the next block */ 2544 PetscPrefetchBlock(aj+ad[row-nsz+1]+1,ad[row-nsz]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2545 /* Prefetch the data for the next block */ 2546 PetscPrefetchBlock(aa+ad[row-nsz+1]+1,ad[row-nsz-ns[i-1]+1]-ad[row-nsz+1],0,PETSC_PREFETCH_HINT_NTA); 2547 } 2548 2549 switch (nsz){ /* Each loop in 'case' is unrolled */ 2550 case 1 : 2551 sum1 = tmp[row]; 2552 2553 for(j=0 ; j<nz-1; j+=2){ 2554 i0 = vi[j]; 2555 i1 = vi[j+1]; 2556 tmp0 = tmps[i0]; 2557 tmp1 = tmps[i1]; 2558 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2559 } 2560 if (j == nz-1){ 2561 tmp0 = tmps[vi[j]]; 2562 sum1 -= v1[j]*tmp0; 2563 } 2564 x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2565 break; 2566 case 2 : 2567 sum1 = tmp[row]; 2568 sum2 = tmp[row-1]; 2569 v2 = aa + ad[row] + 1; 2570 for (j=0 ; j<nz-1; j+=2){ 2571 i0 = vi[j]; 2572 i1 = vi[j+1]; 2573 tmp0 = tmps[i0]; 2574 tmp1 = tmps[i1]; 2575 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2576 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2577 } 2578 if (j == nz-1){ 2579 tmp0 = tmps[vi[j]]; 2580 sum1 -= v1[j]* tmp0; 2581 sum2 -= v2[j+1]* tmp0; 2582 } 2583 2584 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2585 sum2 -= v2[0] * tmp0; 2586 x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2587 break; 2588 case 3 : 2589 sum1 = tmp[row]; 2590 sum2 = tmp[row -1]; 2591 sum3 = tmp[row -2]; 2592 v2 = aa + ad[row] + 1; 2593 v3 = aa + ad[row -1] + 1; 2594 for (j=0 ; j<nz-1; j+=2){ 2595 i0 = vi[j]; 2596 i1 = vi[j+1]; 2597 tmp0 = tmps[i0]; 2598 tmp1 = tmps[i1]; 2599 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2600 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2601 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2602 } 2603 if (j== nz-1){ 2604 tmp0 = tmps[vi[j]]; 2605 sum1 -= v1[j] * tmp0; 2606 sum2 -= v2[j+1] * tmp0; 2607 sum3 -= v3[j+2] * tmp0; 2608 } 2609 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2610 sum2 -= v2[0]* tmp0; 2611 sum3 -= v3[1] * tmp0; 2612 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2613 sum3 -= v3[0]* tmp0; 2614 x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2615 2616 break; 2617 case 4 : 2618 sum1 = tmp[row]; 2619 sum2 = tmp[row -1]; 2620 sum3 = tmp[row -2]; 2621 sum4 = tmp[row -3]; 2622 v2 = aa + ad[row]+1; 2623 v3 = aa + ad[row -1]+1; 2624 v4 = aa + ad[row -2]+1; 2625 2626 for (j=0 ; j<nz-1; j+=2){ 2627 i0 = vi[j]; 2628 i1 = vi[j+1]; 2629 tmp0 = tmps[i0]; 2630 tmp1 = tmps[i1]; 2631 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2632 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2633 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2634 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2635 } 2636 if (j== nz-1){ 2637 tmp0 = tmps[vi[j]]; 2638 sum1 -= v1[j] * tmp0; 2639 sum2 -= v2[j+1] * tmp0; 2640 sum3 -= v3[j+2] * tmp0; 2641 sum4 -= v4[j+3] * tmp0; 2642 } 2643 2644 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2645 sum2 -= v2[0] * tmp0; 2646 sum3 -= v3[1] * tmp0; 2647 sum4 -= v4[2] * tmp0; 2648 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2649 sum3 -= v3[0] * tmp0; 2650 sum4 -= v4[1] * tmp0; 2651 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2652 sum4 -= v4[0] * tmp0; 2653 x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2654 break; 2655 case 5 : 2656 sum1 = tmp[row]; 2657 sum2 = tmp[row -1]; 2658 sum3 = tmp[row -2]; 2659 sum4 = tmp[row -3]; 2660 sum5 = tmp[row -4]; 2661 v2 = aa + ad[row]+1; 2662 v3 = aa + ad[row -1]+1; 2663 v4 = aa + ad[row -2]+1; 2664 v5 = aa + ad[row -3]+1; 2665 for (j=0 ; j<nz-1; j+=2){ 2666 i0 = vi[j]; 2667 i1 = vi[j+1]; 2668 tmp0 = tmps[i0]; 2669 tmp1 = tmps[i1]; 2670 sum1 -= v1[j] * tmp0 + v1[j+1] * tmp1; 2671 sum2 -= v2[j+1] * tmp0 + v2[j+2] * tmp1; 2672 sum3 -= v3[j+2] * tmp0 + v3[j+3] * tmp1; 2673 sum4 -= v4[j+3] * tmp0 + v4[j+4] * tmp1; 2674 sum5 -= v5[j+4] * tmp0 + v5[j+5] * tmp1; 2675 } 2676 if (j==nz-1){ 2677 tmp0 = tmps[vi[j]]; 2678 sum1 -= v1[j] * tmp0; 2679 sum2 -= v2[j+1] * tmp0; 2680 sum3 -= v3[j+2] * tmp0; 2681 sum4 -= v4[j+3] * tmp0; 2682 sum5 -= v5[j+4] * tmp0; 2683 } 2684 2685 tmp0 = x[c[row]] = tmp[row] = sum1*v1[nz]; row--; 2686 sum2 -= v2[0] * tmp0; 2687 sum3 -= v3[1] * tmp0; 2688 sum4 -= v4[2] * tmp0; 2689 sum5 -= v5[3] * tmp0; 2690 tmp0 = x[c[row]] = tmp[row] = sum2*v2[nz+1]; row--; 2691 sum3 -= v3[0] * tmp0; 2692 sum4 -= v4[1] * tmp0; 2693 sum5 -= v5[2] * tmp0; 2694 tmp0 = x[c[row]] = tmp[row] = sum3*v3[nz+2]; row--; 2695 sum4 -= v4[0] * tmp0; 2696 sum5 -= v5[1] * tmp0; 2697 tmp0 = x[c[row]] = tmp[row] = sum4*v4[nz+3]; row--; 2698 sum5 -= v5[0] * tmp0; 2699 x[c[row]] = tmp[row] = sum5*v5[nz+4]; row--; 2700 break; 2701 default: 2702 SETERRQ(PETSC_COMM_SELF,PETSC_ERR_COR,"Node size not yet supported \n"); 2703 } 2704 } 2705 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2706 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2707 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2708 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2709 ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 2710 PetscFunctionReturn(0); 2711 } 2712 2713 2714 /* 2715 Makes a longer coloring[] array and calls the usual code with that 2716 */ 2717 #undef __FUNCT__ 2718 #define __FUNCT__ "MatColoringPatch_SeqAIJ_Inode" 2719 PetscErrorCode MatColoringPatch_SeqAIJ_Inode(Mat mat,PetscInt ncolors,PetscInt nin,ISColoringValue coloring[],ISColoring *iscoloring) 2720 { 2721 Mat_SeqAIJ *a = (Mat_SeqAIJ*)mat->data; 2722 PetscErrorCode ierr; 2723 PetscInt n = mat->cmap->n,m = a->inode.node_count,j,*ns = a->inode.size,row; 2724 PetscInt *colorused,i; 2725 ISColoringValue *newcolor; 2726 2727 PetscFunctionBegin; 2728 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&newcolor);CHKERRQ(ierr); 2729 /* loop over inodes, marking a color for each column*/ 2730 row = 0; 2731 for (i=0; i<m; i++){ 2732 for (j=0; j<ns[i]; j++) { 2733 newcolor[row++] = coloring[i] + j*ncolors; 2734 } 2735 } 2736 2737 /* eliminate unneeded colors */ 2738 ierr = PetscMalloc(5*ncolors*sizeof(PetscInt),&colorused);CHKERRQ(ierr); 2739 ierr = PetscMemzero(colorused,5*ncolors*sizeof(PetscInt));CHKERRQ(ierr); 2740 for (i=0; i<n; i++) { 2741 colorused[newcolor[i]] = 1; 2742 } 2743 2744 for (i=1; i<5*ncolors; i++) { 2745 colorused[i] += colorused[i-1]; 2746 } 2747 ncolors = colorused[5*ncolors-1]; 2748 for (i=0; i<n; i++) { 2749 newcolor[i] = colorused[newcolor[i]]-1; 2750 } 2751 ierr = PetscFree(colorused);CHKERRQ(ierr); 2752 ierr = ISColoringCreate(((PetscObject)mat)->comm,ncolors,n,newcolor,iscoloring);CHKERRQ(ierr); 2753 ierr = PetscFree(coloring);CHKERRQ(ierr); 2754 PetscFunctionReturn(0); 2755 } 2756 2757 #include <../src/mat/blockinvert.h> 2758 2759 #undef __FUNCT__ 2760 #define __FUNCT__ "MatSOR_SeqAIJ_Inode" 2761 PetscErrorCode MatSOR_SeqAIJ_Inode(Mat A,Vec bb,PetscReal omega,MatSORType flag,PetscReal fshift,PetscInt its,PetscInt lits,Vec xx) 2762 { 2763 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 2764 PetscScalar sum1,sum2,sum3,sum4,sum5,tmp0,tmp1,tmp2,tmp3; 2765 MatScalar *ibdiag,*bdiag,work[25],*t; 2766 PetscScalar *x,tmp4,tmp5,x1,x2,x3,x4,x5; 2767 const MatScalar *v = a->a,*v1,*v2,*v3,*v4,*v5; 2768 const PetscScalar *xb, *b; 2769 PetscReal zeropivot = 1.0e-15, shift = 0.0; 2770 PetscErrorCode ierr; 2771 PetscInt n,m = a->inode.node_count,*sizes = a->inode.size,cnt = 0,i,j,row,i1,i2; 2772 PetscInt *idx,*diag = a->diag,*ii = a->i,sz,k,ipvt[5]; 2773 2774 PetscFunctionBegin; 2775 if (omega != 1.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for omega != 1.0; use -mat_no_inode"); 2776 if (fshift != 0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for fshift != 0.0; use -mat_no_inode"); 2777 2778 if (!a->inode.ibdiagvalid) { 2779 if (!a->inode.ibdiag) { 2780 /* calculate space needed for diagonal blocks */ 2781 for (i=0; i<m; i++) { 2782 cnt += sizes[i]*sizes[i]; 2783 } 2784 a->inode.bdiagsize = cnt; 2785 ierr = PetscMalloc3(cnt,MatScalar,&a->inode.ibdiag,cnt,MatScalar,&a->inode.bdiag,A->rmap->n,MatScalar,&a->inode.ssor_work);CHKERRQ(ierr); 2786 } 2787 2788 /* copy over the diagonal blocks and invert them */ 2789 ibdiag = a->inode.ibdiag; 2790 bdiag = a->inode.bdiag; 2791 cnt = 0; 2792 for (i=0, row = 0; i<m; i++) { 2793 for (j=0; j<sizes[i]; j++) { 2794 for (k=0; k<sizes[i]; k++) { 2795 bdiag[cnt+k*sizes[i]+j] = v[diag[row+j] - j + k]; 2796 } 2797 } 2798 ierr = PetscMemcpy(ibdiag+cnt,bdiag+cnt,sizes[i]*sizes[i]*sizeof(MatScalar));CHKERRQ(ierr); 2799 2800 switch(sizes[i]) { 2801 case 1: 2802 /* Create matrix data structure */ 2803 if (PetscAbsScalar(ibdiag[cnt]) < zeropivot) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot on row %D",row); 2804 ibdiag[cnt] = 1.0/ibdiag[cnt]; 2805 break; 2806 case 2: 2807 ierr = PetscKernel_A_gets_inverse_A_2(ibdiag+cnt,shift);CHKERRQ(ierr); 2808 break; 2809 case 3: 2810 ierr = PetscKernel_A_gets_inverse_A_3(ibdiag+cnt,shift);CHKERRQ(ierr); 2811 break; 2812 case 4: 2813 ierr = PetscKernel_A_gets_inverse_A_4(ibdiag+cnt,shift);CHKERRQ(ierr); 2814 break; 2815 case 5: 2816 ierr = PetscKernel_A_gets_inverse_A_5(ibdiag+cnt,ipvt,work,shift);CHKERRQ(ierr); 2817 break; 2818 default: 2819 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 2820 } 2821 cnt += sizes[i]*sizes[i]; 2822 row += sizes[i]; 2823 } 2824 a->inode.ibdiagvalid = PETSC_TRUE; 2825 } 2826 ibdiag = a->inode.ibdiag; 2827 bdiag = a->inode.bdiag; 2828 t = a->inode.ssor_work; 2829 2830 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2831 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2832 CHKMEMQ; 2833 /* We count flops by assuming the upper triangular and lower triangular parts have the same number of nonzeros */ 2834 if (flag & SOR_ZERO_INITIAL_GUESS) { 2835 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 2836 2837 for (i=0, row=0; i<m; i++) { 2838 sz = diag[row] - ii[row]; 2839 v1 = a->a + ii[row]; 2840 idx = a->j + ii[row]; 2841 2842 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 2843 switch (sizes[i]){ 2844 case 1: 2845 2846 sum1 = b[row]; 2847 for(n = 0; n<sz-1; n+=2) { 2848 i1 = idx[0]; 2849 i2 = idx[1]; 2850 idx += 2; 2851 tmp0 = x[i1]; 2852 tmp1 = x[i2]; 2853 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2854 } 2855 2856 if (n == sz-1){ 2857 tmp0 = x[*idx]; 2858 sum1 -= *v1 * tmp0; 2859 } 2860 t[row] = sum1; 2861 x[row++] = sum1*(*ibdiag++); 2862 break; 2863 case 2: 2864 v2 = a->a + ii[row+1]; 2865 sum1 = b[row]; 2866 sum2 = b[row+1]; 2867 for(n = 0; n<sz-1; n+=2) { 2868 i1 = idx[0]; 2869 i2 = idx[1]; 2870 idx += 2; 2871 tmp0 = x[i1]; 2872 tmp1 = x[i2]; 2873 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2874 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2875 } 2876 2877 if (n == sz-1){ 2878 tmp0 = x[*idx]; 2879 sum1 -= v1[0] * tmp0; 2880 sum2 -= v2[0] * tmp0; 2881 } 2882 t[row] = sum1; 2883 t[row+1] = sum2; 2884 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[2]; 2885 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[3]; 2886 ibdiag += 4; 2887 break; 2888 case 3: 2889 v2 = a->a + ii[row+1]; 2890 v3 = a->a + ii[row+2]; 2891 sum1 = b[row]; 2892 sum2 = b[row+1]; 2893 sum3 = b[row+2]; 2894 for(n = 0; n<sz-1; n+=2) { 2895 i1 = idx[0]; 2896 i2 = idx[1]; 2897 idx += 2; 2898 tmp0 = x[i1]; 2899 tmp1 = x[i2]; 2900 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2901 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2902 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2903 } 2904 2905 if (n == sz-1){ 2906 tmp0 = x[*idx]; 2907 sum1 -= v1[0] * tmp0; 2908 sum2 -= v2[0] * tmp0; 2909 sum3 -= v3[0] * tmp0; 2910 } 2911 t[row] = sum1; 2912 t[row+1] = sum2; 2913 t[row+2] = sum3; 2914 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 2915 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 2916 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 2917 ibdiag += 9; 2918 break; 2919 case 4: 2920 v2 = a->a + ii[row+1]; 2921 v3 = a->a + ii[row+2]; 2922 v4 = a->a + ii[row+3]; 2923 sum1 = b[row]; 2924 sum2 = b[row+1]; 2925 sum3 = b[row+2]; 2926 sum4 = b[row+3]; 2927 for(n = 0; n<sz-1; n+=2) { 2928 i1 = idx[0]; 2929 i2 = idx[1]; 2930 idx += 2; 2931 tmp0 = x[i1]; 2932 tmp1 = x[i2]; 2933 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2934 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2935 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2936 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2937 } 2938 2939 if (n == sz-1){ 2940 tmp0 = x[*idx]; 2941 sum1 -= v1[0] * tmp0; 2942 sum2 -= v2[0] * tmp0; 2943 sum3 -= v3[0] * tmp0; 2944 sum4 -= v4[0] * tmp0; 2945 } 2946 t[row] = sum1; 2947 t[row+1] = sum2; 2948 t[row+2] = sum3; 2949 t[row+3] = sum4; 2950 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 2951 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 2952 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 2953 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 2954 ibdiag += 16; 2955 break; 2956 case 5: 2957 v2 = a->a + ii[row+1]; 2958 v3 = a->a + ii[row+2]; 2959 v4 = a->a + ii[row+3]; 2960 v5 = a->a + ii[row+4]; 2961 sum1 = b[row]; 2962 sum2 = b[row+1]; 2963 sum3 = b[row+2]; 2964 sum4 = b[row+3]; 2965 sum5 = b[row+4]; 2966 for(n = 0; n<sz-1; n+=2) { 2967 i1 = idx[0]; 2968 i2 = idx[1]; 2969 idx += 2; 2970 tmp0 = x[i1]; 2971 tmp1 = x[i2]; 2972 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 2973 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 2974 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 2975 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 2976 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 2977 } 2978 2979 if (n == sz-1){ 2980 tmp0 = x[*idx]; 2981 sum1 -= v1[0] * tmp0; 2982 sum2 -= v2[0] * tmp0; 2983 sum3 -= v3[0] * tmp0; 2984 sum4 -= v4[0] * tmp0; 2985 sum5 -= v5[0] * tmp0; 2986 } 2987 t[row] = sum1; 2988 t[row+1] = sum2; 2989 t[row+2] = sum3; 2990 t[row+3] = sum4; 2991 t[row+4] = sum5; 2992 x[row++] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 2993 x[row++] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 2994 x[row++] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 2995 x[row++] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 2996 x[row++] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 2997 ibdiag += 25; 2998 break; 2999 default: 3000 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3001 } 3002 } 3003 3004 xb = t; 3005 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3006 } else xb = b; 3007 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 3008 3009 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3010 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3011 ibdiag -= sizes[i]*sizes[i]; 3012 sz = ii[row+1] - diag[row] - 1; 3013 v1 = a->a + diag[row] + 1; 3014 idx = a->j + diag[row] + 1; 3015 3016 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3017 switch (sizes[i]){ 3018 case 1: 3019 3020 sum1 = xb[row]; 3021 for(n = 0; n<sz-1; n+=2) { 3022 i1 = idx[0]; 3023 i2 = idx[1]; 3024 idx += 2; 3025 tmp0 = x[i1]; 3026 tmp1 = x[i2]; 3027 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3028 } 3029 3030 if (n == sz-1){ 3031 tmp0 = x[*idx]; 3032 sum1 -= *v1*tmp0; 3033 } 3034 x[row--] = sum1*(*ibdiag); 3035 break; 3036 3037 case 2: 3038 3039 sum1 = xb[row]; 3040 sum2 = xb[row-1]; 3041 /* note that sum1 is associated with the second of the two rows */ 3042 v2 = a->a + diag[row-1] + 2; 3043 for(n = 0; n<sz-1; n+=2) { 3044 i1 = idx[0]; 3045 i2 = idx[1]; 3046 idx += 2; 3047 tmp0 = x[i1]; 3048 tmp1 = x[i2]; 3049 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3050 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3051 } 3052 3053 if (n == sz-1){ 3054 tmp0 = x[*idx]; 3055 sum1 -= *v1*tmp0; 3056 sum2 -= *v2*tmp0; 3057 } 3058 x[row--] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3059 x[row--] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3060 break; 3061 case 3: 3062 3063 sum1 = xb[row]; 3064 sum2 = xb[row-1]; 3065 sum3 = xb[row-2]; 3066 v2 = a->a + diag[row-1] + 2; 3067 v3 = a->a + diag[row-2] + 3; 3068 for(n = 0; n<sz-1; n+=2) { 3069 i1 = idx[0]; 3070 i2 = idx[1]; 3071 idx += 2; 3072 tmp0 = x[i1]; 3073 tmp1 = x[i2]; 3074 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3075 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3076 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3077 } 3078 3079 if (n == sz-1){ 3080 tmp0 = x[*idx]; 3081 sum1 -= *v1*tmp0; 3082 sum2 -= *v2*tmp0; 3083 sum3 -= *v3*tmp0; 3084 } 3085 x[row--] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3086 x[row--] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3087 x[row--] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3088 break; 3089 case 4: 3090 3091 sum1 = xb[row]; 3092 sum2 = xb[row-1]; 3093 sum3 = xb[row-2]; 3094 sum4 = xb[row-3]; 3095 v2 = a->a + diag[row-1] + 2; 3096 v3 = a->a + diag[row-2] + 3; 3097 v4 = a->a + diag[row-3] + 4; 3098 for(n = 0; n<sz-1; n+=2) { 3099 i1 = idx[0]; 3100 i2 = idx[1]; 3101 idx += 2; 3102 tmp0 = x[i1]; 3103 tmp1 = x[i2]; 3104 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3105 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3106 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3107 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3108 } 3109 3110 if (n == sz-1){ 3111 tmp0 = x[*idx]; 3112 sum1 -= *v1*tmp0; 3113 sum2 -= *v2*tmp0; 3114 sum3 -= *v3*tmp0; 3115 sum4 -= *v4*tmp0; 3116 } 3117 x[row--] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3118 x[row--] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3119 x[row--] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3120 x[row--] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3121 break; 3122 case 5: 3123 3124 sum1 = xb[row]; 3125 sum2 = xb[row-1]; 3126 sum3 = xb[row-2]; 3127 sum4 = xb[row-3]; 3128 sum5 = xb[row-4]; 3129 v2 = a->a + diag[row-1] + 2; 3130 v3 = a->a + diag[row-2] + 3; 3131 v4 = a->a + diag[row-3] + 4; 3132 v5 = a->a + diag[row-4] + 5; 3133 for(n = 0; n<sz-1; n+=2) { 3134 i1 = idx[0]; 3135 i2 = idx[1]; 3136 idx += 2; 3137 tmp0 = x[i1]; 3138 tmp1 = x[i2]; 3139 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3140 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3141 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3142 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3143 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3144 } 3145 3146 if (n == sz-1){ 3147 tmp0 = x[*idx]; 3148 sum1 -= *v1*tmp0; 3149 sum2 -= *v2*tmp0; 3150 sum3 -= *v3*tmp0; 3151 sum4 -= *v4*tmp0; 3152 sum5 -= *v5*tmp0; 3153 } 3154 x[row--] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3155 x[row--] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3156 x[row--] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3157 x[row--] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3158 x[row--] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3159 break; 3160 default: 3161 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3162 } 3163 } 3164 3165 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3166 } 3167 its--; 3168 } 3169 while (its--) { 3170 3171 if (flag & SOR_FORWARD_SWEEP || flag & SOR_LOCAL_FORWARD_SWEEP){ 3172 ibdiag = a->inode.ibdiag; 3173 3174 for (i=0, row=0; i<m; i++) { 3175 sz = ii[row + 1] - ii[row]; 3176 v1 = a->a + ii[row]; 3177 idx = a->j + ii[row]; 3178 3179 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3180 switch (sizes[i]){ 3181 case 1: 3182 3183 sum1 = b[row]; 3184 for(n = 0; n<sz-1; n+=2) { 3185 i1 = idx[0]; 3186 i2 = idx[1]; 3187 idx += 2; 3188 tmp0 = x[i1]; 3189 tmp1 = x[i2]; 3190 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3191 } 3192 3193 if (n == sz-1){ 3194 tmp0 = x[*idx]; 3195 sum1 -= *v1 * tmp0; 3196 } 3197 3198 /* in MatSOR_SeqAIJ this line would be 3199 * 3200 * x[row] = (1-omega)*x[row]+(sum1+(*bdiag++)*x[row])*(*ibdiag++); 3201 * 3202 * but omega == 1, so this becomes 3203 * 3204 * x[row] = (sum1+(*bdiag++)*x[row])*(*ibdiag++); 3205 * 3206 * but bdiag and ibdiag cancel each other, so we can change this 3207 * to adding sum1*(*ibdiag++). We can skip bdiag for the larger 3208 * block sizes as well 3209 */ 3210 x[row++] += sum1*(*ibdiag++); 3211 break; 3212 case 2: 3213 v2 = a->a + ii[row+1]; 3214 sum1 = b[row]; 3215 sum2 = b[row+1]; 3216 for(n = 0; n<sz-1; n+=2) { 3217 i1 = idx[0]; 3218 i2 = idx[1]; 3219 idx += 2; 3220 tmp0 = x[i1]; 3221 tmp1 = x[i2]; 3222 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3223 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3224 } 3225 3226 if (n == sz-1){ 3227 tmp0 = x[*idx]; 3228 sum1 -= v1[0] * tmp0; 3229 sum2 -= v2[0] * tmp0; 3230 } 3231 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[2]; 3232 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[3]; 3233 ibdiag += 4; 3234 break; 3235 case 3: 3236 v2 = a->a + ii[row+1]; 3237 v3 = a->a + ii[row+2]; 3238 sum1 = b[row]; 3239 sum2 = b[row+1]; 3240 sum3 = b[row+2]; 3241 for(n = 0; n<sz-1; n+=2) { 3242 i1 = idx[0]; 3243 i2 = idx[1]; 3244 idx += 2; 3245 tmp0 = x[i1]; 3246 tmp1 = x[i2]; 3247 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3248 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3249 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3250 } 3251 3252 if (n == sz-1){ 3253 tmp0 = x[*idx]; 3254 sum1 -= v1[0] * tmp0; 3255 sum2 -= v2[0] * tmp0; 3256 sum3 -= v3[0] * tmp0; 3257 } 3258 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3259 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3260 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3261 ibdiag += 9; 3262 break; 3263 case 4: 3264 v2 = a->a + ii[row+1]; 3265 v3 = a->a + ii[row+2]; 3266 v4 = a->a + ii[row+3]; 3267 sum1 = b[row]; 3268 sum2 = b[row+1]; 3269 sum3 = b[row+2]; 3270 sum4 = b[row+3]; 3271 for(n = 0; n<sz-1; n+=2) { 3272 i1 = idx[0]; 3273 i2 = idx[1]; 3274 idx += 2; 3275 tmp0 = x[i1]; 3276 tmp1 = x[i2]; 3277 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3278 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3279 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3280 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3281 } 3282 3283 if (n == sz-1){ 3284 tmp0 = x[*idx]; 3285 sum1 -= v1[0] * tmp0; 3286 sum2 -= v2[0] * tmp0; 3287 sum3 -= v3[0] * tmp0; 3288 sum4 -= v4[0] * tmp0; 3289 } 3290 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3291 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3292 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3293 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3294 ibdiag += 16; 3295 break; 3296 case 5: 3297 v2 = a->a + ii[row+1]; 3298 v3 = a->a + ii[row+2]; 3299 v4 = a->a + ii[row+3]; 3300 v5 = a->a + ii[row+4]; 3301 sum1 = b[row]; 3302 sum2 = b[row+1]; 3303 sum3 = b[row+2]; 3304 sum4 = b[row+3]; 3305 sum5 = b[row+4]; 3306 for(n = 0; n<sz-1; n+=2) { 3307 i1 = idx[0]; 3308 i2 = idx[1]; 3309 idx += 2; 3310 tmp0 = x[i1]; 3311 tmp1 = x[i2]; 3312 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3313 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3314 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3315 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3316 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3317 } 3318 3319 if (n == sz-1){ 3320 tmp0 = x[*idx]; 3321 sum1 -= v1[0] * tmp0; 3322 sum2 -= v2[0] * tmp0; 3323 sum3 -= v3[0] * tmp0; 3324 sum4 -= v4[0] * tmp0; 3325 sum5 -= v5[0] * tmp0; 3326 } 3327 x[row++] += sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3328 x[row++] += sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3329 x[row++] += sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3330 x[row++] += sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3331 x[row++] += sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3332 ibdiag += 25; 3333 break; 3334 default: 3335 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3336 } 3337 } 3338 3339 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3340 } 3341 if (flag & SOR_BACKWARD_SWEEP || flag & SOR_LOCAL_BACKWARD_SWEEP){ 3342 3343 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3344 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3345 ibdiag -= sizes[i]*sizes[i]; 3346 sz = ii[row+1] - ii[row]; 3347 v1 = a->a + ii[row]; 3348 idx = a->j + ii[row]; 3349 3350 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3351 switch (sizes[i]){ 3352 case 1: 3353 3354 sum1 = b[row]; 3355 for(n = 0; n<sz-1; n+=2) { 3356 i1 = idx[0]; 3357 i2 = idx[1]; 3358 idx += 2; 3359 tmp0 = x[i1]; 3360 tmp1 = x[i2]; 3361 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3362 } 3363 3364 if (n == sz-1){ 3365 tmp0 = x[*idx]; 3366 sum1 -= *v1*tmp0; 3367 } 3368 x[row--] += sum1*(*ibdiag); 3369 break; 3370 3371 case 2: 3372 3373 sum1 = b[row]; 3374 sum2 = b[row-1]; 3375 /* note that sum1 is associated with the second of the two rows */ 3376 v2 = a->a + ii[row - 1]; 3377 for(n = 0; n<sz-1; n+=2) { 3378 i1 = idx[0]; 3379 i2 = idx[1]; 3380 idx += 2; 3381 tmp0 = x[i1]; 3382 tmp1 = x[i2]; 3383 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3384 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3385 } 3386 3387 if (n == sz-1){ 3388 tmp0 = x[*idx]; 3389 sum1 -= *v1*tmp0; 3390 sum2 -= *v2*tmp0; 3391 } 3392 x[row--] += sum2*ibdiag[1] + sum1*ibdiag[3]; 3393 x[row--] += sum2*ibdiag[0] + sum1*ibdiag[2]; 3394 break; 3395 case 3: 3396 3397 sum1 = b[row]; 3398 sum2 = b[row-1]; 3399 sum3 = b[row-2]; 3400 v2 = a->a + ii[row-1]; 3401 v3 = a->a + ii[row-2]; 3402 for(n = 0; n<sz-1; n+=2) { 3403 i1 = idx[0]; 3404 i2 = idx[1]; 3405 idx += 2; 3406 tmp0 = x[i1]; 3407 tmp1 = x[i2]; 3408 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3409 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3410 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3411 } 3412 3413 if (n == sz-1){ 3414 tmp0 = x[*idx]; 3415 sum1 -= *v1*tmp0; 3416 sum2 -= *v2*tmp0; 3417 sum3 -= *v3*tmp0; 3418 } 3419 x[row--] += sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3420 x[row--] += sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3421 x[row--] += sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3422 break; 3423 case 4: 3424 3425 sum1 = b[row]; 3426 sum2 = b[row-1]; 3427 sum3 = b[row-2]; 3428 sum4 = b[row-3]; 3429 v2 = a->a + ii[row-1]; 3430 v3 = a->a + ii[row-2]; 3431 v4 = a->a + ii[row-3]; 3432 for(n = 0; n<sz-1; n+=2) { 3433 i1 = idx[0]; 3434 i2 = idx[1]; 3435 idx += 2; 3436 tmp0 = x[i1]; 3437 tmp1 = x[i2]; 3438 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3439 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3440 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3441 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3442 } 3443 3444 if (n == sz-1){ 3445 tmp0 = x[*idx]; 3446 sum1 -= *v1*tmp0; 3447 sum2 -= *v2*tmp0; 3448 sum3 -= *v3*tmp0; 3449 sum4 -= *v4*tmp0; 3450 } 3451 x[row--] += sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3452 x[row--] += sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3453 x[row--] += sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3454 x[row--] += sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3455 break; 3456 case 5: 3457 3458 sum1 = b[row]; 3459 sum2 = b[row-1]; 3460 sum3 = b[row-2]; 3461 sum4 = b[row-3]; 3462 sum5 = b[row-4]; 3463 v2 = a->a + ii[row-1]; 3464 v3 = a->a + ii[row-2]; 3465 v4 = a->a + ii[row-3]; 3466 v5 = a->a + ii[row-4]; 3467 for(n = 0; n<sz-1; n+=2) { 3468 i1 = idx[0]; 3469 i2 = idx[1]; 3470 idx += 2; 3471 tmp0 = x[i1]; 3472 tmp1 = x[i2]; 3473 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3474 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3475 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3476 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3477 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3478 } 3479 3480 if (n == sz-1){ 3481 tmp0 = x[*idx]; 3482 sum1 -= *v1*tmp0; 3483 sum2 -= *v2*tmp0; 3484 sum3 -= *v3*tmp0; 3485 sum4 -= *v4*tmp0; 3486 sum5 -= *v5*tmp0; 3487 } 3488 x[row--] += sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3489 x[row--] += sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3490 x[row--] += sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3491 x[row--] += sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3492 x[row--] += sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3493 break; 3494 default: 3495 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3496 } 3497 } 3498 3499 ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr); 3500 } 3501 } 3502 if (flag & SOR_EISENSTAT) { 3503 /* 3504 Apply (U + D)^-1 where D is now the block diagonal 3505 */ 3506 ibdiag = a->inode.ibdiag+a->inode.bdiagsize; 3507 for (i=m-1, row=A->rmap->n-1; i>=0; i--) { 3508 ibdiag -= sizes[i]*sizes[i]; 3509 sz = ii[row+1] - diag[row] - 1; 3510 v1 = a->a + diag[row] + 1; 3511 idx = a->j + diag[row] + 1; 3512 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3513 switch (sizes[i]){ 3514 case 1: 3515 3516 sum1 = b[row]; 3517 for(n = 0; n<sz-1; n+=2) { 3518 i1 = idx[0]; 3519 i2 = idx[1]; 3520 idx += 2; 3521 tmp0 = x[i1]; 3522 tmp1 = x[i2]; 3523 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3524 } 3525 3526 if (n == sz-1){ 3527 tmp0 = x[*idx]; 3528 sum1 -= *v1*tmp0; 3529 } 3530 x[row] = sum1*(*ibdiag);row--; 3531 break; 3532 3533 case 2: 3534 3535 sum1 = b[row]; 3536 sum2 = b[row-1]; 3537 /* note that sum1 is associated with the second of the two rows */ 3538 v2 = a->a + diag[row-1] + 2; 3539 for(n = 0; n<sz-1; n+=2) { 3540 i1 = idx[0]; 3541 i2 = idx[1]; 3542 idx += 2; 3543 tmp0 = x[i1]; 3544 tmp1 = x[i2]; 3545 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3546 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3547 } 3548 3549 if (n == sz-1){ 3550 tmp0 = x[*idx]; 3551 sum1 -= *v1*tmp0; 3552 sum2 -= *v2*tmp0; 3553 } 3554 x[row] = sum2*ibdiag[1] + sum1*ibdiag[3]; 3555 x[row-1] = sum2*ibdiag[0] + sum1*ibdiag[2]; 3556 row -= 2; 3557 break; 3558 case 3: 3559 3560 sum1 = b[row]; 3561 sum2 = b[row-1]; 3562 sum3 = b[row-2]; 3563 v2 = a->a + diag[row-1] + 2; 3564 v3 = a->a + diag[row-2] + 3; 3565 for(n = 0; n<sz-1; n+=2) { 3566 i1 = idx[0]; 3567 i2 = idx[1]; 3568 idx += 2; 3569 tmp0 = x[i1]; 3570 tmp1 = x[i2]; 3571 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3572 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3573 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3574 } 3575 3576 if (n == sz-1){ 3577 tmp0 = x[*idx]; 3578 sum1 -= *v1*tmp0; 3579 sum2 -= *v2*tmp0; 3580 sum3 -= *v3*tmp0; 3581 } 3582 x[row] = sum3*ibdiag[2] + sum2*ibdiag[5] + sum1*ibdiag[8]; 3583 x[row-1] = sum3*ibdiag[1] + sum2*ibdiag[4] + sum1*ibdiag[7]; 3584 x[row-2] = sum3*ibdiag[0] + sum2*ibdiag[3] + sum1*ibdiag[6]; 3585 row -= 3; 3586 break; 3587 case 4: 3588 3589 sum1 = b[row]; 3590 sum2 = b[row-1]; 3591 sum3 = b[row-2]; 3592 sum4 = b[row-3]; 3593 v2 = a->a + diag[row-1] + 2; 3594 v3 = a->a + diag[row-2] + 3; 3595 v4 = a->a + diag[row-3] + 4; 3596 for(n = 0; n<sz-1; n+=2) { 3597 i1 = idx[0]; 3598 i2 = idx[1]; 3599 idx += 2; 3600 tmp0 = x[i1]; 3601 tmp1 = x[i2]; 3602 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3603 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3604 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3605 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3606 } 3607 3608 if (n == sz-1){ 3609 tmp0 = x[*idx]; 3610 sum1 -= *v1*tmp0; 3611 sum2 -= *v2*tmp0; 3612 sum3 -= *v3*tmp0; 3613 sum4 -= *v4*tmp0; 3614 } 3615 x[row] = sum4*ibdiag[3] + sum3*ibdiag[7] + sum2*ibdiag[11] + sum1*ibdiag[15]; 3616 x[row-1] = sum4*ibdiag[2] + sum3*ibdiag[6] + sum2*ibdiag[10] + sum1*ibdiag[14]; 3617 x[row-2] = sum4*ibdiag[1] + sum3*ibdiag[5] + sum2*ibdiag[9] + sum1*ibdiag[13]; 3618 x[row-3] = sum4*ibdiag[0] + sum3*ibdiag[4] + sum2*ibdiag[8] + sum1*ibdiag[12]; 3619 row -= 4; 3620 break; 3621 case 5: 3622 3623 sum1 = b[row]; 3624 sum2 = b[row-1]; 3625 sum3 = b[row-2]; 3626 sum4 = b[row-3]; 3627 sum5 = b[row-4]; 3628 v2 = a->a + diag[row-1] + 2; 3629 v3 = a->a + diag[row-2] + 3; 3630 v4 = a->a + diag[row-3] + 4; 3631 v5 = a->a + diag[row-4] + 5; 3632 for(n = 0; n<sz-1; n+=2) { 3633 i1 = idx[0]; 3634 i2 = idx[1]; 3635 idx += 2; 3636 tmp0 = x[i1]; 3637 tmp1 = x[i2]; 3638 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3639 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3640 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3641 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3642 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3643 } 3644 3645 if (n == sz-1){ 3646 tmp0 = x[*idx]; 3647 sum1 -= *v1*tmp0; 3648 sum2 -= *v2*tmp0; 3649 sum3 -= *v3*tmp0; 3650 sum4 -= *v4*tmp0; 3651 sum5 -= *v5*tmp0; 3652 } 3653 x[row] = sum5*ibdiag[4] + sum4*ibdiag[9] + sum3*ibdiag[14] + sum2*ibdiag[19] + sum1*ibdiag[24]; 3654 x[row-1] = sum5*ibdiag[3] + sum4*ibdiag[8] + sum3*ibdiag[13] + sum2*ibdiag[18] + sum1*ibdiag[23]; 3655 x[row-2] = sum5*ibdiag[2] + sum4*ibdiag[7] + sum3*ibdiag[12] + sum2*ibdiag[17] + sum1*ibdiag[22]; 3656 x[row-3] = sum5*ibdiag[1] + sum4*ibdiag[6] + sum3*ibdiag[11] + sum2*ibdiag[16] + sum1*ibdiag[21]; 3657 x[row-4] = sum5*ibdiag[0] + sum4*ibdiag[5] + sum3*ibdiag[10] + sum2*ibdiag[15] + sum1*ibdiag[20]; 3658 row -= 5; 3659 break; 3660 default: 3661 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3662 } 3663 } 3664 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3665 3666 /* 3667 t = b - D x where D is the block diagonal 3668 */ 3669 cnt = 0; 3670 for (i=0, row=0; i<m; i++) { 3671 switch (sizes[i]){ 3672 case 1: 3673 t[row] = b[row] - bdiag[cnt++]*x[row]; row++; 3674 break; 3675 case 2: 3676 x1 = x[row]; x2 = x[row+1]; 3677 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3678 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3679 t[row] = b[row] - tmp1; 3680 t[row+1] = b[row+1] - tmp2; row += 2; 3681 cnt += 4; 3682 break; 3683 case 3: 3684 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; 3685 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3686 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3687 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3688 t[row] = b[row] - tmp1; 3689 t[row+1] = b[row+1] - tmp2; 3690 t[row+2] = b[row+2] - tmp3; row += 3; 3691 cnt += 9; 3692 break; 3693 case 4: 3694 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; 3695 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3696 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3697 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3698 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3699 t[row] = b[row] - tmp1; 3700 t[row+1] = b[row+1] - tmp2; 3701 t[row+2] = b[row+2] - tmp3; 3702 t[row+3] = b[row+3] - tmp4; row += 4; 3703 cnt += 16; 3704 break; 3705 case 5: 3706 x1 = x[row]; x2 = x[row+1]; x3 = x[row+2]; x4 = x[row+3]; x5 = x[row+4]; 3707 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3708 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3709 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3710 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3711 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3712 t[row] = b[row] - tmp1; 3713 t[row+1] = b[row+1] - tmp2; 3714 t[row+2] = b[row+2] - tmp3; 3715 t[row+3] = b[row+3] - tmp4; 3716 t[row+4] = b[row+4] - tmp5;row += 5; 3717 cnt += 25; 3718 break; 3719 default: 3720 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3721 } 3722 } 3723 ierr = PetscLogFlops(m);CHKERRQ(ierr); 3724 3725 3726 3727 /* 3728 Apply (L + D)^-1 where D is the block diagonal 3729 */ 3730 for (i=0, row=0; i<m; i++) { 3731 sz = diag[row] - ii[row]; 3732 v1 = a->a + ii[row]; 3733 idx = a->j + ii[row]; 3734 /* see comments for MatMult_SeqAIJ_Inode() for how this is coded */ 3735 switch (sizes[i]){ 3736 case 1: 3737 3738 sum1 = t[row]; 3739 for(n = 0; n<sz-1; n+=2) { 3740 i1 = idx[0]; 3741 i2 = idx[1]; 3742 idx += 2; 3743 tmp0 = t[i1]; 3744 tmp1 = t[i2]; 3745 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3746 } 3747 3748 if (n == sz-1){ 3749 tmp0 = t[*idx]; 3750 sum1 -= *v1 * tmp0; 3751 } 3752 x[row] += t[row] = sum1*(*ibdiag++); row++; 3753 break; 3754 case 2: 3755 v2 = a->a + ii[row+1]; 3756 sum1 = t[row]; 3757 sum2 = t[row+1]; 3758 for(n = 0; n<sz-1; n+=2) { 3759 i1 = idx[0]; 3760 i2 = idx[1]; 3761 idx += 2; 3762 tmp0 = t[i1]; 3763 tmp1 = t[i2]; 3764 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3765 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3766 } 3767 3768 if (n == sz-1){ 3769 tmp0 = t[*idx]; 3770 sum1 -= v1[0] * tmp0; 3771 sum2 -= v2[0] * tmp0; 3772 } 3773 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[2]; 3774 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[3]; 3775 ibdiag += 4; row += 2; 3776 break; 3777 case 3: 3778 v2 = a->a + ii[row+1]; 3779 v3 = a->a + ii[row+2]; 3780 sum1 = t[row]; 3781 sum2 = t[row+1]; 3782 sum3 = t[row+2]; 3783 for(n = 0; n<sz-1; n+=2) { 3784 i1 = idx[0]; 3785 i2 = idx[1]; 3786 idx += 2; 3787 tmp0 = t[i1]; 3788 tmp1 = t[i2]; 3789 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3790 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3791 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3792 } 3793 3794 if (n == sz-1){ 3795 tmp0 = t[*idx]; 3796 sum1 -= v1[0] * tmp0; 3797 sum2 -= v2[0] * tmp0; 3798 sum3 -= v3[0] * tmp0; 3799 } 3800 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[3] + sum3*ibdiag[6]; 3801 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[4] + sum3*ibdiag[7]; 3802 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[5] + sum3*ibdiag[8]; 3803 ibdiag += 9; row += 3; 3804 break; 3805 case 4: 3806 v2 = a->a + ii[row+1]; 3807 v3 = a->a + ii[row+2]; 3808 v4 = a->a + ii[row+3]; 3809 sum1 = t[row]; 3810 sum2 = t[row+1]; 3811 sum3 = t[row+2]; 3812 sum4 = t[row+3]; 3813 for(n = 0; n<sz-1; n+=2) { 3814 i1 = idx[0]; 3815 i2 = idx[1]; 3816 idx += 2; 3817 tmp0 = t[i1]; 3818 tmp1 = t[i2]; 3819 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3820 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3821 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3822 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3823 } 3824 3825 if (n == sz-1){ 3826 tmp0 = t[*idx]; 3827 sum1 -= v1[0] * tmp0; 3828 sum2 -= v2[0] * tmp0; 3829 sum3 -= v3[0] * tmp0; 3830 sum4 -= v4[0] * tmp0; 3831 } 3832 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[4] + sum3*ibdiag[8] + sum4*ibdiag[12]; 3833 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[5] + sum3*ibdiag[9] + sum4*ibdiag[13]; 3834 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[6] + sum3*ibdiag[10] + sum4*ibdiag[14]; 3835 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[7] + sum3*ibdiag[11] + sum4*ibdiag[15]; 3836 ibdiag += 16; row += 4; 3837 break; 3838 case 5: 3839 v2 = a->a + ii[row+1]; 3840 v3 = a->a + ii[row+2]; 3841 v4 = a->a + ii[row+3]; 3842 v5 = a->a + ii[row+4]; 3843 sum1 = t[row]; 3844 sum2 = t[row+1]; 3845 sum3 = t[row+2]; 3846 sum4 = t[row+3]; 3847 sum5 = t[row+4]; 3848 for(n = 0; n<sz-1; n+=2) { 3849 i1 = idx[0]; 3850 i2 = idx[1]; 3851 idx += 2; 3852 tmp0 = t[i1]; 3853 tmp1 = t[i2]; 3854 sum1 -= v1[0] * tmp0 + v1[1] * tmp1; v1 += 2; 3855 sum2 -= v2[0] * tmp0 + v2[1] * tmp1; v2 += 2; 3856 sum3 -= v3[0] * tmp0 + v3[1] * tmp1; v3 += 2; 3857 sum4 -= v4[0] * tmp0 + v4[1] * tmp1; v4 += 2; 3858 sum5 -= v5[0] * tmp0 + v5[1] * tmp1; v5 += 2; 3859 } 3860 3861 if (n == sz-1){ 3862 tmp0 = t[*idx]; 3863 sum1 -= v1[0] * tmp0; 3864 sum2 -= v2[0] * tmp0; 3865 sum3 -= v3[0] * tmp0; 3866 sum4 -= v4[0] * tmp0; 3867 sum5 -= v5[0] * tmp0; 3868 } 3869 x[row] += t[row] = sum1*ibdiag[0] + sum2*ibdiag[5] + sum3*ibdiag[10] + sum4*ibdiag[15] + sum5*ibdiag[20]; 3870 x[row+1] += t[row+1] = sum1*ibdiag[1] + sum2*ibdiag[6] + sum3*ibdiag[11] + sum4*ibdiag[16] + sum5*ibdiag[21]; 3871 x[row+2] += t[row+2] = sum1*ibdiag[2] + sum2*ibdiag[7] + sum3*ibdiag[12] + sum4*ibdiag[17] + sum5*ibdiag[22]; 3872 x[row+3] += t[row+3] = sum1*ibdiag[3] + sum2*ibdiag[8] + sum3*ibdiag[13] + sum4*ibdiag[18] + sum5*ibdiag[23]; 3873 x[row+4] += t[row+4] = sum1*ibdiag[4] + sum2*ibdiag[9] + sum3*ibdiag[14] + sum4*ibdiag[19] + sum5*ibdiag[24]; 3874 ibdiag += 25; row += 5; 3875 break; 3876 default: 3877 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3878 } 3879 } 3880 ierr = PetscLogFlops(a->nz);CHKERRQ(ierr); 3881 } 3882 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3883 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3884 CHKMEMQ; PetscFunctionReturn(0); 3885 } 3886 3887 #undef __FUNCT__ 3888 #define __FUNCT__ "MatMultDiagonalBlock_SeqAIJ_Inode" 3889 PetscErrorCode MatMultDiagonalBlock_SeqAIJ_Inode(Mat A,Vec bb,Vec xx) 3890 { 3891 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3892 PetscScalar *x,tmp1,tmp2,tmp3,tmp4,tmp5,x1,x2,x3,x4,x5; 3893 const MatScalar *bdiag = a->inode.bdiag; 3894 const PetscScalar *b; 3895 PetscErrorCode ierr; 3896 PetscInt m = a->inode.node_count,cnt = 0,i,row; 3897 const PetscInt *sizes = a->inode.size; 3898 3899 PetscFunctionBegin; 3900 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3901 ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3902 cnt = 0; 3903 for (i=0, row=0; i<m; i++) { 3904 switch (sizes[i]){ 3905 case 1: 3906 x[row] = b[row]*bdiag[cnt++];row++; 3907 break; 3908 case 2: 3909 x1 = b[row]; x2 = b[row+1]; 3910 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+2]; 3911 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+3]; 3912 x[row++] = tmp1; 3913 x[row++] = tmp2; 3914 cnt += 4; 3915 break; 3916 case 3: 3917 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; 3918 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+3] + x3*bdiag[cnt+6]; 3919 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+4] + x3*bdiag[cnt+7]; 3920 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+5] + x3*bdiag[cnt+8]; 3921 x[row++] = tmp1; 3922 x[row++] = tmp2; 3923 x[row++] = tmp3; 3924 cnt += 9; 3925 break; 3926 case 4: 3927 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; 3928 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+4] + x3*bdiag[cnt+8] + x4*bdiag[cnt+12]; 3929 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+5] + x3*bdiag[cnt+9] + x4*bdiag[cnt+13]; 3930 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+6] + x3*bdiag[cnt+10] + x4*bdiag[cnt+14]; 3931 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+7] + x3*bdiag[cnt+11] + x4*bdiag[cnt+15]; 3932 x[row++] = tmp1; 3933 x[row++] = tmp2; 3934 x[row++] = tmp3; 3935 x[row++] = tmp4; 3936 cnt += 16; 3937 break; 3938 case 5: 3939 x1 = b[row]; x2 = b[row+1]; x3 = b[row+2]; x4 = b[row+3]; x5 = b[row+4]; 3940 tmp1 = x1*bdiag[cnt] + x2*bdiag[cnt+5] + x3*bdiag[cnt+10] + x4*bdiag[cnt+15] + x5*bdiag[cnt+20]; 3941 tmp2 = x1*bdiag[cnt+1] + x2*bdiag[cnt+6] + x3*bdiag[cnt+11] + x4*bdiag[cnt+16] + x5*bdiag[cnt+21]; 3942 tmp3 = x1*bdiag[cnt+2] + x2*bdiag[cnt+7] + x3*bdiag[cnt+12] + x4*bdiag[cnt+17] + x5*bdiag[cnt+22]; 3943 tmp4 = x1*bdiag[cnt+3] + x2*bdiag[cnt+8] + x3*bdiag[cnt+13] + x4*bdiag[cnt+18] + x5*bdiag[cnt+23]; 3944 tmp5 = x1*bdiag[cnt+4] + x2*bdiag[cnt+9] + x3*bdiag[cnt+14] + x4*bdiag[cnt+19] + x5*bdiag[cnt+24]; 3945 x[row++] = tmp1; 3946 x[row++] = tmp2; 3947 x[row++] = tmp3; 3948 x[row++] = tmp4; 3949 x[row++] = tmp5; 3950 cnt += 25; 3951 break; 3952 default: 3953 SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_SUP,"Inode size %D not supported",sizes[i]); 3954 } 3955 } 3956 ierr = PetscLogFlops(2*cnt);CHKERRQ(ierr); 3957 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3958 ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3959 PetscFunctionReturn(0); 3960 } 3961 3962 /* 3963 samestructure indicates that the matrix has not changed its nonzero structure so we 3964 do not need to recompute the inodes 3965 */ 3966 #undef __FUNCT__ 3967 #define __FUNCT__ "Mat_CheckInode" 3968 PetscErrorCode Mat_CheckInode(Mat A,PetscBool samestructure) 3969 { 3970 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 3971 PetscErrorCode ierr; 3972 PetscInt i,j,m,nzx,nzy,*idx,*idy,*ns,*ii,node_count,blk_size; 3973 PetscBool flag; 3974 3975 PetscFunctionBegin; 3976 if (!a->inode.use) PetscFunctionReturn(0); 3977 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 3978 3979 3980 m = A->rmap->n; 3981 if (a->inode.size) {ns = a->inode.size;} 3982 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 3983 3984 i = 0; 3985 node_count = 0; 3986 idx = a->j; 3987 ii = a->i; 3988 while (i < m){ /* For each row */ 3989 nzx = ii[i+1] - ii[i]; /* Number of nonzeros */ 3990 /* Limits the number of elements in a node to 'a->inode.limit' */ 3991 for (j=i+1,idy=idx,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 3992 nzy = ii[j+1] - ii[j]; /* Same number of nonzeros */ 3993 if (nzy != nzx) break; 3994 idy += nzx; /* Same nonzero pattern */ 3995 ierr = PetscMemcmp(idx,idy,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 3996 if (!flag) break; 3997 } 3998 ns[node_count++] = blk_size; 3999 idx += blk_size*nzx; 4000 i = j; 4001 } 4002 /* If not enough inodes found,, do not use inode version of the routines */ 4003 if (!m || node_count > .8*m) { 4004 ierr = PetscFree(ns);CHKERRQ(ierr); 4005 a->inode.node_count = 0; 4006 a->inode.size = PETSC_NULL; 4007 a->inode.use = PETSC_FALSE; 4008 A->ops->mult = MatMult_SeqAIJ; 4009 A->ops->sor = MatSOR_SeqAIJ; 4010 A->ops->multadd = MatMultAdd_SeqAIJ; 4011 A->ops->getrowij = MatGetRowIJ_SeqAIJ; 4012 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ; 4013 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ; 4014 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ; 4015 A->ops->coloringpatch = 0; 4016 A->ops->multdiagonalblock = 0; 4017 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4018 } else { 4019 if (!A->factortype) { 4020 A->ops->mult = MatMult_SeqAIJ_Inode; 4021 A->ops->sor = MatSOR_SeqAIJ_Inode; 4022 A->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4023 A->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4024 A->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4025 A->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4026 A->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4027 A->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4028 A->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4029 } else { 4030 A->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4031 } 4032 a->inode.node_count = node_count; 4033 a->inode.size = ns; 4034 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4035 } 4036 a->inode.checked = PETSC_TRUE; 4037 PetscFunctionReturn(0); 4038 } 4039 4040 #undef __FUNCT__ 4041 #define __FUNCT__ "MatDuplicate_SeqAIJ_Inode" 4042 PetscErrorCode MatDuplicate_SeqAIJ_Inode(Mat A,MatDuplicateOption cpvalues,Mat *C) 4043 { 4044 Mat B=*C; 4045 Mat_SeqAIJ *c=(Mat_SeqAIJ*)B->data,*a=(Mat_SeqAIJ*)A->data; 4046 PetscErrorCode ierr; 4047 PetscInt m=A->rmap->n; 4048 4049 PetscFunctionBegin; 4050 4051 c->inode.use = a->inode.use; 4052 c->inode.limit = a->inode.limit; 4053 c->inode.max_limit = a->inode.max_limit; 4054 if (a->inode.size){ 4055 ierr = PetscMalloc((m+1)*sizeof(PetscInt),&c->inode.size);CHKERRQ(ierr); 4056 c->inode.node_count = a->inode.node_count; 4057 ierr = PetscMemcpy(c->inode.size,a->inode.size,(m+1)*sizeof(PetscInt));CHKERRQ(ierr); 4058 /* note the table of functions below should match that in Mat_CheckInode() */ 4059 if (!B->factortype) { 4060 B->ops->mult = MatMult_SeqAIJ_Inode; 4061 B->ops->sor = MatSOR_SeqAIJ_Inode; 4062 B->ops->multadd = MatMultAdd_SeqAIJ_Inode; 4063 B->ops->getrowij = MatGetRowIJ_SeqAIJ_Inode; 4064 B->ops->restorerowij = MatRestoreRowIJ_SeqAIJ_Inode; 4065 B->ops->getcolumnij = MatGetColumnIJ_SeqAIJ_Inode; 4066 B->ops->restorecolumnij = MatRestoreColumnIJ_SeqAIJ_Inode; 4067 B->ops->coloringpatch = MatColoringPatch_SeqAIJ_Inode; 4068 B->ops->multdiagonalblock = MatMultDiagonalBlock_SeqAIJ_Inode; 4069 } else { 4070 B->ops->solve = MatSolve_SeqAIJ_Inode_inplace; 4071 } 4072 } else { 4073 c->inode.size = 0; 4074 c->inode.node_count = 0; 4075 } 4076 c->inode.ibdiagvalid = PETSC_FALSE; 4077 c->inode.ibdiag = 0; 4078 c->inode.bdiag = 0; 4079 PetscFunctionReturn(0); 4080 } 4081 4082 #undef __FUNCT__ 4083 #define __FUNCT__ "MatGetRow_FactoredLU" 4084 PETSC_STATIC_INLINE PetscErrorCode MatGetRow_FactoredLU(PetscInt *cols,PetscInt nzl,PetscInt nzu,PetscInt nz,PetscInt *ai,PetscInt *aj,PetscInt *adiag,PetscInt row) 4085 { 4086 PetscInt k, *vi; 4087 PetscFunctionBegin; 4088 vi = aj + ai[row]; 4089 for(k=0;k<nzl;k++) cols[k] = vi[k]; 4090 vi = aj + adiag[row]; 4091 cols[nzl] = vi[0]; 4092 vi = aj + adiag[row+1]+1; 4093 for(k=0;k<nzu;k++) cols[nzl+1+k] = vi[k]; 4094 PetscFunctionReturn(0); 4095 } 4096 /* 4097 Mat_CheckInode_FactorLU - Check Inode for factored seqaij matrix. 4098 Modified from Mat_CheckInode(). 4099 4100 Input Parameters: 4101 + Mat A - ILU or LU matrix factor 4102 - samestructure - TRUE indicates that the matrix has not changed its nonzero structure so we 4103 do not need to recompute the inodes 4104 */ 4105 #undef __FUNCT__ 4106 #define __FUNCT__ "Mat_CheckInode_FactorLU" 4107 PetscErrorCode Mat_CheckInode_FactorLU(Mat A,PetscBool samestructure) 4108 { 4109 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4110 PetscErrorCode ierr; 4111 PetscInt i,j,m,nzl1,nzu1,nzl2,nzu2,nzx,nzy,node_count,blk_size; 4112 PetscInt *cols1,*cols2,*ns,*ai = a->i,*aj = a->j, *adiag = a->diag; 4113 PetscBool flag; 4114 4115 PetscFunctionBegin; 4116 if (!a->inode.use) PetscFunctionReturn(0); 4117 if (a->inode.checked && samestructure) PetscFunctionReturn(0); 4118 4119 m = A->rmap->n; 4120 if (a->inode.size) {ns = a->inode.size;} 4121 else {ierr = PetscMalloc((m+1)*sizeof(PetscInt),&ns);CHKERRQ(ierr);} 4122 4123 i = 0; 4124 node_count = 0; 4125 while (i < m){ /* For each row */ 4126 nzl1 = ai[i+1] - ai[i]; /* Number of nonzeros in L */ 4127 nzu1 = adiag[i] - adiag[i+1] - 1; /* Number of nonzeros in U excluding diagonal*/ 4128 nzx = nzl1 + nzu1 + 1; 4129 ierr = PetscMalloc((nzx+1)*sizeof(PetscInt),&cols1);CHKERRQ(ierr); 4130 MatGetRow_FactoredLU(cols1,nzl1,nzu1,nzx,ai,aj,adiag,i); 4131 4132 /* Limits the number of elements in a node to 'a->inode.limit' */ 4133 for (j=i+1,blk_size=1; j<m && blk_size <a->inode.limit; ++j,++blk_size) { 4134 nzl2 = ai[j+1] - ai[j]; 4135 nzu2 = adiag[j] - adiag[j+1] - 1; 4136 nzy = nzl2 + nzu2 + 1; 4137 if( nzy != nzx) break; 4138 ierr = PetscMalloc((nzy+1)*sizeof(PetscInt),&cols2);CHKERRQ(ierr); 4139 ierr = MatGetRow_FactoredLU(cols2,nzl2,nzu2,nzy,ai,aj,adiag,j);CHKERRQ(ierr); 4140 ierr = PetscMemcmp(cols1,cols2,nzx*sizeof(PetscInt),&flag);CHKERRQ(ierr); 4141 if (!flag) {ierr = PetscFree(cols2);CHKERRQ(ierr);break;} 4142 ierr = PetscFree(cols2);CHKERRQ(ierr); 4143 } 4144 ns[node_count++] = blk_size; 4145 ierr = PetscFree(cols1);CHKERRQ(ierr); 4146 i = j; 4147 } 4148 /* If not enough inodes found,, do not use inode version of the routines */ 4149 if (!m || node_count > .8*m) { 4150 ierr = PetscFree(ns);CHKERRQ(ierr); 4151 a->inode.node_count = 0; 4152 a->inode.size = PETSC_NULL; 4153 a->inode.use = PETSC_FALSE; 4154 ierr = PetscInfo2(A,"Found %D nodes out of %D rows. Not using Inode routines\n",node_count,m);CHKERRQ(ierr); 4155 } else { 4156 A->ops->mult = 0; 4157 A->ops->sor = 0; 4158 A->ops->multadd = 0; 4159 A->ops->getrowij = 0; 4160 A->ops->restorerowij = 0; 4161 A->ops->getcolumnij = 0; 4162 A->ops->restorecolumnij = 0; 4163 A->ops->coloringpatch = 0; 4164 A->ops->multdiagonalblock = 0; 4165 A->ops->solve = MatSolve_SeqAIJ_Inode; 4166 a->inode.node_count = node_count; 4167 a->inode.size = ns; 4168 ierr = PetscInfo3(A,"Found %D nodes of %D. Limit used: %D. Using Inode routines\n",node_count,m,a->inode.limit);CHKERRQ(ierr); 4169 } 4170 a->inode.checked = PETSC_TRUE; 4171 PetscFunctionReturn(0); 4172 } 4173 4174 /* 4175 This is really ugly. if inodes are used this replaces the 4176 permutations with ones that correspond to rows/cols of the matrix 4177 rather then inode blocks 4178 */ 4179 #undef __FUNCT__ 4180 #define __FUNCT__ "MatInodeAdjustForInodes" 4181 PetscErrorCode MatInodeAdjustForInodes(Mat A,IS *rperm,IS *cperm) 4182 { 4183 PetscErrorCode ierr; 4184 4185 PetscFunctionBegin; 4186 ierr = PetscTryMethod(A,"MatInodeAdjustForInodes_C",(Mat,IS*,IS*),(A,rperm,cperm));CHKERRQ(ierr); 4187 PetscFunctionReturn(0); 4188 } 4189 4190 EXTERN_C_BEGIN 4191 #undef __FUNCT__ 4192 #define __FUNCT__ "MatInodeAdjustForInodes_SeqAIJ_Inode" 4193 PetscErrorCode MatInodeAdjustForInodes_SeqAIJ_Inode(Mat A,IS *rperm,IS *cperm) 4194 { 4195 Mat_SeqAIJ *a=(Mat_SeqAIJ*)A->data; 4196 PetscErrorCode ierr; 4197 PetscInt m = A->rmap->n,n = A->cmap->n,i,j,nslim_row = a->inode.node_count; 4198 const PetscInt *ridx,*cidx; 4199 PetscInt row,col,*permr,*permc,*ns_row = a->inode.size,*tns,start_val,end_val,indx; 4200 PetscInt nslim_col,*ns_col; 4201 IS ris = *rperm,cis = *cperm; 4202 4203 PetscFunctionBegin; 4204 if (!a->inode.size) PetscFunctionReturn(0); /* no inodes so return */ 4205 if (a->inode.node_count == m) PetscFunctionReturn(0); /* all inodes are of size 1 */ 4206 4207 ierr = Mat_CreateColInode(A,&nslim_col,&ns_col);CHKERRQ(ierr); 4208 ierr = PetscMalloc((((nslim_row>nslim_col)?nslim_row:nslim_col)+1)*sizeof(PetscInt),&tns);CHKERRQ(ierr); 4209 ierr = PetscMalloc2(m,PetscInt,&permr,n,PetscInt,&permc);CHKERRQ(ierr); 4210 4211 ierr = ISGetIndices(ris,&ridx);CHKERRQ(ierr); 4212 ierr = ISGetIndices(cis,&cidx);CHKERRQ(ierr); 4213 4214 /* Form the inode structure for the rows of permuted matric using inv perm*/ 4215 for (i=0,tns[0]=0; i<nslim_row; ++i) tns[i+1] = tns[i] + ns_row[i]; 4216 4217 /* Construct the permutations for rows*/ 4218 for (i=0,row = 0; i<nslim_row; ++i){ 4219 indx = ridx[i]; 4220 start_val = tns[indx]; 4221 end_val = tns[indx + 1]; 4222 for (j=start_val; j<end_val; ++j,++row) permr[row]= j; 4223 } 4224 4225 /* Form the inode structure for the columns of permuted matrix using inv perm*/ 4226 for (i=0,tns[0]=0; i<nslim_col; ++i) tns[i+1] = tns[i] + ns_col[i]; 4227 4228 /* Construct permutations for columns */ 4229 for (i=0,col=0; i<nslim_col; ++i){ 4230 indx = cidx[i]; 4231 start_val = tns[indx]; 4232 end_val = tns[indx + 1]; 4233 for (j = start_val; j<end_val; ++j,++col) permc[col]= j; 4234 } 4235 4236 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permr,PETSC_COPY_VALUES,rperm);CHKERRQ(ierr); 4237 ierr = ISSetPermutation(*rperm);CHKERRQ(ierr); 4238 ierr = ISCreateGeneral(PETSC_COMM_SELF,n,permc,PETSC_COPY_VALUES,cperm);CHKERRQ(ierr); 4239 ierr = ISSetPermutation(*cperm);CHKERRQ(ierr); 4240 4241 ierr = ISRestoreIndices(ris,&ridx);CHKERRQ(ierr); 4242 ierr = ISRestoreIndices(cis,&cidx);CHKERRQ(ierr); 4243 4244 ierr = PetscFree(ns_col);CHKERRQ(ierr); 4245 ierr = PetscFree2(permr,permc);CHKERRQ(ierr); 4246 ierr = ISDestroy(&cis);CHKERRQ(ierr); 4247 ierr = ISDestroy(&ris);CHKERRQ(ierr); 4248 ierr = PetscFree(tns);CHKERRQ(ierr); 4249 PetscFunctionReturn(0); 4250 } 4251 EXTERN_C_END 4252 4253 #undef __FUNCT__ 4254 #define __FUNCT__ "MatInodeGetInodeSizes" 4255 /*@C 4256 MatInodeGetInodeSizes - Returns the inode information of the Inode matrix. 4257 4258 Not Collective 4259 4260 Input Parameter: 4261 . A - the Inode matrix or matrix derived from the Inode class -- e.g., SeqAIJ 4262 4263 Output Parameter: 4264 + node_count - no of inodes present in the matrix. 4265 . sizes - an array of size node_count,with sizes of each inode. 4266 - limit - the max size used to generate the inodes. 4267 4268 Level: advanced 4269 4270 Notes: This routine returns some internal storage information 4271 of the matrix, it is intended to be used by advanced users. 4272 It should be called after the matrix is assembled. 4273 The contents of the sizes[] array should not be changed. 4274 PETSC_NULL may be passed for information not requested. 4275 4276 .keywords: matrix, seqaij, get, inode 4277 4278 .seealso: MatGetInfo() 4279 @*/ 4280 PetscErrorCode MatInodeGetInodeSizes(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4281 { 4282 PetscErrorCode ierr,(*f)(Mat,PetscInt*,PetscInt*[],PetscInt*); 4283 4284 PetscFunctionBegin; 4285 if (!A->assembled) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for unassembled matrix"); 4286 ierr = PetscObjectQueryFunction((PetscObject)A,"MatInodeGetInodeSizes_C",(void (**)(void))&f);CHKERRQ(ierr); 4287 if (f) { 4288 ierr = (*f)(A,node_count,sizes,limit);CHKERRQ(ierr); 4289 } 4290 PetscFunctionReturn(0); 4291 } 4292 4293 EXTERN_C_BEGIN 4294 #undef __FUNCT__ 4295 #define __FUNCT__ "MatInodeGetInodeSizes_SeqAIJ_Inode" 4296 PetscErrorCode MatInodeGetInodeSizes_SeqAIJ_Inode(Mat A,PetscInt *node_count,PetscInt *sizes[],PetscInt *limit) 4297 { 4298 Mat_SeqAIJ *a = (Mat_SeqAIJ*)A->data; 4299 4300 PetscFunctionBegin; 4301 if (node_count) *node_count = a->inode.node_count; 4302 if (sizes) *sizes = a->inode.size; 4303 if (limit) *limit = a->inode.limit; 4304 PetscFunctionReturn(0); 4305 } 4306 EXTERN_C_END 4307